legacy-arrflix/playbooks/subtitles/lib/sub-yt-fetch.sh

#!/usr/bin/env bash
# Subtitle fetcher v3.5 — YouTube auto-captions via yt-dlp + cleaner.
#
# For shows that distribute on YouTube and have no community subs anywhere
# else (e.g. Big Lez Show universe: Sassy the Sasquatch, Donny & Clarence,
# Mike Nolan, Big Lez Saga). yt-dlp pulls the en-orig auto-CC track, the
# rolling-window VTT goes through yt-clean.py to deduplicate into a flat
# SRT, and the result is dropped on nullstone with the library filename.
#
# Quality caveats (per playbooks/subtitles/STYLE.md fallback policy):
#   - lowercase, no punctuation
#   - YouTube ASR mishears proper nouns (e.g. "Sassy" → "sasha")
#   - profanity is censored as "[ __ ]"
#   - capitalisation / sentence segmentation is absent
#
# These subs ship as a stop-gap. v4 (WhisperX large-v3 on the 4080 friend
# node) replaces them with full-quality transcriptions; see ROADMAP.
#
# Usage:
#   sub-yt-fetch.sh <playlist-or-channel-url> <out-dir> [name-template]
#   (name-template is optional; default: 'S%(playlist_index)02d - %(title)s')
#
# Example (Sassy):
#   sub-yt-fetch.sh \
#     'https://www.youtube.com/playlist?list=PLGMC7oz7XpmDMGrALMQiNXCi9p7aqkWbj' \
#     /tmp/sassy-yt \
#     'Sassy the Sasquatch (2022) - S01E%(playlist_index)02d - %(title)s'
#
# After fetch: rename / copy each .en.srt to nullstone with the canonical
# library filename (`<videobasename>.eng.srt`). For now this is manual —
# automate when the next show comes through.

# Strict mode: abort on a failing command, on expansion of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# The first two positional arguments are mandatory; bash prints the message
# and exits if either one is missing or empty.
: "${1:?playlist or channel URL required}"
: "${2:?output directory required}"
PLAYLIST=$1
OUTDIR=$2

# The third argument is the yt-dlp filename template. When it is absent or
# empty, fall back to a season/title pattern.
if [[ -n "${3:-}" ]]; then
  NAMETMPL=$3
else
  NAMETMPL='S%(playlist_index)02d - %(title)s'
fi

# Create the output directory (and any missing parents) up front.
mkdir -p "$OUTDIR"

# --- Preflight ---------------------------------------------------------------
# Check everything the run needs *before* the download starts. The fetch below
# is deliberately throttled (--sleep-requests / --sleep-subtitles), so finding
# out afterwards that the cleaner or the interpreter is missing wastes the
# whole playlist pull.
if ! command -v yt-dlp >/dev/null; then
  echo "ERROR: yt-dlp not installed (pip install yt-dlp)" >&2
  exit 1
fi

# The conversion step runs the cleaner through python3, so it must exist too.
if ! command -v python3 >/dev/null; then
  echo "ERROR: python3 not installed (needed to run yt-clean.py)" >&2
  exit 1
fi

# The cleaner is expected to sit next to this script.
# NOTE(review): -x is stricter than needed because the cleaner is invoked as
# `python3 "$CLEANER"`; kept so the existing exit-2 contract is unchanged.
CLEANER="$(dirname "$0")/yt-clean.py"
if [[ ! -x "$CLEANER" ]]; then
  echo "ERROR: $CLEANER not found / not executable" >&2
  exit 2
fi

# --- Fetch -------------------------------------------------------------------
# yt-dlp parses the whole -o value as an output template, so a literal '%' in
# the output directory must be doubled or it is read as a template field.
# NAMETMPL is left untouched: it is a template by design.
outdir_tmpl=${OUTDIR//\%/%%}

# Pull raw VTT auto-CC, no video, en-orig only (matches en bytewise but is the
# canonical track to request). `--` stops option parsing before the URL.
# Because the script runs under `set -e`, a non-zero exit from yt-dlp aborts
# here, before any conversion is attempted.
yt-dlp --skip-download --write-auto-subs --sub-langs "en-orig" \
       --sub-format vtt \
       --sleep-requests 1 --sleep-subtitles 2 \
       -o "${outdir_tmpl}/${NAMETMPL}-raw.%(ext)s" \
       -- "$PLAYLIST"

#######################################
# Convert every raw auto-caption VTT in a directory into a cleaned SRT.
# Each "<name>-raw.en-orig.vtt" is passed to the cleaner, which is asked to
# write "<name>.en.srt" alongside it.
# Arguments:
#   $1 - directory containing the *-raw.en-orig.vtt files
#   $2 - path to the cleaner script (run through python3)
# Outputs:
#   "OK  <srt path>" on stdout for each converted file; an error on stderr
#   when no raw VTT was found.
# Returns:
#   0 when at least one file was converted; the cleaner's status if it fails;
#   1 when the directory held no matching VTT.
#######################################
convert_raw_vtts() {
  local dir=$1 cleaner=$2
  local vtt out converted=0

  # With nullglob an unmatched pattern expands to nothing, so the loop body
  # never sees the literal glob string.
  shopt -s nullglob
  for vtt in "$dir"/*-raw.en-orig.vtt; do
    out="${vtt%-raw.en-orig.vtt}.en.srt"
    # Stop at the first cleaner failure and hand its status to the caller.
    python3 "$cleaner" "$vtt" "$out" || return
    echo "OK  $out"
    # Plain assignment: (( converted++ )) returns 1 on the first pass, which
    # would trip `set -e`.
    converted=$((converted + 1))
  done

  # An empty run means no captions were fetched; report it rather than
  # letting the script finish as if subtitles were ready to copy.
  if [[ "$converted" -eq 0 ]]; then
    echo "ERROR: no *-raw.en-orig.vtt files found in $dir - nothing to convert" >&2
    return 1
  fi
}

# Convert each raw VTT to clean SRT. Called bare (not inside `if` / `||`) so
# the script's `set -e` stays in force and a non-zero return aborts before
# the hint below is printed.
convert_raw_vtts "$OUTDIR" "$CLEANER"

echo
echo "next: copy each .en.srt to nullstone with library filename, then library scan."