#!/usr/bin/env bash
# Subtitle fetcher v3.5 — YouTube auto-captions via yt-dlp + cleaner.
#
# For shows that distribute on YouTube and have no community subs anywhere
# else (e.g. Big Lez Show universe: Sassy the Sasquatch, Donny & Clarence,
# Mike Nolan, Big Lez Saga). yt-dlp pulls the en-orig auto-CC track, the
# rolling-window VTT goes through yt-clean.py to deduplicate into a flat
# SRT, and the result is dropped on nullstone with the library filename.
#
# Quality caveats (per processes/subtitles/STYLE.md fallback policy):
#   - lowercase, no punctuation
#   - YouTube ASR mishears proper nouns (e.g. "Sassy" → "sasha")
#   - profanity is censored as "[ __ ]"
#   - capitalisation / sentence segmentation is absent
#
# These subs ship as a stop-gap. v4 (WhisperX large-v3 on the 4080 friend
# node) replaces them with full-quality transcriptions; see ROADMAP.
#
# Usage:
#   sub-yt-fetch.sh <playlist-or-channel-url> <outdir> [name-template]
#
#   <playlist-or-channel-url>  handed to yt-dlp unchanged (required)
#   <outdir>                   output directory, created if missing (required)
#   [name-template]            yt-dlp output template used as the file stem;
#                              default: 'S%(playlist_index)02d - %(title)s'
#
# Example (Sassy):
#   sub-yt-fetch.sh \
#     'https://www.youtube.com/playlist?list=PLGMC7oz7XpmDMGrALMQiNXCi9p7aqkWbj' \
#     /tmp/sassy-yt \
#     'Sassy the Sasquatch (2022) - S01E%(playlist_index)02d - %(title)s'
#
# Exit status:
#   0      download and every conversion succeeded
#   1      missing argument, or yt-dlp / python3 not installed
#   2      yt-clean.py not found next to this script / not executable
#   other  yt-dlp's own non-zero status (reported after converting whatever
#          was downloaded), or the cleaner's status if a conversion failed
#
# After fetch: rename / copy each .en.srt to nullstone with the canonical
# library filename (`.eng.srt` suffix). For now this is manual —
# automate when the next show comes through.

set -euo pipefail

PLAYLIST="${1:?playlist or channel URL required}"
OUTDIR="${2:?output directory required}"
NAMETMPL="${3:-S%(playlist_index)02d - %(title)s}"
# The cleaner is expected to live in the same directory as this script.
CLEANER="$(dirname -- "$0")/yt-clean.py"
readonly PLAYLIST OUTDIR NAMETMPL CLEANER

# --- Preflight -------------------------------------------------------------
# Run every check before touching the network: the download is slow and
# rate-limited, so a missing tool must not be discovered only afterwards.
if ! command -v yt-dlp >/dev/null; then
  echo "ERROR: yt-dlp not installed (pip install yt-dlp)" >&2
  exit 1
fi

# NOTE(review): the cleaner is run via python3 below, so the exec bit is not
# strictly required; the -x test is kept to preserve the existing exit-2
# contract and message.
if [[ ! -x "$CLEANER" ]]; then
  echo "ERROR: $CLEANER not found / not executable" >&2
  exit 2
fi

if ! command -v python3 >/dev/null; then
  echo "ERROR: python3 not installed (needed to run $CLEANER)" >&2
  exit 1
fi

mkdir -p -- "$OUTDIR"

# --- Download --------------------------------------------------------------
# Pull raw VTT auto-CC, no video, en-orig only (matches en bytewise but is the
# canonical track to request). The --sleep-* flags throttle requests.
#
# yt-dlp returns non-zero when any single entry fails (private / removed
# video, ...). Capture the status instead of letting `set -e` abort here, so
# the subtitles that did download still get converted; the status is
# propagated at the very end.
dl_status=0
yt-dlp --skip-download --write-auto-subs --sub-langs "en-orig" \
  --sub-format vtt \
  --sleep-requests 1 --sleep-subtitles 2 \
  -o "$OUTDIR/${NAMETMPL}-raw.%(ext)s" \
  -- "$PLAYLIST" || dl_status=$?

if ((dl_status != 0)); then
  echo "WARNING: yt-dlp exited with status $dl_status;" \
    "converting whatever was downloaded" >&2
fi

# --- Convert ---------------------------------------------------------------
# Convert each raw VTT to clean SRT. nullglob makes the loop a no-op (rather
# than iterating over the literal pattern) when nothing matched.
shopt -s nullglob
converted=0
for vtt in "$OUTDIR"/*-raw.en-orig.vtt; do
  out="${vtt%-raw.en-orig.vtt}.en.srt"
  python3 "$CLEANER" "$vtt" "$out"
  printf 'OK %s\n' "$out"
  converted=$((converted + 1))
done

if ((converted == 0)); then
  # Do not print the follow-up hint when there is nothing to copy.
  echo "WARNING: no *-raw.en-orig.vtt files in $OUTDIR; nothing converted" >&2
else
  echo
  echo "next: copy each .en.srt to nullstone with library filename, then library scan."
fi

# Surface a partial/total download failure to the caller (0 when yt-dlp was
# fully successful).
exit "$dl_status"