Adds lib/sub-yt-fetch.sh (yt-dlp wrapper) and lib/yt-clean.py (collapses YouTube's rolling-window auto-caption VTT into a flat SRT). These are for shows distributed YouTube-first that have no community subs anywhere -- verified via three parallel research agents covering OpenSubtitles REST, OS legacy, Addic7ed, SubDL, SubSource, and Podnapisi for the 5 niche shows in the library, plus a price-vs-coverage analysis of OpenSubtitles VIP. Findings: OS VIP would not have helped on the niche shows (it is download-cap relief, not coverage unlock; same catalog as free). All 4 Jarrad Wright shows in the library (Sassy, Big Lez Saga, Donny & Clarence, Mike Nolan) live on the same channel and have only YouTube auto-CC available. v3.5 ships those, explicitly violating STYLE.md 'best quality' as a tracked stop-gap. Sassy the Sasquatch S01 5/5 episodes subbed with cleaned auto-CC. Mike Nolan special-case noted: a 'COMPLETE SEASON | SUBTITLES' YT upload from Oct 2025 carries hand-typed CCs and should be preferred over per-episode auto-CC when subbing that show. ROADMAP H5 added: v4 WhisperX large-v3 on the friend RTX 4080 node will regenerate the v3.5 stop-gap with proper-noun-prompted transcription (~4-6% WER vs ~12% YT auto-CC) and restore the STYLE.md quality bar. H1 OpenSubtitles credentials marked done (was completed 2026-05-09).
56 lines
2.3 KiB
Python
Executable file
56 lines
2.3 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""Clean YouTube auto-caption VTT into a flat SRT with no rolling-window dupes."""
|
|
import re, sys, pathlib
|
|
|
|
# Cue timing line: "HH:MM:SS.mmm --> HH:MM:SS.mmm" (comma separator tolerated).
_CUE_TIMING_RE = re.compile(
    r'(\d{2}:\d{2}:\d{2}[.,]\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}[.,]\d{3})'
)
# Inline per-word timestamps such as <00:00:00.480>.
_INLINE_TIMESTAMP_RE = re.compile(r'<\d{2}:\d{2}:\d{2}\.\d{3}>')
# Opening/closing <c> spans wrapped around each freshly spoken word.
_C_TAG_RE = re.compile(r'</?c[^>]*>')


def parse_vtt(text):
    """Yield (start, end, line) tuples, one per cue that carries text.

    Args:
        text: Full contents of a WebVTT file as a single string.

    Yields:
        ``(start, end, line)`` where ``start`` and ``end`` are the timestamps
        exactly as written on the cue's timing line, and ``line`` is the last
        non-empty payload line with inline timestamp and ``<c>`` tags removed.

    Blocks without a timing line (the ``WEBVTT`` header, ``Kind:`` /
    ``Language:`` metadata, notes) and cues with an empty payload are skipped.
    """
    # Normalise line endings first; otherwise CRLF input never contains the
    # "\n\n" block separator and the whole file collapses into one block.
    normalized = text.replace('\r\n', '\n').replace('\r', '\n')

    for block in re.split(r'\n\n+', normalized.strip()):
        timing = _CUE_TIMING_RE.search(block)
        if timing is None:
            # Header / metadata / note block: nothing to emit.
            continue

        # Whatever follows the timestamps on the timing line is cue settings
        # (align:, position:, line:, size:, ...). Discard that line remainder
        # wholesale instead of pattern-matching individual settings, so no
        # setting leaks into the text and no caption text is mistaken for one.
        _settings, _, payload = block[timing.end():].partition('\n')

        payload = _INLINE_TIMESTAMP_RE.sub('', payload)
        payload = _C_TAG_RE.sub('', payload)

        # Last non-empty line is the "new" content: the rolling window keeps
        # the previously spoken line on top and the fresh one at the bottom.
        lines = [ln.strip() for ln in payload.split('\n') if ln.strip()]
        if lines:
            yield timing.group(1), timing.group(2), lines[-1]
|
|
|
|
def to_srt_time(t):
    """Return timestamp *t* with every '.' replaced by ','.

    Used to turn a VTT-style ``HH:MM:SS.mmm`` stamp into the
    ``HH:MM:SS,mmm`` form written to the SRT output.
    """
    pieces = t.split('.')
    return ','.join(pieces)
|
|
|
|
def _stamp_to_ms(stamp):
    """Convert an 'HH:MM:SS,mmm' (or 'HH:MM:SS.mmm') stamp to integer ms."""
    clock, _, millis = stamp.replace('.', ',').partition(',')
    hours, minutes, seconds = (int(part) for part in clock.split(':'))
    return ((hours * 60 + minutes) * 60 + seconds) * 1000 + int(millis)


def merge(events, min_duration_ms=50):
    """Merge consecutive identical captions and drop very short bridge cues.

    Args:
        events: Iterable of ``(start, end, text)`` with ``HH:MM:SS.mmm`` or
            ``HH:MM:SS,mmm`` timestamps, in playback order.
        min_duration_ms: Cues shorter than this many milliseconds are
            discarded after merging. Defaults to 50, the original threshold.

    Returns:
        A list of ``(start, end, text)`` tuples whose timestamps use the SRT
        comma separator.
    """
    merged = []
    for start, end, text in events:
        srt_end = end.replace('.', ',')
        if merged and merged[-1][2] == text:
            # Same text as the cue already on top: this is the bridge cue (or
            # a repeat), so stretch the previous cue instead of duplicating.
            merged[-1][1] = srt_end
            continue
        merged.append([start.replace('.', ','), srt_end, text])

    # Second pass: discard leftover micro-cues. Durations are compared in
    # integer milliseconds because float seconds misjudge the boundary
    # (e.g. 0.15 - 0.1 evaluates to slightly less than 0.05).
    return [
        (start, end, text)
        for start, end, text in merged
        if _stamp_to_ms(end) - _stamp_to_ms(start) >= min_duration_ms
    ]
|
|
|
|
def write_srt(events, path):
    """Write *events* to *path* as a numbered SRT file.

    Args:
        events: Iterable of ``(start, end, text)`` tuples; the timestamps are
            written verbatim, so they must already be in SRT form.
        path: Destination file path. An existing file is overwritten.

    Each cue is emitted as its 1-based index, a ``start --> end`` line, the
    text, and a blank separator line.
    """
    # Pin the encoding: without it open() uses the locale's preferred
    # encoding, which can raise UnicodeEncodeError on non-ASCII captions or
    # produce a file that is not UTF-8.
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(
            f"{i}\n{s} --> {e}\n{txt}\n\n"
            for i, (s, e, txt) in enumerate(events, 1)
        )
|
|
|
|
if __name__ == '__main__':
    # Usage: <script> INPUT.vtt OUTPUT.srt
    # Fail with a readable message rather than an IndexError traceback when
    # either path is missing; extra arguments are still ignored as before.
    if len(sys.argv) < 3:
        sys.exit(f"usage: {sys.argv[0]} INPUT.vtt OUTPUT.srt")

    # Decode explicitly instead of relying on the locale's default encoding;
    # 'utf-8-sig' reads plain UTF-8 and also drops a leading BOM if present.
    vtt = pathlib.Path(sys.argv[1]).read_text(encoding='utf-8-sig')

    # merge() already returns a list, so no extra list() copy is needed.
    events = merge(parse_vtt(vtt))
    write_srt(events, sys.argv[2])
    print(f"wrote {len(events)} cues -> {sys.argv[2]}")
|