legacy-arrflix/playbooks/subtitles/lib/yt-clean.py
s8n 24a9497e7d playbooks/ rename + import-media v1.0 + lilo&stitch run
processes/ -> playbooks/ (git mv preserves history; updated cross-refs
in ROADMAP, README, subtitles playbook + scripts).

playbooks/import-media/README.md v1.0 — 7-step import workflow:
  stage on onyx -> rsync to nullstone -> chmod -> verify scan ->
  Items/Counts bump -> optional subtitle pass -> run-log
Cross-references docs/05/07/08, ADMIN-GUIDE, README. Mirrors the
existing subtitles playbook structure (CHANGELOG + runs/_template).

CHANGELOG v1.0 lists known gaps (bin/cleanup-import.sh and
bin/normalize.py still doc-only, ROADMAP M6).

First run logged: playbooks/import-media/runs/lilo-stitch-2002.md.
Lilo & Stitch (2002) imported to /home/user/media/movies/, item
c2f4aff133c1b9631500fadf293b0b2f, TMDb 11544, MovieCount 3 -> 4.
LibraryMonitor didn't auto-fire — needed manual /Library/Refresh;
playbook updated to make this an unconditional step.

Source: 1080p BluRay HEVC 10-bit / EAC3 5.1 / 2x PGS embedded subs.
Per quality bar (README.md:41) — passes.
2026-05-10 02:29:57 +01:00

56 lines
2.3 KiB
Python
Executable file

#!/usr/bin/env python3
"""Clean YouTube auto-caption VTT into a flat SRT with no rolling-window dupes."""
import re, sys, pathlib
def parse_vtt(text):
    """Yield (start, end, line) tuples, dropping inline timing tags and empty lines.

    Args:
        text: Full contents of a WebVTT caption file.

    Yields:
        ``(start, end, line)`` for every cue that has visible text. ``start``
        and ``end`` are the cue timestamps exactly as written in the file;
        ``line`` is the last non-empty payload line with inline
        ``<HH:MM:SS.mmm>`` and ``<c>`` tags removed.
    """
    timing_re = re.compile(
        r'(\d{2}:\d{2}:\d{2}[.,]\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}[.,]\d{3})')
    inline_stamp_re = re.compile(r'<\d{2}:\d{2}:\d{2}\.\d{3}>')
    c_tag_re = re.compile(r'</?c[^>]*>')

    # Normalise CRLF / bare CR first, otherwise the blank-line split below
    # never fires on files saved with Windows line endings.
    text = text.replace('\r\n', '\n').replace('\r', '\n')

    for block in re.split(r'\n\n+', text.strip()):
        # Header/metadata blocks *start* with these markers. A substring test
        # would also throw away real cues whose text mentions "WEBVTT".
        if block.startswith(('WEBVTT', 'Kind:', 'Language:')):
            continue
        m = timing_re.search(block)
        if not m:
            continue
        # Whatever follows the timestamps on the timing line is cue settings
        # (align:, position:, line:, size:, ...). Discard that line remainder
        # as a whole instead of pattern-matching settings inside the payload,
        # which could eat caption text and misses unknown settings.
        _settings, _, payload = block[m.end():].partition('\n')
        payload = inline_stamp_re.sub('', payload)
        payload = c_tag_re.sub('', payload)
        # Last non-empty line is the "new" content (the rolling window puts
        # the freshly spoken line at the bottom).
        lines = [ln.strip() for ln in payload.split('\n') if ln.strip()]
        if not lines:
            continue
        yield m.group(1), m.group(2), lines[-1]
def to_srt_time(t):
    """Return timestamp *t* with every '.' swapped for ',' (the SRT separator)."""
    return ','.join(t.split('.'))
def _srt_time_to_ms(stamp):
    """Convert an ``HH:MM:SS,mmm`` timestamp to whole milliseconds."""
    hours, minutes, rest = stamp.split(':')
    seconds, millis = rest.split(',')
    return ((int(hours) * 60 + int(minutes)) * 60 + int(seconds)) * 1000 + int(millis)


def merge(events, min_duration_ms=50):
    """Drop the 10ms 'gap' cues and merge consecutive identical text.

    Args:
        events: Iterable of ``(start, end, text)`` tuples with timestamps in
            either ``HH:MM:SS.mmm`` or ``HH:MM:SS,mmm`` form.
        min_duration_ms: Cues shorter than this many milliseconds (after
            merging) are discarded. Defaults to 50.

    Returns:
        A list of ``(start, end, text)`` tuples with SRT-style timestamps.
    """
    merged = []
    for start, end, txt in events:
        # A cue repeating the previous text (e.g. the bridge cue) only
        # extends the previous cue's end time.
        if merged and merged[-1][2] == txt:
            merged[-1][1] = to_srt_time(end)
            continue
        merged.append([to_srt_time(start), to_srt_time(end), txt])

    # Second pass: drop micro-cues. Durations are compared in integer
    # milliseconds; float seconds make a cue of exactly 50 ms compare as
    # "shorter than 0.05" at some offsets (2.05 - 2.0 < 0.05) but not others.
    return [
        (start, end, txt)
        for start, end, txt in merged
        if _srt_time_to_ms(end) - _srt_time_to_ms(start) >= min_duration_ms
    ]
def write_srt(events, path):
    """Write *events* to *path* as a numbered SRT file.

    Args:
        events: Iterable of ``(start, end, text)`` tuples whose timestamps
            are already in SRT form.
        path: Destination file path; an existing file is overwritten.
    """
    # Encode explicitly as UTF-8: the locale default can fail on, or
    # mis-encode, non-ASCII caption text.
    with open(path, 'w', encoding='utf-8') as f:
        for i, (s, e, txt) in enumerate(events, 1):
            f.write(f"{i}\n{s} --> {e}\n{txt}\n\n")
def main(argv=None):
    """Convert the VTT file named in argv[1] into an SRT file at argv[2].

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 3:
        # Exit with a usage hint instead of an IndexError traceback.
        prog = args[0] if args else 'yt-clean.py'
        sys.exit(f"usage: {prog} INPUT.vtt OUTPUT.srt")
    src, dst = args[1], args[2]
    # utf-8-sig decodes plain UTF-8 and also strips a leading BOM if present.
    vtt = pathlib.Path(src).read_text(encoding='utf-8-sig')
    events = merge(parse_vtt(vtt))
    write_srt(events, dst)
    print(f"wrote {len(events)} cues -> {dst}")


if __name__ == '__main__':
    main()