# legacy-arrflix/processes/subtitles/lib/yt-clean.py

#!/usr/bin/env python3
"""Clean YouTube auto-caption VTT into a flat SRT with no rolling-window dupes."""
import re, sys, pathlib

# Patterns are compiled once at import time instead of on every cue.
_TIMING_RE = re.compile(
    r'(\d{2}:\d{2}:\d{2}[.,]\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}[.,]\d{3})'
)
# Cue settings defined by WebVTT; they are only legal on the timing line.
_CUE_SETTING_RE = re.compile(
    r'\b(?:vertical|line|position|size|align|region):\S+'
)
_INLINE_TIME_RE = re.compile(r'<\d{2}:\d{2}:\d{2}\.\d{3}>')
_C_TAG_RE = re.compile(r'</?c[^>]*>')
_HEADER_PREFIXES = ('WEBVTT', 'Kind:', 'Language:')


def parse_vtt(text):
    """Yield ``(start, end, line)`` tuples for each usable cue in *text*.

    *text* is the full content of a WebVTT file.  Blocks are separated by
    blank lines; header blocks (``WEBVTT``, ``Kind:``, ``Language:``) and
    blocks without a ``-->`` timing line are skipped.

    For every remaining block the inline ``<HH:MM:SS.mmm>`` timestamps and
    ``<c>``/``</c>`` tags are removed, and only the last non-empty line is
    yielded: in a rolling-window caption the freshly spoken line sits at
    the bottom.  Cues with no text left are dropped.

    ``start`` and ``end`` are returned exactly as written in the file
    (``HH:MM:SS.mmm`` or ``HH:MM:SS,mmm``).
    """
    # Normalise line endings so CRLF files still split into blocks, and drop
    # a leading BOM so the header block is recognised.
    text = text.replace('\r\n', '\n').replace('\r', '\n').lstrip('\ufeff')

    for block in re.split(r'\n\n+', text.strip()):
        # Only a block that *starts* with a header keyword is a header; cue
        # text that merely mentions "WEBVTT" must not be discarded.
        if block.startswith(_HEADER_PREFIXES):
            continue
        m = _TIMING_RE.search(block)
        if not m:
            continue

        # Cue settings live on the remainder of the timing line only, so the
        # settings pattern is never applied to the caption text itself.
        settings, _, payload = block[m.end():].partition('\n')
        leftover = _CUE_SETTING_RE.sub('', settings)
        # Keep anything non-setting that followed the timing (lenient input).
        body = f"{leftover}\n{payload}"
        body = _INLINE_TIME_RE.sub('', body)
        body = _C_TAG_RE.sub('', body)

        lines = [ln.strip() for ln in body.split('\n') if ln.strip()]
        if not lines:
            continue
        yield m.group(1), m.group(2), lines[-1]

def to_srt_time(t):
    """Return *t* with every '.' swapped for ',' (SRT's millisecond separator)."""
    pieces = t.split('.')
    return ','.join(pieces)

def _srt_to_ms(stamp):
    """Convert an ``HH:MM:SS,mmm`` timestamp to integer milliseconds."""
    hours, minutes, rest = stamp.split(':')
    seconds, millis = rest.split(',')
    return ((int(hours) * 60 + int(minutes)) * 60 + int(seconds)) * 1000 + int(millis)


def merge(events, min_duration_ms=50):
    """Collapse consecutive identical cues, then drop very short ones.

    *events* is an iterable of ``(start, end, text)`` tuples whose
    timestamps use either ``.`` or ``,`` before the milliseconds.

    Pass 1: a cue whose text equals the previous cue's text is folded into
    it by extending the previous cue's end time.
    Pass 2: any resulting cue shorter than *min_duration_ms* milliseconds
    (default 50) is discarded; these are the short "bridge" cues.

    Returns a list of ``(start, end, text)`` tuples with SRT-style
    (comma) timestamps.
    """
    merged = []
    for start, end, text in events:
        if merged and merged[-1][2] == text:
            # Same text as the cue before it: just extend that cue.
            merged[-1][1] = to_srt_time(end)
            continue
        merged.append([to_srt_time(start), to_srt_time(end), text])

    # Durations are compared in integer milliseconds. Float seconds made the
    # threshold unreliable: 2.050 - 2.000 evaluates to slightly less than
    # 0.05, so a cue of exactly 50 ms was dropped or kept depending on where
    # it fell in the file.
    return [
        (start, end, text)
        for start, end, text in merged
        if _srt_to_ms(end) - _srt_to_ms(start) >= min_duration_ms
    ]

def write_srt(events, path):
    """Write *events* to *path* as a numbered SRT file.

    *events* is an iterable of ``(start, end, text)`` tuples; cues are
    numbered from 1 in iteration order and each is followed by a blank line.
    The file is always written as UTF-8 so captions with non-ASCII text do
    not depend on (or fail under) the platform's default encoding.
    """
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(
            f"{i}\n{s} --> {e}\n{txt}\n\n"
            for i, (s, e, txt) in enumerate(events, 1)
        )

def main(argv=None):
    """Command-line entry point: ``yt-clean.py INPUT.vtt OUTPUT.srt``.

    *argv* is the argument list without the program name; it defaults to
    ``sys.argv[1:]``.  Reads the VTT file, cleans it, writes the SRT file
    and prints a one-line summary.  Returns the process exit status:
    0 on success, 2 when the two required paths are missing.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        # Fail with a usage line instead of an IndexError traceback.
        prog = pathlib.Path(sys.argv[0]).name
        print(f"usage: {prog} INPUT.vtt OUTPUT.srt", file=sys.stderr)
        return 2
    src, dst = args[0], args[1]

    # Read explicitly as UTF-8 rather than the locale default; "utf-8-sig"
    # also discards a leading byte-order mark if present.
    vtt = pathlib.Path(src).read_text(encoding='utf-8-sig')
    events = merge(parse_vtt(vtt))
    write_srt(events, dst)
    print(f"wrote {len(events)} cues -> {dst}")
    return 0


if __name__ == '__main__':
    sys.exit(main())