legacy-arrflix/processes/subtitles/lib/sub-a7d-fetch.py

#!/usr/bin/env python3
"""Subtitle fetcher v3 — Addic7ed via subliminal.

Free, no daily quota. Uses OpenSubtitles REST (search-only, no downloads,
no quota burn) to translate library S/E numbering to the show's primary
catalogue numbering (e.g. Hulu→Fox for American Dad), then drives
subliminal's addic7ed provider for the actual download.

Why v3: OS REST `/download` is capped at 20/day on free tier. Addic7ed
serves anonymous downloads with no daily limit. v2 (lib/sub-rest-fetch.py)
remains the right tool when quota isn't the bottleneck — addic7ed has
narrower coverage than OpenSubtitles (English only, mostly).

Picker: subliminal's own scoring against the matched Video (filename, S/E,
year). For AD, addic7ed catalogues by Fox airing order, so the script
remaps library Hulu numbering via per-ep IMDB id lookup on OS REST.

Usage:
    sub-a7d-fetch.py <series-id> --season N [--start E] [--end E]
    sub-a7d-fetch.py <series-id> --all

Env (required):
    JELLYFIN_TOKEN         X-Emby-Token for nullstone Jellyfin
    OPENSUBTITLES_API_KEY  Path to file holding the OS REST key (search only)

Env (optional):
    NULLSTONE              SSH target, default user@192.168.0.100
    DRY_RUN=1              search + remap only, no download
"""
from __future__ import annotations

import argparse
import json
import os
import re
import shlex
import subprocess
import sys
import tempfile
import urllib.parse

from babelfish import Language
from subliminal import (Video, region, list_subtitles, download_subtitles,
                        save_subtitles)

# Root of the OpenSubtitles REST API; this script only calls its search
# endpoint (see os_search_imdb).
OS_BASE = "https://api.opensubtitles.com/api/v1"
# Sent as the User-Agent header on every OpenSubtitles request.
USER_AGENT = "arrflix v1.0.0"
# Jellyfin base URL as seen from inside the jellyfin container: requests are
# issued with `docker exec jellyfin curl` over SSH, hence localhost.
JF_BASE = "http://localhost:8096"
# SSH target used for Jellyfin queries and sidecar writes; override with the
# NULLSTONE environment variable.
NULLSTONE = os.environ.get("NULLSTONE", "user@192.168.0.100")

# Configure subliminal's cache region with the in-memory dogpile backend.
# This runs at import time, before any provider call is made.
region.configure("dogpile.cache.memory")


def die(msg: str, code: int = 1) -> None:
    """Report a fatal error on stderr and terminate the process.

    Writes ``ERROR: <msg>`` followed by a newline to stderr, then exits with
    status ``code`` (default 1). Always raises SystemExit; never returns.
    """
    sys.stderr.write(f"ERROR: {msg}\n")
    raise SystemExit(code)


def env_or_die(name: str) -> str:
    """Return the value of environment variable ``name``.

    An unset or empty variable is fatal: the process exits through ``die``
    with the message ``<name> not set``.
    """
    value = os.environ.get(name)
    if value:
        return value
    die(f"{name} not set")
    return value


def load_api_key() -> str:
    """Read the OpenSubtitles API key from disk.

    OPENSUBTITLES_API_KEY holds the *path* of the key file, not the key
    itself. The file content is returned with surrounding whitespace removed.
    Exits via ``env_or_die`` when the variable is unset or empty.
    """
    key_file = env_or_die("OPENSUBTITLES_API_KEY")
    with open(key_file) as handle:
        contents = handle.read()
    return contents.strip()


def jellyfin(path: str, params: dict | None = None) -> dict:
    """GET a Jellyfin API path and return the decoded JSON response.

    The request is not made from this machine: it runs
    ``docker exec jellyfin curl`` on the NULLSTONE host over SSH, so the URL
    is resolved inside the jellyfin container (JF_BASE).

    Args:
        path: API path appended to JF_BASE, e.g. ``"/Items"``.
        params: Optional query parameters. Commas are left unescaped so
            comma-separated list values are sent verbatim.

    Returns:
        The parsed JSON body.

    Raises:
        subprocess.CalledProcessError: ssh, docker or curl exited non-zero.
            curl runs with ``-f``, so an HTTP error status counts as a
            failure instead of yielding an unparseable body.
        json.JSONDecodeError: the response body was not valid JSON.

    Exits via ``env_or_die`` when JELLYFIN_TOKEN is unset.
    """
    tok = env_or_die("JELLYFIN_TOKEN")
    qs = "?" + urllib.parse.urlencode(params, safe=",") if params else ""
    url = JF_BASE + path + qs
    # ssh hands this string to the remote shell, which parses it again. Quote
    # the whole header so a token containing a quote or shell metacharacter
    # cannot terminate the argument and inject extra words.
    header = shlex.quote(f"X-Emby-Token: {tok}")
    # -sSf: quiet, but still print errors and fail on HTTP >= 400 (the same
    # flags os_search_imdb uses for its curl call).
    remote_cmd = f"docker exec jellyfin curl -sSf -H {header} {shlex.quote(url)}"
    # NOTE(review): the token is part of the ssh argv and therefore visible
    # in the local process list while the command runs.
    out = subprocess.check_output(["ssh", NULLSTONE, remote_cmd], text=True)
    return json.loads(out)


def list_episodes(series_id: str) -> list[dict]:
    """Return the Episode items found under a Jellyfin series.

    Queries ``/Items`` recursively below ``series_id``, asking for the media
    path, season/episode index numbers and provider ids of each episode, with
    results requested sorted by season then episode number. The ``Items``
    list of the response is returned as-is.
    """
    query = dict(
        ParentId=series_id,
        IncludeItemTypes="Episode",
        Recursive="true",
        Fields="Path,ParentIndexNumber,IndexNumber,ProviderIds",
        SortBy="ParentIndexNumber,IndexNumber",
    )
    response = jellyfin("/Items", query)
    return response["Items"]


def imdb_strip(s: str | None) -> str | None:
    """Drop a leading ``tt`` from an IMDB id.

    ``None`` and the empty string both yield ``None``; an id without the
    prefix is returned unchanged.
    """
    if s and s.startswith("tt"):
        return s[2:]
    return s or None


def os_search_imdb(api_key: str, imdb_no_tt: str) -> tuple[int, int] | None:
    """Look up the show's primary catalogue (season, episode) by per-ep IMDB id.

    Uses OS feature_details S/E (which appears to align with what Addic7ed
    indexes for at least the test shows). Search calls do not consume the
    daily quota. If the resulting download mismatches expected dialogue,
    consider re-running with the v2 OS REST path which uses imdb_id directly.

    Args:
        api_key: OpenSubtitles REST key, sent as the ``Api-Key`` header.
        imdb_no_tt: Episode IMDB id without the ``tt`` prefix.

    Returns:
        ``(season, episode)`` from the first of up to five English search
        hits that carries both numbers, or ``None`` when no hit does.

    Raises:
        subprocess.CalledProcessError: curl failed (``-f`` makes HTTP error
            statuses count as failures).
        ValueError: the response was not decodable JSON
            (``json.JSONDecodeError`` / ``UnicodeDecodeError``).
    """
    # Build the query with urlencode so an unexpected character in the id
    # cannot corrupt the URL; for plain numeric ids the result is unchanged.
    query = urllib.parse.urlencode({
        "imdb_id": imdb_no_tt,
        "languages": "en",
        "per_page": 5,
    })
    cmd = ["curl", "-sSf",
           "-H", f"Api-Key: {api_key}",
           "-H", f"User-Agent: {USER_AGENT}",
           f"{OS_BASE}/subtitles?{query}"]
    raw = subprocess.check_output(cmd)
    payload = json.loads(raw.decode())
    # `or {}` / `or []` rather than .get defaults: a key that is present but
    # null would otherwise raise AttributeError on the next lookup.
    for hit in payload.get("data") or []:
        details = (hit.get("attributes") or {}).get("feature_details") or {}
        season = details.get("season_number")
        episode = details.get("episode_number")
        # Truthiness is deliberate and unchanged: a missing, null or zero
        # season/episode is treated as "no usable numbering" for this hit.
        if season and episode:
            try:
                return int(season), int(episode)
            except (TypeError, ValueError):
                # Non-integer numbering on this hit; try the next one.
                continue
    return None


def episode_to_paths(ep: dict) -> tuple[str, str]:
    """Return (remote_dir, base_filename) for sidecar placement on nullstone.

    Translates the container-side media path reported by Jellyfin
    (``ep["Path"]``) into the host path by rewriting ``/media/`` to
    ``/home/user/media/``, then splits it into the containing directory and
    the file name without its final extension.

    Only the first ``/media/`` is rewritten. Replacing every occurrence
    corrupted paths that contain a later directory named ``media``.

    Raises:
        KeyError: the item has no ``Path`` field.
    """
    container_path = ep["Path"]
    host_path = container_path.replace("/media/", "/home/user/media/", 1)
    remote_dir, filename = os.path.split(host_path)
    return remote_dir, os.path.splitext(filename)[0]


def addic7ed_safe_name(series: str, year: int | None, fox_s: int, fox_e: int) -> str:
    """Build a synthetic release filename that subliminal+addic7ed match.

    '!', '?' and ':' are removed from the series name ('!' breaks the
    matcher) and spaces become dots. The year is included when known,
    followed by a zero-padded SxxEyy tag and a fixed ``HDTV.x264.mkv`` tail.
    """
    # One pass: map space -> '.', delete the three punctuation characters.
    table = str.maketrans(" ", ".", "!?:")
    pieces = [series.translate(table)]
    if year:
        pieces.append(f"{year}")
    pieces.append(f"S{fox_s:02d}E{fox_e:02d}")
    pieces.extend(("HDTV", "x264", "mkv"))
    return ".".join(pieces)


def write_sidecar_remote(content: bytes, remote_path: str) -> None:
    """Write ``content`` to ``remote_path`` on the NULLSTONE host over SSH.

    The bytes are streamed to a remote ``cat > <path>`` on stdin; the path is
    shell-quoted for the remote shell. A non-zero exit status is fatal and
    terminates the process through ``die``.
    """
    remote_cmd = "cat > " + shlex.quote(remote_path)
    ssh = subprocess.Popen(["ssh", NULLSTONE, remote_cmd],
                           stdin=subprocess.PIPE)
    ssh.communicate(content)
    if ssh.returncode == 0:
        return
    die(f"failed writing {remote_path}")


def main() -> int:
    """CLI entry point: fetch Addic7ed English subtitles for selected episodes.

    Steps, per selected episode:
      1. Translate the library S/E to the catalogue S/E via an OpenSubtitles
         search on the episode's IMDB id.
      2. Build a synthetic Video name from series, year and the translated
         S/E, and list Addic7ed English subtitles for it.
      3. Pick the candidate that scores highest against that Video.
      4. Unless DRY_RUN=1, download it and write it next to the media file on
         the remote host as ``<basename>.eng.srt``.

    A failure in steps 1-3 or in the download is recorded and the loop moves
    on; a summary of failures is printed to stderr at the end. A failed
    remote write is fatal (``write_sidecar_remote`` exits the process).

    Returns:
        0 when at least one episode succeeded (in dry-run: was matched),
        otherwise 2. Missing selection flags or an empty selection exit with
        status 1 through ``die``.
    """
    # Local import so this function is self-contained; compute_score ships
    # with the subliminal package the file already depends on.
    from subliminal import compute_score

    ap = argparse.ArgumentParser()
    ap.add_argument("series_id")
    ap.add_argument("--season", type=int, default=None)
    ap.add_argument("--start", type=int, default=1)
    ap.add_argument("--end", type=int, default=10**6)
    ap.add_argument("--all", action="store_true")
    args = ap.parse_args()

    if args.season is None and not args.all:
        die("pass --season N or --all")

    api_key = load_api_key()
    dry = os.environ.get("DRY_RUN") == "1"

    eps = list_episodes(args.series_id)
    work = []
    for ep in eps:
        s, n = ep.get("ParentIndexNumber"), ep.get("IndexNumber")
        if s is None or n is None:
            # Without both numbers the item can be neither filtered nor
            # remapped; skip it instead of crashing on KeyError/TypeError.
            print(f"[skip] {ep.get('Name', '?')} — no season/episode number",
                  file=sys.stderr)
            continue
        if not args.all and s != args.season:
            continue
        # --start/--end bound the episode number within each selected season.
        if not (args.start <= n <= args.end):
            continue
        work.append(ep)
    if not work:
        die("no episodes selected")

    print(f"[plan] {len(work)} episodes selected", file=sys.stderr)

    ok = 0
    fail = []  # (label, short reason code) pairs for the final summary
    for ep in work:
        s, n = ep["ParentIndexNumber"], ep["IndexNumber"]
        label = f"libS{s:02}E{n:02} {ep['Name']}"

        imdb = imdb_strip(ep.get("ProviderIds", {}).get("Imdb"))
        if not imdb:
            print(f"[skip] {label} — no IMDB id", file=sys.stderr)
            fail.append((label, "no-imdb"))
            continue

        try:
            fox = os_search_imdb(api_key, imdb)
        except subprocess.CalledProcessError as e:
            print(f"[skip] {label} — OS search err {e.returncode}", file=sys.stderr)
            fail.append((label, "os-search"))
            continue
        except ValueError as e:
            # json.JSONDecodeError / UnicodeDecodeError: the search answered
            # with something that is not JSON. Treat it as a per-episode
            # failure rather than aborting the whole batch.
            print(f"[skip] {label} — OS search bad response: {type(e).__name__}",
                  file=sys.stderr)
            fail.append((label, "os-search"))
            continue
        if fox is None:
            print(f"[skip] {label} — OS has no S/E for imdb={imdb}", file=sys.stderr)
            fail.append((label, "no-fox-se"))
            continue
        fox_s, fox_e = fox

        # series name + year — pull from path or item
        series_name = ep.get("SeriesName") or "Show"
        year = None
        ymatch = re.search(r"\((\d{4})\)", ep.get("Path", ""))
        if ymatch:
            year = int(ymatch.group(1))

        # The Video is synthetic: it carries the remapped S/E, not the
        # library file's real name, so Addic7ed is queried in its own
        # numbering.
        v_name = addic7ed_safe_name(series_name, year, fox_s, fox_e)
        v = Video.fromname(v_name)

        try:
            hits = list_subtitles([v], {Language("eng")},
                                  providers=["addic7ed"]).get(v, [])
        except Exception as e:
            # Provider errors are varied (network, parsing, throttling);
            # record and move on to the next episode.
            print(f"[skip] {label} — addic7ed list err: {type(e).__name__}",
                  file=sys.stderr)
            fail.append((label, "a7d-list"))
            continue

        if not hits:
            print(f"[skip] {label} — addic7ed 0 subs (foxS{fox_s:02}E{fox_e:02})",
                  file=sys.stderr)
            fail.append((label, "a7d-no-hits"))
            continue

        # list_subtitles() returns candidates in provider listing order, not
        # ranked, so score each one against the Video. max() keeps the first
        # of equally-scored candidates, which is what hits[0] used to pick.
        pick = max(hits, key=lambda sub: compute_score(sub, v))
        print(f"[pick] {label} -> foxS{fox_s:02}E{fox_e:02} a7d={pick.id}",
              file=sys.stderr)

        if dry:
            # Dry run counts a successful match as ok; nothing is downloaded.
            ok += 1
            continue

        try:
            download_subtitles([pick])
        except Exception as e:
            print(f"[fail] {label} — addic7ed dl err: {type(e).__name__}: {e}",
                  file=sys.stderr)
            fail.append((label, "a7d-dl"))
            continue

        if not pick.content:
            print(f"[fail] {label} — empty content", file=sys.stderr)
            fail.append((label, "empty"))
            continue

        remote_dir, base = episode_to_paths(ep)
        dest = f"{remote_dir}/{base}.eng.srt"
        write_sidecar_remote(pick.content, dest)
        print(f"[ok]  {label} -> {dest}", file=sys.stderr)
        ok += 1

    print(f"\n[done] ok={ok}/{len(work)} failures={len(fail)}", file=sys.stderr)
    for lab, why in fail:
        print(f"  - {lab}: {why}", file=sys.stderr)
    return 0 if ok else 2


if __name__ == "__main__":
    # Run the CLI and use main()'s return value as the process exit status.
    raise SystemExit(main())