legacy-arrflix/processes/subtitles/lib/sub-rest-fetch.py
s8n 23520df2df processes/subtitles: v2 REST fetcher + AD S02E01-E12 subbed
Adds lib/sub-rest-fetch.py: direct OpenSubtitles REST, looks up subs by
per-episode IMDB id (e.g. tt0511631) instead of the plugin's
(parent_imdb_id, season, episode) combo path. This sidesteps shows where
library numbering diverges from OpenSubtitles' catalogued numbering --
American Dad uses Hulu S1=7 eps; OS uses Fox S1=23 eps; the plugin path
returns 0 hits past S01E07 even though every per-episode IMDB id is
correct.

Recipe README updated to surface the two paths (v1 plugin / v2 REST) and
recommend v2 by default. American Dad run log now shows 19/58 episodes
subbed (S01 7/7 via v1, S02E01-E12 via v2). S02E13-S04 (39 eps) deferred
to next 20/day quota windows.

Quirk fixed in v2: OpenSubtitles /download endpoint consistently returns
HTTP 503 to Python urllib.request despite identical headers/body via curl.
_curl() shim routes all OS API calls through curl. Each 503 still
consumes a download slot, so urllib path was unsafe to retry on.
2026-05-09 23:09:09 +01:00

285 lines
9.6 KiB
Python
Executable file

#!/usr/bin/env python3
"""Subtitle fetcher v2 — direct OpenSubtitles REST API.
Bypasses the Jellyfin OpenSubtitles plugin to dodge season/episode numbering
mismatches. Looks each library episode up by its per-episode IMDB id, picks
the best English match, downloads via the REST endpoint, and writes the
sidecar straight onto nullstone next to the media file (via SSH).
Why v2 exists: see ../CHANGELOG.md "Known break" — American Dad library
uses Hulu season numbering, OS catalogues by Fox airing order; the plugin
queries by (parent_imdb_id, season, episode) so library S02E01 → OS S01E08
returned 0 hits even though the per-episode IMDB id (tt0511631) is real.
Picker: highest download_count among non-HI, non-MT, non-AI, non-Forced
candidates; 23.976fps preferred. Falls back to all candidates if every match
is HI/MT/AI/Forced.
Usage:
sub-rest-fetch.py <series-id> --season <N> [--start <ep>] [--end <ep>]
sub-rest-fetch.py <series-id> --all
Env (required):
JELLYFIN_TOKEN X-Emby-Token for nullstone Jellyfin
OPENSUBTITLES_API_KEY Path to file holding the API key
OPENSUBTITLES_USER OS account username
OPENSUBTITLES_PASS OS account password
Env (optional):
NULLSTONE SSH target, default user@192.168.0.100
DRY_RUN=1 search + pick only, no download
"""
from __future__ import annotations
import argparse
import json
import os
import shlex
import subprocess
import sys
import time
import urllib.parse
# Root URL of the OpenSubtitles REST API (v1); endpoint paths are appended to it.
OS_BASE = "https://api.opensubtitles.com/api/v1"
# Value sent as the User-Agent header on OpenSubtitles requests.
USER_AGENT = "arrflix v1.0.0"
# Jellyfin base URL as seen by the curl that jellyfin() runs via
# `docker exec jellyfin` on the SSH target.
JF_BASE = "http://localhost:8096"
# SSH target for Jellyfin queries and sidecar writes; override with $NULLSTONE.
NULLSTONE = os.environ.get("NULLSTONE", "user@192.168.0.100")
def die(msg: str, code: int = 1) -> None:
    """Report a fatal error on stderr and terminate the process.

    Args:
        msg: Description of the failure; printed with an ``ERROR: `` prefix.
        code: Exit status for the process (default 1).

    Raises:
        SystemExit: Always; this function does not return normally.
    """
    line = f"ERROR: {msg}"
    print(line, file=sys.stderr)
    raise SystemExit(code)
def env_or_die(name: str) -> str:
    """Return the value of environment variable *name*, aborting if unusable.

    A variable that is unset and one that is set to the empty string are
    treated the same way: both are reported through ``die``.

    Args:
        name: Name of the environment variable to read.

    Returns:
        The non-empty value of the variable.
    """
    value = os.environ.get(name)
    if value:
        return value
    die(f"{name} not set")
    return value
def load_api_key() -> str:
    """Read the OpenSubtitles API key from the file named by the environment.

    ``OPENSUBTITLES_API_KEY`` holds a *path*, not the key itself; the file's
    contents are returned with surrounding whitespace removed.

    Returns:
        The stripped contents of the key file.
    """
    key_file = env_or_die("OPENSUBTITLES_API_KEY")
    with open(key_file) as handle:
        contents = handle.read()
    return contents.strip()
def _curl(url: str, method: str = "GET", headers: dict | None = None,
          body: dict | None = None, binary: bool = False) -> bytes:
    """Perform one HTTP request by shelling out to ``curl``.

    OpenSubtitles' frontend rejects urllib (consistent 503 on /download)
    while curl works against the same endpoint and headers, so every call
    is routed through curl.

    Args:
        url: Full request URL.
        method: HTTP method handed to ``curl -X``.
        headers: Optional mapping; each item becomes ``-H "Name: value"``.
        body: Optional object; when not ``None`` it is JSON-serialised and
            sent as the request body.
        binary: Unused. The response is always returned as raw bytes; the
            parameter is kept so existing call signatures stay valid.

    Returns:
        curl's stdout, i.e. the raw response body.

    Raises:
        subprocess.CalledProcessError: curl exited non-zero. Because of
            ``-f`` this includes HTTP error statuses.
    """
    cmd = ["curl", "-sSf", "-X", method, url]
    for k, v in (headers or {}).items():
        cmd += ["-H", f"{k}: {v}"]
    if body is None:
        return subprocess.check_output(cmd)
    # The body can carry credentials (the /login body holds the account
    # password). Command-line arguments are visible to other local users
    # through the process list, so feed the payload on stdin instead of
    # placing it in argv.
    # NOTE(review): header values (Api-Key, bearer token) are still passed
    # in argv; moving them would need `curl --config -` or a header file.
    cmd += ["--data-binary", "@-"]
    payload = json.dumps(body).encode("utf-8")
    return subprocess.check_output(cmd, input=payload)
def http_json(url: str, method: str = "GET", headers: dict | None = None,
              body: dict | None = None) -> dict:
    """Issue a request through ``_curl`` and parse the response as JSON.

    Args:
        url: Full request URL.
        method: HTTP method.
        headers: Optional request headers.
        body: Optional JSON-serialisable request body.

    Returns:
        The decoded JSON document.
    """
    payload = _curl(url, method=method, headers=headers, body=body)
    text = payload.decode()
    return json.loads(text)
def http_get_bytes(url: str) -> bytes:
    """GET *url* via ``_curl`` with the script's User-Agent; return raw bytes."""
    ua_header = {"User-Agent": USER_AGENT}
    return _curl(url, method="GET", headers=ua_header)
def jellyfin(path: str, params: dict | None = None) -> dict:
    """Run a Jellyfin API call inside the container on nullstone via SSH.

    The request is made by ``curl`` executed with ``docker exec jellyfin`` on
    the SSH target, authenticated with the ``X-Emby-Token`` header taken from
    ``$JELLYFIN_TOKEN``.

    Args:
        path: API path appended to ``JF_BASE`` (e.g. ``/Items``).
        params: Optional query parameters; commas are left unescaped.

    Returns:
        The parsed JSON response.

    Raises:
        subprocess.CalledProcessError: The ssh command exited non-zero.
        json.JSONDecodeError: The remote output was not valid JSON.
    """
    tok = env_or_die("JELLYFIN_TOKEN")
    url = JF_BASE + path
    if params:
        url += "?" + urllib.parse.urlencode(params, safe=",")
    # ssh hands the remote side a single string that a shell re-parses, so
    # every argument is quoted individually. In particular the token is no
    # longer pasted into a hand-written '...' literal, where a quote or other
    # shell metacharacter in it would break or alter the remote command.
    remote_argv = ["docker", "exec", "jellyfin", "curl", "-s",
                   "-H", f"X-Emby-Token: {tok}", url]
    remote_cmd = " ".join(shlex.quote(arg) for arg in remote_argv)
    out = subprocess.check_output(["ssh", NULLSTONE, remote_cmd], text=True)
    return json.loads(out)
def list_episodes(series_id: str) -> list[dict]:
    """Fetch every episode item under a Jellyfin series.

    Args:
        series_id: Jellyfin item id used as ``ParentId`` for the query.

    Returns:
        The ``Items`` list of the response, requested sorted by season then
        episode number, with path, index numbers and provider ids included.
    """
    query = {
        "ParentId": series_id,
        "IncludeItemTypes": "Episode",
        "Recursive": "true",
        "Fields": "Path,ParentIndexNumber,IndexNumber,ProviderIds",
        "SortBy": "ParentIndexNumber,IndexNumber",
    }
    response = jellyfin("/Items", query)
    return response["Items"]
def os_login(api_key: str, user: str, password: str) -> str:
    """Log in to OpenSubtitles and return the session token.

    Args:
        api_key: OpenSubtitles API key sent in the ``Api-Key`` header.
        user: Account username.
        password: Account password.

    Returns:
        The ``token`` field of the login response.
    """
    request_headers = {
        "Api-Key": api_key,
        "Content-Type": "application/json",
        "User-Agent": USER_AGENT,
    }
    credentials = {"username": user, "password": password}
    reply = http_json(f"{OS_BASE}/login", "POST",
                      headers=request_headers, body=credentials)
    return reply["token"]
def os_user_info(api_key: str, bearer: str) -> dict:
    """Return the ``data`` object of the OpenSubtitles ``/infos/user`` reply.

    Args:
        api_key: OpenSubtitles API key sent in the ``Api-Key`` header.
        bearer: Session token sent as ``Authorization: Bearer ...``.
    """
    request_headers = {
        "Api-Key": api_key,
        "Authorization": f"Bearer {bearer}",
        "User-Agent": USER_AGENT,
    }
    reply = http_json(f"{OS_BASE}/infos/user", headers=request_headers)
    return reply["data"]
def os_search(api_key: str, imdb_id: str) -> list[dict]:
    """Search OpenSubtitles for English subtitles of one IMDB title.

    Args:
        api_key: OpenSubtitles API key sent in the ``Api-Key`` header.
        imdb_id: IMDB id without the 'tt' prefix, per OS convention.

    Returns:
        The ``data`` list of the response, or an empty list if it is absent.
    """
    search_url = f"{OS_BASE}/subtitles?imdb_id={imdb_id}&languages=en"
    request_headers = {"Api-Key": api_key, "User-Agent": USER_AGENT}
    reply = http_json(search_url, headers=request_headers)
    return reply.get("data", [])
def pick_best(hits: list[dict]) -> dict | None:
    """Choose the preferred subtitle from a list of search hits.

    Hits flagged hearing-impaired, machine-translated, AI-translated or
    foreign-parts-only are dropped; if that leaves nothing, all hits are
    considered. Within the remaining set, hits at 23.976 fps (within 0.01)
    are preferred when any exist. The winner is the hit with the highest
    ``download_count``; on a tie the earliest hit in *hits* wins.

    The input list is not modified.

    Args:
        hits: Search results; each must have an ``attributes`` mapping.

    Returns:
        The chosen hit, or ``None`` when *hits* is empty.
    """
    unwanted_flags = ("hearing_impaired", "machine_translated",
                      "ai_translated", "foreign_parts_only")

    def attr(h, k):
        return h["attributes"].get(k)

    clean = [h for h in hits
             if not any(attr(h, flag) for flag in unwanted_flags)]
    candidates = clean or hits
    if not candidates:
        return None
    fps2398 = [h for h in candidates
               if abs((attr(h, "fps") or 0) - 23.976) < 0.01]
    pool = fps2398 or candidates
    # max() returns the first maximal element, which matches taking element 0
    # of a stable descending sort, without reordering the caller's list
    # (the pool can be `hits` itself in the fallback case).
    return max(pool, key=lambda h: attr(h, "download_count") or 0)
def os_download(api_key: str, bearer: str, file_id: int) -> dict:
    """Request a download for one subtitle file from OpenSubtitles.

    Args:
        api_key: OpenSubtitles API key sent in the ``Api-Key`` header.
        bearer: Session token sent as ``Authorization: Bearer ...``.
        file_id: Id of the subtitle file to download.

    Returns:
        The parsed JSON response of ``POST /download``.
    """
    request_headers = {
        "Api-Key": api_key,
        "Authorization": f"Bearer {bearer}",
        "Content-Type": "application/json",
        "User-Agent": USER_AGENT,
    }
    request_body = {"file_id": file_id}
    return http_json(f"{OS_BASE}/download", "POST",
                     headers=request_headers, body=request_body)
def write_sidecar_remote(content: bytes, remote_path: str) -> None:
    """Write *content* to *remote_path* on the SSH target.

    The bytes are piped over ssh into a remote ``cat > <path>``; the path is
    shell-quoted. A non-zero ssh exit status is fatal and reported via
    ``die``.

    Args:
        content: Raw file contents to write.
        remote_path: Destination path on the remote host.
    """
    remote_cmd = f"cat > {shlex.quote(remote_path)}"
    finished = subprocess.run(["ssh", NULLSTONE, remote_cmd], input=content)
    if finished.returncode != 0:
        die(f"failed writing {remote_path}")
def imdb_strip(s: str | None) -> str | None:
    """Drop a leading ``tt`` from an IMDB id.

    Args:
        s: IMDB id, with or without the ``tt`` prefix, or ``None``.

    Returns:
        The id without the prefix; ``None`` when *s* is ``None`` or empty.
    """
    if s and s.startswith("tt"):
        return s[2:]
    return s or None
def episode_to_paths(ep: dict) -> tuple[str, str]:
    """Return (remote_dir, base_filename) for sidecar placement.

    The episode's ``Path`` is a path inside the Jellyfin container, e.g.
    ``/media/tv/Show/Season XX/Show - SxxExx - Title.mkv``. Its leading
    ``/media/`` is mapped to ``/home/user/media/`` to obtain the host path.
    Only the leading prefix is rewritten, so a ``/media/`` segment deeper in
    the path is left untouched. Paths that do not start with ``/media/``
    are passed through unchanged.

    Args:
        ep: Jellyfin episode item containing a ``Path`` entry.

    Returns:
        A tuple of the host directory holding the media file and the media
        file's name without its extension.
    """
    container_prefix = "/media/"
    host_prefix = "/home/user/media/"
    container_path = ep["Path"]
    if container_path.startswith(container_prefix):
        host_path = host_prefix + container_path[len(container_prefix):]
    else:
        host_path = container_path
    remote_dir, filename = os.path.split(host_path)
    base = os.path.splitext(filename)[0]
    return remote_dir, base
def main() -> int:
    """Command-line entry point: fetch English subtitles for selected episodes.

    Steps: parse arguments, log in to OpenSubtitles and print the download
    quota, list the series' episodes from Jellyfin, filter them by season and
    episode range, then for each episode search by IMDB id, pick a candidate,
    request a download link, fetch the file and write it next to the media
    file as ``<base>.eng.srt``. With ``DRY_RUN=1`` processing of an episode
    stops after the pick, which is counted as a success.

    A failed download request or a failed fetch of the returned link stops
    the run, so that no further download slots are spent. A failed search
    only skips that episode. The summary is printed in all of these cases.

    Returns:
        0 if at least one episode succeeded, otherwise 2.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("series_id")
    ap.add_argument("--season", type=int, default=None)
    ap.add_argument("--start", type=int, default=1)
    ap.add_argument("--end", type=int, default=10**6)
    ap.add_argument("--all", action="store_true")
    args = ap.parse_args()
    if args.season is None and not args.all:
        die("pass --season N or --all")

    api_key = load_api_key()
    user = env_or_die("OPENSUBTITLES_USER")
    pw = env_or_die("OPENSUBTITLES_PASS")
    dry = os.environ.get("DRY_RUN") == "1"

    bearer = os_login(api_key, user, pw)
    info = os_user_info(api_key, bearer)
    print(f"[quota] remaining={info['remaining_downloads']}/{info['allowed_downloads']}, "
          f"resets in {info['reset_time']}", file=sys.stderr)

    work = []
    for ep in list_episodes(args.series_id):
        s = ep.get("ParentIndexNumber")
        n = ep.get("IndexNumber")
        if s is None or n is None:
            # Without both numbers the item can be neither range-filtered
            # nor labelled; skip it instead of crashing the whole run.
            print(f"[skip] {ep.get('Name', '?')} — no season/episode number",
                  file=sys.stderr)
            continue
        if not args.all and s != args.season:
            continue
        if not (args.start <= n <= args.end):
            continue
        work.append(ep)
    if not work:
        die("no episodes selected")
    print(f"[plan] {len(work)} episodes selected", file=sys.stderr)
    if not dry and len(work) > info["remaining_downloads"]:
        print(f"[warn] {len(work)} > quota {info['remaining_downloads']}; "
              f"will halt mid-run", file=sys.stderr)

    ok = 0
    fail = []
    for ep in work:
        s, n = ep["ParentIndexNumber"], ep["IndexNumber"]
        label = f"S{s:02}E{n:02} {ep['Name']}"
        imdb = imdb_strip(ep.get("ProviderIds", {}).get("Imdb"))
        if not imdb:
            print(f"[skip] {label} — no IMDB id", file=sys.stderr)
            fail.append((label, "no-imdb"))
            continue
        try:
            hits = os_search(api_key, imdb)
        except subprocess.CalledProcessError as e:
            # A search does not go through /download, so carry on with the
            # remaining episodes.
            print(f"[fail] {label} search (curl exit {e.returncode})", file=sys.stderr)
            fail.append((label, f"search-curl-{e.returncode}"))
            continue
        pick = pick_best(hits)
        if not pick:
            print(f"[skip] {label} — 0 hits for imdb={imdb}", file=sys.stderr)
            fail.append((label, "no-hits"))
            continue
        a = pick["attributes"]
        files = a.get("files") or []
        if not files:
            print(f"[skip] {label} — picked subtitle lists no files", file=sys.stderr)
            fail.append((label, "no-files"))
            continue
        f = files[0]
        print(f"[pick] {label} imdb={imdb} fid={f['file_id']} dl={a.get('download_count')} "
              f"fps={a.get('fps')} fname={f.get('file_name')}", file=sys.stderr)
        if dry:
            ok += 1
            continue
        try:
            dl = os_download(api_key, bearer, f["file_id"])
        except subprocess.CalledProcessError as e:
            print(f"[fail] {label} download (curl exit {e.returncode})", file=sys.stderr)
            fail.append((label, f"dl-curl-{e.returncode}"))
            break  # may be quota; stop run
        link = dl.get("link")
        if not link:
            print(f"[fail] {label} no download link in response: {dl}", file=sys.stderr)
            fail.append((label, "no-link"))
            break
        try:
            content = http_get_bytes(link)
        except subprocess.CalledProcessError as e:
            # The download request for this file already succeeded; stop
            # rather than keep requesting downloads whose links cannot be
            # fetched.
            print(f"[fail] {label} fetch link (curl exit {e.returncode})", file=sys.stderr)
            fail.append((label, f"fetch-curl-{e.returncode}"))
            break
        remote_dir, base = episode_to_paths(ep)
        dest = f"{remote_dir}/{base}.eng.srt"
        write_sidecar_remote(content, dest)
        print(f"[ok] {label} -> {dest} (remaining={dl.get('remaining')})",
              file=sys.stderr)
        ok += 1
        time.sleep(0.5)  # be polite

    print(f"\n[done] ok={ok}/{len(work)} failures={len(fail)}", file=sys.stderr)
    for lab, why in fail:
        print(f"  - {lab}: {why}", file=sys.stderr)
    return 0 if ok else 2
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())