Tfe

Ongi etorri tfe-ren webgunera...

Blog/guau/download_guau_podcast.py

(Deskargatu)
#!/usr/bin/env python3
"""
Downloads podcast audio episodes from Guau, given either:
  - a series page: https://guau.eus/s/<slug>
  - a direct RSS feed: https://guau.eus/series/<id>/rss.xml

Guau's RSS feed usually bundles every season of a series
(multiple itunes:season tags within one feed).

Dependencies: Python standard library only.
"""

from __future__ import annotations

import argparse
import re
import ssl
import sys
import urllib.error
import urllib.request
import xml.etree.ElementTree as ET
from pathlib import Path
from urllib.parse import urlparse, urlunparse

# Hostnames accepted as belonging to guau.eus.
GUAU_HOSTS = ("guau.eus", "www.guau.eus")
# XML namespace map for itunes:* elements in the RSS feed.
ITUNES_NS = {"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd"}
# Default User-Agent header sent with every HTTP request.
DEFAULT_UA = (
    "Mozilla/5.0 (compatible; GuauPodcastFetcher/1.0; +https://guau.eus/)"
)


def _parse_itunes_int(el: ET.Element | None) -> int | None:
    if el is None or not el.text:
        return None
    t = str(el.text).strip()
    if t.lstrip("-").isdigit():
        n = int(t)
        return n if n >= 0 else None
    return None


def fetch(url: str, timeout: int, user_agent: str) -> bytes:
    """Perform an HTTP GET and return the raw response body."""
    headers = {"User-Agent": user_agent, "Accept": "*/*"}
    request = urllib.request.Request(url, headers=headers)
    context = ssl.create_default_context()
    with urllib.request.urlopen(
        request, timeout=timeout, context=context
    ) as response:
        body = response.read()
    return body


def normalize_guau_url(url: str) -> str:
    """Normalize a user-supplied URL to an absolute https:// form.

    A scheme-less input such as "guau.eus/s/slug" or "//guau.eus/s/slug"
    becomes "https://guau.eus/s/slug"; URLs that already carry a scheme
    are returned unchanged apart from surrounding whitespace.

    Fix: the original had a second `not p.scheme` check after the first
    re-parse, which was unreachable (the scheme is always set by then);
    that dead branch is removed.
    """
    cleaned = url.strip()
    parts = urlparse(cleaned)
    if not parts.scheme:
        # No scheme given: assume https. lstrip("/") also covers
        # protocol-relative inputs like "//guau.eus/...".
        parts = urlparse("https://" + cleaned.lstrip("/"))
    return urlunparse(parts)


def discover_rss_url(series_page_html: str) -> str | None:
    # Lehenengo /series/<id>/rss.xml esteka erakusten den sailari dagokio
    # (orrialdearen hydratazioa, erlazionatutako «related» blokeak baino lehen).
    matches = re.findall(
        r"https://guau\.eus/series/(\d+)/rss\.xml",
        series_page_html,
    )
    if not matches:
        return None
    sid = matches[0]
    return f"https://guau.eus/series/{sid}/rss.xml"


def parse_rss_items(
    rss_xml: bytes,
) -> list[tuple[str, str, int | None, int | None, int | None]]:
    """Return (title, mp3_url, byte_count, season, episode) per <item>.

    byte_count is the enclosure's length attribute (positive, else None);
    season/episode come from itunes:season / itunes:episode. Items
    lacking a usable enclosure URL are skipped.
    """
    ns = {"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd"}

    def opt_int(el):
        # Non-negative integer from the element's text, else None.
        if el is None or not el.text:
            return None
        text = str(el.text).strip()
        if not text.lstrip("-").isdigit():
            return None
        value = int(text)
        return value if value >= 0 else None

    channel = ET.fromstring(rss_xml).find("channel")
    if channel is None:
        return []
    episodes: list[tuple[str, str, int | None, int | None, int | None]] = []
    for item in channel.findall("item"):
        enclosure = item.find("enclosure")
        if enclosure is None:
            continue
        mp3_url = enclosure.get("url") or ""
        if not mp3_url:
            continue
        title_el = item.find("title")
        title = (title_el.text or "").strip() if title_el is not None else ""
        size: int | None = None
        raw = enclosure.get("length")
        if raw and str(raw).strip().isdigit():
            # A declared length of 0 counts as unknown.
            size = int(str(raw).strip()) or None
        season = opt_int(item.find("itunes:season", ns))
        episode = opt_int(item.find("itunes:episode", ns))
        episodes.append((title, mp3_url, size, season, episode))
    return episodes


def multi_season_feed(items: list[tuple[str, str, int | None, int | None, int | None]]) -> bool:
    """True, RSS-ak bi denboraldi edo gehiago baditu, edo 1 baino handiago den denboraldi bat (SxxEyy)."""
    seasons = {s for _, _, _, s, _ in items if s is not None}
    if len(seasons) > 1:
        return True
    if seasons and max(seasons) > 1:
        return True
    return False


def http_head_content_length(url: str, timeout: int, user_agent: str) -> int | None:
    """Issue a HEAD request and return a positive Content-Length, else None.

    Any network/HTTP failure or an unusable header value is treated as
    "size unknown" rather than raised.
    """
    request = urllib.request.Request(
        url,
        method="HEAD",
        headers={"User-Agent": user_agent, "Accept": "*/*"},
    )
    context = ssl.create_default_context()
    try:
        with urllib.request.urlopen(
            request, timeout=timeout, context=context
        ) as response:
            header = str(response.headers.get("Content-Length") or "").strip()
    except (urllib.error.HTTPError, urllib.error.URLError, OSError, ValueError):
        return None
    if header.isdigit():
        size = int(header)
        if size > 0:
            return size
    return None


def resolved_expected_bytes(
    mp3_url: str,
    rss_length: int | None,
    timeout: int,
    user_agent: str,
) -> int | None:
    if rss_length is not None:
        return rss_length
    return http_head_content_length(mp3_url, timeout, user_agent)


def safe_filename(
    title: str,
    index: int,
    season: int | None,
    episode: int | None,
    use_season_prefix: bool,
) -> str:
    title_part = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", title).strip()
    title_part = re.sub(r"\s+", " ", title_part)
    if not title_part:
        title_part = f"episode_{index:02d}"
    prefix = ""
    if use_season_prefix and season is not None and episode is not None:
        prefix = f"S{season:02d}E{episode:02d} - "
    elif use_season_prefix and season is not None:
        prefix = f"S{season:02d} - "
    elif use_season_prefix and episode is not None:
        prefix = f"E{episode:02d} - "
    base = prefix + title_part
    if not base.lower().endswith(".mp3"):
        base += ".mp3"
    return base


def download_file(url: str, dest: Path, timeout: int, user_agent: str) -> None:
    """Stream *url* into *dest* through a temporary "<name>.part" file.

    Progress percentages go to stderr only when it is a TTY and the
    server reported a Content-Length. On success the partial file is
    renamed onto dest; on failure the leftover partial file is removed.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    request = urllib.request.Request(
        url,
        headers={"User-Agent": user_agent, "Accept": "*/*"},
    )
    context = ssl.create_default_context()
    partial = dest.with_suffix(dest.suffix + ".part")
    try:
        with urllib.request.urlopen(
            request, timeout=timeout, context=context
        ) as response:
            expected = int(response.headers.get("Content-Length") or 0)
            received = 0
            with open(partial, "wb") as out:
                # 256 KiB reads until the stream is exhausted.
                for buf in iter(lambda: response.read(1024 * 256), b""):
                    out.write(buf)
                    received += len(buf)
                    if expected and sys.stderr.isatty():
                        pct = 100.0 * received / expected
                        sys.stderr.write(
                            f"\r  {pct:5.1f}% ({received // (1024 * 1024)} MiB)"
                        )
        if sys.stderr.isatty():
            sys.stderr.write("\n")
        partial.replace(dest)
    finally:
        # After a successful replace() the partial no longer exists;
        # this only cleans up after a failure mid-download.
        if partial.exists():
            partial.unlink(missing_ok=True)


def main() -> int:
    """CLI entry point: resolve the RSS feed URL, then download each episode.

    Returns a process exit code: 0 on success, 1 on any error. All
    user-facing messages are printed in Basque, matching the site.
    """
    ap = argparse.ArgumentParser(
        description="Deskargatu MP3ak guau.eus-etik (/s/…) edo RSS ofizialetik."
    )
    ap.add_argument(
        "url",
        help="Sailaren orria (https://guau.eus/s/slug) edo RSS jarioa (/series/<id>/rss.xml)",
    )
    ap.add_argument(
        "-o",
        "--output-dir",
        type=Path,
        default=Path("."),
        help="Helburuko karpeta (lehenetsia: uneko direktorioa)",
    )
    ap.add_argument(
        "-n",
        "--dry-run",
        action="store_true",
        help="Erakutsi atalak deskargatu gabe",
    )
    ap.add_argument(
        "--timeout",
        type=int,
        default=120,
        help="HTTP denbora-muga segundotan (lehenetsia: 120)",
    )
    ap.add_argument(
        "--user-agent",
        default=DEFAULT_UA,
        help="HTTP eskaeretarako User-Agent",
    )
    args = ap.parse_args()
    raw = normalize_guau_url(args.url)
    parsed = urlparse(raw)

    # Work out the RSS feed URL from whichever URL form the user gave.
    rss_url: str | None = None
    if re.search(r"/series/\d+/rss\.xml$", parsed.path or "", re.I):
        # Already a numeric-id feed URL; drop any query string.
        rss_url = raw.split("?", 1)[0]
    elif re.search(r"/series/[^/]+/rss\.xml$", parsed.path or "", re.I):
        # e.g. /series/<slug>/rss.xml may return 500; refuse it.
        print(
            "Erabili /s/<slug> orrialdearen URLa edo /series/<zenbaki id>/rss.xml.",
            file=sys.stderr,
        )
        return 1
    elif (parsed.netloc.lower().removesuffix(":443") in GUAU_HOSTS or not parsed.netloc) and (
        parsed.path or ""
    ).startswith("/s/"):
        # Series page: fetch the HTML and discover the embedded feed URL.
        try:
            html = fetch(raw, args.timeout, args.user_agent).decode(
                "utf-8", errors="replace"
            )
        except urllib.error.HTTPError as e:
            print(f"HTTP errorea orrialdea kargatzean: {e}", file=sys.stderr)
            return 1
        except Exception as e:
            print(f"Sareko errorea: {e}", file=sys.stderr)
            return 1
        rss_url = discover_rss_url(html)
        if not rss_url:
            print(
                "Ezin izan da RSS jarioa aurkitu orrian (series/<id>/rss.xml).",
                file=sys.stderr,
            )
            return 1
        print(f"RSS jarioa: {rss_url}")
    else:
        print(
            "URL ezezaguna. Espero daitekeena: https://guau.eus/s/<slug> "
            "edo https://guau.eus/series/<id>/rss.xml",
            file=sys.stderr,
        )
        return 1

    # Fetch and parse the feed itself.
    try:
        rss_body = fetch(rss_url, args.timeout, args.user_agent)
    except urllib.error.HTTPError as e:
        print(f"HTTP errorea RSS-ean: {e}", file=sys.stderr)
        return 1
    except Exception as e:
        print(f"Sareko errorea (RSS): {e}", file=sys.stderr)
        return 1

    items = parse_rss_items(rss_body)
    if not items:
        print("Ez dago enclosure duen atalik RSS-ean.", file=sys.stderr)
        return 1

    # Use SxxEyy filename prefixes only when the feed spans multiple seasons.
    use_season_prefix = multi_season_feed(items)
    out_dir: Path = args.output_dir.resolve()
    print(f"{len(items)} atal — karpeta: {out_dir}")
    if use_season_prefix:
        seasons = sorted({s for _, _, _, s, _ in items if s is not None})
        if seasons:
            print(
                f"Hainbat denboraldi jario berean (denboraldiak: {', '.join(map(str, seasons))}) "
                "— SxxEyy aurrizkia fitxategi-izenetan."
            )

    for i, (title, mp3_url, rss_length, season, episode) in enumerate(items, start=1):
        name = safe_filename(title, i, season, episode, use_season_prefix)
        dest = out_dir / name
        print(f"[{i}/{len(items)}] {name}")
        if args.dry_run:
            continue
        # Expected size: RSS length, else HEAD Content-Length (may be None).
        expected = resolved_expected_bytes(
            mp3_url, rss_length, args.timeout, args.user_agent
        )
        if dest.exists() and dest.is_file():
            local_sz = dest.stat().st_size
            if expected is not None:
                if local_sz == expected:
                    # Complete file already on disk; skip the download.
                    print(f"  badago eta osatuta ({local_sz} B), saltatua")
                    continue
                if local_sz > 0:
                    # Size mismatch: assume incomplete/stale, re-download.
                    print(
                        f"  osatu gabe edo zaharkitua ({local_sz}/{expected} B), berriro deskargatzen"
                    )
            else:
                if local_sz > 0:
                    # Unknown expected size: keep the existing file untouched.
                    print(
                        "  fitxategia badago, tamaina ezezaguna (RSS-ko length gabe, HEAD-k "
                        f"Content-Length gabe) — saltatua: {dest}"
                    )
                    continue
        try:
            download_file(mp3_url, dest, args.timeout, args.user_agent)
        except urllib.error.HTTPError as e:
            print(f"  HTTP hutsegitea: {e}", file=sys.stderr)
            return 1
        except Exception as e:
            print(f"  hutsegitea: {e}", file=sys.stderr)
            return 1
        print(f"  OK → {dest}")

    return 0


# Script entry point: propagate main()'s exit code to the shell.
if __name__ == "__main__":
    sys.exit(main())