# Blog/guau/download_guau_podcast.py
# (Deskargatu — download)
#!/usr/bin/env python3
"""
Guau-ko podcasteko audio atalak deskargatzen ditu honetatik:
- sailaren orria: https://guau.eus/s/<slug>
- RSS jario zuzena: https://guau.eus/series/<id>/rss.xml
Guau-ren RSS jarioak normalean sail bereko denboraldi guztiak biltzen ditu
(itunes:season etiketa anitz, jario berean).
Mendeko liburutegia: Python liburutegi estandarra soilik.
"""
from __future__ import annotations
import argparse
import re
import ssl
import sys
import urllib.error
import urllib.request
import xml.etree.ElementTree as ET
from pathlib import Path
from urllib.parse import urlparse, urlunparse
# Hostnames treated as guau.eus (with and without the www. prefix).
GUAU_HOSTS = ("guau.eus", "www.guau.eus")
# Namespace map for the itunes:* elements looked up in the RSS feed.
ITUNES_NS = {"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd"}
# Default User-Agent header sent with every HTTP request.
DEFAULT_UA = (
    "Mozilla/5.0 (compatible; GuauPodcastFetcher/1.0; +https://guau.eus/)"
)
def _parse_itunes_int(el: ET.Element | None) -> int | None:
if el is None or not el.text:
return None
t = str(el.text).strip()
if t.lstrip("-").isdigit():
n = int(t)
return n if n >= 0 else None
return None
def fetch(url: str, timeout: int, user_agent: str) -> bytes:
    """GET *url* with the given User-Agent and return the raw response body."""
    headers = {"User-Agent": user_agent, "Accept": "*/*"}
    request = urllib.request.Request(url, headers=headers)
    context = ssl.create_default_context()
    with urllib.request.urlopen(request, timeout=timeout, context=context) as response:
        return response.read()
def normalize_guau_url(url: str) -> str:
    """Normalize a user-supplied URL to an absolute ``https://`` form.

    A scheme-less input such as ``guau.eus/s/slug`` or ``//guau.eus/s/slug``
    is re-parsed with an explicit ``https://`` prefix (urlparse would otherwise
    put the host into ``path``). Inputs that already carry a scheme are simply
    re-serialized unchanged.

    The original also had a ``_replace(scheme="https")`` branch guarded by
    ``not p.scheme`` *after* the prefixing step; the guard could never be true
    at that point, so the dead branch has been removed.
    """
    cleaned = url.strip()
    parsed = urlparse(cleaned)
    if not parsed.scheme:
        # Re-parse rather than patching the ParseResult so host/path split
        # comes out right for bare "host/path" and "//host/path" inputs.
        parsed = urlparse("https://" + cleaned.lstrip("/"))
    return urlunparse(parsed)
def discover_rss_url(series_page_html: str) -> str | None:
# Lehenengo /series/<id>/rss.xml esteka erakusten den sailari dagokio
# (orrialdearen hydratazioa, erlazionatutako «related» blokeak baino lehen).
matches = re.findall(
r"https://guau\.eus/series/(\d+)/rss\.xml",
series_page_html,
)
if not matches:
return None
sid = matches[0]
return f"https://guau.eus/series/{sid}/rss.xml"
def parse_rss_items(
    rss_xml: bytes,
) -> list[tuple[str, str, int | None, int | None, int | None]]:
    """Parse feed XML into (title, mp3_url, byte_count, season, episode) tuples.

    byte_count is the enclosure's ``length`` attribute when it is a positive
    integer, else None. season/episode come from itunes:season and
    itunes:episode. Items without a usable enclosure URL are skipped.
    """
    channel = ET.fromstring(rss_xml).find("channel")
    if channel is None:
        return []
    episodes: list[tuple[str, str, int | None, int | None, int | None]] = []
    for item in channel.findall("item"):
        enclosure = item.find("enclosure")
        if enclosure is None:
            continue
        mp3_url = enclosure.get("url") or ""
        if not mp3_url:
            continue
        title_el = item.find("title")
        title = "" if title_el is None else (title_el.text or "").strip()
        size: int | None = None
        raw_size = enclosure.get("length")
        if raw_size is not None:
            raw_size = str(raw_size).strip()
            # Only a strictly positive integer counts as a known size.
            if raw_size.isdigit() and int(raw_size) > 0:
                size = int(raw_size)
        season = _parse_itunes_int(item.find("itunes:season", ITUNES_NS))
        episode = _parse_itunes_int(item.find("itunes:episode", ITUNES_NS))
        episodes.append((title, mp3_url, size, season, episode))
    return episodes
def multi_season_feed(items: list[tuple[str, str, int | None, int | None, int | None]]) -> bool:
"""True, RSS-ak bi denboraldi edo gehiago baditu, edo 1 baino handiago den denboraldi bat (SxxEyy)."""
seasons = {s for _, _, _, s, _ in items if s is not None}
if len(seasons) > 1:
return True
if seasons and max(seasons) > 1:
return True
return False
def http_head_content_length(url: str, timeout: int, user_agent: str) -> int | None:
    """HEAD *url* and return a positive Content-Length, or None on any failure."""
    request = urllib.request.Request(
        url,
        method="HEAD",
        headers={"User-Agent": user_agent, "Accept": "*/*"},
    )
    context = ssl.create_default_context()
    try:
        with urllib.request.urlopen(request, timeout=timeout, context=context) as response:
            header = response.headers.get("Content-Length")
    except (urllib.error.HTTPError, urllib.error.URLError, OSError, ValueError):
        # Best-effort probe: any network/HTTP problem just means "unknown size".
        return None
    if header is None:
        return None
    header = str(header).strip()
    if not header.isdigit():
        return None
    size = int(header)
    return size if size > 0 else None
def resolved_expected_bytes(
mp3_url: str,
rss_length: int | None,
timeout: int,
user_agent: str,
) -> int | None:
if rss_length is not None:
return rss_length
return http_head_content_length(mp3_url, timeout, user_agent)
def safe_filename(
title: str,
index: int,
season: int | None,
episode: int | None,
use_season_prefix: bool,
) -> str:
title_part = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", title).strip()
title_part = re.sub(r"\s+", " ", title_part)
if not title_part:
title_part = f"episode_{index:02d}"
prefix = ""
if use_season_prefix and season is not None and episode is not None:
prefix = f"S{season:02d}E{episode:02d} - "
elif use_season_prefix and season is not None:
prefix = f"S{season:02d} - "
elif use_season_prefix and episode is not None:
prefix = f"E{episode:02d} - "
base = prefix + title_part
if not base.lower().endswith(".mp3"):
base += ".mp3"
return base
def download_file(url: str, dest: Path, timeout: int, user_agent: str) -> None:
    """Stream *url* into *dest* through a ``.part`` temp file.

    The temp file is renamed onto *dest* only after the full body has been
    written, and is removed in the ``finally`` block on failure, so *dest* is
    never left half-written. A percentage progress line is written to stderr
    when it is a TTY and the server sent a Content-Length.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    request = urllib.request.Request(
        url,
        headers={"User-Agent": user_agent, "Accept": "*/*"},
    )
    context = ssl.create_default_context()
    partial = dest.with_suffix(dest.suffix + ".part")
    try:
        with urllib.request.urlopen(request, timeout=timeout, context=context) as response:
            expected = int(response.headers.get("Content-Length") or 0)
            received = 0
            with open(partial, "wb") as out:
                # 256 KiB chunks: large enough to be cheap, small enough
                # for responsive progress updates.
                while buf := response.read(1024 * 256):
                    out.write(buf)
                    received += len(buf)
                    if expected and sys.stderr.isatty():
                        pct = 100.0 * received / expected
                        sys.stderr.write(f"\r {pct:5.1f}% ({received // (1024 * 1024)} MiB)")
            if sys.stderr.isatty():
                sys.stderr.write("\n")
        partial.replace(dest)
    finally:
        # On success the rename already consumed the temp file; on failure
        # this removes the incomplete download.
        if partial.exists():
            partial.unlink(missing_ok=True)
def main() -> int:
    """CLI entry point: resolve the RSS feed URL, then download every episode.

    Accepts either a series page (https://guau.eus/s/<slug>) or a direct
    numeric feed URL (/series/<id>/rss.xml). Returns a process exit code:
    0 on success, 1 on any error.
    """
    ap = argparse.ArgumentParser(
        description="Deskargatu MP3ak guau.eus-etik (/s/…) edo RSS ofizialetik."
    )
    ap.add_argument(
        "url",
        help="Sailaren orria (https://guau.eus/s/slug) edo RSS jarioa (/series/<id>/rss.xml)",
    )
    ap.add_argument(
        "-o",
        "--output-dir",
        type=Path,
        default=Path("."),
        help="Helburuko karpeta (lehenetsia: uneko direktorioa)",
    )
    ap.add_argument(
        "-n",
        "--dry-run",
        action="store_true",
        help="Erakutsi atalak deskargatu gabe",
    )
    ap.add_argument(
        "--timeout",
        type=int,
        default=120,
        help="HTTP denbora-muga segundotan (lehenetsia: 120)",
    )
    ap.add_argument(
        "--user-agent",
        default=DEFAULT_UA,
        help="HTTP eskaeretarako User-Agent",
    )
    args = ap.parse_args()
    raw = normalize_guau_url(args.url)
    parsed = urlparse(raw)
    rss_url: str | None = None
    # Case 1: a direct numeric feed URL — use it as-is (query string dropped).
    if re.search(r"/series/\d+/rss\.xml$", parsed.path or "", re.I):
        rss_url = raw.split("?", 1)[0]
    elif re.search(r"/series/[^/]+/rss\.xml$", parsed.path or "", re.I):
        # Case 2: a slug-based feed URL, e.g. /series/ainguratuak/rss.xml,
        # may return a 500 — refuse it and tell the user what to pass.
        print(
            "Erabili /s/<slug> orrialdearen URLa edo /series/<zenbaki id>/rss.xml.",
            file=sys.stderr,
        )
        return 1
    elif (parsed.netloc.lower().removesuffix(":443") in GUAU_HOSTS or not parsed.netloc) and (
        parsed.path or ""
    ).startswith("/s/"):
        # Case 3: a series page — fetch the HTML and discover the feed URL in it.
        try:
            html = fetch(raw, args.timeout, args.user_agent).decode(
                "utf-8", errors="replace"
            )
        except urllib.error.HTTPError as e:
            print(f"HTTP errorea orrialdea kargatzean: {e}", file=sys.stderr)
            return 1
        except Exception as e:
            print(f"Sareko errorea: {e}", file=sys.stderr)
            return 1
        rss_url = discover_rss_url(html)
        if not rss_url:
            print(
                "Ezin izan da RSS jarioa aurkitu orrian (series/<id>/rss.xml).",
                file=sys.stderr,
            )
            return 1
        print(f"RSS jarioa: {rss_url}")
    else:
        # Anything else is not a URL shape this tool understands.
        print(
            "URL ezezaguna. Espero daitekeena: https://guau.eus/s/<slug> "
            "edo https://guau.eus/series/<id>/rss.xml",
            file=sys.stderr,
        )
        return 1
    # Fetch and parse the feed.
    try:
        rss_body = fetch(rss_url, args.timeout, args.user_agent)
    except urllib.error.HTTPError as e:
        print(f"HTTP errorea RSS-ean: {e}", file=sys.stderr)
        return 1
    except Exception as e:
        print(f"Sareko errorea (RSS): {e}", file=sys.stderr)
        return 1
    items = parse_rss_items(rss_body)
    if not items:
        print("Ez dago enclosure duen atalik RSS-ean.", file=sys.stderr)
        return 1
    # Guau feeds often bundle all seasons of a series in one feed; when that
    # is detected, filenames get an SxxEyy prefix to keep them distinct.
    use_season_prefix = multi_season_feed(items)
    out_dir: Path = args.output_dir.resolve()
    print(f"{len(items)} atal — karpeta: {out_dir}")
    if use_season_prefix:
        seasons = sorted({s for _, _, _, s, _ in items if s is not None})
        if seasons:
            print(
                f"Hainbat denboraldi jario berean (denboraldiak: {', '.join(map(str, seasons))}) "
                "— SxxEyy aurrizkia fitxategi-izenetan."
            )
    for i, (title, mp3_url, rss_length, season, episode) in enumerate(items, start=1):
        name = safe_filename(title, i, season, episode, use_season_prefix)
        dest = out_dir / name
        print(f"[{i}/{len(items)}] {name}")
        if args.dry_run:
            continue
        # Expected size, used to decide whether an existing file is complete.
        expected = resolved_expected_bytes(
            mp3_url, rss_length, args.timeout, args.user_agent
        )
        if dest.exists() and dest.is_file():
            local_sz = dest.stat().st_size
            if expected is not None:
                if local_sz == expected:
                    # Already fully downloaded — skip.
                    print(f" badago eta osatuta ({local_sz} B), saltatua")
                    continue
                if local_sz > 0:
                    # Size mismatch: incomplete or stale — re-download.
                    print(
                        f" osatu gabe edo zaharkitua ({local_sz}/{expected} B), berriro deskargatzen"
                    )
            else:
                if local_sz > 0:
                    # Size unknown (no RSS length, no Content-Length): keep
                    # the existing non-empty file rather than re-downloading.
                    print(
                        " fitxategia badago, tamaina ezezaguna (RSS-ko length gabe, HEAD-k "
                        f"Content-Length gabe) — saltatua: {dest}"
                    )
                    continue
        try:
            download_file(mp3_url, dest, args.timeout, args.user_agent)
        except urllib.error.HTTPError as e:
            print(f" HTTP hutsegitea: {e}", file=sys.stderr)
            return 1
        except Exception as e:
            print(f" hutsegitea: {e}", file=sys.stderr)
            return 1
        print(f" OK → {dest}")
    return 0
if __name__ == "__main__":
    # Run the CLI and propagate its exit status to the shell.
    sys.exit(main())