publist

managing my list of publications, talks, reviews
git clone https://a3nm.net/git/publist/

commit 9831b58287dca45d75b87186c396c45b19278bb7
parent 2358676c716d7f0d2f6886336c9152d912f115c8
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Sat, 25 Oct 2025 14:02:54 +0200

improve script

Diffstat:
fix_links | 192+++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
1 file changed, 120 insertions(+), 72 deletions(-)

diff --git a/fix_links b/fix_links
@@ -1,21 +1,22 @@
 #!/usr/bin/env python3
-# chatgpt
-
+# Chatgpt
 """
-fix_links_inplace_verified.py
-
-In-place fixer:
- - upgrades http:// -> https:// when the https version actually loads
- - if a URL doesn't load (or redirects to a clearly-nonsensical location such as the site's root),
-   replaces it with the latest working Internet Archive (Wayback) snapshot that actually loads
-   (the script verifies each Wayback snapshot by requesting it)
- - edits the file in-place (creates <file>.bak unless --no-backup)
- - prints per-request progress (disable with --quiet)
- - prints list of substitutions at the end and detailed counts
-
-Dependencies: requests
-pip install requests
+fix_links_inplace_rate_limited.py
+
+In-place fixer with retries and Internet Archive rate-limiting.
+
+Features:
+ - Upgrades http:// -> https:// when the https version actually loads
+ - If a URL doesn't load (or redirects to a clearly-nonsensical location),
+   replaces it with the latest verified Internet Archive (Wayback) snapshot.
+ - Verifies Wayback snapshots by actually loading them.
+ - Retries every network access up to 3 times (exponential backoff).
+ - Ensures at least 3 seconds between successive requests to web.archive.org (global rate limit).
+ - Edits the file in-place (creates <file>.bak unless --no-backup).
+ - Prints per-request progress (disable with --quiet).
+ - Shows substitutions and prominently lists URLs that are broken with no Wayback snapshot.
 """
+
 from __future__ import annotations
 
 import argparse
@@ -34,6 +35,8 @@ import requests
 # ---------------- Configuration ----------------
 DEFAULT_TIMEOUT = 12
 DEFAULT_CONCURRENCY = 8
+RETRIES = 3
+ARCHIVE_MIN_INTERVAL = 3.0  # seconds between successive requests to web.archive.org
 # A modern browser UA (Chrome on Windows) to appear like a regular browser
 USER_AGENT = (
     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
@@ -44,6 +47,11 @@ URL_RE = re.compile(r"""(?P<url>https?://[^\s'"<>()\]\}]+)""", re.IGNORECASE)
 CDX_LIMIT = 12  # how many CDX candidates to inspect (we'll verify each)
 
 
+# ---------------- Globals for archive rate-limiting ----------------
+_archive_lock = threading.Lock()
+_last_archive_request_time = 0.0  # epoch seconds
+
+
 # ---------------- Utilities ----------------
 print_lock = threading.Lock()
 
@@ -63,32 +71,68 @@ def canonicalize_https(url: str) -> str:
     return url
 
 
+def _sleep_backoff(attempt: int) -> None:
+    # simple exponential backoff: 1s, 2s, 4s...
+    time.sleep(1 * (2 ** (attempt - 1)))
+
+
+def request_with_retries(session: requests.Session, url: str, timeout: int, quiet: bool,
+                         archive: bool = False, retries: int = RETRIES) -> Optional[requests.Response]:
+    """
+    GET the URL with up to `retries` attempts. If `archive=True`, enforce global 3s spacing
+    between successive archive requests (CDX or web.archive.org).
+    Returns the Response object on success (resp.raise_for_status not enforced here), or None on persistent failure.
+    """
+    global _last_archive_request_time
+    attempt = 0
+    while attempt < retries:
+        attempt += 1
+        try:
+            if archive:
+                # global rate-limiting: ensure at least ARCHIVE_MIN_INTERVAL since last archive request
+                with _archive_lock:
+                    now = time.time()
+                    elapsed = now - _last_archive_request_time
+                    if elapsed < ARCHIVE_MIN_INTERVAL:
+                        wait = ARCHIVE_MIN_INTERVAL - elapsed
+                        log(f"[ARCHIVE RATE] waiting {wait:.2f}s before archive request to avoid spamming", quiet)
+                        time.sleep(wait)
+                    # proceed and update timestamp (speculative: set it now so other threads will wait)
+                    _last_archive_request_time = time.time()
+            log(f"[REQ] GET (attempt {attempt}/{retries}) {url}", quiet)
+            resp = session.get(url, timeout=timeout, allow_redirects=True, headers={"User-Agent": USER_AGENT})
+            log(f"[RESP] {url} -> {getattr(resp, 'status_code', 'NO_RESP')} final={getattr(resp, 'url', '')}", quiet)
+            return resp
+        except requests.RequestException as e:
+            log(f"[ERR] {url} attempt {attempt} -> {e.__class__.__name__}: {e}", quiet)
+            if attempt < retries:
+                _sleep_backoff(attempt)
+            else:
+                return None
+    return None
+
+
 def get_response_info(session: requests.Session, url: str, timeout: int, quiet: bool) -> Tuple[bool, Optional[str], Optional[int], Optional[str]]:
     """
-    Perform GET request (follow redirects).
-    Returns tuple:
+    Perform GET (with retries). Returns:
       (ok_bool, final_url_or_none, status_code_or_none, error_message_or_none)
-
-    ok_bool is True when response returned a status_code < 400.
-    final_url is resp.url (useful to detect redirects).
+    ok_bool is True when we obtained a response and resp.status_code < 400.
    """
+    resp = request_with_retries(session, url, timeout, quiet, archive=False)
+    if resp is None:
+        return False, None, None, "request_failed"
     try:
-        log(f"[REQ] GET {url}", quiet)
-        resp = session.get(url, timeout=timeout, allow_redirects=True, headers={"User-Agent": USER_AGENT})
         final = resp.url
-        log(f"[RESP] {url} -> {resp.status_code} final={final}", quiet)
-        return (resp.status_code < 400), final, resp.status_code, None
-    except requests.RequestException as e:
-        log(f"[ERR] {url} -> {e.__class__.__name__}: {e}", quiet)
-        return False, None, None, str(e)
+    except Exception:
+        final = None
+    return (resp.status_code < 400), final, resp.status_code, None
 
 
 def is_bad_redirect(original_url: str, final_url: Optional[str]) -> bool:
     """
-    Heuristic to detect 'clearly nonsensical' redirects such as:
-    - original had a non-root path but final is the root (path == '/' or empty)
-    - final netloc differs drastically (optional rule -- we only treat root-truncation as definite bad)
-    Returns True when the redirect should be considered 'broken' and cause us to prefer a Wayback snapshot.
+    Heuristic to detect 'clearly nonsensical' redirects:
+    - original had a non-root path but final is the root (path == '/' or index file)
+    - final netloc differs and final is root index -> suspect
     """
     if not final_url:
         return False
@@ -97,16 +141,12 @@ def is_bad_redirect(original_url: str, final_url: Optional[str]) -> bool:
     po = urlparse(original_url)
     pf = urlparse(final_url)
 
-    # If original had a meaningful path (not root/index) and final is root -> bad
     orig_path = po.path or "/"
     final_path = pf.path or "/"
-    # consider common index filenames as not 'meaningful path'
     index_paths = {"/", "/index.html", "/index.htm", "/index.php"}
     if (orig_path.lower() not in index_paths) and (final_path in index_paths):
-        # Example: http://domain/some/page -> http://domain/
         return True
 
-    # If final netloc differs and final is root of a different domain, treat as suspect.
     if pf.netloc and (pf.netloc != po.netloc) and (final_path in index_paths):
         return True
 
@@ -115,8 +155,7 @@ def is_bad_redirect(original_url: str, final_url: Optional[str]) -> bool:
 
 def query_wayback_candidates(session: requests.Session, url: str, timeout: int, quiet: bool, limit: int = CDX_LIMIT) -> List[Tuple[str, str]]:
     """
-    Query CDX and return a list of (timestamp, original) candidates (newest first).
-    Will return an empty list on error.
+    Query CDX API (rate-limited + retried) and return list of (timestamp, original) candidates (newest first).
     """
     cdx_url = (
         "https://web.archive.org/cdx/search/cdx?"
@@ -127,43 +166,41 @@ def query_wayback_candidates(session: requests.Session, url: str, timeout: int,
         f"&limit={limit}"
         f"&url={quote_plus(url)}"
     )
-    try:
-        log(f"[REQ] CDX {cdx_url}", quiet)
-        r = session.get(cdx_url, timeout=timeout, headers={"User-Agent": USER_AGENT})
-        r.raise_for_status()
-        data = r.json()
-        if not isinstance(data, list) or len(data) < 2:
-            return []
-        # skip header if present
-        start_idx = 1 if isinstance(data[0], list) and any("timestamp" in str(x).lower() for x in data[0]) else 0
-        candidates = []
-        for row in data[start_idx:]:
-            if len(row) >= 2:
-                ts = row[0]
-                orig = row[1]
-                candidates.append((ts, orig))
-        log(f"[CDX] found {len(candidates)} candidates for {url}", quiet)
-        return candidates
-    except requests.RequestException as e:
-        log(f"[ERR] CDX {url} -> {e.__class__.__name__}: {e}", quiet)
+    resp = request_with_retries(session, cdx_url, timeout, quiet, archive=True)
+    if resp is None:
+        log(f"[CDX] failed to fetch CDX results for {url}", quiet)
         return []
+    try:
+        data = resp.json()
     except ValueError as e:
         log(f"[ERR] CDX JSON decode {url} -> {e}", quiet)
         return []
+    if not isinstance(data, list) or len(data) < 2:
+        return []
+    start_idx = 1 if isinstance(data[0], list) and any("timestamp" in str(x).lower() for x in data[0]) else 0
+    candidates = []
+    for row in data[start_idx:]:
+        if len(row) >= 2:
+            ts = row[0]
+            orig = row[1]
+            candidates.append((ts, orig))
+    log(f"[CDX] found {len(candidates)} candidates for {url}", quiet)
+    return candidates
 
 
 def verify_wayback_snapshot(session: requests.Session, timestamp: str, original: str, timeout: int, quiet: bool) -> Optional[str]:
     """
-    Build a Wayback snapshot URL and GET it to verify it actually loads (status < 400).
-    Returns the working wayback URL (string) or None.
+    Build a Wayback snapshot URL and GET it (rate-limited + retried) to verify it actually loads.
+    Returns the working wayback URL or None.
     """
     wb_url = f"https://web.archive.org/web/{timestamp}/{original}"
-    ok, final, status, err = get_response_info(session, wb_url, timeout, quiet)
-    # Accept when request returned status < 400 and final url is at web.archive.org
-    if ok and final and ("web.archive.org" in final):
+    resp = request_with_retries(session, wb_url, timeout, quiet, archive=True)
+    if resp is None:
+        return None
+    # Accept only if final URL is at web.archive.org and status < 400
+    final = getattr(resp, "url", None)
+    if resp.status_code < 400 and final and ("web.archive.org" in final):
         return wb_url
-    # Some snapshots might redirect to a different archived representation,
-    # but we require final to be on web.archive.org and status < 400.
     return None
 
 
@@ -171,9 +208,13 @@ def verify_wayback_snapshot(session: requests.Session, timestamp: str, original:
 def process_single_url(url: str, session: requests.Session, timeout: int, quiet: bool) -> Tuple[str, Optional[str], str]:
     """
     Process one URL and return (original, replacement_or_None, note)
-    note is one of: "upgraded" (http->https), "ok" (left unchanged), "wayback" (replaced by wayback),
-                    "no_snapshot" (broken, no wayback found)
+    note: "upgraded", "ok", "wayback", "no_snapshot"
     """
+    # Skip Wayback Machine URLs entirely
+    if url.lower().startswith("https://web.archive.org/"):
+        log(f"[SKIP] already a Wayback Machine URL -> {url}", quiet)
+        return (url, url, "ok")
+
     parsed = urlparse(url)
     scheme = parsed.scheme.lower()
 
@@ -183,10 +224,8 @@ def process_single_url(url: str, session: requests.Session, timeout: int, quiet:
         log(f"[INFO] trying https for {url} -> {https_url}", quiet)
         ok, final, status, err = get_response_info(session, https_url, timeout, quiet)
         if ok:
-            # If it loaded but redirected to something meaningless, treat as failure
            if is_bad_redirect(https_url, final):
                 log(f"[WARN] https for {url} redirected badly -> {final}; will try Wayback", quiet)
-                # fallthrough to next checks (do not accept https)
             else:
                 return (url, https_url, "upgraded")
 
@@ -196,7 +235,6 @@ def process_single_url(url: str, session: requests.Session, timeout: int, quiet:
     if ok:
         if is_bad_redirect(url, final):
             log(f"[WARN] {url} redirected badly -> {final}; treating as broken and trying Wayback", quiet)
-            # treat as broken, go to wayback flow
         else:
             return (url, url, "ok")
 
@@ -212,12 +250,9 @@ def process_single_url(url: str, session: requests.Session, timeout: int, quiet:
     log(f"[INFO] no suitable Wayback snapshot found for {url}", quiet)
     return (url, None, "no_snapshot")
 
-
 # ---------------- Main CLI ----------------
 def main() -> None:
-    import argparse
-
-    ap = argparse.ArgumentParser(description="In-place fix: upgrade http->https when possible, replace broken links with verified Wayback snapshots.")
+    ap = argparse.ArgumentParser(description="In-place fix: upgrade http->https where possible, replace broken links with verified Wayback snapshots. Retries + archive rate-limiting included.")
     ap.add_argument("file", help="Path to the file to edit in-place (backup saved as <file>.bak unless --no-backup).")
     ap.add_argument("--map", help="Optional CSV mapping original_url -> replacement_url for audit.")
     ap.add_argument("--concurrency", type=int, default=DEFAULT_CONCURRENCY, help="Number of worker threads (default %(default)s).")
@@ -249,6 +284,7 @@ def main() -> None:
     mapping = {}  # orig -> (replacement or None, note)
     start = time.time()
 
+    # Process URLs concurrently (non-archive network operations are also retried inside helpers).
     with concurrent.futures.ThreadPoolExecutor(max_workers=args.concurrency) as ex:
         futures = {ex.submit(process_single_url, url, session, args.timeout, args.quiet): url for url in unique_urls}
         for fut in concurrent.futures.as_completed(futures):
@@ -321,5 +357,17 @@ def main() -> None:
     else:
         print("\nNo substitutions performed.")
 
+    # PROMINENTLY print broken URLs with no snapshot
+    if n_no_snapshot > 0:
+        print("\n" + "=" * 80)
+        print("BROKEN URLS WITH NO INTERNET ARCHIVE SNAPSHOT FOUND (ATTENTION)".center(80))
+        print("=" * 80)
+        for orig, (repl, note) in mapping.items():
+            if note == "no_snapshot":
+                print(f"- {orig}")
+        print("=" * 80 + "\n")
+    else:
+        print("\nAll broken URLs had suitable Wayback snapshots (or were left unchanged because they worked).")
+
 if __name__ == "__main__":
     main()
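
The substance of this commit is the pairing of a global minimum interval between hits to web.archive.org (a lock around a shared timestamp) with exponential-backoff retries around every GET. Below is a minimal standalone sketch of that pattern, assuming a shared requests.Session and threads in one process; the names polite_get, MIN_INTERVAL and RETRIES are illustrative and not taken from fix_links.

# Illustrative sketch only -- not part of the repository.
import threading
import time
from typing import Optional

import requests

_lock = threading.Lock()
_last_request = 0.0   # epoch seconds of the previous rate-limited request
MIN_INTERVAL = 3.0    # keep at least this many seconds between requests
RETRIES = 3


def polite_get(session: requests.Session, url: str, timeout: float = 12.0) -> Optional[requests.Response]:
    """GET url, spacing calls at least MIN_INTERVAL apart and retrying on network errors."""
    global _last_request
    for attempt in range(1, RETRIES + 1):
        with _lock:
            wait = MIN_INTERVAL - (time.time() - _last_request)
            if wait > 0:
                time.sleep(wait)
            # update the timestamp before the request so other threads start waiting now
            _last_request = time.time()
        try:
            return session.get(url, timeout=timeout, allow_redirects=True)
        except requests.RequestException:
            if attempt < RETRIES:
                time.sleep(2 ** (attempt - 1))  # 1s, 2s backoff, as in _sleep_backoff
    return None


# e.g. polite_get(requests.Session(), "https://web.archive.org/web/2/https://example.com/")

Updating the shared timestamp before the request goes out mirrors the speculative choice the patch makes in request_with_retries: concurrent workers start waiting immediately rather than racing to issue their own archive request.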