publist

managing my list of publications, talks, reviews
git clone https://a3nm.net/git/publist/

commit 9831b58287dca45d75b87186c396c45b19278bb7
parent 2358676c716d7f0d2f6886336c9152d912f115c8
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Sat, 25 Oct 2025 14:02:54 +0200

improve script

Diffstat:
fix_links | 192+++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
1 file changed, 120 insertions(+), 72 deletions(-)

diff --git a/fix_links b/fix_links
@@ -1,21 +1,22 @@
 #!/usr/bin/env python3
-# chatgpt
-
+# Chatgpt
 """
-fix_links_inplace_verified.py
-
-In-place fixer:
- - upgrades http:// -> https:// when the https version actually loads
- - if a URL doesn't load (or redirects to a clearly-nonsensical location such as the site's root),
-   replaces it with the latest working Internet Archive (Wayback) snapshot that actually loads
-   (the script verifies each Wayback snapshot by requesting it)
- - edits the file in-place (creates <file>.bak unless --no-backup)
- - prints per-request progress (disable with --quiet)
- - prints list of substitutions at the end and detailed counts
-
-Dependencies: requests
-pip install requests
+fix_links_inplace_rate_limited.py
+
+In-place fixer with retries and Internet Archive rate-limiting.
+
+Features:
+ - Upgrades http:// -> https:// when the https version actually loads
+ - If a URL doesn't load (or redirects to a clearly-nonsensical location),
+   replaces it with the latest verified Internet Archive (Wayback) snapshot.
+ - Verifies Wayback snapshots by actually loading them.
+ - Retries every network access up to 3 times (exponential backoff).
+ - Ensures at least 3 seconds between successive requests to web.archive.org (global rate limit).
+ - Edits the file in-place (creates <file>.bak unless --no-backup).
+ - Prints per-request progress (disable with --quiet).
+ - Shows substitutions and prominently lists URLs that are broken with no Wayback snapshot.
 """
+
 from __future__ import annotations
 
 import argparse
@@ -34,6 +35,8 @@ import requests
 # ---------------- Configuration ----------------
 DEFAULT_TIMEOUT = 12
 DEFAULT_CONCURRENCY = 8
+RETRIES = 3
+ARCHIVE_MIN_INTERVAL = 3.0  # seconds between successive requests to web.archive.org
 # A modern browser UA (Chrome on Windows) to appear like a regular browser
 USER_AGENT = (
     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
@@ -44,6 +47,11 @@ URL_RE = re.compile(r"""(?P<url>https?://[^\s'"<>()\]\}]+)""", re.IGNORECASE)
 CDX_LIMIT = 12  # how many CDX candidates to inspect (we'll verify each)
 
 
+# ---------------- Globals for archive rate-limiting ----------------
+_archive_lock = threading.Lock()
+_last_archive_request_time = 0.0  # epoch seconds
+
+
 # ---------------- Utilities ----------------
 print_lock = threading.Lock()
 
@@ -63,32 +71,68 @@ def canonicalize_https(url: str) -> str:
     return url
 
 
+def _sleep_backoff(attempt: int) -> None:
+    # simple exponential backoff: 1s, 2s, 4s...
+    time.sleep(1 * (2 ** (attempt - 1)))
+
+
+def request_with_retries(session: requests.Session, url: str, timeout: int, quiet: bool,
+                         archive: bool = False, retries: int = RETRIES) -> Optional[requests.Response]:
+    """
+    GET the URL with up to `retries` attempts. If `archive=True`, enforce global 3s spacing
+    between successive archive requests (CDX or web.archive.org).
+    Returns the Response object on success (resp.raise_for_status not enforced here), or None on persistent failure.
+    """
+    global _last_archive_request_time
+    attempt = 0
+    while attempt < retries:
+        attempt += 1
+        try:
+            if archive:
+                # global rate-limiting: ensure at least ARCHIVE_MIN_INTERVAL since last archive request
+                with _archive_lock:
+                    now = time.time()
+                    elapsed = now - _last_archive_request_time
+                    if elapsed < ARCHIVE_MIN_INTERVAL:
+                        wait = ARCHIVE_MIN_INTERVAL - elapsed
+                        log(f"[ARCHIVE RATE] waiting {wait:.2f}s before archive request to avoid spamming", quiet)
+                        time.sleep(wait)
+                    # proceed and update timestamp (speculative: set it now so other threads will wait)
+                    _last_archive_request_time = time.time()
+            log(f"[REQ] GET (attempt {attempt}/{retries}) {url}", quiet)
+            resp = session.get(url, timeout=timeout, allow_redirects=True, headers={"User-Agent": USER_AGENT})
+            log(f"[RESP] {url} -> {getattr(resp, 'status_code', 'NO_RESP')} final={getattr(resp, 'url', '')}", quiet)
+            return resp
+        except requests.RequestException as e:
+            log(f"[ERR] {url} attempt {attempt} -> {e.__class__.__name__}: {e}", quiet)
+            if attempt < retries:
+                _sleep_backoff(attempt)
+            else:
+                return None
+    return None
+
+
 def get_response_info(session: requests.Session, url: str, timeout: int, quiet: bool) -> Tuple[bool, Optional[str], Optional[int], Optional[str]]:
     """
-    Perform GET request (follow redirects).
-    Returns tuple:
+    Perform GET (with retries). Returns:
       (ok_bool, final_url_or_none, status_code_or_none, error_message_or_none)
-
-    ok_bool is True when response returned a status_code < 400.
-    final_url is resp.url (useful to detect redirects).
+    ok_bool is True when we obtained a response and resp.status_code < 400.
    """
+    resp = request_with_retries(session, url, timeout, quiet, archive=False)
+    if resp is None:
+        return False, None, None, "request_failed"
     try:
-        log(f"[REQ] GET {url}", quiet)
-        resp = session.get(url, timeout=timeout, allow_redirects=True, headers={"User-Agent": USER_AGENT})
         final = resp.url
-        log(f"[RESP] {url} -> {resp.status_code} final={final}", quiet)
-        return (resp.status_code < 400), final, resp.status_code, None
-    except requests.RequestException as e:
-        log(f"[ERR] {url} -> {e.__class__.__name__}: {e}", quiet)
-        return False, None, None, str(e)
+    except Exception:
+        final = None
+    return (resp.status_code < 400), final, resp.status_code, None
 
 
 def is_bad_redirect(original_url: str, final_url: Optional[str]) -> bool:
     """
-    Heuristic to detect 'clearly nonsensical' redirects such as:
-    - original had a non-root path but final is the root (path == '/' or empty)
-    - final netloc differs drastically (optional rule -- we only treat root-truncation as definite bad)
-    Returns True when the redirect should be considered 'broken' and cause us to prefer a Wayback snapshot.
+    Heuristic to detect 'clearly nonsensical' redirects:
+    - original had a non-root path but final is the root (path == '/' or index file)
+    - final netloc differs and final is root index -> suspect
     """
     if not final_url:
         return False
@@ -97,16 +141,12 @@ def is_bad_redirect(original_url: str, final_url: Optional[str]) -> bool:
     po = urlparse(original_url)
     pf = urlparse(final_url)
 
-    # If original had a meaningful path (not root/index) and final is root -> bad
     orig_path = po.path or "/"
     final_path = pf.path or "/"
-    # consider common index filenames as not 'meaningful path'
     index_paths = {"/", "/index.html", "/index.htm", "/index.php"}
     if (orig_path.lower() not in index_paths) and (final_path in index_paths):
-        # Example: http://domain/some/page -> http://domain/
         return True
 
-    # If final netloc differs and final is root of a different domain, treat as suspect.
     if pf.netloc and (pf.netloc != po.netloc) and (final_path in index_paths):
         return True
 
@@ -115,8 +155,7 @@ def is_bad_redirect(original_url: str, final_url: Optional[str]) -> bool:
 
 def query_wayback_candidates(session: requests.Session, url: str, timeout: int, quiet: bool, limit: int = CDX_LIMIT) -> List[Tuple[str, str]]:
     """
-    Query CDX and return a list of (timestamp, original) candidates (newest first).
-    Will return an empty list on error.
+    Query CDX API (rate-limited + retried) and return list of (timestamp, original) candidates (newest first).
     """
     cdx_url = (
         "https://web.archive.org/cdx/search/cdx?"
@@ -127,43 +166,41 @@ def query_wayback_candidates(session: requests.Session, url: str, timeout: int,
         f"&limit={limit}"
         f"&url={quote_plus(url)}"
     )
-    try:
-        log(f"[REQ] CDX {cdx_url}", quiet)
-        r = session.get(cdx_url, timeout=timeout, headers={"User-Agent": USER_AGENT})
-        r.raise_for_status()
-        data = r.json()
-        if not isinstance(data, list) or len(data) < 2:
-            return []
-        # skip header if present
-        start_idx = 1 if isinstance(data[0], list) and any("timestamp" in str(x).lower() for x in data[0]) else 0
-        candidates = []
-        for row in data[start_idx:]:
-            if len(row) >= 2:
-                ts = row[0]
-                orig = row[1]
-                candidates.append((ts, orig))
-        log(f"[CDX] found {len(candidates)} candidates for {url}", quiet)
-        return candidates
-    except requests.RequestException as e:
-        log(f"[ERR] CDX {url} -> {e.__class__.__name__}: {e}", quiet)
+    resp = request_with_retries(session, cdx_url, timeout, quiet, archive=True)
+    if resp is None:
+        log(f"[CDX] failed to fetch CDX results for {url}", quiet)
         return []
+    try:
+        data = resp.json()
     except ValueError as e:
         log(f"[ERR] CDX JSON decode {url} -> {e}", quiet)
         return []
+    if not isinstance(data, list) or len(data) < 2:
+        return []
+    start_idx = 1 if isinstance(data[0], list) and any("timestamp" in str(x).lower() for x in data[0]) else 0
+    candidates = []
+    for row in data[start_idx:]:
+        if len(row) >= 2:
+            ts = row[0]
+            orig = row[1]
+            candidates.append((ts, orig))
+    log(f"[CDX] found {len(candidates)} candidates for {url}", quiet)
+    return candidates
 
 
 def verify_wayback_snapshot(session: requests.Session, timestamp: str, original: str, timeout: int, quiet: bool) -> Optional[str]:
     """
-    Build a Wayback snapshot URL and GET it to verify it actually loads (status < 400).
-    Returns the working wayback URL (string) or None.
+    Build a Wayback snapshot URL and GET it (rate-limited + retried) to verify it actually loads.
+    Returns the working wayback URL or None.
     """
     wb_url = f"https://web.archive.org/web/{timestamp}/{original}"
-    ok, final, status, err = get_response_info(session, wb_url, timeout, quiet)
-    # Accept when request returned status < 400 and final url is at web.archive.org
-    if ok and final and ("web.archive.org" in final):
+    resp = request_with_retries(session, wb_url, timeout, quiet, archive=True)
+    if resp is None:
+        return None
+    # Accept only if final URL is at web.archive.org and status < 400
+    final = getattr(resp, "url", None)
+    if resp.status_code < 400 and final and ("web.archive.org" in final):
         return wb_url
-    # Some snapshots might redirect to a different archived representation,
-    # but we require final to be on web.archive.org and status < 400.
     return None
 
 
@@ -171,9 +208,13 @@ def verify_wayback_snapshot(session: requests.Session, timestamp: str, original:
 def process_single_url(url: str, session: requests.Session, timeout: int, quiet: bool) -> Tuple[str, Optional[str], str]:
     """
     Process one URL and return (original, replacement_or_None, note)
-    note is one of: "upgraded" (http->https), "ok" (left unchanged), "wayback" (replaced by wayback),
-                    "no_snapshot" (broken, no wayback found)
+    note: "upgraded", "ok", "wayback", "no_snapshot"
     """
+    # Skip Wayback Machine URLs entirely
+    if url.lower().startswith("https://web.archive.org/"):
+        log(f"[SKIP] already a Wayback Machine URL -> {url}", quiet)
+        return (url, url, "ok")
+
     parsed = urlparse(url)
     scheme = parsed.scheme.lower()
 
@@ -183,10 +224,8 @@ def process_single_url(url: str, session: requests.Session, timeout: int, quiet:
         log(f"[INFO] trying https for {url} -> {https_url}", quiet)
         ok, final, status, err = get_response_info(session, https_url, timeout, quiet)
         if ok:
-            # If it loaded but redirected to something meaningless, treat as failure
            if is_bad_redirect(https_url, final):
                 log(f"[WARN] https for {url} redirected badly -> {final}; will try Wayback", quiet)
-                # fallthrough to next checks (do not accept https)
             else:
                 return (url, https_url, "upgraded")
 
@@ -196,7 +235,6 @@ def process_single_url(url: str, session: requests.Session, timeout: int, quiet:
     if ok:
         if is_bad_redirect(url, final):
             log(f"[WARN] {url} redirected badly -> {final}; treating as broken and trying Wayback", quiet)
-            # treat as broken, go to wayback flow
         else:
             return (url, url, "ok")
 
@@ -212,12 +250,9 @@ def process_single_url(url: str, session: requests.Session, timeout: int, quiet:
     log(f"[INFO] no suitable Wayback snapshot found for {url}", quiet)
     return (url, None, "no_snapshot")
 
-
 # ---------------- Main CLI ----------------
 def main() -> None:
-    import argparse
-
-    ap = argparse.ArgumentParser(description="In-place fix: upgrade http->https when possible, replace broken links with verified Wayback snapshots.")
+    ap = argparse.ArgumentParser(description="In-place fix: upgrade http->https where possible, replace broken links with verified Wayback snapshots. Retries + archive rate-limiting included.")
     ap.add_argument("file", help="Path to the file to edit in-place (backup saved as <file>.bak unless --no-backup).")
     ap.add_argument("--map", help="Optional CSV mapping original_url -> replacement_url for audit.")
     ap.add_argument("--concurrency", type=int, default=DEFAULT_CONCURRENCY, help="Number of worker threads (default %(default)s).")
@@ -249,6 +284,7 @@ def main() -> None:
     mapping = {}  # orig -> (replacement or None, note)
     start = time.time()
 
+    # Process URLs concurrently (non-archive network operations are also retried inside helpers).
     with concurrent.futures.ThreadPoolExecutor(max_workers=args.concurrency) as ex:
         futures = {ex.submit(process_single_url, url, session, args.timeout, args.quiet): url for url in unique_urls}
         for fut in concurrent.futures.as_completed(futures):
@@ -321,5 +357,17 @@ def main() -> None:
     else:
         print("\nNo substitutions performed.")
 
+    # PROMINENTLY print broken URLs with no snapshot
+    if n_no_snapshot > 0:
+        print("\n" + "=" * 80)
+        print("BROKEN URLS WITH NO INTERNET ARCHIVE SNAPSHOT FOUND (ATTENTION)".center(80))
+        print("=" * 80)
+        for orig, (repl, note) in mapping.items():
+            if note == "no_snapshot":
+                print(f"- {orig}")
+        print("=" * 80 + "\n")
+    else:
+        print("\nAll broken URLs had suitable Wayback snapshots (or were left unchanged because they worked).")
+
 if __name__ == "__main__":
     main()
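
The substance of this commit is the pairing of a global minimum interval between hits to web.archive.org (a lock around a shared timestamp) with exponential-backoff retries around every GET. Below is a minimal standalone sketch of that pattern, assuming a shared requests.Session and threads in one process; the names polite_get, MIN_INTERVAL and RETRIES are illustrative and not taken from fix_links.

# Illustrative sketch only -- not part of the repository.
import threading
import time
from typing import Optional

import requests

_lock = threading.Lock()
_last_request = 0.0   # epoch seconds of the previous rate-limited request
MIN_INTERVAL = 3.0    # keep at least this many seconds between requests
RETRIES = 3


def polite_get(session: requests.Session, url: str, timeout: float = 12.0) -> Optional[requests.Response]:
    """GET url, spacing calls at least MIN_INTERVAL apart and retrying on network errors."""
    global _last_request
    for attempt in range(1, RETRIES + 1):
        with _lock:
            wait = MIN_INTERVAL - (time.time() - _last_request)
            if wait > 0:
                time.sleep(wait)
            # update the timestamp before the request so other threads start waiting now
            _last_request = time.time()
        try:
            return session.get(url, timeout=timeout, allow_redirects=True)
        except requests.RequestException:
            if attempt < RETRIES:
                time.sleep(2 ** (attempt - 1))  # 1s, 2s backoff, as in _sleep_backoff
    return None


# e.g. polite_get(requests.Session(), "https://web.archive.org/web/2/https://example.com/")

Updating the shared timestamp before the request goes out mirrors the speculative choice the patch makes in request_with_retries: concurrent workers start waiting immediately rather than racing to issue their own archive request.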