publist

managing my list of publications, talks, reviews
git clone https://a3nm.net/git/publist/

fix_links (14856B)


      1 #!/usr/bin/env python3
      2 # Generated with ChatGPT
      3 """
      4 fix_links
      5 
      6 In-place fixer with retries and Internet Archive rate-limiting.
      7 
      8 Features:
      9  - Upgrades http:// -> https:// when the https version actually loads
     10  - If a URL doesn't load (or redirects to a clearly-nonsensical location),
     11    replaces it with the latest verified Internet Archive (Wayback) snapshot.
     12  - Verifies Wayback snapshots by actually loading them.
     13  - Retries every network access up to 3 times (exponential backoff).
     14  - Ensures at least 3 seconds between successive requests to web.archive.org (global rate limit).
     15  - Edits the file in-place (creates <file>.bak unless --no-backup).
     16  - Prints per-request progress (disable with --quiet).
     17  - Shows substitutions and prominently lists URLs that are broken with no Wayback snapshot.
     18 """
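        # Example invocations (a sketch only; "publications.html" and "audit.csv" are
        # hypothetical file names, the flags are the ones defined in main() below):
        #   python3 fix_links publications.html
        #   python3 fix_links publications.html --map audit.csv --concurrency 4 --quiet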
     19 
     20 from __future__ import annotations
     21 
     22 import argparse
     23 import concurrent.futures
     24 import csv
     25 import os
     26 import re
     27 import threading
     28 import time
     29 from collections import OrderedDict
     30 from typing import Optional, Tuple, List
     31 from urllib.parse import quote_plus, urlparse, urlunparse
     32 
     33 import requests
     34 
     35 # ---------------- Configuration ----------------
     36 DEFAULT_TIMEOUT = 12
     37 DEFAULT_CONCURRENCY = 8
     38 RETRIES = 3
     39 ARCHIVE_MIN_INTERVAL = 3.0  # seconds between successive requests to web.archive.org
     40 # A modern browser User-Agent (Chrome on Windows) so requests look like regular browser traffic
     41 USER_AGENT = (
     42     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
     43     "AppleWebKit/537.36 (KHTML, like Gecko) "
     44     "Chrome/119.0.0.0 Safari/537.36"
     45 )
     46 URL_RE = re.compile(r"""(?P<url>https?://[^\s'"<>()\]\}]+)""", re.IGNORECASE)
     47 CDX_LIMIT = 12  # how many CDX candidates to inspect (we'll verify each)
     48 
     49 
     50 # ---------------- Globals for archive rate-limiting ----------------
     51 _archive_lock = threading.Lock()
     52 _last_archive_request_time = 0.0  # epoch seconds
     53 
     54 
     55 # ---------------- Utilities ----------------
     56 print_lock = threading.Lock()
     57 
     58 
     59 def log(msg: str, quiet: bool) -> None:
     60     if quiet:
     61         return
     62     with print_lock:
     63         print(msg, flush=True)
     64 
     65 
     66 def canonicalize_https(url: str) -> str:
     67     p = urlparse(url)
     68     if p.scheme.lower() == "http":
     69         p = p._replace(scheme="https")
     70         return urlunparse(p)
     71     return url
     72 
     73 
     74 def _sleep_backoff(attempt: int) -> None:
     75     # simple exponential backoff: 1s, 2s, 4s...
     76     time.sleep(1 * (2 ** (attempt - 1)))
     77 
     78 
     79 def request_with_retries(session: requests.Session, url: str, timeout: int, quiet: bool,
     80                          archive: bool = False, retries: int = RETRIES) -> Optional[requests.Response]:
     81     """
     82     GET the URL with up to `retries` attempts. If `archive=True`, enforce global 3s spacing
     83     between successive archive requests (CDX or web.archive.org).
     84     Returns the Response object on success (resp.raise_for_status not enforced here), or None on persistent failure.
     85     """
     86     global _last_archive_request_time
     87     attempt = 0
     88     while attempt < retries:
     89         attempt += 1
     90         try:
     91             if archive:
     92                 # global rate-limiting: ensure at least ARCHIVE_MIN_INTERVAL since last archive request
     93                 with _archive_lock:
     94                     now = time.time()
     95                     elapsed = now - _last_archive_request_time
     96                     if elapsed < ARCHIVE_MIN_INTERVAL:
     97                         wait = ARCHIVE_MIN_INTERVAL - elapsed
     98                         log(f"[ARCHIVE RATE] waiting {wait:.2f}s before archive request to avoid spamming", quiet)
     99                         time.sleep(wait)
    100                     # proceed and update timestamp (speculative: set it now so other threads will wait)
    101                     _last_archive_request_time = time.time()
    102             log(f"[REQ] GET (attempt {attempt}/{retries}) {url}", quiet)
    103             resp = session.get(url, timeout=timeout, allow_redirects=True, headers={"User-Agent": USER_AGENT})
    104             log(f"[RESP] {url} -> {getattr(resp, 'status_code', 'NO_RESP')} final={getattr(resp, 'url', '')}", quiet)
    105             return resp
    106         except requests.RequestException as e:
    107             log(f"[ERR] {url} attempt {attempt} -> {e.__class__.__name__}: {e}", quiet)
    108             if attempt < retries:
    109                 _sleep_backoff(attempt)
    110             else:
    111                 return None
    112     return None
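        # With the defaults above (RETRIES=3, backoff of 1s then 2s), a persistently
        # failing URL is attempted three times, costing about 3s of sleep plus up to
        # three timeouts before request_with_retries() gives up and returns None.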
    113 
    114 
    115 def get_response_info(session: requests.Session, url: str, timeout: int, quiet: bool) -> Tuple[bool, Optional[str], Optional[int], Optional[str]]:
    116     """
    117     Perform GET (with retries). Returns:
    118       (ok_bool, final_url_or_none, status_code_or_none, error_message_or_none)
    119     ok_bool is True when we obtained a response and resp.status_code < 400.
    120     """
    121     resp = request_with_retries(session, url, timeout, quiet, archive=False)
    122     if resp is None:
    123         return False, None, None, "request_failed"
    124     try:
    125         final = resp.url
    126     except Exception:
    127         final = None
    128     return (resp.status_code < 400), final, resp.status_code, None
    129 
    130 
    131 def is_bad_redirect(original_url: str, final_url: Optional[str]) -> bool:
    132     """
    133     Heuristic to detect 'clearly nonsensical' redirects:
    134       - original had a non-root path but final is the root (path == '/' or index file)
    135       - final netloc differs and final is root index -> suspect
    136     """
    137     if not final_url:
    138         return False
    139     if original_url == final_url:
    140         return False
    141     po = urlparse(original_url)
    142     pf = urlparse(final_url)
    143 
    144     orig_path = po.path or "/"
    145     final_path = pf.path or "/"
    146     index_paths = {"/", "/index.html", "/index.htm", "/index.php"}
    147     if (orig_path.lower() not in index_paths) and (final_path.lower() in index_paths):
    148         return True
    149 
    150     if pf.netloc and (pf.netloc != po.netloc) and (final_path.lower() in index_paths):
    151         return True
    152 
    153     return False
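        # Worked examples for the heuristic (hypothetical URLs): a deep link bounced to
        # the site root is flagged, a redirect that keeps the path is not:
        #   is_bad_redirect("http://example.org/papers/foo.pdf", "https://example.org/")  -> True
        #   is_bad_redirect("http://example.org/papers/foo.pdf",
        #                   "https://example.org/papers/foo.pdf")                         -> False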
    154 
    155 
    156 def query_wayback_candidates(session: requests.Session, url: str, timeout: int, quiet: bool, limit: int = CDX_LIMIT) -> List[Tuple[str, str]]:
    157     """
    158     Query CDX API (rate-limited + retried) and return list of (timestamp, original) candidates (newest first).
    159     """
    160     cdx_url = (
    161         "https://web.archive.org/cdx/search/cdx?"
    162         "output=json"
    163         "&fl=timestamp,original,statuscode"
    164         "&filter=statuscode:200"
    165         "&sort=reverse"
    166         f"&limit={limit}"
    167         f"&url={quote_plus(url)}"
    168     )
    169     resp = request_with_retries(session, cdx_url, timeout, quiet, archive=True)
    170     if resp is None:
    171         log(f"[CDX] failed to fetch CDX results for {url}", quiet)
    172         return []
    173     try:
    174         data = resp.json()
    175     except ValueError as e:
    176         log(f"[ERR] CDX JSON decode {url} -> {e}", quiet)
    177         return []
    178     if not isinstance(data, list) or len(data) < 2:
    179         return []
    180     start_idx = 1 if isinstance(data[0], list) and any("timestamp" in str(x).lower() for x in data[0]) else 0
    181     candidates = []
    182     for row in data[start_idx:]:
    183         if len(row) >= 2:
    184             ts = row[0]
    185             orig = row[1]
    186             candidates.append((ts, orig))
    187     log(f"[CDX] found {len(candidates)} candidates for {url}", quiet)
    188     return candidates
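        # With output=json the CDX API answers with a list of rows whose first row is a
        # header, roughly [["timestamp", "original", "statuscode"],
        # ["20190101000000", "http://example.org/", "200"], ...]; the header detection
        # and row slicing above reduce this to [("20190101000000", "http://example.org/"), ...].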
    189 
    190 
    191 def verify_wayback_snapshot(session: requests.Session, timestamp: str, original: str, timeout: int, quiet: bool) -> Optional[str]:
    192     """
    193     Build a Wayback snapshot URL and GET it (rate-limited + retried) to verify it actually loads.
    194     Returns the working wayback URL or None.
    195     """
    196     wb_url = f"https://web.archive.org/web/{timestamp}/{original}"
    197     resp = request_with_retries(session, wb_url, timeout, quiet, archive=True)
    198     if resp is None:
    199         return None
    200     # Accept only if final URL is at web.archive.org and status < 400
    201     final = getattr(resp, "url", None)
    202     if resp.status_code < 400 and final and ("web.archive.org" in final):
    203         return wb_url
    204     return None
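        # Snapshot URLs built here take the form
        # https://web.archive.org/web/20190101000000/http://example.org/page
        # (a 14-digit YYYYMMDDhhmmss timestamp followed by the originally archived URL).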
    205 
    206 
    207 # ---------------- Core logic ----------------
    208 def process_single_url(url: str, session: requests.Session, timeout: int, quiet: bool) -> Tuple[str, Optional[str], str]:
    209     """
    210     Process one URL and return (original, replacement_or_None, note)
    211     note: "upgraded", "ok", "wayback", "no_snapshot"
    212     """
    213     # Skip Wayback Machine URLs entirely
    214     if url.lower().startswith("https://web.archive.org/"):
    215         log(f"[SKIP] already a Wayback Machine URL -> {url}", quiet)
    216         return (url, url, "ok")
    217 
    218     parsed = urlparse(url)
    219     scheme = parsed.scheme.lower()
    220 
    221     # 1) If http: try https
    222     if scheme == "http":
    223         https_url = canonicalize_https(url)
    224         log(f"[INFO] trying https for {url} -> {https_url}", quiet)
    225         ok, final, status, err = get_response_info(session, https_url, timeout, quiet)
    226         if ok:
    227             if is_bad_redirect(https_url, final):
    228                 log(f"[WARN] https for {url} redirected badly -> {final}; will try Wayback", quiet)
    229             else:
    230                 return (url, https_url, "upgraded")
    231 
    232     # 2) Try original
    233     log(f"[INFO] checking original {url}", quiet)
    234     ok, final, status, err = get_response_info(session, url, timeout, quiet)
    235     if ok:
    236         if is_bad_redirect(url, final):
    237             log(f"[WARN] {url} redirected badly -> {final}; treating as broken and trying Wayback", quiet)
    238         else:
    239             return (url, url, "ok")
    240 
    241     # 3) Broken or bad redirect -> query Wayback CDX and verify candidates
    242     log(f"[INFO] querying Wayback for {url}", quiet)
    243     candidates = query_wayback_candidates(session, url, timeout, quiet, limit=CDX_LIMIT)
    244     for ts, orig in candidates:
    245         wb = verify_wayback_snapshot(session, ts, orig, timeout, quiet)
    246         if wb:
    247             log(f"[WAYBACK] selected {wb} for {url}", quiet)
    248             return (url, wb, "wayback")
    249     # nothing found
    250     log(f"[INFO] no suitable Wayback snapshot found for {url}", quiet)
    251     return (url, None, "no_snapshot")
    252 
    253 # ---------------- Main CLI ----------------
    254 def main() -> None:
    255     ap = argparse.ArgumentParser(description="In-place fix: upgrade http->https where possible, replace broken links with verified Wayback snapshots. Retries + archive rate-limiting included.")
    256     ap.add_argument("file", help="Path to the file to edit in-place (backup saved as <file>.bak unless --no-backup).")
    257     ap.add_argument("--map", help="Optional CSV mapping original_url -> replacement_url for audit.")
    258     ap.add_argument("--concurrency", type=int, default=DEFAULT_CONCURRENCY, help="Number of worker threads (default %(default)s).")
    259     ap.add_argument("--timeout", type=int, default=DEFAULT_TIMEOUT, help="Network timeout seconds (default %(default)s).")
    260     ap.add_argument("--quiet", action="store_true", help="Suppress per-request progress output (still prints final summary and substitutions).")
    261     ap.add_argument("--no-backup", action="store_true", help="Do not write <file>.bak backup (use with caution).")
    262     args = ap.parse_args()
    263 
    264     filepath = args.file
    265     if not os.path.isfile(filepath):
    266         print(f"Error: file not found: {filepath}")
    267         raise SystemExit(1)
    268 
    269     with open(filepath, "r", encoding="utf-8") as f:
    270         text = f.read()
    271 
    272     found = [m.group("url") for m in URL_RE.finditer(text)]
    273     unique_urls = list(OrderedDict.fromkeys(found).keys())
    274 
    275     if not unique_urls:
    276         print("No URLs found in file. Nothing to do.")
    277         return
    278 
    279     print(f"Found {len(unique_urls)} unique URLs. Processing with concurrency={args.concurrency}...")
    280 
    281     session = requests.Session()
    282     session.headers.update({"User-Agent": USER_AGENT})
    283 
    284     mapping = {}  # orig -> (replacement or None, note)
    285     start = time.time()
    286 
    287     # Process URLs concurrently (non-archive network operations are also retried inside helpers).
    288     with concurrent.futures.ThreadPoolExecutor(max_workers=args.concurrency) as ex:
    289         futures = {ex.submit(process_single_url, url, session, args.timeout, args.quiet): url for url in unique_urls}
    290         for fut in concurrent.futures.as_completed(futures):
    291             orig = futures[fut]
    292             try:
    293                 o, r, note = fut.result()
    294             except Exception as exc:
    295                 log(f"[ERR] processing {orig} -> exception {exc}", args.quiet)
    296                 r = None
    297                 note = "no_snapshot"
    298             mapping[orig] = (r, note)
    299 
    300     elapsed = time.time() - start
    301     print(f"Processing finished in {elapsed:.1f}s.")
    302 
    303     # Apply replacements and collect substitution list
    304     substitutions = []
    305 
    306     def repl_func(m):
    307         u = m.group("url")
    308         repl, note = mapping.get(u, (None, None))
    309         if repl and repl != u:
    310             substitutions.append((u, repl, note))
    311             return repl
    312         return u
    313 
    314     new_text = URL_RE.sub(repl_func, text)
    315 
    316     # Backup and write
    317     if not args.no_backup:
    318         bak_path = filepath + ".bak"
    319         with open(bak_path, "w", encoding="utf-8") as bf:
    320             bf.write(text)
    321         print(f"Backup written to: {bak_path}")
    322     with open(filepath, "w", encoding="utf-8") as out:
    323         out.write(new_text)
    324     print(f"File updated in-place: {filepath}")
    325 
    326     # Optional mapping CSV
    327     if args.map:
    328         try:
    329             with open(args.map, "w", encoding="utf-8", newline="") as csvf:
    330                 w = csv.writer(csvf)
    331                 w.writerow(["original_url", "replacement_url", "note"])
    332                 for orig, (repl, note) in mapping.items():
    333                     w.writerow([orig, repl or "", note])
    334             print(f"Mapping CSV written to: {args.map}")
    335         except Exception as e:
    336             print(f"Warning: failed to write mapping CSV: {e}")
    337 
    338     # Counts
    339     n_total = len(unique_urls)
    340     n_upgraded = sum(1 for _, n in mapping.values() if n == "upgraded")
    341     n_wayback = sum(1 for _, n in mapping.values() if n == "wayback")
    342     n_unchanged = sum(1 for _, n in mapping.values() if n == "ok")
    343     n_no_snapshot = sum(1 for _, n in mapping.values() if n == "no_snapshot")
    344 
    345     print("\nSummary:")
    346     print(f"  total_urls: {n_total}")
    347     print(f"  upgraded_http_to_https: {n_upgraded}")
    348     print(f"  replaced_with_wayback: {n_wayback}")
    349     print(f"  unchanged (working as-is): {n_unchanged}")
    350     print(f"  no_snapshot (broken, left unchanged): {n_no_snapshot}")
    351 
    352     # Substitutions printed
    353     if substitutions:
    354         print("\nSubstitutions performed (original -> replacement) [note]:")
    355         for orig, repl, note in substitutions:
    356             print(f"- {orig}  ->  {repl}    [{note}]")
    357     else:
    358         print("\nNo substitutions performed.")
    359 
    360     # PROMINENTLY print broken URLs with no snapshot
    361     if n_no_snapshot > 0:
    362         print("\n" + "=" * 80)
    363         print("BROKEN URLS WITH NO INTERNET ARCHIVE SNAPSHOT FOUND (ATTENTION)".center(80))
    364         print("=" * 80)
    365         for orig, (repl, note) in mapping.items():
    366             if note == "no_snapshot":
    367                 print(f"- {orig}")
    368         print("=" * 80 + "\n")
    369     else:
    370         print("\nAll broken URLs had suitable Wayback snapshots (or were left unchanged because they worked).")
    371 
    372 if __name__ == "__main__":
    373     main()