fix_links (14856B)
1 #!/usr/bin/env python3 2 # Chatgpt 3 """ 4 fix_links_inplace_rate_limited.py 5 6 In-place fixer with retries and Internet Archive rate-limiting. 7 8 Features: 9 - Upgrades http:// -> https:// when the https version actually loads 10 - If a URL doesn't load (or redirects to a clearly-nonsensical location), 11 replaces it with the latest verified Internet Archive (Wayback) snapshot. 12 - Verifies Wayback snapshots by actually loading them. 13 - Retries every network access up to 3 times (exponential backoff). 14 - Ensures at least 3 seconds between successive requests to web.archive.org (global rate limit). 15 - Edits the file in-place (creates <file>.bak unless --no-backup). 16 - Prints per-request progress (disable with --quiet). 17 - Shows substitutions and prominently lists URLs that are broken with no Wayback snapshot. 18 """ 19 20 from __future__ import annotations 21 22 import argparse 23 import concurrent.futures 24 import csv 25 import os 26 import re 27 import threading 28 import time 29 from collections import OrderedDict 30 from typing import Optional, Tuple, List 31 from urllib.parse import quote_plus, urlparse, urlunparse 32 33 import requests 34 35 # ---------------- Configuration ---------------- 36 DEFAULT_TIMEOUT = 12 37 DEFAULT_CONCURRENCY = 8 38 RETRIES = 3 39 ARCHIVE_MIN_INTERVAL = 3.0 # seconds between successive requests to web.archive.org 40 # A modern browser UA (Chrome on Windows) to appear like a regular browser 41 USER_AGENT = ( 42 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " 43 "AppleWebKit/537.36 (KHTML, like Gecko) " 44 "Chrome/119.0.0.0 Safari/537.36" 45 ) 46 URL_RE = re.compile(r"""(?P<url>https?://[^\s'"<>()\]\}]+)""", re.IGNORECASE) 47 CDX_LIMIT = 12 # how many CDX candidates to inspect (we'll verify each) 48 49 50 # ---------------- Globals for archive rate-limiting ---------------- 51 _archive_lock = threading.Lock() 52 _last_archive_request_time = 0.0 # epoch seconds 53 54 55 # ---------------- Utilities ---------------- 56 print_lock = threading.Lock() 57 58 59 def log(msg: str, quiet: bool) -> None: 60 if quiet: 61 return 62 with print_lock: 63 print(msg, flush=True) 64 65 66 def canonicalize_https(url: str) -> str: 67 p = urlparse(url) 68 if p.scheme.lower() == "http": 69 p = p._replace(scheme="https") 70 return urlunparse(p) 71 return url 72 73 74 def _sleep_backoff(attempt: int) -> None: 75 # simple exponential backoff: 1s, 2s, 4s... 76 time.sleep(1 * (2 ** (attempt - 1))) 77 78 79 def request_with_retries(session: requests.Session, url: str, timeout: int, quiet: bool, 80 archive: bool = False, retries: int = RETRIES) -> Optional[requests.Response]: 81 """ 82 GET the URL with up to `retries` attempts. If `archive=True`, enforce global 3s spacing 83 between successive archive requests (CDX or web.archive.org). 84 Returns the Response object on success (resp.raise_for_status not enforced here), or None on persistent failure. 85 """ 86 global _last_archive_request_time 87 attempt = 0 88 while attempt < retries: 89 attempt += 1 90 try: 91 if archive: 92 # global rate-limiting: ensure at least ARCHIVE_MIN_INTERVAL since last archive request 93 with _archive_lock: 94 now = time.time() 95 elapsed = now - _last_archive_request_time 96 if elapsed < ARCHIVE_MIN_INTERVAL: 97 wait = ARCHIVE_MIN_INTERVAL - elapsed 98 log(f"[ARCHIVE RATE] waiting {wait:.2f}s before archive request to avoid spamming", quiet) 99 time.sleep(wait) 100 # proceed and update timestamp (speculative: set it now so other threads will wait) 101 _last_archive_request_time = time.time() 102 log(f"[REQ] GET (attempt {attempt}/{retries}) {url}", quiet) 103 resp = session.get(url, timeout=timeout, allow_redirects=True, headers={"User-Agent": USER_AGENT}) 104 log(f"[RESP] {url} -> {getattr(resp, 'status_code', 'NO_RESP')} final={getattr(resp, 'url', '')}", quiet) 105 return resp 106 except requests.RequestException as e: 107 log(f"[ERR] {url} attempt {attempt} -> {e.__class__.__name__}: {e}", quiet) 108 if attempt < retries: 109 _sleep_backoff(attempt) 110 else: 111 return None 112 return None 113 114 115 def get_response_info(session: requests.Session, url: str, timeout: int, quiet: bool) -> Tuple[bool, Optional[str], Optional[int], Optional[str]]: 116 """ 117 Perform GET (with retries). Returns: 118 (ok_bool, final_url_or_none, status_code_or_none, error_message_or_none) 119 ok_bool is True when we obtained a response and resp.status_code < 400. 120 """ 121 resp = request_with_retries(session, url, timeout, quiet, archive=False) 122 if resp is None: 123 return False, None, None, "request_failed" 124 try: 125 final = resp.url 126 except Exception: 127 final = None 128 return (resp.status_code < 400), final, resp.status_code, None 129 130 131 def is_bad_redirect(original_url: str, final_url: Optional[str]) -> bool: 132 """ 133 Heuristic to detect 'clearly nonsensical' redirects: 134 - original had a non-root path but final is the root (path == '/' or index file) 135 - final netloc differs and final is root index -> suspect 136 """ 137 if not final_url: 138 return False 139 if original_url == final_url: 140 return False 141 po = urlparse(original_url) 142 pf = urlparse(final_url) 143 144 orig_path = po.path or "/" 145 final_path = pf.path or "/" 146 index_paths = {"/", "/index.html", "/index.htm", "/index.php"} 147 if (orig_path.lower() not in index_paths) and (final_path in index_paths): 148 return True 149 150 if pf.netloc and (pf.netloc != po.netloc) and (final_path in index_paths): 151 return True 152 153 return False 154 155 156 def query_wayback_candidates(session: requests.Session, url: str, timeout: int, quiet: bool, limit: int = CDX_LIMIT) -> List[Tuple[str, str]]: 157 """ 158 Query CDX API (rate-limited + retried) and return list of (timestamp, original) candidates (newest first). 159 """ 160 cdx_url = ( 161 "https://web.archive.org/cdx/search/cdx?" 162 "output=json" 163 "&fl=timestamp,original,statuscode" 164 "&filter=statuscode:200" 165 "&sort=reverse" 166 f"&limit={limit}" 167 f"&url={quote_plus(url)}" 168 ) 169 resp = request_with_retries(session, cdx_url, timeout, quiet, archive=True) 170 if resp is None: 171 log(f"[CDX] failed to fetch CDX results for {url}", quiet) 172 return [] 173 try: 174 data = resp.json() 175 except ValueError as e: 176 log(f"[ERR] CDX JSON decode {url} -> {e}", quiet) 177 return [] 178 if not isinstance(data, list) or len(data) < 2: 179 return [] 180 start_idx = 1 if isinstance(data[0], list) and any("timestamp" in str(x).lower() for x in data[0]) else 0 181 candidates = [] 182 for row in data[start_idx:]: 183 if len(row) >= 2: 184 ts = row[0] 185 orig = row[1] 186 candidates.append((ts, orig)) 187 log(f"[CDX] found {len(candidates)} candidates for {url}", quiet) 188 return candidates 189 190 191 def verify_wayback_snapshot(session: requests.Session, timestamp: str, original: str, timeout: int, quiet: bool) -> Optional[str]: 192 """ 193 Build a Wayback snapshot URL and GET it (rate-limited + retried) to verify it actually loads. 194 Returns the working wayback URL or None. 195 """ 196 wb_url = f"https://web.archive.org/web/{timestamp}/{original}" 197 resp = request_with_retries(session, wb_url, timeout, quiet, archive=True) 198 if resp is None: 199 return None 200 # Accept only if final URL is at web.archive.org and status < 400 201 final = getattr(resp, "url", None) 202 if resp.status_code < 400 and final and ("web.archive.org" in final): 203 return wb_url 204 return None 205 206 207 # ---------------- Core logic ---------------- 208 def process_single_url(url: str, session: requests.Session, timeout: int, quiet: bool) -> Tuple[str, Optional[str], str]: 209 """ 210 Process one URL and return (original, replacement_or_None, note) 211 note: "upgraded", "ok", "wayback", "no_snapshot" 212 """ 213 # Skip Wayback Machine URLs entirely 214 if url.lower().startswith("https://web.archive.org/"): 215 log(f"[SKIP] already a Wayback Machine URL -> {url}", quiet) 216 return (url, url, "ok") 217 218 parsed = urlparse(url) 219 scheme = parsed.scheme.lower() 220 221 # 1) If http: try https 222 if scheme == "http": 223 https_url = canonicalize_https(url) 224 log(f"[INFO] trying https for {url} -> {https_url}", quiet) 225 ok, final, status, err = get_response_info(session, https_url, timeout, quiet) 226 if ok: 227 if is_bad_redirect(https_url, final): 228 log(f"[WARN] https for {url} redirected badly -> {final}; will try Wayback", quiet) 229 else: 230 return (url, https_url, "upgraded") 231 232 # 2) Try original 233 log(f"[INFO] checking original {url}", quiet) 234 ok, final, status, err = get_response_info(session, url, timeout, quiet) 235 if ok: 236 if is_bad_redirect(url, final): 237 log(f"[WARN] {url} redirected badly -> {final}; treating as broken and trying Wayback", quiet) 238 else: 239 return (url, url, "ok") 240 241 # 3) Broken or bad redirect -> query Wayback CDX and verify candidates 242 log(f"[INFO] querying Wayback for {url}", quiet) 243 candidates = query_wayback_candidates(session, url, timeout, quiet, limit=CDX_LIMIT) 244 for ts, orig in candidates: 245 wb = verify_wayback_snapshot(session, ts, orig, timeout, quiet) 246 if wb: 247 log(f"[WAYBACK] selected {wb} for {url}", quiet) 248 return (url, wb, "wayback") 249 # nothing found 250 log(f"[INFO] no suitable Wayback snapshot found for {url}", quiet) 251 return (url, None, "no_snapshot") 252 253 # ---------------- Main CLI ---------------- 254 def main() -> None: 255 ap = argparse.ArgumentParser(description="In-place fix: upgrade http->https where possible, replace broken links with verified Wayback snapshots. Retries + archive rate-limiting included.") 256 ap.add_argument("file", help="Path to the file to edit in-place (backup saved as <file>.bak unless --no-backup).") 257 ap.add_argument("--map", help="Optional CSV mapping original_url -> replacement_url for audit.") 258 ap.add_argument("--concurrency", type=int, default=DEFAULT_CONCURRENCY, help="Number of worker threads (default %(default)s).") 259 ap.add_argument("--timeout", type=int, default=DEFAULT_TIMEOUT, help="Network timeout seconds (default %(default)s).") 260 ap.add_argument("--quiet", action="store_true", help="Suppress per-request progress output (still prints final summary and substitutions).") 261 ap.add_argument("--no-backup", action="store_true", help="Do not write <file>.bak backup (use with caution).") 262 args = ap.parse_args() 263 264 filepath = args.file 265 if not os.path.isfile(filepath): 266 print(f"Error: file not found: {filepath}") 267 raise SystemExit(1) 268 269 with open(filepath, "r", encoding="utf-8") as f: 270 text = f.read() 271 272 found = [m.group("url") for m in URL_RE.finditer(text)] 273 unique_urls = list(OrderedDict.fromkeys(found).keys()) 274 275 if not unique_urls: 276 print("No URLs found in file. Nothing to do.") 277 return 278 279 print(f"Found {len(unique_urls)} unique URLs. Processing with concurrency={args.concurrency}...") 280 281 session = requests.Session() 282 session.headers.update({"User-Agent": USER_AGENT}) 283 284 mapping = {} # orig -> (replacement or None, note) 285 start = time.time() 286 287 # Process URLs concurrently (non-archive network operations are also retried inside helpers). 288 with concurrent.futures.ThreadPoolExecutor(max_workers=args.concurrency) as ex: 289 futures = {ex.submit(process_single_url, url, session, args.timeout, args.quiet): url for url in unique_urls} 290 for fut in concurrent.futures.as_completed(futures): 291 orig = futures[fut] 292 try: 293 o, r, note = fut.result() 294 except Exception as exc: 295 log(f"[ERR] processing {orig} -> exception {exc}", args.quiet) 296 r = None 297 note = "no_snapshot" 298 mapping[orig] = (r, note) 299 300 elapsed = time.time() - start 301 print(f"Processing finished in {elapsed:.1f}s.") 302 303 # Apply replacements and collect substitution list 304 substitutions = [] 305 306 def repl_func(m): 307 u = m.group("url") 308 repl, note = mapping.get(u, (None, None)) 309 if repl and repl != u: 310 substitutions.append((u, repl, note)) 311 return repl 312 return u 313 314 new_text = URL_RE.sub(repl_func, text) 315 316 # Backup and write 317 if not args.no_backup: 318 bak_path = filepath + ".bak" 319 with open(bak_path, "w", encoding="utf-8") as bf: 320 bf.write(text) 321 print(f"Backup written to: {bak_path}") 322 with open(filepath, "w", encoding="utf-8") as out: 323 out.write(new_text) 324 print(f"File updated in-place: {filepath}") 325 326 # Optional mapping CSV 327 if args.map: 328 try: 329 with open(args.map, "w", encoding="utf-8", newline="") as csvf: 330 w = csv.writer(csvf) 331 w.writerow(["original_url", "replacement_url", "note"]) 332 for orig, (repl, note) in mapping.items(): 333 w.writerow([orig, repl or "", note]) 334 print(f"Mapping CSV written to: {args.map}") 335 except Exception as e: 336 print(f"Warning: failed to write mapping CSV: {e}") 337 338 # Counts 339 n_total = len(unique_urls) 340 n_upgraded = sum(1 for o, (r, n) in mapping.items() if n == "upgraded") 341 n_wayback = sum(1 for o, (r, n) in mapping.items() if n == "wayback") 342 n_unchanged = sum(1 for o, (r, n) in mapping.items() if n == "ok") 343 n_no_snapshot = sum(1 for o, (r, n) in mapping.items() if n == "no_snapshot") 344 345 print("\nSummary:") 346 print(f" total_urls: {n_total}") 347 print(f" upgraded_http_to_https: {n_upgraded}") 348 print(f" replaced_with_wayback: {n_wayback}") 349 print(f" unchanged (working as-is): {n_unchanged}") 350 print(f" no_snapshot (broken, left unchanged): {n_no_snapshot}") 351 352 # Substitutions printed 353 if substitutions: 354 print("\nSubstitutions performed (original -> replacement) [note]:") 355 for orig, repl, note in substitutions: 356 print(f"- {orig} -> {repl} [{note}]") 357 else: 358 print("\nNo substitutions performed.") 359 360 # PROMINENTLY print broken URLs with no snapshot 361 if n_no_snapshot > 0: 362 print("\n" + "=" * 80) 363 print("BROKEN URLS WITH NO INTERNET ARCHIVE SNAPSHOT FOUND (ATTENTION)".center(80)) 364 print("=" * 80) 365 for orig, (repl, note) in mapping.items(): 366 if note == "no_snapshot": 367 print(f"- {orig}") 368 print("=" * 80 + "\n") 369 else: 370 print("\nAll broken URLs had suitable Wayback snapshots (or were left unchanged because they worked).") 371 372 if __name__ == "__main__": 373 main()