commit 2358676c716d7f0d2f6886336c9152d912f115c8
parent 67d7ca3c6bdddebaf6bae53083a8a9fba1b235f2
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Sat, 25 Oct 2025 13:48:25 +0200
fix_links script
Diffstat:
 fix_links | 325 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 325 insertions(+), 0 deletions(-)
diff --git a/fix_links b/fix_links
@@ -0,0 +1,325 @@
+#!/usr/bin/env python3
+# chatgpt
+
+"""
+fix_links
+
+In-place fixer:
+ - upgrades http:// -> https:// when the https version actually loads
+ - if a URL doesn't load (or redirects to a clearly-nonsensical location such as the site's root),
+ replaces it with the latest working Internet Archive (Wayback) snapshot that actually loads
+ (the script verifies each Wayback snapshot by requesting it)
+ - edits the file in-place (creates <file>.bak unless --no-backup)
+ - prints per-request progress (disable with --quiet)
+ - prints the list of substitutions and detailed counts at the end
+
+Dependencies: requests
+pip install requests
+"""
+from __future__ import annotations
+
+import argparse
+import concurrent.futures
+import csv
+import os
+import re
+import threading
+import time
+from collections import OrderedDict
+from typing import Optional, Tuple, List
+from urllib.parse import quote_plus, urlparse, urlunparse
+
+import requests
+
+# ---------------- Configuration ----------------
+DEFAULT_TIMEOUT = 12
+DEFAULT_CONCURRENCY = 8
+# A modern browser UA (Chrome on Windows) so requests look like they come from a regular browser
+USER_AGENT = (
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
+ "Chrome/119.0.0.0 Safari/537.36"
+)
+URL_RE = re.compile(r"""(?P<url>https?://[^\s'"<>()\]\}]+)""", re.IGNORECASE)
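+# For example, in the text '<a href="http://example.com/page">see here</a>' the
+# pattern captures exactly "http://example.com/page" (quotes, angle brackets and
+# whitespace terminate the match).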
+CDX_LIMIT = 12 # how many CDX candidates to inspect (we'll verify each)
+
+
+# ---------------- Utilities ----------------
+print_lock = threading.Lock()
+
+
+def log(msg: str, quiet: bool) -> None:
+ if quiet:
+ return
+ with print_lock:
+ print(msg, flush=True)
+
+
+def canonicalize_https(url: str) -> str:
+    """Return the https:// version of an http:// URL; other URLs are returned unchanged."""
+    p = urlparse(url)
+    if p.scheme.lower() == "http":
+        p = p._replace(scheme="https")
+        return urlunparse(p)
+    return url
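+# Example (illustrative): canonicalize_https("http://example.com/a?b=1") returns
+# "https://example.com/a?b=1", while an https:// or ftp:// URL is returned as-is.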
+
+
+def get_response_info(session: requests.Session, url: str, timeout: int, quiet: bool) -> Tuple[bool, Optional[str], Optional[int], Optional[str]]:
+ """
+ Perform GET request (follow redirects).
+ Returns tuple:
+ (ok_bool, final_url_or_none, status_code_or_none, error_message_or_none)
+
+ ok_bool is True when response returned a status_code < 400.
+ final_url is resp.url (useful to detect redirects).
+ """
+ try:
+ log(f"[REQ] GET {url}", quiet)
+ resp = session.get(url, timeout=timeout, allow_redirects=True, headers={"User-Agent": USER_AGENT})
+ final = resp.url
+ log(f"[RESP] {url} -> {resp.status_code} final={final}", quiet)
+ return (resp.status_code < 400), final, resp.status_code, None
+ except requests.RequestException as e:
+ log(f"[ERR] {url} -> {e.__class__.__name__}: {e}", quiet)
+ return False, None, None, str(e)
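+# For instance (values hypothetical), a page that loads normally might yield
+# (True, "https://example.com/page", 200, None), while a connection failure
+# yields (False, None, None, "<error message>").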
+
+
+def is_bad_redirect(original_url: str, final_url: Optional[str]) -> bool:
+ """
+    Heuristic to detect 'clearly nonsensical' redirects, namely:
+      - the original URL had a non-root, non-index path but the final URL is a root/index page
+      - the final URL is a root/index page on a different domain
+    Returns True when the redirect should be considered 'broken', in which case a
+    Wayback snapshot is preferred.
+ """
+ if not final_url:
+ return False
+ if original_url == final_url:
+ return False
+ po = urlparse(original_url)
+ pf = urlparse(final_url)
+
+ # If original had a meaningful path (not root/index) and final is root -> bad
+ orig_path = po.path or "/"
+ final_path = pf.path or "/"
+ # consider common index filenames as not 'meaningful path'
+ index_paths = {"/", "/index.html", "/index.htm", "/index.php"}
+ if (orig_path.lower() not in index_paths) and (final_path in index_paths):
+ # Example: http://domain/some/page -> http://domain/
+ return True
+
+ # If final netloc differs and final is root of a different domain, treat as suspect.
+ if pf.netloc and (pf.netloc != po.netloc) and (final_path in index_paths):
+ return True
+
+ return False
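+# Illustrative cases (URLs hypothetical):
+#   is_bad_redirect("http://example.com/some/page", "https://example.com/")  -> True
+#   is_bad_redirect("http://example.com/", "https://example.com/")           -> False
+#   is_bad_redirect("https://example.com/a", "https://example.com/b")        -> False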
+
+
+def query_wayback_candidates(session: requests.Session, url: str, timeout: int, quiet: bool, limit: int = CDX_LIMIT) -> List[Tuple[str, str]]:
+ """
+ Query CDX and return a list of (timestamp, original) candidates (newest first).
+ Will return an empty list on error.
+ """
+ cdx_url = (
+ "https://web.archive.org/cdx/search/cdx?"
+ "output=json"
+ "&fl=timestamp,original,statuscode"
+ "&filter=statuscode:200"
+ "&sort=reverse"
+ f"&limit={limit}"
+ f"&url={quote_plus(url)}"
+ )
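+    # With output=json the CDX API typically returns a list of rows where the
+    # first row may be a header, e.g. (timestamp and URL hypothetical):
+    #   [["timestamp", "original", "statuscode"],
+    #    ["20240101123456", "http://example.com/page", "200"]]
+    # The header row, when present, is skipped below.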
+ try:
+ log(f"[REQ] CDX {cdx_url}", quiet)
+ r = session.get(cdx_url, timeout=timeout, headers={"User-Agent": USER_AGENT})
+ r.raise_for_status()
+ data = r.json()
+ if not isinstance(data, list) or len(data) < 2:
+ return []
+ # skip header if present
+ start_idx = 1 if isinstance(data[0], list) and any("timestamp" in str(x).lower() for x in data[0]) else 0
+ candidates = []
+ for row in data[start_idx:]:
+ if len(row) >= 2:
+ ts = row[0]
+ orig = row[1]
+ candidates.append((ts, orig))
+ log(f"[CDX] found {len(candidates)} candidates for {url}", quiet)
+ return candidates
+ except requests.RequestException as e:
+ log(f"[ERR] CDX {url} -> {e.__class__.__name__}: {e}", quiet)
+ return []
+ except ValueError as e:
+ log(f"[ERR] CDX JSON decode {url} -> {e}", quiet)
+ return []
+
+
+def verify_wayback_snapshot(session: requests.Session, timestamp: str, original: str, timeout: int, quiet: bool) -> Optional[str]:
+ """
+ Build a Wayback snapshot URL and GET it to verify it actually loads (status < 400).
+ Returns the working wayback URL (string) or None.
+ """
+ wb_url = f"https://web.archive.org/web/{timestamp}/{original}"
+ ok, final, status, err = get_response_info(session, wb_url, timeout, quiet)
+ # Accept when request returned status < 400 and final url is at web.archive.org
+ if ok and final and ("web.archive.org" in final):
+ return wb_url
+ # Some snapshots might redirect to a different archived representation,
+ # but we require final to be on web.archive.org and status < 400.
+ return None
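+# A verified snapshot URL looks like (timestamp hypothetical):
+#   https://web.archive.org/web/20240101123456/http://example.com/page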
+
+
+# ---------------- Core logic ----------------
+def process_single_url(url: str, session: requests.Session, timeout: int, quiet: bool) -> Tuple[str, Optional[str], str]:
+ """
+ Process one URL and return (original, replacement_or_None, note)
+ note is one of: "upgraded" (http->https), "ok" (left unchanged), "wayback" (replaced by wayback),
+ "no_snapshot" (broken, no wayback found)
+ """
+ parsed = urlparse(url)
+ scheme = parsed.scheme.lower()
+
+ # 1) If http: try https
+ if scheme == "http":
+ https_url = canonicalize_https(url)
+ log(f"[INFO] trying https for {url} -> {https_url}", quiet)
+ ok, final, status, err = get_response_info(session, https_url, timeout, quiet)
+ if ok:
+ # If it loaded but redirected to something meaningless, treat as failure
+ if is_bad_redirect(https_url, final):
+ log(f"[WARN] https for {url} redirected badly -> {final}; will try Wayback", quiet)
+ # fallthrough to next checks (do not accept https)
+ else:
+ return (url, https_url, "upgraded")
+
+ # 2) Try original
+ log(f"[INFO] checking original {url}", quiet)
+ ok, final, status, err = get_response_info(session, url, timeout, quiet)
+ if ok:
+ if is_bad_redirect(url, final):
+ log(f"[WARN] {url} redirected badly -> {final}; treating as broken and trying Wayback", quiet)
+ # treat as broken, go to wayback flow
+ else:
+ return (url, url, "ok")
+
+ # 3) Broken or bad redirect -> query Wayback CDX and verify candidates
+ log(f"[INFO] querying Wayback for {url}", quiet)
+ candidates = query_wayback_candidates(session, url, timeout, quiet, limit=CDX_LIMIT)
+ for ts, orig in candidates:
+ wb = verify_wayback_snapshot(session, ts, orig, timeout, quiet)
+ if wb:
+ log(f"[WAYBACK] selected {wb} for {url}", quiet)
+ return (url, wb, "wayback")
+ # nothing found
+ log(f"[INFO] no suitable Wayback snapshot found for {url}", quiet)
+ return (url, None, "no_snapshot")
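+# Possible return values (URLs and timestamp hypothetical):
+#   ("http://example.com/a", "https://example.com/a", "upgraded")
+#   ("https://example.com/ok", "https://example.com/ok", "ok")
+#   ("http://example.com/gone",
+#    "https://web.archive.org/web/20240101123456/http://example.com/gone", "wayback")
+#   ("http://example.com/lost", None, "no_snapshot")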
+
+
+# ---------------- Main CLI ----------------
+def main() -> None:
+ ap = argparse.ArgumentParser(description="In-place fix: upgrade http->https when possible, replace broken links with verified Wayback snapshots.")
+ ap.add_argument("file", help="Path to the file to edit in-place (backup saved as <file>.bak unless --no-backup).")
+ ap.add_argument("--map", help="Optional CSV mapping original_url -> replacement_url for audit.")
+ ap.add_argument("--concurrency", type=int, default=DEFAULT_CONCURRENCY, help="Number of worker threads (default %(default)s).")
+ ap.add_argument("--timeout", type=int, default=DEFAULT_TIMEOUT, help="Network timeout seconds (default %(default)s).")
+ ap.add_argument("--quiet", action="store_true", help="Suppress per-request progress output (still prints final summary and substitutions).")
+ ap.add_argument("--no-backup", action="store_true", help="Do not write <file>.bak backup (use with caution).")
+ args = ap.parse_args()
+
+ filepath = args.file
+ if not os.path.isfile(filepath):
+ print(f"Error: file not found: {filepath}")
+ raise SystemExit(1)
+
+ with open(filepath, "r", encoding="utf-8") as f:
+ text = f.read()
+
+ found = [m.group("url") for m in URL_RE.finditer(text)]
+ unique_urls = list(OrderedDict.fromkeys(found).keys())
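+    # e.g. if "https://example.com/a" (hypothetical) occurs three times in the file,
+    # it is checked once here and every occurrence is rewritten consistently by the
+    # substitution pass below.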
+
+ if not unique_urls:
+ print("No URLs found in file. Nothing to do.")
+ return
+
+ print(f"Found {len(unique_urls)} unique URLs. Processing with concurrency={args.concurrency}...")
+
+ session = requests.Session()
+ session.headers.update({"User-Agent": USER_AGENT})
+
+ mapping = {} # orig -> (replacement or None, note)
+ start = time.time()
+
+ with concurrent.futures.ThreadPoolExecutor(max_workers=args.concurrency) as ex:
+ futures = {ex.submit(process_single_url, url, session, args.timeout, args.quiet): url for url in unique_urls}
+ for fut in concurrent.futures.as_completed(futures):
+ orig = futures[fut]
+ try:
+ o, r, note = fut.result()
+ except Exception as exc:
+ log(f"[ERR] processing {orig} -> exception {exc}", args.quiet)
+ r = None
+ note = "no_snapshot"
+ mapping[orig] = (r, note)
+
+ elapsed = time.time() - start
+ print(f"Processing finished in {elapsed:.1f}s.")
+
+ # Apply replacements and collect substitution list
+ substitutions = []
+
+ def repl_func(m):
+ u = m.group("url")
+ repl, note = mapping.get(u, (None, None))
+ if repl and repl != u:
+ substitutions.append((u, repl, note))
+ return repl
+ return u
+
+ new_text = URL_RE.sub(repl_func, text)
+
+ # Backup and write
+ if not args.no_backup:
+ bak_path = filepath + ".bak"
+ with open(bak_path, "w", encoding="utf-8") as bf:
+ bf.write(text)
+ print(f"Backup written to: {bak_path}")
+ with open(filepath, "w", encoding="utf-8") as out:
+ out.write(new_text)
+ print(f"File updated in-place: {filepath}")
+
+ # Optional mapping CSV
+ if args.map:
+ try:
+ with open(args.map, "w", encoding="utf-8", newline="") as csvf:
+ w = csv.writer(csvf)
+ w.writerow(["original_url", "replacement_url", "note"])
+ for orig, (repl, note) in mapping.items():
+ w.writerow([orig, repl or "", note])
+ print(f"Mapping CSV written to: {args.map}")
+ except Exception as e:
+ print(f"Warning: failed to write mapping CSV: {e}")
+
+ # Counts
+ n_total = len(unique_urls)
+ n_upgraded = sum(1 for o, (r, n) in mapping.items() if n == "upgraded")
+ n_wayback = sum(1 for o, (r, n) in mapping.items() if n == "wayback")
+ n_unchanged = sum(1 for o, (r, n) in mapping.items() if n == "ok")
+ n_no_snapshot = sum(1 for o, (r, n) in mapping.items() if n == "no_snapshot")
+
+ print("\nSummary:")
+ print(f" total_urls: {n_total}")
+ print(f" upgraded_http_to_https: {n_upgraded}")
+ print(f" replaced_with_wayback: {n_wayback}")
+ print(f" unchanged (working as-is): {n_unchanged}")
+ print(f" no_snapshot (broken, left unchanged): {n_no_snapshot}")
+
+ # Substitutions printed
+ if substitutions:
+ print("\nSubstitutions performed (original -> replacement) [note]:")
+ for orig, repl, note in substitutions:
+ print(f"- {orig} -> {repl} [{note}]")
+ else:
+ print("\nNo substitutions performed.")
+
+if __name__ == "__main__":
+ main()