commit 9831b58287dca45d75b87186c396c45b19278bb7
parent 2358676c716d7f0d2f6886336c9152d912f115c8
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Sat, 25 Oct 2025 14:02:54 +0200
improve script
Diffstat:
 fix_links | 192 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
1 file changed, 120 insertions(+), 72 deletions(-)
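
The change centers on the two mechanisms described in the new module docstring: every
network access is retried up to three times with exponential backoff, and requests to
web.archive.org are spaced at least three seconds apart via a lock-protected global
timestamp. The standalone sketch below shows the same pattern in isolation; the names
(MIN_INTERVAL, MAX_ATTEMPTS, fetch_with_retries) are illustrative and are not the
identifiers used in fix_links.

import threading
import time
from typing import Optional

import requests

MIN_INTERVAL = 3.0   # spacing between rate-limited requests, mirroring ARCHIVE_MIN_INTERVAL
MAX_ATTEMPTS = 3     # attempts per URL, mirroring RETRIES

_lock = threading.Lock()
_last_request = 0.0  # epoch seconds of the most recent rate-limited request


def fetch_with_retries(session: requests.Session, url: str,
                       timeout: float = 12.0,
                       rate_limited: bool = False) -> Optional[requests.Response]:
    """GET url with exponential backoff; optionally enforce a global minimum spacing."""
    global _last_request
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            if rate_limited:
                with _lock:
                    wait = MIN_INTERVAL - (time.time() - _last_request)
                    if wait > 0:
                        time.sleep(wait)
                    # update the timestamp before issuing the request so other threads also wait
                    _last_request = time.time()
            return session.get(url, timeout=timeout, allow_redirects=True)
        except requests.RequestException:
            if attempt < MAX_ATTEMPTS:
                time.sleep(2 ** (attempt - 1))  # 1s, 2s, 4s...
    return None

Note that the sleep happens while the lock is held, as in the patch itself: this serialises
archive traffic so concurrent worker threads cannot slip a request inside the three-second
window. Invocation of the script is unchanged, e.g. ./fix_links somefile.html --quiet.
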
diff --git a/fix_links b/fix_links
@@ -1,21 +1,22 @@
#!/usr/bin/env python3
-# chatgpt
-
+# Chatgpt
"""
-fix_links_inplace_verified.py
-
-In-place fixer:
- - upgrades http:// -> https:// when the https version actually loads
- - if a URL doesn't load (or redirects to a clearly-nonsensical location such as the site's root),
- replaces it with the latest working Internet Archive (Wayback) snapshot that actually loads
- (the script verifies each Wayback snapshot by requesting it)
- - edits the file in-place (creates <file>.bak unless --no-backup)
- - prints per-request progress (disable with --quiet)
- - prints list of substitutions at the end and detailed counts
-
-Dependencies: requests
-pip install requests
+fix_links_inplace_rate_limited.py
+
+In-place fixer with retries and Internet Archive rate-limiting.
+
+Features:
+ - Upgrades http:// -> https:// when the https version actually loads
+ - If a URL doesn't load (or redirects to a clearly-nonsensical location),
+ replaces it with the latest verified Internet Archive (Wayback) snapshot.
+ - Verifies Wayback snapshots by actually loading them.
+ - Retries every network access up to 3 times (exponential backoff).
+ - Ensures at least 3 seconds between successive requests to web.archive.org (global rate limit).
+ - Edits the file in-place (creates <file>.bak unless --no-backup).
+ - Prints per-request progress (disable with --quiet).
+ - Shows substitutions and prominently lists broken URLs for which no Wayback snapshot was found.
"""
+
from __future__ import annotations
import argparse
@@ -34,6 +35,8 @@ import requests
# ---------------- Configuration ----------------
DEFAULT_TIMEOUT = 12
DEFAULT_CONCURRENCY = 8
+RETRIES = 3
+ARCHIVE_MIN_INTERVAL = 3.0 # seconds between successive requests to web.archive.org
# A modern browser UA (Chrome on Windows) to appear like a regular browser
USER_AGENT = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
@@ -44,6 +47,11 @@ URL_RE = re.compile(r"""(?P<url>https?://[^\s'"<>()\]\}]+)""", re.IGNORECASE)
CDX_LIMIT = 12 # how many CDX candidates to inspect (we'll verify each)
+# ---------------- Globals for archive rate-limiting ----------------
+_archive_lock = threading.Lock()
+_last_archive_request_time = 0.0 # epoch seconds
+
+
# ---------------- Utilities ----------------
print_lock = threading.Lock()
@@ -63,32 +71,68 @@ def canonicalize_https(url: str) -> str:
return url
+def _sleep_backoff(attempt: int) -> None:
+ # simple exponential backoff: 1s, 2s, 4s...
+ time.sleep(1 * (2 ** (attempt - 1)))
+
+
+def request_with_retries(session: requests.Session, url: str, timeout: int, quiet: bool,
+ archive: bool = False, retries: int = RETRIES) -> Optional[requests.Response]:
+ """
+ GET the URL with up to `retries` attempts. If `archive=True`, enforce global 3s spacing
+ between successive archive requests (CDX or web.archive.org).
+ Returns the Response object on success (resp.raise_for_status not enforced here), or None on persistent failure.
+ """
+ global _last_archive_request_time
+ attempt = 0
+ while attempt < retries:
+ attempt += 1
+ try:
+ if archive:
+ # global rate-limiting: ensure at least ARCHIVE_MIN_INTERVAL since last archive request
+ with _archive_lock:
+ now = time.time()
+ elapsed = now - _last_archive_request_time
+ if elapsed < ARCHIVE_MIN_INTERVAL:
+ wait = ARCHIVE_MIN_INTERVAL - elapsed
+ log(f"[ARCHIVE RATE] waiting {wait:.2f}s before archive request to avoid spamming", quiet)
+ time.sleep(wait)
+ # proceed and update timestamp (speculative: set it now so other threads will wait)
+ _last_archive_request_time = time.time()
+ log(f"[REQ] GET (attempt {attempt}/{retries}) {url}", quiet)
+ resp = session.get(url, timeout=timeout, allow_redirects=True, headers={"User-Agent": USER_AGENT})
+ log(f"[RESP] {url} -> {getattr(resp, 'status_code', 'NO_RESP')} final={getattr(resp, 'url', '')}", quiet)
+ return resp
+ except requests.RequestException as e:
+ log(f"[ERR] {url} attempt {attempt} -> {e.__class__.__name__}: {e}", quiet)
+ if attempt < retries:
+ _sleep_backoff(attempt)
+ else:
+ return None
+ return None
+
+
def get_response_info(session: requests.Session, url: str, timeout: int, quiet: bool) -> Tuple[bool, Optional[str], Optional[int], Optional[str]]:
"""
- Perform GET request (follow redirects).
- Returns tuple:
+ Perform GET (with retries). Returns:
(ok_bool, final_url_or_none, status_code_or_none, error_message_or_none)
-
- ok_bool is True when response returned a status_code < 400.
- final_url is resp.url (useful to detect redirects).
+ ok_bool is True when we obtained a response and resp.status_code < 400.
"""
+ resp = request_with_retries(session, url, timeout, quiet, archive=False)
+ if resp is None:
+ return False, None, None, "request_failed"
try:
- log(f"[REQ] GET {url}", quiet)
- resp = session.get(url, timeout=timeout, allow_redirects=True, headers={"User-Agent": USER_AGENT})
final = resp.url
- log(f"[RESP] {url} -> {resp.status_code} final={final}", quiet)
- return (resp.status_code < 400), final, resp.status_code, None
- except requests.RequestException as e:
- log(f"[ERR] {url} -> {e.__class__.__name__}: {e}", quiet)
- return False, None, None, str(e)
+ except Exception:
+ final = None
+ return (resp.status_code < 400), final, resp.status_code, None
def is_bad_redirect(original_url: str, final_url: Optional[str]) -> bool:
"""
- Heuristic to detect 'clearly nonsensical' redirects such as:
- - original had a non-root path but final is the root (path == '/' or empty)
- - final netloc differs drastically (optional rule -- we only treat root-truncation as definite bad)
- Returns True when the redirect should be considered 'broken' and cause us to prefer a Wayback snapshot.
+ Heuristic to detect 'clearly nonsensical' redirects:
+ - original had a non-root path but final is the root (path == '/' or index file)
+ - final netloc differs and final is root index -> suspect
"""
if not final_url:
return False
@@ -97,16 +141,12 @@ def is_bad_redirect(original_url: str, final_url: Optional[str]) -> bool:
po = urlparse(original_url)
pf = urlparse(final_url)
- # If original had a meaningful path (not root/index) and final is root -> bad
orig_path = po.path or "/"
final_path = pf.path or "/"
- # consider common index filenames as not 'meaningful path'
index_paths = {"/", "/index.html", "/index.htm", "/index.php"}
if (orig_path.lower() not in index_paths) and (final_path in index_paths):
- # Example: http://domain/some/page -> http://domain/
return True
- # If final netloc differs and final is root of a different domain, treat as suspect.
if pf.netloc and (pf.netloc != po.netloc) and (final_path in index_paths):
return True
@@ -115,8 +155,7 @@ def is_bad_redirect(original_url: str, final_url: Optional[str]) -> bool:
def query_wayback_candidates(session: requests.Session, url: str, timeout: int, quiet: bool, limit: int = CDX_LIMIT) -> List[Tuple[str, str]]:
"""
- Query CDX and return a list of (timestamp, original) candidates (newest first).
- Will return an empty list on error.
+ Query CDX API (rate-limited + retried) and return list of (timestamp, original) candidates (newest first).
"""
cdx_url = (
"https://web.archive.org/cdx/search/cdx?"
@@ -127,43 +166,41 @@ def query_wayback_candidates(session: requests.Session, url: str, timeout: int,
f"&limit={limit}"
f"&url={quote_plus(url)}"
)
- try:
- log(f"[REQ] CDX {cdx_url}", quiet)
- r = session.get(cdx_url, timeout=timeout, headers={"User-Agent": USER_AGENT})
- r.raise_for_status()
- data = r.json()
- if not isinstance(data, list) or len(data) < 2:
- return []
- # skip header if present
- start_idx = 1 if isinstance(data[0], list) and any("timestamp" in str(x).lower() for x in data[0]) else 0
- candidates = []
- for row in data[start_idx:]:
- if len(row) >= 2:
- ts = row[0]
- orig = row[1]
- candidates.append((ts, orig))
- log(f"[CDX] found {len(candidates)} candidates for {url}", quiet)
- return candidates
- except requests.RequestException as e:
- log(f"[ERR] CDX {url} -> {e.__class__.__name__}: {e}", quiet)
+ resp = request_with_retries(session, cdx_url, timeout, quiet, archive=True)
+ if resp is None:
+ log(f"[CDX] failed to fetch CDX results for {url}", quiet)
return []
+ try:
+ data = resp.json()
except ValueError as e:
log(f"[ERR] CDX JSON decode {url} -> {e}", quiet)
return []
+ if not isinstance(data, list) or len(data) < 2:
+ return []
+ start_idx = 1 if isinstance(data[0], list) and any("timestamp" in str(x).lower() for x in data[0]) else 0
+ candidates = []
+ for row in data[start_idx:]:
+ if len(row) >= 2:
+ ts = row[0]
+ orig = row[1]
+ candidates.append((ts, orig))
+ log(f"[CDX] found {len(candidates)} candidates for {url}", quiet)
+ return candidates
def verify_wayback_snapshot(session: requests.Session, timestamp: str, original: str, timeout: int, quiet: bool) -> Optional[str]:
"""
- Build a Wayback snapshot URL and GET it to verify it actually loads (status < 400).
- Returns the working wayback URL (string) or None.
+ Build a Wayback snapshot URL and GET it (rate-limited + retried) to verify it actually loads.
+ Returns the working wayback URL or None.
"""
wb_url = f"https://web.archive.org/web/{timestamp}/{original}"
- ok, final, status, err = get_response_info(session, wb_url, timeout, quiet)
- # Accept when request returned status < 400 and final url is at web.archive.org
- if ok and final and ("web.archive.org" in final):
+ resp = request_with_retries(session, wb_url, timeout, quiet, archive=True)
+ if resp is None:
+ return None
+ # Accept only if final URL is at web.archive.org and status < 400
+ final = getattr(resp, "url", None)
+ if resp.status_code < 400 and final and ("web.archive.org" in final):
return wb_url
- # Some snapshots might redirect to a different archived representation,
- # but we require final to be on web.archive.org and status < 400.
return None
@@ -171,9 +208,13 @@ def verify_wayback_snapshot(session: requests.Session, timestamp: str, original:
def process_single_url(url: str, session: requests.Session, timeout: int, quiet: bool) -> Tuple[str, Optional[str], str]:
"""
Process one URL and return (original, replacement_or_None, note)
- note is one of: "upgraded" (http->https), "ok" (left unchanged), "wayback" (replaced by wayback),
- "no_snapshot" (broken, no wayback found)
+ note: "upgraded", "ok", "wayback", "no_snapshot"
"""
+ # Skip Wayback Machine URLs entirely
+ if url.lower().startswith("https://web.archive.org/"):
+ log(f"[SKIP] already a Wayback Machine URL -> {url}", quiet)
+ return (url, url, "ok")
+
parsed = urlparse(url)
scheme = parsed.scheme.lower()
@@ -183,10 +224,8 @@ def process_single_url(url: str, session: requests.Session, timeout: int, quiet:
log(f"[INFO] trying https for {url} -> {https_url}", quiet)
ok, final, status, err = get_response_info(session, https_url, timeout, quiet)
if ok:
- # If it loaded but redirected to something meaningless, treat as failure
if is_bad_redirect(https_url, final):
log(f"[WARN] https for {url} redirected badly -> {final}; will try Wayback", quiet)
- # fallthrough to next checks (do not accept https)
else:
return (url, https_url, "upgraded")
@@ -196,7 +235,6 @@ def process_single_url(url: str, session: requests.Session, timeout: int, quiet:
if ok:
if is_bad_redirect(url, final):
log(f"[WARN] {url} redirected badly -> {final}; treating as broken and trying Wayback", quiet)
- # treat as broken, go to wayback flow
else:
return (url, url, "ok")
@@ -212,12 +250,9 @@ def process_single_url(url: str, session: requests.Session, timeout: int, quiet:
log(f"[INFO] no suitable Wayback snapshot found for {url}", quiet)
return (url, None, "no_snapshot")
-
# ---------------- Main CLI ----------------
def main() -> None:
- import argparse
-
- ap = argparse.ArgumentParser(description="In-place fix: upgrade http->https when possible, replace broken links with verified Wayback snapshots.")
+ ap = argparse.ArgumentParser(description="In-place fix: upgrade http->https where possible, replace broken links with verified Wayback snapshots. Retries + archive rate-limiting included.")
ap.add_argument("file", help="Path to the file to edit in-place (backup saved as <file>.bak unless --no-backup).")
ap.add_argument("--map", help="Optional CSV mapping original_url -> replacement_url for audit.")
ap.add_argument("--concurrency", type=int, default=DEFAULT_CONCURRENCY, help="Number of worker threads (default %(default)s).")
@@ -249,6 +284,7 @@ def main() -> None:
mapping = {} # orig -> (replacement or None, note)
start = time.time()
+ # Process URLs concurrently (non-archive network operations are also retried inside helpers).
with concurrent.futures.ThreadPoolExecutor(max_workers=args.concurrency) as ex:
futures = {ex.submit(process_single_url, url, session, args.timeout, args.quiet): url for url in unique_urls}
for fut in concurrent.futures.as_completed(futures):
@@ -321,5 +357,17 @@ def main() -> None:
else:
print("\nNo substitutions performed.")
+ # PROMINENTLY print broken URLs with no snapshot
+ if n_no_snapshot > 0:
+ print("\n" + "=" * 80)
+ print("BROKEN URLS WITH NO INTERNET ARCHIVE SNAPSHOT FOUND (ATTENTION)".center(80))
+ print("=" * 80)
+ for orig, (repl, note) in mapping.items():
+ if note == "no_snapshot":
+ print(f"- {orig}")
+ print("=" * 80 + "\n")
+ else:
+ print("\nAll broken URLs had suitable Wayback snapshots (or were left unchanged because they worked).")
+
if __name__ == "__main__":
main()
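
For reference, the "clearly nonsensical redirect" heuristic applied by is_bad_redirect
(kept by this patch, with a shortened docstring) can be exercised on its own. The sketch
below restates the same two rules with made-up example URLs; it is an illustration, not
part of the patch.

from urllib.parse import urlparse

INDEX_PATHS = {"/", "/index.html", "/index.htm", "/index.php"}

def looks_like_bad_redirect(original_url: str, final_url: str) -> bool:
    """True when a URL with a meaningful path ends up at a bare site root."""
    po, pf = urlparse(original_url), urlparse(final_url)
    orig_path = po.path or "/"
    final_path = pf.path or "/"
    if orig_path.lower() not in INDEX_PATHS and final_path in INDEX_PATHS:
        return True   # e.g. http://host/some/page -> http://host/
    if pf.netloc and pf.netloc != po.netloc and final_path in INDEX_PATHS:
        return True   # landed on a different domain's root: also suspect
    return False

print(looks_like_bad_redirect("http://example.com/docs/page.html", "http://example.com/"))  # True
print(looks_like_bad_redirect("http://example.com/docs/page.html", "https://example.com/docs/page.html"))  # False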