htmlrebase -- relative link resolution in HTML according to a given base URL
I found this code lying around, so I'm dumping it here in case someone needs it. It takes an HTML file on standard input and a URL as a command-line argument and produces the HTML file on standard output where all relative links have been resolved according to the given base URL.
#!/usr/bin/env python """Resolve relative links in an HTML blob according to a base""" from BeautifulSoup import BeautifulSoup import sys import urlparse # source: http://stackoverflow.com/q/2725156/414272 # TODO: "These aren't necessarily simple URLs ..." targets = [ ('a', 'href'), ('applet', 'codebase'), ('area', 'href'), ('base', 'href'), ('blockquote', 'cite'), ('body', 'background'), ('del', 'cite'), ('form', 'action'), ('frame', 'longdesc'), ('frame', 'src'), ('head', 'profile'), ('iframe', 'longdesc'), ('iframe', 'src'), ('img', 'longdesc'), ('img', 'src'), ('img', 'usemap'), ('input', 'src'), ('input', 'usemap'), ('ins', 'cite'), ('link', 'href'), ('object', 'classid'), ('object', 'codebase'), ('object', 'data'), ('object', 'usemap'), ('q', 'cite'), ('script', 'src'), ('audio', 'src'), ('button', 'formaction'), ('command', 'icon'), ('embed', 'src'), ('html', 'manifest'), ('input', 'formaction'), ('source', 'src'), ('video', 'poster'), ('video', 'src'), ] def rebase_one(base, url): """Rebase one url according to base""" parsed = urlparse.urlparse(url) if parsed.scheme == parsed.netloc == '': return urlparse.urljoin(base, url) else: return url def rebase(base, data): """Rebase the HTML blob data according to base""" soup = BeautifulSoup(data) for (tag, attr) in targets: for link in soup.findAll(tag): try: url = link[attr] except KeyError: pass else: link[attr] = rebase_one(base, url) return unicode(soup) if __name__ == '__main__': try: base = sys.argv[1] except IndexError: print >> sys.stderr, "Usage: %s BASEURL" % sys.argv[0] sys.exit(1) data = sys.stdin.read() print rebase(base, data)