htmlbase.py (1917B)
#!/usr/bin/env python

"""Resolve relative links in an HTML blob according to a base

Usable as a module (see rebase() and rebase_one()) or as a filter script
that reads HTML on stdin and writes the rebased HTML to stdout:

    python htmlbase.py BASEURL < input.html > output.html

The URL handling relies only on the standard library and is written to
import cleanly on both Python 2 and Python 3.  rebase() additionally needs
the third-party ``BeautifulSoup`` module to parse the HTML.
"""

import sys

try:
    # Python 3 location of the URL helpers.
    from urllib.parse import urljoin, urlparse
except ImportError:
    # Python 2 location of the same helpers.
    from urlparse import urljoin, urlparse

try:
    _text_type = unicode  # Python 2 text type
except NameError:
    _text_type = str  # Python 3: str is already the text type

# source: http://stackoverflow.com/questions/2725156/complete-list-of-html-tag-attributes-which-have-a-url-value
# TODO: "These aren't necessarily simple URLs ..."
# Each entry is a (tag name, attribute name) pair whose value is treated as
# a URL by rebase().
targets = [
    ('a', 'href'), ('applet', 'codebase'), ('area', 'href'), ('base', 'href'),
    ('blockquote', 'cite'), ('body', 'background'), ('del', 'cite'),
    ('form', 'action'), ('frame', 'longdesc'), ('frame', 'src'),
    ('head', 'profile'), ('iframe', 'longdesc'), ('iframe', 'src'),
    ('img', 'longdesc'), ('img', 'src'), ('img', 'usemap'), ('input', 'src'),
    ('input', 'usemap'), ('ins', 'cite'), ('link', 'href'),
    ('object', 'classid'), ('object', 'codebase'), ('object', 'data'),
    ('object', 'usemap'), ('q', 'cite'), ('script', 'src'), ('audio', 'src'),
    ('button', 'formaction'), ('command', 'icon'), ('embed', 'src'),
    ('html', 'manifest'), ('input', 'formaction'), ('source', 'src'),
    ('video', 'poster'), ('video', 'src'),
]


def _attrs_by_tag(pairs):
    """Group (tag, attribute) pairs into a {tag: [attribute, ...]} mapping"""

    grouped = {}
    for tag, attr in pairs:
        grouped.setdefault(tag, []).append(attr)
    return grouped


def rebase_one(base, url):
    """Rebase one url according to base

    ``url`` is joined onto ``base`` only when it has neither a scheme nor a
    network location (e.g. ``page.html``, ``/img/x.png``, ``#top``).  Any
    other value -- absolute URLs, ``mailto:`` links, protocol-relative
    ``//host/path`` references -- is returned unchanged.
    """

    parsed = urlparse(url)
    if parsed.scheme == parsed.netloc == '':
        return urljoin(base, url)
    return url


def rebase(base, data):
    """Rebase the HTML blob data according to base

    Parses ``data`` with BeautifulSoup, passes the value of every
    (tag, attribute) pair listed in ``targets`` through rebase_one(), and
    returns the resulting document as a text (unicode) string.  Elements
    that do not carry a given attribute are left alone.
    """

    # Imported here rather than at module level so that rebase_one() and
    # targets stay usable when the BeautifulSoup package is not installed.
    from BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup(data)
    url_attrs = _attrs_by_tag(targets)

    # Visit every element once and look its tag up in the mapping, instead
    # of running one full-tree search per (tag, attribute) pair.
    for element in soup.findAll(True):
        for attr in url_attrs.get(element.name, ()):
            try:
                url = element[attr]
            except KeyError:
                # Attribute not present on this element: nothing to rebase.
                continue
            element[attr] = rebase_one(base, url)
    return _text_type(soup)


def main():
    """Rebase the HTML read from stdin and write the result to stdout

    The base URL is taken from the first command-line argument.  When it is
    missing, a usage message is written to stderr and the process exits
    with status 1.
    """

    try:
        base = sys.argv[1]
    except IndexError:
        sys.stderr.write("Usage: %s BASEURL\n" % sys.argv[0])
        sys.exit(1)

    output = rebase(base, sys.stdin.read())
    if not isinstance(output, str):
        # Python 2: `output` is a unicode object.  Writing it as-is fails
        # with UnicodeEncodeError on non-ASCII text when stdout is a pipe
        # or file (no encoding set), so encode explicitly and fall back to
        # UTF-8 when stdout does not advertise an encoding.
        encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'
        output = output.encode(encoding)
    sys.stdout.write(output)
    sys.stdout.write("\n")


if __name__ == '__main__':
    main()