mybin

my ~/bin
git clone https://a3nm.net/git/mybin/
Log | Files | Refs | README

htmlbase.py (1917B)


      1 #!/usr/bin/env python
      2 
      3 """Resolve relative links in an HTML blob according to a base"""
      4 
      5 from BeautifulSoup import BeautifulSoup
      6 import sys
      7 import urlparse
      8 
      9 # source: http://stackoverflow.com/questions/2725156/complete-list-of-html-tag-attributes-which-have-a-url-value
     10 # TODO: "These aren't necessarily simple URLs ..."
     11 targets = [
     12     ('a', 'href'), ('applet', 'codebase'), ('area', 'href'), ('base', 'href'),
     13     ('blockquote', 'cite'), ('body', 'background'), ('del', 'cite'),
     14     ('form', 'action'), ('frame', 'longdesc'), ('frame', 'src'),
     15     ('head', 'profile'), ('iframe', 'longdesc'), ('iframe', 'src'),
     16     ('img', 'longdesc'), ('img', 'src'), ('img', 'usemap'), ('input', 'src'),
     17     ('input', 'usemap'), ('ins', 'cite'), ('link', 'href'),
     18     ('object', 'classid'), ('object', 'codebase'), ('object', 'data'),
     19     ('object', 'usemap'), ('q', 'cite'), ('script', 'src'), ('audio', 'src'),
     20     ('button', 'formaction'), ('command', 'icon'), ('embed', 'src'),
     21     ('html', 'manifest'), ('input', 'formaction'), ('source', 'src'),
     22     ('video', 'poster'), ('video', 'src'),
     23 ]
     24 
     25 def rebase_one(base, url):
     26     """Rebase one url according to base"""
     27 
     28     parsed = urlparse.urlparse(url)
     29     if parsed.scheme == parsed.netloc == '':
     30         return urlparse.urljoin(base, url)
     31     else:
     32         return url
     33 
     34 def rebase(base, data):
     35     """Rebase the HTML blob data according to base"""
     36 
     37     soup = BeautifulSoup(data)
     38 
     39     for (tag, attr) in targets:
     40         for link in soup.findAll(tag):
     41             try:
     42                 url = link[attr]
     43             except KeyError:
     44                 pass
     45             else:
     46                 link[attr] = rebase_one(base, url)
     47     return unicode(soup)
     48 
     49 
     50 if __name__ == '__main__':
     51     try:
     52         base = sys.argv[1]
     53     except IndexError:
     54         print >> sys.stderr, "Usage: %s BASEURL" % sys.argv[0]
     55         sys.exit(1)
     56 
     57     data = sys.stdin.read()
     58     print rebase(base, data)
     59