rdupes

recursively search for duplicate files and folders
git clone https://a3nm.net/git/rdupes/

rdupes.py (1008B)


#!/usr/bin/env python3

# TODO don't show file duplicates under duplicate folders
import os
import sys
import hashlib

def register(f, h, s):
  # print one tab-separated line per entry: hash, size, path
  print("%s\t%s\t%s" % (h, s, f))

def hashfile(f):
  # hash the file contents in 16 MiB chunks to bound memory use
  sha1 = hashlib.sha1()
  with open(f, 'rb') as fp:
    while True:
      buf = fp.read(16*1024*1024)
      if not buf:
        break
      sha1.update(buf)
  return sha1.hexdigest()

def prefix(p, s):
  return [os.path.join(p, x) for x in s]

def explore(d):
  size = 1 # sort folders before files
  hashes = []
  #print("explore %s" % d)
  files = os.listdir(d)
  for f in prefix(d, files):
    try:
      if os.path.isdir(f):
        h, s = explore(f)
      else:
        s = os.stat(f).st_size
        h = hashfile(f)
      register(f, h, s)
      hashes.append(h)
      size += s
    except (OSError, IOError):
      continue
  # a folder's hash is derived from the sorted hashes of its children,
  # so folders with identical contents get identical hashes
  sha1 = hashlib.sha1()
  hashes.sort()
  sha1.update(('d' + '-'.join(hashes)).encode('utf-8'))
  h = sha1.hexdigest()
  return h, size

for d in sys.argv[1:]:
  explore(d)
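
rdupes.py itself only prints one hash/size/path line per entry; the duplicates are
the entries that share a hash, which still have to be grouped afterwards. A minimal
post-processing sketch (not part of the repository; the name find_dupes.py and the
output layout are illustrative) that reads rdupes.py's output on standard input and
prints the groups of paths sharing a hash, largest first:

#!/usr/bin/env python3
# find_dupes.py (hypothetical helper): group rdupes.py output lines of the
# form "hash<TAB>size<TAB>path" by hash and report groups with more than
# one path, largest first.
import sys
from collections import defaultdict

groups = defaultdict(list)
sizes = {}
for line in sys.stdin:
  if not line.strip():
    continue
  h, size, path = line.rstrip('\n').split('\t', 2)
  groups[h].append(path)
  sizes[h] = int(size)

for h in sorted(groups, key=lambda x: sizes[x], reverse=True):
  paths = groups[h]
  if len(paths) > 1:
    print("size %s:" % sizes[h])
    for p in paths:
      print("  %s" % p)
    print()

For example, ./rdupes.py ~/photos ~/backup | ./find_dupes.py would list entries
present in both trees. Note that duplicated folders are reported as a whole in
addition to each duplicated file underneath them, which is what the TODO note at
the top of the script refers to.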