rdupes.py (1008B)
#!/usr/bin/env python
"""Print a hash/size/path line for every file and folder under the given roots.

Each output line has the form ``<sha1>\t<size>\t<path>``. Entries that share
the same hash are candidate duplicates.
"""

# TODO don't show file duplicates under duplicate folders
import os
import sys
import hashlib

# Number of bytes read from a file per iteration while hashing.
CHUNK_SIZE = 16 * 1024 * 1024


def register(f, h, s):
    """Write one tab-separated ``hash, size, path`` record to stdout.

    f -- path of the file or folder
    h -- hex digest computed for it
    s -- size computed for it
    """
    # A single parenthesised argument prints identically on Python 2 and 3.
    print("%s\t%s\t%s" % (h, s, f))


def hashfile(f):
    """Return the hex SHA-1 digest of the contents of the file at path f.

    The file is read in CHUNK_SIZE pieces so it is never loaded whole.
    Errors from opening or reading the file propagate to the caller.
    """
    sha1 = hashlib.sha1()
    with open(f, 'rb') as fp:
        # iter() with a sentinel stops at the first empty read (EOF).
        for buf in iter(lambda: fp.read(CHUNK_SIZE), b''):
            sha1.update(buf)
    return sha1.hexdigest()


def prefix(p, s):
    """Return a list with every name in s joined onto the path p."""
    return [os.path.join(p, x) for x in s]


def explore(d):
    """Recursively hash the folder d and register each entry beneath it.

    Every readable file and sub-folder under d is passed to register().
    The folder d itself is not registered here; its (hash, size) pair is
    returned so that the caller can do so.

    Returns a tuple (h, size):
      h    -- SHA-1 hex digest of 'd' followed by the sorted, '-'-joined
              hashes of the direct children
      size -- 1 plus the sum of the sizes of the direct children

    Raises OSError if d itself cannot be listed. Children that cannot be
    read are skipped and contribute to neither the hash nor the size.
    """
    size = 1  # sort folders before files
    hashes = []
    for f in prefix(d, os.listdir(d)):
        try:
            # NOTE: os.path.isdir follows symbolic links, so a link to a
            # folder is explored like a real folder.
            if os.path.isdir(f):
                h, s = explore(f)
            else:
                s = os.stat(f).st_size
                h = hashfile(f)
        except (OSError, IOError):
            # Best effort: skip entries that vanish or are unreadable.
            continue
        # Kept outside the try so that a failure writing to stdout is not
        # mistaken for an unreadable entry and silently ignored.
        register(f, h, s)
        hashes.append(h)
        size += s
    # The hashes are hex digits only, so ASCII-encoding is lossless; the
    # explicit encode is required by hashlib on Python 3.
    joined = 'd' + '-'.join(sorted(hashes))
    return hashlib.sha1(joined.encode('ascii')).hexdigest(), size


def main(argv=None):
    """Explore every folder named in argv (default: sys.argv[1:]).

    A root that cannot be explored is reported on stderr and the remaining
    roots are still processed. Returns 0 if every root was explored and 1
    otherwise.
    """
    if argv is None:
        argv = sys.argv[1:]
    status = 0
    for d in argv:
        try:
            explore(d)
        except (OSError, IOError) as e:
            sys.stderr.write("rdupes: %s: %s\n" % (d, e))
            status = 1
    return status


if __name__ == "__main__":
    sys.exit(main())