rdupes

recursively search for duplicate files
git clone https://a3nm.net/git/rdupes/

commit 10025a7fc483089af9bb9c38484b57009e8786e1
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Fri,  6 Jan 2012 20:20:01 +0100

initial write

Diffstat:
rdupes.py | 60++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 60 insertions(+), 0 deletions(-)

diff --git a/rdupes.py b/rdupes.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import hashlib
+
+hashes = {}
+sizes = {}
+
+def register(f, h, s):
+    print >> sys.stderr, f
+    if h in hashes.keys():
+        hashes[h].append(f)
+        assert(sizes[h] == s)
+    else:
+        hashes[h] = [f]
+        sizes[h] = s
+
+def hashfile(f):
+    sha1 = hashlib.sha1()
+    fp = open(f, 'rb')
+    try:
+        sha1.update(fp.read())
+    finally:
+        fp.close()
+    return sha1.hexdigest()
+
+def prefix(p, s):
+    return [os.path.join(p, x) for x in s]
+
+def explore(d):
+    size = 1 # sort folders before files
+    hashes = []
+    #print "explore %s" % d
+    files = os.listdir(d)
+    for f in prefix(d, files):
+        if os.path.isdir(f):
+            h, s = explore(f)
+        else:
+            try:
+                s = os.stat(f).st_size
+                h = hashfile(f)
+            except OSError:
+                continue
+        register(f, h, s)
+        hashes.append(h)
+        size += s
+    sha1 = hashlib.sha1()
+    hashes.sort()
+    sha1.update('d' + '-'.join(hashes))
+    h = sha1.hexdigest()
+    return h, size
+
+for d in sys.argv[1:]:
+    explore(d)
+
+hashes2 = [(sizes[x], hashes[x]) for x in hashes.keys() if len(hashes[x]) > 1]
+hashes2.sort(reverse=True)
+print '\n'.join([str(s) + ": " + ' '.join(f) for (s, f) in hashes2])
+
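
The committed script is Python 2 (print statements, a str passed to sha1.update). It takes one or more directories as arguments, logs each visited path to stderr, and prints one line per group of identical items on stdout, largest first, in the form "size: path path ...". With a Python 2 interpreter and some hypothetical directories, a run would look like:

    python2 rdupes.py ~/photos /mnt/backup

As an editorial aside, not part of this commit, here is a minimal Python 3 sketch of the same approach: SHA-1 over each file's contents, and a directory hash derived from the sorted hashes of its children so that identical subtrees are reported as well. Names and layout are the editor's.

    #!/usr/bin/env python3
    # Editorial sketch only: a Python 3 rendering of the approach above,
    # not part of this commit. The usage paths above are hypothetical.

    import hashlib
    import os
    import sys

    hashes = {}  # hex digest -> list of paths sharing that content
    sizes = {}   # hex digest -> size in bytes (directories count as 1 + children)

    def register(path, digest, size):
        # Progress goes to stderr, as in the committed script.
        print(path, file=sys.stderr)
        if digest in hashes:
            hashes[digest].append(path)
        else:
            hashes[digest] = [path]
            sizes[digest] = size

    def hashfile(path):
        # SHA-1 over the whole file contents, read at once like the original.
        sha1 = hashlib.sha1()
        with open(path, 'rb') as fp:
            sha1.update(fp.read())
        return sha1.hexdigest()

    def explore(directory):
        size = 1  # offset so a folder sorts above a file with the same content
        child_hashes = []
        for entry in os.listdir(directory):
            path = os.path.join(directory, entry)
            if os.path.isdir(path):
                digest, entry_size = explore(path)
            else:
                try:
                    entry_size = os.stat(path).st_size
                    digest = hashfile(path)
                except OSError:
                    continue  # unreadable or vanished file: skip it
            register(path, digest, entry_size)
            child_hashes.append(digest)
            size += entry_size
        # A directory's hash depends only on its children's hashes, not their order.
        sha1 = hashlib.sha1()
        sha1.update(('d' + '-'.join(sorted(child_hashes))).encode())
        return sha1.hexdigest(), size

    for d in sys.argv[1:]:
        explore(d)

    # Report each duplicate group, largest first, as "size: path path ...".
    groups = [(sizes[h], paths) for h, paths in hashes.items() if len(paths) > 1]
    groups.sort(reverse=True)
    print('\n'.join(str(s) + ": " + ' '.join(p) for s, p in groups))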