commit 10025a7fc483089af9bb9c38484b57009e8786e1
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Fri,  6 Jan 2012 20:20:01 +0100
initial write
Diffstat:
| rdupes.py | | | 60 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | 
1 file changed, 60 insertions(+), 0 deletions(-)
diff --git a/rdupes.py b/rdupes.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import hashlib
+
+# Global tables keyed by SHA-1 hex digest: paths seen per hash, and the size for that hash.
+hashes, sizes = {}, {}
+
+def register(f, h, s):
+  """Log path f to stderr and file it under hash h in the global tables;
+  s is the size recorded for h, which every later entry with h must match."""
+  sys.stderr.write(f + '\n')
+  if h not in hashes: # dict lookup; .keys() would build a list per call on Py2
+    hashes[h], sizes[h] = [], s
+  hashes[h].append(f)
+  assert sizes[h] == s, "size mismatch for hash %s: %s" % (h, f)
+
+def hashfile(f):
+  """Return the hex SHA-1 digest of file f; open/read errors propagate."""
+  sha1 = hashlib.sha1()
+  with open(f, 'rb') as fp:
+    # Feed 1 MiB at a time so big files never sit in memory whole; b'' is EOF.
+    for chunk in iter(lambda: fp.read(1 << 20), b''):
+      sha1.update(chunk)
+  return sha1.hexdigest()
+
+def prefix(p, s):  # returns every name in s joined onto the directory path p
+  return [os.path.join(p, entry) for entry in s]
+
+def explore(d):
+  """Register everything under directory d; return (hash, size) for d itself.
+
+  d's hash is the SHA-1 of 'd' plus its sorted child hashes joined by '-', so
+  it ignores entry names; files that cannot be stat'ed or read are skipped."""
+  size = 1 # keeps a folder strictly larger than anything inside it
+  child_hashes = [] # not named hashes: that would shadow the global table
+  for f in prefix(d, os.listdir(d)):
+    if os.path.isdir(f):
+      h, s = explore(f)
+    else:
+      try:
+        s = os.stat(f).st_size
+        h = hashfile(f)
+      except EnvironmentError: # covers stat()'s OSError and open()'s IOError
+        continue
+    register(f, h, s)
+    child_hashes.append(h)
+    size += s
+  child_hashes.sort()
+  digest = hashlib.sha1(('d' + '-'.join(child_hashes)).encode('ascii'))
+  return digest.hexdigest(), size
+
+# Scan every directory named on the command line, filling hashes and sizes.
+for d in sys.argv[1:]:
+  explore(d)
+# Report hashes seen at more than one path, biggest first, as "size: paths".
+hashes2 = sorted(((sizes[x], hashes[x]) for x in hashes if len(hashes[x]) > 1), reverse=True)
+print('\n'.join("%s: %s" % (s, ' '.join(f)) for (s, f) in hashes2))
+