From 8e717750feb5e4ca12da06ee0736e0463880b3a1 Mon Sep 17 00:00:00 2001
From: josch
Date: Sun, 29 Jun 2014 02:25:23 +0200
Subject: [PATCH] initial commit

---
 README.md            |  24 ++++++++++++
 findduplicatedirs.py | 100 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 124 insertions(+)
 create mode 100644 README.md
 create mode 100755 findduplicatedirs.py

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..5850883
--- /dev/null
+++ b/README.md
@@ -0,0 +1,24 @@
+Description
+-----------
+
+Given a directory, find all directories of which two or more duplicates exist.
+
+Two directories are considered duplicates if their sha256 checksums are equal.
+A directory's sha256 checksum is calculated from the concatenation of:
+
+ - all file names
+ - all file contents
+ - the targets of all symlinks (links are not followed)
+ - all direct subdirectory names
+ - the sha256 checksums of its direct subdirectories
+
+To avoid clutter, only those duplicate directories are printed for which at
+least one parent directory has a hash different from the others. This avoids
+printing duplicate directories whose parents are themselves exact duplicates
+of each other.
+
+The output format has 4 or more columns separated by tabs. The first column
+lists the disk usage in bytes, as returned by `du -b`. The second column lists
+the number of elements in the subdirectory tree, as returned by
+`find | wc -l`. All subsequent columns list the duplicate directories with the
+same content.
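The tab-separated output lends itself to post-processing. As an illustration,
here is a minimal sketch (not part of the patch) that runs the script and
ranks the duplicate groups by the disk space that removing all but one copy
of each group would reclaim; it assumes the script is saved as
./findduplicatedirs.py:

    import subprocess
    import sys

    # run the script on the directory given on our own command line; the
    # progress indicator goes to stderr, the result table to stdout
    out = subprocess.run(
        ["./findduplicatedirs.py", sys.argv[1]],
        stdout=subprocess.PIPE, universal_newlines=True, check=True,
    ).stdout

    groups = []
    for line in out.splitlines():
        du, nelem, *dirs = line.split("\t")
        # keeping one copy, the other len(dirs) - 1 copies waste du bytes each
        groups.append((int(du) * (len(dirs) - 1), dirs))

    for wasted, dirs in sorted(groups, reverse=True):
        print("%d bytes reclaimable: %s" % (wasted, " ".join(dirs)))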
diff --git a/findduplicatedirs.py b/findduplicatedirs.py
new file mode 100755
index 0000000..1a1318a
--- /dev/null
+++ b/findduplicatedirs.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+#
+# Copyright (c) 2014 Johannes Schauer
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+import os
+import sys
+import hashlib
+from collections import defaultdict
+
+if len(sys.argv) != 2:
+    print("usage: %s directory" % sys.argv[0])
+    sys.exit(1)
+
+dirhashes = dict()
+duplicates = defaultdict(list)
+diskusage = dict()
+subtreesize = dict()
+
+root = os.path.abspath(sys.argv[1])
+
+# walk bottom-up so that every directory is processed only after all of its
+# subdirectories, whose hashes it needs, have been processed
+directorywalk = list(os.walk(root, topdown=False))
+
+total = len(directorywalk)
+
+for (i, (dirpath, dirnames, filenames)) in enumerate(directorywalk):
+    # print the progress in percent to stderr
+    print("%.2f\r" % ((100.0 * (i + 1)) / total), file=sys.stderr, end='')
+    h = hashlib.sha256()
+    # initialize disk usage to the size of this directory
+    du = os.path.getsize(dirpath)
+    # initialize the subtree size to the number of files in this directory
+    # plus one for this directory itself
+    sts = len(filenames) + 1
+    # process all files
+    for filename in sorted(filenames):
+        h.update(filename.encode('utf8', 'surrogateescape'))
+        filename = os.path.join(dirpath, filename)
+        # the content of anything that is neither a regular file nor a
+        # symlink is ignored; the content of a symlink is its target
+        if os.path.islink(filename):
+            h.update(os.readlink(filename).encode('utf8', 'surrogateescape'))
+        elif os.path.isfile(filename):
+            du += os.path.getsize(filename)
+            with open(filename, 'rb') as f:
+                for chunk in iter(lambda: f.read(4096), b''):
+                    h.update(chunk)
+    # process all directories
+    for dirname in sorted(dirnames):
+        h.update(dirname.encode('utf8', 'surrogateescape'))
+        dirname = os.path.join(dirpath, dirname)
+        if os.path.islink(dirname):
+            h.update(os.readlink(dirname).encode('utf8', 'surrogateescape'))
+        else:
+            sha = dirhashes[dirname]
+            du += diskusage[sha]
+            sts += subtreesize[sha]
+            h.update(sha)
+    # store the gathered information
+    sha = h.digest()
+    dirhashes[dirpath] = sha
+    subtreesize[sha] = sts
+    diskusage[sha] = du
+    duplicates[sha].append(dirpath)
+
+# filter the hashes: keep only those for which at least one directory has a
+# parent whose hash differs from the hashes of the other parents, because
+# otherwise the duplicate parents themselves will be reported
+toreport = list()
+for k, v in duplicates.items():
+    # a hash with only one directory has no duplicates
+    if len(v) == 1:
+        continue
+    # if all directories share the same parent, report them here because
+    # that parent cannot be a duplicate of itself
+    if len(set([os.path.dirname(p) for p in v])) == 1:
+        toreport.append(k)
+        continue
+    # report unless all parents share the same hash, in which case the
+    # duplicate parents will be reported instead
+    if len(set([dirhashes.get(os.path.dirname(p)) for p in v])) != 1:
+        toreport.append(k)
+        continue
+
+for sha in toreport:
+    du = diskusage[sha]
+    sts = subtreesize[sha]
+    dirs = [os.path.relpath(p) for p in duplicates[sha]]
+    print("%d\t%d\t%s" % (du, sts, "\t".join(dirs)))
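To see the parent filtering from the README in action, here is a small smoke
test, again as a sketch that assumes the script is saved as
./findduplicatedirs.py: it creates two directories with identical contents
and checks that only this pair is reported, not the identical payload
directories nested inside it.

    import os
    import subprocess
    import tempfile

    with tempfile.TemporaryDirectory() as root:
        # two directory trees with identical contents -> a and b are duplicates
        for name in ("a", "b"):
            payload = os.path.join(root, name, "payload")
            os.makedirs(payload)
            with open(os.path.join(payload, "data.txt"), "w") as f:
                f.write("same content\n")
        out = subprocess.run(
            ["./findduplicatedirs.py", root],
            stdout=subprocess.PIPE, universal_newlines=True, check=True,
        ).stdout
        # expect a single line naming a and b; the payload directories are
        # suppressed because their parents are duplicates of each other
        print(out, end='')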