From 8e717750feb5e4ca12da06ee0736e0463880b3a1 Mon Sep 17 00:00:00 2001
From: josch
Date: Sun, 29 Jun 2014 02:25:23 +0200
Subject: [PATCH] initial commit

---
 README.md            |  24 ++++++++++++
 findduplicatedirs.py | 100 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 124 insertions(+)
 create mode 100644 README.md
 create mode 100755 findduplicatedirs.py

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..5850883
--- /dev/null
+++ b/README.md
@@ -0,0 +1,24 @@
+Description
+-----------
+
+Given a directory, find all directories of which two or more duplicates exist.
+
+Two directories are considered duplicates if their sha256 checksums are equal.
+A directory's sha256 checksum is calculated from the concatenation of:
+
+ - all file names
+ - all file contents
+ - the targets of all symlinks (links are not followed)
+ - all direct subdirectory names
+ - the sha256 checksums of its direct subdirectories
+
+To avoid clutter, only those duplicate directories are printed for which at
+least one parent directory has a hash different from the others. This avoids
+printing duplicate directories whose parents are themselves exact duplicates
+of each other.
+
+The output format has 4 or more columns separated by tabs. The first column
+lists the disk usage in bytes, as returned by `du -b`. The second column lists
+the number of elements in the subdirectory tree, as returned by
+`find | wc -l`. All subsequent columns list the duplicate directories with the
+same content.
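The tab-separated output lends itself to post-processing. As an illustration,
here is a minimal sketch (not part of the patch) that runs the script and
ranks the duplicate groups by the disk space that removing all but one copy
of each group would reclaim; it assumes the script is saved as
./findduplicatedirs.py:

    import subprocess
    import sys

    # run the script on the directory given on our own command line; the
    # progress indicator goes to stderr, the result table to stdout
    out = subprocess.run(
        ["./findduplicatedirs.py", sys.argv[1]],
        stdout=subprocess.PIPE, universal_newlines=True, check=True,
    ).stdout

    groups = []
    for line in out.splitlines():
        du, nelem, *dirs = line.split("\t")
        # keeping one copy, the other len(dirs) - 1 copies waste du bytes each
        groups.append((int(du) * (len(dirs) - 1), dirs))

    for wasted, dirs in sorted(groups, reverse=True):
        print("%d bytes reclaimable: %s" % (wasted, " ".join(dirs)))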
diff --git a/findduplicatedirs.py b/findduplicatedirs.py
new file mode 100755
index 0000000..1a1318a
--- /dev/null
+++ b/findduplicatedirs.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+#
+# Copyright (c) 2014 Johannes Schauer
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+import os
+import sys
+import hashlib
+from collections import defaultdict
+
+if len(sys.argv) != 2:
+    print("usage: %s directory" % sys.argv[0])
+    sys.exit(1)
+
+dirhashes = dict()
+duplicates = defaultdict(list)
+diskusage = dict()
+subtreesize = dict()
+
+root = os.path.abspath(sys.argv[1])
+
+# walk bottom-up so that every directory is processed only after all of its
+# subdirectories, whose hashes it needs, have been processed
+directorywalk = list(os.walk(root, topdown=False))
+
+total = len(directorywalk)
+
+for (i, (dirpath, dirnames, filenames)) in enumerate(directorywalk):
+    # print the progress in percent to stderr
+    print("%.2f\r" % ((100.0 * (i + 1)) / total), file=sys.stderr, end='')
+    h = hashlib.sha256()
+    # initialize disk usage to the size of this directory
+    du = os.path.getsize(dirpath)
+    # initialize the subtree size to the number of files in this directory
+    # plus one for this directory itself
+    sts = len(filenames) + 1
+    # process all files
+    for filename in sorted(filenames):
+        h.update(filename.encode('utf8', 'surrogateescape'))
+        filename = os.path.join(dirpath, filename)
+        # the content of anything that is neither a regular file nor a
+        # symlink is ignored; the content of a symlink is its target
+        if os.path.islink(filename):
+            h.update(os.readlink(filename).encode('utf8', 'surrogateescape'))
+        elif os.path.isfile(filename):
+            du += os.path.getsize(filename)
+            with open(filename, 'rb') as f:
+                for chunk in iter(lambda: f.read(4096), b''):
+                    h.update(chunk)
+    # process all directories
+    for dirname in sorted(dirnames):
+        h.update(dirname.encode('utf8', 'surrogateescape'))
+        dirname = os.path.join(dirpath, dirname)
+        if os.path.islink(dirname):
+            h.update(os.readlink(dirname).encode('utf8', 'surrogateescape'))
+        else:
+            sha = dirhashes[dirname]
+            du += diskusage[sha]
+            sts += subtreesize[sha]
+            h.update(sha)
+    # store the gathered information
+    sha = h.digest()
+    dirhashes[dirpath] = sha
+    subtreesize[sha] = sts
+    diskusage[sha] = du
+    duplicates[sha].append(dirpath)
+
+# filter the hashes: keep only those for which at least one directory has a
+# parent whose hash differs from the hashes of the other parents, because
+# otherwise the duplicate parents themselves will be reported
+toreport = list()
+for k, v in duplicates.items():
+    # a hash with only one directory has no duplicates
+    if len(v) == 1:
+        continue
+    # if all directories share the same parent, report them here because
+    # that parent cannot be a duplicate of itself
+    if len(set([os.path.dirname(p) for p in v])) == 1:
+        toreport.append(k)
+        continue
+    # report unless all parents share the same hash, in which case the
+    # duplicate parents will be reported instead
+    if len(set([dirhashes.get(os.path.dirname(p)) for p in v])) != 1:
+        toreport.append(k)
+        continue
+
+for sha in toreport:
+    du = diskusage[sha]
+    sts = subtreesize[sha]
+    dirs = [os.path.relpath(p) for p in duplicates[sha]]
+    print("%d\t%d\t%s" % (du, sts, "\t".join(dirs)))
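To see the parent filtering from the README in action, here is a small smoke
test, again as a sketch that assumes the script is saved as
./findduplicatedirs.py: it creates two directories with identical contents
and checks that only this pair is reported, not the identical payload
directories nested inside it.

    import os
    import subprocess
    import tempfile

    with tempfile.TemporaryDirectory() as root:
        # two directory trees with identical contents -> a and b are duplicates
        for name in ("a", "b"):
            payload = os.path.join(root, name, "payload")
            os.makedirs(payload)
            with open(os.path.join(payload, "data.txt"), "w") as f:
                f.write("same content\n")
        out = subprocess.run(
            ["./findduplicatedirs.py", root],
            stdout=subprocess.PIPE, universal_newlines=True, check=True,
        ).stdout
        # expect a single line naming a and b; the payload directories are
        # suppressed because their parents are duplicates of each other
        print(out, end='')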