Commit 11b9f16a by willmcgugan

utils.find_duplicates now works. Needs unittests.

parent bb2f9bf1
@@ -5,7 +5,7 @@
 """
 import shutil

-from mountfs import MountFS
+from fs.mountfs import MountFS

 def copyfile(src_fs, src_path, dst_fs, dst_path, chunk_size=16384):
     """Copy a file from one filesystem to another. Will use system copyfile, if both files have a syspath.
@@ -140,14 +140,14 @@ def countbytes(fs):
     total = sum(fs.getsize(f) for f in fs.walkfiles())
     return total

-# Work in progress, not tested
-def find_duplicates(fs, paths=None, quick=False, signature_size=16384):
+def find_duplicates(fs, compare_paths=None, quick=False, signature_size=16384):
     """A generator that yields the paths of duplicate files in an FS object.

     Files are considered identical if the contents are the same (dates or
     other attributes not taken into account).

     fs -- A filesystem object
-    paths -- An iterable of paths in the FS object, or all files if omitted
+    compare_paths -- An iterable of paths in the FS object, or all files if omitted
     quick -- If set to True, the quick method of finding duplicates will be used,
     which can potentially miss some duplicates.
     signature_size -- The chunk size in bytes used to generate file signatures,
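For orientation before the next hunk: the generator yields groups (lists) of paths whose contents match. A minimal usage sketch, assuming the project's fs.memoryfs.MemoryFS and its setcontents method; the file names and payloads here are invented for illustration:

    from fs.memoryfs import MemoryFS
    from fs.utils import find_duplicates

    mem = MemoryFS()
    mem.setcontents('a.txt', 'same bytes')          # hypothetical duplicate pair
    mem.setcontents('copy_of_a.txt', 'same bytes')
    mem.setcontents('b.txt', 'different bytes')

    for dupes in find_duplicates(mem):
        # Each yielded value is a list of paths with identical contents,
        # e.g. ['/a.txt', '/copy_of_a.txt']
        print dupes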
@@ -157,21 +157,23 @@ def find_duplicates(fs, paths=None, quick=False, signature_size=16384):
     """
     from collections import defaultdict
-    from zlib.crc32 import crc32
+    from zlib import crc32

-    if paths is None:
-        paths = fs.walkfiles()
-    paths = list(paths)
+    if compare_paths is None:
+        compare_paths = fs.walkfiles()

+    # Create a dictionary that maps file sizes on to the paths of files with
+    # that filesize. So we can find files of the same size with a quick lookup
     file_sizes = defaultdict(list)
-    for path in paths:
+    for path in compare_paths:
         file_sizes[fs.getsize(path)].append(path)

-    size_duplicates = [paths for paths in file_sizes if len(paths) > 1]
+    size_duplicates = [paths for paths in file_sizes.itervalues() if len(paths) > 1]

     signatures = defaultdict(list)

+    # A signature is a tuple of CRC32s for each 16K of the file
+    # This allows us to find potential duplicates with a dictionary lookup
     for paths in size_duplicates:
         for path in paths:
             signature = []
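The signature idea in isolation: a minimal sketch of chunked CRC32 hashing outside the FS abstraction (plain open is used here for illustration; in the commit the chunks come from fs.open):

    from zlib import crc32

    def file_signature(path, signature_size=16384):
        """Return a tuple of CRC32s, one per signature_size chunk of a file.

        Identical contents always produce identical signatures, so a dict
        keyed on signatures groups candidate duplicates cheaply.
        """
        signature = []
        f = open(path, 'rb')
        try:
            while True:
                chunk = f.read(signature_size)
                if not chunk:
                    break
                signature.append(crc32(chunk))
        finally:
            f.close()
        return tuple(signature)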
@@ -188,16 +190,16 @@ def find_duplicates(fs, paths=None, quick=False, signature_size=16384):
             fread.close()
             signatures[tuple(signature)].append(path)

+    # If 'quick' is True then the signature comparison is adequate (although
+    # it may result in false positives)
     if quick:
-        for paths in signatures:
+        for paths in signatures.itervalues():
             if len(paths) > 1:
                 yield paths
         return

-    from itertools import izip
     def identical(p1, p2):
+        """ Returns True if the contents of two files are identical. """
         f1, f2 = None, None
         try:
             f1 = fs.open(p1, 'rb')
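Why quick=True can report false positives: CRC32 is a 32-bit checksum, not a collision-resistant hash. 'plumless' and 'buckeroo' are a commonly cited collision pair, and since they are the same length they would share a size bucket as well:

    from zlib import crc32

    # Two different 8-byte strings reported to share a CRC32 checksum; with
    # quick=True, files like these would be yielded as duplicates without
    # their contents ever being compared byte by byte.
    assert crc32('plumless') == crc32('buckeroo')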
@@ -216,8 +218,11 @@ def find_duplicates(fs, paths=None, quick=False, signature_size=16384):
         if f2 is not None:
             f2.close()

-    for paths in signatures:
+    # If we want to be accurate then we need to compare suspected duplicates
+    # byte by byte.
+    # All path groups in this loop have the same size and same signature, so are
+    # highly likely to be identical.
+    for paths in signatures.itervalues():
         while len(paths) > 1:
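The body of identical is elided by the hunk boundary; presumably it compares the two streams chunk by chunk. A sketch of that technique (files_identical and its chunk size are assumptions, not the commit's exact code):

    def files_identical(f1, f2, chunk_size=16384):
        """Compare two file objects (opened in binary mode) chunk by chunk."""
        while True:
            chunk1 = f1.read(chunk_size)
            chunk2 = f2.read(chunk_size)
            if chunk1 != chunk2:
                return False
            if not chunk1:
                # Both files exhausted at the same point with no mismatch
                return True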
@@ -234,4 +239,8 @@ def find_duplicates(fs, paths=None, quick=False, signature_size=16384):
         paths = list(set(paths).difference(dups))

+if __name__ == "__main__":
+    from fs.osfs import *
+    fs = OSFS('~/duptest')
+    for files in find_duplicates(fs, quick=False):
+        print files
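The commit message notes that unittests are still needed. A minimal sketch of one, assuming fs.memoryfs.MemoryFS, its setcontents method, and that walkfiles yields absolute paths like '/a.txt' (all assumptions about this version of the library):

    import unittest

    from fs.memoryfs import MemoryFS
    from fs.utils import find_duplicates

    class TestFindDuplicates(unittest.TestCase):

        def test_finds_identical_files(self):
            mem = MemoryFS()
            mem.setcontents('a.txt', 'same bytes')
            mem.setcontents('b.txt', 'same bytes')
            mem.setcontents('c.txt', 'other bytes')
            groups = list(find_duplicates(mem))
            # Expect exactly one group: the two files with matching contents
            self.assertEqual(len(groups), 1)
            self.assertEqual(sorted(groups[0]), ['/a.txt', '/b.txt'])

    if __name__ == '__main__':
        unittest.main()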