Commit bf2f089f by willmcgugan

Some tuning of the find_duplicates algorithm

parent d68fcee0
...@@ -141,7 +141,7 @@ def countbytes(fs): ...@@ -141,7 +141,7 @@ def countbytes(fs):
return total return total
def find_duplicates(fs, compare_paths=None, quick=False, signature_size=16384): def find_duplicates(fs, compare_paths=None, quick=False, signature_chunk_size=16*1024, signature_size=10*16*1024):
"""A generator that yields the paths of duplicate files in an FS object. """A generator that yields the paths of duplicate files in an FS object.
Files are considered identical if the contents are the same (dates or Files are considered identical if the contents are the same (dates or
other attributes not taken into account). other attributes not taken into account).
...@@ -149,10 +149,13 @@ def find_duplicates(fs, compare_paths=None, quick=False, signature_size=16384): ...@@ -149,10 +149,13 @@ def find_duplicates(fs, compare_paths=None, quick=False, signature_size=16384):
fs -- A filesystem object fs -- A filesystem object
compare_paths -- An iterable of paths in the FS object, or all files if omitted compare_paths -- An iterable of paths in the FS object, or all files if omitted
quick -- If set to True, the quick method of finding duplicates will be used, quick -- If set to True, the quick method of finding duplicates will be used,
which can potentially miss some duplicates. which can potentially return false positives if the files have the same
signature_size -- The chunk size in bytes used to generate file signatures, size and start with the same data. Do not use when deleting files!
lower values will decrease the likelyhood of missed duplicates when used with
quick=True signature_chunk_size -- The number of bytes to read before generating a
signature checksum value
signature_size -- The total number of bytes read to generate a signature
""" """
...@@ -178,12 +181,14 @@ def find_duplicates(fs, compare_paths=None, quick=False, signature_size=16384): ...@@ -178,12 +181,14 @@ def find_duplicates(fs, compare_paths=None, quick=False, signature_size=16384):
for path in paths: for path in paths:
signature = [] signature = []
fread = None fread = None
bytes_read = 0
try: try:
fread = fs.open(path, 'rb') fread = fs.open(path, 'rb')
while len(signature) < 4: while signature_size is None or bytes_read < signature_size:
data = fread.read(signature_size) data = fread.read(signature_chunk_size)
if not data: if not data:
break break
bytes_read += len(data)
signature.append(crc32(data)) signature.append(crc32(data))
finally: finally:
if fread is not None: if fread is not None:
...@@ -244,4 +249,3 @@ if __name__ == "__main__": ...@@ -244,4 +249,3 @@ if __name__ == "__main__":
fs = OSFS('~/duptest') fs = OSFS('~/duptest')
for files in find_duplicates(fs, quick=False): for files in find_duplicates(fs, quick=False):
print files print files
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment