Commit 4d59bcde by willmcgugan

Modified signature generation to only read a portion of the beginning of the…

Modified signature generation to only read a portion of the beginning of the file, which should be enough to indicate a potential duplicate.
parent 11b9f16a
......@@ -172,7 +172,7 @@ def find_duplicates(fs, compare_paths=None, quick=False, signature_size=16384):
signatures = defaultdict(list)
# A signature is a tuple of CRC32s for each 16K of the file
# A signature is a tuple of CRC32s for each 4x16K of the file
# This allows us to find potential duplicates with a dictionary lookup
for paths in size_duplicates:
for path in paths:
......@@ -180,7 +180,7 @@ def find_duplicates(fs, compare_paths=None, quick=False, signature_size=16384):
fread = None
try:
fread = fs.open(path, 'rb')
while True:
while len(signature) < 4:
data = fread.read(signature_size)
if not data:
break
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment