OpenEdx / pyfs
Commit 11b9f16a
authored Nov 23, 2009 by willmcgugan
utils.find_duplicates now works. Needs unittests.
parent bb2f9bf1

Showing 1 changed file with 47 additions and 38 deletions.

fs/utils.py (+47 -38)
@@ -5,7 +5,7 @@
 """
 import shutil
-from mountfs import MountFS
+from fs.mountfs import MountFS
 
 def copyfile(src_fs, src_path, dst_fs, dst_path, chunk_size=16384):
     """Copy a file from one filesystem to another. Will use system copyfile, if both files have a syspath.

@@ -140,38 +140,40 @@ def countbytes(fs):
     total = sum(fs.getsize(f) for f in fs.walkfiles())
     return total
 
-# Work in progress, not tested
-def find_duplicates(fs, paths=None, quick=False, signature_size=16384):
+def find_duplicates(fs, compare_paths=None, quick=False, signature_size=16384):
     """A generator that yields the paths of duplicate files in an FS object.
 
     Files are considered identical if the contents are the same (dates or
     other attributes not take in to account).
 
     fs -- A filesystem object
-    paths -- An iterable of paths in the FS object, or all files if omited
+    compare_paths -- An iterable of paths in the FS object, or all files if omited
     quick -- If set to True, the quick method of finding duplicates will be used,
       which can potentially miss some duplicates.
     signature_size -- The chunk size in bytes used to generate file signatures,
       lower values will decrease the likelyhood of missed duplicates when used with
       quick=True
 
     """
     from collections import defaultdict
-    from zlib.crc32 import crc32
+    from zlib import crc32
 
-    if paths is None:
-        paths = fs.walkfiles()
-    paths = list(paths)
+    if compare_paths is None:
+        compare_paths = fs.walkfiles()
 
+    # Create a dictionary that maps file sizes on to the paths of files with
+    # that filesize. So we can find files of the same size with a quick lookup
     file_sizes = defaultdict(list)
-    for path in paths:
+    for path in compare_paths:
         file_sizes[fs.getsize(path)].append(path)
 
-    size_duplicates = [paths for paths in file_sizes if len(paths) > 1]
+    size_duplicates = [paths for paths in file_sizes.itervalues() if len(paths) > 1]
 
     signatures = defaultdict(list)
+    # A signature is a tuple of CRC32s for each 16K of the file
+    # This allows us to find potential duplicates with a dictionary lookup
     for paths in size_duplicates:
         for path in paths:
             signature = []
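
Two of the fixes in this hunk are easy to gloss over. zlib.crc32 is a function attribute of the zlib module, not a submodule, so the old import line raised ImportError as soon as find_duplicates ran; and iterating a dict directly yields its keys, so the old comprehension filtered file sizes rather than lists of paths. A small Python 2 sketch of both behaviours:

    from collections import defaultdict
    from zlib import crc32                     # crc32 is a function in zlib

    file_sizes = defaultdict(list)
    file_sizes[11].extend(['a.txt', 'b.txt'])  # two files of 11 bytes

    for key in file_sizes:                     # iterating the dict gives keys
        print key                              # -> 11
    for paths in file_sizes.itervalues():      # the values are the path lists
        print paths                            # -> ['a.txt', 'b.txt']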

@@ -187,26 +189,26 @@ def find_duplicates(fs, paths=None, quick=False, signature_size=16384):
             if fread is not None:
                 fread.close()
             signatures[tuple(signature)].append(path)
 
+    # If 'quick' is True then the signature comparison is adequate (although
+    # it may result in false positives)
     if quick:
-        for paths in signatures:
+        for paths in signatures.itervalues():
             if len(paths) > 1:
                 yield paths
         return
 
-    from itertools import izip
     def identical(p1, p2):
+        """ Returns True if the contests of two files are identical. """
         f1, f2 = None, None
         try:
             f1 = fs.open(p1, 'rb')
             f2 = fs.open(p2, 'rb')
             while True:
                 chunk1 = f1.read(16384)
                 if not chunk1:
                     break
                 chunk2 = f2.read(16384)
                 if chunk1 != chunk2:
                     return False
             return True
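
The context elided between these hunks builds each file's signature; those lines are not shown on this page. Going only by the visible names (fread, signature, signature_size, crc32), a hypothetical sketch of such a chunked-checksum loop might look like the following; it is not the elided code itself:

    # Hypothetical sketch: checksum the file in signature_size chunks.
    signature = []
    fread = None
    try:
        fread = fs.open(path, 'rb')
        while True:
            chunk = fread.read(signature_size)
            if not chunk:
                break
            signature.append(crc32(chunk))
    finally:
        if fread is not None:
            fread.close()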

@@ -215,23 +217,30 @@ def find_duplicates(fs, paths=None, quick=False, signature_size=16384):
                 f1.close()
             if f2 is not None:
                 f2.close()
 
-    for paths in signatures:
+    # If we want to be accurate then we need to compare suspected duplicates
+    # byte by byte.
+    # All path groups in this loop have the same size and same signature, so are
+    # highly likely to be identical.
+    for paths in signatures.itervalues():
         while len(paths) > 1:
             test_p = paths.pop()
             dups = [test_p]
             for path in paths:
                 if identical(test_p, path):
                     dups.append(path)
             if len(dups) > 1:
                 yield dups
             paths = list(set(paths).difference(dups))
+
+
+if __name__ == "__main__":
+    from fs.osfs import *
+    fs = OSFS('~/duptest')
+    for files in find_duplicates(fs, quick=False):
+        print files
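
The new __main__ block exercises the exact (quick=False) mode. For the other parameters documented in the docstring, here is a hedged usage sketch, with the directory name and the .txt filter purely illustrative:

    from fs.osfs import OSFS
    from fs.utils import find_duplicates

    fs = OSFS('~/duptest')
    txt_paths = [p for p in fs.walkfiles() if p.endswith('.txt')]
    # quick=True relies on the CRC32 signatures alone; per the code comments
    # it can produce false positives.
    for dup_group in find_duplicates(fs, compare_paths=txt_paths, quick=True):
        print dup_group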