Unverified Commit d6fef5b0 by brianhw Committed by GitHub

Merge pull request #496 from edx/brian/fix-gs-rsync

Exclude unloadable files at rsync time
parents 89139226 819b5fbd
...@@ -267,20 +267,19 @@ class BigQueryLoadTask(BigQueryLoadDownstreamMixin, luigi.Task): ...@@ -267,20 +267,19 @@ class BigQueryLoadTask(BigQueryLoadDownstreamMixin, luigi.Task):
def _copy_data_to_gs(self, source_path, destination_path): def _copy_data_to_gs(self, source_path, destination_path):
if self.is_file(source_path): if self.is_file(source_path):
return_code = subprocess.call(['gsutil', 'cp', source_path, destination_path]) command = ['gsutil', 'cp', source_path, destination_path]
else: else:
log.debug(" ".join(['gsutil', '-m', 'rsync', source_path, destination_path])) # Exclude any files which should not be uploaded to
return_code = subprocess.call(['gsutil', '-m', 'rsync', source_path, destination_path]) # BigQuery. It is easier to remove them here than in the
if return_code == 0: # load steps. The pattern is a Python regular expression.
# Remove any files that were copied whose names have leading underscores, since exclusion_pattern = ".*_SUCCESS$|.*_metadata$"
# these files cannot be uploaded to BigQuery. It is easier to remove them here command = ['gsutil', '-m', 'rsync', '-x', exclusion_pattern, source_path, destination_path]
# than to exclude them either in the rsync or in the load steps.
underscore_path = url_path_join(destination_path, '_*') log.debug(" ".join(command))
log.debug(" ".join(['gsutil', 'rm', underscore_path])) return_code = subprocess.call(command)
return_code = subprocess.call(['gsutil', 'rm', underscore_path])
if return_code != 0: if return_code != 0:
raise RuntimeError('Error while syncing {source} to {destination}'.format( raise RuntimeError('Error {code} while syncing {source} to {destination}'.format(
code=return_code,
source=source_path, source=source_path,
destination=destination_path, destination=destination_path,
)) ))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment