Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
E
edx-analytics-pipeline
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
edx
edx-analytics-pipeline
Commits
6c5dcf74
Commit
6c5dcf74
authored
Jan 03, 2018
by
Andrew Zafft
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Hive partition inserts overwrite by default and added in a marker file check for DE-490
parent
4709a8e7
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
39 additions
and
3 deletions
+39
-3
edx/analytics/tasks/util/hive.py
+18
-3
edx/analytics/tasks/util/url.py
+21
-0
No files found.
edx/analytics/tasks/util/hive.py
View file @
6c5dcf74
...
...
@@ -6,7 +6,7 @@ import textwrap
import
luigi
from
luigi.configuration
import
get_config
from
luigi.hive
import
HivePartitionTarget
,
HiveQueryRunner
,
HiveQueryTask
,
HiveTableTarget
from
luigi.parameter
import
Parameter
from
luigi.parameter
import
Parameter
,
BooleanParameter
from
edx.analytics.tasks.common.mysql_load
import
MysqlInsertTask
from
edx.analytics.tasks.util.overwrite
import
OverwriteOutputMixin
...
...
@@ -426,6 +426,13 @@ class OverwriteAwareHiveQueryDataTask(WarehouseMixin, OverwriteOutputMixin, Hive
A generalized Data task whose output is a hive table populated from a hive query.
"""
overwrite_target_partition
=
BooleanParameter
(
significant
=
False
,
description
=
'Overwrite the target partition, deleting any existing data. This will not impact other '
'partitions. Do not use with incrementally built partitions.'
,
default
=
True
)
@property
def
insert_query
(
self
):
"""The query builder that controls the structure and fields inserted into the new table. This insert_query()
...
...
@@ -437,18 +444,26 @@ class OverwriteAwareHiveQueryDataTask(WarehouseMixin, OverwriteOutputMixin, Hive
"""The HivePartitionTask that needs to be generated."""
raise
NotImplementedError
@property
def
data_modification_sql_text
(
self
):
"""Returns the appropriate SQL text for the chosen overwrite_target_partition strategy."""
if
self
.
overwrite_target_partition
:
return
"OVERWRITE"
else
:
return
"INTO"
def
query
(
self
):
# pragma: no cover
full_insert_query
=
"""
USE {database_name};
INSERT
INTO
TABLE {table}
INSERT
{into_or_overwrite}
TABLE {table}
PARTITION ({partition.query_spec})
{insert_query};
"""
.
format
(
database_name
=
hive_database_name
(),
into_or_overwrite
=
self
.
data_modification_sql_text
,
table
=
self
.
partition_task
.
hive_table_task
.
table
,
partition
=
self
.
partition
,
insert_query
=
self
.
insert_query
.
strip
(),
# pylint: disable=no-member
)
return
textwrap
.
dedent
(
full_insert_query
)
@property
...
...
edx/analytics/tasks/util/url.py
View file @
6c5dcf74
...
...
@@ -10,7 +10,9 @@ Examples::
"""
from
__future__
import
absolute_import
import
logging
import
os
import
time
import
urlparse
import
luigi
...
...
@@ -23,11 +25,16 @@ from luigi.s3 import S3Target
from
edx.analytics.tasks.util.s3_util
import
S3HdfsTarget
,
ScalableS3Client
log
=
logging
.
getLogger
(
__name__
)
class
MarkerMixin
(
object
):
"""This mixin handles Targets that cannot accurately be measured by the existence of data files, and instead need
another positive marker to indicate Task success."""
# Check if the marker file is readable after being written, and if not then block for up to 10 minutes until a read
# is successful.
confirm_marker_file_after_writing
=
True
def
exists
(
self
):
# pragma: no cover
"""Completion of this target is based solely on the existence of the marker file."""
return
self
.
fs
.
exists
(
self
.
path
+
"/_SUCCESS"
)
...
...
@@ -37,6 +44,20 @@ class MarkerMixin(object):
marker
=
self
.
__class__
(
path
=
self
.
path
+
"/_SUCCESS"
)
marker
.
open
(
"w"
)
.
close
()
if
self
.
confirm_marker_file_after_writing
:
read_attempts
=
10
marker_exists
=
False
while
read_attempts
>
0
and
not
marker_exists
:
marker_exists
=
self
.
exists
()
if
not
marker_exists
:
log
.
debug
(
"Marker file
%
s does not exist, sleeping for 60 seconds"
,
marker
)
time
.
sleep
(
60
)
if
not
marker_exists
:
log
.
error
(
"Error Marker file
%
s should have been created but could not be read!"
,
marker
)
class
S3MarkerTarget
(
MarkerMixin
,
S3Target
):
"""An S3 Target that uses a marker file to indicate success."""
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment