edx / edx-analytics-data-api / Commits / 3aded9ec

Commit 3aded9ec authored Mar 25, 2014 by Gabe Mulley
Support manifest based input file lists
Change-Id: Ic059030e633848fcde13216d5a96b8b03e5c4b55
parent b16d3b46
Showing 7 changed files with 132 additions and 12 deletions
edx/analytics/tasks/answer_dist.py     +26  -3
edx/analytics/tasks/course_enroll.py   +18  -2
edx/analytics/tasks/mapreduce.py       +31  -0
edx/analytics/tasks/pathutil.py        +22  -2
edx/analytics/tasks/s3_util.py         +31  -1
edx/analytics/tasks/url.py             +3   -3
setup.cfg                              +1   -1
edx/analytics/tasks/answer_dist.py
@@ -578,11 +578,15 @@ class BaseAnswerDistributionTask(MapReduceJobTask):
        dest: a URL to the root location to write output file(s).
        include: a list of patterns to be used to match input files, relative to `src` URL.
            The default value is ['*'].
        manifest: a URL to a file location that can store the complete set of input files.
    """
    name = luigi.Parameter()
    src = luigi.Parameter()
    dest = luigi.Parameter()
    include = luigi.Parameter(is_list=True, default=('*',))

    # A manifest file is required by hadoop if there are too many input paths. It hits an operating system limit on the
    # number of arguments passed to the mapper process on the task nodes.
    manifest = luigi.Parameter(default=None)

    def extra_modules(self):
        # Boto is used for S3 access and cjson for parsing log files.
@@ -596,7 +600,7 @@ class LastProblemCheckEvent(LastProblemCheckEventMixin, BaseAnswerDistributionTask):
"""Identifies last problem_check event for a user on a problem in a course, given raw event log input."""
def
requires
(
self
):
return
PathSetTask
(
self
.
src
,
self
.
include
)
return
PathSetTask
(
self
.
src
,
self
.
include
,
self
.
manifest
)
def
output
(
self
):
output_name
=
u'last_problem_check_events_{name}/'
.
format
(
name
=
self
.
name
)
@@ -611,13 +615,26 @@ class AnswerDistributionPerCourse(AnswerDistributionPerCourseMixin, BaseAnswerDistributionTask):
    Additional Parameters:
        answer_metadata: optional file to provide information about particular answers.
            Includes problem_display_name, input_type, response_type, and question.
        base_input_format: The input format to use on the first map reduce job in the chain. This job takes in the most
            input and may need a custom input format.
    """
    answer_metadata = luigi.Parameter(default=None)
    base_input_format = luigi.Parameter(default=None)

    def requires(self):
        results = {
-           'events': LastProblemCheckEvent(self.mapreduce_engine, self.name, self.src, self.dest, self.include),
+           'events': LastProblemCheckEvent(
+               mapreduce_engine=self.mapreduce_engine,
+               input_format=self.base_input_format,
+               lib_jar=self.lib_jar,
+               n_reduce_tasks=self.n_reduce_tasks,
+               name=self.name,
+               src=self.src,
+               dest=self.dest,
+               include=self.include,
+               manifest=self.manifest,
+           ),
        }

        if self.answer_metadata:
@@ -660,15 +677,21 @@ class AnswerDistributionOneFilePerCourseTask(MultiOutputMapReduceJobTask):
    name = luigi.Parameter(default='periodic')
    output_root = luigi.Parameter()
    answer_metadata = luigi.Parameter(default=None)
    manifest = luigi.Parameter(default=None)
    base_input_format = luigi.Parameter(default=None)

    def requires(self):
        return AnswerDistributionPerCourse(
            mapreduce_engine=self.mapreduce_engine,
            lib_jar=self.lib_jar,
            base_input_format=self.base_input_format,
            n_reduce_tasks=self.n_reduce_tasks,
            src=self.src,
            dest=self.dest,
            include=self.include,
            name=self.name,
-           answer_metadata=self.answer_metadata
+           answer_metadata=self.answer_metadata,
+           manifest=self.manifest,
        )

    def mapper(self, line):
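For context, a minimal sketch (not part of this commit) of how the new manifest parameter threads from a top-level task down to PathSetTask. The URLs, the 'nightly' name, and the explicit engine value are hypothetical placeholders; real runs supply them on the luigi command line or via configuration.

from edx.analytics.tasks.answer_dist import LastProblemCheckEvent

# All parameter values below are hypothetical examples.
task = LastProblemCheckEvent(
    mapreduce_engine='hadoop',
    name='nightly',
    src='s3://my-tracking-logs/',
    dest='s3://my-output/last_problem_check/',
    include=('*tracking.log*',),
    manifest='s3://my-output/manifests/answer_dist.manifest',
)

# requires() now passes the manifest URL through to PathSetTask, so hadoop is handed a
# single manifest file instead of one command-line argument per input path.
path_set = task.requires()
print(path_set.manifest)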
edx/analytics/tasks/course_enroll.py
@@ -175,11 +175,15 @@ class BaseCourseEnrollmentTask(MapReduceJobTask):
        dest: a URL to the root location to write output file(s).
        include: a list of patterns to be used to match input files, relative to `src` URL.
            The default value is ['*'].
        manifest: a URL to a file location that can store the complete set of input files.
    """
    name = luigi.Parameter()
    src = luigi.Parameter()
    dest = luigi.Parameter()
    include = luigi.Parameter(is_list=True, default=('*',))

    # A manifest file is required by hadoop if there are too many input paths. It hits an operating system limit on the
    # number of arguments passed to the mapper process on the task nodes.
    manifest = luigi.Parameter(default=None)

    def extra_modules(self):
        # The following are needed for (almost) every course enrollment task.
@@ -194,7 +198,7 @@ class CourseEnrollmentEventsPerDay(CourseEnrollmentEventsPerDayMixin, BaseCourseEnrollmentTask):
"""Calculates daily change in enrollment for a user in a course, given raw event log input."""
def
requires
(
self
):
return
PathSetTask
(
self
.
src
,
self
.
include
)
return
PathSetTask
(
self
.
src
,
self
.
include
,
self
.
manifest
)
def
output
(
self
):
output_name
=
'course_enrollment_events_per_day_{name}'
.
format
(
name
=
self
.
name
)
@@ -204,8 +208,20 @@ class CourseEnrollmentEventsPerDay(CourseEnrollmentEventsPerDayMixin, BaseCourseEnrollmentTask):
class CourseEnrollmentChangesPerDay(CourseEnrollmentChangesPerDayMixin, BaseCourseEnrollmentTask):
    """Calculates daily changes in enrollment, given per-user net changes by date."""

    base_input_format = luigi.Parameter(default=None)

    def requires(self):
-       return CourseEnrollmentEventsPerDay(self.mapreduce_engine, self.name, self.src, self.dest, self.include)
+       return CourseEnrollmentEventsPerDay(
+           mapreduce_engine=self.mapreduce_engine,
+           input_format=self.base_input_format,
+           lib_jar=self.lib_jar,
+           n_reduce_tasks=self.n_reduce_tasks,
+           name=self.name,
+           src=self.src,
+           dest=self.dest,
+           include=self.include,
+           manifest=self.manifest
+       )

    def output(self):
        output_name = 'course_enrollment_changes_per_day_{name}'.format(name=self.name)
edx/analytics/tasks/mapreduce.py
@@ -6,6 +6,7 @@ from __future__ import absolute_import
import luigi
import luigi.hdfs
import luigi.hadoop
from luigi import configuration

from edx.analytics.tasks.url import get_target_from_url, IgnoredTarget
@@ -19,6 +20,12 @@ class MapReduceJobTask(luigi.hadoop.JobTask):
    mapreduce_engine = luigi.Parameter(default_from_config={'section': 'map-reduce', 'name': 'engine'})
    input_format = luigi.Parameter(default=None)
    lib_jar = luigi.Parameter(is_list=True, default=[])

    # Override the parent class definition of this parameter. This typically wants to scale with the cluster size so the
    # user should be able to tweak it depending on their particular configuration.
    n_reduce_tasks = luigi.Parameter(default=25)

    def job_runner(self):
        # Lazily import this since this module will be loaded on hadoop worker nodes however stevedore will not be
@@ -31,9 +38,33 @@ class MapReduceJobTask(luigi.hadoop.JobTask):
        except KeyError:
            raise KeyError('A map reduce engine must be specified in order to run MapReduceJobTasks')

        if issubclass(engine_class, MapReduceJobRunner):
            return engine_class(libjars_in_hdfs=self.lib_jar, input_format=self.input_format)
        else:
            return engine_class()


class MapReduceJobRunner(luigi.hadoop.HadoopJobRunner):
    """
    Support more customization of the streaming command.

    Args:
        libjars_in_hdfs (list): An optional list of library jars that the hadoop job can make use of.
        input_format (str): An optional full class name of a hadoop input format to use.
    """

    def __init__(self, libjars_in_hdfs=None, input_format=None):
        libjars_in_hdfs = libjars_in_hdfs or []
        config = configuration.get_config()
        streaming_jar = config.get('hadoop', 'streaming-jar')

        super(MapReduceJobRunner, self).__init__(
            streaming_jar,
            input_format=input_format,
            libjars_in_hdfs=libjars_in_hdfs
        )


class MultiOutputMapReduceJobTask(MapReduceJobTask):
    """
    Produces multiple output files from a map reduce job.
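As a rough illustration (not part of this commit), this is roughly what job_runner() builds when the configured engine resolves to MapReduceJobRunner. It assumes a luigi configuration that defines hadoop/streaming-jar and map-reduce/engine; the jar path and input format class name are hypothetical.

from edx.analytics.tasks.mapreduce import MapReduceJobRunner

# Assumed luigi configuration (e.g. in client.cfg):
#   [hadoop]
#   streaming-jar = /usr/lib/hadoop/contrib/streaming/hadoop-streaming.jar
#   [map-reduce]
#   engine = hadoop

runner = MapReduceJobRunner(
    libjars_in_hdfs=['hdfs:///lib/custom-formats.jar'],  # would come from the lib_jar parameter (hypothetical path)
    input_format='org.example.ManifestTextInputFormat',  # would come from the input_format parameter (hypothetical class)
)
# The runner extends luigi.hadoop.HadoopJobRunner, so these values end up on the
# hadoop streaming command it assembles.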
edx/analytics/tasks/pathutil.py
@@ -15,7 +15,7 @@ import luigi.hdfs
import luigi.format

from edx.analytics.tasks.s3_util import generate_s3_sources
-from edx.analytics.tasks.url import ExternalURL, url_path_join
+from edx.analytics.tasks.url import ExternalURL, url_path_join, get_target_from_url


class PathSetTask(luigi.Task):
@@ -26,15 +26,18 @@ class PathSetTask(luigi.Task):
        src: a URL pointing to a folder in s3:// or local FS.
        include: a list of patterns to use to select. Multiple patterns are OR'd.
        manifest: a URL pointing to a manifest file location.
    """
    src = luigi.Parameter()
    include = luigi.Parameter(is_list=True, default=('*',))
    manifest = luigi.Parameter(default=None)

    def __init__(self, *args, **kwargs):
        super(PathSetTask, self).__init__(*args, **kwargs)
        self.s3_conn = None

-   def requires(self):
+   def generate_file_list(self):
        """Yield each individual path given a source folder and a set of glob expressions."""
        if self.src.startswith('s3'):
            # connect lazily as needed:
            if self.s3_conn is None:
@@ -50,6 +53,23 @@ class PathSetTask(luigi.Task):
        for filepath in filelist:
            yield ExternalURL(filepath)

    def manifest_file_list(self):
        """Write each individual path to a manifest file and yield the path to that file."""
        manifest_target = get_target_from_url(self.manifest)
        if not manifest_target.exists():
            with manifest_target.open('w') as manifest_file:
                for external_url_task in self.generate_file_list():
                    manifest_file.write(external_url_task.url + '\n')

        yield ExternalURL(self.manifest)

    def requires(self):
        if self.manifest is not None:
            return self.manifest_file_list()
        else:
            return self.generate_file_list()

    def complete(self):
        # An optimization: just declare that the task is always
        # complete, by definition, because it is whatever files were
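To make the new behaviour concrete, a small sketch (not from this commit) of the two modes of PathSetTask. The local paths are hypothetical, and it assumes they exist and that the edx.analytics.tasks package and luigi are importable.

from edx.analytics.tasks.pathutil import PathSetTask

# Without a manifest, requires() yields one ExternalURL task per matching input file.
glob_task = PathSetTask(src='/tmp/logs', include=('*.log',))

# With a manifest, requires() writes every matching path to the manifest file (one URL
# per line, only if the file does not already exist) and yields a single ExternalURL
# pointing at the manifest itself.
manifest_task = PathSetTask(
    src='/tmp/logs',
    include=('*.log',),
    manifest='/tmp/manifests/input.manifest',  # hypothetical location
)

for requirement in manifest_task.requires():
    print(requirement.url)  # prints the manifest URL rather than each input file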
edx/analytics/tasks/s3_util.py
@@ -6,7 +6,9 @@ from fnmatch import fnmatch
from urlparse import urlparse

from boto.s3.key import Key

-from luigi.s3 import S3Client
+from luigi.s3 import S3Client, AtomicS3File, ReadableS3File, FileNotFoundException
import luigi.hdfs


def get_s3_bucket_key_names(url):
@@ -94,3 +96,31 @@ class RestrictedPermissionsS3Client(S3Client):
        s3_key = Key(s3_bucket)
        s3_key.key = key
        s3_key.set_contents_from_filename(local_path, policy='bucket-owner-full-control')


class S3HdfsTarget(luigi.hdfs.HdfsTarget):
    """HDFS target that supports writing and reading files directly in S3."""

    # Luigi does not support HDFS targets that point to complete URLs like "s3://foo/bar" it only supports HDFS paths
    # that look like standard file paths "/foo/bar". Once this bug is fixed this class is no longer necessary.
    # TODO: Fix the upstream bug in luigi that prevents writing to HDFS files that are specified by complete URLs

    def __init__(self, path=None, format=luigi.hdfs.Plain, is_tmp=False):
        super(S3HdfsTarget, self).__init__(path=path, format=format, is_tmp=is_tmp)
        self.s3_client = RestrictedPermissionsS3Client()

    def open(self, mode='r'):
        if mode not in ('r', 'w'):
            raise ValueError("Unsupported open mode '{mode}'".format(mode=mode))

        safe_path = self.path.replace('s3n://', 's3://')
        if mode == 'r':
            s3_key = self.s3_client.get_key(safe_path)
            if s3_key:
                return ReadableS3File(s3_key)
            else:
                raise FileNotFoundException("Could not find file at %s" % safe_path)
        else:
            return AtomicS3File(safe_path, self.s3_client)
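A brief usage sketch of S3HdfsTarget (not part of this commit), assuming AWS credentials are available to luigi's S3 client; the bucket and key below are hypothetical.

from edx.analytics.tasks.s3_util import S3HdfsTarget

target = S3HdfsTarget('s3n://my-bucket/reports/answer_dist.csv')  # hypothetical URL

# Writes go through AtomicS3File using the restricted-permissions client, which uploads
# with the bucket-owner-full-control policy.
output_file = target.open('w')
output_file.write('course_id,count\n')
output_file.close()  # the data is uploaded to S3 when the file is closed

# Reads look the key up through the S3 client and return a ReadableS3File; note that
# open() normalizes 's3n://' URLs to 's3://' first.
input_file = target.open('r')
print(input_file.read())
input_file.close()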
edx/analytics/tasks/url.py
@@ -18,7 +18,7 @@ import luigi.format
import luigi.hdfs
import luigi.s3

-from edx.analytics.tasks.s3_util import RestrictedPermissionsS3Client
+from edx.analytics.tasks.s3_util import RestrictedPermissionsS3Client, S3HdfsTarget


class ExternalURL(luigi.ExternalTask):
@@ -44,8 +44,8 @@ class IgnoredTarget(luigi.hdfs.HdfsTarget):
DEFAULT_TARGET_CLASS = luigi.LocalTarget
URL_SCHEME_TO_TARGET_CLASS = {
    'hdfs': luigi.hdfs.HdfsTarget,
-   's3': luigi.hdfs.HdfsTarget,
-   's3n': luigi.hdfs.HdfsTarget,
+   's3': S3HdfsTarget,
+   's3n': S3HdfsTarget,
    'file': luigi.LocalTarget,
    's3+https': luigi.s3.S3Target,
}
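A small sketch (not part of this commit) of what the remapping means in practice, assuming get_target_from_url dispatches on the URL scheme using this table and falls back to DEFAULT_TARGET_CLASS, and assuming AWS credentials are configured since constructing S3HdfsTarget creates an S3 client. The URLs are hypothetical.

from edx.analytics.tasks.url import get_target_from_url

# With this commit, s3:// and s3n:// URLs resolve to S3HdfsTarget instead of a plain
# luigi.hdfs.HdfsTarget, so their targets can be opened for reading and writing directly.
print(type(get_target_from_url('s3://my-bucket/some/key.gz')))
print(type(get_target_from_url('hdfs:///data/some/file.gz')))
print(type(get_target_from_url('/tmp/some/local/file.gz')))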
setup.cfg
@@ -31,5 +31,5 @@ edx.analytics.tasks =
    answer_dist = edx.analytics.tasks.answer_dist:AnswerDistributionPerCourse
mapreduce.engine =
-   hadoop = luigi.hadoop:DefaultHadoopJobRunner
+   hadoop = edx.analytics.tasks.mapreduce:MapReduceJobRunner
    local = luigi.hadoop:LocalJobRunner
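For illustration (not part of this commit), a sketch of how the 'hadoop' engine is expected to resolve once the package is reinstalled, assuming the engines are loaded through stevedore's mapreduce.engine entry point group (as the lazy-import comment in mapreduce.py suggests).

from stevedore.extension import ExtensionManager

# The entry points declared above are registered when the package is installed
# (e.g. pip install -e .).
manager = ExtensionManager('mapreduce.engine')
engine_class = manager['hadoop'].plugin
print(engine_class)  # expected: edx.analytics.tasks.mapreduce.MapReduceJobRunner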