edx / edx-analytics-data-api / Commits / 3aded9ec

Commit 3aded9ec authored Mar 25, 2014 by Gabe Mulley
Support manifest based input file lists
Change-Id: Ic059030e633848fcde13216d5a96b8b03e5c4b55
parent b16d3b46
Showing 7 changed files with 132 additions and 12 deletions
edx/analytics/tasks/answer_dist.py     +26  -3
edx/analytics/tasks/course_enroll.py   +18  -2
edx/analytics/tasks/mapreduce.py       +31  -0
edx/analytics/tasks/pathutil.py        +22  -2
edx/analytics/tasks/s3_util.py         +31  -1
edx/analytics/tasks/url.py             +3   -3
setup.cfg                              +1   -1
edx/analytics/tasks/answer_dist.py
@@ -578,11 +578,15 @@ class BaseAnswerDistributionTask(MapReduceJobTask):
        dest: a URL to the root location to write output file(s).
        include: a list of patterns to be used to match input files, relative to `src` URL.
            The default value is ['*'].
        manifest: a URL to a file location that can store the complete set of input files.
    """
    name = luigi.Parameter()
    src = luigi.Parameter()
    dest = luigi.Parameter()
    include = luigi.Parameter(is_list=True, default=('*',))

    # A manifest file is required by hadoop if there are too many input paths. It hits an operating system limit on the
    # number of arguments passed to the mapper process on the task nodes.
    manifest = luigi.Parameter(default=None)

    def extra_modules(self):
        # Boto is used for S3 access and cjson for parsing log files.
@@ -596,7 +600,7 @@ class LastProblemCheckEvent(LastProblemCheckEventMixin, BaseAnswerDistributionTask):
"""Identifies last problem_check event for a user on a problem in a course, given raw event log input."""
def
requires
(
self
):
return
PathSetTask
(
self
.
src
,
self
.
include
)
return
PathSetTask
(
self
.
src
,
self
.
include
,
self
.
manifest
)
def
output
(
self
):
output_name
=
u'last_problem_check_events_{name}/'
.
format
(
name
=
self
.
name
)
@@ -611,13 +615,26 @@ class AnswerDistributionPerCourse(AnswerDistributionPerCourseMixin, BaseAnswerDistributionTask):
    Additional Parameters:
        answer_metadata: optional file to provide information about particular answers.
            Includes problem_display_name, input_type, response_type, and question.
        base_input_format: The input format to use on the first map reduce job in the chain. This job takes in the most
            input and may need a custom input format.
    """
    answer_metadata = luigi.Parameter(default=None)
    base_input_format = luigi.Parameter(default=None)

    def requires(self):
        results = {
-           'events': LastProblemCheckEvent(self.mapreduce_engine, self.name, self.src, self.dest, self.include),
+           'events': LastProblemCheckEvent(
+               mapreduce_engine=self.mapreduce_engine,
+               input_format=self.base_input_format,
+               lib_jar=self.lib_jar,
+               n_reduce_tasks=self.n_reduce_tasks,
+               name=self.name,
+               src=self.src,
+               dest=self.dest,
+               include=self.include,
+               manifest=self.manifest,
+           ),
        }

        if self.answer_metadata:
@@ -660,15 +677,21 @@ class AnswerDistributionOneFilePerCourseTask(MultiOutputMapReduceJobTask):
    name = luigi.Parameter(default='periodic')
    output_root = luigi.Parameter()
    answer_metadata = luigi.Parameter(default=None)
    manifest = luigi.Parameter(default=None)
    base_input_format = luigi.Parameter(default=None)

    def requires(self):
        return AnswerDistributionPerCourse(
            mapreduce_engine=self.mapreduce_engine,
            lib_jar=self.lib_jar,
            base_input_format=self.base_input_format,
            n_reduce_tasks=self.n_reduce_tasks,
            src=self.src,
            dest=self.dest,
            include=self.include,
            name=self.name,
-           answer_metadata=self.answer_metadata
+           answer_metadata=self.answer_metadata,
+           manifest=self.manifest,
        )

    def mapper(self, line):
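For context, a minimal sketch (not part of this commit) of how the new manifest parameter threads from a top-level task down to PathSetTask. The URLs, the 'nightly' name, and the explicit engine value are hypothetical placeholders; real runs supply them on the luigi command line or via configuration.

from edx.analytics.tasks.answer_dist import LastProblemCheckEvent

# All parameter values below are hypothetical examples.
task = LastProblemCheckEvent(
    mapreduce_engine='hadoop',
    name='nightly',
    src='s3://my-tracking-logs/',
    dest='s3://my-output/last_problem_check/',
    include=('*tracking.log*',),
    manifest='s3://my-output/manifests/answer_dist.manifest',
)

# requires() now passes the manifest URL through to PathSetTask, so hadoop is handed a
# single manifest file instead of one command-line argument per input path.
path_set = task.requires()
print(path_set.manifest)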
edx/analytics/tasks/course_enroll.py
@@ -175,11 +175,15 @@ class BaseCourseEnrollmentTask(MapReduceJobTask):
        dest: a URL to the root location to write output file(s).
        include: a list of patterns to be used to match input files, relative to `src` URL.
            The default value is ['*'].
        manifest: a URL to a file location that can store the complete set of input files.
    """
    name = luigi.Parameter()
    src = luigi.Parameter()
    dest = luigi.Parameter()
    include = luigi.Parameter(is_list=True, default=('*',))

    # A manifest file is required by hadoop if there are too many input paths. It hits an operating system limit on the
    # number of arguments passed to the mapper process on the task nodes.
    manifest = luigi.Parameter(default=None)

    def extra_modules(self):
        # The following are needed for (almost) every course enrollment task.
@@ -194,7 +198,7 @@ class CourseEnrollmentEventsPerDay(CourseEnrollmentEventsPerDayMixin, BaseCourseEnrollmentTask):
"""Calculates daily change in enrollment for a user in a course, given raw event log input."""
def
requires
(
self
):
return
PathSetTask
(
self
.
src
,
self
.
include
)
return
PathSetTask
(
self
.
src
,
self
.
include
,
self
.
manifest
)
def
output
(
self
):
output_name
=
'course_enrollment_events_per_day_{name}'
.
format
(
name
=
self
.
name
)
@@ -204,8 +208,20 @@ class CourseEnrollmentEventsPerDay(CourseEnrollmentEventsPerDayMixin, BaseCourseEnrollmentTask):
class CourseEnrollmentChangesPerDay(CourseEnrollmentChangesPerDayMixin, BaseCourseEnrollmentTask):
    """Calculates daily changes in enrollment, given per-user net changes by date."""

    base_input_format = luigi.Parameter(default=None)

    def requires(self):
-       return CourseEnrollmentEventsPerDay(self.mapreduce_engine, self.name, self.src, self.dest, self.include)
+       return CourseEnrollmentEventsPerDay(
+           mapreduce_engine=self.mapreduce_engine,
+           input_format=self.base_input_format,
+           lib_jar=self.lib_jar,
+           n_reduce_tasks=self.n_reduce_tasks,
+           name=self.name,
+           src=self.src,
+           dest=self.dest,
+           include=self.include,
+           manifest=self.manifest
+       )

    def output(self):
        output_name = 'course_enrollment_changes_per_day_{name}'.format(name=self.name)
edx/analytics/tasks/mapreduce.py
@@ -6,6 +6,7 @@ from __future__ import absolute_import
import luigi
import luigi.hdfs
import luigi.hadoop
from luigi import configuration

from edx.analytics.tasks.url import get_target_from_url, IgnoredTarget
@@ -19,6 +20,12 @@ class MapReduceJobTask(luigi.hadoop.JobTask):
    mapreduce_engine = luigi.Parameter(default_from_config={'section': 'map-reduce', 'name': 'engine'})
    input_format = luigi.Parameter(default=None)
    lib_jar = luigi.Parameter(is_list=True, default=[])

    # Override the parent class definition of this parameter. This typically wants to scale with the cluster size so the
    # user should be able to tweak it depending on their particular configuration.
    n_reduce_tasks = luigi.Parameter(default=25)

    def job_runner(self):
        # Lazily import this since this module will be loaded on hadoop worker nodes however stevedore will not be
@@ -31,9 +38,33 @@ class MapReduceJobTask(luigi.hadoop.JobTask):
        except KeyError:
            raise KeyError('A map reduce engine must be specified in order to run MapReduceJobTasks')

        if issubclass(engine_class, MapReduceJobRunner):
            return engine_class(libjars_in_hdfs=self.lib_jar, input_format=self.input_format)
        else:
            return engine_class()


class MapReduceJobRunner(luigi.hadoop.HadoopJobRunner):
    """
    Support more customization of the streaming command.

    Args:
        libjars_in_hdfs (list): An optional list of library jars that the hadoop job can make use of.
        input_format (str): An optional full class name of a hadoop input format to use.
    """

    def __init__(self, libjars_in_hdfs=None, input_format=None):
        libjars_in_hdfs = libjars_in_hdfs or []
        config = configuration.get_config()
        streaming_jar = config.get('hadoop', 'streaming-jar')

        super(MapReduceJobRunner, self).__init__(
            streaming_jar,
            input_format=input_format,
            libjars_in_hdfs=libjars_in_hdfs
        )


class MultiOutputMapReduceJobTask(MapReduceJobTask):
    """
    Produces multiple output files from a map reduce job.
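As a rough illustration (not part of this commit), this is roughly what job_runner() builds when the configured engine resolves to MapReduceJobRunner. It assumes a luigi configuration that defines hadoop/streaming-jar and map-reduce/engine; the jar path and input format class name are hypothetical.

from edx.analytics.tasks.mapreduce import MapReduceJobRunner

# Assumed luigi configuration (e.g. in client.cfg):
#   [hadoop]
#   streaming-jar = /usr/lib/hadoop/contrib/streaming/hadoop-streaming.jar
#   [map-reduce]
#   engine = hadoop

runner = MapReduceJobRunner(
    libjars_in_hdfs=['hdfs:///lib/custom-formats.jar'],  # would come from the lib_jar parameter (hypothetical path)
    input_format='org.example.ManifestTextInputFormat',  # would come from the input_format parameter (hypothetical class)
)
# The runner extends luigi.hadoop.HadoopJobRunner, so these values end up on the
# hadoop streaming command it assembles.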
edx/analytics/tasks/pathutil.py
@@ -15,7 +15,7 @@ import luigi.hdfs
import luigi.format

from edx.analytics.tasks.s3_util import generate_s3_sources
-from edx.analytics.tasks.url import ExternalURL, url_path_join
+from edx.analytics.tasks.url import ExternalURL, url_path_join, get_target_from_url


class PathSetTask(luigi.Task):
@@ -26,15 +26,18 @@ class PathSetTask(luigi.Task):
        src: a URL pointing to a folder in s3:// or local FS.
        include: a list of patterns to use to select. Multiple patterns are OR'd.
        manifest: a URL pointing to a manifest file location.
    """
    src = luigi.Parameter()
    include = luigi.Parameter(is_list=True, default=('*',))
    manifest = luigi.Parameter(default=None)

    def __init__(self, *args, **kwargs):
        super(PathSetTask, self).__init__(*args, **kwargs)
        self.s3_conn = None

-   def requires(self):
+   def generate_file_list(self):
        """Yield each individual path given a source folder and a set of glob expressions."""
        if self.src.startswith('s3'):
            # connect lazily as needed:
            if self.s3_conn is None:
@@ -50,6 +53,23 @@ class PathSetTask(luigi.Task):
        for filepath in filelist:
            yield ExternalURL(filepath)

    def manifest_file_list(self):
        """Write each individual path to a manifest file and yield the path to that file."""
        manifest_target = get_target_from_url(self.manifest)
        if not manifest_target.exists():
            with manifest_target.open('w') as manifest_file:
                for external_url_task in self.generate_file_list():
                    manifest_file.write(external_url_task.url + '\n')

        yield ExternalURL(self.manifest)

    def requires(self):
        if self.manifest is not None:
            return self.manifest_file_list()
        else:
            return self.generate_file_list()

    def complete(self):
        # An optimization: just declare that the task is always
        # complete, by definition, because it is whatever files were
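To make the new behaviour concrete, a small sketch (not from this commit) of the two modes of PathSetTask. The local paths are hypothetical, and it assumes they exist and that the edx.analytics.tasks package and luigi are importable.

from edx.analytics.tasks.pathutil import PathSetTask

# Without a manifest, requires() yields one ExternalURL task per matching input file.
glob_task = PathSetTask(src='/tmp/logs', include=('*.log',))

# With a manifest, requires() writes every matching path to the manifest file (one URL
# per line, only if the file does not already exist) and yields a single ExternalURL
# pointing at the manifest itself.
manifest_task = PathSetTask(
    src='/tmp/logs',
    include=('*.log',),
    manifest='/tmp/manifests/input.manifest',  # hypothetical location
)

for requirement in manifest_task.requires():
    print(requirement.url)  # prints the manifest URL rather than each input file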
edx/analytics/tasks/s3_util.py
@@ -6,7 +6,9 @@ from fnmatch import fnmatch
from urlparse import urlparse

from boto.s3.key import Key

-from luigi.s3 import S3Client
+from luigi.s3 import S3Client, AtomicS3File, ReadableS3File, FileNotFoundException
import luigi.hdfs


def get_s3_bucket_key_names(url):
@@ -94,3 +96,31 @@ class RestrictedPermissionsS3Client(S3Client):
        s3_key = Key(s3_bucket)
        s3_key.key = key
        s3_key.set_contents_from_filename(local_path, policy='bucket-owner-full-control')


class S3HdfsTarget(luigi.hdfs.HdfsTarget):
    """HDFS target that supports writing and reading files directly in S3."""

    # Luigi does not support HDFS targets that point to complete URLs like "s3://foo/bar" it only supports HDFS paths
    # that look like standard file paths "/foo/bar". Once this bug is fixed this class is no longer necessary.
    # TODO: Fix the upstream bug in luigi that prevents writing to HDFS files that are specified by complete URLs

    def __init__(self, path=None, format=luigi.hdfs.Plain, is_tmp=False):
        super(S3HdfsTarget, self).__init__(path=path, format=format, is_tmp=is_tmp)
        self.s3_client = RestrictedPermissionsS3Client()

    def open(self, mode='r'):
        if mode not in ('r', 'w'):
            raise ValueError("Unsupported open mode '{mode}'".format(mode=mode))

        safe_path = self.path.replace('s3n://', 's3://')
        if mode == 'r':
            s3_key = self.s3_client.get_key(safe_path)
            if s3_key:
                return ReadableS3File(s3_key)
            else:
                raise FileNotFoundException("Could not find file at %s" % safe_path)
        else:
            return AtomicS3File(safe_path, self.s3_client)
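A brief usage sketch of S3HdfsTarget (not part of this commit), assuming AWS credentials are available to luigi's S3 client; the bucket and key below are hypothetical.

from edx.analytics.tasks.s3_util import S3HdfsTarget

target = S3HdfsTarget('s3n://my-bucket/reports/answer_dist.csv')  # hypothetical URL

# Writes go through AtomicS3File using the restricted-permissions client, which uploads
# with the bucket-owner-full-control policy.
output_file = target.open('w')
output_file.write('course_id,count\n')
output_file.close()  # the data is uploaded to S3 when the file is closed

# Reads look the key up through the S3 client and return a ReadableS3File; note that
# open() normalizes 's3n://' URLs to 's3://' first.
input_file = target.open('r')
print(input_file.read())
input_file.close()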
edx/analytics/tasks/url.py
@@ -18,7 +18,7 @@ import luigi.format
import luigi.hdfs
import luigi.s3

-from edx.analytics.tasks.s3_util import RestrictedPermissionsS3Client
+from edx.analytics.tasks.s3_util import RestrictedPermissionsS3Client, S3HdfsTarget


class ExternalURL(luigi.ExternalTask):
@@ -44,8 +44,8 @@ class IgnoredTarget(luigi.hdfs.HdfsTarget):
DEFAULT_TARGET_CLASS = luigi.LocalTarget
URL_SCHEME_TO_TARGET_CLASS = {
    'hdfs': luigi.hdfs.HdfsTarget,
-   's3': luigi.hdfs.HdfsTarget,
-   's3n': luigi.hdfs.HdfsTarget,
+   's3': S3HdfsTarget,
+   's3n': S3HdfsTarget,
    'file': luigi.LocalTarget,
    's3+https': luigi.s3.S3Target,
}
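A small sketch (not part of this commit) of what the remapping means in practice, assuming get_target_from_url dispatches on the URL scheme using this table and falls back to DEFAULT_TARGET_CLASS, and assuming AWS credentials are configured since constructing S3HdfsTarget creates an S3 client. The URLs are hypothetical.

from edx.analytics.tasks.url import get_target_from_url

# With this commit, s3:// and s3n:// URLs resolve to S3HdfsTarget instead of a plain
# luigi.hdfs.HdfsTarget, so their targets can be opened for reading and writing directly.
print(type(get_target_from_url('s3://my-bucket/some/key.gz')))
print(type(get_target_from_url('hdfs:///data/some/file.gz')))
print(type(get_target_from_url('/tmp/some/local/file.gz')))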
setup.cfg
@@ -31,5 +31,5 @@ edx.analytics.tasks =
    answer_dist = edx.analytics.tasks.answer_dist:AnswerDistributionPerCourse
mapreduce.engine =
-   hadoop = luigi.hadoop:DefaultHadoopJobRunner
+   hadoop = edx.analytics.tasks.mapreduce:MapReduceJobRunner
    local = luigi.hadoop:LocalJobRunner
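For illustration (not part of this commit), a sketch of how the 'hadoop' engine is expected to resolve once the package is reinstalled, assuming the engines are loaded through stevedore's mapreduce.engine entry point group (as the lazy-import comment in mapreduce.py suggests).

from stevedore.extension import ExtensionManager

# The entry points declared above are registered when the package is installed
# (e.g. pip install -e .).
manager = ExtensionManager('mapreduce.engine')
engine_class = manager['hadoop'].plugin
print(engine_class)  # expected: edx.analytics.tasks.mapreduce.MapReduceJobRunner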