Add transcript preference in Video and use during file discovery

6648d3bf · Qubad786 · muhammad-ammar · 7d3b62e9 · 6648d3bf · 6648d3bf
Commit 6648d3bf authored Aug 03, 2017 by Qubad786 Committed by muhammad-ammar Aug 15, 2017
Hide whitespace changes
Inline Side-by-side

Showing with 339 additions and 85 deletions

.gitignore
+3 -0

VEDA_OS01/admin.py
+7 -1

VEDA_OS01/models.py
+226 -68

VEDA_OS01/serializers.py
+31 -1

control/veda_file_discovery.py
+42 -10

control/veda_file_ingest.py
+30 -5

No files found.
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,6 @@ sandbox.db
 .coverage
 coverage/
+# TODO remove this once migrations are shipped with the code
+*/migrations/*
--- a/VEDA_OS01/admin.py
+++ b/VEDA_OS01/admin.py
@@ -34,7 +34,13 @@ class VideoAdmin(admin.ModelAdmin):
        'studio_id',
        'video_trans_start',
        'video_trans_status',
-        'video_active'
+        'video_active',
+        'process_transcription',
+        'provider',
+        'three_play_turnaround',
+        'cielo24_turnaround',
+        'cielo24_fidelity',
+        'preferred_languages',
    ]
    list_filter = ['inst_class__institution']
    search_fields = ['edx_id', 'client_title', 'studio_id']

--- a/VEDA_OS01/models.py
+++ b/VEDA_OS01/models.py
 """
 Models for Video Pipeline
 """
+import json
 import uuid
 from django.db import models
 from model_utils.models import TimeStampedModel
@@ -40,12 +41,185 @@ class TranscriptStatus(object):
    )
+class Cielo24Turnaround(object):
+    """
+    Cielo24 turnaround enumeration.
+    It is the time taken by the Cielo24 transcription process.
+    CHOICES pairs each stored value with its human-readable label.
+    """
+    STANDARD = 'STANDARD'
+    PRIORITY = 'PRIORITY'
+    CHOICES = (
+        (STANDARD, 'Standard, 48h'),
+        (PRIORITY, 'Priority, 24h'),
+    )
+class Cielo24Fidelity(object):
+    """
+    Cielo24 fidelity enumeration.
+    This decides the transcript's accuracy and the supported languages.
+    CHOICES pairs each stored value with its human-readable label.
+    """
+    MECHANICAL = 'MECHANICAL'
+    PREMIUM = 'PREMIUM'
+    PROFESSIONAL = 'PROFESSIONAL'
+    CHOICES = (
+        (MECHANICAL, 'Mechanical, 75% Accuracy'),
+        (PREMIUM, 'Premium, 95% Accuracy'),
+        (PROFESSIONAL, 'Professional, 99% Accuracy'),
+    )
+class ThreePlayTurnaround(object):
+    """
+    3PlayMedia turnaround enumeration.
+    It is the time taken by the 3PlayMedia transcription process.
+    CHOICES pairs each stored value with its human-readable label.
+    """
+    EXTENDED_SERVICE = 'extended_service'
+    DEFAULT = 'default'
+    EXPEDITED_SERVICE = 'expedited_service'
+    RUSH_SERVICE = 'rush_service'
+    SAME_DAY_SERVICE = 'same_day_service'
+    CHOICES = (
+        (EXTENDED_SERVICE, '10-Day/Extended'),
+        (DEFAULT, '4-Day/Default'),
+        (EXPEDITED_SERVICE, '2-Day/Expedited'),
+        (RUSH_SERVICE, '24 hour/Rush'),
+        (SAME_DAY_SERVICE, 'Same Day'),
+    )
+class VideoStatus(object):
+    """
+    Video status enumeration.
+    CHOICES pairs each stored status value with its human-readable label.
+    TODO: STATUS REMODEL:
+    Change to
+    'Ingest',
+    'Queued',
+    'In Progress',
+    'Corrupt',
+    'Complete',
+    'Error',
+    'Duplicate',
+    'Review',
+    'Reject'
+    Possible additions:
+        'Invalid' (for ingest detected)
+        'Retry'
+        'Delivery' (for celery states?)
+    """
+    SI = 'Ingest'
+    TQ = 'Transcode Queue'
+    AT = 'Active Transcode'
+    TR = 'Transcode Retry'
+    TC = 'Transcode Complete'
+    DU = 'Deliverable Upload'
+    FC = 'File Complete'
+    TE = 'Transcode Error'
+    CF = 'Corrupt File'
+    RH = 'Review Hold'
+    RR = 'Review Reject'
+    RP = 'Final Publish'
+    YD = 'Youtube Duplicate'
+    QUEUE = 'In Encode Queue'
+    PROGRESS = 'Progress'
+    COMPLETE = 'Complete'
+    TRANSCRIPTION_IN_PROGRESS = 'transcription_in_progress'
+    TRANSCRIPTION_READY = 'transcription_ready'
+    CHOICES = (
+        (SI, 'System Ingest'),
+        (TQ, 'Transcode Queue'),
+        (AT, 'Active Transcode'),
+        (TR, 'Transcode Retry'),
+        (TC, 'Transcode Complete'),
+        (DU, 'Deliverable Upload'),
+        (FC, 'File Complete'),
+        (TE, 'Transcode Error'),
+        (CF, 'Corrupt File on Ingest'),
+        (RH, 'Review Hold'),
+        (RR, 'Review Rejected'),
+        (RP, 'Review to Final Publish'),
+        (YD, 'Youtube Duplicate'),
+        (QUEUE, 'In Encode Queue'),
+        (PROGRESS, 'In Progress'),
+        (COMPLETE, 'Complete'),
+        (TRANSCRIPTION_IN_PROGRESS, 'Transcription In Progress'),
+        (TRANSCRIPTION_READY, 'Transcription Ready'),
+    )
+class ListField(models.TextField):
+    """
+    A List Field which can be used to store and retrieve a python list of strings.
+    The list is stored in the database as its json text representation.
+    """
+    def get_prep_value(self, value):
+        """
+        Converts a list to its json representation to store in database as text.
+        Falsy values (None, empty list, ...) are stored as an empty json list,
+        mirroring how `to_python` treats them.
+        Arguments:
+            value(list): list of strings to be stored
+        Returns:
+            json text of the list
+        Raises:
+            ValueError: if value is not a list or contains non-string items
+        """
+        if not value:
+            # Nothing to validate; this also avoids iterating over None.
+            return json.dumps([])
+        if not isinstance(value, list):
+            raise ValueError(u'The given value {} is not a list.'.format(value))
+        return json.dumps(self.validate_list(value))
+    def from_db_value(self, value, expression, connection, context):
+        """
+        Converts a json list representation in a database to a python object.
+        """
+        return self.to_python(value)
+    def to_python(self, value):
+        """
+        Converts the value into a list.
+        Arguments:
+            value: a list of strings, json text of such a list, or a falsy value
+        Returns:
+            A list of strings (empty list for falsy values)
+        Raises:
+            ValueError: if value cannot be converted into a list of strings
+        """
+        if not value:
+            value = []
+        # If a list is set then validate its items
+        if isinstance(value, list):
+            py_list = self.validate_list(value)
+        else:  # try to de-serialize value and expect list and then validate
+            try:
+                py_list = json.loads(value)
+                if not isinstance(py_list, list):
+                    raise TypeError
+                self.validate_list(py_list)
+            except (ValueError, TypeError):
+                raise ValueError(u'Must be a valid list of strings.')
+        return py_list
+    def validate_list(self, value):
+        """
+        Validate the data before saving into the database.
+        Arguments:
+            value(list): list to be validated
+        Returns:
+            A list if validation is successful
+        Raises:
+            ValueError: if any item of the list is not a string
+        """
+        if not all(isinstance(item, basestring) for item in value):
+            raise ValueError(u'list must only contain strings.')
+        return value
 class Institution (models.Model):
    institution_code = models.CharField(max_length=4)
    institution_name = models.CharField(max_length=50)
    def __unicode__(self):
-        return u'%s %s'.format(self.institution_name, self.institution_code) or u''
+        return u'{institution_name} {institution_code}'.format(
+            institution_name=self.institution_name,
+            institution_code=self.institution_code,
+        )
 class Course (models.Model):
@@ -204,11 +378,11 @@ class Course (models.Model):
    )
    def __unicode__(self):
-        return u'%s %s %s'.format(
+        return u'{institution} {edx_class_id} {course_name}'.format(
-            self.institution,
+            institution=self.institution,
-            self.edx_classid,
+            edx_class_id=self.edx_classid,
-            self.course_name
+            course_name=self.course_name,
-        ) or u''
+        )
 class Video (models.Model):
@@ -255,73 +429,53 @@ class Video (models.Model):
    video_trans_start = models.DateTimeField('Process Start', null=True, blank=True)
    video_trans_end = models.DateTimeField('Process Complete', null=True, blank=True)
-    """
-    TODO: STATUS REMODEL:
-    Change to
-    'Ingest',
-    'Queued',
-    'In Progress',
-    'Corrupt',
-    'Complete',
-    'Error',
-    'Duplicate',
-    'Review',
-    'Reject'
-    Possile:
-        'Invalid' (for ingest detected)
-        'Retry'
-        'Delivery' (for celery states?)
-    """
-    SI = 'Ingest'
-    TQ = 'Transcode Queue'
-    AT = 'Active Transcode'
-    TR = 'Transcode Retry'
-    TC = 'Transcode Complete'
-    DU = 'Deliverable Upload'
-    FC = 'File Complete'
-    CF = 'Corrupt File'
-    RH = 'Review Hold'
-    RR = 'Review Reject'
-    RP = 'Final Publish'
-    YD = 'Youtube Duplicate'
-    TRANS_STATUS_OPTIONS = (
-        (SI, "System Ingest"),
-        (TQ, "Transcode Queue"),
-        (AT, "Active Transcode"),
-        (TR, "Transcode Retry"),
-        (TC, "Transcode Complete"),
-        (DU, "Deliverable Upload"),
-        (FC, "File Complete"),
-        ('Transcode Error', "Transcode Error"),
-        (CF, "Corrupt File on Ingest"),
-        (RH, "Review Hold"),
-        (RR, "Review Rejected"),
-        (RP, "Review to Final Publish"),
-        (YD, "Youtube Duplicate"),
-        ('Queue', "In Encode Queue"),
-        ('Progress', "In Progress"),
-        ('Complete', "Complete")
-    )
    video_trans_status = models.CharField(
        'Transcode Status',
        max_length=100,
-        choices=TRANS_STATUS_OPTIONS,
+        choices=VideoStatus.CHOICES,
-        default=SI
+        default=VideoStatus.SI
    )
    video_glacierid = models.CharField('Glacier Archive ID String', max_length=200, null=True, blank=True)
    abvid_serial = models.CharField('VEDA Upload Process Serial', max_length=20, null=True, blank=True)
    stat_queuetime = models.FloatField('Video Avg. Queuetime (sec)', default=0)
+    # 3rd Party Transcription
+    process_transcription = models.BooleanField('Process transcripts from Cielo24/3PlayMedia', default=False)
+    provider = models.CharField(
+        'Transcription provider',
+        max_length=20,
+        choices=TranscriptProvider.CHOICES,
+        null=True,
+        blank=True,
+    )
+    three_play_turnaround = models.CharField(
+        '3PlayMedia Turnaround',
+        max_length=20,
+        choices=ThreePlayTurnaround.CHOICES,
+        null=True,
+        blank=True,
+    )
+    cielo24_turnaround = models.CharField(
+        'Cielo24 Turnaround',
+        max_length=20,
+        choices=Cielo24Turnaround.CHOICES,
+        null=True,
+        blank=True,
+    )
+    cielo24_fidelity = models.CharField(
+        'Cielo24 Fidelity',
+        max_length=20,
+        choices=Cielo24Fidelity.CHOICES,
+        null=True,
+        blank=True,
+    )
+    # Use a callable default so every new Video gets its own empty list
+    # instead of sharing one mutable list object across instances.
+    preferred_languages = ListField(blank=True, default=list)
    class Meta:
        get_latest_by = 'video_trans_start'
    def __unicode__(self):
-        return u'%s'.format(self.edx_id) or u''
+        return u'{edx_id}'.format(edx_id=self.edx_id)
 class Destination (models.Model):
@@ -378,7 +532,7 @@ class Encode (models.Model):
    xuetang_proc = models.BooleanField('Submit to XuetangX', default=False)
    def __unicode__(self):
-        return u'%s'.format(self.encode_name)
+        return u'{encode_profile}'.format(encode_profile=self.encode_name)
 class URL (models.Model):
@@ -401,7 +555,11 @@ class URL (models.Model):
        get_latest_by = 'url_date'
    def __unicode__(self):
-        return u'%s : %s : %s'.format(self.videoID, self.encode_profile.encode_name, self.url_date) or u''
+        return u'{video_id} : {encode_profile} : {date}'.format(
+            video_id=self.videoID,
+            encode_profile=self.encode_profile.encode_name,
+            date=self.url_date,
+        )
 class VedaUpload (models.Model):
@@ -453,11 +611,11 @@ class VedaUpload (models.Model):
        get_latest_by = 'upload_date'
    def __unicode__(self):
-        return u'%s %s %s %s'.format(
+        return u'{client_information} {upload_filename} {status_email} {file_complete}'.format(
-            self.client_information,
+            client_information=self.client_information,
-            self.upload_filename,
+            upload_filename=self.upload_filename,
-            self.status_email,
+            status_email=self.status_email,
-            self.file_complete
+            file_complete=self.file_complete
        )
@@ -498,8 +656,8 @@ class TranscriptProcessMetadata(TimeStampedModel):
    )
    class Meta:
-        unique_together = ('video', 'provider', 'lang_code')
        verbose_name_plural = 'Transcript process metadata'
+        get_latest_by = 'modified'
    def __unicode__(self):
        return u'{video} - {provider} - {lang}'.format(

--- a/VEDA_OS01/serializers.py
+++ b/VEDA_OS01/serializers.py
@@ -73,7 +73,13 @@ class VideoSerializer(serializers.ModelSerializer):
            'video_trans_end',
            'video_trans_status',
            'video_glacierid',
-            'course_ids'
+            'course_ids',
+            'process_transcription',
+            'provider',
+            'three_play_turnaround',
+            'cielo24_turnaround',
+            'cielo24_fidelity',
+            'preferred_languages',
        )
    def get_course_ids(self, video):
@@ -128,6 +134,30 @@ class VideoSerializer(serializers.ModelSerializer):
            'video_glacierid',
            instance.video_glacierid
        )
+        instance.process_transcription = validated_data.get(
+            'process_transcription',
+            instance.process_transcription
+        )
+        instance.provider = validated_data.get(
+            'provider',
+            instance.provider
+        )
+        instance.three_play_turnaround = validated_data.get(
+            'three_play_turnaround',
+            instance.three_play_turnaround
+        )
+        instance.cielo24_turnaround = validated_data.get(
+            'cielo24_turnaround',
+            instance.cielo24_turnaround
+        )
+        instance.cielo24_fidelity = validated_data.get(
+            'cielo24_fidelity',
+            instance.cielo24_fidelity
+        )
+        instance.preferred_languages = validated_data.get(
+            'preferred_languages',
+            instance.preferred_languages
+        )
        instance.save()
        return instance

--- a/control/veda_file_discovery.py
+++ b/control/veda_file_discovery.py
+import json
+import logging
 import os.path
 import boto
 import boto.s3
 from boto.exception import S3ResponseError, S3DataError
 import yaml
+from VEDA_OS01.models import TranscriptPreferences
 try:
    boto.config.add_section('Boto')
 except:
@@ -26,6 +30,8 @@ from veda_utils import ErrorObject
 from veda_file_ingest import VideoProto, VedaIngest
 from veda_val import VALAPICall
+LOGGER = logging.getLogger(__name__)
 class FileDiscovery(object):
@@ -159,6 +165,7 @@ class FileDiscovery(object):
        client_title = meta.get_metadata('client_video_id')
        course_hex = meta.get_metadata('course_video_upload_token')
        course_url = meta.get_metadata('course_key')
+        transcript_preferences = meta.get_metadata('transcript_preferences')
        edx_filename = key.name[::-1].split('/')[0][::-1]
        if len(course_hex) == 0:
@@ -226,24 +233,49 @@ class FileDiscovery(object):
            key.delete()
            return
-        """
+        # Make decision if this video needs the transcription as well.
-        Trigger Ingest Process
+        try:
-        """
+            transcript_preferences = json.loads(transcript_preferences)
-        V = VideoProto(
+            TranscriptPreferences.objects.get(
+                # TODO: Once ammar is done with cielo24.
+                # org=extract_course_org(course_url),
+                org=transcript_preferences.get('org'),
+                provider=transcript_preferences.get('provider')
+            )
+            process_transcription = True
+        except (TypeError, TranscriptPreferences.DoesNotExist):
+            # The preferences are not set, OR they are set to data in an invalid format, OR they don't
+            # have associated 3rd party transcription provider API keys.
+            process_transcription = False
+        except ValueError:
+            LOGGER.error('[VIDEO-PIPELINE] File Discovery - Invalid transcripts preferences=%s', transcript_preferences)
+            process_transcription = False
+        # Trigger Ingest Process
+        video_metadata = dict(
            s3_filename=edx_filename,
            client_title=client_title,
            file_extension=file_extension,
-            platform_course_url=course_url
+            platform_course_url=course_url,
        )
+        if process_transcription:
-        I = VedaIngest(
+            video_metadata.update({
+                'process_transcription': process_transcription,
+                'provider': transcript_preferences.get('provider'),
+                'three_play_turnaround': transcript_preferences.get('three_play_turnaround'),
+                'cielo24_turnaround': transcript_preferences.get('cielo24_turnaround'),
+                'cielo24_fidelity': transcript_preferences.get('cielo24_fidelity'),
+                'preferred_languages': transcript_preferences.get('preferred_languages'),
+            })
+        ingest = VedaIngest(
            course_object=course_query[0],
-            video_proto=V,
+            video_proto=VideoProto(**video_metadata),
            node_work_directory=self.node_work_directory
        )
-        I.insert()
+        ingest.insert()
-        if I.complete is False:
+        if ingest.complete is False:
            return
        """

--- a/control/veda_file_ingest.py
+++ b/control/veda_file_ingest.py
+import logging
 import os
 import sys
 import subprocess
@@ -7,6 +7,7 @@ from datetime import timedelta
 import time
 import fnmatch
 import django
+from django.db.utils import DatabaseError
 from django.utils.timezone import utc
 from django.db import reset_queries
 import uuid
@@ -32,6 +33,8 @@ from veda_val import VALAPICall
 from veda_encode import VedaEncode
 import celeryapp
+LOGGER = logging.getLogger(__name__)
 '''
 V = VideoProto(
    s3_filename=edx_filename,
@@ -59,9 +62,16 @@ class VideoProto():
        self.file_extension = kwargs.get('file_extension', None)
        self.platform_course_url = kwargs.get('platform_course_url', None)
        self.abvid_serial = kwargs.get('abvid_serial', None)
-        """
-        Determined Attrib
+        # Transcription Process related Attributes
-        """
+        self.process_transcription = kwargs.get('process_transcription', False)
+        self.provider = kwargs.get('provider', None)
+        self.three_play_turnaround = kwargs.get('three_play_turnaround', None)
+        self.cielo24_turnaround = kwargs.get('cielo24_turnaround', None)
+        self.cielo24_fidelity = kwargs.get('cielo24_fidelity', None)
+        self.preferred_languages = kwargs.get('preferred_languages', [])
+        # Determined Attributes
        self.valid = False
        self.filesize = 0
        self.duration = 0
@@ -333,6 +343,15 @@ class VedaIngest:
            self.complete = True
            return None
+        # Update transcription preferences for the Video
+        if self.video_proto.process_transcription:
+            v1.process_transcription = self.video_proto.process_transcription
+            v1.provider = self.video_proto.provider
+            v1.three_play_turnaround = self.video_proto.three_play_turnaround
+            v1.cielo24_turnaround = self.video_proto.cielo24_turnaround
+            v1.cielo24_fidelity = self.video_proto.cielo24_fidelity
+            v1.preferred_languages = self.video_proto.preferred_languages
        """
        Files Below are all valid
        """
@@ -353,7 +372,8 @@ class VedaIngest:
        """
        try:
            v1.save()
-        except:
+        except DatabaseError:
+            # in case the client title is too long
            char_string = self.video_proto.client_title
            string_len = len(char_string)
            s1 = 0
@@ -368,6 +388,11 @@ class VedaIngest:
            v1.client_title = final_string
            v1.save()
+        except Exception:
+            # Log the exception and raise.
+            LOGGER.exception('[VIDEO-PIPELINE] File Ingest - Cataloging of video=%s failed.', self.video_proto.veda_id)
+            raise
    def val_insert(self):
        if self.video_proto.abvid_serial is not None:
            return None