Commit 6648d3bf by Qubad786 Committed by muhammad-ammar

Add transcript preference in Video and use during file discovery

parent 7d3b62e9
...@@ -10,3 +10,6 @@ sandbox.db ...@@ -10,3 +10,6 @@ sandbox.db
.coverage .coverage
coverage/ coverage/
# TODO remove this once migrations are shipped with the code
*/migrations/*
...@@ -34,7 +34,13 @@ class VideoAdmin(admin.ModelAdmin): ...@@ -34,7 +34,13 @@ class VideoAdmin(admin.ModelAdmin):
'studio_id', 'studio_id',
'video_trans_start', 'video_trans_start',
'video_trans_status', 'video_trans_status',
'video_active' 'video_active',
'process_transcription',
'provider',
'three_play_turnaround',
'cielo24_turnaround',
'cielo24_fidelity',
'preferred_languages',
] ]
list_filter = ['inst_class__institution'] list_filter = ['inst_class__institution']
search_fields = ['edx_id', 'client_title', 'studio_id'] search_fields = ['edx_id', 'client_title', 'studio_id']
......
""" """
Models for Video Pipeline Models for Video Pipeline
""" """
import json
import uuid import uuid
from django.db import models from django.db import models
from model_utils.models import TimeStampedModel from model_utils.models import TimeStampedModel
...@@ -40,12 +41,185 @@ class TranscriptStatus(object): ...@@ -40,12 +41,185 @@ class TranscriptStatus(object):
) )
class Cielo24Turnaround(object):
    """
    Turnaround Enumeration.

    It's the time taken by the Cielo24 transcription process.
    """
    STANDARD = 'STANDARD'
    PRIORITY = 'PRIORITY'

    # (stored value, human-readable label) pairs.
    CHOICES = (
        (STANDARD, 'Standard, 48h'),
        (PRIORITY, 'Priority, 24h'),
    )
class Cielo24Fidelity(object):
    """
    Fidelity Enumeration.

    This decides the transcript's accuracy and supported languages.
    """
    MECHANICAL = 'MECHANICAL'
    PREMIUM = 'PREMIUM'
    PROFESSIONAL = 'PROFESSIONAL'

    # (stored value, human-readable label) pairs.
    CHOICES = (
        (MECHANICAL, 'Mechanical, 75% Accuracy'),
        (PREMIUM, 'Premium, 95% Accuracy'),
        (PROFESSIONAL, 'Professional, 99% Accuracy'),
    )
class ThreePlayTurnaround(object):
    """
    Turnaround Enumeration.

    It's the time taken by the 3PlayMedia transcription process.
    """
    EXTENDED_SERVICE = 'extended_service'
    DEFAULT = 'default'
    EXPEDITED_SERVICE = 'expedited_service'
    RUSH_SERVICE = 'rush_service'
    SAME_DAY_SERVICE = 'same_day_service'

    # (stored value, human-readable label) pairs.
    CHOICES = (
        (EXTENDED_SERVICE, '10-Day/Extended'),
        (DEFAULT, '4-Day/Default'),
        (EXPEDITED_SERVICE, '2-Day/Expedited'),
        (RUSH_SERVICE, '24 hour/Rush'),
        (SAME_DAY_SERVICE, 'Same Day'),
    )
class VideoStatus(object):
    """
    Video Status Enumeration

    TODO: STATUS REMODEL:
        Change to
            'Ingest',
            'Queued',
            'In Progress',
            'Corrupt',
            'Complete',
            'Error',
            'Duplicate',
            'Review',
            'Reject'
        Possibles:
            'Invalid' (for ingest detected)
            'Retry'
            'Delivery' (for celery states?)
    """
    SI = 'Ingest'
    TQ = 'Transcode Queue'
    AT = 'Active Transcode'
    TR = 'Transcode Retry'
    TC = 'Transcode Complete'
    DU = 'Deliverable Upload'
    FC = 'File Complete'
    TE = 'Transcode Error'
    CF = 'Corrupt File'
    RH = 'Review Hold'
    RR = 'Review Reject'
    RP = 'Final Publish'
    YD = 'Youtube Duplicate'
    # NOTE(review): the inline status choices this class replaces stored
    # 'Queue' for this state, while PROGRESS and COMPLETE kept their old
    # stored values. Confirm that existing rows and callers which still use
    # 'Queue' are migrated or handled before relying on this constant.
    QUEUE = 'In Encode Queue'
    PROGRESS = 'Progress'
    COMPLETE = 'Complete'
    TRANSCRIPTION_IN_PROGRESS = 'transcription_in_progress'
    TRANSCRIPTION_READY = 'transcription_ready'

    # (stored value, human-readable label) pairs.
    CHOICES = (
        (SI, 'System Ingest'),
        (TQ, 'Transcode Queue'),
        (AT, 'Active Transcode'),
        (TR, 'Transcode Retry'),
        (TC, 'Transcode Complete'),
        (DU, 'Deliverable Upload'),
        (FC, 'File Complete'),
        (TE, 'Transcode Error'),
        (CF, 'Corrupt File on Ingest'),
        (RH, 'Review Hold'),
        (RR, 'Review Rejected'),
        (RP, 'Review to Final Publish'),
        (YD, 'Youtube Duplicate'),
        (QUEUE, 'In Encode Queue'),
        (PROGRESS, 'In Progress'),
        (COMPLETE, 'Complete'),
        (TRANSCRIPTION_IN_PROGRESS, 'Transcription In Progress'),
        (TRANSCRIPTION_READY, 'Transcription Ready'),
    )
class ListField(models.TextField):
    """
    A List Field which can be used to store and retrieve a pythonic list of strings.

    The list is serialized to JSON text when written to the database and
    de-serialized back into a list when read.
    """

    def get_prep_value(self, value):
        """
        Converts a list to its json representation to store in database as text.

        Arguments:
            value(list): list of strings; any falsy value (e.g. None or an
                empty list) is stored as an empty list.

        Returns:
            JSON text of the list

        Raises:
            ValueError: if value is truthy but not a list, or if any item
                in the list is not a string
        """
        if not value:
            # Falsy values (None in particular) must not reach validate_list:
            # iterating over None there raised an unintended TypeError.
            return json.dumps([])

        if not isinstance(value, list):
            raise ValueError(u'The given value {} is not a list.'.format(value))

        return json.dumps(self.validate_list(value))

    def from_db_value(self, value, expression, connection, context=None):
        """
        Converts a json list representation in a database to a python object.

        `context` defaults to None so the method works whether or not the
        installed Django version still passes that argument.
        """
        return self.to_python(value)

    def to_python(self, value):
        """
        Converts the value into a list.

        Arguments:
            value: a list of strings, JSON text of such a list, or a falsy value

        Returns:
            A list of strings (an empty list for any falsy value)

        Raises:
            ValueError: if value is neither a valid list of strings nor
                JSON text of one
        """
        if not value:
            return []

        # If a list is set then validate its items
        if isinstance(value, list):
            return self.validate_list(value)

        # otherwise try to de-serialize the value, expect a list and validate it
        try:
            py_list = json.loads(value)
            if not isinstance(py_list, list):
                raise TypeError
            return self.validate_list(py_list)
        except (ValueError, TypeError):
            raise ValueError(u'Must be a valid list of strings.')

    def validate_list(self, value):
        """
        Validate the data before saving into the database.

        Arguments:
            value(list): list to be validated

        Returns:
            A list if validation is successful

        Raises:
            ValueError: if any item in the list is not a string
        """
        try:
            string_types = basestring  # Python 2: matches both str and unicode
        except NameError:
            string_types = str  # Python 3 has no basestring

        if not all(isinstance(item, string_types) for item in value):
            raise ValueError(u'list must only contain strings.')

        return value
class Institution (models.Model): class Institution (models.Model):
institution_code = models.CharField(max_length=4) institution_code = models.CharField(max_length=4)
institution_name = models.CharField(max_length=50) institution_name = models.CharField(max_length=50)
def __unicode__(self): def __unicode__(self):
return u'%s %s'.format(self.institution_name, self.institution_code) or u'' return u'{institution_name} {institution_code}'.format(
institution_name=self.institution_name,
institution_code=self.institution_code,
)
class Course (models.Model): class Course (models.Model):
...@@ -204,11 +378,11 @@ class Course (models.Model): ...@@ -204,11 +378,11 @@ class Course (models.Model):
) )
def __unicode__(self): def __unicode__(self):
return u'%s %s %s'.format( return u'{institution} {edx_class_id} {course_name}'.format(
self.institution, institution=self.institution,
self.edx_classid, edx_class_id=self.edx_classid,
self.course_name course_name=self.course_name,
) or u'' )
class Video (models.Model): class Video (models.Model):
...@@ -255,73 +429,53 @@ class Video (models.Model): ...@@ -255,73 +429,53 @@ class Video (models.Model):
video_trans_start = models.DateTimeField('Process Start', null=True, blank=True) video_trans_start = models.DateTimeField('Process Start', null=True, blank=True)
video_trans_end = models.DateTimeField('Process Complete', null=True, blank=True) video_trans_end = models.DateTimeField('Process Complete', null=True, blank=True)
"""
TODO: STATUS REMODEL:
Change to
'Ingest',
'Queued',
'In Progress',
'Corrupt',
'Complete',
'Error',
'Duplicate',
'Review',
'Reject'
Possile:
'Invalid' (for ingest detected)
'Retry'
'Delivery' (for celery states?)
"""
SI = 'Ingest'
TQ = 'Transcode Queue'
AT = 'Active Transcode'
TR = 'Transcode Retry'
TC = 'Transcode Complete'
DU = 'Deliverable Upload'
FC = 'File Complete'
CF = 'Corrupt File'
RH = 'Review Hold'
RR = 'Review Reject'
RP = 'Final Publish'
YD = 'Youtube Duplicate'
TRANS_STATUS_OPTIONS = (
(SI, "System Ingest"),
(TQ, "Transcode Queue"),
(AT, "Active Transcode"),
(TR, "Transcode Retry"),
(TC, "Transcode Complete"),
(DU, "Deliverable Upload"),
(FC, "File Complete"),
('Transcode Error', "Transcode Error"),
(CF, "Corrupt File on Ingest"),
(RH, "Review Hold"),
(RR, "Review Rejected"),
(RP, "Review to Final Publish"),
(YD, "Youtube Duplicate"),
('Queue', "In Encode Queue"),
('Progress', "In Progress"),
('Complete', "Complete")
)
video_trans_status = models.CharField( video_trans_status = models.CharField(
'Transcode Status', 'Transcode Status',
max_length=100, max_length=100,
choices=TRANS_STATUS_OPTIONS, choices=VideoStatus.CHOICES,
default=SI default=VideoStatus.SI
) )
video_glacierid = models.CharField('Glacier Archive ID String', max_length=200, null=True, blank=True) video_glacierid = models.CharField('Glacier Archive ID String', max_length=200, null=True, blank=True)
abvid_serial = models.CharField('VEDA Upload Process Serial', max_length=20, null=True, blank=True) abvid_serial = models.CharField('VEDA Upload Process Serial', max_length=20, null=True, blank=True)
stat_queuetime = models.FloatField('Video Avg. Queuetime (sec)', default=0) stat_queuetime = models.FloatField('Video Avg. Queuetime (sec)', default=0)
# 3rd Party Transcription
process_transcription = models.BooleanField('Process transcripts from Cielo24/3PlayMedia', default=False)
provider = models.CharField(
'Transcription provider',
max_length=20,
choices=TranscriptProvider.CHOICES,
null=True,
blank=True,
)
three_play_turnaround = models.CharField(
'3PlayMedia Turnaround',
max_length=20,
choices=ThreePlayTurnaround.CHOICES,
null=True,
blank=True,
)
cielo24_turnaround = models.CharField(
'Cielo24 Turnaround', max_length=20,
choices=Cielo24Turnaround.CHOICES,
null=True,
blank=True,
)
cielo24_fidelity = models.CharField(
'Cielo24 Fidelity',
max_length=20,
choices=Cielo24Fidelity.CHOICES,
null=True,
blank=True,
)
preferred_languages = ListField(blank=True, default=[])
class Meta: class Meta:
get_latest_by = 'video_trans_start' get_latest_by = 'video_trans_start'
def __unicode__(self): def __unicode__(self):
return u'%s'.format(self.edx_id) or u'' return u'{edx_id}'.format(edx_id=self.edx_id)
class Destination (models.Model): class Destination (models.Model):
...@@ -378,7 +532,7 @@ class Encode (models.Model): ...@@ -378,7 +532,7 @@ class Encode (models.Model):
xuetang_proc = models.BooleanField('Submit to XuetangX', default=False) xuetang_proc = models.BooleanField('Submit to XuetangX', default=False)
def __unicode__(self): def __unicode__(self):
return u'%s'.format(self.encode_name) return u'{encode_profile}'.format(encode_profile=self.encode_name)
class URL (models.Model): class URL (models.Model):
...@@ -401,7 +555,11 @@ class URL (models.Model): ...@@ -401,7 +555,11 @@ class URL (models.Model):
get_latest_by = 'url_date' get_latest_by = 'url_date'
def __unicode__(self): def __unicode__(self):
return u'%s : %s : %s'.format(self.videoID, self.encode_profile.encode_name, self.url_date) or u'' return u'{video_id} : {encode_profile} : {date}'.format(
video_id=self.videoID,
encode_profile=self.encode_profile.encode_name,
date=self.url_date,
)
class VedaUpload (models.Model): class VedaUpload (models.Model):
...@@ -453,11 +611,11 @@ class VedaUpload (models.Model): ...@@ -453,11 +611,11 @@ class VedaUpload (models.Model):
get_latest_by = 'upload_date' get_latest_by = 'upload_date'
def __unicode__(self): def __unicode__(self):
return u'%s %s %s %s'.format( return u'{client_information} {upload_filename} {status_email} {file_complete}'.format(
self.client_information, client_information=self.client_information,
self.upload_filename, upload_filename=self.upload_filename,
self.status_email, status_email=self.status_email,
self.file_complete file_complete=self.file_complete
) )
...@@ -498,8 +656,8 @@ class TranscriptProcessMetadata(TimeStampedModel): ...@@ -498,8 +656,8 @@ class TranscriptProcessMetadata(TimeStampedModel):
) )
class Meta: class Meta:
unique_together = ('video', 'provider', 'lang_code')
verbose_name_plural = 'Transcript process metadata' verbose_name_plural = 'Transcript process metadata'
get_latest_by = 'modified'
def __unicode__(self): def __unicode__(self):
return u'{video} - {provider} - {lang}'.format( return u'{video} - {provider} - {lang}'.format(
......
...@@ -73,7 +73,13 @@ class VideoSerializer(serializers.ModelSerializer): ...@@ -73,7 +73,13 @@ class VideoSerializer(serializers.ModelSerializer):
'video_trans_end', 'video_trans_end',
'video_trans_status', 'video_trans_status',
'video_glacierid', 'video_glacierid',
'course_ids' 'course_ids',
'process_transcription',
'provider',
'three_play_turnaround',
'cielo24_turnaround',
'cielo24_fidelity',
'preferred_languages',
) )
def get_course_ids(self, video): def get_course_ids(self, video):
...@@ -128,6 +134,30 @@ class VideoSerializer(serializers.ModelSerializer): ...@@ -128,6 +134,30 @@ class VideoSerializer(serializers.ModelSerializer):
'video_glacierid', 'video_glacierid',
instance.video_glacierid instance.video_glacierid
) )
instance.process_transcription = validated_data.get(
'process_transcription',
instance.process_transcription
)
instance.provider = validated_data.get(
'provider',
instance.provider
)
instance.three_play_turnaround = validated_data.get(
'three_play_turnaround',
instance.three_play_turnaround
)
instance.cielo24_turnaround = validated_data.get(
'cielo24_turnaround',
instance.cielo24_turnaround
)
instance.cielo24_fidelity = validated_data.get(
'cielo24_fidelity',
instance.cielo24_fidelity
)
instance.preferred_languages = validated_data.get(
'preferred_languages',
instance.preferred_languages
)
instance.save() instance.save()
return instance return instance
......
import json
import logging
import os.path import os.path
import boto import boto
import boto.s3 import boto.s3
from boto.exception import S3ResponseError, S3DataError from boto.exception import S3ResponseError, S3DataError
import yaml import yaml
from VEDA_OS01.models import TranscriptPreferences
try: try:
boto.config.add_section('Boto') boto.config.add_section('Boto')
except: except:
...@@ -26,6 +30,8 @@ from veda_utils import ErrorObject ...@@ -26,6 +30,8 @@ from veda_utils import ErrorObject
from veda_file_ingest import VideoProto, VedaIngest from veda_file_ingest import VideoProto, VedaIngest
from veda_val import VALAPICall from veda_val import VALAPICall
LOGGER = logging.getLogger(__name__)
class FileDiscovery(object): class FileDiscovery(object):
...@@ -159,6 +165,7 @@ class FileDiscovery(object): ...@@ -159,6 +165,7 @@ class FileDiscovery(object):
client_title = meta.get_metadata('client_video_id') client_title = meta.get_metadata('client_video_id')
course_hex = meta.get_metadata('course_video_upload_token') course_hex = meta.get_metadata('course_video_upload_token')
course_url = meta.get_metadata('course_key') course_url = meta.get_metadata('course_key')
transcript_preferences = meta.get_metadata('transcript_preferences')
edx_filename = key.name[::-1].split('/')[0][::-1] edx_filename = key.name[::-1].split('/')[0][::-1]
if len(course_hex) == 0: if len(course_hex) == 0:
...@@ -226,24 +233,49 @@ class FileDiscovery(object): ...@@ -226,24 +233,49 @@ class FileDiscovery(object):
key.delete() key.delete()
return return
""" # Make decision if this video needs the transcription as well.
Trigger Ingest Process try:
""" transcript_preferences = json.loads(transcript_preferences)
V = VideoProto( TranscriptPreferences.objects.get(
# TODO: Once ammar is done with cielo24.
# org=extract_course_org(course_url),
org=transcript_preferences.get('org'),
provider=transcript_preferences.get('provider')
)
process_transcription = True
except (TypeError, TranscriptPreferences.DoesNotExist):
# when the preferences are not set OR these are set to some data in invalid format OR these don't
# have associated 3rd party transcription provider API keys.
process_transcription = False
except ValueError:
LOGGER.error('[VIDEO-PIPELINE] File Discovery - Invalid transcripts preferences=%s', transcript_preferences)
process_transcription = False
# Trigger Ingest Process
video_metadata = dict(
s3_filename=edx_filename, s3_filename=edx_filename,
client_title=client_title, client_title=client_title,
file_extension=file_extension, file_extension=file_extension,
platform_course_url=course_url platform_course_url=course_url,
) )
if process_transcription:
I = VedaIngest( video_metadata.update({
'process_transcription': process_transcription,
'provider': transcript_preferences.get('provider'),
'three_play_turnaround': transcript_preferences.get('three_play_turnaround'),
'cielo24_turnaround': transcript_preferences.get('cielo24_turnaround'),
'cielo24_fidelity': transcript_preferences.get('cielo24_fidelity'),
'preferred_languages': transcript_preferences.get('preferred_languages'),
})
ingest = VedaIngest(
course_object=course_query[0], course_object=course_query[0],
video_proto=V, video_proto=VideoProto(**video_metadata),
node_work_directory=self.node_work_directory node_work_directory=self.node_work_directory
) )
I.insert() ingest.insert()
if I.complete is False: if ingest.complete is False:
return return
""" """
......
import logging
import os import os
import sys import sys
import subprocess import subprocess
...@@ -7,6 +7,7 @@ from datetime import timedelta ...@@ -7,6 +7,7 @@ from datetime import timedelta
import time import time
import fnmatch import fnmatch
import django import django
from django.db.utils import DatabaseError
from django.utils.timezone import utc from django.utils.timezone import utc
from django.db import reset_queries from django.db import reset_queries
import uuid import uuid
...@@ -32,6 +33,8 @@ from veda_val import VALAPICall ...@@ -32,6 +33,8 @@ from veda_val import VALAPICall
from veda_encode import VedaEncode from veda_encode import VedaEncode
import celeryapp import celeryapp
LOGGER = logging.getLogger(__name__)
''' '''
V = VideoProto( V = VideoProto(
s3_filename=edx_filename, s3_filename=edx_filename,
...@@ -59,9 +62,16 @@ class VideoProto(): ...@@ -59,9 +62,16 @@ class VideoProto():
self.file_extension = kwargs.get('file_extension', None) self.file_extension = kwargs.get('file_extension', None)
self.platform_course_url = kwargs.get('platform_course_url', None) self.platform_course_url = kwargs.get('platform_course_url', None)
self.abvid_serial = kwargs.get('abvid_serial', None) self.abvid_serial = kwargs.get('abvid_serial', None)
"""
Determined Attrib # Transcription Process related Attributes
""" self.process_transcription = kwargs.get('process_transcription', False)
self.provider = kwargs.get('provider', None)
self.three_play_turnaround = kwargs.get('three_play_turnaround', None)
self.cielo24_turnaround = kwargs.get('cielo24_turnaround', None)
self.cielo24_fidelity = kwargs.get('cielo24_fidelity', None)
self.preferred_languages = kwargs.get('preferred_languages', [])
# Determined Attributes
self.valid = False self.valid = False
self.filesize = 0 self.filesize = 0
self.duration = 0 self.duration = 0
...@@ -333,6 +343,15 @@ class VedaIngest: ...@@ -333,6 +343,15 @@ class VedaIngest:
self.complete = True self.complete = True
return None return None
# Update transcription preferences for the Video
if self.video_proto.process_transcription:
v1.process_transcription = self.video_proto.process_transcription
v1.provider = self.video_proto.provider
v1.three_play_turnaround = self.video_proto.three_play_turnaround
v1.cielo24_turnaround = self.video_proto.cielo24_turnaround
v1.cielo24_fidelity = self.video_proto.cielo24_fidelity
v1.preferred_languages = self.video_proto.preferred_languages
""" """
Files Below are all valid Files Below are all valid
""" """
...@@ -353,7 +372,8 @@ class VedaIngest: ...@@ -353,7 +372,8 @@ class VedaIngest:
""" """
try: try:
v1.save() v1.save()
except: except DatabaseError:
# in case if the client title's length is too long
char_string = self.video_proto.client_title char_string = self.video_proto.client_title
string_len = len(char_string) string_len = len(char_string)
s1 = 0 s1 = 0
...@@ -368,6 +388,11 @@ class VedaIngest: ...@@ -368,6 +388,11 @@ class VedaIngest:
v1.client_title = final_string v1.client_title = final_string
v1.save() v1.save()
except Exception:
# Log the exception and raise.
LOGGER.exception('[VIDEO-PIPELINE] File Ingest - Cataloging of video=%s failed.', self.video_proto.veda_id)
raise
def val_insert(self): def val_insert(self):
if self.video_proto.abvid_serial is not None: if self.video_proto.abvid_serial is not None:
return None return None
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment