Add transcript model and VEDA endpoint to create video transcripts.

ed6a1588 · Qubad786 · muzaffaryousaf · cb392214 · ed6a1588 · ed6a1588
Commit ed6a1588 authored Aug 02, 2017 by Qubad786 Committed by muzaffaryousaf Aug 17, 2017
Hide whitespace changes
Inline Side-by-side

Showing with 221 additions and 113 deletions

edxval/admin.py
+2 -2

edxval/api.py
+101 -9

edxval/models.py
+42 -28

edxval/serializers.py
+6 -42

edxval/urls.py
+3 -8

edxval/views.py
+67 -24

No files found.
--- a/edxval/admin.py
+++ b/edxval/admin.py
@@ -3,7 +3,7 @@ Admin file for django app edxval.
 """
 from django.contrib import admin
-from .models import Video, Profile, EncodedVideo, Subtitle, CourseVideo, VideoImage
+from .models import Video, Profile, EncodedVideo, Transcript, CourseVideo, VideoImage
 class ProfileAdmin(admin.ModelAdmin):  # pylint: disable=C0111
@@ -48,6 +48,6 @@ class CourseVideoAdmin(admin.ModelAdmin):
 admin.site.register(Profile, ProfileAdmin)
 admin.site.register(Video, VideoAdmin)
-admin.site.register(Subtitle)
+admin.site.register(Transcript)
 admin.site.register(VideoImage, VideoImageAdmin)
 admin.site.register(CourseVideo, CourseVideoAdmin)
--- a/edxval/api.py
+++ b/edxval/api.py
@@ -9,16 +9,22 @@ from lxml.etree import Element, SubElement
 from enum import Enum
 from django.core.exceptions import ValidationError, ObjectDoesNotExist
-from django.core.files.base import ContentFile
-from edxval.models import Video, EncodedVideo, CourseVideo, Profile, VideoImage
+from edxval.models import (
-from edxval.serializers import VideoSerializer
+    Video,
+    EncodedVideo,
+    CourseVideo,
+    Profile,
+    VideoImage,
+    Transcript,
+    TranscriptProviderType,
+)
+from edxval.serializers import VideoSerializer, TranscriptSerializer
 from edxval.exceptions import (  # pylint: disable=unused-import
-    ValError,
    ValInternalError,
    ValVideoNotFoundError,
    ValCannotCreateError,
-    ValCannotUpdateError
+    ValCannotUpdateError,
 )
 logger = logging.getLogger(__name__)  # pylint: disable=C0103
@@ -143,6 +149,87 @@ def update_video_status(edx_video_id, status):
    video.save()
+def get_video_transcript(video_id, lang_code):
+    """
+    Look up the transcript of a video for a single language.
+    Arguments:
+        video_id: it can be an edx_video_id or an external_id extracted from external sources in a video component.
+        lang_code: the language code of the requested transcript.
+    Returns:
+        The matching Transcript object, or None if there is no transcript for this video and language.
+    """
+    try:
+        return Transcript.objects.get(video_id=video_id, language=lang_code)
+    except Transcript.DoesNotExist:
+        return None
+def get_video_transcripts(video_id):
+    """
+    Collect every transcript of a video in serialized form.
+    Arguments:
+        video_id: it can be an edx_video_id or an external_id extracted from external sources in a video component.
+    Returns:
+        The serialized transcripts of the video, or an empty list when the video has none.
+    """
+    video_transcripts = Transcript.objects.filter(video_id=video_id)
+    if not video_transcripts.exists():
+        return []
+    return TranscriptSerializer(video_transcripts, many=True).data
+def create_video_transcript(video_id, language, transcript_url, transcript_format, provider=TranscriptProviderType.CUSTOM):
+    """
+    Store a new transcript record for a video and return it in serialized form.
+    Arguments:
+        video_id: it can be an edx_video_id or an external_id extracted from external sources in a video component.
+        language: language code of a video transcript
+        transcript_url: url of a video transcript
+        transcript_format: format of the transcript
+        provider: transcript provider
+    Returns:
+        The serialized data of the newly created transcript.
+    Raises:
+        IntegrityError: raises IntegrityError if there is an existing transcript with same video_id and language.
+    """
+    transcript_fields = dict(
+        video_id=video_id,
+        language=language,
+        transcript_url=transcript_url,
+        fmt=transcript_format,
+        provider=provider,
+    )
+    new_transcript = Transcript.objects.create(**transcript_fields)
+    return TranscriptSerializer(new_transcript).data
+def update_video_transcript(video_id, language, transcript_url, transcript_format, provider=TranscriptProviderType.CUSTOM):
+    """
+    Overwrite the stored details of an existing video transcript and return it in serialized form.
+    Arguments:
+        video_id: it can be an edx_video_id or an external_id extracted from external sources in a video component.
+        language: language code of a video transcript
+        transcript_url: url of a video transcript
+        transcript_format: format of the transcript
+        provider: transcript provider
+    Returns:
+        The serialized data of the updated transcript.
+    Raises:
+        DoesNotExist: raises DoesNotExist if there is not any transcript for the given video_id and language.
+    """
+    existing = Transcript.objects.get(video_id=video_id, language=language)
+    new_values = (
+        ('language', language),
+        ('transcript_url', transcript_url),
+        ('fmt', transcript_format),
+        ('provider', provider),
+    )
+    for field_name, value in new_values:
+        setattr(existing, field_name, value)
+    existing.save()
+    return TranscriptSerializer(existing).data
 def get_course_video_image_url(course_id, edx_video_id):
    """
    Returns course video image url or None if no image found
@@ -246,11 +333,11 @@ def get_video_info(edx_video_id):
                    url: url of the video
                    file_size: size of the video in bytes
                    profile: ID of the profile
-                subtitles: a list of Subtitle dicts
+                transcripts: a list of Transcript dicts
                    fmt: file format (SRT or SJSON)
                    language: language code
-                    content_url: url of file
+                    provider: transcript provider 3Play/Cielo24/Custom
-                    url: api url to subtitle
+                    transcript_url: URL of the transcript file
            }
    Raises:
@@ -276,7 +363,12 @@ def get_video_info(edx_video_id):
            ]
        }
    """
-    return VideoSerializer(_get_video(edx_video_id)).data
+    serialized_video = VideoSerializer(_get_video(edx_video_id)).data
+    serialized_video.update({
+        'transcripts': get_video_transcripts(edx_video_id)
+    })
+    return serialized_video
 def get_urls_for_profiles(edx_video_id, profiles):

--- a/edxval/models.py
+++ b/edxval/models.py
@@ -335,45 +335,59 @@ class VideoImage(TimeStampedModel):
        return storage.url(self.image.name)
-SUBTITLE_FORMATS = (
+class TranscriptProviderType(object):
+    """
+    Names of the supported transcript providers.
+    CHOICES pairs every provider name with itself, in the (value, label) form
+    expected by the `choices` option of a model field.
+    """
-    ('srt', 'SubRip'),
+    CUSTOM = 'Custom'
-    ('sjson', 'SRT JSON')
+    THREE_PLAY_MEDIA = '3PlayMedia'
-)
+    CIELO24 = 'Cielo24'
+    CHOICES = (
+        (CUSTOM, CUSTOM),
+        (THREE_PLAY_MEDIA, THREE_PLAY_MEDIA),
+        (CIELO24, CIELO24),
+    )
+class TranscriptFormat(object):
+    """
+    File formats a transcript can be stored in.
+    CHOICES pairs every format code with its display label, in the (value, label)
+    form expected by the `choices` option of a model field.
+    """
+    SRT = 'srt'
+    SJSON = 'sjson'
+    CHOICES = (
+        (SRT, 'SubRip'),
+        (SJSON, 'SRT JSON')
+    )
-class Subtitle(models.Model):
+class Transcript(TimeStampedModel):
    """
-    Subtitle for video
+    Transcript for a video
    Attributes:
-        video: the video that the subtitles are for
+        video_id: id of the video this transcript belongs to (plain string, not a foreign key).
-        fmt: the format of the subttitles file
+        language: language of this transcript.
+        provider: source of this transcript (Custom/3PlayMedia/Cielo24 uploaded)
+        fmt: format of this transcript srt/sjson
+        transcript_url: relative path to transcript from an S3 bucket
    """
-    created = models.DateTimeField(auto_now_add=True)
+    # It can be an edx_video_id or an external video id (e.g. in case of external URLs - YT/MP4/WEBM etc.)
-    modified = models.DateTimeField(auto_now=True)
+    video_id = models.CharField(max_length=255)
-    video = models.ForeignKey(Video, related_name="subtitles")
+    transcript_url = models.TextField(null=True, blank=True)
-    fmt = models.CharField(max_length=20, db_index=True, choices=SUBTITLE_FORMATS)
    language = models.CharField(max_length=8, db_index=True)
-    content = models.TextField(default='')
+    provider = models.CharField(
+        max_length=30,
-    def __str__(self):
+        choices=TranscriptProviderType.CHOICES,
-        return '%s Subtitle for %s' % (self.language, self.video)
+        default=TranscriptProviderType.CUSTOM,
+    )
+    fmt = models.CharField(max_length=20, db_index=True, choices=TranscriptFormat.CHOICES)
-    def get_absolute_url(self):
+    class Meta:
        """
-        Returns the full url link to the edx_video_id
+        A video can have at most one transcript per language.
        """
-        return reverse('subtitle-content', args=[self.video.edx_video_id, self.language])
+        unique_together = ("video_id", "language")
-    @property
+    def __str__(self):
-    def content_type(self):
+        return '{lang} Transcript for {video}'.format(lang=self.language, video=self.video_id)
-        """
-        Sjson is returned as application/json, otherwise text/plain
-        """
-        if self.fmt == 'sjson':
-            return 'application/json'
-        else:
-            return 'text/plain'
 @receiver(models.signals.post_save, sender=Video)

--- a/edxval/serializers.py
+++ b/edxval/serializers.py
@@ -7,7 +7,7 @@ EncodedVideoSerializer which uses the profile_name as it's profile field.
 from rest_framework import serializers
 from rest_framework.fields import IntegerField, DateTimeField
-from edxval.models import Profile, Video, EncodedVideo, Subtitle, CourseVideo, VideoImage
+from edxval.models import Profile, Video, EncodedVideo, CourseVideo, VideoImage, Transcript
 class EncodedVideoSerializer(serializers.ModelSerializer):
@@ -50,37 +50,14 @@ class EncodedVideoSerializer(serializers.ModelSerializer):
        return data.get('profile', None)
-class SubtitleSerializer(serializers.ModelSerializer):
+class TranscriptSerializer(serializers.ModelSerializer):
    """
-    Serializer for Subtitle objects
+    Serializes Transcript objects into their video id, url, language, provider and format.
    """
-    content_url = serializers.CharField(source='get_absolute_url', read_only=True)
-    content = serializers.CharField(write_only=True)
-    def validate(self, data):
-        """
-        Validate that the subtitle is in the correct format
-        """
-        value = data.get("content")
-        if data.get("fmt") == "sjson":
-            import json
-            try:
-                loaded = json.loads(value)
-            except ValueError:
-                raise serializers.ValidationError("Not in JSON format")
-            else:
-                data["content"] = json.dumps(loaded)
-        return data
    class Meta:  # pylint: disable=C1001, C0111
-        model = Subtitle
+        model = Transcript
-        lookup_field = "id"
+        lookup_field = 'video_id'
-        fields = (
+        fields = (
-            "fmt",
+            'video_id',
-            "language",
+            'transcript_url',
-            "content_url",
+            'language',
-            "content",
+            'provider',
+            'fmt',
-        )
+        )
 class CourseSerializer(serializers.RelatedField):
@@ -118,7 +95,6 @@ class VideoSerializer(serializers.ModelSerializer):
    encoded_videos takes a list of dicts EncodedVideo data.
    """
    encoded_videos = EncodedVideoSerializer(many=True)
-    subtitles = SubtitleSerializer(many=True, required=False)
    courses = CourseSerializer(
        many=True,
        read_only=False,
@@ -179,11 +155,6 @@ class VideoSerializer(serializers.ModelSerializer):
            for video_data in encoded_videos
        )
-        Subtitle.objects.bulk_create(
-            Subtitle(video=video, **subtitle_data)
-            for subtitle_data in subtitles
-        )
        # The CourseSerializer will already have converted the course data
        # to CourseVideo models, so we can just set the video and save.
        # Also create VideoImage objects if an image filename is present
@@ -211,13 +182,6 @@ class VideoSerializer(serializers.ModelSerializer):
            for video_data in validated_data.get("encoded_videos", [])
        )
-        # Set subtitles
-        instance.subtitles.all().delete()
-        Subtitle.objects.bulk_create(
-            Subtitle(video=instance, **subtitle_data)
-            for subtitle_data in validated_data.get("subtitles", [])
-        )
        # Set courses
        # NOTE: for backwards compatibility with the DRF v2 behavior,
        # we do NOT delete existing course videos during the update.

--- a/edxval/urls.py
+++ b/edxval/urls.py
@@ -17,14 +17,9 @@ urlpatterns = [
        name="video-detail"
    ),
    url(
-        r'^videos/(?P<video__edx_video_id>[-\w]+)/(?P<language>[-_\w]+)$',
+        r'^videos/video-transcripts/create/$',
-        views.SubtitleDetail.as_view(),
+        views.VideoTranscriptView.as_view(),
-        name="subtitle-detail"
+        name='create-video-transcript'
-    ),
-    url(
-        r'^videos/(?P<edx_video_id>[-\w]+)/(?P<language>[-_\w]+)/subtitle$',
-        views.get_subtitle,
-        name="subtitle-content"
    ),
    url(
        r'^videos/video-images/update/$',

--- a/edxval/views.py
+++ b/edxval/views.py
@@ -13,10 +13,11 @@ from django.shortcuts import get_object_or_404
 from django.core.exceptions import ValidationError
 from django.views.decorators.http import last_modified
-from edxval.models import Video, Profile, Subtitle, CourseVideo, VideoImage
+from edxval.api import get_video_transcript, update_video_transcript, create_video_transcript
+from edxval.models import Video, Profile, Transcript, CourseVideo, VideoImage, TranscriptFormat, TranscriptProviderType
 from edxval.serializers import (
    VideoSerializer,
-    SubtitleSerializer
+    TranscriptSerializer,
 )
@@ -92,15 +93,73 @@ class VideoDetail(generics.RetrieveUpdateDestroyAPIView):
    serializer_class = VideoSerializer
-class SubtitleDetail(MultipleFieldLookupMixin, generics.RetrieveUpdateDestroyAPIView):
+class VideoTranscriptView(APIView):
    """
-    Gets a subtitle instance given its id
+    A Transcription View, used by VEDA to create video transcripts.
    """
    authentication_classes = (OAuth2Authentication, SessionAuthentication)
-    permission_classes = (ReadRestrictedDjangoModelPermissions,)
-    lookup_fields = ("video__edx_video_id", "language")
+    # noinspection PyMethodMayBeStatic
-    queryset = Subtitle.objects.all()
+    def post(self, request):
-    serializer_class = SubtitleSerializer
+        """
+        Creates a video transcript instance with the given information.
+        Arguments:
+            request: A WSGI request whose data must carry `video_id`, `language`, `url`,
+                `format` and `provider`.
+        Returns:
+            A 200 response with the serialized transcript when it is created, or a 400
+            response with a `message` when a field is missing, the format or provider is
+            not supported, or a transcript already exists for the video and language.
+        """
+        attrs = ('video_id', 'language', 'url', 'format', 'provider')
+        missing = [attr for attr in attrs if attr not in request.data]
+        if missing:
+            return Response(
+                status=status.HTTP_400_BAD_REQUEST,
+                data=dict(message=u'{missing} must be specified.'.format(missing=' and '.join(missing)))
+            )
+        video_id = request.data['video_id']
+        language = request.data['language']
+        transcript_url = request.data['url']
+        transcript_format = request.data['format']
+        provider = request.data['provider']
+        supported_formats = dict(TranscriptFormat.CHOICES).keys()
+        if transcript_format not in supported_formats:
+            # The trailing space keeps "are" apart from the list of formats that follows it.
+            return Response(
+                status=status.HTTP_400_BAD_REQUEST,
+                data=dict(
+                    message=(u'This transcript file type is not supported. Supported formats are '
+                             u'{supported_formats}').format(supported_formats=supported_formats)
+                )
+            )
+        supported_providers = dict(TranscriptProviderType.CHOICES).keys()
+        if provider not in supported_providers:
+            # The trailing space keeps "are" apart from the list of providers that follows it.
+            return Response(
+                status=status.HTTP_400_BAD_REQUEST,
+                data=dict(
+                    message=(u'This provider is not supported. Supported transcription providers are '
+                             u'{supported_providers}').format(supported_providers=supported_providers)
+                )
+            )
+        transcript = get_video_transcript(video_id, language)
+        if not transcript:
+            # Pass the validated provider on; otherwise create_video_transcript falls back to
+            # its default and the provider sent in the request is silently dropped.
+            serialized_transcript = create_video_transcript(
+                video_id=video_id,
+                language=language,
+                transcript_url=transcript_url,
+                transcript_format=transcript_format,
+                provider=provider,
+            )
+            response = Response(data=serialized_transcript, status=status.HTTP_200_OK)
+        else:
+            response = Response(
+                data=dict(
+                    message=(u'Transcript for video "{video_id}" and lang code "{language}" already exists. '
+                             u'It can not be overwritten.').format(video_id=video_id, language=language)
+                ),
+                status=status.HTTP_400_BAD_REQUEST
+            )
+        return response
 class VideoImagesView(APIView):
@@ -148,19 +207,3 @@ class VideoImagesView(APIView):
            )
        return Response()
-def _last_modified_subtitle(request, edx_video_id, language):  # pylint: disable=W0613
-    """
-    Returns the last modified subtitle
-    """
-    return Subtitle.objects.get(video__edx_video_id=edx_video_id, language=language).modified
-@last_modified(last_modified_func=_last_modified_subtitle)
-def get_subtitle(request, edx_video_id, language): # pylint: disable=W0613
-    """
-    Return content of subtitle by id
-    """
-    sub = Subtitle.objects.get(video__edx_video_id=edx_video_id, language=language)
-    response = HttpResponse(sub.content, content_type=sub.content_type)
-    return response