Commit ed6a1588 by Qubad786 Committed by muzaffaryousaf

Add transcript model and VEDA endpoint to create video transcripts.

parent cb392214
......@@ -3,7 +3,7 @@ Admin file for django app edxval.
"""
from django.contrib import admin
from .models import Video, Profile, EncodedVideo, Subtitle, CourseVideo, VideoImage
from .models import Video, Profile, EncodedVideo, Transcript, CourseVideo, VideoImage
class ProfileAdmin(admin.ModelAdmin): # pylint: disable=C0111
......@@ -48,6 +48,6 @@ class CourseVideoAdmin(admin.ModelAdmin):
admin.site.register(Profile, ProfileAdmin)
admin.site.register(Video, VideoAdmin)
admin.site.register(Subtitle)
admin.site.register(Transcript)
admin.site.register(VideoImage, VideoImageAdmin)
admin.site.register(CourseVideo, CourseVideoAdmin)
......@@ -9,16 +9,22 @@ from lxml.etree import Element, SubElement
from enum import Enum
from django.core.exceptions import ValidationError, ObjectDoesNotExist
from django.core.files.base import ContentFile
from edxval.models import Video, EncodedVideo, CourseVideo, Profile, VideoImage
from edxval.serializers import VideoSerializer
from edxval.models import (
Video,
EncodedVideo,
CourseVideo,
Profile,
VideoImage,
Transcript,
TranscriptProviderType,
)
from edxval.serializers import VideoSerializer, TranscriptSerializer
from edxval.exceptions import ( # pylint: disable=unused-import
ValError,
ValInternalError,
ValVideoNotFoundError,
ValCannotCreateError,
ValCannotUpdateError
ValCannotUpdateError,
)
logger = logging.getLogger(__name__) # pylint: disable=C0103
......@@ -143,6 +149,87 @@ def update_video_status(edx_video_id, status):
video.save()
def get_video_transcript(video_id, lang_code):
    """
    Look up the transcript of a video in one particular language.

    Arguments:
        video_id: an edx_video_id or an external_id extracted from external sources in a video component.
        lang_code: the language code of the requested transcript.

    Returns:
        The matching Transcript instance, or None when no transcript exists
        for the given video_id and lang_code.
    """
    lookup = dict(video_id=video_id, language=lang_code)
    try:
        return Transcript.objects.get(**lookup)
    except Transcript.DoesNotExist:
        # A missing transcript is an expected outcome, reported as None.
        return None
def get_video_transcripts(video_id):
    """
    Get all the transcripts of a video in serialized form.

    Arguments:
        video_id: an edx_video_id or an external_id extracted from external sources in a video component.

    Returns:
        The serialized transcripts of the video, or an empty list when the
        video has no transcripts.
    """
    transcripts_set = Transcript.objects.filter(video_id=video_id)
    # Truth-testing the queryset evaluates it once and caches the rows, so the
    # serializer below reuses them. The previous `exists()` check issued a
    # separate query before the rows were fetched again for serialization.
    if not transcripts_set:
        return []
    return TranscriptSerializer(transcripts_set, many=True).data
def create_video_transcript(
        video_id,
        language,
        transcript_url,
        transcript_format,
        provider=TranscriptProviderType.CUSTOM
):
    """
    Create a transcript record for a video and return it serialized.

    Arguments:
        video_id: an edx_video_id or an external_id extracted from external sources in a video component.
        language: language code of the video transcript
        transcript_url: url of the video transcript
        transcript_format: format of the transcript
        provider: transcript provider

    Returns:
        The serialized data of the newly created transcript.

    Raises:
        IntegrityError: if a transcript with the same video_id and language already exists.
    """
    field_values = dict(
        video_id=video_id,
        language=language,
        transcript_url=transcript_url,
        fmt=transcript_format,
        provider=provider,
    )
    new_transcript = Transcript.objects.create(**field_values)
    return TranscriptSerializer(new_transcript).data
def update_video_transcript(
        video_id,
        language,
        transcript_url,
        transcript_format,
        provider=TranscriptProviderType.CUSTOM
):
    """
    Overwrite the stored details of an existing video transcript.

    Arguments:
        video_id: an edx_video_id or an external_id extracted from external sources in a video component.
        language: language code of the video transcript
        transcript_url: url of the video transcript
        transcript_format: format of the transcript
        provider: transcript provider

    Returns:
        The serialized data of the updated transcript.

    Raises:
        DoesNotExist: if there is no transcript for the given video_id and language.
    """
    transcript = Transcript.objects.get(video_id=video_id, language=language)
    new_values = (
        ('language', language),
        ('transcript_url', transcript_url),
        ('fmt', transcript_format),
        ('provider', provider),
    )
    for field_name, value in new_values:
        setattr(transcript, field_name, value)
    transcript.save()
    return TranscriptSerializer(transcript).data
def get_course_video_image_url(course_id, edx_video_id):
"""
Returns course video image url or None if no image found
......@@ -246,11 +333,11 @@ def get_video_info(edx_video_id):
url: url of the video
file_size: size of the video in bytes
profile: ID of the profile
subtitles: a list of Subtitle dicts
transcripts: a list of Transcript dicts
fmt: file format (SRT or SJSON)
language: language code
content_url: url of file
url: api url to subtitle
provider: transcript provider 3Play/Cielo24/Custom
transcript_url: URL of the transcript file
}
Raises:
......@@ -276,7 +363,12 @@ def get_video_info(edx_video_id):
]
}
"""
return VideoSerializer(_get_video(edx_video_id)).data
serialized_video = VideoSerializer(_get_video(edx_video_id)).data
serialized_video.update({
'transcripts': get_video_transcripts(edx_video_id)
})
return serialized_video
def get_urls_for_profiles(edx_video_id, profiles):
......
......@@ -335,45 +335,59 @@ class VideoImage(TimeStampedModel):
return storage.url(self.image.name)
SUBTITLE_FORMATS = (
('srt', 'SubRip'),
('sjson', 'SRT JSON')
)
class TranscriptProviderType(object):
    """
    Names of the supported transcript providers.

    CHOICES pairs every provider name with itself, in the
    (value, label) form accepted by a model field's `choices`.
    """
    CUSTOM = 'Custom'
    THREE_PLAY_MEDIA = '3PlayMedia'
    CIELO24 = 'Cielo24'
    CHOICES = tuple(
        (provider_name, provider_name)
        for provider_name in (CUSTOM, THREE_PLAY_MEDIA, CIELO24)
    )
class TranscriptFormat(object):
    """
    Supported transcript file formats.

    CHOICES pairs every format code with its display name, in the
    (value, label) form accepted by a model field's `choices`.
    """
    SRT = 'srt'
    SJSON = 'sjson'
    CHOICES = tuple(zip((SRT, SJSON), ('SubRip', 'SRT JSON')))
class Subtitle(models.Model):
class Transcript(TimeStampedModel):
"""
Subtitle for video
Transcript for a video
Attributes:
video: the video that the subtitles are for
fmt: the format of the subttitles file
video_id: this is transcript's video.
language: language of this transcript.
provider: source of this transcript (Custom/3PlayMedia/Cielo24 uploaded)
fmt: format of this transcript srt/sjson
status: status of the transcript (pending/in progress/ready)
transcript_url: relative path to transcript from an S3 bucket
"""
created = models.DateTimeField(auto_now_add=True)
modified = models.DateTimeField(auto_now=True)
video = models.ForeignKey(Video, related_name="subtitles")
fmt = models.CharField(max_length=20, db_index=True, choices=SUBTITLE_FORMATS)
# It can be an edx_video_id or an external video id (e.g. in case of external URLs - YT/MP4/WEBM etc.)
video_id = models.CharField(max_length=255)
transcript_url = models.TextField(null=True, blank=True)
language = models.CharField(max_length=8, db_index=True)
content = models.TextField(default='')
def __str__(self):
return '%s Subtitle for %s' % (self.language, self.video)
provider = models.CharField(
max_length=30,
choices=TranscriptProviderType.CHOICES,
default=TranscriptProviderType.CUSTOM,
)
fmt = models.CharField(max_length=20, db_index=True, choices=TranscriptFormat.CHOICES)
def get_absolute_url(self):
class Meta:
"""
Returns the full url link to the edx_video_id
video_id is listed first in this composite unique index
"""
return reverse('subtitle-content', args=[self.video.edx_video_id, self.language])
unique_together = ("video_id", "language")
@property
def content_type(self):
"""
Sjson is returned as application/json, otherwise text/plain
"""
if self.fmt == 'sjson':
return 'application/json'
else:
return 'text/plain'
def __str__(self):
return '{lang} Transcript for {video}'.format(lang=self.language, video=self.video_id)
@receiver(models.signals.post_save, sender=Video)
......
......@@ -7,7 +7,7 @@ EncodedVideoSerializer which uses the profile_name as it's profile field.
from rest_framework import serializers
from rest_framework.fields import IntegerField, DateTimeField
from edxval.models import Profile, Video, EncodedVideo, Subtitle, CourseVideo, VideoImage
from edxval.models import Profile, Video, EncodedVideo, CourseVideo, VideoImage, Transcript
class EncodedVideoSerializer(serializers.ModelSerializer):
......@@ -50,37 +50,14 @@ class EncodedVideoSerializer(serializers.ModelSerializer):
return data.get('profile', None)
class SubtitleSerializer(serializers.ModelSerializer):
class TranscriptSerializer(serializers.ModelSerializer):
"""
Serializer for Subtitle objects
Serializer for Transcript objects
"""
content_url = serializers.CharField(source='get_absolute_url', read_only=True)
content = serializers.CharField(write_only=True)
def validate(self, data):
"""
Validate that the subtitle is in the correct format
"""
value = data.get("content")
if data.get("fmt") == "sjson":
import json
try:
loaded = json.loads(value)
except ValueError:
raise serializers.ValidationError("Not in JSON format")
else:
data["content"] = json.dumps(loaded)
return data
class Meta: # pylint: disable=C1001, C0111
model = Subtitle
lookup_field = "id"
fields = (
"fmt",
"language",
"content_url",
"content",
)
model = Transcript
lookup_field = 'video_id'
fields = ('video_id', 'transcript_url', 'language', 'provider', 'fmt')
class CourseSerializer(serializers.RelatedField):
......@@ -118,7 +95,6 @@ class VideoSerializer(serializers.ModelSerializer):
encoded_videos takes a list of dicts EncodedVideo data.
"""
encoded_videos = EncodedVideoSerializer(many=True)
subtitles = SubtitleSerializer(many=True, required=False)
courses = CourseSerializer(
many=True,
read_only=False,
......@@ -179,11 +155,6 @@ class VideoSerializer(serializers.ModelSerializer):
for video_data in encoded_videos
)
Subtitle.objects.bulk_create(
Subtitle(video=video, **subtitle_data)
for subtitle_data in subtitles
)
# The CourseSerializer will already have converted the course data
# to CourseVideo models, so we can just set the video and save.
# Also create VideoImage objects if an image filename is present
......@@ -211,13 +182,6 @@ class VideoSerializer(serializers.ModelSerializer):
for video_data in validated_data.get("encoded_videos", [])
)
# Set subtitles
instance.subtitles.all().delete()
Subtitle.objects.bulk_create(
Subtitle(video=instance, **subtitle_data)
for subtitle_data in validated_data.get("subtitles", [])
)
# Set courses
# NOTE: for backwards compatibility with the DRF v2 behavior,
# we do NOT delete existing course videos during the update.
......
......@@ -17,14 +17,9 @@ urlpatterns = [
name="video-detail"
),
url(
r'^videos/(?P<video__edx_video_id>[-\w]+)/(?P<language>[-_\w]+)$',
views.SubtitleDetail.as_view(),
name="subtitle-detail"
),
url(
r'^videos/(?P<edx_video_id>[-\w]+)/(?P<language>[-_\w]+)/subtitle$',
views.get_subtitle,
name="subtitle-content"
r'^videos/video-transcripts/create/$',
views.VideoTranscriptView.as_view(),
name='create-video-transcript'
),
url(
r'^videos/video-images/update/$',
......
......@@ -13,10 +13,11 @@ from django.shortcuts import get_object_or_404
from django.core.exceptions import ValidationError
from django.views.decorators.http import last_modified
from edxval.models import Video, Profile, Subtitle, CourseVideo, VideoImage
from edxval.api import get_video_transcript, update_video_transcript, create_video_transcript
from edxval.models import Video, Profile, Transcript, CourseVideo, VideoImage, TranscriptFormat, TranscriptProviderType
from edxval.serializers import (
VideoSerializer,
SubtitleSerializer
TranscriptSerializer,
)
......@@ -92,15 +93,73 @@ class VideoDetail(generics.RetrieveUpdateDestroyAPIView):
serializer_class = VideoSerializer
class SubtitleDetail(MultipleFieldLookupMixin, generics.RetrieveUpdateDestroyAPIView):
class VideoTranscriptView(APIView):
"""
Gets a subtitle instance given its id
A Transcription View, used by VEDA to create video transcripts.
"""
authentication_classes = (OAuth2Authentication, SessionAuthentication)
permission_classes = (ReadRestrictedDjangoModelPermissions,)
lookup_fields = ("video__edx_video_id", "language")
queryset = Subtitle.objects.all()
serializer_class = SubtitleSerializer
# noinspection PyMethodMayBeStatic
def post(self, request):
    """
    Creates a video transcript instance with the given information.

    Arguments:
        request: A WSGI request. Its data must contain `video_id`,
            `language`, `url`, `format` and `provider`.

    Returns:
        A 200 response carrying the serialized transcript when it is created.
        A 400 response carrying a `message` when a required attribute is
        missing, when the format or the provider is not supported, or when a
        transcript already exists for the given video and language.
    """
    attrs = ('video_id', 'language', 'url', 'format', 'provider')
    missing = [attr for attr in attrs if attr not in request.data]
    if missing:
        return Response(
            status=status.HTTP_400_BAD_REQUEST,
            data=dict(message=u'{missing} must be specified.'.format(missing=' and '.join(missing)))
        )

    video_id = request.data['video_id']
    language = request.data['language']
    transcript_url = request.data['url']
    transcript_format = request.data['format']
    provider = request.data['provider']

    supported_formats = list(dict(TranscriptFormat.CHOICES).keys())
    if transcript_format not in supported_formats:
        return Response(
            status=status.HTTP_400_BAD_REQUEST,
            data=dict(
                # The trailing space keeps the two literals from running together.
                message=(u'This transcript file type is not supported. Supported formats are '
                         u'{supported_formats}').format(supported_formats=supported_formats)
            )
        )

    supported_providers = list(dict(TranscriptProviderType.CHOICES).keys())
    if provider not in supported_providers:
        return Response(
            status=status.HTTP_400_BAD_REQUEST,
            data=dict(
                message=(u'This provider is not supported. Supported transcription providers are '
                         u'{supported_providers}').format(supported_providers=supported_providers)
            )
        )

    # Existing transcripts are never overwritten through this endpoint.
    if get_video_transcript(video_id, language):
        return Response(
            data=dict(
                message=(u'Transcript for video "{video_id}" and lang code "{language}" already exists. '
                         u'It can not be overwritten.').format(video_id=video_id, language=language)
            ),
            status=status.HTTP_400_BAD_REQUEST
        )

    serialized_transcript = create_video_transcript(
        video_id=video_id,
        language=language,
        transcript_url=transcript_url,
        transcript_format=transcript_format,
        # Forward the validated provider; without it every transcript would
        # be stored with the default `Custom` provider.
        provider=provider,
    )
    return Response(data=serialized_transcript, status=status.HTTP_200_OK)
class VideoImagesView(APIView):
......@@ -148,19 +207,3 @@ class VideoImagesView(APIView):
)
return Response()
def _last_modified_subtitle(request, edx_video_id, language): # pylint: disable=W0613
"""
Returns the last modified subtitle
"""
return Subtitle.objects.get(video__edx_video_id=edx_video_id, language=language).modified
@last_modified(last_modified_func=_last_modified_subtitle)
def get_subtitle(request, edx_video_id, language): # pylint: disable=W0613
"""
Return content of subtitle by id
"""
sub = Subtitle.objects.get(video__edx_video_id=edx_video_id, language=language)
response = HttpResponse(sub.content, content_type=sub.content_type)
return response
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment