Merge pull request #130 from edx/mushtaq/import-transcripts

Import video transcripts

Merge pull request #130 from edx/mushtaq/import-transcripts
Import video transcripts
04dd6fd9 · Mushtaq Ali · GitHub · 6469fc26 · b8a64a57 · 04dd6fd9
Unverified Commit 04dd6fd9 authored Apr 10, 2018 by Mushtaq Ali Committed by GitHub Apr 10, 2018
Showing with 81 additions and 25 deletions

edxval/api.py
+37 -24

edxval/tests/constants.py
+26 -0

edxval/tests/data/wow.sjson
+1 -1

edxval/tests/test_api.py
+0 -0

edxval/tests/test_views.py
+1 -0

edxval/utils.py
+16 -0

No files found.
--- a/edxval/api.py
+++ b/edxval/api.py
@@ -8,6 +8,8 @@ from enum import Enum
 from uuid import uuid4

 from django.core.exceptions import ObjectDoesNotExist, ValidationError
+from django.core.files import File
+from fs.path import combine
 from lxml import etree
 from lxml.etree import Element, SubElement

@@ -20,7 +22,7 @@ from edxval.models import (CourseVideo, EncodedVideo, Profile,
                           TranscriptProviderType, Video, VideoImage,
                           VideoTranscript, ThirdPartyTranscriptCredentialsState)
 from edxval.serializers import TranscriptPreferenceSerializer, TranscriptSerializer, VideoSerializer
-from edxval.utils import THIRD_PARTY_TRANSCRIPTION_PLANS
+from edxval.utils import THIRD_PARTY_TRANSCRIPTION_PLANS, create_file_in_fs

 logger = logging.getLogger(__name__)  # pylint: disable=C0103

@@ -789,7 +791,7 @@ def export_to_xml(video_id, resource_fs, static_dir, course_id=None):
        video_id (str): Video id of the video to export transcripts.
        course_id (str): The ID of the course with which this video is associated.
        static_dir (str): The Directory to store transcript file.
-        resource_fs (OSFS): The file system to store transcripts.
+        resource_fs (OSFS): Export file system.

    Returns:
        An lxml video_asset element containing export data
@@ -837,22 +839,15 @@ def create_trancript_file(video_id, language_code, file_format, resource_fs, sta
        static_dir (str): The Directory to store transcript file.
        resource_fs (OSFS): The file system to store transcripts.
    """
-    transcript_name = u'{static_dir}/{video_id}-{language_code}.{file_format}'.format(
-        static_dir=static_dir,
+    transcript_name = u'{video_id}-{language_code}.{file_format}'.format(
        video_id=video_id,
        language_code=language_code,
        file_format=file_format
    )
-    try:
    transcript_data = get_video_transcript_data(video_id, language_code)
    if transcript_data:
        transcript_content = transcript_data['content']
-            with resource_fs.open(transcript_name, 'wb') as f:
-                f.write(transcript_content)
-    except Exception:
-        # Do not raise exception in case no transcript file is found for now.
-        # TODO: Remove this - EDUCATOR-2173
-        pass
+        create_file_in_fs(transcript_content, transcript_name, resource_fs, static_dir)


 def create_transcripts_xml(video_id, video_el, resource_fs, static_dir):
@@ -886,7 +881,6 @@ def create_transcripts_xml(video_id, video_el, resource_fs, static_dir):
                transcripts_el,
                'transcript',
                {
-                    'file_name': video_transcript.transcript.name,
                    'language_code': language_code,
                    'file_format': file_format,
                    'provider': video_transcript.provider,
@@ -897,7 +891,7 @@ def create_transcripts_xml(video_id, video_el, resource_fs, static_dir):
    return video_el


-def import_from_xml(xml, edx_video_id, course_id=None):
+def import_from_xml(xml, edx_video_id, resource_fs, static_dir, course_id=None):
    """
    Imports data from a video_asset element about the given video_id.

@@ -907,6 +901,8 @@ def import_from_xml(xml, edx_video_id, course_id=None):
    Arguments:
        xml (Element): An lxml video_asset element containing import data
        edx_video_id (str): val video id
+        resource_fs (OSFS): Import file system.
+        static_dir (str): The directory to retrieve the transcript file from.
        course_id (str): The ID of a course to associate the video with

    Raises:
@@ -915,7 +911,7 @@ def import_from_xml(xml, edx_video_id, course_id=None):
    if xml.tag != 'video_asset':
        raise ValCannotCreateError('Invalid XML')

-    # TODO this will be moved as a part of EDUCATOR-2173
+    # TODO this will be moved as a part of EDUCATOR-2403
    if not edx_video_id:
        return

@@ -968,26 +964,43 @@ def import_from_xml(xml, edx_video_id, course_id=None):
            'bitrate': encoded_video_el.get('bitrate'),
        })
    create_video(data)
-    create_transcript_objects(xml)
+    create_transcript_objects(xml, edx_video_id, resource_fs, static_dir)


-def create_transcript_objects(xml):
+def create_transcript_objects(xml, edx_video_id, resource_fs, static_dir):
    """
    Create VideoTranscript objects.

    Arguments:
-        xml (Element): lxml Element object
+        xml (Element): lxml Element object.
+        edx_video_id (str): Video id of the video.
+        resource_fs (OSFS): Import file system.
+        static_dir (str): The directory to retrieve the transcript file from.
    """
    for transcript in xml.findall('.//transcripts/transcript'):
        try:
-            create_or_update_video_transcript(
-                transcript.attrib['video_id'],
-                transcript.attrib['language_code'],
-                metadata=dict(
-                    provider=transcript.attrib['provider'],
-                    file_name=transcript.attrib['file_name'],
-                    file_format=transcript.attrib['file_format'],
+            file_format = transcript.attrib['file_format']
+            language_code = transcript.attrib['language_code']
+            transcript_data = get_video_transcript_data(edx_video_id, language_code)
+
+            # Only import the file when no transcript data exists yet for this video and language.
+            if not transcript_data:
+                transcript_file_name = u'{edx_video_id}-{language_code}.{file_format}'.format(
+                    edx_video_id=edx_video_id,
+                    language_code=language_code,
+                    file_format=file_format
                )
+
+                # Open the file from the import file system and wrap it in a Django File for the transcript record.
+                file_data = File(resource_fs.open(combine(static_dir, transcript_file_name)))
+
+                # Create transcript record.
+                create_video_transcript(
+                    video_id=edx_video_id,
+                    language_code=language_code,
+                    file_format=file_format,
+                    content=file_data,
+                    provider=transcript.attrib['provider']
                )
        except KeyError:
            logger.warn("VAL: Required attributes are missing from xml, xml=[%s]", etree.tostring(transcript).strip())
--- a/edxval/tests/constants.py
+++ b/edxval/tests/constants.py
@@ -12,6 +12,9 @@ from edxval.models import (
 )

 EDX_VIDEO_ID = "itchyjacket"
+
+EXPORT_IMPORT_STATIC_DIR = u'static'
+
 """
 Generic Profiles for manually creating profile objects
 """
@@ -363,12 +366,34 @@ VIDEO_DICT_UPDATE_ANIMAL = dict(
    encoded_videos=[],
 )

+
+TRANSCRIPT_DATA = {
+    "overwatch": """
+1
+00:00:14,370 --> 00:00:16,530
+I am overwatch.
+
+2
+00:00:16,500 --> 00:00:18,600
+可以用“我不太懂艺术 但我知道我喜欢什么”做比喻.""",
+    "flash": """
+1
+00:00:07,180 --> 00:00:08,460
+This is Flash line 1.""",
+    "wow": {
+        "start": [10],
+        "end": [100],
+        "text": ["Hi, welcome to edxval."],
+    }
+}
+
 VIDEO_TRANSCRIPT_CIELO24 = dict(
    video_id='super-soaker',
    language_code='en',
    transcript='edxval/tests/data/The_Flash.srt',
    provider=TranscriptProviderType.CIELO24,
    file_format=TranscriptFormat.SRT,
+    file_data=TRANSCRIPT_DATA['flash']
 )

 VIDEO_TRANSCRIPT_3PLAY = dict(
@@ -377,6 +402,7 @@ VIDEO_TRANSCRIPT_3PLAY = dict(
    transcript='edxval/tests/data/wow.sjson',
    provider=TranscriptProviderType.THREE_PLAY_MEDIA,
    file_format=TranscriptFormat.SJSON,
+    file_data=TRANSCRIPT_DATA['wow']
 )

 TRANSCRIPT_PREFERENCES_CIELO24 = dict(

--- a/edxval/tests/data/wow.sjson
+++ b/edxval/tests/data/wow.sjson
 {
   "start": [10],
   "end": [100],
-   "text": ["Hi, welcome to edxval."],
+   "text": ["Hi, welcome to edxval."]
 }
--- a/edxval/tests/test_api.py
+++ b/edxval/tests/test_api.py
--- a/edxval/tests/test_views.py
+++ b/edxval/tests/test_views.py
@@ -813,6 +813,7 @@ class VideoTranscriptViewTest(APIAuthTestCase):
        Tests POSTing transcript successfully.
        """
        post_transcript_data = dict(self.transcript_data)
+        post_transcript_data.pop('file_data')
        post_transcript_data['name'] = post_transcript_data.pop('transcript')

        response = self.client.post(self.url, post_transcript_data, format='json')

--- a/edxval/utils.py
+++ b/edxval/utils.py
@@ -4,6 +4,8 @@ Util methods to be used in api and models.

 from django.conf import settings
 from django.core.files.storage import get_storage_class
+from fs.path import combine
+

 # 3rd Party Transcription Plans
 THIRD_PARTY_TRANSCRIPTION_PLANS = {
@@ -169,3 +171,17 @@ def get_video_transcript_storage():
        # during edx-platform loading this method gets called but settings are not ready yet
        # so in that case we will return default(FileSystemStorage) storage class instance
        return get_storage_class()()
+
+
+def create_file_in_fs(file_data, file_name, file_system, static_dir):
+    """
+    Writes file_data to file_name under static_dir in the given file system.
+
+    Arguments:
+        file_data (str): Data to store into the file (the file is opened in 'wb' mode).
+        file_name (str): File name of the file to be created.
+        file_system (OSFS): File system in which the file is created.
+        static_dir (str): The directory, combined with file_name, to write the file into.
+    """
+    with file_system.open(combine(static_dir, file_name), 'wb') as f:
+        f.write(file_data)