Export transcripts metadata along with xml

- Transcript files are exported into course OLX in .srt format. - Transcript language to filename maps is returned with xml, so that, it can be used by platform in old metadata fields for backward compatiblilty. - Add/fix tests bump VAL version

Export transcripts metadata along with xml
- Transcript files are exported into course OLX in .srt format. - Transcript language to filename maps is returned with xml, so that, it can be used by platform in old metadata fields for backward compatiblilty. - Add/fix tests bump VAL version
0bbb8afd · Qubad786 · f7a077bd · 0bbb8afd · 0bbb8afd · 0bbb8afd
Commit 0bbb8afd authored May 21, 2018 by Qubad786
Showing with 327 additions and 65 deletions

edxval/api.py
+66 -36

edxval/exceptions.py
+7 -0

edxval/tests/test_api.py
+40 -26

edxval/tests/test_transcript_utils.py
+94 -0

edxval/transcript_utils.py
+117 -0

edxval/utils.py
+2 -2

setup.py
+1 -1

No files found.
--- a/edxval/api.py
+++ b/edxval/api.py
@@ -3,6 +3,7 @@
 """
 The internal API for VAL.
 """
+import os
 import logging
 from enum import Enum
 from uuid import uuid4
@@ -17,15 +18,35 @@ from lxml import etree
 from lxml.etree import Element, SubElement
 from pysrt.srtexc import Error

-from edxval.exceptions import (InvalidTranscriptFormat,
-                               InvalidTranscriptProvider, ValCannotCreateError,
-                               ValCannotUpdateError, ValInternalError,
-                               ValVideoNotFoundError)
-from edxval.models import (CourseVideo, EncodedVideo, Profile, TranscriptPreference,
-                           TranscriptProviderType, Video, VideoImage,
-                           VideoTranscript, ThirdPartyTranscriptCredentialsState)
+from edxval.exceptions import (
+    InvalidTranscriptFormat,
+    TranscriptsGenerationException,
+    InvalidTranscriptProvider,
+    ValCannotCreateError,
+    ValCannotUpdateError,
+    ValInternalError,
+    ValVideoNotFoundError,
+)
+from edxval.models import (
+    CourseVideo,
+    EncodedVideo,
+    Profile,
+    TranscriptPreference,
+    TranscriptProviderType,
+    Video,
+    VideoImage,
+    VideoTranscript,
+    ThirdPartyTranscriptCredentialsState,
+)
 from edxval.serializers import TranscriptPreferenceSerializer, TranscriptSerializer, VideoSerializer
-from edxval.utils import TranscriptFormat, THIRD_PARTY_TRANSCRIPTION_PLANS, create_file_in_fs, get_transcript_format
+from edxval.utils import (
+    TranscriptFormat,
+    THIRD_PARTY_TRANSCRIPTION_PLANS,
+    create_file_in_fs,
+    get_transcript_format,
+)
+
+from edxval.transcript_utils import Transcript


 logger = logging.getLogger(__name__)  # pylint: disable=C0103
@@ -829,6 +850,7 @@ def export_to_xml(video_id, resource_fs, static_dir, course_id=None):
                for name in ['profile', 'url', 'file_size', 'bitrate']
            }
        )
+
    return create_transcripts_xml(video_id, video_el, resource_fs, static_dir)


@@ -843,21 +865,26 @@ def create_transcript_file(video_id, language_code, file_format, resource_fs, st
        static_dir (str): The Directory to store transcript file.
        resource_fs (SubFS): The file system to store transcripts.
    """
-    transcript_name = u'{video_id}-{language_code}.{file_format}'.format(
+    transcript_filename = '{video_id}-{language_code}.srt'.format(
        video_id=video_id,
-        language_code=language_code,
-        file_format=file_format
+        language_code=language_code
    )
    transcript_data = get_video_transcript_data(video_id, language_code)
    if transcript_data:
-        transcript_content = transcript_data['content']
-        create_file_in_fs(transcript_content, transcript_name, resource_fs, static_dir)
+        transcript_content = Transcript.convert(
+            transcript_data['content'],
+            input_format=file_format,
+            output_format=Transcript.SRT
+        )
+        create_file_in_fs(transcript_content, transcript_filename, resource_fs, static_dir)
+
+    return transcript_filename


 def create_transcripts_xml(video_id, video_el, resource_fs, static_dir):
    """
    Creates xml for transcripts.
-    For each transcript elment, an associated transcript file is also created in course OLX.
+    For each transcript element, an associated transcript file is also created in course OLX.

    Arguments:
        video_id (str): Video id of the video.
@@ -873,32 +900,36 @@ def create_transcripts_xml(video_id, video_el, resource_fs, static_dir):
    if video_transcripts.exists():
        transcripts_el = SubElement(video_el, 'transcripts')

-    exported_language_codes = []
+    transcript_files_map = {}
    for video_transcript in video_transcripts:
-        if video_transcript.language_code not in exported_language_codes:
-            language_code = video_transcript.language_code
-            file_format = video_transcript.file_format
+        language_code = video_transcript.language_code
+        file_format = video_transcript.file_format

-            create_transcript_file(
-                video_id,
-                language_code,
-                file_format,
-                resource_fs.delegate_fs(),
-                combine(u'course', static_dir)  # File system should not start from /draft directory.
+        try:
+            transcript_filename = create_transcript_file(
+                video_id=video_id,
+                language_code=language_code,
+                file_format=file_format,
+                resource_fs=resource_fs.delegate_fs(),
+                static_dir=combine(u'course', static_dir)  # File system should not start from /draft directory.
            )
+            transcript_files_map[language_code] = transcript_filename
+        except TranscriptsGenerationException:
+            # we don't want to halt export in this case, just log and move to the next transcript.
+            logger.exception('[VAL] Error while generating "%s" transcript for video["%s"].', language_code, video_id)
+            continue

-            SubElement(
-                transcripts_el,
-                'transcript',
-                {
-                    'language_code': language_code,
-                    'file_format': file_format,
-                    'provider': video_transcript.provider,
-                }
-            )
-            exported_language_codes.append(video_transcript.language_code)
+        SubElement(
+            transcripts_el,
+            'transcript',
+            {
+                'language_code': language_code,
+                'file_format': Transcript.SRT,
+                'provider': video_transcript.provider,
+            }
+        )

-    return video_el
+    return dict(xml=video_el, transcripts=transcript_files_map)


 def import_from_xml(xml, edx_video_id, resource_fs, static_dir, external_transcripts=dict(), course_id=None):
@@ -1033,7 +1064,6 @@ def import_transcript_from_fs(edx_video_id, language_code, file_name, provider, 
            )
            return

-
        # Get file format from transcript content.
        try:
            file_format = get_transcript_format(file_content)

--- a/edxval/exceptions.py
+++ b/edxval/exceptions.py
@@ -62,3 +62,10 @@ class InvalidTranscriptProvider(ValError):
    This error is raised when an transcript provider is not supported
    """
    pass
+
+
+class TranscriptsGenerationException(ValError):
+    """
+    This error is raised when a transcript content is not parse-able in specified format.
+    """
+    pass
--- a/edxval/tests/test_api.py
+++ b/edxval/tests/test_api.py
@@ -35,6 +35,7 @@ from edxval.models import (LIST_MAX_ITEMS, CourseVideo, EncodedVideo, Profile,
                           VideoTranscript)
 from edxval.serializers import VideoSerializer
 from edxval.tests import APIAuthTestCase, constants
+from edxval.transcript_utils import Transcript


 def omit_attrs(dict, attrs_to_omit=[]):
@@ -973,11 +974,14 @@ class ExportTest(TestCase):
        expected = self.parse_xml("""
            <video_asset client_video_id="TWINKLE TWINKLE" duration="122.0" image=""/>
        """)
-        self.assert_xml_equal(
-            api.export_to_xml(constants.VIDEO_DICT_STAR['edx_video_id'], self.file_system, constants.EXPORT_IMPORT_STATIC_DIR),
-            expected
+        exported_metadata = api.export_to_xml(
+            resource_fs=self.file_system,
+            static_dir=constants.EXPORT_IMPORT_STATIC_DIR,
+            video_id=constants.VIDEO_DICT_STAR['edx_video_id'],
        )

+        self.assert_xml_equal(exported_metadata['xml'], expected)
+
    def test_no_video_transcript(self):
        """
        Verify that transcript export for video with no transcript is working as expected.
@@ -986,11 +990,12 @@ class ExportTest(TestCase):
            <video_asset client_video_id="TWINKLE TWINKLE" duration="122.0" image=""/>
        """)

-        exported_xml = api.export_to_xml(
+        exported_metadata = api.export_to_xml(
            constants.VIDEO_DICT_STAR['edx_video_id'],
            self.file_system,
            constants.EXPORT_IMPORT_STATIC_DIR
        )
+        exported_xml = exported_metadata['xml']
        self.assert_xml_equal(exported_xml, expected)

        # Verify that no transcript is present in the XML.
@@ -1011,29 +1016,29 @@ class ExportTest(TestCase):
                <encoded_video url="http://www.meowmagic.com" file_size="33" bitrate="44" profile="desktop"/>
                <encoded_video url="https://www.tmnt.com/tmnt101.m3u8" file_size="100" bitrate="0" profile="hls"/>
                <transcripts>
-                    <transcript file_format="sjson" language_code="de" provider="3PlayMedia" />
+                    <transcript file_format="srt" language_code="de" provider="3PlayMedia" />
                    <transcript file_format="srt" language_code="en" provider="Cielo24" />
                </transcripts>
            </video_asset>
        """.format(image=image))

-        self.assert_xml_equal(
-            api.export_to_xml(
-                constants.VIDEO_DICT_FISH['edx_video_id'],
-                self.file_system,
-                constants.EXPORT_IMPORT_STATIC_DIR,
-                course_id
-            ),
-            expected
+        exported_metadata = api.export_to_xml(
+            constants.VIDEO_DICT_FISH['edx_video_id'],
+            self.file_system,
+            constants.EXPORT_IMPORT_STATIC_DIR,
+            course_id
        )

+        self.assert_xml_equal(exported_metadata['xml'], expected)
+        self.assertItemsEqual(exported_metadata['transcripts'].keys(), ['en', 'de'])
+
    def test_transcript_export(self):
        """
        Test that transcript are exported correctly.
        """
        language_code = 'en'
        video_id = constants.VIDEO_DICT_FISH['edx_video_id']
-        transcript_files = {'de': u'super-soaker-de.sjson', 'en': u'super-soaker-en.srt'}
+        transcript_files = {'de': u'super-soaker-de.srt', 'en': u'super-soaker-en.srt'}
        expected_transcript_path = combine(
            self.temp_dir,
            combine(constants.EXPORT_IMPORT_COURSE_DIR, constants.EXPORT_IMPORT_STATIC_DIR)
@@ -1045,16 +1050,21 @@ class ExportTest(TestCase):
                <encoded_video url="http://www.meowmagic.com" file_size="33" bitrate="44" profile="desktop"/>
                <encoded_video url="https://www.tmnt.com/tmnt101.m3u8" file_size="100" bitrate="0" profile="hls"/>
                <transcripts>
-                    <transcript file_format="sjson" language_code="de" provider="3PlayMedia" />
+                    <transcript file_format="srt" language_code="de" provider="3PlayMedia" />
                    <transcript file_format="srt" language_code="en" provider="Cielo24" />
                </transcripts>
            </video_asset>
        """)

-        exported_xml = api.export_to_xml(video_id, self.file_system, constants.EXPORT_IMPORT_STATIC_DIR, 'test-course')
+        exported_metadata = api.export_to_xml(
+            video_id=video_id,
+            course_id='test-course',
+            resource_fs=self.file_system,
+            static_dir=constants.EXPORT_IMPORT_STATIC_DIR
+        )

        # Assert video and transcript xml is exported correctly.
-        self.assert_xml_equal(exported_xml, expected_xml)
+        self.assert_xml_equal(exported_metadata['xml'], expected_xml)

        # Verify transcript file is created.
        self.assertItemsEqual(transcript_files.values(), self.file_system.listdir(constants.EXPORT_IMPORT_STATIC_DIR))
@@ -1065,7 +1075,13 @@ class ExportTest(TestCase):
                open(combine(expected_transcript_path, transcript_files[language_code]))
            ).read()
            transcript = api.get_video_transcript_data(video_id=video_id, language_code=language_code)
-            self.assertEqual(transcript['content'], expected_transcript_content)
+            transcript_format = os.path.splitext(transcript['file_name'])[1][1:]
+            exported_transcript_content = Transcript.convert(
+                transcript['content'],
+                input_format=transcript_format,
+                output_format=Transcript.SRT,
+            ).encode('utf-8')
+            self.assertEqual(exported_transcript_content, expected_transcript_content)


    def test_unknown_video(self):
@@ -1728,20 +1744,18 @@ class ImportTest(TestCase):
        """
        language_code = 'en'
        edx_video_id = constants.VIDEO_DICT_FISH['edx_video_id']
-        # First create transcript file.
+
+        # First create non utf-8 encoded transcript file in the file system.
        transcript_file_name = 'invalid-transcript.txt'
        invalid_transcript = dict(
            constants.VIDEO_TRANSCRIPT_CUSTOM_SJSON,
            video_id=edx_video_id,
-            file_data=u'Привіт, edX вітає вас.'.encode('cp1251')
-        )
-        utils.create_file_in_fs(
-            invalid_transcript['file_data'],
-            transcript_file_name,
-            self.file_system,
-            constants.EXPORT_IMPORT_STATIC_DIR
+            file_data=u'Привіт, edX вітає вас.'
        )

+        with self.file_system.open(combine(constants.EXPORT_IMPORT_STATIC_DIR, transcript_file_name), 'wb') as f:
+            f.write(invalid_transcript['file_data'].encode('cp1251'))
+
        api.import_transcript_from_fs(
            edx_video_id=edx_video_id,
            language_code=language_code,

--- a/edxval/tests/test_transcript_utils.py
+++ b/edxval/tests/test_transcript_utils.py
+# -*- coding: utf-8 -*-
+"""
+Tests for transcript utils.
+"""
+import ddt
+import json
+import textwrap
+import unittest
+
+from edxval.transcript_utils import Transcript
+from edxval.exceptions import TranscriptsGenerationException
+
+
+@ddt.ddt
+class TestTranscriptUtils(unittest.TestCase):
+    """
+    Tests transcripts conversion util.
+    """
+    def setUp(self):
+        super(TestTranscriptUtils, self).setUp()
+
+        self.srt_transcript = textwrap.dedent("""\
+            0
+            00:00:10,500 --> 00:00:13,000
+            Elephant&#39;s Dream 大象的梦想
+
+            1
+            00:00:15,000 --> 00:00:18,000
+            At the left we can see...
+
+        """)
+
+        self.sjson_transcript = textwrap.dedent("""\
+            {
+                "start": [
+                    10500,
+                    15000
+                ],
+                "end": [
+                    13000,
+                    18000
+                ],
+                "text": [
+                    "Elephant&#39;s Dream 大象的梦想",
+                    "At the left we can see..."
+                ]
+            }
+        """)
+
+    @ddt.data(
+        ('invalid_input_format', 'sjson'),
+        ('sjson', 'invalid_output_format'),
+        ('invalid_input_format', 'invalid_output_format')
+    )
+    @ddt.unpack
+    def test_invalid_transcript_format(self, input_format, output_format):
+        """
+        Tests that transcript conversion raises `AssertionError` on invalid input/output formats.
+        """
+        with self.assertRaises(AssertionError):
+            Transcript.convert(self.sjson_transcript, input_format, output_format)
+
+    def test_convert_srt_to_srt(self):
+        """
+        Tests that srt to srt conversion works as expected.
+        """
+        expected = self.srt_transcript.decode('utf-8')
+        actual = Transcript.convert(self.srt_transcript, 'srt', 'srt')
+        self.assertEqual(actual, expected)
+
+    def test_convert_sjson_to_srt(self):
+        """
+        Tests that the sjson transcript is successfully converted into srt format.
+        """
+        expected = self.srt_transcript.decode('utf-8')
+        actual = Transcript.convert(self.sjson_transcript, 'sjson', 'srt')
+        self.assertEqual(actual, expected)
+
+    def test_convert_srt_to_sjson(self):
+        """
+        Tests that the srt transcript is successfully converted into sjson format.
+        """
+        expected = self.sjson_transcript.decode('utf-8')
+        actual = Transcript.convert(self.srt_transcript, 'srt', 'sjson')
+        self.assertDictEqual(json.loads(actual), json.loads(expected))
+
+    def test_convert_invalid_srt_to_sjson(self):
+        """
+        Tests that TranscriptsGenerationException was raises on trying
+        to convert invalid srt transcript to sjson.
+        """
+        invalid_srt_transcript = 'invalid SubRip file content'
+        with self.assertRaises(TranscriptsGenerationException):
+            Transcript.convert(invalid_srt_transcript, 'srt', 'sjson')
--- a/edxval/transcript_utils.py
+++ b/edxval/transcript_utils.py
+"""
+A module containing transcripts utils.
+"""
+import json
+from six import text_type
+
+from pysrt import SubRipFile, SubRipItem, SubRipTime
+from pysrt.srtexc import Error
+
+from edxval.exceptions import TranscriptsGenerationException
+
+
+class Transcript(object):
+    """
+    Container for transcript methods.
+    """
+    SRT = 'srt'
+    SJSON = 'sjson'
+
+    @staticmethod
+    def generate_sjson_from_srt(srt_subs):
+        """
+        Generate transcripts from sjson to SubRip (*.srt).
+
+        Arguments:
+            srt_subs(SubRip): "SRT" subs object
+
+        Returns:
+            Subs converted to "SJSON" format.
+        """
+        sub_starts = []
+        sub_ends = []
+        sub_texts = []
+        for sub in srt_subs:
+            sub_starts.append(sub.start.ordinal)
+            sub_ends.append(sub.end.ordinal)
+            sub_texts.append(sub.text.replace('\n', ' '))
+
+        sjson_subs = {
+            'start': sub_starts,
+            'end': sub_ends,
+            'text': sub_texts
+        }
+        return sjson_subs
+
+    @staticmethod
+    def generate_srt_from_sjson(sjson_subs):
+        """
+        Generate transcripts from sjson to SubRip (*.srt).
+
+        Arguments:
+            sjson_subs (dict): `sjson` subs.
+
+        Returns:
+            Subtitles in SRT format.
+        """
+
+        output = ''
+
+        equal_len = len(sjson_subs['start']) == len(sjson_subs['end']) == len(sjson_subs['text'])
+        if not equal_len:
+            return output
+
+        for i in range(len(sjson_subs['start'])):
+            item = SubRipItem(
+                index=i,
+                start=SubRipTime(milliseconds=sjson_subs['start'][i]),
+                end=SubRipTime(milliseconds=sjson_subs['end'][i]),
+                text=sjson_subs['text'][i]
+            )
+            output += (unicode(item))
+            output += '\n'
+        return output
+
+    @classmethod
+    def convert(cls, content, input_format, output_format):
+        """
+        Convert transcript `content` from `input_format` to `output_format`.
+
+        Arguments:
+            content: Transcript content byte-stream.
+            input_format: Input transcript format.
+            output_format: Output transcript format.
+
+        Accepted input formats: sjson, srt.
+        Accepted output format: srt, sjson.
+
+        Raises:
+            TranscriptsGenerationException: On parsing the invalid srt
+            content during conversion from srt to sjson.
+        """
+        assert input_format in ('srt', 'sjson')
+        assert output_format in ('srt', 'sjson')
+
+        # Decode the content with utf-8-sig which will also
+        # skip byte order mark(BOM) character if found.
+        content = content.decode('utf-8-sig')
+
+        if input_format == output_format:
+            return content
+
+        if input_format == 'srt':
+
+            if output_format == 'sjson':
+                try:
+                    # With error handling (set to 'ERROR_RAISE'), we will be getting
+                    # the exception if something went wrong in parsing the transcript.
+                    srt_subs = SubRipFile.from_string(content, error_handling=SubRipFile.ERROR_RAISE)
+                except Error as ex:  # Base exception from pysrt
+                    raise TranscriptsGenerationException(text_type(ex))
+
+                return json.dumps(cls.generate_sjson_from_srt(srt_subs))
+
+        if input_format == 'sjson':
+
+            if output_format == 'srt':
+                return cls.generate_srt_from_sjson(json.loads(content))
--- a/edxval/utils.py
+++ b/edxval/utils.py
@@ -192,11 +192,11 @@ def create_file_in_fs(file_data, file_name, file_system, static_dir):
    Arguments:
        file_data (str): Data to store into the file.
        file_name (str): File name of the file to be created.
-        resource_fs (OSFS): Import file system.
+        file_system (OSFS): Import file system.
        static_dir (str): The Directory to retrieve transcript file.
    """
    with file_system.open(combine(static_dir, file_name), 'wb') as f:
-        f.write(file_data)
+        f.write(file_data.encode('utf-8'))


 def get_transcript_format(transcript_content):

--- a/setup.py
+++ b/setup.py
@@ -41,7 +41,7 @@ def load_requirements(*requirements_paths):

 setup(
    name='edxval',
-    version='0.1.14',
+    version='0.1.15',
    author='edX',
    url='http://github.com/edx/edx-val',
    description='edx-val',