Import contentstore transcripts - EDUCATOR-2403

c8f8219a · Mushtaq Ali · b8a64a57 · c8f8219a · c8f8219a · c8f8219a
Commit c8f8219a authored Mar 13, 2018 by Mushtaq Ali
Showing with 611 additions and 110 deletions

edxval/api.py
+162 -58

edxval/models.py
+1 -11

edxval/tests/constants.py
+20 -6

edxval/tests/test_api.py
+394 -33

edxval/tests/test_views.py
+2 -1

edxval/utils.py
+30 -0

edxval/views.py
+1 -1

requirements/base.in
+1 -0

No files found.
--- a/edxval/api.py
+++ b/edxval/api.py
@@ -9,20 +9,24 @@ from uuid import uuid4

 from django.core.exceptions import ObjectDoesNotExist, ValidationError
 from django.core.files import File
+from django.core.files.base import ContentFile
+from fs import open_fs
+from fs.errors import ResourceNotFound
 from fs.path import combine
 from lxml import etree
 from lxml.etree import Element, SubElement
+from pysrt.srtexc import Error

 from edxval.exceptions import (InvalidTranscriptFormat,
                               InvalidTranscriptProvider, ValCannotCreateError,
                               ValCannotUpdateError, ValInternalError,
                               ValVideoNotFoundError)
-from edxval.models import (CourseVideo, EncodedVideo, Profile,
-                           TranscriptFormat, TranscriptPreference,
+from edxval.models import (CourseVideo, EncodedVideo, Profile, TranscriptPreference,
                           TranscriptProviderType, Video, VideoImage,
                           VideoTranscript, ThirdPartyTranscriptCredentialsState)
 from edxval.serializers import TranscriptPreferenceSerializer, TranscriptSerializer, VideoSerializer
-from edxval.utils import THIRD_PARTY_TRANSCRIPTION_PLANS, create_file_in_fs
+from edxval.utils import TranscriptFormat, THIRD_PARTY_TRANSCRIPTION_PLANS, create_file_in_fs, get_transcript_format
+

 logger = logging.getLogger(__name__)  # pylint: disable=C0103

@@ -791,7 +795,7 @@ def export_to_xml(video_id, resource_fs, static_dir, course_id=None):
        video_id (str): Video id of the video to export transcripts.
        course_id (str): The ID of the course with which this video is associated.
        static_dir (str): The Directory to store transcript file.
-        resource_fs (OSFS): Export file system.
+        resource_fs (SubFS): Export file system.

    Returns:
        An lxml video_asset element containing export data
@@ -828,7 +832,7 @@ def export_to_xml(video_id, resource_fs, static_dir, course_id=None):
    return create_transcripts_xml(video_id, video_el, resource_fs, static_dir)


-def create_trancript_file(video_id, language_code, file_format, resource_fs, static_dir):
+def create_transcript_file(video_id, language_code, file_format, resource_fs, static_dir):
    """
    Writes transcript file to file system.

@@ -837,7 +841,7 @@ def create_trancript_file(video_id, language_code, file_format, resource_fs, sta
        language_code (str): Language code of the transcript.
        file_format (str): File format of the transcript file.
        static_dir (str): The Directory to store transcript file.
-        resource_fs (OSFS): The file system to store transcripts.
+        resource_fs (SubFS): The file system to store transcripts.
    """
    transcript_name = u'{video_id}-{language_code}.{file_format}'.format(
        video_id=video_id,
@@ -859,7 +863,7 @@ def create_transcripts_xml(video_id, video_el, resource_fs, static_dir):
        video_id (str): Video id of the video.
        video_el (Element): lxml Element object
        static_dir (str): The Directory to store transcript file.
-        resource_fs (OSFS): The file system to store transcripts.
+        resource_fs (SubFS): The file system to store transcripts.

    Returns:
        lxml Element object with transcripts information
@@ -875,7 +879,13 @@ def create_transcripts_xml(video_id, video_el, resource_fs, static_dir):
            language_code = video_transcript.language_code
            file_format = video_transcript.file_format

-            create_trancript_file(video_id, language_code, file_format, resource_fs, static_dir)
+            create_transcript_file(
+                video_id,
+                language_code,
+                file_format,
+                resource_fs.delegate_fs(),
+                combine(u'course', static_dir)  # File system should not start from /draft directory.
+            )

            SubElement(
                transcripts_el,
@@ -891,7 +901,7 @@ def create_transcripts_xml(video_id, video_el, resource_fs, static_dir):
    return video_el


-def import_from_xml(xml, edx_video_id, resource_fs, static_dir, course_id=None):
+def import_from_xml(xml, edx_video_id, resource_fs, static_dir, external_transcripts=dict(), course_id=None):
    """
    Imports data from a video_asset element about the given video_id.

@@ -903,20 +913,28 @@ def import_from_xml(xml, edx_video_id, resource_fs, static_dir, course_id=None):
        edx_video_id (str): val video id
        resource_fs (OSFS): Import file system.
        static_dir (str): The Directory to retrieve transcript file.
+        external_transcripts (dict): A dict mapping each language code to a list of external transcript file names.
+            Example:
+            {
+                'en': ['The_Flash.srt', 'Harry_Potter.srt'],
+                'es': ['Green_Arrow.srt']
+            }
        course_id (str): The ID of a course to associate the video with

    Raises:
        ValCannotCreateError: if there is an error importing the video
+
+    Returns:
+        edx_video_id (str): val video id.
    """
    if xml.tag != 'video_asset':
        raise ValCannotCreateError('Invalid XML')

-    # TODO this will be moved as a part of EDUCATOR-2403
-    if not edx_video_id:
-        return
-
    # If video with edx_video_id already exists, associate it with the given course_id.
    try:
+        if not edx_video_id:
+            raise Video.DoesNotExist
+
        video = Video.objects.get(edx_video_id=edx_video_id)
        logger.info(
            "edx_video_id '%s' present in course '%s' not imported because it exists in VAL.",
@@ -930,44 +948,116 @@ def import_from_xml(xml, edx_video_id, resource_fs, static_dir, course_id=None):
            if image_file_name:
                VideoImage.create_or_update(course_video, image_file_name)

-        return
+        return edx_video_id
    except ValidationError as err:
        logger.exception(err.message)
        raise ValCannotCreateError(err.message_dict)
    except Video.DoesNotExist:
        pass

-    # Video with edx_video_id did not exist, so create one from xml data.
-    data = {
-        'edx_video_id': edx_video_id,
-        'client_video_id': xml.get('client_video_id'),
-        'duration': xml.get('duration'),
-        'status': 'imported',
-        'encoded_videos': [],
-        'courses': [{course_id: xml.get('image')}] if course_id else [],
-    }
-    for encoded_video_el in xml.iterfind('encoded_video'):
-        profile_name = encoded_video_el.get('profile')
+    if edx_video_id:
+        # Video with edx_video_id did not exist, so create one from xml data.
+        data = {
+            'edx_video_id': edx_video_id,
+            'client_video_id': xml.get('client_video_id'),
+            'duration': xml.get('duration'),
+            'status': 'imported',
+            'encoded_videos': [],
+            'courses': [{course_id: xml.get('image')}] if course_id else [],
+        }
+        for encoded_video_el in xml.iterfind('encoded_video'):
+            profile_name = encoded_video_el.get('profile')
+            try:
+                Profile.objects.get(profile_name=profile_name)
+            except Profile.DoesNotExist:
+                logger.info(
+                    "Imported edx_video_id '%s' contains unknown profile '%s'.",
+                    edx_video_id,
+                    profile_name
+                )
+                continue
+            data['encoded_videos'].append({
+                'profile': profile_name,
+                'url': encoded_video_el.get('url'),
+                'file_size': encoded_video_el.get('file_size'),
+                'bitrate': encoded_video_el.get('bitrate'),
+            })
+
+        # Create the video from xml data; an external video is created instead when there is no edx_video_id.
+        edx_video_id = create_video(data)
+    else:
+        edx_video_id = create_external_video('External Video')
+
+    create_transcript_objects(xml, edx_video_id, resource_fs, static_dir, external_transcripts)
+    return edx_video_id
+
+
+def import_transcript_from_fs(edx_video_id, language_code, file_name, provider, resource_fs, static_dir):
+    """
+    Imports transcript file from file system and creates transcript record in DS.
+
+    Arguments:
+        edx_video_id (str): Video id of the video.
+        language_code (unicode): Language code of the requested transcript.
+        file_name (unicode): File name of the transcript file.
+        provider (unicode): Transcript provider.
+        resource_fs (OSFS): Import file system.
+        static_dir (str): The Directory to retrieve transcript file.
+    """
+    file_format = None
+    transcript_data = get_video_transcript_data(edx_video_id, language_code)
+
+    # First check if transcript record does not exist.
+    if not transcript_data:
+        # Read file from import file system and attach it to transcript record in DS.
+        try:
+            with resource_fs.open(combine(static_dir, file_name), 'rb') as f:
+                file_content = f.read()
+                file_content = file_content.decode('utf-8-sig')
+        except ResourceNotFound as exc:
+            # Don't raise exception in case transcript file is not found in course OLX.
+            logger.warn(
+                '[edx-val] "%s" transcript "%s" for video "%s" is not found.',
+                language_code,
+                file_name,
+                edx_video_id
+            )
+            return
+        except UnicodeDecodeError:
+            # Don't raise exception in case transcript contains non-utf8 content.
+            logger.warn(
+                '[edx-val] "%s" transcript "%s" for video "%s" contains a non-utf8 file content.',
+                language_code,
+                file_name,
+                edx_video_id
+            )
+            return
+
+
+        # Get file format from transcript content.
        try:
-            Profile.objects.get(profile_name=profile_name)
-        except Profile.DoesNotExist:
-            logger.info(
-                "Imported edx_video_id '%s' contains unknown profile '%s'.",
+            file_format = get_transcript_format(file_content)
+        except Error as ex:
+            # Don't raise exception, just don't create transcript record.
+            logger.warn(
+                '[edx-val] Error while getting transcript format for video=%s -- language_code=%s --file_name=%s',
                edx_video_id,
-                profile_name
+                language_code,
+                file_name
            )
-            continue
-        data['encoded_videos'].append({
-            'profile': profile_name,
-            'url': encoded_video_el.get('url'),
-            'file_size': encoded_video_el.get('file_size'),
-            'bitrate': encoded_video_el.get('bitrate'),
-        })
-    create_video(data)
-    create_transcript_objects(xml, edx_video_id, resource_fs, static_dir)
+            return
+
+        # Create transcript record.
+        create_video_transcript(
+            video_id=edx_video_id,
+            language_code=language_code,
+            file_format=file_format,
+            content=ContentFile(file_content),
+            provider=provider
+        )


-def create_transcript_objects(xml, edx_video_id, resource_fs, static_dir):
+def create_transcript_objects(xml, edx_video_id, resource_fs, static_dir, external_transcripts):
    """
    Create VideoTranscript objects.

@@ -976,31 +1066,45 @@ def create_transcript_objects(xml, edx_video_id, resource_fs, static_dir):
        edx_video_id (str): Video id of the video.
        resource_fs (OSFS): Import file system.
        static_dir (str): The Directory to retrieve transcript file.
+        external_transcripts (dict): A dict mapping each language code to a list of external transcript file names.
+            Example:
+            {
+                'en': ['The_Flash.srt', 'Harry_Potter.srt'],
+                'es': ['Green_Arrow.srt']
+            }
    """
-    for transcript in xml.findall('.//transcripts/transcript'):
-        try:
-            file_format = transcript.attrib['file_format']
-            language_code = transcript.attrib['language_code']
-            transcript_data = get_video_transcript_data(edx_video_id, language_code)
-
-            # First check if transcript record does not exist.
-            if not transcript_data:
+    # File system should not start from the /drafts directory.
+    with open_fs(resource_fs.root_path.split('/drafts')[0]) as file_system:
+        # First import VAL transcripts.
+        for transcript in xml.findall('.//transcripts/transcript'):
+            try:
+                file_format = transcript.attrib['file_format']
+                language_code = transcript.attrib['language_code']
                transcript_file_name = u'{edx_video_id}-{language_code}.{file_format}'.format(
                    edx_video_id=edx_video_id,
                    language_code=language_code,
                    file_format=file_format
                )

-                # Read file from import file system and attach File to transcript record in DS.
-                file_data = File(resource_fs.open(combine(static_dir, transcript_file_name)))
+                import_transcript_from_fs(
+                    edx_video_id=edx_video_id,
+                    language_code=transcript.attrib['language_code'],
+                    file_name=transcript_file_name,
+                    provider=transcript.attrib['provider'],
+                    resource_fs=file_system,
+                    static_dir=static_dir
+                )
+            except KeyError:
+                logger.warn("VAL: Required attributes are missing from xml, xml=[%s]", etree.tostring(transcript).strip())

-                # Create transcript record.
-                create_video_transcript(
-                    video_id=edx_video_id,
+        # This won't overwrite transcript for a language which is already present for the video.
+        for language_code, transcript_file_names in external_transcripts.iteritems():
+            for transcript_file_name in transcript_file_names:
+                import_transcript_from_fs(
+                    edx_video_id=edx_video_id,
                    language_code=language_code,
-                    file_format=file_format,
-                    content=file_data,
-                    provider=transcript.attrib['provider']
+                    file_name=transcript_file_name,
+                    provider=TranscriptProviderType.CUSTOM,
+                    resource_fs=file_system,
+                    static_dir=static_dir
                )
-        except KeyError:
-            logger.warn("VAL: Required attributes are missing from xml, xml=[%s]", etree.tostring(transcript).strip())
--- a/edxval/models.py
+++ b/edxval/models.py
@@ -25,7 +25,7 @@ from django.dispatch import receiver
 from django.utils.six import python_2_unicode_compatible
 from model_utils.models import TimeStampedModel

-from edxval.utils import (get_video_image_storage,
+from edxval.utils import (TranscriptFormat, get_video_image_storage,
                          get_video_transcript_storage, video_image_path,
                          video_transcript_path)

@@ -373,16 +373,6 @@ class TranscriptProviderType(object):
    )


-class TranscriptFormat(object):
-    SRT = 'srt'
-    SJSON = 'sjson'
-
-    CHOICES = (
-        (SRT, 'SubRip'),
-        (SJSON, 'SRT JSON')
-    )
-
-
 class CustomizableFileField(models.FileField):
    """
    Subclass of FileField that allows custom settings to not

--- a/edxval/tests/constants.py
+++ b/edxval/tests/constants.py
@@ -4,15 +4,17 @@
 Constants used for tests.
 """
 from edxval.models import (
-    TranscriptFormat,
    TranscriptProviderType,
    Cielo24Fidelity,
    Cielo24Turnaround,
    ThreePlayTurnaround
 )

+from edxval.utils import TranscriptFormat
+
 EDX_VIDEO_ID = "itchyjacket"

+EXPORT_IMPORT_COURSE_DIR = u'course'
 EXPORT_IMPORT_STATIC_DIR = u'static'

 """
@@ -380,13 +382,25 @@ I am overwatch.
 1
 00:00:07,180 --> 00:00:08,460
 This is Flash line 1.""",
-    "wow": {
-        "start": [10],
-        "end": [100],
-        "text": ["Hi, welcome to edxval."],
-    }
+    "wow": """{\n   "start": [10],\n   "end": [100],\n   "text": ["Hi, welcome to edxval."]\n}\n"""
 }

+VIDEO_TRANSCRIPT_CUSTOM_SRT = dict(
+    language_code='en',
+    transcript='edxval/tests/data/The_Flash.srt',
+    provider=TranscriptProviderType.CUSTOM,
+    file_format=TranscriptFormat.SRT,
+    file_data=TRANSCRIPT_DATA['flash']
+)
+
+VIDEO_TRANSCRIPT_CUSTOM_SJSON = dict(
+    language_code='en',
+    transcript='edxval/tests/data/wow.sjson',
+    provider=TranscriptProviderType.CUSTOM,
+    file_format=TranscriptFormat.SJSON,
+    file_data=TRANSCRIPT_DATA['wow']
+)
+
 VIDEO_TRANSCRIPT_CIELO24 = dict(
    video_id='super-soaker',
    language_code='en',

--- a/edxval/tests/test_api.py
+++ b/edxval/tests/test_api.py
@@ -30,8 +30,7 @@ from edxval.api import (InvalidTranscriptFormat, InvalidTranscriptProvider,
                        ValCannotUpdateError, ValVideoNotFoundError,
                        VideoSortField)
 from edxval.models import (LIST_MAX_ITEMS, CourseVideo, EncodedVideo, Profile,
-                           ThirdPartyTranscriptCredentialsState,
-                           TranscriptFormat, TranscriptPreference,
+                           ThirdPartyTranscriptCredentialsState, TranscriptPreference,
                           TranscriptProviderType, Video, VideoImage,
                           VideoTranscript)
 from edxval.serializers import VideoSerializer
@@ -938,7 +937,8 @@ class ExportTest(TestCase):
        VideoTranscript.objects.create(**transcript_data)

        self.temp_dir = mkdtemp()
-        self.file_system = OSFS(self.temp_dir)
+        delegate_fs = OSFS(self.temp_dir)
+        self.file_system = delegate_fs.makedir(constants.EXPORT_IMPORT_COURSE_DIR, recreate=True)
        self.file_system.makedir(constants.EXPORT_IMPORT_STATIC_DIR, recreate=True)
        self.addCleanup(shutil.rmtree, self.temp_dir)

@@ -1034,7 +1034,10 @@ class ExportTest(TestCase):
        language_code = 'en'
        video_id = constants.VIDEO_DICT_FISH['edx_video_id']
        transcript_files = {'de': u'super-soaker-de.sjson', 'en': u'super-soaker-en.srt'}
-        expected_transcript_path = combine(self.temp_dir, constants.EXPORT_IMPORT_STATIC_DIR)
+        expected_transcript_path = combine(
+            self.temp_dir,
+            combine(constants.EXPORT_IMPORT_COURSE_DIR, constants.EXPORT_IMPORT_STATIC_DIR)
+        )

        expected_xml = self.parse_xml("""
            <video_asset client_video_id="Shallow Swordfish" duration="122.0" image="image.jpg">
@@ -1097,7 +1100,11 @@ class ImportTest(TestCase):

        self.temp_dir = mkdtemp()
        self.file_system = OSFS(self.temp_dir)
-        self.file_system.makedir(constants.EXPORT_IMPORT_STATIC_DIR, recreate=True)
+        self.file_system.makedir(constants.EXPORT_IMPORT_COURSE_DIR, recreate=True)
+        self.file_system.makedir(
+            constants.EXPORT_IMPORT_STATIC_DIR,
+            recreate=True
+        )

        self.addCleanup(shutil.rmtree, self.temp_dir)

@@ -1146,7 +1153,7 @@ class ImportTest(TestCase):
                    file_format=file_format
                )
                utils.create_file_in_fs(
-                    json.dumps(video_transcript['file_data']),
+                    video_transcript['file_data'],
                    transcript_file_name,
                    self.file_system,
                    constants.EXPORT_IMPORT_STATIC_DIR
@@ -1175,7 +1182,14 @@ class ImportTest(TestCase):
    def assert_invalid_import(self, xml, course_id=None):
        edx_video_id = "test_edx_video_id"
        with self.assertRaises(ValCannotCreateError):
-            api.import_from_xml(xml, edx_video_id, self.file_system, constants.EXPORT_IMPORT_STATIC_DIR, course_id)
+            api.import_from_xml(
+                xml,
+                edx_video_id,
+                self.file_system,
+                constants.EXPORT_IMPORT_STATIC_DIR,
+                {},
+                course_id
+            )
        self.assertFalse(Video.objects.filter(edx_video_id=edx_video_id).exists())

    def assert_transcripts(self, video_id, expected_transcripts):
@@ -1190,15 +1204,13 @@ class ImportTest(TestCase):
        for expected_transcript in expected_transcripts:
            language_code = expected_transcript['language_code']

-            # Get the imported transcript and rename `url` key.
+            # Get the imported transcript and remove `url` key.
            received_transcript = api.TranscriptSerializer(
                VideoTranscript.objects.get(video__edx_video_id=video_id, language_code=language_code)
            ).data

            # Assert transcript content
-            received_transcript['file_data'] = json.loads(
-                api.get_video_transcript_data(video_id, language_code)['content']
-            )
+            received_transcript['file_data'] = api.get_video_transcript_data(video_id, language_code)['content']

            # Omit not needed attrs.
            expected_transcript = omit_attrs(expected_transcript, ['transcript'])
@@ -1219,13 +1231,15 @@ class ImportTest(TestCase):
        # There must not be any transcript before import.
        self.assert_transcripts(constants.VIDEO_DICT_STAR['edx_video_id'], [])

-        api.import_from_xml(
+        edx_video_id = api.import_from_xml(
            xml,
            constants.VIDEO_DICT_STAR['edx_video_id'],
            self.file_system,
            constants.EXPORT_IMPORT_STATIC_DIR,
+            {},
            new_course_id
        )
+        self.assertEqual(edx_video_id, constants.VIDEO_DICT_STAR['edx_video_id'])

        video = Video.objects.get(edx_video_id=constants.VIDEO_DICT_STAR['edx_video_id'])
        self.assert_video_matches_dict(video, constants.VIDEO_DICT_STAR)
@@ -1271,7 +1285,7 @@ class ImportTest(TestCase):
        transcript_data = dict(self.transcript_data3, language_code=language_code)
        xml = self.make_import_xml(
            video_dict={
-                'edx_video_id': 'new_video_id',
+                'edx_video_id': constants.VIDEO_DICT_FISH['edx_video_id'],
                'client_video_id': 'new_client_video_id',
                'duration': 0,
            },
@@ -1291,13 +1305,15 @@ class ImportTest(TestCase):
        # There must not be any transcript before import.
        self.assert_transcripts(constants.VIDEO_DICT_FISH['edx_video_id'], [])

-        api.import_from_xml(
+        edx_video_id = api.import_from_xml(
            xml,
            constants.VIDEO_DICT_FISH['edx_video_id'],
            self.file_system,
            constants.EXPORT_IMPORT_STATIC_DIR,
+            {},
            course_id
        )
+        self.assertEqual(edx_video_id, constants.VIDEO_DICT_FISH['edx_video_id'])

        video = Video.objects.get(edx_video_id=constants.VIDEO_DICT_FISH['edx_video_id'])
        self.assert_video_matches_dict(video, constants.VIDEO_DICT_FISH)
@@ -1325,6 +1341,7 @@ class ImportTest(TestCase):
                constants.VIDEO_DICT_FISH['edx_video_id'],
                self.file_system,
                constants.EXPORT_IMPORT_STATIC_DIR,
+                {},
                course_id='x' * 300
            )

@@ -1387,7 +1404,6 @@ class ImportTest(TestCase):
        xml = self.make_import_xml(video_dict=constants.VIDEO_DICT_FISH)
        self.assert_invalid_import(xml, "x" * 300)

-    # FIXME: EDUCATOR-2403
    def test_external_no_video_transcript(self):
        """
        Verify that transcript import for external video working as expected when there is no transcript.
@@ -1403,6 +1419,344 @@ class ImportTest(TestCase):
            0
        )

+    @data(
+        ('external-transcript.srt', constants.VIDEO_TRANSCRIPT_CUSTOM_SRT),
+        ('external-transcript.sjson', constants.VIDEO_TRANSCRIPT_CUSTOM_SJSON)
+    )
+    @unpack
+    def test_external_video_transcript(self, transcript_file_name, transcript_data):
+        """
+        Verify that transcript import for an external video works as expected when a transcript is present.
+        """
+        # First create external transcript.
+        utils.create_file_in_fs(
+            transcript_data['file_data'],
+            transcript_file_name,
+            self.file_system,
+            constants.EXPORT_IMPORT_STATIC_DIR
+        )
+
+        # Verify that one video is present before import.
+        self.assertEqual(Video.objects.count(), 1)
+
+        # Verify that no transcript was present before import.
+        self.assertEqual(VideoTranscript.objects.count(), 0)
+
+        # Import xml with empty edx video id.
+        edx_video_id = api.import_from_xml(
+            etree.fromstring('<video_asset/>'),
+            '',
+            self.file_system,
+            constants.EXPORT_IMPORT_STATIC_DIR,
+            {
+                'en': [transcript_file_name]
+            }
+        )
+
+        # Verify that a new video is created.
+        self.assertIsNotNone(edx_video_id)
+
+        # Verify transcript record is created with correct data.
+        self.assert_transcripts(
+            edx_video_id,
+            [dict(transcript_data, video_id=edx_video_id)]
+        )
+
+    def test_multiple_external_transcripts_different_langauges(self):
+        """
+        Verify that transcript import for an external video works as expected when multiple transcripts are imported.
+        """
+        # First create external transcripts.
+        en_transcript_file_name = 'external-transcript-en.srt'
+        utils.create_file_in_fs(
+            constants.VIDEO_TRANSCRIPT_CUSTOM_SRT['file_data'],
+            en_transcript_file_name,
+            self.file_system,
+            constants.EXPORT_IMPORT_STATIC_DIR
+        )
+
+        es_transcript_file_name = 'external-transcript-es.srt'
+        utils.create_file_in_fs(
+            constants.VIDEO_TRANSCRIPT_CUSTOM_SRT['file_data'],
+            es_transcript_file_name,
+            self.file_system,
+            constants.EXPORT_IMPORT_STATIC_DIR
+        )
+
+        # Verify that one video is present before import.
+        self.assertEqual(Video.objects.count(), 1)
+
+        # Verify that no transcript was present before import.
+        self.assertEqual(VideoTranscript.objects.count(), 0)
+
+        # Import xml with empty edx video id.
+        edx_video_id = api.import_from_xml(
+            etree.fromstring('<video_asset/>'),
+            '',
+            self.file_system,
+            constants.EXPORT_IMPORT_STATIC_DIR,
+            {
+                'en': [en_transcript_file_name],
+                'es': [es_transcript_file_name]
+            }
+        )
+
+        # Verify that new video is created.
+        self.assertIsNotNone(edx_video_id)
+
+        # Verify transcript records are created with correct data.
+        expected_transcripts =  [
+            dict(constants.VIDEO_TRANSCRIPT_CUSTOM_SRT, video_id=edx_video_id, language_code='en'),
+            dict(constants.VIDEO_TRANSCRIPT_CUSTOM_SRT, video_id=edx_video_id, language_code='es')
+        ]
+
+        self.assert_transcripts(
+            edx_video_id,
+            expected_transcripts
+        )
+
+    def test_multiple_external_transcripts_for_language(self):
+        """
+        Verify that transcript import for an external video works as expected when multiple transcripts are present
+        for one language, e.g. an external English transcript is imported through both the sub and transcripts fields.
+        """
+        # First create external transcripts.
+        sub_transcript_file_name = 'external-transcript-sub.srt'
+        utils.create_file_in_fs(
+            constants.VIDEO_TRANSCRIPT_CUSTOM_SRT['file_data'],
+            sub_transcript_file_name,
+            self.file_system,
+            constants.EXPORT_IMPORT_STATIC_DIR
+        )
+
+        ext_transcript_file_name = 'external-transcript-ext.sjson'
+        utils.create_file_in_fs(
+            constants.VIDEO_TRANSCRIPT_CUSTOM_SJSON['file_data'],
+            ext_transcript_file_name,
+            self.file_system,
+            constants.EXPORT_IMPORT_STATIC_DIR
+        )
+
+        # Verify that one video is present before import.
+        self.assertEqual(Video.objects.count(), 1)
+
+        # Verify that no transcript was present before import.
+        self.assertEqual(VideoTranscript.objects.count(), 0)
+
+        # Import xml with empty edx video id.
+        edx_video_id = api.import_from_xml(
+            etree.fromstring('<video_asset/>'),
+            '',
+            self.file_system,
+            constants.EXPORT_IMPORT_STATIC_DIR,
+            {
+                'en': [sub_transcript_file_name, ext_transcript_file_name]
+            }
+        )
+
+        # Verify that new video is created.
+        self.assertIsNotNone(edx_video_id)
+
+        # Verify transcript record is created with correct data, i.e. the sub field transcript.
+        expected_transcripts =  [
+            dict(constants.VIDEO_TRANSCRIPT_CUSTOM_SRT, video_id=edx_video_id, language_code='en')
+        ]
+
+        self.assert_transcripts(
+            edx_video_id,
+            expected_transcripts
+        )
+
+    def test_external_internal_transcripts_conflict(self):
+        """
+        Tests that when importing both external and internal (VAL) transcripts, internal transcript is imported.
+        """
+        # First create external transcript in sjson format.
+        en_transcript_file_name = 'external-transcript-en.sjson'
+        utils.create_file_in_fs(
+            constants.VIDEO_TRANSCRIPT_CUSTOM_SJSON['file_data'],
+            en_transcript_file_name,
+            self.file_system,
+            constants.EXPORT_IMPORT_STATIC_DIR
+        )
+
+        # Let's create internal transcript in srt format.
+        expected_val_transcript = [self.transcript_data1]
+        import_xml = self.make_import_xml(
+            video_dict=constants.VIDEO_DICT_STAR,
+            video_transcripts=expected_val_transcript
+        )
+
+        # Verify that one video is present before import.
+        self.assertEqual(Video.objects.count(), 1)
+
+        # Verify that no transcript was present before import.
+        self.assertEqual(VideoTranscript.objects.count(), 0)
+
+        # Note that we have an external en transcript as well as internal en transcript.
+        edx_video_id = api.import_from_xml(
+            import_xml,
+            constants.VIDEO_DICT_STAR['edx_video_id'],
+            self.file_system,
+            constants.EXPORT_IMPORT_STATIC_DIR,
+            {
+                'en': [en_transcript_file_name]
+            }
+        )
+
+        # Verify that new video is created.
+        self.assertIsNotNone(edx_video_id)
+
+        # Verify transcript record is created with internal transcript data.
+        self.assert_transcripts(
+            constants.VIDEO_DICT_STAR['edx_video_id'],
+            [self.transcript_data1]
+        )
+
+    def test_external_internal_transcripts_different_languages(self):
+        """
+        Tests that when importing both external and internal (VAL) transcripts for different languages, all transcripts
+        are imported correctly.
+        """
+        edx_video_id = constants.VIDEO_DICT_STAR['edx_video_id']
+        # First create external es transcript.
+        es_transcript_file_name = 'external-transcript-es.sjson'
+        es_external_transcript = dict(
+            constants.VIDEO_TRANSCRIPT_CUSTOM_SJSON,
+            video_id=edx_video_id,
+            language_code='es'
+        )
+        utils.create_file_in_fs(
+            es_external_transcript['file_data'],
+            es_transcript_file_name,
+            self.file_system,
+            constants.EXPORT_IMPORT_STATIC_DIR
+        )
+
+        # Let's create en internal transcript.
+        import_xml = self.make_import_xml(
+            video_dict=constants.VIDEO_DICT_STAR,
+            video_transcripts=[self.transcript_data1]
+        )
+
+        # Verify that one video is present before import.
+        self.assertEqual(Video.objects.count(), 1)
+
+        # Verify that no transcript was present before import.
+        self.assertEqual(VideoTranscript.objects.count(), 0)
+
+        # Note that we have an external 'es' language transcript as well as an internal 'en' language transcript.
+        edx_video_id = api.import_from_xml(
+            import_xml,
+            edx_video_id,
+            self.file_system,
+            constants.EXPORT_IMPORT_STATIC_DIR,
+            {
+                'es': [es_transcript_file_name]
+            }
+        )
+
+        # Verify all transcript records are created correctly.
+        self.assert_transcripts(
+            constants.VIDEO_DICT_STAR['edx_video_id'],
+            [self.transcript_data1, es_external_transcript]
+        )
+
+    @patch('edxval.api.logger')
+    def test_import_transcript_from_fs_resource_not_found(self, mock_logger):
+        """
+        Test that `import_transcript_from_fs` correctly logs if transcript file is not found in file system.
+        """
+        language_code = 'en'
+        edx_video_id = 'test-edx-video-id'
+        file_name = 'file-not-found.srt'
+        api.import_transcript_from_fs(
+            edx_video_id=edx_video_id,
+            language_code=language_code,
+            file_name=file_name,
+            provider=TranscriptProviderType.CUSTOM,
+            resource_fs=self.file_system,
+            static_dir=constants.EXPORT_IMPORT_STATIC_DIR
+        )
+        mock_logger.warn.assert_called_with(
+            '[edx-val] "%s" transcript "%s" for video "%s" is not found.',
+            language_code,
+            file_name,
+            edx_video_id
+        )
+
+    @patch('edxval.api.logger')
+    def test_import_transcript_from_fs_invalid_format(self, mock_logger):
+        """
+        Test that `import_transcript_from_fs` correctly logs if we get error while retrieving transcript file format.
+        """
+        language_code = 'en'
+        edx_video_id = constants.VIDEO_DICT_FISH['edx_video_id']
+        # First create transcript file.
+        invalid_transcript_file_name = 'invalid-transcript.txt'
+        invalid_transcript = dict(
+            constants.VIDEO_TRANSCRIPT_CUSTOM_SJSON,
+            video_id=edx_video_id,
+            file_data='This is an invalid transcript file data.'
+        )
+        utils.create_file_in_fs(
+            invalid_transcript['file_data'],
+            invalid_transcript_file_name,
+            self.file_system,
+            constants.EXPORT_IMPORT_STATIC_DIR
+        )
+
+        api.import_transcript_from_fs(
+            edx_video_id=edx_video_id,
+            language_code=language_code,
+            file_name=invalid_transcript_file_name,
+            provider=TranscriptProviderType.CUSTOM,
+            resource_fs=self.file_system,
+            static_dir=constants.EXPORT_IMPORT_STATIC_DIR
+        )
+        mock_logger.warn.assert_called_with(
+            '[edx-val] Error while getting transcript format for video=%s -- language_code=%s --file_name=%s',
+            edx_video_id,
+            language_code,
+            invalid_transcript_file_name
+        )
+
+    @patch('edxval.api.logger')
+    def test_import_transcript_from_fs_bad_content(self, mock_logger):
+        """
+        Test that `import_transcript_from_fs` correctly logs if we get error while decoding transcript content.
+        """
+        language_code = 'en'
+        edx_video_id = constants.VIDEO_DICT_FISH['edx_video_id']
+        # First create transcript file.
+        transcript_file_name = 'invalid-transcript.txt'
+        invalid_transcript = dict(
+            constants.VIDEO_TRANSCRIPT_CUSTOM_SJSON,
+            video_id=edx_video_id,
+            file_data=u'Привіт, edX вітає вас.'.encode('cp1251')
+        )
+        utils.create_file_in_fs(
+            invalid_transcript['file_data'],
+            transcript_file_name,
+            self.file_system,
+            constants.EXPORT_IMPORT_STATIC_DIR
+        )
+
+        api.import_transcript_from_fs(
+            edx_video_id=edx_video_id,
+            language_code=language_code,
+            file_name=transcript_file_name,
+            provider=TranscriptProviderType.CUSTOM,
+            resource_fs=self.file_system,
+            static_dir=constants.EXPORT_IMPORT_STATIC_DIR
+        )
+        mock_logger.warn.assert_called_with(
+            '[edx-val] "%s" transcript "%s" for video "%s" contains a non-utf8 file content.',
+            language_code,
+            transcript_file_name,
+            edx_video_id
+        )
+
    def test_import_transcript_attached_existing_video(self):
        """
        Verify that transcript import for existing video with transcript attached is working as expected.
@@ -1434,6 +1788,7 @@ class ImportTest(TestCase):
            constants.VIDEO_DICT_FISH['edx_video_id'],
            self.file_system,
            constants.EXPORT_IMPORT_STATIC_DIR,
+            {},
            'test_course_id'
        )

@@ -1466,6 +1821,7 @@ class ImportTest(TestCase):
            constants.VIDEO_DICT_FISH['edx_video_id'],
            self.file_system,
            constants.EXPORT_IMPORT_STATIC_DIR,
+            {},
            'test_course_id'
        )

@@ -1495,6 +1851,7 @@ class ImportTest(TestCase):
            constants.VIDEO_DICT_STAR['edx_video_id'],
            self.file_system,
            constants.EXPORT_IMPORT_STATIC_DIR,
+            {},
            'test_course_id'
        )

@@ -1526,12 +1883,12 @@ class ImportTest(TestCase):

        # Create transcript files
        utils.create_file_in_fs(
-            json.dumps(constants.TRANSCRIPT_DATA['wow']),
+            constants.TRANSCRIPT_DATA['wow'],
            u'super-soaker-de.sjson',
            self.file_system,
            constants.EXPORT_IMPORT_STATIC_DIR
        )
-        api.create_transcript_objects(xml, video_id, self.file_system, constants.EXPORT_IMPORT_STATIC_DIR)
+        api.create_transcript_objects(xml, video_id, self.file_system, constants.EXPORT_IMPORT_STATIC_DIR, {})

        mock_logger.warn.assert_called_with(
            "VAL: Required attributes are missing from xml, xml=[%s]",
@@ -1878,14 +2235,14 @@ class TranscriptTest(TestCase):
                    'language_code': 'en',
                    'provider': TranscriptProviderType.THREE_PLAY_MEDIA,
                    'file_name': None,
-                    'file_format': TranscriptFormat.SRT,
+                    'file_format': utils.TranscriptFormat.SRT,
                    'file_data': File(open(self.flash_transcript_path))
                },
                {
                    'language_code': 'fr',
                    'provider': TranscriptProviderType.CIELO24,
                    'file_name': None,
-                    'file_format': TranscriptFormat.SRT,
+                    'file_format': utils.TranscriptFormat.SRT,
                    'file_data': ContentFile(constants.TRANSCRIPT_DATA['overwatch'])
                }
            ]
@@ -1902,14 +2259,14 @@ class TranscriptTest(TestCase):
                    'language_code': 'de',
                    'provider': TranscriptProviderType.CUSTOM,
                    'file_name': None,
-                    'file_format': TranscriptFormat.SRT,
+                    'file_format': utils.TranscriptFormat.SRT,
                    'file_data': File(open(self.arrow_transcript_path))
                },
                {
                    'language_code': 'zh',
                    'provider': TranscriptProviderType.CUSTOM,
                    'file_name': 'non/existent/transcript/path',
-                    'file_format': TranscriptFormat.SRT,
+                    'file_format': utils.TranscriptFormat.SRT,
                    'file_data': None
                }
            ]
@@ -1986,7 +2343,7 @@ class TranscriptTest(TestCase):
        expectation = {
            'video_id': u'super-soaker',
            'url': self.v1_transcript2.url(),
-            'file_format': TranscriptFormat.SRT,
+            'file_format': utils.TranscriptFormat.SRT,
            'provider': TranscriptProviderType.CIELO24,
            'language_code': u'fr'
        }
@@ -2042,14 +2399,14 @@ class TranscriptTest(TestCase):
        {
            'file_data': None,
            'file_name': 'overwatch.sjson',
-            'file_format': TranscriptFormat.SJSON,
+            'file_format': utils.TranscriptFormat.SJSON,
            'language_code': 'da',
            'provider': TranscriptProviderType.CIELO24
        },
        {
            'file_data': ContentFile(constants.TRANSCRIPT_DATA['overwatch']),
            'file_name': None,
-            'file_format': TranscriptFormat.SRT,
+            'file_format': utils.TranscriptFormat.SRT,
            'language_code': 'es',
            'provider': TranscriptProviderType.THREE_PLAY_MEDIA
        },
@@ -2104,7 +2461,7 @@ class TranscriptTest(TestCase):
        },
        {
            'video_id': 'medium-soaker',
-            'file_format': TranscriptFormat.SRT,
+            'file_format': utils.TranscriptFormat.SRT,
            'provider': 123,
            'exception': InvalidTranscriptProvider,
            'exception_message': '123 transcript provider is not supported',
@@ -2133,7 +2490,7 @@ class TranscriptTest(TestCase):
            video_id=edx_video_id,
            language_code=language_code,
            provider=TranscriptProviderType.THREE_PLAY_MEDIA,
-            file_format=TranscriptFormat.SRT,
+            file_format=utils.TranscriptFormat.SRT,
            content=ContentFile(constants.TRANSCRIPT_DATA['overwatch'])
        )

@@ -2169,7 +2526,7 @@ class TranscriptTest(TestCase):
        {
            'video_id': 'medium-soaker',
            'language_code': 'en',
-            'file_format': TranscriptFormat.SRT,
+            'file_format': utils.TranscriptFormat.SRT,
            'provider': 'unknown provider',
            'exception_msg': '"unknown provider" is not a valid choice.'
        }
@@ -2256,16 +2613,20 @@ class TranscriptTest(TestCase):
        language_code = 'en'
        video_id = constants.VIDEO_DICT_FISH['edx_video_id']
        transcript_file_name = u'super-soaker-en.srt'
-        expected_transcript_path = combine(self.temp_dir, combine(constants.EXPORT_IMPORT_STATIC_DIR, transcript_file_name))
+        expected_transcript_path = combine(
+            combine(self.temp_dir, constants.EXPORT_IMPORT_COURSE_DIR),
+            combine(constants.EXPORT_IMPORT_STATIC_DIR, transcript_file_name)
+        )

-        file_system = OSFS(self.temp_dir)
+        delegate_fs = OSFS(self.temp_dir)
+        file_system = delegate_fs.makedir(constants.EXPORT_IMPORT_COURSE_DIR, recreate=True)
        file_system.makedir(constants.EXPORT_IMPORT_STATIC_DIR, recreate=True)

        # Create transcript file now.
-        api.create_trancript_file(
+        api.create_transcript_file(
            video_id=video_id,
            language_code=language_code,
-            file_format=TranscriptFormat.SRT,
+            file_format=utils.TranscriptFormat.SRT,
            static_dir=constants.EXPORT_IMPORT_STATIC_DIR,
            resource_fs=file_system
        )
@@ -2291,10 +2652,10 @@ class TranscriptTest(TestCase):
        file_system.makedir(constants.EXPORT_IMPORT_STATIC_DIR, recreate=True)

        # Try to create transcript file now.
-        api.create_trancript_file(
+        api.create_transcript_file(
            video_id=video_id,
            language_code=language_code,
-            file_format=TranscriptFormat.SRT,
+            file_format=utils.TranscriptFormat.SRT,
            static_dir=constants.EXPORT_IMPORT_STATIC_DIR,
            resource_fs=file_system
        )

--- a/edxval/tests/test_views.py
+++ b/edxval/tests/test_views.py
@@ -8,10 +8,11 @@ from ddt import data, ddt, unpack
 from django.core.urlresolvers import reverse
 from rest_framework import status

-from edxval.models import (CourseVideo, Profile, TranscriptFormat,
+from edxval.models import (CourseVideo, Profile,
                           TranscriptProviderType, Video, VideoTranscript)
 from edxval.serializers import TranscriptSerializer
 from edxval.tests import APIAuthTestCase, constants
+from edxval.utils import TranscriptFormat


 class VideoDetail(APIAuthTestCase):

--- a/edxval/utils.py
+++ b/edxval/utils.py
@@ -2,9 +2,21 @@
 Util methods to be used in api and models.
 """

+import json
 from django.conf import settings
 from django.core.files.storage import get_storage_class
 from fs.path import combine
+from pysrt import SubRipFile
+
+
+class TranscriptFormat(object):
+    SRT = 'srt'
+    SJSON = 'sjson'
+
+    CHOICES = (
+        (SRT, 'SubRip'),
+        (SJSON, 'SRT JSON')
+    )


 # 3rd Party Transcription Plans
@@ -185,3 +197,21 @@ def create_file_in_fs(file_data, file_name, file_system, static_dir):
    """
    with file_system.open(combine(static_dir, file_name), 'wb') as f:
        f.write(file_data)
+
+
+def get_transcript_format(transcript_content):
+    """
+    Returns transcript format.
+
+    Arguments:
+        transcript_content (str): Transcript file content.
+    """
+    try:
+        sjson_obj = json.loads(transcript_content)
+    except ValueError:
+        # With error handling set to 'ERROR_RAISE', pysrt raises an exception
+        # if anything goes wrong while parsing the transcript.
+        srt_subs = SubRipFile.from_string(transcript_content, error_handling=SubRipFile.ERROR_RAISE)
+        if len(srt_subs) > 0:
+            return TranscriptFormat.SRT
+    return TranscriptFormat.SJSON
--- a/edxval/views.py
+++ b/edxval/views.py
@@ -15,13 +15,13 @@ from rest_framework_oauth.authentication import OAuth2Authentication
 from edxval.api import create_or_update_video_transcript
 from edxval.models import (
    CourseVideo,
-    TranscriptFormat,
    TranscriptProviderType,
    Video,
    VideoImage,
    VideoTranscript
 )
 from edxval.serializers import VideoSerializer
+from edxval.utils import TranscriptFormat

 LOGGER = logging.getLogger(__name__)  # pylint: disable=C0103


--- a/requirements/base.in
+++ b/requirements/base.in
@@ -10,3 +10,4 @@ django-storages
 enum34
 lxml
 pillow
+pysrt==0.4.7