Commit 0bbb8afd by Qubad786

Export transcripts metadata along with xml

 - Transcript files are exported into course OLX in .srt format.
 - A map of transcript language to filename is returned along with the xml, so that it can be used by the platform in old metadata fields for backward compatibility.
 - Add/fix tests

bump VAL version
parent f7a077bd
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
""" """
The internal API for VAL. The internal API for VAL.
""" """
import os
import logging import logging
from enum import Enum from enum import Enum
from uuid import uuid4 from uuid import uuid4
...@@ -17,15 +18,35 @@ from lxml import etree ...@@ -17,15 +18,35 @@ from lxml import etree
from lxml.etree import Element, SubElement from lxml.etree import Element, SubElement
from pysrt.srtexc import Error from pysrt.srtexc import Error
from edxval.exceptions import (InvalidTranscriptFormat, from edxval.exceptions import (
InvalidTranscriptProvider, ValCannotCreateError, InvalidTranscriptFormat,
ValCannotUpdateError, ValInternalError, TranscriptsGenerationException,
ValVideoNotFoundError) InvalidTranscriptProvider,
from edxval.models import (CourseVideo, EncodedVideo, Profile, TranscriptPreference, ValCannotCreateError,
TranscriptProviderType, Video, VideoImage, ValCannotUpdateError,
VideoTranscript, ThirdPartyTranscriptCredentialsState) ValInternalError,
ValVideoNotFoundError,
)
from edxval.models import (
CourseVideo,
EncodedVideo,
Profile,
TranscriptPreference,
TranscriptProviderType,
Video,
VideoImage,
VideoTranscript,
ThirdPartyTranscriptCredentialsState,
)
from edxval.serializers import TranscriptPreferenceSerializer, TranscriptSerializer, VideoSerializer from edxval.serializers import TranscriptPreferenceSerializer, TranscriptSerializer, VideoSerializer
from edxval.utils import TranscriptFormat, THIRD_PARTY_TRANSCRIPTION_PLANS, create_file_in_fs, get_transcript_format from edxval.utils import (
TranscriptFormat,
THIRD_PARTY_TRANSCRIPTION_PLANS,
create_file_in_fs,
get_transcript_format,
)
from edxval.transcript_utils import Transcript
logger = logging.getLogger(__name__) # pylint: disable=C0103 logger = logging.getLogger(__name__) # pylint: disable=C0103
...@@ -829,6 +850,7 @@ def export_to_xml(video_id, resource_fs, static_dir, course_id=None): ...@@ -829,6 +850,7 @@ def export_to_xml(video_id, resource_fs, static_dir, course_id=None):
for name in ['profile', 'url', 'file_size', 'bitrate'] for name in ['profile', 'url', 'file_size', 'bitrate']
} }
) )
return create_transcripts_xml(video_id, video_el, resource_fs, static_dir) return create_transcripts_xml(video_id, video_el, resource_fs, static_dir)
...@@ -843,21 +865,26 @@ def create_transcript_file(video_id, language_code, file_format, resource_fs, st ...@@ -843,21 +865,26 @@ def create_transcript_file(video_id, language_code, file_format, resource_fs, st
static_dir (str): The Directory to store transcript file. static_dir (str): The Directory to store transcript file.
resource_fs (SubFS): The file system to store transcripts. resource_fs (SubFS): The file system to store transcripts.
""" """
transcript_name = u'{video_id}-{language_code}.{file_format}'.format( transcript_filename = '{video_id}-{language_code}.srt'.format(
video_id=video_id, video_id=video_id,
language_code=language_code, language_code=language_code
file_format=file_format
) )
transcript_data = get_video_transcript_data(video_id, language_code) transcript_data = get_video_transcript_data(video_id, language_code)
if transcript_data: if transcript_data:
transcript_content = transcript_data['content'] transcript_content = Transcript.convert(
create_file_in_fs(transcript_content, transcript_name, resource_fs, static_dir) transcript_data['content'],
input_format=file_format,
output_format=Transcript.SRT
)
create_file_in_fs(transcript_content, transcript_filename, resource_fs, static_dir)
return transcript_filename
def create_transcripts_xml(video_id, video_el, resource_fs, static_dir): def create_transcripts_xml(video_id, video_el, resource_fs, static_dir):
""" """
Creates xml for transcripts. Creates xml for transcripts.
For each transcript elment, an associated transcript file is also created in course OLX. For each transcript element, an associated transcript file is also created in course OLX.
Arguments: Arguments:
video_id (str): Video id of the video. video_id (str): Video id of the video.
...@@ -873,32 +900,36 @@ def create_transcripts_xml(video_id, video_el, resource_fs, static_dir): ...@@ -873,32 +900,36 @@ def create_transcripts_xml(video_id, video_el, resource_fs, static_dir):
if video_transcripts.exists(): if video_transcripts.exists():
transcripts_el = SubElement(video_el, 'transcripts') transcripts_el = SubElement(video_el, 'transcripts')
exported_language_codes = [] transcript_files_map = {}
for video_transcript in video_transcripts: for video_transcript in video_transcripts:
if video_transcript.language_code not in exported_language_codes: language_code = video_transcript.language_code
language_code = video_transcript.language_code file_format = video_transcript.file_format
file_format = video_transcript.file_format
create_transcript_file( try:
video_id, transcript_filename = create_transcript_file(
language_code, video_id=video_id,
file_format, language_code=language_code,
resource_fs.delegate_fs(), file_format=file_format,
combine(u'course', static_dir) # File system should not start from /draft directory. resource_fs=resource_fs.delegate_fs(),
static_dir=combine(u'course', static_dir) # File system should not start from /draft directory.
) )
transcript_files_map[language_code] = transcript_filename
except TranscriptsGenerationException:
# we don't want to halt export in this case, just log and move to the next transcript.
logger.exception('[VAL] Error while generating "%s" transcript for video["%s"].', language_code, video_id)
continue
SubElement( SubElement(
transcripts_el, transcripts_el,
'transcript', 'transcript',
{ {
'language_code': language_code, 'language_code': language_code,
'file_format': file_format, 'file_format': Transcript.SRT,
'provider': video_transcript.provider, 'provider': video_transcript.provider,
} }
) )
exported_language_codes.append(video_transcript.language_code)
return video_el return dict(xml=video_el, transcripts=transcript_files_map)
def import_from_xml(xml, edx_video_id, resource_fs, static_dir, external_transcripts=dict(), course_id=None): def import_from_xml(xml, edx_video_id, resource_fs, static_dir, external_transcripts=dict(), course_id=None):
...@@ -1033,7 +1064,6 @@ def import_transcript_from_fs(edx_video_id, language_code, file_name, provider, ...@@ -1033,7 +1064,6 @@ def import_transcript_from_fs(edx_video_id, language_code, file_name, provider,
) )
return return
# Get file format from transcript content. # Get file format from transcript content.
try: try:
file_format = get_transcript_format(file_content) file_format = get_transcript_format(file_content)
......
...@@ -62,3 +62,10 @@ class InvalidTranscriptProvider(ValError): ...@@ -62,3 +62,10 @@ class InvalidTranscriptProvider(ValError):
This error is raised when an transcript provider is not supported This error is raised when an transcript provider is not supported
""" """
pass pass
class TranscriptsGenerationException(ValError):
    """
    Signals that transcript content could not be parsed in the format it was specified to be in.
    """
...@@ -35,6 +35,7 @@ from edxval.models import (LIST_MAX_ITEMS, CourseVideo, EncodedVideo, Profile, ...@@ -35,6 +35,7 @@ from edxval.models import (LIST_MAX_ITEMS, CourseVideo, EncodedVideo, Profile,
VideoTranscript) VideoTranscript)
from edxval.serializers import VideoSerializer from edxval.serializers import VideoSerializer
from edxval.tests import APIAuthTestCase, constants from edxval.tests import APIAuthTestCase, constants
from edxval.transcript_utils import Transcript
def omit_attrs(dict, attrs_to_omit=[]): def omit_attrs(dict, attrs_to_omit=[]):
...@@ -973,11 +974,14 @@ class ExportTest(TestCase): ...@@ -973,11 +974,14 @@ class ExportTest(TestCase):
expected = self.parse_xml(""" expected = self.parse_xml("""
<video_asset client_video_id="TWINKLE TWINKLE" duration="122.0" image=""/> <video_asset client_video_id="TWINKLE TWINKLE" duration="122.0" image=""/>
""") """)
self.assert_xml_equal( exported_metadata = api.export_to_xml(
api.export_to_xml(constants.VIDEO_DICT_STAR['edx_video_id'], self.file_system, constants.EXPORT_IMPORT_STATIC_DIR), resource_fs=self.file_system,
expected static_dir=constants.EXPORT_IMPORT_STATIC_DIR,
video_id=constants.VIDEO_DICT_STAR['edx_video_id'],
) )
self.assert_xml_equal(exported_metadata['xml'], expected)
def test_no_video_transcript(self): def test_no_video_transcript(self):
""" """
Verify that transcript export for video with no transcript is working as expected. Verify that transcript export for video with no transcript is working as expected.
...@@ -986,11 +990,12 @@ class ExportTest(TestCase): ...@@ -986,11 +990,12 @@ class ExportTest(TestCase):
<video_asset client_video_id="TWINKLE TWINKLE" duration="122.0" image=""/> <video_asset client_video_id="TWINKLE TWINKLE" duration="122.0" image=""/>
""") """)
exported_xml = api.export_to_xml( exported_metadata = api.export_to_xml(
constants.VIDEO_DICT_STAR['edx_video_id'], constants.VIDEO_DICT_STAR['edx_video_id'],
self.file_system, self.file_system,
constants.EXPORT_IMPORT_STATIC_DIR constants.EXPORT_IMPORT_STATIC_DIR
) )
exported_xml = exported_metadata['xml']
self.assert_xml_equal(exported_xml, expected) self.assert_xml_equal(exported_xml, expected)
# Verify that no transcript is present in the XML. # Verify that no transcript is present in the XML.
...@@ -1011,29 +1016,29 @@ class ExportTest(TestCase): ...@@ -1011,29 +1016,29 @@ class ExportTest(TestCase):
<encoded_video url="http://www.meowmagic.com" file_size="33" bitrate="44" profile="desktop"/> <encoded_video url="http://www.meowmagic.com" file_size="33" bitrate="44" profile="desktop"/>
<encoded_video url="https://www.tmnt.com/tmnt101.m3u8" file_size="100" bitrate="0" profile="hls"/> <encoded_video url="https://www.tmnt.com/tmnt101.m3u8" file_size="100" bitrate="0" profile="hls"/>
<transcripts> <transcripts>
<transcript file_format="sjson" language_code="de" provider="3PlayMedia" /> <transcript file_format="srt" language_code="de" provider="3PlayMedia" />
<transcript file_format="srt" language_code="en" provider="Cielo24" /> <transcript file_format="srt" language_code="en" provider="Cielo24" />
</transcripts> </transcripts>
</video_asset> </video_asset>
""".format(image=image)) """.format(image=image))
self.assert_xml_equal( exported_metadata = api.export_to_xml(
api.export_to_xml( constants.VIDEO_DICT_FISH['edx_video_id'],
constants.VIDEO_DICT_FISH['edx_video_id'], self.file_system,
self.file_system, constants.EXPORT_IMPORT_STATIC_DIR,
constants.EXPORT_IMPORT_STATIC_DIR, course_id
course_id
),
expected
) )
self.assert_xml_equal(exported_metadata['xml'], expected)
self.assertItemsEqual(exported_metadata['transcripts'].keys(), ['en', 'de'])
def test_transcript_export(self): def test_transcript_export(self):
""" """
Test that transcript are exported correctly. Test that transcript are exported correctly.
""" """
language_code = 'en' language_code = 'en'
video_id = constants.VIDEO_DICT_FISH['edx_video_id'] video_id = constants.VIDEO_DICT_FISH['edx_video_id']
transcript_files = {'de': u'super-soaker-de.sjson', 'en': u'super-soaker-en.srt'} transcript_files = {'de': u'super-soaker-de.srt', 'en': u'super-soaker-en.srt'}
expected_transcript_path = combine( expected_transcript_path = combine(
self.temp_dir, self.temp_dir,
combine(constants.EXPORT_IMPORT_COURSE_DIR, constants.EXPORT_IMPORT_STATIC_DIR) combine(constants.EXPORT_IMPORT_COURSE_DIR, constants.EXPORT_IMPORT_STATIC_DIR)
...@@ -1045,16 +1050,21 @@ class ExportTest(TestCase): ...@@ -1045,16 +1050,21 @@ class ExportTest(TestCase):
<encoded_video url="http://www.meowmagic.com" file_size="33" bitrate="44" profile="desktop"/> <encoded_video url="http://www.meowmagic.com" file_size="33" bitrate="44" profile="desktop"/>
<encoded_video url="https://www.tmnt.com/tmnt101.m3u8" file_size="100" bitrate="0" profile="hls"/> <encoded_video url="https://www.tmnt.com/tmnt101.m3u8" file_size="100" bitrate="0" profile="hls"/>
<transcripts> <transcripts>
<transcript file_format="sjson" language_code="de" provider="3PlayMedia" /> <transcript file_format="srt" language_code="de" provider="3PlayMedia" />
<transcript file_format="srt" language_code="en" provider="Cielo24" /> <transcript file_format="srt" language_code="en" provider="Cielo24" />
</transcripts> </transcripts>
</video_asset> </video_asset>
""") """)
exported_xml = api.export_to_xml(video_id, self.file_system, constants.EXPORT_IMPORT_STATIC_DIR, 'test-course') exported_metadata = api.export_to_xml(
video_id=video_id,
course_id='test-course',
resource_fs=self.file_system,
static_dir=constants.EXPORT_IMPORT_STATIC_DIR
)
# Assert video and transcript xml is exported correctly. # Assert video and transcript xml is exported correctly.
self.assert_xml_equal(exported_xml, expected_xml) self.assert_xml_equal(exported_metadata['xml'], expected_xml)
# Verify transcript file is created. # Verify transcript file is created.
self.assertItemsEqual(transcript_files.values(), self.file_system.listdir(constants.EXPORT_IMPORT_STATIC_DIR)) self.assertItemsEqual(transcript_files.values(), self.file_system.listdir(constants.EXPORT_IMPORT_STATIC_DIR))
...@@ -1065,7 +1075,13 @@ class ExportTest(TestCase): ...@@ -1065,7 +1075,13 @@ class ExportTest(TestCase):
open(combine(expected_transcript_path, transcript_files[language_code])) open(combine(expected_transcript_path, transcript_files[language_code]))
).read() ).read()
transcript = api.get_video_transcript_data(video_id=video_id, language_code=language_code) transcript = api.get_video_transcript_data(video_id=video_id, language_code=language_code)
self.assertEqual(transcript['content'], expected_transcript_content) transcript_format = os.path.splitext(transcript['file_name'])[1][1:]
exported_transcript_content = Transcript.convert(
transcript['content'],
input_format=transcript_format,
output_format=Transcript.SRT,
).encode('utf-8')
self.assertEqual(exported_transcript_content, expected_transcript_content)
def test_unknown_video(self): def test_unknown_video(self):
...@@ -1728,20 +1744,18 @@ class ImportTest(TestCase): ...@@ -1728,20 +1744,18 @@ class ImportTest(TestCase):
""" """
language_code = 'en' language_code = 'en'
edx_video_id = constants.VIDEO_DICT_FISH['edx_video_id'] edx_video_id = constants.VIDEO_DICT_FISH['edx_video_id']
# First create transcript file.
# First create non utf-8 encoded transcript file in the file system.
transcript_file_name = 'invalid-transcript.txt' transcript_file_name = 'invalid-transcript.txt'
invalid_transcript = dict( invalid_transcript = dict(
constants.VIDEO_TRANSCRIPT_CUSTOM_SJSON, constants.VIDEO_TRANSCRIPT_CUSTOM_SJSON,
video_id=edx_video_id, video_id=edx_video_id,
file_data=u'Привіт, edX вітає вас.'.encode('cp1251') file_data=u'Привіт, edX вітає вас.'
)
utils.create_file_in_fs(
invalid_transcript['file_data'],
transcript_file_name,
self.file_system,
constants.EXPORT_IMPORT_STATIC_DIR
) )
with self.file_system.open(combine(constants.EXPORT_IMPORT_STATIC_DIR, transcript_file_name), 'wb') as f:
f.write(invalid_transcript['file_data'].encode('cp1251'))
api.import_transcript_from_fs( api.import_transcript_from_fs(
edx_video_id=edx_video_id, edx_video_id=edx_video_id,
language_code=language_code, language_code=language_code,
......
# -*- coding: utf-8 -*-
"""
Tests for transcript utils.
"""
import ddt
import json
import textwrap
import unittest
from edxval.transcript_utils import Transcript
from edxval.exceptions import TranscriptsGenerationException
@ddt.ddt
class TestTranscriptUtils(unittest.TestCase):
    """
    Exercises the `Transcript.convert` utility across the srt and sjson formats.
    """

    def setUp(self):
        """
        Prepare one srt fixture and one sjson fixture describing the same two cues.
        """
        super(TestTranscriptUtils, self).setUp()

        # The fixture lines share a uniform indent so that `textwrap.dedent`
        # strips it completely and leaves the transcript text untouched.
        self.srt_transcript = textwrap.dedent("""\
            0
            00:00:10,500 --> 00:00:13,000
            Elephant&#39;s Dream 大象的梦想
            1
            00:00:15,000 --> 00:00:18,000
            At the left we can see...
            """)

        self.sjson_transcript = textwrap.dedent("""\
            {
            "start": [
            10500,
            15000
            ],
            "end": [
            13000,
            18000
            ],
            "text": [
            "Elephant&#39;s Dream 大象的梦想",
            "At the left we can see..."
            ]
            }
            """)

    @ddt.data(
        ('invalid_input_format', 'sjson'),
        ('sjson', 'invalid_output_format'),
        ('invalid_input_format', 'invalid_output_format')
    )
    @ddt.unpack
    def test_invalid_transcript_format(self, input_format, output_format):
        """
        An unsupported input and/or output format makes the conversion raise `AssertionError`.
        """
        self.assertRaises(
            AssertionError,
            Transcript.convert,
            self.sjson_transcript,
            input_format,
            output_format
        )

    def test_convert_srt_to_srt(self):
        """
        Converting srt to srt hands back the same content, decoded as utf-8.
        """
        wanted = self.srt_transcript.decode('utf-8')
        converted = Transcript.convert(self.srt_transcript, 'srt', 'srt')
        self.assertEqual(converted, wanted)

    def test_convert_sjson_to_srt(self):
        """
        An sjson transcript is converted into the matching srt transcript.
        """
        wanted = self.srt_transcript.decode('utf-8')
        converted = Transcript.convert(self.sjson_transcript, 'sjson', 'srt')
        self.assertEqual(converted, wanted)

    def test_convert_srt_to_sjson(self):
        """
        An srt transcript is converted into the matching sjson transcript.
        """
        wanted = self.sjson_transcript.decode('utf-8')
        converted = Transcript.convert(self.srt_transcript, 'srt', 'sjson')
        # Compare the parsed documents so that json formatting does not matter.
        self.assertDictEqual(json.loads(converted), json.loads(wanted))

    def test_convert_invalid_srt_to_sjson(self):
        """
        Converting unparseable srt content to sjson raises `TranscriptsGenerationException`.
        """
        broken_srt = 'invalid SubRip file content'
        self.assertRaises(
            TranscriptsGenerationException,
            Transcript.convert,
            broken_srt,
            'srt',
            'sjson'
        )
"""
A module containing transcripts utils.
"""
import json
from six import text_type
from pysrt import SubRipFile, SubRipItem, SubRipTime
from pysrt.srtexc import Error
from edxval.exceptions import TranscriptsGenerationException
class Transcript(object):
    """
    Container for transcript conversion helpers.

    Two transcript formats are handled: SubRip (`srt`) and `sjson`, a JSON
    document holding three parallel lists under the keys `start`, `end`
    and `text`.
    """
    SRT = 'srt'
    SJSON = 'sjson'

    @staticmethod
    def generate_sjson_from_srt(srt_subs):
        """
        Generate transcripts from SubRip (*.srt) to sjson.

        Arguments:
            srt_subs(SubRip): "SRT" subs object; each item is read for
                `start.ordinal`, `end.ordinal` and `text`.

        Returns:
            Subs converted to "SJSON" format: a dict with the parallel
            lists `start`, `end` and `text`.
        """
        sub_starts = []
        sub_ends = []
        sub_texts = []

        # A single pass keeps this safe for one-shot iterables.
        for sub in srt_subs:
            sub_starts.append(sub.start.ordinal)
            sub_ends.append(sub.end.ordinal)
            # Each sjson text entry is a single line, so line breaks become spaces.
            sub_texts.append(sub.text.replace('\n', ' '))

        return {
            'start': sub_starts,
            'end': sub_ends,
            'text': sub_texts
        }

    @staticmethod
    def generate_srt_from_sjson(sjson_subs):
        """
        Generate transcripts from sjson to SubRip (*.srt).

        Arguments:
            sjson_subs (dict): `sjson` subs.

        Returns:
            Subtitles in SRT format. An empty string is returned when the
            `start`, `end` and `text` lists differ in length.
        """
        starts = sjson_subs['start']
        ends = sjson_subs['end']
        texts = sjson_subs['text']

        if not len(starts) == len(ends) == len(texts):
            return ''

        pieces = []
        for index, (start, end, text) in enumerate(zip(starts, ends, texts)):
            item = SubRipItem(
                index=index,
                start=SubRipTime(milliseconds=start),
                end=SubRipTime(milliseconds=end),
                text=text
            )
            # `text_type` (six) instead of the Python-2-only `unicode` builtin.
            pieces.append(text_type(item))
            pieces.append('\n')

        return ''.join(pieces)

    @classmethod
    def convert(cls, content, input_format, output_format):
        """
        Convert transcript `content` from `input_format` to `output_format`.

        Arguments:
            content: Transcript content, either a byte-stream (decoded as
                utf-8, skipping a leading byte order mark) or already
                decoded text.
            input_format: Input transcript format.
            output_format: Output transcript format.

        Accepted input formats: sjson, srt.
        Accepted output format: srt, sjson.

        Returns:
            The transcript as text in `output_format`.

        Raises:
            AssertionError: On an unsupported input or output format.
            TranscriptsGenerationException: On parsing the invalid srt
                content during conversion from srt to sjson.

        Note that errors raised by `json.loads` on malformed sjson content
        are not wrapped and propagate unchanged.
        """
        supported_formats = (cls.SRT, cls.SJSON)
        # Raised explicitly rather than via `assert`, so that the validation
        # is still performed when Python runs with optimizations (-O).
        if input_format not in supported_formats:
            raise AssertionError('Unsupported input transcript format: {!r}'.format(input_format))
        if output_format not in supported_formats:
            raise AssertionError('Unsupported output transcript format: {!r}'.format(output_format))

        if isinstance(content, (bytes, bytearray)):
            # Decode the content with utf-8-sig which will also
            # skip byte order mark(BOM) character if found.
            content = content.decode('utf-8-sig')
        elif content.startswith(u'\ufeff'):
            # Already decoded text: only the BOM character needs skipping.
            content = content[1:]

        if input_format == output_format:
            return content

        if input_format == cls.SRT:
            # srt -> sjson
            try:
                # With error handling (set to 'ERROR_RAISE'), we will be getting
                # the exception if something went wrong in parsing the transcript.
                srt_subs = SubRipFile.from_string(content, error_handling=SubRipFile.ERROR_RAISE)
            except Error as ex:  # Base exception from pysrt
                raise TranscriptsGenerationException(text_type(ex))

            return json.dumps(cls.generate_sjson_from_srt(srt_subs))

        # sjson -> srt, the only remaining combination after validation.
        return cls.generate_srt_from_sjson(json.loads(content))
...@@ -192,11 +192,11 @@ def create_file_in_fs(file_data, file_name, file_system, static_dir): ...@@ -192,11 +192,11 @@ def create_file_in_fs(file_data, file_name, file_system, static_dir):
Arguments: Arguments:
file_data (str): Data to store into the file. file_data (str): Data to store into the file.
file_name (str): File name of the file to be created. file_name (str): File name of the file to be created.
resource_fs (OSFS): Import file system. file_system (OSFS): Import file system.
static_dir (str): The Directory to retrieve transcript file. static_dir (str): The Directory to retrieve transcript file.
""" """
with file_system.open(combine(static_dir, file_name), 'wb') as f: with file_system.open(combine(static_dir, file_name), 'wb') as f:
f.write(file_data) f.write(file_data.encode('utf-8'))
def get_transcript_format(transcript_content): def get_transcript_format(transcript_content):
......
...@@ -41,7 +41,7 @@ def load_requirements(*requirements_paths): ...@@ -41,7 +41,7 @@ def load_requirements(*requirements_paths):
setup( setup(
name='edxval', name='edxval',
version='0.1.14', version='0.1.15',
author='edX', author='edX',
url='http://github.com/edx/edx-val', url='http://github.com/edx/edx-val',
description='edx-val', description='edx-val',
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment