Commit 0bbb8afd by Qubad786

Export transcripts metadata along with xml

 - Transcript files are exported into course OLX in .srt format.
 - Transcript language to filename maps is returned with xml, so that, it can be used by platform in old metadata fields for backward compatiblilty.
 - Add/fix tests

bump VAL version
parent f7a077bd
......@@ -3,6 +3,7 @@
"""
The internal API for VAL.
"""
import os
import logging
from enum import Enum
from uuid import uuid4
......@@ -17,15 +18,35 @@ from lxml import etree
from lxml.etree import Element, SubElement
from pysrt.srtexc import Error
from edxval.exceptions import (InvalidTranscriptFormat,
InvalidTranscriptProvider, ValCannotCreateError,
ValCannotUpdateError, ValInternalError,
ValVideoNotFoundError)
from edxval.models import (CourseVideo, EncodedVideo, Profile, TranscriptPreference,
TranscriptProviderType, Video, VideoImage,
VideoTranscript, ThirdPartyTranscriptCredentialsState)
from edxval.exceptions import (
InvalidTranscriptFormat,
TranscriptsGenerationException,
InvalidTranscriptProvider,
ValCannotCreateError,
ValCannotUpdateError,
ValInternalError,
ValVideoNotFoundError,
)
from edxval.models import (
CourseVideo,
EncodedVideo,
Profile,
TranscriptPreference,
TranscriptProviderType,
Video,
VideoImage,
VideoTranscript,
ThirdPartyTranscriptCredentialsState,
)
from edxval.serializers import TranscriptPreferenceSerializer, TranscriptSerializer, VideoSerializer
from edxval.utils import TranscriptFormat, THIRD_PARTY_TRANSCRIPTION_PLANS, create_file_in_fs, get_transcript_format
from edxval.utils import (
TranscriptFormat,
THIRD_PARTY_TRANSCRIPTION_PLANS,
create_file_in_fs,
get_transcript_format,
)
from edxval.transcript_utils import Transcript
logger = logging.getLogger(__name__) # pylint: disable=C0103
......@@ -829,6 +850,7 @@ def export_to_xml(video_id, resource_fs, static_dir, course_id=None):
for name in ['profile', 'url', 'file_size', 'bitrate']
}
)
return create_transcripts_xml(video_id, video_el, resource_fs, static_dir)
......@@ -843,21 +865,26 @@ def create_transcript_file(video_id, language_code, file_format, resource_fs, st
static_dir (str): The Directory to store transcript file.
resource_fs (SubFS): The file system to store transcripts.
"""
transcript_name = u'{video_id}-{language_code}.{file_format}'.format(
transcript_filename = '{video_id}-{language_code}.srt'.format(
video_id=video_id,
language_code=language_code,
file_format=file_format
language_code=language_code
)
transcript_data = get_video_transcript_data(video_id, language_code)
if transcript_data:
transcript_content = transcript_data['content']
create_file_in_fs(transcript_content, transcript_name, resource_fs, static_dir)
transcript_content = Transcript.convert(
transcript_data['content'],
input_format=file_format,
output_format=Transcript.SRT
)
create_file_in_fs(transcript_content, transcript_filename, resource_fs, static_dir)
return transcript_filename
def create_transcripts_xml(video_id, video_el, resource_fs, static_dir):
"""
Creates xml for transcripts.
For each transcript elment, an associated transcript file is also created in course OLX.
For each transcript element, an associated transcript file is also created in course OLX.
Arguments:
video_id (str): Video id of the video.
......@@ -873,32 +900,36 @@ def create_transcripts_xml(video_id, video_el, resource_fs, static_dir):
if video_transcripts.exists():
transcripts_el = SubElement(video_el, 'transcripts')
exported_language_codes = []
transcript_files_map = {}
for video_transcript in video_transcripts:
if video_transcript.language_code not in exported_language_codes:
language_code = video_transcript.language_code
file_format = video_transcript.file_format
language_code = video_transcript.language_code
file_format = video_transcript.file_format
create_transcript_file(
video_id,
language_code,
file_format,
resource_fs.delegate_fs(),
combine(u'course', static_dir) # File system should not start from /draft directory.
try:
transcript_filename = create_transcript_file(
video_id=video_id,
language_code=language_code,
file_format=file_format,
resource_fs=resource_fs.delegate_fs(),
static_dir=combine(u'course', static_dir) # File system should not start from /draft directory.
)
transcript_files_map[language_code] = transcript_filename
except TranscriptsGenerationException:
# we don't want to halt export in this case, just log and move to the next transcript.
logger.exception('[VAL] Error while generating "%s" transcript for video["%s"].', language_code, video_id)
continue
SubElement(
transcripts_el,
'transcript',
{
'language_code': language_code,
'file_format': file_format,
'provider': video_transcript.provider,
}
)
exported_language_codes.append(video_transcript.language_code)
SubElement(
transcripts_el,
'transcript',
{
'language_code': language_code,
'file_format': Transcript.SRT,
'provider': video_transcript.provider,
}
)
return video_el
return dict(xml=video_el, transcripts=transcript_files_map)
def import_from_xml(xml, edx_video_id, resource_fs, static_dir, external_transcripts=dict(), course_id=None):
......@@ -1033,7 +1064,6 @@ def import_transcript_from_fs(edx_video_id, language_code, file_name, provider,
)
return
# Get file format from transcript content.
try:
file_format = get_transcript_format(file_content)
......
......@@ -62,3 +62,10 @@ class InvalidTranscriptProvider(ValError):
This error is raised when an transcript provider is not supported
"""
pass
class TranscriptsGenerationException(ValError):
"""
This error is raised when a transcript content is not parse-able in specified format.
"""
pass
......@@ -35,6 +35,7 @@ from edxval.models import (LIST_MAX_ITEMS, CourseVideo, EncodedVideo, Profile,
VideoTranscript)
from edxval.serializers import VideoSerializer
from edxval.tests import APIAuthTestCase, constants
from edxval.transcript_utils import Transcript
def omit_attrs(dict, attrs_to_omit=[]):
......@@ -973,11 +974,14 @@ class ExportTest(TestCase):
expected = self.parse_xml("""
<video_asset client_video_id="TWINKLE TWINKLE" duration="122.0" image=""/>
""")
self.assert_xml_equal(
api.export_to_xml(constants.VIDEO_DICT_STAR['edx_video_id'], self.file_system, constants.EXPORT_IMPORT_STATIC_DIR),
expected
exported_metadata = api.export_to_xml(
resource_fs=self.file_system,
static_dir=constants.EXPORT_IMPORT_STATIC_DIR,
video_id=constants.VIDEO_DICT_STAR['edx_video_id'],
)
self.assert_xml_equal(exported_metadata['xml'], expected)
def test_no_video_transcript(self):
"""
Verify that transcript export for video with no transcript is working as expected.
......@@ -986,11 +990,12 @@ class ExportTest(TestCase):
<video_asset client_video_id="TWINKLE TWINKLE" duration="122.0" image=""/>
""")
exported_xml = api.export_to_xml(
exported_metadata = api.export_to_xml(
constants.VIDEO_DICT_STAR['edx_video_id'],
self.file_system,
constants.EXPORT_IMPORT_STATIC_DIR
)
exported_xml = exported_metadata['xml']
self.assert_xml_equal(exported_xml, expected)
# Verify that no transcript is present in the XML.
......@@ -1011,29 +1016,29 @@ class ExportTest(TestCase):
<encoded_video url="http://www.meowmagic.com" file_size="33" bitrate="44" profile="desktop"/>
<encoded_video url="https://www.tmnt.com/tmnt101.m3u8" file_size="100" bitrate="0" profile="hls"/>
<transcripts>
<transcript file_format="sjson" language_code="de" provider="3PlayMedia" />
<transcript file_format="srt" language_code="de" provider="3PlayMedia" />
<transcript file_format="srt" language_code="en" provider="Cielo24" />
</transcripts>
</video_asset>
""".format(image=image))
self.assert_xml_equal(
api.export_to_xml(
constants.VIDEO_DICT_FISH['edx_video_id'],
self.file_system,
constants.EXPORT_IMPORT_STATIC_DIR,
course_id
),
expected
exported_metadata = api.export_to_xml(
constants.VIDEO_DICT_FISH['edx_video_id'],
self.file_system,
constants.EXPORT_IMPORT_STATIC_DIR,
course_id
)
self.assert_xml_equal(exported_metadata['xml'], expected)
self.assertItemsEqual(exported_metadata['transcripts'].keys(), ['en', 'de'])
def test_transcript_export(self):
"""
Test that transcript are exported correctly.
"""
language_code = 'en'
video_id = constants.VIDEO_DICT_FISH['edx_video_id']
transcript_files = {'de': u'super-soaker-de.sjson', 'en': u'super-soaker-en.srt'}
transcript_files = {'de': u'super-soaker-de.srt', 'en': u'super-soaker-en.srt'}
expected_transcript_path = combine(
self.temp_dir,
combine(constants.EXPORT_IMPORT_COURSE_DIR, constants.EXPORT_IMPORT_STATIC_DIR)
......@@ -1045,16 +1050,21 @@ class ExportTest(TestCase):
<encoded_video url="http://www.meowmagic.com" file_size="33" bitrate="44" profile="desktop"/>
<encoded_video url="https://www.tmnt.com/tmnt101.m3u8" file_size="100" bitrate="0" profile="hls"/>
<transcripts>
<transcript file_format="sjson" language_code="de" provider="3PlayMedia" />
<transcript file_format="srt" language_code="de" provider="3PlayMedia" />
<transcript file_format="srt" language_code="en" provider="Cielo24" />
</transcripts>
</video_asset>
""")
exported_xml = api.export_to_xml(video_id, self.file_system, constants.EXPORT_IMPORT_STATIC_DIR, 'test-course')
exported_metadata = api.export_to_xml(
video_id=video_id,
course_id='test-course',
resource_fs=self.file_system,
static_dir=constants.EXPORT_IMPORT_STATIC_DIR
)
# Assert video and transcript xml is exported correctly.
self.assert_xml_equal(exported_xml, expected_xml)
self.assert_xml_equal(exported_metadata['xml'], expected_xml)
# Verify transcript file is created.
self.assertItemsEqual(transcript_files.values(), self.file_system.listdir(constants.EXPORT_IMPORT_STATIC_DIR))
......@@ -1065,7 +1075,13 @@ class ExportTest(TestCase):
open(combine(expected_transcript_path, transcript_files[language_code]))
).read()
transcript = api.get_video_transcript_data(video_id=video_id, language_code=language_code)
self.assertEqual(transcript['content'], expected_transcript_content)
transcript_format = os.path.splitext(transcript['file_name'])[1][1:]
exported_transcript_content = Transcript.convert(
transcript['content'],
input_format=transcript_format,
output_format=Transcript.SRT,
).encode('utf-8')
self.assertEqual(exported_transcript_content, expected_transcript_content)
def test_unknown_video(self):
......@@ -1728,20 +1744,18 @@ class ImportTest(TestCase):
"""
language_code = 'en'
edx_video_id = constants.VIDEO_DICT_FISH['edx_video_id']
# First create transcript file.
# First create non utf-8 encoded transcript file in the file system.
transcript_file_name = 'invalid-transcript.txt'
invalid_transcript = dict(
constants.VIDEO_TRANSCRIPT_CUSTOM_SJSON,
video_id=edx_video_id,
file_data=u'Привіт, edX вітає вас.'.encode('cp1251')
)
utils.create_file_in_fs(
invalid_transcript['file_data'],
transcript_file_name,
self.file_system,
constants.EXPORT_IMPORT_STATIC_DIR
file_data=u'Привіт, edX вітає вас.'
)
with self.file_system.open(combine(constants.EXPORT_IMPORT_STATIC_DIR, transcript_file_name), 'wb') as f:
f.write(invalid_transcript['file_data'].encode('cp1251'))
api.import_transcript_from_fs(
edx_video_id=edx_video_id,
language_code=language_code,
......
# -*- coding: utf-8 -*-
"""
Tests for transcript utils.
"""
import ddt
import json
import textwrap
import unittest
from edxval.transcript_utils import Transcript
from edxval.exceptions import TranscriptsGenerationException
@ddt.ddt
class TestTranscriptUtils(unittest.TestCase):
"""
Tests transcripts conversion util.
"""
def setUp(self):
super(TestTranscriptUtils, self).setUp()
self.srt_transcript = textwrap.dedent("""\
0
00:00:10,500 --> 00:00:13,000
Elephant&#39;s Dream 大象的梦想
1
00:00:15,000 --> 00:00:18,000
At the left we can see...
""")
self.sjson_transcript = textwrap.dedent("""\
{
"start": [
10500,
15000
],
"end": [
13000,
18000
],
"text": [
"Elephant&#39;s Dream 大象的梦想",
"At the left we can see..."
]
}
""")
@ddt.data(
('invalid_input_format', 'sjson'),
('sjson', 'invalid_output_format'),
('invalid_input_format', 'invalid_output_format')
)
@ddt.unpack
def test_invalid_transcript_format(self, input_format, output_format):
"""
Tests that transcript conversion raises `AssertionError` on invalid input/output formats.
"""
with self.assertRaises(AssertionError):
Transcript.convert(self.sjson_transcript, input_format, output_format)
def test_convert_srt_to_srt(self):
"""
Tests that srt to srt conversion works as expected.
"""
expected = self.srt_transcript.decode('utf-8')
actual = Transcript.convert(self.srt_transcript, 'srt', 'srt')
self.assertEqual(actual, expected)
def test_convert_sjson_to_srt(self):
"""
Tests that the sjson transcript is successfully converted into srt format.
"""
expected = self.srt_transcript.decode('utf-8')
actual = Transcript.convert(self.sjson_transcript, 'sjson', 'srt')
self.assertEqual(actual, expected)
def test_convert_srt_to_sjson(self):
"""
Tests that the srt transcript is successfully converted into sjson format.
"""
expected = self.sjson_transcript.decode('utf-8')
actual = Transcript.convert(self.srt_transcript, 'srt', 'sjson')
self.assertDictEqual(json.loads(actual), json.loads(expected))
def test_convert_invalid_srt_to_sjson(self):
"""
Tests that TranscriptsGenerationException was raises on trying
to convert invalid srt transcript to sjson.
"""
invalid_srt_transcript = 'invalid SubRip file content'
with self.assertRaises(TranscriptsGenerationException):
Transcript.convert(invalid_srt_transcript, 'srt', 'sjson')
"""
A module containing transcripts utils.
"""
import json
from six import text_type
from pysrt import SubRipFile, SubRipItem, SubRipTime
from pysrt.srtexc import Error
from edxval.exceptions import TranscriptsGenerationException
class Transcript(object):
"""
Container for transcript methods.
"""
SRT = 'srt'
SJSON = 'sjson'
@staticmethod
def generate_sjson_from_srt(srt_subs):
"""
Generate transcripts from sjson to SubRip (*.srt).
Arguments:
srt_subs(SubRip): "SRT" subs object
Returns:
Subs converted to "SJSON" format.
"""
sub_starts = []
sub_ends = []
sub_texts = []
for sub in srt_subs:
sub_starts.append(sub.start.ordinal)
sub_ends.append(sub.end.ordinal)
sub_texts.append(sub.text.replace('\n', ' '))
sjson_subs = {
'start': sub_starts,
'end': sub_ends,
'text': sub_texts
}
return sjson_subs
@staticmethod
def generate_srt_from_sjson(sjson_subs):
"""
Generate transcripts from sjson to SubRip (*.srt).
Arguments:
sjson_subs (dict): `sjson` subs.
Returns:
Subtitles in SRT format.
"""
output = ''
equal_len = len(sjson_subs['start']) == len(sjson_subs['end']) == len(sjson_subs['text'])
if not equal_len:
return output
for i in range(len(sjson_subs['start'])):
item = SubRipItem(
index=i,
start=SubRipTime(milliseconds=sjson_subs['start'][i]),
end=SubRipTime(milliseconds=sjson_subs['end'][i]),
text=sjson_subs['text'][i]
)
output += (unicode(item))
output += '\n'
return output
@classmethod
def convert(cls, content, input_format, output_format):
"""
Convert transcript `content` from `input_format` to `output_format`.
Arguments:
content: Transcript content byte-stream.
input_format: Input transcript format.
output_format: Output transcript format.
Accepted input formats: sjson, srt.
Accepted output format: srt, sjson.
Raises:
TranscriptsGenerationException: On parsing the invalid srt
content during conversion from srt to sjson.
"""
assert input_format in ('srt', 'sjson')
assert output_format in ('srt', 'sjson')
# Decode the content with utf-8-sig which will also
# skip byte order mark(BOM) character if found.
content = content.decode('utf-8-sig')
if input_format == output_format:
return content
if input_format == 'srt':
if output_format == 'sjson':
try:
# With error handling (set to 'ERROR_RAISE'), we will be getting
# the exception if something went wrong in parsing the transcript.
srt_subs = SubRipFile.from_string(content, error_handling=SubRipFile.ERROR_RAISE)
except Error as ex: # Base exception from pysrt
raise TranscriptsGenerationException(text_type(ex))
return json.dumps(cls.generate_sjson_from_srt(srt_subs))
if input_format == 'sjson':
if output_format == 'srt':
return cls.generate_srt_from_sjson(json.loads(content))
......@@ -192,11 +192,11 @@ def create_file_in_fs(file_data, file_name, file_system, static_dir):
Arguments:
file_data (str): Data to store into the file.
file_name (str): File name of the file to be created.
resource_fs (OSFS): Import file system.
file_system (OSFS): Import file system.
static_dir (str): The Directory to retrieve transcript file.
"""
with file_system.open(combine(static_dir, file_name), 'wb') as f:
f.write(file_data)
f.write(file_data.encode('utf-8'))
def get_transcript_format(transcript_content):
......
......@@ -41,7 +41,7 @@ def load_requirements(*requirements_paths):
setup(
name='edxval',
version='0.1.14',
version='0.1.15',
author='edX',
url='http://github.com/edx/edx-val',
description='edx-val',
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment