Unverified Commit 04dd6fd9 by Mushtaq Ali Committed by GitHub

Merge pull request #130 from edx/mushtaq/import-transcripts

Import video transcripts
parents 6469fc26 b8a64a57
......@@ -8,6 +8,8 @@ from enum import Enum
from uuid import uuid4
from django.core.exceptions import ObjectDoesNotExist, ValidationError
from django.core.files import File
from fs.path import combine
from lxml import etree
from lxml.etree import Element, SubElement
......@@ -20,7 +22,7 @@ from edxval.models import (CourseVideo, EncodedVideo, Profile,
TranscriptProviderType, Video, VideoImage,
VideoTranscript, ThirdPartyTranscriptCredentialsState)
from edxval.serializers import TranscriptPreferenceSerializer, TranscriptSerializer, VideoSerializer
from edxval.utils import THIRD_PARTY_TRANSCRIPTION_PLANS
from edxval.utils import THIRD_PARTY_TRANSCRIPTION_PLANS, create_file_in_fs
logger = logging.getLogger(__name__) # pylint: disable=C0103
......@@ -789,7 +791,7 @@ def export_to_xml(video_id, resource_fs, static_dir, course_id=None):
video_id (str): Video id of the video to export transcripts.
course_id (str): The ID of the course with which this video is associated.
static_dir (str): The Directory to store transcript file.
resource_fs (OSFS): The file system to store transcripts.
resource_fs (OSFS): Export file system.
Returns:
An lxml video_asset element containing export data
......@@ -837,22 +839,15 @@ def create_trancript_file(video_id, language_code, file_format, resource_fs, sta
static_dir (str): The Directory to store transcript file.
resource_fs (OSFS): The file system to store transcripts.
"""
transcript_name = u'{static_dir}/{video_id}-{language_code}.{file_format}'.format(
static_dir=static_dir,
transcript_name = u'{video_id}-{language_code}.{file_format}'.format(
video_id=video_id,
language_code=language_code,
file_format=file_format
)
try:
transcript_data = get_video_transcript_data(video_id, language_code)
if transcript_data:
transcript_content = transcript_data['content']
with resource_fs.open(transcript_name, 'wb') as f:
f.write(transcript_content)
except Exception:
# Do not raise exception in case no transcript file is found for now.
# TODO: Remove this - EDUCATOR-2173
pass
create_file_in_fs(transcript_content, transcript_name, resource_fs, static_dir)
def create_transcripts_xml(video_id, video_el, resource_fs, static_dir):
......@@ -886,7 +881,6 @@ def create_transcripts_xml(video_id, video_el, resource_fs, static_dir):
transcripts_el,
'transcript',
{
'file_name': video_transcript.transcript.name,
'language_code': language_code,
'file_format': file_format,
'provider': video_transcript.provider,
......@@ -897,7 +891,7 @@ def create_transcripts_xml(video_id, video_el, resource_fs, static_dir):
return video_el
def import_from_xml(xml, edx_video_id, course_id=None):
def import_from_xml(xml, edx_video_id, resource_fs, static_dir, course_id=None):
"""
Imports data from a video_asset element about the given video_id.
......@@ -907,6 +901,8 @@ def import_from_xml(xml, edx_video_id, course_id=None):
Arguments:
xml (Element): An lxml video_asset element containing import data
edx_video_id (str): val video id
resource_fs (OSFS): Import file system.
static_dir (str): The Directory to retrieve transcript file.
course_id (str): The ID of a course to associate the video with
Raises:
......@@ -915,7 +911,7 @@ def import_from_xml(xml, edx_video_id, course_id=None):
if xml.tag != 'video_asset':
raise ValCannotCreateError('Invalid XML')
# TODO this will be moved as a part of EDUCATOR-2173
# TODO this will be moved as a part of EDUCATOR-2403
if not edx_video_id:
return
......@@ -968,26 +964,43 @@ def import_from_xml(xml, edx_video_id, course_id=None):
'bitrate': encoded_video_el.get('bitrate'),
})
create_video(data)
create_transcript_objects(xml)
create_transcript_objects(xml, edx_video_id, resource_fs, static_dir)
def create_transcript_objects(xml, edx_video_id, resource_fs, static_dir):
    """
    Create VideoTranscript objects.

    For every `transcript` element found under a `transcripts` element, a
    transcript record is created from the matching file in the import file
    system, unless transcript data already exists for that video and language.

    Arguments:
        xml (Element): lxml Element object.
        edx_video_id (str): Video id of the video.
        resource_fs (OSFS): Import file system.
        static_dir (str): The Directory to retrieve transcript file.
    """
    for transcript in xml.findall('.//transcripts/transcript'):
        try:
            file_format = transcript.attrib['file_format']
            language_code = transcript.attrib['language_code']
            transcript_data = get_video_transcript_data(edx_video_id, language_code)

            # Only import when no transcript data exists yet for this video/language.
            if not transcript_data:
                transcript_file_name = u'{edx_video_id}-{language_code}.{file_format}'.format(
                    edx_video_id=edx_video_id,
                    language_code=language_code,
                    file_format=file_format
                )

                # Read the file from the import file system. The context manager
                # guarantees the handle is closed once the record has been
                # created (or if creation fails), instead of leaking it.
                with resource_fs.open(combine(static_dir, transcript_file_name)) as transcript_file:
                    # Create transcript record with the file attached.
                    create_video_transcript(
                        video_id=edx_video_id,
                        language_code=language_code,
                        file_format=file_format,
                        content=File(transcript_file),
                        provider=transcript.attrib['provider']
                    )
        except KeyError:
            # A transcript element lacking a required attribute is logged and skipped.
            logger.warning(
                "VAL: Required attributes are missing from xml, xml=[%s]",
                etree.tostring(transcript).strip()
            )
......@@ -12,6 +12,9 @@ from edxval.models import (
)
EDX_VIDEO_ID = "itchyjacket"
EXPORT_IMPORT_STATIC_DIR = u'static'
"""
Generic Profiles for manually creating profile objects
"""
......@@ -363,12 +366,34 @@ VIDEO_DICT_UPDATE_ANIMAL = dict(
encoded_videos=[],
)
# Raw transcript payloads keyed by fixture name, used as `file_data` in the
# transcript fixtures in this module: "overwatch" and "flash" are SRT-style
# text (cue index, time range, caption text); "wow" is an SJSON-style dict of
# parallel start/end/text lists.
TRANSCRIPT_DATA = {
"overwatch": """
1
00:00:14,370 --> 00:00:16,530
I am overwatch.
2
00:00:16,500 --> 00:00:18,600
可以用“我不太懂艺术 但我知道我喜欢什么”做比喻.""",
"flash": """
1
00:00:07,180 --> 00:00:08,460
This is Flash line 1.""",
"wow": {
"start": [10],
"end": [100],
"text": ["Hi, welcome to edxval."],
}
}
# Fixture describing a Cielo24-provided SRT transcript for the 'super-soaker'
# video; `file_data` carries the raw transcript content (the "flash" entry of
# TRANSCRIPT_DATA) alongside the transcript file path.
VIDEO_TRANSCRIPT_CIELO24 = dict(
video_id='super-soaker',
language_code='en',
transcript='edxval/tests/data/The_Flash.srt',
provider=TranscriptProviderType.CIELO24,
file_format=TranscriptFormat.SRT,
file_data=TRANSCRIPT_DATA['flash']
)
VIDEO_TRANSCRIPT_3PLAY = dict(
......@@ -377,6 +402,7 @@ VIDEO_TRANSCRIPT_3PLAY = dict(
transcript='edxval/tests/data/wow.sjson',
provider=TranscriptProviderType.THREE_PLAY_MEDIA,
file_format=TranscriptFormat.SJSON,
file_data=TRANSCRIPT_DATA['wow']
)
TRANSCRIPT_PREFERENCES_CIELO24 = dict(
......
{
"start": [10],
"end": [100],
"text": ["Hi, welcome to edxval."],
"text": ["Hi, welcome to edxval."]
}
......@@ -813,6 +813,7 @@ class VideoTranscriptViewTest(APIAuthTestCase):
Tests POSTing transcript successfully.
"""
post_transcript_data = dict(self.transcript_data)
post_transcript_data.pop('file_data')
post_transcript_data['name'] = post_transcript_data.pop('transcript')
response = self.client.post(self.url, post_transcript_data, format='json')
......
......@@ -4,6 +4,8 @@ Util methods to be used in api and models.
from django.conf import settings
from django.core.files.storage import get_storage_class
from fs.path import combine
# 3rd Party Transcription Plans
THIRD_PARTY_TRANSCRIPTION_PLANS = {
......@@ -169,3 +171,17 @@ def get_video_transcript_storage():
# during edx-platform loading this method gets called but settings are not ready yet
# so in that case we will return default(FileSystemStorage) storage class instance
return get_storage_class()()
def create_file_in_fs(file_data, file_name, file_system, static_dir):
    """
    Writes file in specific file system.

    The file is opened in binary write mode at `static_dir` combined with
    `file_name`, and `file_data` is written to it.

    Arguments:
        file_data (str): Data to store into the file.
        file_name (str): File name of the file to be created.
        file_system (OSFS): File system in which the file is created.
        static_dir (str): The Directory, within `file_system`, to store the file in.
    """
    with file_system.open(combine(static_dir, file_name), 'wb') as f:
        f.write(file_data)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment