Fixed transcripts skipping the first line when the file contains a BOM (Byte Order Mark).

TNL-935

Fixed transcripts skipping the first line when the file contains a BOM (Byte Order Mark).
TNL-935
4ac7b557 · Waheed Ahmed · 5364ccf1 · 4ac7b557 · 4ac7b557 · 4ac7b557
Commit 4ac7b557 authored Jan 23, 2015 by Waheed Ahmed
4 changed files
--- a/cms/djangoapps/contentstore/views/tests/test_transcripts.py
+++ b/cms/djangoapps/contentstore/views/tests/test_transcripts.py
@@ -302,7 +302,7 @@ class TestUploadtranscripts(Basetranscripts):
        """
        Test uploading subs containing BOM(Byte Order Mark), e.g. U+FEFF
        """
-        filedate = textwrap.dedent("""
+        filedata = textwrap.dedent("""
            1
            00:00:10,500 --> 00:00:13,000
            Test ufeff characters
@@ -313,8 +313,8 @@ class TestUploadtranscripts(Basetranscripts):
        """).encode('utf-8-sig')
        # Verify that ufeff character is in filedata.
-        self.assertIn("ufeff", filedate)
+        self.assertIn("ufeff", filedata)
-        self.ufeff_srt_file.write(filedate)
+        self.ufeff_srt_file.write(filedata)
        self.ufeff_srt_file.seek(0)
        link = reverse('upload_transcripts')

--- a/common/lib/xmodule/xmodule/video_module/transcripts_utils.py
+++ b/common/lib/xmodule/xmodule/video_module/transcripts_utils.py
@@ -398,10 +398,11 @@ def generate_sjson_for_all_speeds(item, user_filename, result_subs_dict, lang):
    if not lang:
        lang = item.transcript_language
+    # Decode with utf-8-sig instead of utf-8 so that a leading BOM (Byte Order Mark, U+FEFF) is stripped.
    generate_subs_from_source(
        result_subs_dict,
        os.path.splitext(user_filename)[1][1:],
-        srt_transcripts.data.decode('utf8'),
+        srt_transcripts.data.decode('utf-8-sig'),
        item,
        lang
    )

--- a/common/test/acceptance/tests/video/test_studio_video_editor.py
+++ b/common/test/acceptance/tests/video/test_studio_video_editor.py
@@ -489,3 +489,23 @@ class VideoEditorTest(CMSVideoBaseTest):
        self.assertIn(unicode_text, self.video.captions_text)
        self.assertEqual(self.video.caption_languages.keys(), [u'table', u'uk'])
        self.assertEqual(self.video.caption_languages.keys()[0], 'table')
+    def test_upload_transcript_with_BOM(self):
+        """
+        Scenario: User can upload transcript file with BOM(Byte Order Mark) in it.
+        Given I have created a Video component
+        And I edit the component
+        And I open tab "Advanced"
+        And I upload transcript file "chinese_transcripts_with_BOM.srt" for "zh" language code
+        And I save changes
+        Then when I view the video it does show the captions
+        And I see "莎拉·佩林 (Sarah Palin)" text in the captions
+        """
+        self._create_video_component()
+        self.edit_component()
+        self.open_advanced_tab()
+        self.video.upload_translation('chinese_transcripts_with_BOM.srt', 'zh')
+        self.save_unit_settings()
+        self.assertTrue(self.video.is_captions_visible())
+        unicode_text = "莎拉·佩林 (Sarah Palin)".decode('utf-8')
+        self.assertIn(unicode_text, self.video.captions_lines())
--- a/common/test/data/uploads/chinese_transcripts_with_BOM.srt
+++ b/common/test/data/uploads/chinese_transcripts_with_BOM.srt
+1
+00:00:16,850 --> 00:00:23,850
+莎拉·佩林 (Sarah Palin) 的著作《我行我素》被乔纳森·拉班(Jonathan Raban) 评论为“400页对高尚无知的赞美”
+2
+00:00:24,040 --> 00:00:30,680
+他是什么意思呢？拉班所指的那种思想
+3
+00:00:30,680 --> 00:00:35,660
+可以用“我不太懂艺术 但我知道我喜欢什么”做比喻
+4
+00:00:35,660 --> 00:00:42,410
+他将其描述为“常识性保守派”
+5
+00:00:42,410 --> 00:00:47,510
+即占据道德制高点的外行人能比专家更好地评价 比方说