Merge pull request #6582 from edx/waheed/tnl935-fix-transcript-skip-first-line

Fixed transcript upload skipping the first line when the file begins with a BOM (Byte Order Mark).

Merge pull request #6582 from edx/waheed/tnl935-fix-transcript-skip-first-line
Fixed transcript upload skipping the first line when the file begins with a BOM (Byte Order Mark).
5958ffb3 · Waheed Ahmed · 2db00c57 · 47a37228 · 5958ffb3 · 5958ffb3
Commit 5958ffb3 authored Jan 16, 2015 by Waheed Ahmed
Hide whitespace changes
Inline Side-by-side

Showing with 44 additions and 1 deletion

cms/djangoapps/contentstore/views/tests/test_transcripts.py
+42 -0

cms/djangoapps/contentstore/views/transcripts_ajax.py
+2 -1

No files found.
--- a/cms/djangoapps/contentstore/views/tests/test_transcripts.py
+++ b/cms/djangoapps/contentstore/views/tests/test_transcripts.py
@@ -116,6 +116,8 @@ class TestUploadtranscripts(Basetranscripts):
        """))
        self.bad_name_srt_file.seek(0)
+        self.ufeff_srt_file = tempfile.NamedTemporaryFile(suffix='.srt')
    def test_success_video_module_source_subs_uploading(self):
        self.item.data = textwrap.dedent("""
            <video youtube="">
@@ -296,12 +298,52 @@ class TestUploadtranscripts(Basetranscripts):
        self.assertEqual(resp.status_code, 400)
        self.assertEqual(json.loads(resp.content).get('status'), 'Undefined file extension.')
+    def test_subs_uploading_with_byte_order_mark(self):
+        """
+        Test uploading subs containing BOM(Byte Order Mark), e.g. U+FEFF
+        """
+        filedate = textwrap.dedent("""
+            1
+            00:00:10,500 --> 00:00:13,000
+            Test ufeff characters
+            2
+            00:00:15,000 --> 00:00:18,000
+            At the left we can see...
+        """).encode('utf-8-sig')
+        # Verify that ufeff character is in filedata.
+        self.assertIn("ufeff", filedate)
+        self.ufeff_srt_file.write(filedate)
+        self.ufeff_srt_file.seek(0)
+        link = reverse('upload_transcripts')
+        filename = os.path.splitext(os.path.basename(self.ufeff_srt_file.name))[0]
+        resp = self.client.post(link, {
+            'locator': self.video_usage_key,
+            'transcript-file': self.ufeff_srt_file,
+            'video_list': json.dumps([{
+                'type': 'html5',
+                'video': filename,
+                'mode': 'mp4',
+            }])
+        })
+        self.assertEqual(resp.status_code, 200)
+        content_location = StaticContent.compute_location(
+            self.course.id, 'subs_{0}.srt.sjson'.format(filename))
+        self.assertTrue(contentstore().find(content_location))
+        subs_text = json.loads(contentstore().find(content_location).data).get('text')
+        self.assertIn("Test ufeff characters", subs_text)
    def tearDown(self):
        super(TestUploadtranscripts, self).tearDown()
        self.good_srt_file.close()
        self.bad_data_srt_file.close()
        self.bad_name_srt_file.close()
+        self.ufeff_srt_file.close()
 class TestDownloadtranscripts(Basetranscripts):

--- a/cms/djangoapps/contentstore/views/transcripts_ajax.py
+++ b/cms/djangoapps/contentstore/views/transcripts_ajax.py
@@ -100,7 +100,8 @@ def upload_transcripts(request):
    except ValueError:
        return error_response(response, 'Invalid video_list JSON.')
-    source_subs_filedata = request.FILES['transcript-file'].read().decode('utf8')
+    # Used utf-8-sig encoding type instead of utf-8 to remove BOM(Byte Order Mark), e.g. U+FEFF
+    source_subs_filedata = request.FILES['transcript-file'].read().decode('utf-8-sig')
    source_subs_filename = request.FILES['transcript-file'].name
    if '.' not in source_subs_filename: