mirror of
				https://gitlab.com/ytdl-org/youtube-dl.git
				synced 2025-11-04 06:17:07 -05:00 
			
		
		
		
	[utils] Use bytes-like objects in dfxp2srt
This fixes handling of non-UTF8 TTML subtitles. Closes #14191.
This commit is contained in:
		@@ -1,3 +1,9 @@
 | 
				
			|||||||
 | 
					version <unreleased>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Core
 | 
				
			||||||
 | 
					* [utils] Fix handling raw TTML subtitles (#14191)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
version 2017.09.15
 | 
					version 2017.09.15
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Core
 | 
					Core
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1064,7 +1064,7 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4')
 | 
				
			|||||||
                    <p begin="3" dur="-1">Ignored, three</p>
 | 
					                    <p begin="3" dur="-1">Ignored, three</p>
 | 
				
			||||||
                </div>
 | 
					                </div>
 | 
				
			||||||
            </body>
 | 
					            </body>
 | 
				
			||||||
            </tt>'''
 | 
					            </tt>'''.encode('utf-8')
 | 
				
			||||||
        srt_data = '''1
 | 
					        srt_data = '''1
 | 
				
			||||||
00:00:00,000 --> 00:00:01,000
 | 
					00:00:00,000 --> 00:00:01,000
 | 
				
			||||||
The following line contains Chinese characters and special symbols
 | 
					The following line contains Chinese characters and special symbols
 | 
				
			||||||
@@ -1089,7 +1089,7 @@ Line
 | 
				
			|||||||
                    <p begin="0" end="1">The first line</p>
 | 
					                    <p begin="0" end="1">The first line</p>
 | 
				
			||||||
                </div>
 | 
					                </div>
 | 
				
			||||||
            </body>
 | 
					            </body>
 | 
				
			||||||
            </tt>'''
 | 
					            </tt>'''.encode('utf-8')
 | 
				
			||||||
        srt_data = '''1
 | 
					        srt_data = '''1
 | 
				
			||||||
00:00:00,000 --> 00:00:01,000
 | 
					00:00:00,000 --> 00:00:01,000
 | 
				
			||||||
The first line
 | 
					The first line
 | 
				
			||||||
@@ -1115,7 +1115,7 @@ The first line
 | 
				
			|||||||
      <p style="s1" tts:textDecoration="underline" begin="00:00:09.56" id="p2" end="00:00:12.36"><span style="s2" tts:color="lime">inner<br /> </span>style</p>
 | 
					      <p style="s1" tts:textDecoration="underline" begin="00:00:09.56" id="p2" end="00:00:12.36"><span style="s2" tts:color="lime">inner<br /> </span>style</p>
 | 
				
			||||||
    </div>
 | 
					    </div>
 | 
				
			||||||
  </body>
 | 
					  </body>
 | 
				
			||||||
</tt>'''
 | 
					</tt>'''.encode('utf-8')
 | 
				
			||||||
        srt_data = '''1
 | 
					        srt_data = '''1
 | 
				
			||||||
00:00:02,080 --> 00:00:05,839
 | 
					00:00:02,080 --> 00:00:05,839
 | 
				
			||||||
<font color="white" face="sansSerif" size="16">default style<font color="red">custom style</font></font>
 | 
					<font color="white" face="sansSerif" size="16">default style<font color="red">custom style</font></font>
 | 
				
			||||||
@@ -1138,6 +1138,26 @@ part 3</font></u>
 | 
				
			|||||||
'''
 | 
					'''
 | 
				
			||||||
        self.assertEqual(dfxp2srt(dfxp_data_with_style), srt_data)
 | 
					        self.assertEqual(dfxp2srt(dfxp_data_with_style), srt_data)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        dfxp_data_non_utf8 = '''<?xml version="1.0" encoding="UTF-16"?>
 | 
				
			||||||
 | 
					            <tt xmlns="http://www.w3.org/ns/ttml" xml:lang="en" xmlns:tts="http://www.w3.org/ns/ttml#parameter">
 | 
				
			||||||
 | 
					            <body>
 | 
				
			||||||
 | 
					                <div xml:lang="en">
 | 
				
			||||||
 | 
					                    <p begin="0" end="1">Line 1</p>
 | 
				
			||||||
 | 
					                    <p begin="1" end="2">第二行</p>
 | 
				
			||||||
 | 
					                </div>
 | 
				
			||||||
 | 
					            </body>
 | 
				
			||||||
 | 
					            </tt>'''.encode('utf-16')
 | 
				
			||||||
 | 
					        srt_data = '''1
 | 
				
			||||||
 | 
					00:00:00,000 --> 00:00:01,000
 | 
				
			||||||
 | 
					Line 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					2
 | 
				
			||||||
 | 
					00:00:01,000 --> 00:00:02,000
 | 
				
			||||||
 | 
					第二行
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					'''
 | 
				
			||||||
 | 
					        self.assertEqual(dfxp2srt(dfxp_data_non_utf8), srt_data)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def test_cli_option(self):
 | 
					    def test_cli_option(self):
 | 
				
			||||||
        self.assertEqual(cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy'), ['--proxy', '127.0.0.1:3128'])
 | 
					        self.assertEqual(cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy'), ['--proxy', '127.0.0.1:3128'])
 | 
				
			||||||
        self.assertEqual(cli_option({'proxy': None}, '--proxy', 'proxy'), [])
 | 
					        self.assertEqual(cli_option({'proxy': None}, '--proxy', 'proxy'), [])
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -585,7 +585,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
 | 
				
			|||||||
                dfxp_file = old_file
 | 
					                dfxp_file = old_file
 | 
				
			||||||
                srt_file = subtitles_filename(filename, lang, 'srt')
 | 
					                srt_file = subtitles_filename(filename, lang, 'srt')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                with io.open(dfxp_file, 'rt', encoding='utf-8') as f:
 | 
					                with open(dfxp_file, 'rb') as f:
 | 
				
			||||||
                    srt_data = dfxp2srt(f.read())
 | 
					                    srt_data = dfxp2srt(f.read())
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                with io.open(srt_file, 'wt', encoding='utf-8') as f:
 | 
					                with io.open(srt_file, 'wt', encoding='utf-8') as f:
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -2572,14 +2572,18 @@ def srt_subtitles_timecode(seconds):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def dfxp2srt(dfxp_data):
 | 
					def dfxp2srt(dfxp_data):
 | 
				
			||||||
 | 
					    '''
 | 
				
			||||||
 | 
					    @param dfxp_data A bytes-like object containing DFXP data
 | 
				
			||||||
 | 
					    @returns A unicode object containing converted SRT data
 | 
				
			||||||
 | 
					    '''
 | 
				
			||||||
    LEGACY_NAMESPACES = (
 | 
					    LEGACY_NAMESPACES = (
 | 
				
			||||||
        ('http://www.w3.org/ns/ttml', [
 | 
					        (b'http://www.w3.org/ns/ttml', [
 | 
				
			||||||
            'http://www.w3.org/2004/11/ttaf1',
 | 
					            b'http://www.w3.org/2004/11/ttaf1',
 | 
				
			||||||
            'http://www.w3.org/2006/04/ttaf1',
 | 
					            b'http://www.w3.org/2006/04/ttaf1',
 | 
				
			||||||
            'http://www.w3.org/2006/10/ttaf1',
 | 
					            b'http://www.w3.org/2006/10/ttaf1',
 | 
				
			||||||
        ]),
 | 
					        ]),
 | 
				
			||||||
        ('http://www.w3.org/ns/ttml#styling', [
 | 
					        (b'http://www.w3.org/ns/ttml#styling', [
 | 
				
			||||||
            'http://www.w3.org/ns/ttml#style',
 | 
					            b'http://www.w3.org/ns/ttml#style',
 | 
				
			||||||
        ]),
 | 
					        ]),
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -2674,7 +2678,7 @@ def dfxp2srt(dfxp_data):
 | 
				
			|||||||
        for ns in v:
 | 
					        for ns in v:
 | 
				
			||||||
            dfxp_data = dfxp_data.replace(ns, k)
 | 
					            dfxp_data = dfxp_data.replace(ns, k)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
 | 
					    dfxp = compat_etree_fromstring(dfxp_data)
 | 
				
			||||||
    out = []
 | 
					    out = []
 | 
				
			||||||
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
 | 
					    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user