release 2016.02.04

[test_subtitles] update youtube subtitles tests
[youtube] fix subtitle order
2026-04-25 00:00:04 -04:00 · 2016-02-04 13:39:26 +01:00 · 2016-02-04 08:50:55 +01:00 · 2016-02-04 08:39:01 +01:00 · 2016-02-04 08:28:37 +01:00 · 2016-02-04 01:25:36 +01:00
14 changed files with 114 additions and 41 deletions
@@ -455,6 +455,8 @@ The `-o` option allows users to indicate a template for the output file names. T
 - `format_id`: The sequence will be replaced by the format code specified by `--format`.
 - `duration`: The sequence will be replaced by the length of the video in seconds.

+Note that some of the aforementioned sequences are not guaranteed to be present, since they depend on the metadata obtained by a particular extractor. Such sequences will be replaced with `NA`.
+
 The current default template is `%(title)s-%(id)s.%(ext)s`.

 In some cases, you don't want special characters such as 中, spaces, or &, such as when transferring the downloaded filename to a Windows system or the filename through an 8bit-unsafe channel. In these cases, add the `--restrict-filenames` flag to get a shorter title:
@@ -91,6 +91,7 @@
 - **Canvas**
 - **CBS**
 - **CBSNews**: CBS News
+ - **CBSNewsLiveVideo**: CBS News Live Videos
 - **CBSSports**
 - **CeskaTelevize**
 - **channel9**: Channel 9
@@ -248,6 +248,17 @@ class TestFormatSelection(unittest.TestCase):

        def format_info(f_id):
            info = YoutubeIE._formats[f_id].copy()
+
+            # XXX: In real cases InfoExtractor._parse_mpd() fills up 'acodec'
+            # and 'vcodec', while in tests such information is incomplete since
+            # commit a6c2c24479e5f4827ceb06f64d855329c0a6f593
+            # test_YoutubeDL.test_youtube_format_selection is broken without
+            # this fix
+            if 'acodec' in info and 'vcodec' not in info:
+                info['vcodec'] = 'none'
+            elif 'vcodec' in info and 'acodec' not in info:
+                info['acodec'] = 'none'
+
            info['format_id'] = f_id
            info['url'] = 'url:' + f_id
            return info
@@ -65,16 +65,16 @@ class TestYoutubeSubtitles(BaseTestSubtitles):
        self.DL.params['allsubtitles'] = True
        subtitles = self.getSubtitles()
        self.assertEqual(len(subtitles.keys()), 13)
-        self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260')
-        self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d')
-        for lang in ['it', 'fr', 'de']:
+        self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06')
+        self.assertEqual(md5(subtitles['it']), '6d752b98c31f1cf8d597050c7a2cb4b5')
+        for lang in ['fr', 'de']:
            self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang)

-    def test_youtube_subtitles_sbv_format(self):
+    def test_youtube_subtitles_ttml_format(self):
        self.DL.params['writesubtitles'] = True
-        self.DL.params['subtitlesformat'] = 'sbv'
+        self.DL.params['subtitlesformat'] = 'ttml'
        subtitles = self.getSubtitles()
-        self.assertEqual(md5(subtitles['en']), '13aeaa0c245a8bed9a451cb643e3ad8b')
+        self.assertEqual(md5(subtitles['en']), 'e306f8c42842f723447d9f63ad65df54')

    def test_youtube_subtitles_vtt_format(self):
        self.DL.params['writesubtitles'] = True
@@ -90,7 +90,10 @@ from .canalplus import CanalplusIE
 from .canalc2 import Canalc2IE
 from .canvas import CanvasIE
 from .cbs import CBSIE
-from .cbsnews import CBSNewsIE
+from .cbsnews import (
+    CBSNewsIE,
+    CBSNewsLiveVideoIE,
+)
 from .cbssports import CBSSportsIE
 from .ccc import CCCIE
 from .ceskatelevize import CeskaTelevizeIE
@@ -1,15 +1,14 @@
 # encoding: utf-8
 from __future__ import unicode_literals

-import re
-import json
-
+from .common import InfoExtractor
 from .theplatform import ThePlatformIE
+from ..utils import parse_duration


 class CBSNewsIE(ThePlatformIE):
    IE_DESC = 'CBS News'
-    _VALID_URL = r'http://(?:www\.)?cbsnews\.com/(?:[^/]+/)+(?P<id>[\da-z_-]+)'
+    _VALID_URL = r'http://(?:www\.)?cbsnews\.com/(?:news|videos)/(?P<id>[\da-z_-]+)'

    _TESTS = [
        {
@@ -48,14 +47,13 @@ class CBSNewsIE(ThePlatformIE):
    ]

    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

-        video_info = json.loads(self._html_search_regex(
+        video_info = self._parse_json(self._html_search_regex(
            r'(?:<ul class="media-list items" id="media-related-items"><li data-video-info|<div id="cbsNewsVideoPlayer" data-video-player-options)=\'({.+?})\'',
-            webpage, 'video JSON info'))
+            webpage, 'video JSON info'), video_id)

        item = video_info['item'] if 'item' in video_info else video_info
        title = item.get('articleTitle') or item.get('hed')
@@ -88,3 +86,41 @@ class CBSNewsIE(ThePlatformIE):
            'formats': formats,
            'subtitles': subtitles,
        }
+
+
+class CBSNewsLiveVideoIE(InfoExtractor):
+    IE_DESC = 'CBS News Live Videos'
+    _VALID_URL = r'http://(?:www\.)?cbsnews\.com/live/video/(?P<id>[\da-z_-]+)'
+
+    _TEST = {
+        'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/',
+        'info_dict': {
+            'id': 'clinton-sanders-prepare-to-face-off-in-nh',
+            'ext': 'flv',
+            'title': 'Clinton, Sanders Prepare To Face Off In NH',
+            'duration': 334,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_info = self._parse_json(self._html_search_regex(
+            r'data-story-obj=\'({.+?})\'', webpage, 'video JSON info'), video_id)['story']
+
+        hdcore_sign = 'hdcore=3.3.1'
+        f4m_formats = self._extract_f4m_formats(video_info['url'] + '&' + hdcore_sign, video_id)
+        if f4m_formats:
+            for entry in f4m_formats:
+                # URLs without the extra param induce a 404 error
+                entry.update({'extra_param_to_segment_url': hdcore_sign})
+
+        return {
+            'id': video_id,
+            'title': video_info['headline'],
+            'thumbnail': video_info.get('thumbnail_url_hd') or video_info.get('thumbnail_url_sd'),
+            'duration': parse_duration(video_info.get('segmentDur')),
+            'formats': f4m_formats,
+        }
@@ -1229,19 +1229,24 @@ class GenericIE(InfoExtractor):

        # Check for direct link to a video
        content_type = head_response.headers.get('Content-Type', '')
-        m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
+        m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>.+)$', content_type)
        if m:
            upload_date = unified_strdate(
                head_response.headers.get('Last-Modified'))
+            formats = []
+            if m.group('format_id').endswith('mpegurl'):
+                formats = self._extract_m3u8_formats(url, video_id, 'mp4')
+            else:
+                formats = [{
+                    'format_id': m.group('format_id'),
+                    'url': url,
+                    'vcodec': 'none' if m.group('type') == 'audio' else None
+                }]
            return {
                'id': video_id,
                'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
                'direct': True,
-                'formats': [{
-                    'format_id': m.group('format_id'),
-                    'url': url,
-                    'vcodec': 'none' if m.group('type') == 'audio' else None
-                }],
+                'formats': formats,
                'upload_date': upload_date,
            }

@@ -31,6 +31,10 @@ class KuwoBaseIE(InfoExtractor):
                (file_format['ext'], file_format.get('br', ''), song_id),
                song_id, note='Download %s url info' % file_format['format'],
            )
+
+            if song_url == 'IPDeny':
+                raise ExtractorError('This song is blocked in this region', expected=True)
+
            if song_url.startswith('http://') or song_url.startswith('https://'):
                formats.append({
                    'url': song_url,
@@ -70,14 +70,11 @@ class SRGSSRIE(InfoExtractor):
                        asset_url, media_id, 'mp4', 'm3u8_native',
                        m3u8_id=format_id, fatal=False))
                else:
-                    ext = None
-                    if protocol == 'RTMP':
-                        ext = self._search_regex(r'([a-z0-9]+):[^/]+', asset_url, 'ext')
                    formats.append({
                        'format_id': format_id,
                        'url': asset_url,
                        'preference': preference(quality),
-                        'ext': ext,
+                        'ext': 'flv' if protocol == 'RTMP' else None,
                    })
        self._sort_formats(formats)

@@ -321,7 +321,7 @@ class VKIE(InfoExtractor):
 class VKUserVideosIE(InfoExtractor):
    IE_NAME = 'vk:uservideos'
    IE_DESC = "VK - User's Videos"
-    _VALID_URL = r'https?://vk\.com/videos(?P<id>-?[0-9]+)$'
+    _VALID_URL = r'https?://vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)'
    _TEMPLATE_URL = 'https://vk.com/videos'
    _TESTS = [{
        'url': 'http://vk.com/videos205387401',
@@ -333,6 +333,9 @@ class VKUserVideosIE(InfoExtractor):
    }, {
        'url': 'http://vk.com/videos-77521',
        'only_matching': True,
+    }, {
+        'url': 'http://vk.com/videos-97664626?section=all',
+        'only_matching': True,
    }]

    def _real_extract(self, url):
@@ -918,7 +918,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            if lang in sub_lang_list:
                continue
            sub_formats = []
-            for ext in ['sbv', 'vtt', 'srt']:
+            for ext in ['ttml', 'vtt']:
                params = compat_urllib_parse.urlencode({
                    'lang': lang,
                    'v': video_id,
@@ -391,6 +391,10 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
        for (name, value) in metadata.items():
            options.extend(['-metadata', '%s=%s' % (name, value)])

+        # https://github.com/rg3/youtube-dl/issues/8350
+        if info['protocol'] == 'm3u8_native' or self._downloader.params.get('hls_prefer_native', False):
+            options.extend(['-bsf:a', 'aac_adtstoasc'])
+
        self._downloader.to_screen('[ffmpeg] Adding metadata to \'%s\'' % filename)
        self.run_ffmpeg(filename, temp_filename, options)
        os.remove(encodeFilename(filename))
@@ -2017,20 +2017,27 @@ def dfxp2srt(dfxp_data):
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
    })

+    class TTMLPElementParser(object):
+        out = ''
+
+        def start(self, tag, attrib):
+            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
+                self.out += '\n'
+
+        def end(self, tag):
+            pass
+
+        def data(self, data):
+            self.out += data
+
+        def close(self):
+            return self.out.strip()
+
    def parse_node(node):
-        str_or_empty = functools.partial(str_or_none, default='')
-
-        out = str_or_empty(node.text)
-
-        for child in node:
-            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
-                out += '\n' + str_or_empty(child.tail)
-            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
-                out += str_or_empty(parse_node(child))
-            else:
-                out += str_or_empty(xml.etree.ElementTree.tostring(child))
-
-        return out
+        target = TTMLPElementParser()
+        parser = xml.etree.ElementTree.XMLParser(target=target)
+        parser.feed(xml.etree.ElementTree.tostring(node))
+        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals

-__version__ = '2016.02.01'
+__version__ = '2016.02.04'
Author	SHA1	Message	Date
Philipp Hagemeister	f1ed3acae5	release 2016.02.04	2016-02-04 13:39:26 +01:00
remitamine	920d21b9d3	[test_subtitles] update youtube subtitles tests	2016-02-04 08:50:55 +01:00
remitamine	2fb35d1c28	[youtube] fix subtitle order	2016-02-04 08:39:01 +01:00
remitamine	09be85b8dd	[youtube] fix subtitle extraction(fixes #8415 )	2016-02-04 08:28:37 +01:00
remitamine	eadc3ccd50	[generic] extract m3u8 formats when mpegurl content type detected	2016-02-04 01:25:36 +01:00
Yen Chi Hsuan	58be922079	[kuwo] Check for georestriction	2016-02-04 01:26:25 +08:00
Sergey M	c84d3a557d	[README.md] Clarify unavailable sequences in output format	2016-02-03 19:18:25 +05:00
remitamine	6ad2b01e14	[srgssr] use flv as ext for rtmp formats	2016-02-02 23:09:50 +01:00
remitamine	fd3a1f3d60	[cbsnews] add support for live videos(fixes #7010 )	2016-02-02 23:02:18 +01:00
Jaime Marquínez Ferrándiz	87de7069b9	[utils] dfxp2srt: make TTMLPElementParser inherit from object For consistency between python 2 and 3.	2016-02-02 22:30:13 +01:00
remitamine	6fba62c87a	[ffmpeg] fix adding metadata when using --hls-prefer-native(#8350 )	2016-02-02 22:14:23 +01:00
Yen Chi Hsuan	1df4141196	[test_YoutubeDL] Fix test_youtube_format_selection Broken since `a6c2c24479`. Thanks to @jaimeMF and @anisse for pointing that out	2016-02-03 03:42:37 +08:00
remitamine	fae45ede08	Merge pull request #8354 from remitamine/m3u8_metadata [ffmpeg] fix adding metadata when using m3u8_native(fixes #8350)	2016-02-02 19:13:58 +01:00
remitamine	4e0cff2a50	Merge pull request #8348 from remitamine/dfxp2srt-text [utils] fix dfxp2srt text extraction(fixes #8055)	2016-02-02 18:36:26 +01:00
Sergey M․	0436157b95	[vk:uservideos] Improve _VALID_URL (Closes #8389 )	2016-02-02 00:52:37 +06:00
remitamine	cf57433bbd	[ffmpeg] fix adding metadata when using m3u8_native(fixes #8350 )	2016-01-28 18:57:32 +01:00
remitamine	2b14cb566f	[utils] fix dfxp2srt text extraction(fixes #8055 )	2016-01-28 12:38:34 +01:00