mirror of
				https://gitlab.com/ytdl-org/youtube-dl.git
				synced 2025-11-04 00:57:07 -05:00 
			
		
		
		
	[extractor/common] add helper method to extract html5 media entries
This commit is contained in:
		@@ -54,6 +54,8 @@ from ..utils import (
 | 
			
		||||
    update_Request,
 | 
			
		||||
    update_url_query,
 | 
			
		||||
    parse_m3u8_attributes,
 | 
			
		||||
    extract_attributes,
 | 
			
		||||
    parse_codecs,
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -1610,6 +1612,62 @@ class InfoExtractor(object):
 | 
			
		||||
                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
 | 
			
		||||
        return formats
 | 
			
		||||
 | 
			
		||||
    def _parse_html5_media_entries(self, base_url, webpage):
 | 
			
		||||
        def absolute_url(video_url):
 | 
			
		||||
            return compat_urlparse.urljoin(base_url, video_url)
 | 
			
		||||
 | 
			
		||||
        def parse_content_type(content_type):
 | 
			
		||||
            if not content_type:
 | 
			
		||||
                return {}
 | 
			
		||||
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
 | 
			
		||||
            if ctr:
 | 
			
		||||
                mimetype, codecs = ctr.groups()
 | 
			
		||||
                f = parse_codecs(codecs)
 | 
			
		||||
                f['ext'] = mimetype2ext(mimetype)
 | 
			
		||||
                return f
 | 
			
		||||
            return {}
 | 
			
		||||
 | 
			
		||||
        entries = []
 | 
			
		||||
        for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage):
 | 
			
		||||
            media_info = {
 | 
			
		||||
                'formats': [],
 | 
			
		||||
                'subtitles': {},
 | 
			
		||||
            }
 | 
			
		||||
            media_attributes = extract_attributes(media_tag)
 | 
			
		||||
            src = media_attributes.get('src')
 | 
			
		||||
            if src:
 | 
			
		||||
                media_info['formats'].append({
 | 
			
		||||
                    'url': absolute_url(src),
 | 
			
		||||
                    'vcodec': 'none' if media_type == 'audio' else None,
 | 
			
		||||
                })
 | 
			
		||||
            media_info['thumbnail'] = media_attributes.get('poster')
 | 
			
		||||
            if media_content:
 | 
			
		||||
                for source_tag in re.findall(r'<source[^>]+>', media_content):
 | 
			
		||||
                    source_attributes = extract_attributes(source_tag)
 | 
			
		||||
                    src = source_attributes.get('src')
 | 
			
		||||
                    if not src:
 | 
			
		||||
                        continue
 | 
			
		||||
                    f = parse_content_type(source_attributes.get('type'))
 | 
			
		||||
                    f.update({
 | 
			
		||||
                        'url': absolute_url(src),
 | 
			
		||||
                        'vcodec': 'none' if media_type == 'audio' else None,
 | 
			
		||||
                    })
 | 
			
		||||
                    media_info['formats'].append(f)
 | 
			
		||||
                for track_tag in re.findall(r'<track[^>]+>', media_content):
 | 
			
		||||
                    track_attributes = extract_attributes(track_tag)
 | 
			
		||||
                    kind = track_attributes.get('kind')
 | 
			
		||||
                    if not kind or kind == 'subtitles':
 | 
			
		||||
                        src = track_attributes.get('src')
 | 
			
		||||
                        if not src:
 | 
			
		||||
                            continue
 | 
			
		||||
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
 | 
			
		||||
                        media_info['subtitles'].setdefault(lang, []).append({
 | 
			
		||||
                            'url': absolute_url(src),
 | 
			
		||||
                        })
 | 
			
		||||
            if media_info['formats']:
 | 
			
		||||
                entries.append(media_info)
 | 
			
		||||
        return entries
 | 
			
		||||
 | 
			
		||||
    def _live_title(self, name):
 | 
			
		||||
        """ Generate the title for a live video """
 | 
			
		||||
        now = datetime.datetime.now()
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user