|
@@ -9,18 +9,19 @@ from urllib import urlencode
|
9
|
9
|
from urllib2 import urlopen, URLError
|
10
|
10
|
import sys
|
11
|
11
|
import ssl
|
12
|
|
# SECURITY NOTE: this replaces the ssl module's default HTTPS context factory
# with the unverified one, so HTTPS certificates are NOT validated for code
# that relies on the default context. The private helper only exists on
# interpreters that implement PEP 476, hence the guard.
if hasattr(ssl, "_create_unverified_context"):
    ssl._create_default_https_context = ssl._create_unverified_context
|
13
|
14
|
|
14
|
15
|
#from Components.config import config
|
15
|
16
|
|
16
|
17
|
#from . import sslContext
|
17
|
18
|
# Unverified SSL context handed to urlopen() by _download_webpage; stays None
# when the interpreter cannot provide one, in which case urlopen() is called
# without a context argument.
sslContext = None
if sys.version_info >= (2, 7, 9):
    try:
        import ssl
        sslContext = ssl._create_unverified_context()
    except Exception:
        # Best effort: a missing/broken ssl module simply leaves sslContext
        # as None instead of preventing the module from loading.
        pass
|
24
|
25
|
from jsinterp import JSInterpreter
|
25
|
26
|
from swfinterp import SWFInterpreter
|
26
|
27
|
|
|
@@ -30,377 +31,377 @@ maxResolution = '22'
|
30
|
31
|
|
31
|
32
|
|
32
|
33
|
def createPriorityFormats():
    """Rebuild the module-level PRIORITY_VIDEO_FORMAT list.

    The result is the tail of the built-in itag preference list that starts
    at the module-level ``maxResolution`` itag (inclusive). When
    ``maxResolution`` is not one of the known itags the list is left empty.
    """
    global PRIORITY_VIDEO_FORMAT, maxResolution
    # itags in the order in which they are tried; earlier entries win.
    itags_by_preference = ['38', '37', '96', '22', '95', '120',
                           '35', '94', '18', '93', '5', '92', '132', '17']
    # maxResolution stands in for config.plugins.YouTube.maxResolution.value
    if maxResolution in itags_by_preference:
        first_allowed = itags_by_preference.index(maxResolution)
        PRIORITY_VIDEO_FORMAT = itags_by_preference[first_allowed:]
    else:
        PRIORITY_VIDEO_FORMAT = []
|
42
|
43
|
|
43
|
44
|
# Build the initial PRIORITY_VIDEO_FORMAT list when the module is loaded.
createPriorityFormats()
|
44
|
45
|
|
45
|
46
|
# itags that the fallback format selection skips (all of them WebM streams).
IGNORE_VIDEO_FORMAT = [
    '43',  # webm
    '44',  # webm
    '45',  # webm
    '46',  # webm
    '100',  # webm
    '101',  # webm
    '102',  # webm
]
|
54
|
55
|
|
55
|
56
|
|
56
|
57
|
def uppercase_escape(s):
    """Replace every literal ``\\UXXXXXXXX`` escape in *s* by its character.

    Only the 8-hex-digit uppercase-U form is decoded; all other text,
    including other backslash sequences, is returned untouched.
    """
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        # The decoder returns (text, consumed_length); keep only the text.
        lambda m: unicode_escape(m.group(0))[0],
        s)
|
62
|
63
|
|
63
|
64
|
|
64
|
65
|
def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
    """Replace ``%xx`` escapes in *string* by the characters they encode.

    Runs of consecutive percent-encoded bytes are collected and decoded
    together so that multi-byte sequences are handled as a unit.

    :param string: text that may contain percent-escapes.
    :param encoding: codec used to decode the collected bytes; ``None``
        means ``'utf-8'``.
    :param errors: codec error policy; ``None`` means ``'replace'``.
    :returns: the unquoted text. A ``%`` that is not followed by two hex
        digits is kept literally.
    """
    # Imported locally so the function does not depend on the module's
    # import block.
    import binascii

    if string == '':
        return string
    res = string.split('%')
    if len(res) == 1:
        # No '%' at all: nothing to decode.
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
    pct_sequence = b''
    string = res[0]
    for item in res[1:]:
        try:
            if not item:
                raise ValueError
            pct_sequence += binascii.unhexlify(item[:2])
            rest = item[2:]
            if not rest:
                # This segment was just a single percent-encoded character.
                # May be part of a sequence of code units, so delay decoding.
                # (Stored in pct_sequence).
                continue
        except (TypeError, ValueError):
            # Malformed escape. Python 2's hex decoding reports this as
            # TypeError, Python 3's as a ValueError subclass; either way the
            # text is passed through unchanged.
            rest = '%' + item
        # Encountered non-percent-encoded characters. Flush the current
        # pct_sequence.
        string += pct_sequence.decode(encoding, errors) + rest
        pct_sequence = b''
    if pct_sequence:
        # Flush the final pct_sequence
        string += pct_sequence.decode(encoding, errors)
    return string
|
98
|
99
|
|
99
|
100
|
|
100
|
101
|
def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
               encoding='utf-8', errors='replace'):
    """Parse a query string into a list of ``(name, value)`` pairs.

    Fields are separated by ``&`` or ``;``; ``+`` is turned into a space and
    percent-escapes are decoded with compat_urllib_parse_unquote.

    :param qs: the query string to parse.
    :param keep_blank_values: keep fields whose value is empty (and fields
        without ``=``) instead of dropping them.
    :param strict_parsing: raise ValueError for a field without ``=``
        instead of ignoring it.
    :param encoding: passed through to compat_urllib_parse_unquote.
    :param errors: passed through to compat_urllib_parse_unquote.
    :returns: list of ``(name, value)`` tuples in input order.
    :raises ValueError: on a malformed field when *strict_parsing* is set.
    """
    try:
        _coerce_result = unicode
    except NameError:
        # Interpreters without a ``unicode`` builtin: str is already text.
        _coerce_result = str

    def _decode(component):
        # '+' encodes a space in query strings; handle it before %xx.
        component = component.replace('+', ' ')
        component = compat_urllib_parse_unquote(
            component, encoding=encoding, errors=errors)
        return _coerce_result(component)

    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            r.append((_decode(nv[0]), _decode(nv[1])))
    return r
|
128
|
129
|
|
129
|
130
|
|
130
|
131
|
def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                    encoding='utf-8', errors='replace'):
    """Parse a query string into a dict mapping each name to a list of values.

    All arguments are forwarded to _parse_qsl. Values of a repeated name are
    collected in the order in which they appear.

    :returns: ``{name: [value, ...]}``.
    """
    parsed_result = {}
    pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                       encoding=encoding, errors=errors)
    for name, value in pairs:
        parsed_result.setdefault(name, []).append(value)
    return parsed_result
|
141
|
142
|
|
142
|
143
|
|
143
|
144
|
class YouTubeVideoUrl():
    """Resolve a YouTube video id to a stream URL.

    The only entry point meant for callers is extract(); every other method
    is a private helper. Failures that the original code treated as "nothing
    found" are reported by returning an empty string from extract().
    """

    # Sentinel for _search_regex: distinguishes "no default given" from an
    # explicit default of None.
    _NO_DEFAULT = object()

    def _download_webpage(self, url):
        """Return the body of *url*, or "" when the URL cannot be opened.

        Uses the module-level sslContext when one is available. Only
        URLError is turned into the empty-string result.
        """
        try:
            if sslContext:
                urlh = urlopen(url, context=sslContext)
            else:
                urlh = urlopen(url)
        except URLError:
            return ""
        try:
            return urlh.read()
        finally:
            # Release the connection even if reading fails.
            urlh.close()

    def _search_regex(self, pattern, string, default=_NO_DEFAULT):
        """Return the first non-None group of the first matching pattern.

        :param pattern: one regular expression, or a list/tuple of them that
            are tried in order.
        :param string: text to search.
        :param default: value returned when nothing matches. When omitted, a
            failed search raises instead.
        :raises Exception: when nothing matches and no *default* was given.
        """
        patterns = pattern if isinstance(pattern, (list, tuple)) else [pattern]
        for candidate in patterns:
            mobj = re.search(candidate, string, 0)
            if mobj:
                # return the first matching group
                group = next((g for g in mobj.groups() if g is not None), None)
                if group is not None:
                    return group
        if default is not self._NO_DEFAULT:
            return default
        raise Exception('Unable extract pattern from string!')

    def _decrypt_signature(self, s, player_url):
        """Turn the encrypted s field into a working signature.

        :raises Exception: when *player_url* is None or when the signature
            function cannot be extracted or applied.
        """
        if player_url is None:
            raise Exception('Cannot decrypt signature without player_url!')

        if player_url[:2] == '//':
            # Protocol-relative URL: fetch it over https.
            player_url = 'https:' + player_url
        try:
            func = self._extract_signature_function(player_url)
            return func(s)
        except Exception:
            raise Exception('Signature extraction failed!')

    def _extract_signature_function(self, player_url):
        """Download the player at *player_url* and return its decipher function.

        The player type is taken from the URL's file extension ('js' or
        'swf').

        :raises Exception: when the URL does not look like a player URL or
            the extension is neither 'js' nor 'swf'.
        """
        id_m = re.match(
            r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|/base)?\.(?P<ext>[a-z]+)$',
            player_url)
        if not id_m:
            raise Exception('Cannot identify player %r!' % player_url)
        player_type = id_m.group('ext')
        code = self._download_webpage(player_url)
        if player_type == 'js':
            return self._parse_sig_js(code)
        elif player_type == 'swf':
            return self._parse_sig_swf(code)
        else:
            raise Exception('Invalid player type %r!' % player_type)

    def _parse_sig_js(self, jscode):
        """Return a callable that deciphers a signature using the JS player code."""
        funcname = self._search_regex(r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode)
        jsi = JSInterpreter(jscode)
        initial_function = jsi.extract_function(funcname)
        return lambda s: initial_function([s])

    def _parse_sig_swf(self, file_contents):
        """Return a callable that deciphers a signature using the SWF player."""
        swfi = SWFInterpreter(file_contents)
        TARGET_CLASSNAME = 'SignatureDecipher'
        searched_class = swfi.extract_class(TARGET_CLASSNAME)
        initial_function = swfi.extract_function(searched_class, 'decipher')
        return lambda s: initial_function([s])

    def _extract_from_m3u8(self, manifest_url):
        """Download an m3u8 manifest and return ``{itag: stream_url}``.

        Blank lines and lines starting with '#' are ignored, as are URL lines
        that carry no ``itag/<digits>/`` component.
        """
        url_map = {}
        manifest = self._download_webpage(manifest_url)
        for line in manifest.splitlines():
            format_url = line.strip()
            if not format_url or format_url.startswith('#'):
                continue
            itag = self._search_regex(r'itag/(\d+?)/', format_url, default=None)
            if itag is not None:
                url_map[itag] = format_url
        return url_map

    def _get_ytplayer_config(self, webpage):
        """Return the parsed ``ytplayer.config`` object of *webpage*, or None."""
        # User data may contain arbitrary character sequences that may affect
        # JSON extraction with regex, e.g. when '};' is contained the second
        # regex won't capture the whole JSON. Yet working around by trying more
        # concrete regex first keeping in mind proper quoted string handling
        # to be implemented in future that will replace this workaround (see
        # https://github.com/rg3/youtube-dl/issues/7468,
        # https://github.com/rg3/youtube-dl/pull/7599)
        patterns = [
            r';ytplayer\.config\s*=\s*({.+?});ytplayer',
            r';ytplayer\.config\s*=\s*({.+?});',
        ]
        config = self._search_regex(patterns, webpage, default=None)
        if config:
            return json.loads(uppercase_escape(config))
        return None

    def _has_itag(self, encoded_url, itag):
        """Tell whether the stream-map entry has exactly ``itag=<itag>``.

        A plain substring test would let 'itag=17' match 'itag=171'.
        """
        return re.search(
            r'(?:^|[&;])itag=%s(?:[&;]|$)' % re.escape(itag),
            encoded_url) is not None

    def _pick_encoded_url(self, encoded_urls):
        """Choose one entry of a comma-split stream map.

        Order of preference: the first entry matching the earliest itag of
        PRIORITY_VIDEO_FORMAT, then the first entry whose itag is not in
        IGNORE_VIDEO_FORMAT, then simply the first entry. Only entries that
        contain 'url=' are considered by the first two passes.
        """
        # If format changed in config, recreate priority list
        if not PRIORITY_VIDEO_FORMAT or PRIORITY_VIDEO_FORMAT[0] != maxResolution:
            createPriorityFormats()
        candidates = [entry for entry in encoded_urls if 'url=' in entry]
        # Find the best format from our format priority map
        for our_format in PRIORITY_VIDEO_FORMAT:
            for encoded_url in candidates:
                if self._has_itag(encoded_url, our_format):
                    return encoded_url
        # If anything not found, used first in the list if it not in ignore map
        for encoded_url in candidates:
            if not any(self._has_itag(encoded_url, ignored)
                       for ignored in IGNORE_VIDEO_FORMAT):
                return encoded_url
        return encoded_urls[0]

    def _pick_hls_url(self, url_map):
        """Choose one URL from an ``{itag: url}`` map, or None if it is empty."""
        # Find the best format from our format priority map
        for our_format in PRIORITY_VIDEO_FORMAT:
            if url_map.get(our_format):
                return url_map[our_format]
        # If anything not found, used first in the list if it not in ignore map
        for itag, format_url in url_map.items():
            if itag not in IGNORE_VIDEO_FORMAT:
                return format_url
        # Last resort: any stream at all, even an ignored format.
        remaining = list(url_map.values())
        return remaining[0] if remaining else None

    def extract(self, video_id):
        """Return a stream URL for *video_id*, or "" when none can be found.

        :param video_id: the YouTube video id.
        :returns: the URL as ``str``; "" when the watch page cannot be
            downloaded or no supported stream description is present.
        :raises Exception: for rtmpe streams and when an encrypted signature
            cannot be deciphered.
        """
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id

        # Get video webpage
        video_webpage = self._download_webpage(url)
        if not video_webpage:
            return ""

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        embed_webpage = None
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            url = 'https://www.youtube.com/embed/%s' % video_id
            embed_webpage = self._download_webpage(url)
            data = urlencode({
                'video_id': video_id,
                'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                # A missing "sts" value is sent as an empty parameter rather
                # than aborting the whole extraction.
                'sts': self._search_regex(r'"sts"\s*:\s*(\d+)', embed_webpage, default=''),
            })
            video_info_url = 'https://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url)
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            video_info = None
            # Try looking directly into the video webpage
            ytplayer_config = self._get_ytplayer_config(video_webpage)
            if ytplayer_config:
                args = ytplayer_config.get('args', {})
                if args.get('url_encoded_fmt_stream_map'):
                    # Convert to the same format returned by compat_parse_qs
                    video_info = dict((k, [v]) for k, v in args.items())

            # NOTE(review): the nesting of this fallback inside the
            # non-age-gate branch was reconstructed; confirm against the
            # original file.
            if not video_info:
                # We also try looking in get_video_info since it may contain different dashmpd
                # URL that points to a DASH manifest with possibly different itag set (some itags
                # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
                # manifest pointed by get_video_info's dashmpd).
                # The general idea is to take a union of itags of both DASH manifests (for example
                # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
                for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
                    video_info_url = (
                        'https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (video_id, el_type))
                    video_info_webpage = self._download_webpage(video_info_url)
                    video_info = compat_parse_qs(video_info_webpage)
                    if 'token' in video_info:
                        break
        if 'token' not in video_info:
            if 'reason' in video_info:
                print('[YouTubeVideoUrl] %s' % video_info['reason'][0])
            else:
                print('[YouTubeVideoUrl] "token" parameter not in video info for unknown reason')

        # Start extracting information
        if 'conn' in video_info and video_info['conn'][0][:4] == 'rtmp':
            url = video_info['conn'][0]
        elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or \
                len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + \
                ',' + video_info.get('adaptive_fmts', [''])[0]
            if 'rtmpe%3Dyes' in encoded_url_map:
                raise Exception('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343')

            url_map_str = self._pick_encoded_url(encoded_url_map.split(','))
            url_data = compat_parse_qs(url_map_str)
            if 'url' not in url_data:
                # The chosen entry carries no stream URL at all.
                return ""
            url = url_data['url'][0]
            if 'sig' in url_data:
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                encrypted_sig = url_data['s'][0]
                ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'

                jsplayer_url_json = self._search_regex(
                    ASSETS_RE,
                    embed_webpage if age_gate else video_webpage,
                    default=None)
                if not jsplayer_url_json and not age_gate:
                    # We need the embed website after all
                    if embed_webpage is None:
                        embed_url = 'https://www.youtube.com/embed/%s' % video_id
                        embed_webpage = self._download_webpage(embed_url)
                    jsplayer_url_json = self._search_regex(
                        ASSETS_RE, embed_webpage, default=None)

                if jsplayer_url_json:
                    player_url = json.loads(jsplayer_url_json)
                # Otherwise keep the SWF player URL found above, if any.
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage)
                    player_url = json.loads(player_url_json)

                signature = self._decrypt_signature(encrypted_sig, player_url)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'
        elif video_info.get('hlsvp'):
            manifest_url = video_info['hlsvp'][0]
            url = self._pick_hls_url(self._extract_from_m3u8(manifest_url))
            if not url:
                # Empty or unusable manifest.
                return ""
        else:
            # No supported formats found in video info.
            return ""

        return str(url)
|
397
|
398
|
|
398
|
399
|
if __name__ == "__main__":
    # Command-line smoke test: resolve the video id given as the first
    # argument (or a built-in sample id) and print the resulting URL.
    if len(sys.argv) > 1:
        video_id = sys.argv[1]
    else:
        video_id = "2rlTF6HiMGg"
    e = YouTubeVideoUrl().extract(video_id)
    print(e)
|