Python module (submodule repository) that provides content (video streams) from various online stream sources to the corresponding Enigma2, Kodi and Plex plugins.

YouTubeVideoUrl.py 16KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407
  1. # -*- coding: UTF-8 -*-
  2. # This video extraction code based on youtube-dl: https://github.com/rg3/youtube-dl
  3. import codecs
  4. import json
  5. import re
  6. from urllib import urlencode
  7. from urllib2 import urlopen, URLError
  8. import sys
  9. import ssl
  10. if "_create_unverified_context" in dir(ssl):
  11. ssl._create_default_https_context = ssl._create_unverified_context
  12. #from Components.config import config
  13. #from . import sslContext
  14. sslContext = None
  15. if sys.version_info >= (2, 7, 9):
  16. try:
  17. import ssl
  18. sslContext = ssl._create_unverified_context()
  19. except:
  20. pass
  21. from jsinterp import JSInterpreter
  22. from swfinterp import SWFInterpreter
  23. PRIORITY_VIDEO_FORMAT = []
  24. maxResolution = '22'
  25. def createPriorityFormats():
  26. global PRIORITY_VIDEO_FORMAT,maxResolution
  27. PRIORITY_VIDEO_FORMAT = []
  28. use_format = False
  29. for itag_value in ['38', '37', '96', '22', '95', '120',
  30. '35', '94', '18', '93', '5', '92', '132', '17']:
  31. if itag_value == maxResolution: #config.plugins.YouTube.maxResolution.value:
  32. use_format = True
  33. if use_format:
  34. PRIORITY_VIDEO_FORMAT.append(itag_value)
  35. createPriorityFormats()
  36. IGNORE_VIDEO_FORMAT = [
  37. '43', # webm
  38. '44', # webm
  39. '45', # webm
  40. '46', # webm
  41. '100', # webm
  42. '101', # webm
  43. '102' # webm
  44. ]
  45. def uppercase_escape(s):
  46. unicode_escape = codecs.getdecoder('unicode_escape')
  47. return re.sub(
  48. r'\\U[0-9a-fA-F]{8}',
  49. lambda m: unicode_escape(m.group(0))[0],
  50. s)
  51. def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
  52. if string == '':
  53. return string
  54. res = string.split('%')
  55. if len(res) == 1:
  56. return string
  57. if encoding is None:
  58. encoding = 'utf-8'
  59. if errors is None:
  60. errors = 'replace'
  61. # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
  62. pct_sequence = b''
  63. string = res[0]
  64. for item in res[1:]:
  65. try:
  66. if not item:
  67. raise ValueError
  68. pct_sequence += item[:2].decode('hex')
  69. rest = item[2:]
  70. if not rest:
  71. # This segment was just a single percent-encoded character.
  72. # May be part of a sequence of code units, so delay decoding.
  73. # (Stored in pct_sequence).
  74. continue
  75. except ValueError:
  76. rest = '%' + item
  77. # Encountered non-percent-encoded characters. Flush the current
  78. # pct_sequence.
  79. string += pct_sequence.decode(encoding, errors) + rest
  80. pct_sequence = b''
  81. if pct_sequence:
  82. # Flush the final pct_sequence
  83. string += pct_sequence.decode(encoding, errors)
  84. return string
  85. def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
  86. encoding='utf-8', errors='replace'):
  87. qs, _coerce_result = qs, unicode
  88. pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
  89. r = []
  90. for name_value in pairs:
  91. if not name_value and not strict_parsing:
  92. continue
  93. nv = name_value.split('=', 1)
  94. if len(nv) != 2:
  95. if strict_parsing:
  96. raise ValueError("bad query field: %r" % (name_value,))
  97. # Handle case of a control-name with no equal sign
  98. if keep_blank_values:
  99. nv.append('')
  100. else:
  101. continue
  102. if len(nv[1]) or keep_blank_values:
  103. name = nv[0].replace('+', ' ')
  104. name = compat_urllib_parse_unquote(
  105. name, encoding=encoding, errors=errors)
  106. name = _coerce_result(name)
  107. value = nv[1].replace('+', ' ')
  108. value = compat_urllib_parse_unquote(
  109. value, encoding=encoding, errors=errors)
  110. value = _coerce_result(value)
  111. r.append((name, value))
  112. return r
  113. def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
  114. encoding='utf-8', errors='replace'):
  115. parsed_result = {}
  116. pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
  117. encoding=encoding, errors=errors)
  118. for name, value in pairs:
  119. if name in parsed_result:
  120. parsed_result[name].append(value)
  121. else:
  122. parsed_result[name] = [value]
  123. return parsed_result
  124. class YouTubeVideoUrl():
  125. def _download_webpage(self, url):
  126. """ Returns a tuple (page content as string, URL handle) """
  127. try:
  128. if sslContext:
  129. urlh = urlopen(url, context = sslContext)
  130. else:
  131. urlh = urlopen(url)
  132. except URLError, e:
  133. #raise Exception(e.reason)
  134. return ""
  135. return urlh.read()
  136. def _search_regex(self, pattern, string):
  137. """
  138. Perform a regex search on the given string, using a single or a list of
  139. patterns returning the first matching group.
  140. """
  141. mobj = re.search(pattern, string, 0)
  142. if mobj:
  143. # return the first matching group
  144. return next(g for g in mobj.groups() if g is not None)
  145. else:
  146. raise Exception('Unable extract pattern from string!')
  147. def _decrypt_signature(self, s, player_url):
  148. """Turn the encrypted s field into a working signature"""
  149. if player_url is None:
  150. raise Exception('Cannot decrypt signature without player_url!')
  151. if player_url[:2] == '//':
  152. player_url = 'https:' + player_url
  153. try:
  154. func = self._extract_signature_function(player_url)
  155. return func(s)
  156. except:
  157. raise Exception('Signature extraction failed!')
  158. def _extract_signature_function(self, player_url):
  159. id_m = re.match(
  160. r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|/base)?\.(?P<ext>[a-z]+)$',
  161. player_url)
  162. if not id_m:
  163. raise Exception('Cannot identify player %r!' % player_url)
  164. player_type = id_m.group('ext')
  165. code = self._download_webpage(player_url)
  166. if player_type == 'js':
  167. return self._parse_sig_js(code)
  168. elif player_type == 'swf':
  169. return self._parse_sig_swf(code)
  170. else:
  171. raise Exception('Invalid player type %r!' % player_type)
  172. def _parse_sig_js(self, jscode):
  173. funcname = self._search_regex(r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode)
  174. jsi = JSInterpreter(jscode)
  175. initial_function = jsi.extract_function(funcname)
  176. return lambda s: initial_function([s])
  177. def _parse_sig_swf(self, file_contents):
  178. swfi = SWFInterpreter(file_contents)
  179. TARGET_CLASSNAME = 'SignatureDecipher'
  180. searched_class = swfi.extract_class(TARGET_CLASSNAME)
  181. initial_function = swfi.extract_function(searched_class, 'decipher')
  182. return lambda s: initial_function([s])
  183. def _extract_from_m3u8(self, manifest_url):
  184. url_map = {}
  185. def _get_urls(_manifest):
  186. lines = _manifest.split('\n')
  187. urls = filter(lambda l: l and not l.startswith('#'), lines)
  188. return urls
  189. manifest = self._download_webpage(manifest_url)
  190. formats_urls = _get_urls(manifest)
  191. for format_url in formats_urls:
  192. itag = self._search_regex(r'itag/(\d+?)/', format_url)
  193. url_map[itag] = format_url
  194. return url_map
  195. def _get_ytplayer_config(self, webpage):
  196. # User data may contain arbitrary character sequences that may affect
  197. # JSON extraction with regex, e.g. when '};' is contained the second
  198. # regex won't capture the whole JSON. Yet working around by trying more
  199. # concrete regex first keeping in mind proper quoted string handling
  200. # to be implemented in future that will replace this workaround (see
  201. # https://github.com/rg3/youtube-dl/issues/7468,
  202. # https://github.com/rg3/youtube-dl/pull/7599)
  203. patterns = [
  204. r';ytplayer\.config\s*=\s*({.+?});ytplayer',
  205. r';ytplayer\.config\s*=\s*({.+?});',
  206. ]
  207. for pattern in patterns:
  208. config = self._search_regex(pattern, webpage)
  209. if config:
  210. return json.loads(uppercase_escape(config))
  211. def extract(self, video_id):
  212. url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
  213. # Get video webpage
  214. video_webpage = self._download_webpage(url)
  215. if not video_webpage:
  216. #raise Exception('Video webpage not found!')
  217. return ""
  218. # Attempt to extract SWF player URL
  219. mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
  220. if mobj is not None:
  221. player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
  222. else:
  223. player_url = None
  224. # Get video info
  225. embed_webpage = None
  226. if re.search(r'player-age-gate-content">', video_webpage) is not None:
  227. age_gate = True
  228. # We simulate the access to the video from www.youtube.com/v/{video_id}
  229. # this can be viewed without login into Youtube
  230. url = 'https://www.youtube.com/embed/%s' % video_id
  231. embed_webpage = self._download_webpage(url)
  232. data = urlencode({
  233. 'video_id': video_id,
  234. 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
  235. 'sts': self._search_regex(r'"sts"\s*:\s*(\d+)', embed_webpage),
  236. })
  237. video_info_url = 'https://www.youtube.com/get_video_info?' + data
  238. video_info_webpage = self._download_webpage(video_info_url)
  239. video_info = compat_parse_qs(video_info_webpage)
  240. else:
  241. age_gate = False
  242. video_info = None
  243. # Try looking directly into the video webpage
  244. ytplayer_config = self._get_ytplayer_config(video_webpage)
  245. if ytplayer_config:
  246. args = ytplayer_config['args']
  247. if args.get('url_encoded_fmt_stream_map'):
  248. # Convert to the same format returned by compat_parse_qs
  249. video_info = dict((k, [v]) for k, v in args.items())
  250. if not video_info:
  251. # We also try looking in get_video_info since it may contain different dashmpd
  252. # URL that points to a DASH manifest with possibly different itag set (some itags
  253. # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
  254. # manifest pointed by get_video_info's dashmpd).
  255. # The general idea is to take a union of itags of both DASH manifests (for example
  256. # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
  257. for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
  258. video_info_url = (
  259. 'https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
  260. % (video_id, el_type))
  261. video_info_webpage = self._download_webpage(video_info_url)
  262. video_info = compat_parse_qs(video_info_webpage)
  263. if 'token' in video_info:
  264. break
  265. if 'token' not in video_info:
  266. if 'reason' in video_info:
  267. print '[YouTubeVideoUrl] %s' % video_info['reason'][0]
  268. else:
  269. print '[YouTubeVideoUrl] "token" parameter not in video info for unknown reason'
  270. # Start extracting information
  271. if 'conn' in video_info and video_info['conn'][0][:4] == 'rtmp':
  272. url = video_info['conn'][0]
  273. elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or \
  274. len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
  275. encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + \
  276. ',' + video_info.get('adaptive_fmts', [''])[0]
  277. if 'rtmpe%3Dyes' in encoded_url_map:
  278. raise Exception('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343')
  279. # Find the best format from our format priority map
  280. encoded_url_map = encoded_url_map.split(',')
  281. url_map_str = None
  282. # If format changed in config, recreate priority list
  283. if PRIORITY_VIDEO_FORMAT[0] != maxResolution: #config.plugins.YouTube.maxResolution.value:
  284. createPriorityFormats()
  285. for our_format in PRIORITY_VIDEO_FORMAT:
  286. our_format = 'itag=' + our_format
  287. for encoded_url in encoded_url_map:
  288. if our_format in encoded_url and 'url=' in encoded_url:
  289. url_map_str = encoded_url
  290. break
  291. if url_map_str:
  292. break
  293. # If anything not found, used first in the list if it not in ignore map
  294. if not url_map_str:
  295. for encoded_url in encoded_url_map:
  296. if 'url=' in encoded_url:
  297. url_map_str = encoded_url
  298. for ignore_format in IGNORE_VIDEO_FORMAT:
  299. ignore_format = 'itag=' + ignore_format
  300. if ignore_format in encoded_url:
  301. url_map_str = None
  302. break
  303. if url_map_str:
  304. break
  305. if not url_map_str:
  306. url_map_str = encoded_url_map[0]
  307. url_data = compat_parse_qs(url_map_str)
  308. url = url_data['url'][0]
  309. if 'sig' in url_data:
  310. url += '&signature=' + url_data['sig'][0]
  311. elif 's' in url_data:
  312. encrypted_sig = url_data['s'][0]
  313. ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
  314. jsplayer_url_json = self._search_regex(ASSETS_RE,
  315. embed_webpage if age_gate else video_webpage)
  316. if not jsplayer_url_json and not age_gate:
  317. # We need the embed website after all
  318. if embed_webpage is None:
  319. embed_url = 'https://www.youtube.com/embed/%s' % video_id
  320. embed_webpage = self._download_webpage(embed_url)
  321. jsplayer_url_json = self._search_regex(ASSETS_RE, embed_webpage)
  322. player_url = json.loads(jsplayer_url_json)
  323. if player_url is None:
  324. player_url_json = self._search_regex(
  325. r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
  326. video_webpage)
  327. player_url = json.loads(player_url_json)
  328. signature = self._decrypt_signature(encrypted_sig, player_url)
  329. url += '&signature=' + signature
  330. if 'ratebypass' not in url:
  331. url += '&ratebypass=yes'
  332. elif video_info.get('hlsvp'):
  333. url = None
  334. manifest_url = video_info['hlsvp'][0]
  335. url_map = self._extract_from_m3u8(manifest_url)
  336. # Find the best format from our format priority map
  337. for our_format in PRIORITY_VIDEO_FORMAT:
  338. if url_map.get(our_format):
  339. url = url_map[our_format]
  340. break
  341. # If anything not found, used first in the list if it not in ignore map
  342. if not url:
  343. for url_map_key in url_map.keys():
  344. if url_map_key not in IGNORE_VIDEO_FORMAT:
  345. url = url_map[url_map_key]
  346. break
  347. if not url:
  348. url = url_map.values()[0]
  349. else:
  350. #raise Exception('No supported formats found in video info!')
  351. return ""
  352. return str(url)
  353. if __name__ == "__main__":
  354. #yt = YouTubeVideoUrl()
  355. if len(sys.argv)>1:
  356. video_id= sys.argv[1]
  357. else:
  358. video_id = "2rlTF6HiMGg"
  359. e = YouTubeVideoUrl().extract(video_id)
  360. print e