@@ -1,10 +1,11 @@
- from typing import List, Set, Tuple, Optional
+ from typing import List, Tuple, Optional, Iterator

import re
import json
from html import unescape
from urllib.parse import unquote
from ural import infer_redirection
+ from ebbe import getpath

from minet.scrape import WonderfulSoup
from minet.web import (
@@ -22,26 +23,42 @@

CAPTION_TRACKS_RE = re.compile(r'"captionTracks":(\[.*?\])')
INITIAL_DATA_RE = re.compile(
-     rb"(?:const|let|var)\s+ytInitialData\s*=\s*({.+});</script>"
+     rb"(?:const|let|var)\s+ytInitialData\s*=\s*({.+})\s*;</script>"
)


- def gather_url_endpoints(data):
+ def gather_external_links(data) -> Iterator[Tuple[str, str]]:
    if isinstance(data, dict):
        for k, v in data.items():
-             if k == "urlEndpoint":
+             if k == "channelExternalLinkViewModel":
                if not isinstance(v, dict):
                    return

-                 yield infer_redirection(v["url"])
+                 yield (
+                     getpath(v, ("title", "content")),
+                     infer_redirection(
+                         getpath(
+                             v,
+                             (
+                                 "link",
+                                 "commandRuns",
+                                 0,
+                                 "onTap",
+                                 "innertubeCommand",
+                                 "urlEndpoint",
+                                 "url",
+                             ),
+                         )
+                     ),
+                 )

                return

-             yield from gather_url_endpoints(v)
+             yield from gather_external_links(v)

    elif isinstance(data, list):
        for v in data:
-             yield from gather_url_endpoints(v)
+             yield from gather_external_links(v)

    else:
        return
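
For context, a minimal sketch of the kind of nested payload the new gather_external_links generator walks. Only the keys taken from the diff itself (channelExternalLinkViewModel, title.content and the link.commandRuns[0].onTap.innertubeCommand.urlEndpoint.url path) are realistic; the outer "header"/"links" wrapper and all values are invented for illustration:

from ebbe import getpath
from ural import infer_redirection

# Hypothetical, hand-built stand-in for a fragment of ytInitialData. The wrapper
# keys ("header", "links") and the values are made up for this sketch.
fake_data = {
    "header": {
        "links": [
            {
                "channelExternalLinkViewModel": {
                    "title": {"content": "Personal website"},
                    "link": {
                        "commandRuns": [
                            {
                                "onTap": {
                                    "innertubeCommand": {
                                        "urlEndpoint": {
                                            "url": "https://www.youtube.com/redirect?q=https%3A%2F%2Fexample.com"
                                        }
                                    }
                                }
                            }
                        ]
                    },
                }
            }
        ]
    }
}

viewmodel = fake_data["header"]["links"][0]["channelExternalLinkViewModel"]

# getpath walks a sequence of keys/indices and returns None when a step is
# missing, which lets the generator probe partially filled view models safely.
title = getpath(viewmodel, ("title", "content"))
raw_url = getpath(
    viewmodel,
    ("link", "commandRuns", 0, "onTap", "innertubeCommand", "urlEndpoint", "url"),
)

# infer_redirection resolves common redirection URLs (e.g. youtube.com/redirect
# with an encoded target parameter) to their destination when it can.
print(title, infer_redirection(raw_url))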
@@ -152,7 +169,12 @@ def get_channel_id(self, channel_url: str) -> Optional[str]:

        return None

-     def get_channel_links(self, channel_url: str) -> Optional[Set[str]]:
+     def get_channel_links(self, channel_url: str) -> Optional[List[Tuple[str, str]]]:
+         # NOTE: for some weird reason, the /about page has more info in
+         # the ytInitialData global variable even though the visible
+         # content is strictly identical.
+         channel_url = channel_url.split("?", 1)[0].split("#")[0].rstrip("/") + "/about"
+
        response = self.request(channel_url, spoof_ua=True)

        match = INITIAL_DATA_RE.search(response.body)
@@ -165,4 +187,7 @@ def get_channel_links(self, channel_url: str) -> Optional[Set[str]]:
        except json.JSONDecodeError:
            return None

-         return set(gather_url_endpoints(data))
+         # with open("./dump.json", "w") as f:
+         #     json.dump(data, f, ensure_ascii=False, indent=2)
+
+         return list(gather_external_links(data))
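
A small, self-contained sketch of the two other behavioural changes: the regex now tolerates whitespace before the closing semicolon, and the channel URL is normalized to its /about page. The sample response body and channel URL below are made up:

import json
import re

# Same pattern as in the diff: the added \s* accepts whitespace between the
# closing brace and the semicolon.
INITIAL_DATA_RE = re.compile(
    rb"(?:const|let|var)\s+ytInitialData\s*=\s*({.+})\s*;</script>"
)

# Hypothetical response body, trimmed to the relevant script tag. Note the
# space before the semicolon, which the previous pattern would have rejected.
body = b'<script>var ytInitialData = {"header": {}} ;</script>'

match = INITIAL_DATA_RE.search(body)
assert match is not None
data = json.loads(match.group(1))  # {'header': {}}

# URL normalization as performed in get_channel_links: strip the query string
# and fragment, drop any trailing slash, then target the /about page.
channel_url = "https://www.youtube.com/@SomeChannel/?si=abc#links"
channel_url = channel_url.split("?", 1)[0].split("#")[0].rstrip("/") + "/about"
print(channel_url)  # https://www.youtube.com/@SomeChannel/about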