Skip to content

Commit 7741328

Browse files
committed
Better yt channel-links command
1 parent 01e19bd commit 7741328

File tree

3 files changed

+40
-13
lines changed

3 files changed

+40
-13
lines changed

minet/cli/youtube/channel_links.py

+3 -3
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33

44

55
@with_enricher_and_loading_bar(
6-
headers=["url"],
6+
headers=["title", "url"],
77
title="Retrieving channel links",
88
unit="channels",
99
sub_unit="links",
@@ -19,5 +19,5 @@ def action(cli_args, enricher, loading_bar):
1919
if links is None:
2020
continue
2121

22-
for link in links:
23-
enricher.writerow(row, [link])
22+
for title, link in links:
23+
enricher.writerow(row, [title, link])

minet/youtube/scraper.py

+34 -9
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,11 @@
1-
from typing import List, Set, Tuple, Optional
1+
from typing import List, Tuple, Optional, Iterator
22

33
import re
44
import json
55
from html import unescape
66
from urllib.parse import unquote
77
from ural import infer_redirection
8+
from ebbe import getpath
89

910
from minet.scrape import WonderfulSoup
1011
from minet.web import (
@@ -22,26 +23,42 @@
2223

2324
CAPTION_TRACKS_RE = re.compile(r'"captionTracks":(\[.*?\])')
2425
INITIAL_DATA_RE = re.compile(
25-
rb"(?:const|let|var)\s+ytInitialData\s*=\s*({.+});</script>"
26+
rb"(?:const|let|var)\s+ytInitialData\s*=\s*({.+})\s*;</script>"
2627
)
2728

2829

29-
def gather_url_endpoints(data):
30+
def gather_external_links(data) -> Iterator[Tuple[str, str]]:
3031
if isinstance(data, dict):
3132
for k, v in data.items():
32-
if k == "urlEndpoint":
33+
if k == "channelExternalLinkViewModel":
3334
if not isinstance(v, dict):
3435
return
3536

36-
yield infer_redirection(v["url"])
37+
yield (
38+
getpath(v, ("title", "content")),
39+
infer_redirection(
40+
getpath(
41+
v,
42+
(
43+
"link",
44+
"commandRuns",
45+
0,
46+
"onTap",
47+
"innertubeCommand",
48+
"urlEndpoint",
49+
"url",
50+
),
51+
)
52+
),
53+
)
3754

3855
return
3956

40-
yield from gather_url_endpoints(v)
57+
yield from gather_external_links(v)
4158

4259
elif isinstance(data, list):
4360
for v in data:
44-
yield from gather_url_endpoints(v)
61+
yield from gather_external_links(v)
4562

4663
else:
4764
return
@@ -152,7 +169,12 @@ def get_channel_id(self, channel_url: str) -> Optional[str]:
152169

153170
return None
154171

155-
def get_channel_links(self, channel_url: str) -> Optional[Set[str]]:
172+
def get_channel_links(self, channel_url: str) -> Optional[List[Tuple[str, str]]]:
173+
# NOTE: for some weird reason, the /about page has more info in
174+
# the ytInitialData global variable even if visual content is
175+
# strictly identical.
176+
channel_url = channel_url.split("?", 1)[0].split("#")[0].rstrip("/") + "/about"
177+
156178
response = self.request(channel_url, spoof_ua=True)
157179

158180
match = INITIAL_DATA_RE.search(response.body)
@@ -165,4 +187,7 @@ def get_channel_links(self, channel_url: str) -> Optional[Set[str]]:
165187
except json.JSONDecodeError:
166188
return None
167189

168-
return set(gather_url_endpoints(data))
190+
# with open("./dump.json", "w") as f:
191+
# json.dump(data, f, ensure_ascii=False, indent=2)
192+
193+
return list(gather_external_links(data))

test/scraper_test.py

+3 -1
Original file line number | Diff line number | Diff line change
@@ -1085,4 +1085,6 @@ def basic_optional_scalar() -> Optional[str]:
10851085
assert infer_fieldnames_from_function_return_type(basic_float) == ["value"]
10861086
assert infer_fieldnames_from_function_return_type(basic_bool) == ["value"]
10871087
assert infer_fieldnames_from_function_return_type(basic_void) == ["value"]
1088-
assert infer_fieldnames_from_function_return_type(basic_optional_scalar) == ["value"]
1088+
assert infer_fieldnames_from_function_return_type(basic_optional_scalar) == [
1089+
"value"
1090+
]

0 commit comments

Comments
 (0)