Skip to content

Commit e405ea2

Browse files
committed
refactor: prepare for non puppeteer scrapers
1 parent 34dcfcc commit e405ea2

File tree

13 files changed

+65
-62
lines changed

13 files changed

+65
-62
lines changed

browser-extension/entrypoints/background/scraping/base-scraper.ts

Lines changed: 0 additions & 31 deletions
This file was deleted.

browser-extension/entrypoints/background/scraping/create-scraper.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
import { SocialNetworkName } from "@/entrypoints/shared/model/social-network-name";
2-
import { BaseScraper } from "./base-scraper";
2+
import { PuppeteerBaseScraper } from "./puppeteer/puppeteer-base-scraper";
33
import { InstagramScraper } from "./instagram/instagram-scraper";
44
import { YoutubeScraper } from "./youtube/youtube-scraper";
55

6-
export function createScraper(sn: SocialNetworkName): BaseScraper {
6+
export function createScraper(sn: SocialNetworkName): PuppeteerBaseScraper {
77
switch (sn) {
88
case "YOUTUBE":
99
return new YoutubeScraper();

browser-extension/entrypoints/background/scraping/instagram/instagram-scraper.ts

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
1-
import { ElementHandle } from "puppeteer-core/lib/esm/puppeteer/puppeteer-core-browser.js";
2-
import { BaseScraper } from "../base-scraper";
1+
import {
2+
ElementHandle,
3+
Page,
4+
} from "puppeteer-core/lib/esm/puppeteer/puppeteer-core-browser.js";
5+
import { PuppeteerBaseScraper } from "../puppeteer/puppeteer-base-scraper";
36
import {
47
type Author,
58
type Post,
@@ -10,7 +13,7 @@ import { currentIsoDate } from "../utils/current-iso-date";
1013

1114
//TODO: gérer le scroll et le chargement des commentaires
1215
//TODO: gérer le scraping des réponses aux commentaires
13-
export class InstagramScraper extends BaseScraper {
16+
export class InstagramScraper extends PuppeteerBaseScraper {
1417
private INSTAGRAM_URL = "https://www.instagram.com/";
1518

1619
extractPostId(url: string): string {
@@ -21,9 +24,7 @@ export class InstagramScraper extends BaseScraper {
2124
return parsed.postId;
2225
}
2326

24-
async scrapTab(tab: Browser.tabs.Tab): Promise<Post> {
25-
const page = await this.getBrowserPageFromTab(tab);
26-
27+
async doScrapTab(tab: Browser.tabs.Tab, page: Page): Promise<Post> {
2728
// //main/div/div/div
2829
const cadre_publication = (await page.$("::-p-xpath(//main/div/div/div)"))!;
2930
const colonne_commentaires = (await cadre_publication.$(

browser-extension/entrypoints/background/scraping/utils/anchorHref.ts renamed to browser-extension/entrypoints/background/scraping/puppeteer/anchorHref.ts

File renamed without changes.

browser-extension/entrypoints/background/scraping/utils/ariaLabel.ts renamed to browser-extension/entrypoints/background/scraping/puppeteer/ariaLabel.ts

File renamed without changes.

browser-extension/entrypoints/background/scraping/utils/innerHtml.ts renamed to browser-extension/entrypoints/background/scraping/puppeteer/innerHtml.ts

File renamed without changes.

browser-extension/entrypoints/background/scraping/utils/innerText.ts renamed to browser-extension/entrypoints/background/scraping/puppeteer/innerText.ts

File renamed without changes.
browser-extension/entrypoints/background/scraping/puppeteer/puppeteer-base-scraper.ts

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import {
2+
connect,
3+
ExtensionTransport,
4+
Page,
5+
Browser as PuppeteerBrowser,
6+
} from "puppeteer-core/lib/esm/puppeteer/puppeteer-core-browser.js";
7+
import type { Post } from "../../../shared/model/post";
8+
import { Scraper } from "../scraper";
9+
10+
/**
 * Base class for scrapers that drive an existing browser tab through
 * puppeteer-core.
 *
 * It owns the connection lifecycle: `scrapTab` connects to the tab, hands the
 * resulting page to the subclass via `doScrapTab`, and disconnects afterwards,
 * whether the scrape succeeded or threw.
 */
export abstract class PuppeteerBaseScraper implements Scraper {
  /** Puppeteer connection opened by the most recent `scrapTab` call, if any. */
  browser?: PuppeteerBrowser;
  constructor() {}

  /**
   * Resolves after `ms` milliseconds.
   *
   * @param ms - Delay in milliseconds.
   */
  protected async sleep(ms: number): Promise<void> {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /**
   * Connects puppeteer to the given tab and returns the first page reported
   * by the connection. The connection is stored in `this.browser`.
   *
   * @param tab - Tab to attach to; it must have an id.
   * @throws Error if the tab has no id.
   */
  private async getBrowserPageFromTab(tab: Browser.tabs.Tab): Promise<Page> {
    const tabId = tab.id;
    if (tabId === undefined) {
      // Fail with a clear message instead of passing `undefined` to the transport.
      throw new Error("Cannot connect to a tab without an id!");
    }
    this.browser = await connect({
      transport: await ExtensionTransport.connectTab(tabId),
    });
    const [page] = await this.browser.pages();
    return page;
  }

  /** Disconnects the stored puppeteer connection, if there is one. */
  private async disconnectBrowser(): Promise<void> {
    if (this.browser) {
      // Awaited so that a failing disconnect surfaces to the caller rather
      // than becoming an unhandled rejection.
      await this.browser.disconnect();
    }
  }

  /**
   * Scrapes the post displayed in `tab`.
   *
   * @param tab - Tab showing the post to scrape.
   * @returns The post produced by `doScrapTab`.
   */
  public async scrapTab(tab: Browser.tabs.Tab): Promise<Post> {
    const page = await this.getBrowserPageFromTab(tab);
    try {
      // `await` is required here: returning the bare promise would let the
      // `finally` block disconnect the browser while scraping is still running.
      return await this.doScrapTab(tab, page);
    } finally {
      await this.disconnectBrowser();
    }
  }

  /**
   * Extracts the post from an already connected page. Implemented by each
   * concrete scraper.
   *
   * @param tab - Tab being scraped.
   * @param page - Puppeteer page connected to that tab.
   */
  abstract doScrapTab(tab: Browser.tabs.Tab, page: Page): Promise<Post>;
}

browser-extension/entrypoints/background/scraping/utils/selectOrThrow.ts renamed to browser-extension/entrypoints/background/scraping/puppeteer/selectOrThrow.ts

File renamed without changes.

browser-extension/entrypoints/background/scraping/scrap-tab.ts

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,5 @@ export async function scrapTab(
1313
throw new Error("Url of tab is not scrapable!");
1414
}
1515
const scraper = createScraper(snUrl.socialNetwork);
16-
try {
17-
const socialNetworkPost = await scraper.scrapTab(tab);
18-
19-
return socialNetworkPost;
20-
} finally {
21-
await scraper.disconnectBrowser();
22-
}
16+
return await scraper.scrapTab(tab);
2317
}

0 commit comments

Comments
 (0)