Skip to content

Commit

Permalink
feat(scrapeURL): add url-specific parameters
Browse files (browse the repository at this point in the history)
  • Loading branch information
mogery committed Nov 5, 2024
1 parent e5385e6 commit 5e2124c
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 0 deletions.
7 changes: 7 additions & 0 deletions apps/api/src/scraper/scrapeURL/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import { parseMarkdown } from "../../lib/html-to-markdown";
import { AddFeatureError, EngineError, NoEnginesLeftError, TimeoutError } from "./error";
import { executeTransformers } from "./transformers";
import { LLMRefusalError } from "./transformers/llmExtract";
import { urlSpecificParams } from "./lib/urlSpecificParams";

export type ScrapeUrlResponse = ({
success: true,
Expand Down Expand Up @@ -74,6 +75,12 @@ function buildFeatureFlags(url: string, options: ScrapeOptions, internalOptions:
}

function buildMetaObject(id: string, url: string, options: ScrapeOptions, internalOptions: InternalOptions): Meta {
const specParams = urlSpecificParams[new URL(url).hostname.replace(/^www\./, "")];
if (specParams !== undefined) {
options = Object.assign(options, specParams.scrapeOptions);
internalOptions = Object.assign(internalOptions, specParams.internalOptions);
}

const _logger = logger.child({ module: "ScrapeURL", scrapeId: id });
const logs: any[] = [];
_logger.add(new ArrayTransport({ array: logs, scrapeId: id }));
Expand Down
78 changes: 78 additions & 0 deletions apps/api/src/scraper/scrapeURL/lib/urlSpecificParams.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import { InternalOptions } from "..";
import { ScrapeOptions } from "../../../controllers/v1/types";

/**
 * Overrides applied to a scrape when the target URL's host has an entry in
 * `urlSpecificParams`.
 *
 * `buildMetaObject` (scrapeURL/index.ts) copies both members over the
 * request's own values with `Object.assign`, so any key present here replaces
 * the corresponding caller-supplied key.
 */
export type UrlSpecificParams = {
// Subset of the public scrape options to force for the host.
scrapeOptions: Partial<ScrapeOptions>,
// Subset of the internal (non-user-facing) options to force for the host.
internalOptions: Partial<InternalOptions>,
};

/**
 * Request headers that imitate a desktop Chrome 124 request carrying a Google
 * referer. Kept in one place so every preset that needs them sends exactly the
 * same set; presets take a shallow copy so no two presets share one mutable
 * headers object.
 */
const browserLikeHeaders: Readonly<Record<string, string>> = {
  "User-Agent":
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
  "sec-fetch-site": "same-origin",
  "sec-fetch-mode": "cors",
  "sec-fetch-dest": "empty",
  referer: "https://www.google.com/",
  "accept-language": "en-US,en;q=0.9",
  "accept-encoding": "gzip, deflate, br",
  accept:
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
};

/**
 * Preset shared by every host below that maps to it: a `waitFor` of 2000,
 * the browser-like headers, and the "fire-engine;chrome-cdp" engine.
 */
const docsParam: UrlSpecificParams = {
  scrapeOptions: {
    waitFor: 2000,
    headers: { ...browserLikeHeaders },
  },
  internalOptions: { forceEngine: "fire-engine;chrome-cdp" },
};

/**
 * Type-checked literal holding the per-host entries. Not exported directly:
 * a plain object literal inherits from `Object.prototype`, which makes it
 * unsafe to index with arbitrary host names (see `urlSpecificParams`).
 */
const hostOverrides: Record<string, UrlSpecificParams> = {
  "support.greenpay.me": docsParam,
  "docs.pdw.co": docsParam,
  "developers.notion.com": docsParam,
  "docs2.hubitat.com": docsParam,
  "rsseau.fr": docsParam,
  "help.salesforce.com": docsParam,
  "scrapethissite.com": {
    scrapeOptions: {
      headers: { ...browserLikeHeaders },
    },
    internalOptions: { forceEngine: "fetch" },
  },
  // "eonhealth.com": {
  //   defaultScraper: "fire-engine",
  //   params: {
  //     fireEngineOptions: {
  //       mobileProxy: true,
  //       method: "get",
  //       engine: "request",
  //     },
  //   },
  // },
  "notion.com": {
    scrapeOptions: { waitFor: 2000 },
    internalOptions: { forceEngine: "fire-engine;playwright" },
  },
  "developer.apple.com": {
    scrapeOptions: { waitFor: 2000 },
    internalOptions: { forceEngine: "fire-engine;playwright" },
  },
  "digikey.com": {
    scrapeOptions: {},
    internalOptions: { forceEngine: "fire-engine;tlsclient" },
  },
  "lorealparis.hu": {
    scrapeOptions: {},
    internalOptions: { forceEngine: "fire-engine;tlsclient" },
  },
};

/**
 * Per-host scrape overrides, keyed by host name without a leading "www."
 * (the lookup in scrapeURL/index.ts strips that prefix before indexing).
 *
 * The table has a null prototype because it is indexed with a host name taken
 * from the requested URL. On an ordinary object, hosts such as "constructor"
 * or "__proto__" would resolve to inherited `Object.prototype` members instead
 * of `undefined`, and the caller's `!== undefined` check would treat them as
 * a real entry.
 */
export const urlSpecificParams: Record<string, UrlSpecificParams> = Object.assign(
  Object.create(null),
  hostOverrides,
);

0 comments on commit 5e2124c

Please sign in to comment.