Skip to content

Commit

Permalink
fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
mogery committed Nov 6, 2024
1 parent 8616fe6 commit 66a6f91
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 9 deletions.
4 changes: 3 additions & 1 deletion apps/api/src/controllers/v1/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -505,8 +505,10 @@ export function fromLegacyScrapeOptions(pageOptions: PageOptions, extractorOptio
}),
internalOptions: {
atsv: pageOptions.atsv,
v0DisableJsDom: pageOptions.disableJsDom,
v0UseFastMode: pageOptions.useFastMode,
},
// TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks, useFastMode, disableJsDom
// TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ export async function scrapeURLWithFireEngineTLSClient(meta: Meta): Promise<Engi
atsv: meta.internalOptions.atsv,
geolocation: meta.options.geolocation,
removeBase64Images: meta.options.removeBase64Images,
disableJsDom: meta.internalOptions.v0DisableJsDom,
};

let response = await performFireEngineScrape(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ export type FireEngineScrapeRequestTLSClient = {
engine: "tlsclient";
atsv?: boolean; // v0 only, default: false
geolocation?: { country?: string; languages?: string[]; };
disableJsDom?: boolean; // v0 only, default: false
// blockAds?: boolean; // default: true
};

Expand Down
17 changes: 11 additions & 6 deletions apps/api/src/scraper/scrapeURL/engines/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ export const featureFlags = [
"location",
"mobile",
"skipTlsVerification",
"useFastMode",
] as const;

/**
 * Union of the string literals contained in the `featureFlags` tuple.
 * Derived from the value via an indexed access type, so it follows the
 * entries of that array without being redeclared.
 */
export type FeatureFlag = (typeof featureFlags)[number];
Expand All @@ -49,6 +50,7 @@ export const featureFlagOptions: {
"pdf": { priority: 100 },
"docx": { priority: 100 },
"atsv": { priority: 90 }, // NOTE: should atsv force to tlsclient? adjust priority if not
"useFastMode": { priority: 90 },
"location": { priority: 10 },
"mobile": { priority: 10 },
"skipTlsVerification": { priority: 10 },
Expand Down Expand Up @@ -105,6 +107,7 @@ export const engineOptions: {
"location": true,
"mobile": true,
"skipTlsVerification": true,
"useFastMode": false,
},
quality: 50,
},
Expand All @@ -120,6 +123,7 @@ export const engineOptions: {
"location": false,
"mobile": false,
"skipTlsVerification": false,
"useFastMode": false,
},
quality: 40,
},
Expand All @@ -135,7 +139,7 @@ export const engineOptions: {
"location": false,
"mobile": false,
"skipTlsVerification": false,

"useFastMode": false,
},
quality: 30,
},
Expand All @@ -151,7 +155,7 @@ export const engineOptions: {
"location": false,
"mobile": false,
"skipTlsVerification": false,

"useFastMode": false,
},
quality: 29,
},
Expand All @@ -167,7 +171,7 @@ export const engineOptions: {
"location": false,
"mobile": false,
"skipTlsVerification": false,

"useFastMode": false,
},
quality: 20,
},
Expand All @@ -183,7 +187,7 @@ export const engineOptions: {
"location": true,
"mobile": false,
"skipTlsVerification": false,

"useFastMode": true,
},
quality: 10,
},
Expand All @@ -199,7 +203,7 @@ export const engineOptions: {
"location": false,
"mobile": false,
"skipTlsVerification": false,

"useFastMode": true,
},
quality: 5,
},
Expand All @@ -215,7 +219,7 @@ export const engineOptions: {
"location": false,
"mobile": false,
"skipTlsVerification": false,

"useFastMode": true,
},
quality: -10,
},
Expand All @@ -231,6 +235,7 @@ export const engineOptions: {
"location": false,
"mobile": false,
"skipTlsVerification": false,
"useFastMode": true,
},
quality: -10,
},
Expand Down
15 changes: 13 additions & 2 deletions apps/api/src/scraper/scrapeURL/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ function buildFeatureFlags(url: string, options: ScrapeOptions, internalOptions:
flags.add("skipTlsVerification");
}

if (internalOptions.v0UseFastMode) {
flags.add("useFastMode");
}

const urlO = new URL(url);

if (urlO.pathname.endsWith(".pdf")) {
Expand All @@ -78,6 +82,11 @@ function buildFeatureFlags(url: string, options: ScrapeOptions, internalOptions:
return flags;
}

// The meta object contains all required information to perform a scrape.
// For example, the scrape ID, URL, options, feature flags, logs that occur while scraping.
// The meta object is usually immutable, except for the logs array, and in edge cases (e.g. a new feature is suddenly required)
// Having a meta object that is treated as immutable helps the code stay clean and easily traceable,
// while also retaining the benefits that WebScraper had from its OOP design.
function buildMetaObject(id: string, url: string, options: ScrapeOptions, internalOptions: InternalOptions): Meta {
const specParams = urlSpecificParams[new URL(url).hostname.replace(/^www\./, "")];
if (specParams !== undefined) {
Expand All @@ -103,6 +112,8 @@ export type InternalOptions = {
atsv?: boolean; // anti-bot solver, beta

v0CrawlOnlyUrls?: boolean;
v0UseFastMode?: boolean;
v0DisableJsDom?: boolean;
};

export type EngineResultsTracker = { [E in Engine]?: {
Expand Down Expand Up @@ -147,8 +158,8 @@ async function scrapeURLLoop(
const engineResult = _engineResult as EngineScrapeResult & { markdown: string };

// Success factors
const isLongEnough = engineResult.markdown.length >= 1;
const isGoodStatusCode = engineResult.statusCode < 300;
const isLongEnough = engineResult.markdown.length >= 20;
const isGoodStatusCode = (engineResult.statusCode >= 200 && engineResult.statusCode < 300) || engineResult.statusCode === 304;
const hasNoPageError = engineResult.error === undefined;

results[engine] = {
Expand Down

0 comments on commit 66a6f91

Please sign in to comment.