Skip to content

Commit

Permalink
Merge branch 'mog/webscraper-refactor' of https://github.com/mendable…
Browse files Browse the repository at this point in the history
…ai/firecrawl into mog/webscraper-refactor
  • Loading branch information
nickscamara committed Nov 7, 2024
2 parents a02c42a + 7198a28 commit f9e775a
Show file tree
Hide file tree
Showing 5 changed files with 17 additions and 9 deletions.
2 changes: 1 addition & 1 deletion apps/api/src/controllers/v1/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -509,7 +509,7 @@ export function fromLegacyScrapeOptions(pageOptions: PageOptions, extractorOptio
v0DisableJsDom: pageOptions.disableJsDom,
v0UseFastMode: pageOptions.useFastMode,
},
// TODO: fallback, fetchPage Content, replaceAllPathsWithAbsolutePaths, includeLinks
// TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks
}
}

Expand Down
6 changes: 2 additions & 4 deletions apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,7 @@ export async function scrapeURLWithFireEngineChromeCDP(meta: Meta): Promise<Engi
priority: meta.internalOptions.priority,
geolocation: meta.options.geolocation,
mobile: meta.options.mobile,
removeBase64Images: meta.options.removeBase64Images,
// TODO: scrollXPaths, disableJsDom
// TODO: scrollXPaths
};

let response = await performFireEngineScrape(
Expand Down Expand Up @@ -137,7 +136,7 @@ export async function scrapeURLWithFireEnginePlaywright(meta: Meta): Promise<Eng
screenshot: meta.options.formats.includes("screenshot"),
fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"),
wait: meta.options.waitFor,
removeBase64Images: meta.options.removeBase64Images,
geolocation: meta.options.geolocation,
};

let response = await performFireEngineScrape(
Expand Down Expand Up @@ -175,7 +174,6 @@ export async function scrapeURLWithFireEngineTLSClient(meta: Meta): Promise<Engi

atsv: meta.internalOptions.atsv,
geolocation: meta.options.geolocation,
removeBase64Images: meta.options.removeBase64Images,
disableJsDom: meta.internalOptions.v0DisableJsDom,
};

Expand Down
5 changes: 1 addition & 4 deletions apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,14 @@ export type FireEngineScrapeRequestCommon = {
// team_id?: string; // unused
logRequest?: boolean; // default: true
instantReturn?: boolean; // default: false

removeBase64Images?: boolean;
geolocation?: { country?: string; languages?: string[]; };
}

export type FireEngineScrapeRequestChromeCDP = {
engine: "chrome-cdp";
skipTlsVerification?: boolean;
actions?: Action[];
blockMedia?: true; // cannot be false
geolocation?: { country?: string; languages?: string[]; };
mobile?: boolean;
};

Expand All @@ -51,7 +49,6 @@ export type FireEngineScrapeRequestPlaywright = {
export type FireEngineScrapeRequestTLSClient = {
engine: "tlsclient";
atsv?: boolean; // v0 only, default: false
geolocation?: { country?: string; languages?: string[]; };
disableJsDom?: boolean; // v0 only, default: false
// blockAds?: boolean; // default: true
};
Expand Down
2 changes: 2 additions & 0 deletions apps/api/src/scraper/scrapeURL/transformers/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import { extractLinks } from "../lib/extractLinks";
import { extractMetadata } from "../lib/extractMetadata";
import { performLLMExtract } from "./llmExtract";
import { uploadScreenshot } from "./uploadScreenshot";
import { removeBase64Images } from "./removeBase64Images";

export type Transformer = (meta: Meta, document: Document) => Document | Promise<Document>;

Expand Down Expand Up @@ -110,6 +111,7 @@ export const transformerStack: Transformer[] = [
uploadScreenshot,
performLLMExtract,
coerceFieldsToFormats,
removeBase64Images,
];

export async function executeTransformers(meta: Meta, document: Document): Promise<Document> {
Expand Down
11 changes: 11 additions & 0 deletions apps/api/src/scraper/scrapeURL/transformers/removeBase64Images.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import { Meta } from "..";
import { Document } from "../../../controllers/v1/types";

/**
 * Matches a markdown image whose target is an inline base64 data URI:
 * `![alt](data:image/<media-type>;base64,<payload>)`.
 *
 * Group 1 captures the `![alt]` prefix so it can be re-emitted unchanged.
 * The media-type segment is limited to `[^,)\r\n]*` so it can never run past
 * the data URI's comma or the closing parenthesis of the link. Without that
 * bound, a non-base64 `data:image/...` URI earlier on the same line would be
 * glued to a later base64 image and everything in between would be dropped.
 */
const base64ImagePattern = /(!\[.*?\])\(data:image\/[^,)\r\n]*;base64,.*?\)/g;

/**
 * Transformer that replaces inline base64 image payloads in the document's
 * markdown with the placeholder `<Base64-Image-Removed>`, keeping alt text.
 *
 * Only runs when `meta.options.removeBase64Images` is truthy and the document
 * has a `markdown` field; otherwise the document is returned untouched.
 *
 * @param meta - Scrape context; only `meta.options.removeBase64Images` is read.
 * @param document - Document being transformed; `markdown` is updated in place.
 * @returns The same `document` instance that was passed in.
 */
export function removeBase64Images(meta: Meta, document: Document): Document {
  if (!meta.options.removeBase64Images || document.markdown === undefined) {
    return document;
  }

  document.markdown = document.markdown.replace(
    base64ImagePattern,
    '$1(<Base64-Image-Removed>)',
  );
  return document;
}

0 comments on commit f9e775a

Please sign in to comment.