Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 13 additions & 6 deletions packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -589,7 +589,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
this.failedRequestHandler = failedRequestHandler;
this.errorHandler = errorHandler;

if (requestHandlerTimeoutSecs) {
if (typeof requestHandlerTimeoutSecs === 'number') {
this.requestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;
} else {
this.requestHandlerTimeoutMillis = 60_000;
Expand Down Expand Up @@ -1337,11 +1337,18 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext

try {
request.state = RequestState.REQUEST_HANDLER;
await addTimeoutToPromise(
async () => this._runRequestHandler(crawlingContext),
this.requestHandlerTimeoutMillis,
`requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds (${request.id}).`,
);

// Only wrap in timeout if requestHandlerTimeoutMillis > 0
if (this.requestHandlerTimeoutMillis > 0) {
await addTimeoutToPromise(
async () => this._runRequestHandler(crawlingContext),
this.requestHandlerTimeoutMillis,
`Request handler safety timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds (${request.id}).`,
);
} else {
// No timeout wrapper - run directly
await this._runRequestHandler(crawlingContext);
}

await this._timeoutAndRetry(
async () => source.markRequestHandled(request!),
Expand Down
41 changes: 22 additions & 19 deletions packages/browser-crawler/src/internals/browser-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import type {
SkippedRequestCallback,
} from '@crawlee/basic';
import {
BASIC_CRAWLER_TIMEOUT_BUFFER_SECS,
// BASIC_CRAWLER_TIMEOUT_BUFFER_SECS,
BasicCrawler,
BLOCKED_STATUS_CODES as DEFAULT_BLOCKED_STATUS_CODES,
Configuration,
Expand Down Expand Up @@ -332,8 +332,7 @@ export abstract class BrowserCrawler<
{
...basicCrawlerOptions,
requestHandler: async (...args) => this._runRequestHandler(...(args as [Context])),
requestHandlerTimeoutSecs:
navigationTimeoutSecs + requestHandlerTimeoutSecs + BASIC_CRAWLER_TIMEOUT_BUFFER_SECS,
requestHandlerTimeoutSecs: 0, // Disable outer request handler timeout after modifying the basic crawler wrapper
},
config,
);
Expand Down Expand Up @@ -556,32 +555,36 @@ export abstract class BrowserCrawler<
}

protected async _handleNavigation(crawlingContext: Context) {
const gotoOptions = { timeout: this.navigationTimeoutMillis } as unknown as GoToOptions;
const runNavigationPhase = async () => {
const gotoOptions = { timeout: this.navigationTimeoutMillis } as unknown as GoToOptions;

const preNavigationHooksCookies = this._getCookieHeaderFromRequest(crawlingContext.request);

crawlingContext.request.state = RequestState.BEFORE_NAV;
await this._executeHooks(this.preNavigationHooks, crawlingContext, gotoOptions);
tryCancel();
const preNavigationHooksCookies = this._getCookieHeaderFromRequest(crawlingContext.request);
crawlingContext.request.state = RequestState.BEFORE_NAV;
await this._executeHooks(this.preNavigationHooks, crawlingContext, gotoOptions);
tryCancel();

const postNavigationHooksCookies = this._getCookieHeaderFromRequest(crawlingContext.request);
const postNavigationHooksCookies = this._getCookieHeaderFromRequest(crawlingContext.request);
await this._applyCookies(crawlingContext, preNavigationHooksCookies, postNavigationHooksCookies);

await this._applyCookies(crawlingContext, preNavigationHooksCookies, postNavigationHooksCookies);
crawlingContext.response = (await this._navigationHandler(crawlingContext, gotoOptions)) ?? undefined;
tryCancel();

crawlingContext.request.state = RequestState.AFTER_NAV;
await this._executeHooks(this.postNavigationHooks, crawlingContext, gotoOptions);
};
try {
crawlingContext.response = (await this._navigationHandler(crawlingContext, gotoOptions)) ?? undefined;
await addTimeoutToPromise(
runNavigationPhase,
this.navigationTimeoutMillis,
// This message will never be seen, but is required by addTimeoutToPromise
`Navigation timed out after ${this.navigationTimeoutMillis} ms`,
);
} catch (error) {
await this._handleNavigationTimeout(crawlingContext, error as Error);

crawlingContext.request.state = RequestState.ERROR;

await this._handleNavigationTimeout(crawlingContext, error as Error); // marks session bad and closes page
this._throwIfProxyError(error as Error);
throw error;
}
tryCancel();

crawlingContext.request.state = RequestState.AFTER_NAV;
await this._executeHooks(this.postNavigationHooks, crawlingContext, gotoOptions);
}

protected async _applyCookies(
Expand Down
78 changes: 47 additions & 31 deletions packages/http-crawler/src/internals/http-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import type {
Session,
} from '@crawlee/basic';
import {
BASIC_CRAWLER_TIMEOUT_BUFFER_SECS,
// BASIC_CRAWLER_TIMEOUT_BUFFER_SECS,
BasicCrawler,
Configuration,
CrawlerExtension,
Expand Down Expand Up @@ -391,10 +391,7 @@ export class HttpCrawler<
...basicCrawlerOptions,
requestHandler,
autoscaledPoolOptions,
// We need to add some time for internal functions to finish,
// but not too much so that we would stall the crawler.
requestHandlerTimeoutSecs:
navigationTimeoutSecs + requestHandlerTimeoutSecs + BASIC_CRAWLER_TIMEOUT_BUFFER_SECS,
requestHandlerTimeoutSecs: 0, // Disable request handler timeout after modifying the basic crawler wrapper
},
config,
);
Expand Down Expand Up @@ -579,32 +576,51 @@ export class HttpCrawler<
}

protected async _handleNavigation(crawlingContext: Context) {
const gotOptions = {} as OptionsInit;
const { request, session } = crawlingContext;
const preNavigationHooksCookies = this._getCookieHeaderFromRequest(request);

request.state = RequestState.BEFORE_NAV;
// Execute pre navigation hooks before applying session pool cookies,
// as they may also set cookies in the session
await this._executeHooks(this.preNavigationHooks, crawlingContext, gotOptions);
tryCancel();

const postNavigationHooksCookies = this._getCookieHeaderFromRequest(request);

this._applyCookies(crawlingContext, gotOptions, preNavigationHooksCookies, postNavigationHooksCookies);

const proxyUrl = crawlingContext.proxyInfo?.url;

crawlingContext.response = await addTimeoutToPromise(
async () => this._requestFunction({ request, session, proxyUrl, gotOptions }),
this.navigationTimeoutMillis,
`request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`,
);
tryCancel();

request.state = RequestState.AFTER_NAV;
await this._executeHooks(this.postNavigationHooks, crawlingContext, gotOptions);
tryCancel();
try {
// Wrapped the navigation hooks and navigation in the same timeout
await addTimeoutToPromise(
async () => {
const gotOptions = {} as OptionsInit;
const { request, session } = crawlingContext;
const preNavigationHooksCookies = this._getCookieHeaderFromRequest(request);

request.state = RequestState.BEFORE_NAV;
// Execute pre navigation hooks before applying session pool cookies,
// as they may also set cookies in the session
await this._executeHooks(this.preNavigationHooks, crawlingContext, gotOptions);
tryCancel();

const postNavigationHooksCookies = this._getCookieHeaderFromRequest(request);

this._applyCookies(
crawlingContext,
gotOptions,
preNavigationHooksCookies,
postNavigationHooksCookies,
);

const proxyUrl = crawlingContext.proxyInfo?.url;

crawlingContext.response =
(await this._requestFunction({
request,
session,
proxyUrl,
gotOptions,
})) ?? undefined;
tryCancel();

request.state = RequestState.AFTER_NAV;
await this._executeHooks(this.postNavigationHooks, crawlingContext, gotOptions);
tryCancel();
},
this.navigationTimeoutMillis, // default amount may need to be adjusted to accommodate the hooks
`Navigation timed out after ${this.navigationTimeoutMillis / 1000} seconds.`,
);
} catch (e: any) {
crawlingContext.request.state = RequestState.ERROR;
throw e;
}
}

/**
Expand Down
143 changes: 143 additions & 0 deletions packages/http-crawler/test/timeout.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
import { expect, test } from 'vitest';
import { TimeoutError } from '@apify/timeout';
import { HttpCrawler } from '@crawlee/http';

// Resolve after `ms` milliseconds — a promisified setTimeout.
const sleep = async (ms: number): Promise<void> => {
    await new Promise<void>((resolve) => setTimeout(resolve, ms));
};

// Read the run statistics off the crawler's (private) stats object.
// Tolerates both serialization shapes seen across crawlee versions:
// `toJSON()` and `getState()`; falls back to an empty object.
function getStats(crawler: HttpCrawler<any>) {
    const stats = (crawler as any).stats;
    if (stats?.toJSON) return stats.toJSON();
    if (stats?.getState) return stats.getState();
    return {};
}

// NOTE(review): hits the live network (example.com) — a local HTTP server
// would make this deterministic; confirm CI permits outbound requests.
test('Navigation timeout (preNavigationHooks delay)', async () => {
    let reachedHandler = false;
    let failureCount = 0;
    let capturedError: Error | undefined;

    const crawler = new HttpCrawler({
        maxRequestRetries: 0,
        navigationTimeoutSecs: 1,
        // The hook outlasts the 1 s navigation budget, so the navigation
        // timeout must fire before the request handler is ever reached.
        preNavigationHooks: [async () => sleep(1500)],
        requestHandler: async () => {
            reachedHandler = true;
        },
        failedRequestHandler: async (_ctx, error) => {
            failureCount += 1;
            capturedError = error as Error;
        },
    });

    const startedAt = Date.now();
    await crawler.run([`https://example.com/?http_nav=${Date.now()}`]);
    const elapsed = Date.now() - startedAt;
    const stats = getStats(crawler);

    expect(reachedHandler).toBe(false);
    expect(failureCount).toBe(1);
    expect(stats.requestsFailed).toBe(1);
    expect(capturedError).toBeInstanceOf(TimeoutError);
    expect(capturedError!.message).toMatch(/^Navigation timed out after 1/);
    expect(elapsed).toBeLessThan(4000); // proves early cut-off
});

// Same navigation budget, but the delay comes AFTER the response arrives:
// the post-navigation hooks must be covered by the navigation timeout too.
test('Navigation timeout via postNavigationHooks delay', async () => {
    let reachedHandler = false;
    let failureCount = 0;
    let capturedError: Error | undefined;

    const crawler = new HttpCrawler({
        maxRequestRetries: 0,
        navigationTimeoutSecs: 1,
        postNavigationHooks: [async () => sleep(1_500)],
        requestHandler: async () => {
            reachedHandler = true;
        },
        failedRequestHandler: async (_ctx, error) => {
            failureCount += 1;
            capturedError = error as Error;
        },
    });

    await crawler.run([`https://example.com/?http_post=${Date.now()}`]);
    const stats = getStats(crawler);

    expect(reachedHandler).toBe(false);
    expect(failureCount).toBe(1);
    expect(stats.requestsFailed).toBe(1);
    expect(capturedError).toBeInstanceOf(TimeoutError);
    expect((capturedError as Error).message).toMatch(/Navigation timed out/);
});

// The handler is entered (navigation succeeded) but overruns its own 2 s
// budget; the failure must be a handler timeout, not a navigation timeout.
test('Request handler timeout (post-navigation)', async () => {
    let enteredHandler = false;
    let failureCount = 0;
    let capturedError: Error | undefined;

    const crawler = new HttpCrawler({
        maxRequestRetries: 0,
        navigationTimeoutSecs: 10,
        requestHandlerTimeoutSecs: 2,
        requestHandler: async () => {
            enteredHandler = true;
            await sleep(12_000); // far beyond the 2 s handler budget
        },
        failedRequestHandler: async (_ctx, error) => {
            failureCount += 1;
            capturedError = error as Error;
        },
    });

    const startedAt = Date.now();
    await crawler.run([`https://example.com/?http_handler=${Date.now()}`]);
    const elapsed = Date.now() - startedAt;
    const stats = getStats(crawler);

    expect(enteredHandler).toBe(true);
    expect(failureCount).toBe(1);
    expect(stats.requestsFailed).toBe(1);
    expect(capturedError).toBeInstanceOf(TimeoutError);
    expect(capturedError!.message).toMatch(/requestHandler timed out after 2 seconds\./i);
    expect(capturedError!.message).not.toMatch(/Navigation timed out/i);
    expect(elapsed).toBeLessThan(6000);
});

// Happy path: both phases finish inside their respective budgets, so the
// request must complete with no failures recorded.
test('Succeeds when under both timeouts', async () => {
    let handlerRan = false;
    let capturedError: Error | undefined;

    const crawler = new HttpCrawler({
        maxRequestRetries: 0,
        navigationTimeoutSecs: 2,
        requestHandlerTimeoutSecs: 2,
        preNavigationHooks: [async () => sleep(300)], // well under 2 s
        requestHandler: async () => {
            handlerRan = true;
            await sleep(400); // well under 2 s
        },
        failedRequestHandler: async (_ctx, error) => {
            capturedError = error as Error;
        },
    });

    await crawler.run([`https://example.com/?http_ok=${Date.now()}`]);
    const stats = getStats(crawler);

    expect(handlerRan).toBe(true);
    expect(stats.requestsFinished).toBe(1);
    expect(stats.requestsFailed || 0).toBe(0);
    expect(capturedError).toBeUndefined();
});
Loading