Skip to content

Commit 10ed6f8

Browse files
authored
Merge pull request #44 from peterbencze/development
Serritor 1.6.0
2 parents 63d5c3f + 9596d3d commit 10ed6f8

15 files changed

+463
-35
lines changed

.travis.yml

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
language: java
2+
jdk:
3+
- oraclejdk8
4+
- openjdk8
5+
install: true
6+
script: mvn clean verify -Dgpg.skip
7+
cache:
8+
directories:
9+
- $HOME/.m2

README.md

+9-4
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
Serritor
22
========
33

4-
[![Support via PayPal](https://cdn.rawgit.com/twolfson/paypal-github-button/1.0.0/dist/button.svg)](https://paypal.me/peterbencze)
5-
64
Serritor is an open source web crawler framework built upon [Selenium](http://www.seleniumhq.org/) and written in Java. It can be used to crawl dynamic web pages that use JavaScript.
75

86
## Using Serritor in your build
@@ -13,15 +11,15 @@ Add the following dependency to your pom.xml:
1311
<dependency>
1412
<groupId>com.github.peterbencze</groupId>
1513
<artifactId>serritor</artifactId>
16-
<version>1.5.0</version>
14+
<version>1.6.0</version>
1715
</dependency>
1816
```
1917

2018
### Gradle
2119

2220
Add the following dependency to your build.gradle:
2321
```groovy
24-
compile group: 'com.github.peterbencze', name: 'serritor', version: '1.5.0'
22+
compile group: 'com.github.peterbencze', name: 'serritor', version: '1.6.0'
2523
```
2624

2725
### Manual dependencies
@@ -93,5 +91,12 @@ crawler.start(new ChromeDriver());
9391

9492
That's it! In just a few lines you can create a crawler that crawls every link it finds, while filtering duplicate and offsite requests. You also get access to the `WebDriver` instance, so you can use all the features that are provided by Selenium.
9593

94+
## Support
95+
If this framework has helped you in any way, or you would like to support its development:
96+
97+
[![Support via PayPal](https://cdn.rawgit.com/twolfson/paypal-github-button/1.0.0/dist/button.svg)](https://paypal.me/peterbencze)
98+
99+
Any amount you choose to give will be greatly appreciated.
100+
96101
## License
97102
The source code of Serritor is made available under the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0).

pom.xml

+10-7
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<modelVersion>4.0.0</modelVersion>
44
<groupId>com.github.peterbencze</groupId>
55
<artifactId>serritor</artifactId>
6-
<version>1.5.0</version>
6+
<version>1.6.0</version>
77
<packaging>jar</packaging>
88

99
<name>Serritor</name>
@@ -59,12 +59,12 @@
5959
<dependency>
6060
<groupId>org.seleniumhq.selenium</groupId>
6161
<artifactId>htmlunit-driver</artifactId>
62-
<version>2.32.1</version>
62+
<version>2.33.0</version>
6363
</dependency>
6464
<dependency>
6565
<groupId>com.google.guava</groupId>
6666
<artifactId>guava</artifactId>
67-
<version>26.0-jre</version>
67+
<version>27.0-jre</version>
6868
</dependency>
6969
<dependency>
7070
<groupId>junit</groupId>
@@ -75,7 +75,7 @@
7575
<dependency>
7676
<groupId>org.mockito</groupId>
7777
<artifactId>mockito-core</artifactId>
78-
<version>2.18.3</version>
78+
<version>2.23.0</version>
7979
<scope>test</scope>
8080
</dependency>
8181
<dependency>
@@ -87,7 +87,7 @@
8787
<dependency>
8888
<groupId>com.github.tomakehurst</groupId>
8989
<artifactId>wiremock</artifactId>
90-
<version>2.18.0</version>
90+
<version>2.19.0</version>
9191
<scope>test</scope>
9292
</dependency>
9393
</dependencies>
@@ -134,7 +134,7 @@
134134
<dependency>
135135
<groupId>com.puppycrawl.tools</groupId>
136136
<artifactId>checkstyle</artifactId>
137-
<version>8.11</version>
137+
<version>8.14</version>
138138
</dependency>
139139
</dependencies>
140140
<configuration>
@@ -152,7 +152,10 @@
152152
<plugin>
153153
<groupId>org.apache.maven.plugins</groupId>
154154
<artifactId>maven-failsafe-plugin</artifactId>
155-
<version>2.22.0</version>
155+
<version>2.22.1</version>
156+
<configuration>
157+
<argLine>-Djdk.net.URLClassPath.disableClassPathURLCheck=true</argLine>
158+
</configuration>
156159
<executions>
157160
<execution>
158161
<goals>

src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java

+41-9
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
import com.gargoylesoftware.htmlunit.WebClient;
2020
import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder;
21+
import com.github.peterbencze.serritor.api.event.CrawlEvent;
2122
import com.github.peterbencze.serritor.api.event.NonHtmlContentEvent;
2223
import com.github.peterbencze.serritor.api.event.PageLoadEvent;
2324
import com.github.peterbencze.serritor.api.event.PageLoadTimeoutEvent;
@@ -30,6 +31,7 @@
3031
import com.github.peterbencze.serritor.internal.crawldelaymechanism.CrawlDelayMechanism;
3132
import com.github.peterbencze.serritor.internal.crawldelaymechanism.FixedCrawlDelayMechanism;
3233
import com.github.peterbencze.serritor.internal.crawldelaymechanism.RandomCrawlDelayMechanism;
34+
import com.github.peterbencze.serritor.internal.event.EventCallbackManager;
3335
import java.io.File;
3436
import java.io.IOException;
3537
import java.io.InputStream;
@@ -72,6 +74,7 @@ public abstract class BaseCrawler {
7274
private static final Logger LOGGER = Logger.getLogger(BaseCrawler.class.getName());
7375

7476
private CrawlerConfiguration config;
77+
private EventCallbackManager callbackManager;
7578
private CrawlFrontier crawlFrontier;
7679
private BasicCookieStore cookieStore;
7780
private CloseableHttpClient httpClient;
@@ -108,6 +111,16 @@ protected BaseCrawler(final InputStream inStream) {
108111
* Private base constructor which does simple initialization.
109112
*/
110113
private BaseCrawler() {
114+
callbackManager = new EventCallbackManager();
115+
callbackManager.setDefaultEventCallback(CrawlEvent.PAGE_LOAD, this::onPageLoad);
116+
callbackManager.setDefaultEventCallback(CrawlEvent.NON_HTML_CONTENT,
117+
this::onNonHtmlContent);
118+
callbackManager.setDefaultEventCallback(CrawlEvent.PAGE_LOAD_TIMEOUT,
119+
this::onPageLoadTimeout);
120+
callbackManager.setDefaultEventCallback(CrawlEvent.REQUEST_REDIRECT,
121+
this::onRequestRedirect);
122+
callbackManager.setDefaultEventCallback(CrawlEvent.REQUEST_ERROR, this::onRequestError);
123+
111124
isStopping = false;
112125
isStopped = true;
113126
}
@@ -208,6 +221,22 @@ public final void resumeState(final WebDriver webDriver) {
208221
start(webDriver, true);
209222
}
210223

224+
/**
225+
* Registers an operation which is invoked when the specified event occurs and the provided
226+
* pattern matches the request URL.
227+
*
228+
* @param event the event for which the callback should be triggered
229+
* @param callback the pattern matching callback to invoke
230+
*/
231+
protected final void registerCustomEventCallback(
232+
final CrawlEvent event,
233+
final PatternMatchingCallback callback) {
234+
Validate.notNull(event, "The event cannot be null.");
235+
Validate.notNull(callback, "The callback cannot be null.");
236+
237+
callbackManager.addCustomEventCallback(event, callback);
238+
}
239+
211240
/**
212241
* Gracefully stops the crawler.
213242
*/
@@ -284,7 +313,8 @@ private void run() {
284313
// Send an HTTP HEAD request to determine its availability and content type
285314
httpHeadResponse = getHttpHeadResponse(candidateUrl, context);
286315
} catch (IOException exception) {
287-
onRequestError(new RequestErrorEvent(currentCandidate, exception));
316+
callbackManager.call(CrawlEvent.REQUEST_ERROR,
317+
new RequestErrorEvent(currentCandidate, exception));
288318
isUnsuccessfulRequest = true;
289319
}
290320

@@ -294,35 +324,36 @@ private void run() {
294324
String responseMimeType = getResponseMimeType(httpHeadResponse);
295325
if (responseMimeType.equals(ContentType.TEXT_HTML.getMimeType())) {
296326
boolean isTimedOut = false;
297-
TimeoutException requestTimeoutException = null;
327+
TimeoutException timeoutException = null;
298328

299329
try {
300330
// Open URL in browser
301331
webDriver.get(candidateUrl);
302332
} catch (TimeoutException exception) {
303333
isTimedOut = true;
304-
requestTimeoutException = exception;
334+
timeoutException = exception;
305335
}
306336

307337
// Ensure the HTTP client and Selenium have the same state
308338
syncHttpClientCookies();
309339

310340
if (isTimedOut) {
311-
onPageLoadTimeout(new PageLoadTimeoutEvent(currentCandidate,
312-
requestTimeoutException));
341+
callbackManager.call(CrawlEvent.PAGE_LOAD_TIMEOUT,
342+
new PageLoadTimeoutEvent(currentCandidate, timeoutException));
313343
} else {
314344
String loadedPageUrl = webDriver.getCurrentUrl();
315345
if (!loadedPageUrl.equals(candidateUrl)) {
316346
// Create a new crawl request for the redirected URL (JS redirect)
317347
handleRequestRedirect(currentCandidate, loadedPageUrl);
318348
} else {
319-
onPageLoad(new PageLoadEvent(currentCandidate, webDriver));
349+
callbackManager.call(CrawlEvent.PAGE_LOAD,
350+
new PageLoadEvent(currentCandidate, webDriver));
320351
}
321352
}
322353
} else {
323354
// URLs that point to non-HTML content should not be opened in the browser
324-
onNonHtmlContent(new NonHtmlContentEvent(currentCandidate,
325-
responseMimeType));
355+
callbackManager.call(CrawlEvent.NON_HTML_CONTENT,
356+
new NonHtmlContentEvent(currentCandidate, responseMimeType));
326357
}
327358
} else {
328359
// Create a new crawl request for the redirected URL
@@ -432,7 +463,8 @@ private void handleRequestRedirect(
432463
CrawlRequest redirectedRequest = builder.build();
433464

434465
crawlFrontier.feedRequest(redirectedRequest, false);
435-
onRequestRedirect(new RequestRedirectEvent(currentCrawlCandidate, redirectedRequest));
466+
callbackManager.call(CrawlEvent.REQUEST_REDIRECT,
467+
new RequestRedirectEvent(currentCrawlCandidate, redirectedRequest));
436468
}
437469

438470
/**
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
/*
2+
* Copyright 2018 Peter Bencze.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.github.peterbencze.serritor.api;
18+
19+
import com.github.peterbencze.serritor.internal.event.EventObject;
20+
import java.util.function.Consumer;
21+
import java.util.regex.Pattern;
22+
import org.apache.commons.lang3.Validate;
23+
24+
/**
25+
* Represents an operation which is invoked when the specified regex pattern matches the request
26+
* URL.
27+
*
28+
* @author Peter Bencze
29+
*/
30+
public final class PatternMatchingCallback {
31+
32+
private final Pattern urlPattern;
33+
private final Consumer<? extends EventObject> callback;
34+
35+
/**
36+
* Creates a {@link PatternMatchingCallback} instance.
37+
*
38+
* @param <T> the type of the input to the operation
39+
* @param urlPattern the regex pattern used for matching on request URLs
40+
* @param callback the operation to be performed when the pattern matches
41+
*/
42+
public <T extends EventObject> PatternMatchingCallback(
43+
final Pattern urlPattern,
44+
final Consumer<T> callback) {
45+
Validate.notNull(urlPattern, "The pattern cannot be null.");
46+
Validate.notNull(callback, "The callback cannot be null.");
47+
48+
this.urlPattern = urlPattern;
49+
this.callback = callback;
50+
}
51+
52+
/**
53+
* Returns the regex pattern used for matching on request URLs.
54+
*
55+
* @return the regex pattern used for matching
56+
*/
57+
public Pattern getUrlPattern() {
58+
return urlPattern;
59+
}
60+
61+
/**
62+
* Returns the operation to be performed when the pattern matches.
63+
*
64+
* @param <T> the type of the input to the operation
65+
*
66+
* @return the operation to be performed when the pattern matches
67+
*/
68+
public <T extends EventObject> Consumer<T> getCallback() {
69+
return (Consumer<T>) callback;
70+
}
71+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
/*
2+
* Copyright 2018 Peter Bencze.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.github.peterbencze.serritor.api.event;
18+
19+
/**
20+
* Represents the events that can occur during crawling.
21+
*
22+
* @author Peter Bencze
23+
*/
24+
public enum CrawlEvent {
25+
26+
/**
27+
* Event which gets triggered when the browser loads the page.
28+
*/
29+
PAGE_LOAD,
30+
/**
31+
* Event which gets triggered when the MIME type of the response is not "text/html".
32+
*/
33+
NON_HTML_CONTENT,
34+
/**
35+
* Event which gets triggered when a page does not load in the browser within the timeout
36+
* period.
37+
*/
38+
PAGE_LOAD_TIMEOUT,
39+
/**
40+
* Event which gets triggered when a request is redirected.
41+
*/
42+
REQUEST_REDIRECT,
43+
/**
44+
* Event which gets triggered when a request error occurs.
45+
*/
46+
REQUEST_ERROR
47+
}

src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
package com.github.peterbencze.serritor.api.event;
1818

1919
import com.github.peterbencze.serritor.api.CrawlCandidate;
20-
import com.github.peterbencze.serritor.internal.EventObject;
20+
import com.github.peterbencze.serritor.internal.event.EventObject;
2121

2222
/**
2323
* Event which gets delivered when the MIME type of the response is not "text/html".

src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
package com.github.peterbencze.serritor.api.event;
1818

1919
import com.github.peterbencze.serritor.api.CrawlCandidate;
20-
import com.github.peterbencze.serritor.internal.EventObject;
20+
import com.github.peterbencze.serritor.internal.event.EventObject;
2121
import org.openqa.selenium.WebDriver;
2222

2323
/**

src/main/java/com/github/peterbencze/serritor/api/event/PageLoadTimeoutEvent.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
package com.github.peterbencze.serritor.api.event;
1818

1919
import com.github.peterbencze.serritor.api.CrawlCandidate;
20-
import com.github.peterbencze.serritor.internal.EventObject;
20+
import com.github.peterbencze.serritor.internal.event.EventObject;
2121
import org.openqa.selenium.TimeoutException;
2222

2323
/**

src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
package com.github.peterbencze.serritor.api.event;
1818

1919
import com.github.peterbencze.serritor.api.CrawlCandidate;
20-
import com.github.peterbencze.serritor.internal.EventObject;
20+
import com.github.peterbencze.serritor.internal.event.EventObject;
2121
import java.io.IOException;
2222

2323
/**

0 commit comments

Comments
 (0)