Merge pull request #44 from peterbencze/development

peterbencze · web-flow · commit 10ed6f82c90f · 2018-11-04T19:26:23.000Z
Serritor 1.6.0
diff --git a/.travis.yml b/.travis.yml
@@ -0,0 +1,9 @@
+language: java
+jdk:
+  - oraclejdk8
+  - openjdk8
+install: true
+script: mvn clean verify -Dgpg.skip
+cache:
+  directories:
+  - $HOME/.m2
diff --git a/README.md b/README.md
@@ -1,8 +1,6 @@
 Serritor
 ========
 
-[![Support via PayPal](https://cdn.rawgit.com/twolfson/paypal-github-button/1.0.0/dist/button.svg)](https://paypal.me/peterbencze)
-
 Serritor is an open source web crawler framework built upon [Selenium](http://www.seleniumhq.org/) and written in Java. It can be used to crawl dynamic web pages that use JavaScript.
 
 ## Using Serritor in your build
@@ -13,15 +11,15 @@ Add the following dependency to your pom.xml:
 <dependency>
     <groupId>com.github.peterbencze</groupId>
     <artifactId>serritor</artifactId>
-    <version>1.5.0</version>
+    <version>1.6.0</version>
 </dependency>
 ```
 
 ### Gradle
 
 Add the following dependency to your build.gradle:
 ```groovy
-compile group: 'com.github.peterbencze', name: 'serritor', version: '1.5.0'
+compile group: 'com.github.peterbencze', name: 'serritor', version: '1.6.0'
 ```
 
 ### Manual dependencies
@@ -93,5 +91,12 @@ crawler.start(new ChromeDriver());
 
 That's it! In just a few lines you can create a crawler that crawls every link it finds, while filtering duplicate and offsite requests. You also get access to the `WebDriver` instance, so you can use all the features that are provided by Selenium.
 
+## Support
+If this framework has helped you in any way, or you would like to support its development:
+
+[![Support via PayPal](https://cdn.rawgit.com/twolfson/paypal-github-button/1.0.0/dist/button.svg)](https://paypal.me/peterbencze)
+
+Any amount you choose to give will be greatly appreciated.
+
 ## License
 The source code of Serritor is made available under the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0).
diff --git a/pom.xml b/pom.xml
@@ -3,7 +3,7 @@
     <modelVersion>4.0.0</modelVersion>
     <groupId>com.github.peterbencze</groupId>
     <artifactId>serritor</artifactId>
-    <version>1.5.0</version>
+    <version>1.6.0</version>
     <packaging>jar</packaging>
 
     <name>Serritor</name>
@@ -59,12 +59,12 @@
         <dependency>
             <groupId>org.seleniumhq.selenium</groupId>
             <artifactId>htmlunit-driver</artifactId>
-            <version>2.32.1</version>
+            <version>2.33.0</version>
         </dependency>
         <dependency>
             <groupId>com.google.guava</groupId>
             <artifactId>guava</artifactId>
-            <version>26.0-jre</version>
+            <version>27.0-jre</version>
         </dependency>
         <dependency>
             <groupId>junit</groupId>
@@ -75,7 +75,7 @@
         <dependency>
             <groupId>org.mockito</groupId>
             <artifactId>mockito-core</artifactId>
-            <version>2.18.3</version>
+            <version>2.23.0</version>
             <scope>test</scope>
         </dependency>
         <dependency>
@@ -87,7 +87,7 @@
         <dependency>
             <groupId>com.github.tomakehurst</groupId>
             <artifactId>wiremock</artifactId>
-            <version>2.18.0</version>
+            <version>2.19.0</version>
             <scope>test</scope>
         </dependency>
     </dependencies>
@@ -134,7 +134,7 @@
                     <dependency>
                         <groupId>com.puppycrawl.tools</groupId>
                         <artifactId>checkstyle</artifactId>
-                        <version>8.11</version>
+                        <version>8.14</version>
                     </dependency>
                 </dependencies>
                 <configuration>
@@ -152,7 +152,10 @@
             <plugin>
                 <groupId>org.apache.maven.plugins</groupId>
                 <artifactId>maven-failsafe-plugin</artifactId>
-                <version>2.22.0</version>
+                <version>2.22.1</version>
+                <configuration>
+                    <argLine>-Djdk.net.URLClassPath.disableClassPathURLCheck=true</argLine>
+                </configuration>
                 <executions>
                     <execution>
                         <goals>
diff --git a/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java b/src/main/java/com/github/peterbencze/serritor/api/BaseCrawler.java
@@ -18,6 +18,7 @@
 
 import com.gargoylesoftware.htmlunit.WebClient;
 import com.github.peterbencze.serritor.api.CrawlRequest.CrawlRequestBuilder;
+import com.github.peterbencze.serritor.api.event.CrawlEvent;
 import com.github.peterbencze.serritor.api.event.NonHtmlContentEvent;
 import com.github.peterbencze.serritor.api.event.PageLoadEvent;
 import com.github.peterbencze.serritor.api.event.PageLoadTimeoutEvent;
@@ -30,6 +31,7 @@
 import com.github.peterbencze.serritor.internal.crawldelaymechanism.CrawlDelayMechanism;
 import com.github.peterbencze.serritor.internal.crawldelaymechanism.FixedCrawlDelayMechanism;
 import com.github.peterbencze.serritor.internal.crawldelaymechanism.RandomCrawlDelayMechanism;
+import com.github.peterbencze.serritor.internal.event.EventCallbackManager;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
@@ -72,6 +74,7 @@ public abstract class BaseCrawler {
     private static final Logger LOGGER = Logger.getLogger(BaseCrawler.class.getName());
 
     private CrawlerConfiguration config;
+    private EventCallbackManager callbackManager;
     private CrawlFrontier crawlFrontier;
     private BasicCookieStore cookieStore;
     private CloseableHttpClient httpClient;
@@ -108,6 +111,16 @@ protected BaseCrawler(final InputStream inStream) {
      * Private base constructor which does simple initialization.
      */
     private BaseCrawler() {
+        callbackManager = new EventCallbackManager();
+        callbackManager.setDefaultEventCallback(CrawlEvent.PAGE_LOAD, this::onPageLoad);
+        callbackManager.setDefaultEventCallback(CrawlEvent.NON_HTML_CONTENT,
+                this::onNonHtmlContent);
+        callbackManager.setDefaultEventCallback(CrawlEvent.PAGE_LOAD_TIMEOUT,
+                this::onPageLoadTimeout);
+        callbackManager.setDefaultEventCallback(CrawlEvent.REQUEST_REDIRECT,
+                this::onRequestRedirect);
+        callbackManager.setDefaultEventCallback(CrawlEvent.REQUEST_ERROR, this::onRequestError);
+
         isStopping = false;
         isStopped = true;
     }
@@ -208,6 +221,22 @@ public final void resumeState(final WebDriver webDriver) {
         start(webDriver, true);
     }
 
+    /**
+     * Registers a custom callback which is invoked when the specified event occurs and the
+     * callback's URL pattern matches the request URL. Neither argument may be <code>null</code>.
+     *
+     * @param event    the event for which the callback should be triggered
+     * @param callback the pattern matching callback to invoke
+     */
+    protected final void registerCustomEventCallback(
+            final CrawlEvent event,
+            final PatternMatchingCallback callback) {
+        Validate.notNull(event, "The event cannot be null.");
+        Validate.notNull(callback, "The callback cannot be null.");
+
+        callbackManager.addCustomEventCallback(event, callback);
+    }
+
     /**
      * Gracefully stops the crawler.
      */
@@ -284,7 +313,8 @@ private void run() {
                 // Send an HTTP HEAD request to determine its availability and content type
                 httpHeadResponse = getHttpHeadResponse(candidateUrl, context);
             } catch (IOException exception) {
-                onRequestError(new RequestErrorEvent(currentCandidate, exception));
+                callbackManager.call(CrawlEvent.REQUEST_ERROR,
+                        new RequestErrorEvent(currentCandidate, exception));
                 isUnsuccessfulRequest = true;
             }
 
@@ -294,35 +324,36 @@ private void run() {
                     String responseMimeType = getResponseMimeType(httpHeadResponse);
                     if (responseMimeType.equals(ContentType.TEXT_HTML.getMimeType())) {
                         boolean isTimedOut = false;
-                        TimeoutException requestTimeoutException = null;
+                        TimeoutException timeoutException = null;
 
                         try {
                             // Open URL in browser
                             webDriver.get(candidateUrl);
                         } catch (TimeoutException exception) {
                             isTimedOut = true;
-                            requestTimeoutException = exception;
+                            timeoutException = exception;
                         }
 
                         // Ensure the HTTP client and Selenium have the same state
                         syncHttpClientCookies();
 
                         if (isTimedOut) {
-                            onPageLoadTimeout(new PageLoadTimeoutEvent(currentCandidate,
-                                    requestTimeoutException));
+                            callbackManager.call(CrawlEvent.PAGE_LOAD_TIMEOUT,
+                                    new PageLoadTimeoutEvent(currentCandidate, timeoutException));
                         } else {
                             String loadedPageUrl = webDriver.getCurrentUrl();
                             if (!loadedPageUrl.equals(candidateUrl)) {
                                 // Create a new crawl request for the redirected URL (JS redirect)
                                 handleRequestRedirect(currentCandidate, loadedPageUrl);
                             } else {
-                                onPageLoad(new PageLoadEvent(currentCandidate, webDriver));
+                                callbackManager.call(CrawlEvent.PAGE_LOAD,
+                                        new PageLoadEvent(currentCandidate, webDriver));
                             }
                         }
                     } else {
                         // URLs that point to non-HTML content should not be opened in the browser
-                        onNonHtmlContent(new NonHtmlContentEvent(currentCandidate,
-                                responseMimeType));
+                        callbackManager.call(CrawlEvent.NON_HTML_CONTENT,
+                                new NonHtmlContentEvent(currentCandidate, responseMimeType));
                     }
                 } else {
                     // Create a new crawl request for the redirected URL
@@ -432,7 +463,8 @@ private void handleRequestRedirect(
         CrawlRequest redirectedRequest = builder.build();
 
         crawlFrontier.feedRequest(redirectedRequest, false);
-        onRequestRedirect(new RequestRedirectEvent(currentCrawlCandidate, redirectedRequest));
+        callbackManager.call(CrawlEvent.REQUEST_REDIRECT,
+                new RequestRedirectEvent(currentCrawlCandidate, redirectedRequest));
     }
 
     /**
diff --git a/src/main/java/com/github/peterbencze/serritor/api/PatternMatchingCallback.java b/src/main/java/com/github/peterbencze/serritor/api/PatternMatchingCallback.java
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2018 Peter Bencze.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.peterbencze.serritor.api;
+
+import com.github.peterbencze.serritor.internal.event.EventObject;
+import java.util.function.Consumer;
+import java.util.regex.Pattern;
+import org.apache.commons.lang3.Validate;
+
+/**
+ * Represents an operation which is invoked when the specified regex pattern matches the request
+ * URL.
+ *
+ * @author Peter Bencze
+ */
+public final class PatternMatchingCallback {
+
+    private final Pattern urlPattern;
+    private final Consumer<? extends EventObject> callback;
+
+    /**
+     * Creates a {@link PatternMatchingCallback} instance.
+     *
+     * @param <T>        the type of the input to the operation
+     * @param urlPattern the regex pattern used for matching on request URLs
+     * @param callback   the operation to be performed when the pattern matches
+     * @throws NullPointerException if the pattern or the callback is <code>null</code>
+     */
+    public <T extends EventObject> PatternMatchingCallback(
+            final Pattern urlPattern,
+            final Consumer<T> callback) {
+        Validate.notNull(urlPattern, "The pattern cannot be null.");
+        Validate.notNull(callback, "The callback cannot be null.");
+
+        this.urlPattern = urlPattern;
+        this.callback = callback;
+    }
+
+    /**
+     * Returns the regex pattern used for matching on request URLs.
+     *
+     * @return the regex pattern used for matching
+     */
+    public Pattern getUrlPattern() {
+        return urlPattern;
+    }
+
+    /**
+     * Returns the operation to be performed when the pattern matches.
+     *
+     * <p>The type argument is chosen by the caller and cannot be checked against the type of the
+     * callback passed to the constructor. Feeding the returned consumer an event of a different
+     * type may fail with a {@link ClassCastException}.
+     *
+     * @param <T> the type of the input to the operation
+     *
+     * @return the operation to be performed when the pattern matches
+     */
+    @SuppressWarnings("unchecked")
+    public <T extends EventObject> Consumer<T> getCallback() {
+        // The consumer's type argument is erased at runtime, so this cast cannot be verified here
+        return (Consumer<T>) callback;
+    }
+}
diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/CrawlEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/CrawlEvent.java
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2018 Peter Bencze.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.peterbencze.serritor.api.event;
+
+/**
+ * Represents the events that can occur during crawling.
+ *
+ * @author Peter Bencze
+ */
+public enum CrawlEvent {
+
+    /**
+     * Event which gets triggered when the browser loads the page.
+     */
+    PAGE_LOAD,
+    /**
+     * Event which gets triggered when the MIME type of the response is not "text/html".
+     */
+    NON_HTML_CONTENT,
+    /**
+     * Event which gets triggered when a page does not load in the browser within the timeout
+     * period.
+     */
+    PAGE_LOAD_TIMEOUT,
+    /**
+     * Event which gets triggered when a request is redirected.
+     */
+    REQUEST_REDIRECT,
+    /**
+     * Event which gets triggered when a request error occurs.
+     */
+    REQUEST_ERROR
+}
diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/NonHtmlContentEvent.java
@@ -17,7 +17,7 @@
 package com.github.peterbencze.serritor.api.event;
 
 import com.github.peterbencze.serritor.api.CrawlCandidate;
-import com.github.peterbencze.serritor.internal.EventObject;
+import com.github.peterbencze.serritor.internal.event.EventObject;
 
 /**
  * Event which gets delivered when the MIME type of the response is not "text/html".
diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadEvent.java
@@ -17,7 +17,7 @@
 package com.github.peterbencze.serritor.api.event;
 
 import com.github.peterbencze.serritor.api.CrawlCandidate;
-import com.github.peterbencze.serritor.internal.EventObject;
+import com.github.peterbencze.serritor.internal.event.EventObject;
 import org.openqa.selenium.WebDriver;
 
 /**
diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadTimeoutEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/PageLoadTimeoutEvent.java
@@ -17,7 +17,7 @@
 package com.github.peterbencze.serritor.api.event;
 
 import com.github.peterbencze.serritor.api.CrawlCandidate;
-import com.github.peterbencze.serritor.internal.EventObject;
+import com.github.peterbencze.serritor.internal.event.EventObject;
 import org.openqa.selenium.TimeoutException;
 
 /**
diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/RequestErrorEvent.java
@@ -17,7 +17,7 @@
 package com.github.peterbencze.serritor.api.event;
 
 import com.github.peterbencze.serritor.api.CrawlCandidate;
-import com.github.peterbencze.serritor.internal.EventObject;
+import com.github.peterbencze.serritor.internal.event.EventObject;
 import java.io.IOException;
 
 /**
diff --git a/src/main/java/com/github/peterbencze/serritor/api/event/RequestRedirectEvent.java b/src/main/java/com/github/peterbencze/serritor/api/event/RequestRedirectEvent.java
diff --git a/src/main/java/com/github/peterbencze/serritor/internal/event/EventCallbackManager.java b/src/main/java/com/github/peterbencze/serritor/internal/event/EventCallbackManager.java
diff --git a/src/main/java/com/github/peterbencze/serritor/internal/event/EventObject.java b/src/main/java/com/github/peterbencze/serritor/internal/event/EventObject.java
diff --git a/src/test/java/com/github/peterbencze/serritor/internal/event/EventCallbackManagerTest.java b/src/test/java/com/github/peterbencze/serritor/internal/event/EventCallbackManagerTest.java
diff --git a/wercker.yml b/wercker.yml