diff --git a/README.md b/README.md index ffd04ae222..e05f56ccd0 100644 --- a/README.md +++ b/README.md @@ -68,6 +68,7 @@ On the "Import Project" screen select the "Import project from external model" r Click "Create". On the next screen the "Eclipse projects directory" should be already set to the nutch folder. Leave the "Create module files near .classpath files" radio button selected. Click "Next" on the next screens. On the project SDK screen select Java 11 and click "Create". +**N.B.** For anyone on a Mac with a homebrew-installed openjdk, you need to use the directory under _libexec_: `/libexec/openjdk.jdk/Contents/Home`. Once the project is imported, you will see a popup saying "Ant build scripts found", "Frameworks detected - IvyIDEA Framework detected". Click "Import". If you don't get the pop-up, I'd suggest going through the steps again as this happens from time to time. There is another diff --git a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java index d66ecbf2e2..b99bb9603e 100644 --- a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java +++ b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java @@ -16,31 +16,21 @@ */ package org.apache.nutch.protocol.htmlunit; -import java.lang.invoke.MethodHandles; -import java.io.BufferedInputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.InputStream; -import java.io.OutputStream; -import java.util.concurrent.TimeUnit; - +import com.gargoylesoftware.htmlunit.WebClient; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IOUtils; -import org.openqa.selenium.By; -import org.openqa.selenium.JavascriptExecutor; -import org.openqa.selenium.OutputType; -import 
org.openqa.selenium.TakesScreenshot; -import org.openqa.selenium.TimeoutException; -import org.openqa.selenium.WebDriver; -import org.openqa.selenium.WebElement; +import org.openqa.selenium.*; import org.openqa.selenium.htmlunit.HtmlUnitDriver; import org.openqa.selenium.io.TemporaryFilesystem; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.gargoylesoftware.htmlunit.WebClient; +import java.io.*; +import java.lang.invoke.MethodHandles; +import java.time.Duration; +import java.time.temporal.ChronoUnit; public class HtmlUnitWebDriver extends HtmlUnitDriver { @@ -75,14 +65,15 @@ public static WebDriver getDriverForPage(String url, Configuration conf) { enableCss = conf.getBoolean("htmlunit.enable.css", false); javascriptTimeout = conf.getLong("htmlunit.javascript.timeout", 3500); int redirects = Integer.parseInt(conf.get("http.redirect.max", "0")); - enableRedirect = redirects <= 0 ? false : true; + enableRedirect = redirects > 0; maxRedirects = redirects; WebDriver driver = null; try { driver = new HtmlUnitWebDriver(); - driver.manage().timeouts().pageLoadTimeout(pageLoadTimout, TimeUnit.SECONDS); + driver.manage().timeouts().pageLoadTimeout(Duration.of(pageLoadTimout, + ChronoUnit.SECONDS)); driver.get(url); } catch(Exception e) { if(e instanceof TimeoutException) { diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java index 034fa78406..150f1ad821 100644 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java @@ -301,7 +301,7 @@ public void setConf(Configuration conf) { if (parts.length == 2) { this.hostCookies.put(parts[0], parts[1]); } else { - LOG.warn("Unable to parse cookie file correctly at: " + word); + LOG.warn("Unable to parse cookie file correctly at: {}", word); } } } @@ -332,8 +332,8 @@ public void 
setConf(Configuration conf) { ciphers = ((SSLSocketFactory) SSLSocketFactory.getDefault()).getDefaultCipherSuites(); } - this.tlsPreferredProtocols = new HashSet(Arrays.asList(protocols)); - this.tlsPreferredCipherSuites = new HashSet(Arrays.asList(ciphers)); + this.tlsPreferredProtocols = new HashSet<>(Arrays.asList(protocols)); + this.tlsPreferredCipherSuites = new HashSet<>(Arrays.asList(ciphers)); logConf(); } @@ -402,7 +402,7 @@ public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) { return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u)); } else if (code == 400) { // bad request, mark as GONE if (this.logger.isTraceEnabled()) { - this.logger.trace("400 Bad request: " + u); + this.logger.trace("400 Bad request: {}", u); } return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u)); @@ -435,11 +435,6 @@ public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) { } } - /* - * -------------------------- * * - * -------------------------- - */ - public String getProxyHost() { return this.proxyHost; } @@ -569,37 +564,35 @@ public Set getTlsPreferredProtocols() { private static String getAgentString(String agentName, String agentVersion, String agentDesc, String agentURL, String agentEmail) { - if ((agentName == null) || (agentName.trim().length() == 0)) { - if (LOG.isErrorEnabled()) { + if (((agentName == null) || (agentName.trim().isEmpty())) && LOG.isErrorEnabled()) { LOG.error("No User-Agent string set (http.agent.name)!"); - } } - StringBuffer buf = new StringBuffer(); + StringBuilder buf = new StringBuilder(); buf.append(agentName); if (agentVersion != null && !agentVersion.trim().isEmpty()) { buf.append("/"); buf.append(agentVersion); } - if (((agentDesc != null) && (agentDesc.length() != 0)) - || ((agentEmail != null) && (agentEmail.length() != 0)) - || ((agentURL != null) && (agentURL.length() != 0))) { + if (((agentDesc != null) && (!agentDesc.isEmpty())) + || ((agentEmail != null) && 
(!agentEmail.isEmpty())) + || ((agentURL != null) && (!agentURL.isEmpty()))) { buf.append(" ("); - if ((agentDesc != null) && (agentDesc.length() != 0)) { + if ((agentDesc != null) && (!agentDesc.isEmpty())) { buf.append(agentDesc); if ((agentURL != null) || (agentEmail != null)) buf.append("; "); } - if ((agentURL != null) && (agentURL.length() != 0)) { + if ((agentURL != null) && (!agentURL.isEmpty())) { buf.append(agentURL); if (agentEmail != null) buf.append("; "); } - if ((agentEmail != null) && (agentEmail.length() != 0)) + if ((agentEmail != null) && (!agentEmail.isEmpty())) buf.append(agentEmail); buf.append(")"); @@ -609,15 +602,15 @@ private static String getAgentString(String agentName, String agentVersion, protected void logConf() { if (this.logger.isInfoEnabled()) { - this.logger.info("http.proxy.host = " + this.proxyHost); - this.logger.info("http.proxy.port = " + this.proxyPort); - this.logger.info("http.proxy.exception.list = " + this.useProxy); - this.logger.info("http.timeout = " + this.timeout); - this.logger.info("http.content.limit = " + this.maxContent); - this.logger.info("http.agent = " + this.userAgent); - this.logger.info("http.accept.language = " + this.acceptLanguage); - this.logger.info("http.accept = " + this.accept); - this.logger.info("http.enable.cookie.header = " + isCookieEnabled()); + this.logger.info("http.proxy.host = {}", this.proxyHost); + this.logger.info("http.proxy.port = {}", this.proxyPort); + this.logger.info("http.proxy.exception.list = {}", this.useProxy); + this.logger.info("http.timeout = {}", this.timeout); + this.logger.info("http.content.limit = {}", this.maxContent); + this.logger.info("http.agent = {}", this.userAgent); + this.logger.info("http.accept.language = {}", this.acceptLanguage); + this.logger.info("http.accept = {}", this.accept); + this.logger.info("http.enable.cookie.header = {}", isCookieEnabled()); } } @@ -644,9 +637,8 @@ public byte[] processGzipEncoded(byte[] compressed, URL url) throw new 
IOException("unzipBestEffort returned null"); if (LOG.isTraceEnabled()) { - LOG.trace("fetched " + compressed.length - + " bytes of compressed content (expanded to " + content.length - + " bytes) from " + url); + LOG.trace("fetched {} bytes of compressed content (expanded to {} " + + "bytes) from {}", compressed.length, content.length, url); } return content; } @@ -674,9 +666,8 @@ public byte[] processDeflateEncoded(byte[] compressed, URL url) throw new IOException("inflateBestEffort returned null"); if (LOG.isTraceEnabled()) { - LOG.trace("fetched " + compressed.length - + " bytes of compressed content (expanded to " + content.length - + " bytes) from " + url); + LOG.trace("fetched {} bytes of compressed content (expanded to {} " + + "bytes) from {}", compressed.length, content.length, url); } return content; } @@ -736,11 +727,11 @@ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum, */ private static HashMap arrayToMap(String[] input) { if (input == null || input.length == 0) { - return new HashMap(); + return new HashMap<>(); } HashMap hm = new HashMap<>(); for (int i = 0; i < input.length; i++) { - if (!"".equals(input[i].trim())) { + if (!input[i].trim().isEmpty()) { hm.put(input[i], input[i]); } } diff --git a/src/plugin/lib-selenium/README.md b/src/plugin/lib-selenium/README.md index 5054d7ad8e..2d82fa752b 100644 --- a/src/plugin/lib-selenium/README.md +++ b/src/plugin/lib-selenium/README.md @@ -23,7 +23,7 @@ Your can run Nutch in Docker. Check some examples at https://github.com/sbatururimi/nutch-test. -Don't forget to update Dockefile to point to the original Nutch repository when updated. +Don't forget to update Dockerfile to point to the original Nutch repository when updated. 
# Contributors Stas Batururimi [s.batururimi@gmail.com] diff --git a/src/plugin/lib-selenium/howto_upgrade_selenium.md b/src/plugin/lib-selenium/howto_upgrade_selenium.md index 3071c74cbf..a14a346b1b 100644 --- a/src/plugin/lib-selenium/howto_upgrade_selenium.md +++ b/src/plugin/lib-selenium/howto_upgrade_selenium.md @@ -15,18 +15,36 @@ limitations under the License. --> -1. Upgrade various driver versions dependency in src/plugin/lib-selenium/ivy.xml +1. Upgrade various driver versions dependency in `src/plugin/lib-selenium/ivy.xml` -2. Upgrade Selenium's own dependencies in src/plugin/lib-selenium/plugin.xml +2. Upgrade Selenium's own dependencies in `src/plugin/lib-selenium/plugin.xml` To get a list of dependencies and their versions execute: - $ ant -f ./build-ivy.xml - $ ls lib | sed 's/^/ \n \n <\/library>/g' + ``` + $ ant -f ./build-ivy.xml + $ ls lib | sed 's/^/ \n \n <\/library>/g' + ``` + Note that all dependent libraries are exported for a "library" plugin `lib-selenium`. - Note that all dependent libraries are exported for a "library" plugin ("lib-selenium"). + N.B. The above Regex + Sed commands may not work if you are using MacOSX's Sed. In this instance you can install GNU Sed as follows - N.B. The above Regex + Sed commands may not work if you are using MacOSX's Sed. In this instance you can instal GNU Sed as follows - - $ brew install gnu-sed --with-default-names + `$ brew install gnu-sed --with-default-names` You can then restart your terminal and the Regex + Sed command should work just fine! + +3. In the `src/plugin/lib-selenium/plugin.xml` replace all lines between + `` + and + `` + with the output of the command above. + +4. Remove the locally "installed" dependencies in `src/plugin/lib-selenium/lib/`: + + `$ rm -rf lib/` + +5. 
Build Nutch and run all unit tests: + + ``` + $ cd ../../../ + $ ant clean runtime test + ``` diff --git a/src/plugin/lib-selenium/ivy.xml b/src/plugin/lib-selenium/ivy.xml index 22910a4187..85f2e0d6d6 100644 --- a/src/plugin/lib-selenium/ivy.xml +++ b/src/plugin/lib-selenium/ivy.xml @@ -38,7 +38,7 @@ - + diff --git a/src/plugin/lib-selenium/plugin.xml b/src/plugin/lib-selenium/plugin.xml index 9ec85964fc..b90c6c9511 100644 --- a/src/plugin/lib-selenium/plugin.xml +++ b/src/plugin/lib-selenium/plugin.xml @@ -26,200 +26,135 @@ - - - - - - - - + + - + - - - - - - - + - - - - - - - + - + - + - + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + diff --git a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java index b0b12004da..0440e84b01 100644 --- a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java +++ b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java @@ -16,49 +16,30 @@ */ package org.apache.nutch.protocol.selenium; -import java.lang.invoke.MethodHandles; -import java.io.BufferedInputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.InputStream; -import java.io.OutputStream; -import java.net.URL; -import java.util.concurrent.TimeUnit; -import java.util.Random; - import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IOUtils; - -import org.openqa.selenium.By; -import org.openqa.selenium.Capabilities; import org.openqa.selenium.OutputType; import org.openqa.selenium.TakesScreenshot; import org.openqa.selenium.TimeoutException; import org.openqa.selenium.WebDriver; - import 
org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.chrome.ChromeOptions; - -//import org.openqa.selenium.firefox.FirefoxBinary; import org.openqa.selenium.firefox.FirefoxDriver; -//import org.openqa.selenium.firefox.FirefoxProfile; import org.openqa.selenium.firefox.FirefoxOptions; - import org.openqa.selenium.io.TemporaryFilesystem; - -import org.openqa.selenium.remote.DesiredCapabilities; import org.openqa.selenium.remote.RemoteWebDriver; - -//import org.openqa.selenium.safari.SafariDriver; - -//import org.openqa.selenium.phantomjs.PhantomJSDriver; -//import org.openqa.selenium.phantomjs.PhantomJSDriverService; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.*; +import java.lang.invoke.MethodHandles; +import java.net.URL; +import java.time.Duration; +import java.time.temporal.ChronoUnit; +import java.util.Random; + public class HttpWebClient { private static final Logger LOG = LoggerFactory @@ -127,8 +108,10 @@ public static WebDriver getDriverForPage(String url, Configuration conf) { } LOG.debug("Selenium {} WebDriver selected.", driverType); - driver.manage().timeouts().pageLoadTimeout(pageLoadWait, - TimeUnit.SECONDS); + driver.manage().window().maximize(); + driver.manage().deleteAllCookies(); + driver.manage().timeouts().pageLoadTimeout(Duration.of(pageLoadWait, + ChronoUnit.SECONDS)); driver.get(url); } catch (Exception e) { if (e instanceof TimeoutException) { @@ -147,50 +130,46 @@ public static WebDriver getDriverForPage(String url, Configuration conf) { public static WebDriver createFirefoxWebDriver(String firefoxDriverPath, boolean enableHeadlessMode) { - System.setProperty("webdriver.gecko.driver", firefoxDriverPath); FirefoxOptions firefoxOptions = new FirefoxOptions(); + firefoxOptions.setBinary(firefoxDriverPath); if (enableHeadlessMode) { - firefoxOptions.addArguments("--headless"); + firefoxOptions.addArguments("-headless"); } - WebDriver driver = new FirefoxDriver(firefoxOptions); - return driver; + 
return new FirefoxDriver(firefoxOptions); } public static WebDriver createChromeWebDriver(String chromeDriverPath, boolean enableHeadlessMode) { // if not specified, WebDriver will search your path for chromedriver - System.setProperty("webdriver.chrome.driver", chromeDriverPath); ChromeOptions chromeOptions = new ChromeOptions(); chromeOptions.addArguments("--no-sandbox"); chromeOptions.addArguments("--disable-extensions"); + chromeOptions.setBinary(chromeDriverPath); // be sure to set selenium.enable.headless to true if no monitor attached // to your server if (enableHeadlessMode) { - chromeOptions.addArguments("--headless"); + chromeOptions.addArguments("--headless=new"); } - WebDriver driver = new ChromeDriver(chromeOptions); - return driver; + return new ChromeDriver(chromeOptions); } public static RemoteWebDriver createFirefoxRemoteWebDriver(URL seleniumHubUrl, boolean enableHeadlessMode) { FirefoxOptions firefoxOptions = new FirefoxOptions(); if (enableHeadlessMode) { - firefoxOptions.setHeadless(true); + firefoxOptions.addArguments("-headless"); } - RemoteWebDriver driver = new RemoteWebDriver(seleniumHubUrl, + return new RemoteWebDriver(seleniumHubUrl, firefoxOptions); - return driver; } public static RemoteWebDriver createChromeRemoteWebDriver(URL seleniumHubUrl, boolean enableHeadlessMode) { ChromeOptions chromeOptions = new ChromeOptions(); if (enableHeadlessMode) { - chromeOptions.setHeadless(true); + chromeOptions.addArguments("--headless=new"); } - RemoteWebDriver driver = new RemoteWebDriver(seleniumHubUrl, chromeOptions); - return driver; + return new RemoteWebDriver(seleniumHubUrl, chromeOptions); } public static RemoteWebDriver createRandomRemoteWebDriver(URL seleniumHubUrl, @@ -211,7 +190,6 @@ public static RemoteWebDriver createRandomRemoteWebDriver(URL seleniumHubUrl, if (num == 0) { return createFirefoxRemoteWebDriver(seleniumHubUrl, enableHeadlessMode); } - return createChromeRemoteWebDriver(seleniumHubUrl, enableHeadlessMode); } @@ -235,7 
+213,7 @@ public static void cleanUpDriver(WebDriver driver) { /** * Function for obtaining the HTML using the selected selenium + * 'https://www.selenium.dev/selenium/docs/api/java/org/openqa/selenium/WebDriver.html'>selenium * webdriver There are a number of configuration properties within * nutch-site.xml which determine whether to take screenshots of * the rendered pages and persist them as timestamped .png's into HDFS. @@ -260,7 +238,7 @@ public static String getHtmlPage(String url, Configuration conf) { } catch (Exception e) { TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles(); // throw new RuntimeException(e); - LOG.error("getHtmlPage(url, conf): " + e.toString()); + LOG.error("getHtmlPage(url, conf): {}", e.toString()); throw new RuntimeException(e); } finally { cleanUpDriver(driver); @@ -279,22 +257,22 @@ private static void takeScreenshot(WebDriver driver, Configuration conf) { LOG.debug("In-memory screenshot taken of: {}", url); FileSystem fs = FileSystem.get(conf); if (conf.get("screenshot.location") != null) { - Path screenshotPath = new Path( - conf.get("screenshot.location") + "/" + srcFile.getName()); + String screenshotPath = conf.get("screenshot.location", ""); + Path path = new Path(String.valueOf(new File(screenshotPath, srcFile.getName()))); OutputStream os = null; - if (!fs.exists(screenshotPath)) { + if (!fs.exists(path)) { LOG.debug( - "No existing screenshot already exists... creating new file at {} {}.", + "No existing screenshot already exists... 
creating new file at {}/{}.", screenshotPath, srcFile.getName()); - os = fs.create(screenshotPath); + os = fs.create(path); } InputStream is = new BufferedInputStream(new FileInputStream(srcFile)); IOUtils.copyBytes(is, os, conf); - LOG.debug("Screenshot for {} successfully saved to: {} {}", url, + LOG.debug("Screenshot for {} successfully saved to: {}/{}", url, screenshotPath, srcFile.getName()); } else { LOG.warn( - "Screenshot for {} not saved to HDFS (subsequently disgarded) as value for " + "Screenshot for {} not saved to HDFS (subsequently discarded) as value for " + "'screenshot.location' is absent from nutch-site.xml.", url); } diff --git a/src/plugin/protocol-interactiveselenium/README.md b/src/plugin/protocol-interactiveselenium/README.md index 545efb830a..b94b7466d5 100644 --- a/src/plugin/protocol-interactiveselenium/README.md +++ b/src/plugin/protocol-interactiveselenium/README.md @@ -18,11 +18,11 @@ Nutch Interactive Selenium ========================== -This protocol plugin allows you to fetch and interact with pages using [Selenium](http://www.seleniumhq.org/). +This protocol plugin allows you to fetch and interact with pages using [Selenium](https://www.selenium.dev/). # Dependencies and Configuration -You will need to have [Selenium](http://www.seleniumhq.org/) and a compatible version of Firefox installed to use this plugin. +You will need to have [Selenium](https://www.selenium.dev/) and a compatible version of Firefox installed to use this plugin. Set the protocol to be used in your Nutch configuration files. ``` diff --git a/src/plugin/protocol-selenium/README.md b/src/plugin/protocol-selenium/README.md index 4d43c330d5..265ea73161 100644 --- a/src/plugin/protocol-selenium/README.md +++ b/src/plugin/protocol-selenium/README.md @@ -35,7 +35,7 @@ There are essentially two ways in which Nutch can be used with Selenium. ### A) Setting up Selenium (local mode) - * Ensure that you have your prefered browser installed. 
Currently Chrome, Safari, Opera, PhantomJS and Firefox are supported. Here there example of installing Firefox is provided. More info about the package @ [launchpad](https://launchpad.net/ubuntu/trusty/+source/firefox) + * Ensure that you have your preferred browser installed. Currently Chrome, Safari, Opera, PhantomJS and Firefox are supported. Here an example of installing Firefox is provided. More info about the package @ [launchpad](https://launchpad.net/ubuntu/trusty/+source/firefox) ``` sudo apt-get install firefox ``` diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java index dde1122fec..aed535b959 100644 --- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java +++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java @@ -16,19 +16,18 @@ */ package org.apache.nutch.protocol.selenium; -import java.lang.invoke.MethodHandles; -import java.io.IOException; -import java.net.URL; -import org.apache.hadoop.conf.Configuration; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.protocol.http.api.HttpBase; import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.protocol.http.api.HttpBase; import org.apache.nutch.util.NutchConfiguration; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.net.URL; + public class Http extends HttpBase { protected static final Logger LOG = LoggerFactory @@ -38,11 +37,6 @@ public Http() { super(LOG); } - @Override - public void setConf(Configuration conf) { - super.setConf(conf); - } - public static void main(String[] args) throws Exception { Http http = new Http(); http.setConf(NutchConfiguration.create());