Skip to content

Commit

Permalink
NUTCH-3036 Upgrade org.seleniumhq.selenium:selenium-java dependency i… (
Browse files Browse the repository at this point in the history
  • Loading branch information
lewismc authored Mar 30, 2024
1 parent 5a95bc6 commit 1563396
Show file tree
Hide file tree
Showing 11 changed files with 144 additions and 236 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ On the "Import Project" screen select the "Import project from external model" r
Click "Create". On the next screen the "Eclipse projects directory" should be already set to the nutch folder.
Leave the "Create module files near .classpath files" radio button selected.
Click "Next" on the next screens. On the project SDK screen select Java 11 and click "Create".
**N.B.** For anyone on a Mac with a homebrew-installed openjdk, you need to use the directory under _libexec_: `<openjdk11_directory>/libexec/openjdk.jdk/Contents/Home`.

Once the project is imported, you will see a popup saying "Ant build scripts found", "Frameworks detected - IvyIDEA Framework detected". Click "Import".
If you don't get the pop-up, I'd suggest going through the steps again as this happens from time to time. There is another
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,31 +16,21 @@
*/
package org.apache.nutch.protocol.htmlunit;

import java.lang.invoke.MethodHandles;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.concurrent.TimeUnit;

import com.gargoylesoftware.htmlunit.WebClient;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.OutputType;
import org.openqa.selenium.TakesScreenshot;
import org.openqa.selenium.TimeoutException;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.*;
import org.openqa.selenium.htmlunit.HtmlUnitDriver;
import org.openqa.selenium.io.TemporaryFilesystem;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.gargoylesoftware.htmlunit.WebClient;
import java.io.*;
import java.lang.invoke.MethodHandles;
import java.time.Duration;
import java.time.temporal.ChronoUnit;

public class HtmlUnitWebDriver extends HtmlUnitDriver {

Expand Down Expand Up @@ -75,14 +65,15 @@ public static WebDriver getDriverForPage(String url, Configuration conf) {
enableCss = conf.getBoolean("htmlunit.enable.css", false);
javascriptTimeout = conf.getLong("htmlunit.javascript.timeout", 3500);
int redirects = Integer.parseInt(conf.get("http.redirect.max", "0"));
enableRedirect = redirects <= 0 ? false : true;
enableRedirect = redirects > 0;
maxRedirects = redirects;

WebDriver driver = null;

try {
driver = new HtmlUnitWebDriver();
driver.manage().timeouts().pageLoadTimeout(pageLoadTimout, TimeUnit.SECONDS);
driver.manage().timeouts().pageLoadTimeout(Duration.of(pageLoadTimout,
ChronoUnit.SECONDS));
driver.get(url);
} catch(Exception e) {
if(e instanceof TimeoutException) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ public void setConf(Configuration conf) {
if (parts.length == 2) {
this.hostCookies.put(parts[0], parts[1]);
} else {
LOG.warn("Unable to parse cookie file correctly at: " + word);
LOG.warn("Unable to parse cookie file correctly at: {}", word);
}
}
}
Expand Down Expand Up @@ -332,8 +332,8 @@ public void setConf(Configuration conf) {
ciphers = ((SSLSocketFactory) SSLSocketFactory.getDefault()).getDefaultCipherSuites();
}

this.tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols));
this.tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers));
this.tlsPreferredProtocols = new HashSet<>(Arrays.asList(protocols));
this.tlsPreferredCipherSuites = new HashSet<>(Arrays.asList(ciphers));

logConf();
}
Expand Down Expand Up @@ -402,7 +402,7 @@ public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u));
} else if (code == 400) { // bad request, mark as GONE
if (this.logger.isTraceEnabled()) {
this.logger.trace("400 Bad request: " + u);
this.logger.trace("400 Bad request: {}", u);
}
return new ProtocolOutput(c,
new ProtocolStatus(ProtocolStatus.GONE, u));
Expand Down Expand Up @@ -435,11 +435,6 @@ public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
}
}

/*
* -------------------------- * </implementation:Protocol> *
* --------------------------
*/

public String getProxyHost() {
return this.proxyHost;
}
Expand Down Expand Up @@ -569,37 +564,35 @@ public Set<String> getTlsPreferredProtocols() {
private static String getAgentString(String agentName, String agentVersion,
String agentDesc, String agentURL, String agentEmail) {

if ((agentName == null) || (agentName.trim().length() == 0)) {
if (LOG.isErrorEnabled()) {
if (((agentName == null) || (agentName.trim().isEmpty())) && LOG.isErrorEnabled()) {
LOG.error("No User-Agent string set (http.agent.name)!");
}
}

StringBuffer buf = new StringBuffer();
StringBuilder buf = new StringBuilder();

buf.append(agentName);
if (agentVersion != null && !agentVersion.trim().isEmpty()) {
buf.append("/");
buf.append(agentVersion);
}
if (((agentDesc != null) && (agentDesc.length() != 0))
|| ((agentEmail != null) && (agentEmail.length() != 0))
|| ((agentURL != null) && (agentURL.length() != 0))) {
if (((agentDesc != null) && (!agentDesc.isEmpty()))
|| ((agentEmail != null) && (!agentEmail.isEmpty()))
|| ((agentURL != null) && (!agentURL.isEmpty()))) {
buf.append(" (");

if ((agentDesc != null) && (agentDesc.length() != 0)) {
if ((agentDesc != null) && (!agentDesc.isEmpty())) {
buf.append(agentDesc);
if ((agentURL != null) || (agentEmail != null))
buf.append("; ");
}

if ((agentURL != null) && (agentURL.length() != 0)) {
if ((agentURL != null) && (!agentURL.isEmpty())) {
buf.append(agentURL);
if (agentEmail != null)
buf.append("; ");
}

if ((agentEmail != null) && (agentEmail.length() != 0))
if ((agentEmail != null) && (!agentEmail.isEmpty()))
buf.append(agentEmail);

buf.append(")");
Expand All @@ -609,15 +602,15 @@ private static String getAgentString(String agentName, String agentVersion,

protected void logConf() {
if (this.logger.isInfoEnabled()) {
this.logger.info("http.proxy.host = " + this.proxyHost);
this.logger.info("http.proxy.port = " + this.proxyPort);
this.logger.info("http.proxy.exception.list = " + this.useProxy);
this.logger.info("http.timeout = " + this.timeout);
this.logger.info("http.content.limit = " + this.maxContent);
this.logger.info("http.agent = " + this.userAgent);
this.logger.info("http.accept.language = " + this.acceptLanguage);
this.logger.info("http.accept = " + this.accept);
this.logger.info("http.enable.cookie.header = " + isCookieEnabled());
this.logger.info("http.proxy.host = {}", this.proxyHost);
this.logger.info("http.proxy.port = {}", this.proxyPort);
this.logger.info("http.proxy.exception.list = {}", this.useProxy);
this.logger.info("http.timeout = {}", this.timeout);
this.logger.info("http.content.limit = {}", this.maxContent);
this.logger.info("http.agent = {}", this.userAgent);
this.logger.info("http.accept.language = {}", this.acceptLanguage);
this.logger.info("http.accept = {}", this.accept);
this.logger.info("http.enable.cookie.header = {}", isCookieEnabled());
}
}

Expand All @@ -644,9 +637,8 @@ public byte[] processGzipEncoded(byte[] compressed, URL url)
throw new IOException("unzipBestEffort returned null");

if (LOG.isTraceEnabled()) {
LOG.trace("fetched " + compressed.length
+ " bytes of compressed content (expanded to " + content.length
+ " bytes) from " + url);
LOG.trace("fetched {} bytes of compressed content (expanded to {} " +
"bytes) from {}", compressed.length, content.length, url);
}
return content;
}
Expand Down Expand Up @@ -674,9 +666,8 @@ public byte[] processDeflateEncoded(byte[] compressed, URL url)
throw new IOException("inflateBestEffort returned null");

if (LOG.isTraceEnabled()) {
LOG.trace("fetched " + compressed.length
+ " bytes of compressed content (expanded to " + content.length
+ " bytes) from " + url);
LOG.trace("fetched {} bytes of compressed content (expanded to {} " +
"bytes) from {}", compressed.length, content.length, url);
}
return content;
}
Expand Down Expand Up @@ -736,11 +727,11 @@ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
*/
private static HashMap<String, String> arrayToMap(String[] input) {
if (input == null || input.length == 0) {
return new HashMap<String, String>();
return new HashMap<>();
}
HashMap<String, String> hm = new HashMap<>();
for (int i = 0; i < input.length; i++) {
if (!"".equals(input[i].trim())) {
if (!input[i].trim().isEmpty()) {
hm.put(input[i], input[i]);
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/plugin/lib-selenium/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@


You can run Nutch in Docker. Check some examples at https://github.com/sbatururimi/nutch-test.
Don't forget to update Dockefile to point to the original Nutch repository when updated.
Don't forget to update Dockerfile to point to the original Nutch repository when updated.

# Contributors
Stas Batururimi [[email protected]]
Expand Down
34 changes: 26 additions & 8 deletions src/plugin/lib-selenium/howto_upgrade_selenium.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,36 @@
limitations under the License.
-->

1. Upgrade various driver versions dependency in src/plugin/lib-selenium/ivy.xml
1. Upgrade the various driver version dependencies in `src/plugin/lib-selenium/ivy.xml`

2. Upgrade Selenium's own dependencies in src/plugin/lib-selenium/plugin.xml
2. Upgrade Selenium's own dependencies in `src/plugin/lib-selenium/plugin.xml`

To get a list of dependencies and their versions execute:
$ ant -f ./build-ivy.xml
$ ls lib | sed 's/^/ <library name="/g' | sed 's/$/">\n <export name="*"\/>\n <\/library>/g'
```
$ ant -f ./build-ivy.xml
$ ls lib | sed 's/^/ <library name="/g' | sed 's/$/">\n <export name="*"\/>\n <\/library>/g'
```
Note that all dependent libraries are exported for a "library" plugin `lib-selenium`.

Note that all dependent libraries are exported for a "library" plugin ("lib-selenium").
N.B. The above Regex + Sed commands may not work if you are using MacOSX's Sed. In this instance you can install GNU Sed as follows:

N.B. The above Regex + Sed commands may not work if you are using MacOSX's Sed. In this instance you can instal GNU Sed as follows

$ brew install gnu-sed --with-default-names
`$ brew install gnu-sed --with-default-names`

You can then restart your terminal and the Regex + Sed command should work just fine!

3. In the `src/plugin/lib-selenium/plugin.xml` replace all lines between
`<!-- Begin dependencies -->`
and
`<!-- End of dependencies -->`
with the output of the command above.

4. Remove the locally "installed" dependencies in `src/plugin/lib-selenium/lib/`:

`$ rm -rf lib/`

5. Build Nutch and run all unit tests:

```
$ cd ../../../
$ ant clean runtime test
```
2 changes: 1 addition & 1 deletion src/plugin/lib-selenium/ivy.xml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@

<dependencies>
<!-- begin selenium dependencies -->
<dependency org="org.seleniumhq.selenium" name="selenium-java" rev="4.7.2" />
<dependency org="org.seleniumhq.selenium" name="selenium-java" rev="4.18.1" />
<!-- end selenium dependencies -->
</dependencies>

Expand Down
Loading

0 comments on commit 1563396

Please sign in to comment.