apache · sebastian-nagel · Oct 6, 2024 · Oct 4, 2024 · Oct 4, 2024 · Oct 4, 2024
diff --git a/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java b/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
@@ -19,8 +19,6 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 

diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
@@ -19,7 +19,6 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.FloatWritable;
-import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.commons.lang.StringUtils;

diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java
@@ -85,7 +85,7 @@
  * fetchlists for several segments in one go. Unlike in the initial version
  * (OldGenerator), the IP resolution is done ONLY on the entries which have been
  * selected for fetching. The URLs are partitioned by IP, domain or host within
- * a segment. We can chose separately how to count the URLS i.e. by domain or
+ * a segment. We can choose separately how to count the URLs, i.e. by domain or
  * host to limit the entries.
  **/
 public class Generator extends NutchTool implements Tool {

diff --git a/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java
@@ -25,7 +25,6 @@
 import org.apache.commons.lang.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.metadata.HttpHeaders;
 import org.apache.nutch.util.MimeUtil;
 import org.apache.nutch.util.NutchConfiguration;

diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -404,11 +404,6 @@ public void run() {
 
             switch (status.getCode()) {
 
-            case ProtocolStatus.WOULDBLOCK:
-              // retry ?
-              fetchQueues.addFetchItem(fit);
-              break;
-
             case ProtocolStatus.SUCCESS: // got a page
               pstatus = output(fit.url, fit.datum, content, status,
                   CrawlDatum.STATUS_FETCH_SUCCESS, fit.outlinkDepth);
@@ -457,8 +452,8 @@ public void run() {
                 context.getCounter("FetcherStatus",
                     "AboveExceptionThresholdInQueue").increment(killedURLs);
               /* FALLTHROUGH */
+
             case ProtocolStatus.RETRY: // retry
-            case ProtocolStatus.BLOCKED:
               output(fit.url, fit.datum, null, status,
                   CrawlDatum.STATUS_FETCH_RETRY);
               break;

diff --git a/src/java/org/apache/nutch/indexer/NutchIndexAction.java b/src/java/org/apache/nutch/indexer/NutchIndexAction.java
@@ -22,8 +22,6 @@
 
 import org.apache.hadoop.io.Writable;
 
-import org.apache.nutch.indexer.NutchDocument;
-
 /**
  * A {@link NutchIndexAction} is the new unit of indexing holding the document
  * and action information.

diff --git a/src/java/org/apache/nutch/service/NutchReader.java b/src/java/org/apache/nutch/service/NutchReader.java
@@ -25,14 +25,14 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-public interface  NutchReader {
+public interface NutchReader {
 
   static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());
   public static final Configuration conf = NutchConfiguration.create();
 
-  public List read(String path) throws FileNotFoundException;
-  public List head(String path, int nrows) throws FileNotFoundException;
-  public List slice(String path, int start, int end) throws FileNotFoundException;
+  public List<?> read(String path) throws FileNotFoundException;
+  public List<?> head(String path, int nrows) throws FileNotFoundException;
+  public List<?> slice(String path, int start, int end) throws FileNotFoundException;
   public int count(String path) throws FileNotFoundException;
 }
diff --git a/src/java/org/apache/nutch/service/impl/LinkReader.java b/src/java/org/apache/nutch/service/impl/LinkReader.java
@@ -33,11 +33,11 @@
 import org.apache.nutch.scoring.webgraph.LinkDatum;
 import org.apache.nutch.service.NutchReader;
 
-public class LinkReader implements NutchReader{
+public class LinkReader implements NutchReader {
 
   @Override
-  public List read(String path) throws FileNotFoundException {
-    List<HashMap> rows= new ArrayList<>();
+  public List<HashMap<String, String>> read(String path) throws FileNotFoundException {
+    List<HashMap<String, String>> rows= new ArrayList<>();
     Path file = new Path(path);
     SequenceFile.Reader reader;
     try{
@@ -69,8 +69,8 @@ public List read(String path) throws FileNotFoundException {
   }
 
   @Override
-  public List head(String path, int nrows) throws FileNotFoundException {
-    List<HashMap> rows= new ArrayList<>();
+  public List<HashMap<String, String>> head(String path, int nrows) throws FileNotFoundException {
+    List<HashMap<String, String>> rows= new ArrayList<>();
     Path file = new Path(path);
     SequenceFile.Reader reader;
     try{
@@ -101,9 +101,9 @@ public List head(String path, int nrows) throws FileNotFoundException {
   }
 
   @Override
-  public List slice(String path, int start, int end)
+  public List<HashMap<String, String>> slice(String path, int start, int end)
       throws FileNotFoundException {
-    List<HashMap> rows= new ArrayList<>();
+    List<HashMap<String, String>> rows= new ArrayList<>();
     Path file = new Path(path);
     SequenceFile.Reader reader;
     try{

diff --git a/src/java/org/apache/nutch/service/impl/NodeReader.java b/src/java/org/apache/nutch/service/impl/NodeReader.java
@@ -36,8 +36,8 @@
 public class NodeReader implements NutchReader {
 
   @Override
-  public List read(String path) throws FileNotFoundException {
-    List<HashMap> rows= new ArrayList<>();
+  public List<HashMap<String, String>> read(String path) throws FileNotFoundException {
+    List<HashMap<String, String>> rows= new ArrayList<>();
     Path file = new Path(path);
     SequenceFile.Reader reader;
     try{
@@ -70,8 +70,8 @@ public List read(String path) throws FileNotFoundException {
   }
 
   @Override
-  public List head(String path, int nrows) throws FileNotFoundException {
-    List<HashMap> rows= new ArrayList<>();
+  public List<HashMap<String, String>> head(String path, int nrows) throws FileNotFoundException {
+    List<HashMap<String, String>> rows= new ArrayList<>();
     Path file = new Path(path);
     SequenceFile.Reader reader;
     try{
@@ -102,9 +102,9 @@ public List head(String path, int nrows) throws FileNotFoundException {
   }
 
   @Override
-  public List slice(String path, int start, int end)
+  public List<HashMap<String, String>> slice(String path, int start, int end)
       throws FileNotFoundException {
-    List<HashMap> rows= new ArrayList<>();
+    List<HashMap<String, String>> rows= new ArrayList<>();
     Path file = new Path(path);
     SequenceFile.Reader reader;
     try{

diff --git a/src/java/org/apache/nutch/util/EncodingDetector.java b/src/java/org/apache/nutch/util/EncodingDetector.java
@@ -33,7 +33,6 @@
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.NutchConfiguration;
 
 import com.ibm.icu.text.CharsetDetector;
 import com.ibm.icu.text.CharsetMatch;

diff --git a/.../index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/ArbitraryIndexingFilter.java b/.../index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/ArbitraryIndexingFilter.java
@@ -153,7 +153,7 @@ public class ArbitraryIndexingFilter implements IndexingFilter {
   public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
                               CrawlDatum datum, Inlinks inlinks) throws IndexingException {
 
-    Class theClass = null;
+    Class<?> theClass = null;
     Method theMethod = null;
     Constructor<?> theConstructor = null;
     Object instance = null;

diff --git a/...ex-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/TestArbitraryIndexingFilter.java b/...ex-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/TestArbitraryIndexingFilter.java
@@ -21,15 +21,11 @@
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Inlinks;
 import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.NutchField;
 import org.apache.nutch.parse.ParseImpl;
 import org.apache.nutch.util.NutchConfiguration;
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import java.lang.invoke.MethodHandles;
 
 /**
  * Tests that the index-arbitrary filter can add a new field with an arbitrary

diff --git a/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java b/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
@@ -21,7 +21,6 @@
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Inlinks;
 import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.basic.BasicIndexingFilter;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.parse.ParseData;
@@ -94,6 +93,6 @@ public void testBasicIndexingFilter() throws Exception {
     Assert.assertEquals("test content", "this is a sample foo",
         doc.getField("content").getValues().get(0));
     Assert.assertEquals("test fetch time", new Date(100L),
-        (Date) doc.getField("tstamp").getValues().get(0));
+        doc.getField("tstamp").getValues().get(0));
   }
 }
diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
@@ -22,7 +22,6 @@
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.html.HtmlParser;
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.Parser;

diff --git a/...plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java b/...plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
@@ -41,7 +41,7 @@ public static synchronized BoilerpipeExtractor getExtractor(String boilerpipeExt
         // Attempt to load the class
       try {
         ClassLoader loader = BoilerpipeExtractor.class.getClassLoader();
-        Class extractorClass = loader.loadClass(boilerpipeExtractorName);
+        Class<?> extractorClass = loader.loadClass(boilerpipeExtractorName);
 
           // Add an instance to the repository
         extractorRepository.put(boilerpipeExtractorName, (BoilerpipeExtractor)extractorClass.getConstructor().newInstance());

diff --git a/...lugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/DebugParseFilter.java b/...lugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/DebugParseFilter.java
@@ -17,13 +17,10 @@
 package org.apache.nutch.parsefilter.debug;
 
 import java.io.ByteArrayOutputStream;
-import java.io.OutputStreamWriter;
 import java.lang.invoke.MethodHandles;
-import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 
 import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.HTMLMetaTags;
 import org.apache.nutch.parse.HtmlParseFilter;
 import org.apache.nutch.parse.Parse;

diff --git a/...ocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java b/...ocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
@@ -21,6 +21,7 @@
 import java.io.UnsupportedEncodingException;
 import java.net.CookieHandler;
 import java.net.CookieManager;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -132,7 +133,8 @@ public boolean getFollowRedirects() {
           LOG.debug("Response headers : " + header);
         }
       }
-      String rst = IOUtils.toString(post.getResponseBodyAsStream());
+      String rst = IOUtils.toString(post.getResponseBodyAsStream(),
+          StandardCharsets.UTF_8);
       LOG.debug("login post result: " + rst);
     } finally {
       if (post != null) {
@@ -194,7 +196,8 @@ private String httpGetPageContent(String url) throws IOException {
       if (cookieHeader != null) {
         setCookies(cookieHeader.getValue());
       }
-      String rst = IOUtils.toString(get.getResponseBodyAsStream());
+      String rst = IOUtils.toString(get.getResponseBodyAsStream(),
+          StandardCharsets.UTF_8);
       return rst;
     } finally {
       get.releaseConnection();

diff --git a/...ocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/Http.java b/...ocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/Http.java
@@ -16,18 +16,16 @@
  */
 package org.apache.nutch.protocol.interactiveselenium;
 
-import java.lang.invoke.MethodHandles;
 import java.io.IOException;
+import java.lang.invoke.MethodHandles;
 import java.net.URL;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.protocol.http.api.HttpBase;
 import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpBase;
 import org.apache.nutch.util.NutchConfiguration;
-
-import org.apache.nutch.protocol.interactiveselenium.HttpResponse;
-
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 

diff --git a/...plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java b/...plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java
@@ -16,8 +16,6 @@
  */
 package org.apache.nutch.scoring.link;
 
-import java.util.List;
-
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;

diff --git a/...in/scoring-metadata/src/java/org/apache/nutch/scoring/metadata/MetadataScoringFilter.java b/...in/scoring-metadata/src/java/org/apache/nutch/scoring/metadata/MetadataScoringFilter.java
@@ -17,21 +17,17 @@
 package org.apache.nutch.scoring.metadata;
 
 import java.util.Collection;
-import java.util.Map.Entry;
 import java.util.Iterator;
-import java.util.List;
+import java.util.Map.Entry;
 
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.NutchDocument;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.protocol.Content;
-import org.apache.nutch.scoring.ScoringFilter;
 import org.apache.nutch.scoring.AbstractScoringFilter;
+import org.apache.nutch.scoring.ScoringFilter;
 import org.apache.nutch.scoring.ScoringFilterException;
 
 
@@ -48,7 +44,6 @@ public class MetadataScoringFilter extends AbstractScoringFilter  {
   private static String[] datumMetadata;
   private static String[] contentMetadata;
   private static String[] parseMetadata;
-  private Configuration conf;
 
   /**
    * This will take the metadata that you have listed in your "scoring.parse.md"

diff --git a/...coring-metadata/src/test/org/apache/nutch/scoring/metadata/TestMetadataScoringFilter.java b/...coring-metadata/src/test/org/apache/nutch/scoring/metadata/TestMetadataScoringFilter.java
@@ -24,7 +24,6 @@
 import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.util.NutchConfiguration;
 import org.junit.Assert;
-import org.junit.Before;
 import org.junit.Test;
 
 import java.util.HashMap;
@@ -51,7 +50,7 @@ public void distributeScoreToOutlinks() throws ScoringFilterException {
     parseData.getParseMeta().add("parent",parentMD);
     parseData.getParseMeta().add("depth",depthMD);
 
-    HashMap<Text,CrawlDatum> targets = new HashMap();
+    HashMap<Text,CrawlDatum> targets = new HashMap<>();
     targets.put(new Text("https://nutch.apache.org/downloads.html"),new CrawlDatum());
     targets.put(new Text("https://wiki.apache.org/nutch"),new CrawlDatum());
 

diff --git a/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java b/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
@@ -133,7 +133,7 @@ public static CollectionManager getCollectionManager(Configuration conf) {
    * @return Named SubCollection (or null if not existing)
    */
   public Subcollection getSubColection(final String id) {
-    return (Subcollection) collectionMap.get(id);
+    return collectionMap.get(id);
   }
 
   /**
@@ -180,10 +180,10 @@ public Subcollection createSubCollection(final String id, final String name) {
    */
   public List<Subcollection> getSubCollections(final String url) {
     List<Subcollection> collections = new ArrayList<Subcollection>();
-    final Iterator iterator = collectionMap.values().iterator();
+    final Iterator<Subcollection> iterator = collectionMap.values().iterator();
 
     while (iterator.hasNext()) {
-      final Subcollection subCol = (Subcollection) iterator.next();
+      final Subcollection subCol = iterator.next();
       if (subCol.filter(url) != null) {
         collections.add(subCol);
       }
@@ -200,7 +200,7 @@ public List<Subcollection> getSubCollections(final String url) {
    * 
    * @return All collections CollectionManager knows about
    */
-  public Collection getAll() {
+  public Collection<Subcollection> getAll() {
     return collectionMap.values();
   }
 
@@ -219,10 +219,10 @@ public void save() throws IOException {
       final Document doc = new DocumentImpl();
       final Element collections = doc
           .createElement(Subcollection.TAG_COLLECTIONS);
-      final Iterator iterator = collectionMap.values().iterator();
+      final Iterator<Subcollection> iterator = collectionMap.values().iterator();
 
       while (iterator.hasNext()) {
-        final Subcollection subCol = (Subcollection) iterator.next();
+        final Subcollection subCol = iterator.next();
         final Element collection = doc
             .createElement(Subcollection.TAG_COLLECTION);
         collections.appendChild(collection);

diff --git a/...n/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java b/...n/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java
@@ -16,7 +16,6 @@
  */
 package org.apache.nutch.urlfilter.validator;
 
-import org.apache.nutch.urlfilter.validator.UrlValidator;
 import org.junit.Assert;
 import org.junit.Test;