Commit bec481b

Merged from apache master

Signed-off-by: Julien Nioche <[email protected]>
jnioche committed Nov 9, 2023
2 parents f3f948e + 7ad382d commit bec481b
Showing 42 changed files with 337 additions and 134 deletions.
34 changes: 32 additions & 2 deletions conf/nutch-default.xml
@@ -1964,8 +1964,38 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
<property>
<name>urlfilter.fast.file</name>
<value>fast-urlfilter.txt</value>
<description>Name of file on CLASSPATH containing regular expressions
used by urlfilter-fast (FastURLFilter) plugin.</description>
<description>Name of file containing rules and regular expressions
used by urlfilter-fast (FastURLFilter) plugin. If the filename
includes a scheme (for example, hdfs://) it is loaded using the
Hadoop FileSystem implementation supporting that scheme. If the
filename does not contain a scheme, the file is loaded from
CLASSPATH. If indicated by file extension (.gz, .bzip2, .zst),
the file is decompressed while reading using Hadoop-provided
compression codecs.</description>
</property>
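
For example, the description above implies that a deployment can point the filter at a compressed rules file on HDFS. A minimal Java sketch, assuming a hypothetical namenode URI and file name (only the property name and the scheme/suffix behaviour come from the description above):

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;

public class FastFilterFileExample {
  public static void main(String[] args) {
    Configuration conf = NutchConfiguration.create();
    // The hdfs:// scheme selects a Hadoop FileSystem loader, and the .gz
    // suffix triggers decompression via Hadoop's compression codecs.
    conf.set("urlfilter.fast.file",
        "hdfs://namenode:8020/nutch/conf/fast-urlfilter.txt.gz"); // hypothetical path
    System.out.println(conf.get("urlfilter.fast.file"));
  }
}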

<property>
<name>urlfilter.fast.url.max.length</name>
<value>-1</value>
<description>Filters URLs based on their overall length.
The default value of -1 means that it is deactivated.
</description>
</property>

<property>
<name>urlfilter.fast.url.path.max.length</name>
<value>-1</value>
<description>Filters URLs based on the length of their path element.
The default value of -1 means that it is deactivated.
</description>
</property>

<property>
<name>urlfilter.fast.url.query.max.length</name>
<value>-1</value>
<description>Filters URLs based on the length of their query element.
The default value of -1 means that it is deactivated.
</description>
</property>
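
The three length checks are independent, and each stays off at the default of -1. A hedged sketch of enabling them programmatically (the numeric values are illustrative, not recommendations):

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;

public class UrlLengthLimitsExample {
  public static void main(String[] args) {
    Configuration conf = NutchConfiguration.create();
    conf.setInt("urlfilter.fast.url.max.length", 2048);       // whole URL
    conf.setInt("urlfilter.fast.url.path.max.length", 512);   // path element only
    conf.setInt("urlfilter.fast.url.query.max.length", 1024); // query element only
  }
}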

<property>
16 changes: 9 additions & 7 deletions ivy/ivy.xml
@@ -36,19 +36,21 @@
</publications>

<dependencies>
<dependency org="org.apache.logging.log4j" name="log4j-api" rev="2.20.0" conf="*->master" />
<dependency org="org.apache.logging.log4j" name="log4j-core" rev="2.20.0" conf="*->master" />
<dependency org="org.apache.logging.log4j" name="log4j-slf4j2-impl" rev="2.20.0" conf="*->master" />
<dependency org="org.slf4j" name="slf4j-api" rev="2.0.7" conf="*->master" />
<dependency org="org.apache.logging.log4j" name="log4j-api" rev="2.21.1" conf="*->master" />
<dependency org="org.apache.logging.log4j" name="log4j-core" rev="2.21.1" conf="*->master" />
<dependency org="org.apache.logging.log4j" name="log4j-slf4j2-impl" rev="2.21.1" conf="*->master" />
<dependency org="org.slf4j" name="slf4j-api" rev="2.0.9" conf="*->master" />

<dependency org="org.apache.commons" name="commons-lang3" rev="3.13.0" conf="*->default" />
<dependency org="org.apache.commons" name="commons-collections4" rev="4.4" conf="*->master" />
<dependency org="org.apache.httpcomponents" name="httpclient" rev="4.5.13" conf="*->master" />
<dependency org="commons-codec" name="commons-codec" rev="1.16.0" conf="*->default" />
<!-- hadoop 3.4.0 should have 2.11.0; Tika is broken in distributed mode until then;
We're currently relying on the hadoop-safe-tika shim that shades commons-io to upgrade
Tika
see https://github.com/apache/nutch/pull/776 -->
<dependency org="commons-io" name="commons-io" rev="2.11.0" conf="*->default" />
<dependency org="org.apache.commons" name="commons-compress" rev="1.23.0" conf="*->default" />
<dependency org="org.apache.commons" name="commons-compress" rev="1.24.0" conf="*->default" />
<dependency org="org.apache.commons" name="commons-jexl3" rev="3.2.1" conf="*->default" />
<dependency org="com.tdunning" name="t-digest" rev="3.3" />

@@ -70,15 +72,15 @@
<exclude org="org.slf4j" name="*" />
</dependency><!-- End of Hadoop Dependencies -->

<dependency org="org.tallison.tika" name="tika-core-shaded" rev="2.9.0.0" conf="*->default" transitive="false"/>
<dependency org="org.tallison.tika" name="tika-core-shaded" rev="2.9.1.0" conf="*->default" transitive="false"/>
<!--dependency org="org.apache.tika" name="tika-core" rev="2.9.0" /-->
<!-- Tika parser text and html modules (without transitive dependencies) are used to detect
the charset in text resp. HTML documents. -->
<!-- With the shaded packages, we need to use the tika-parsers-standard-package-shaded.
It is therefore not required in parse-tika. -->
<!--dependency org="org.apache.tika" name="tika-parser-text-module" rev="2.9.0" transitive="false" /-->
<!--dependency org="org.apache.tika" name="tika-parser-html-module" rev="2.9.0" transitive="false" /-->
<dependency org="org.tallison.tika" name="tika-parsers-standard-package-shaded" rev="2.9.0.0" conf="*->default" transitive="false"/>
<dependency org="org.tallison.tika" name="tika-parsers-standard-package-shaded" rev="2.9.1.0" conf="*->default" transitive="false"/>

<!-- language detection -->
<dependency org="org.commoncrawl" name="language-detection-cld2" rev="0.1-SNAPSHOT" />
3 changes: 1 addition & 2 deletions src/java/org/apache/nutch/crawl/CrawlDb.java
@@ -165,8 +165,7 @@ public static Job createJob(Configuration config, Path crawlDb)
Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random()
.nextInt(Integer.MAX_VALUE)));

Job job = NutchJob.getInstance(config);
job.setJobName("crawldb " + crawlDb);
Job job = Job.getInstance(config, "Nutch CrawlDb: " + crawlDb);

Path current = new Path(crawlDb, CURRENT_NAME);
if (current.getFileSystem(job.getConfiguration()).exists(current)) {
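
The same mechanical change repeats in most files below: the NutchJob.getInstance(...) plus setJobName(...) pair is collapsed into Hadoop's two-argument factory method, which names the job at creation and gives every job a uniform "Nutch <Tool>: ..." name. A condensed sketch of the pattern (a hypothetical helper, not code from the commit):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;

public class JobNamingPattern {
  // Before: create the job, then name it in a second call.
  static Job before(Configuration config, Path crawlDb) throws IOException {
    Job job = Job.getInstance(config); // what NutchJob.getInstance() effectively did
    job.setJobName("crawldb " + crawlDb);
    return job;
  }

  // After: Job.getInstance(Configuration, String) names the job up front.
  static Job after(Configuration config, Path crawlDb) throws IOException {
    return Job.getInstance(config, "Nutch CrawlDb: " + crawlDb);
  }
}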
3 changes: 1 addition & 2 deletions src/java/org/apache/nutch/crawl/CrawlDbMerger.java
@@ -165,9 +165,8 @@ public static Job createMergeJob(Configuration conf, Path output,
Path newCrawlDb = new Path(output,
"merge-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

Job job = NutchJob.getInstance(conf);
Job job = Job.getInstance(conf, "Nutch CrawlDbMerger: " + output);
conf = job.getConfiguration();
job.setJobName("crawldb merge " + output);

job.setInputFormatClass(SequenceFileInputFormat.class);

20 changes: 7 additions & 13 deletions src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -564,9 +564,8 @@ private TreeMap<String, Writable> processStatJobHelper(String crawlDb,
throws IOException, InterruptedException, ClassNotFoundException {
Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());

Job job = NutchJob.getInstance(config);
Job job = Job.getInstance(config, "Nutch CrawlDbReader: " + crawlDb);
config = job.getConfiguration();
job.setJobName("stats " + crawlDb);
config.setBoolean("db.reader.stats.sort", sort);

FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
@@ -812,7 +811,7 @@ public CrawlDatum get(String crawlDb, String url, Configuration config)

@Override
protected int process(String line, StringBuilder output) throws Exception {
Job job = NutchJob.getInstance(getConf());
Job job = Job.getInstance(getConf(), "Nutch CrawlDbReader: process " + crawlDb);
Configuration config = job.getConfiguration();
readUrl(this.crawlDb, line, config, output);
return 0;
@@ -839,8 +838,7 @@ public void processDumpJob(String crawlDb, String output,

Path outFolder = new Path(output);

Job job = NutchJob.getInstance(config);
job.setJobName("dump " + crawlDb);
Job job = Job.getInstance(config, "Nutch CrawlDbReader: dump " + crawlDb);
Configuration jobConf = job.getConfiguration();

FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
@@ -958,18 +956,15 @@ public void processTopNJob(String crawlDb, long topN, float min,
String output, Configuration config)
throws IOException, ClassNotFoundException, InterruptedException {

if (LOG.isInfoEnabled()) {
LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")");
LOG.info("CrawlDb db: {}", crawlDb);
}
LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")");
LOG.info("CrawlDb db: {}", crawlDb);

Path outFolder = new Path(output);
Path tempDir = new Path(
config.get("mapreduce.cluster.temp.dir", ".") + "/readdb-topN-temp-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

Job job = NutchJob.getInstance(config);
job.setJobName("topN prepare " + crawlDb);
Job job = Job.getInstance(config, "Nutch CrawlDbReader: topN prepare " + crawlDb);
FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
job.setInputFormatClass(SequenceFileInputFormat.class);

@@ -1000,8 +995,7 @@ public void processTopNJob(String crawlDb, long topN, float min,
}

LOG.info("CrawlDb topN: collecting topN scores.");
job = NutchJob.getInstance(config);
job.setJobName("topN collect " + crawlDb);
job = Job.getInstance(config, "Nutch CrawlDbReader: topN collect " + crawlDb);
job.getConfiguration().setLong("db.reader.topn", topN);

FileInputFormat.addInputPath(job, tempDir);
3 changes: 1 addition & 2 deletions src/java/org/apache/nutch/crawl/DeduplicationJob.java
@@ -305,9 +305,8 @@ public int run(String[] args) throws IOException {
Path tempDir = new Path(crawlDb, "dedup-temp-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

Job job = NutchJob.getInstance(getConf());
Job job = Job.getInstance(getConf(), "Nutch DeduplicationJob: " + crawlDb);
Configuration conf = job.getConfiguration();
job.setJobName("Deduplication on " + crawlDb);
conf.set(DEDUPLICATION_GROUP_MODE, group);
conf.set(DEDUPLICATION_COMPARE_ORDER, compareOrder);
job.setJarByClass(DeduplicationJob.class);
13 changes: 5 additions & 8 deletions src/java/org/apache/nutch/crawl/Generator.java
@@ -388,7 +388,7 @@ private JexlContext createContext(HostDatum datum) {
public void setup(Context context) throws IOException {
conf = context.getConfiguration();
mos = new MultipleOutputs<FloatWritable, SelectorEntry>(context);
Job job = Job.getInstance(conf);
Job job = Job.getInstance(conf, "Nutch Generator.SelectorReducer");
limit = conf.getLong(GENERATOR_TOP_N, Long.MAX_VALUE)
/ job.getNumReduceTasks();
maxNumSegments = conf.getInt(GENERATOR_MAX_NUM_SEGMENTS, 1);
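
The quota read in setup() splits the global topN budget evenly across reduce tasks, so the per-reducer limits sum to roughly the configured cap. A standalone sketch of that arithmetic (the "generate.topN" key is assumed to be what GENERATOR_TOP_N resolves to; the value is illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class TopNShareExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.setLong("generate.topN", 50000); // assumed GENERATOR_TOP_N key
    Job job = Job.getInstance(conf, "Nutch Generator.SelectorReducer");
    // Each reducer enforces an equal share of the global budget.
    long limit = conf.getLong("generate.topN", Long.MAX_VALUE)
        / job.getNumReduceTasks();
    System.out.println("per-reducer limit: " + limit);
  }
}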
@@ -695,7 +695,7 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
long curTime)
throws IOException, InterruptedException, ClassNotFoundException {

Job job = NutchJob.getInstance(getConf());
Job job = Job.getInstance(getConf(), "Nutch Generator: generate from " + dbDir);
Configuration conf = job.getConfiguration();
boolean filter = conf.getBoolean(GENERATOR_FILTER, true);
boolean normalise = conf.getBoolean(GENERATOR_NORMALISE, true);
@@ -839,8 +839,7 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
}

// map to inverted subset due for fetch, sort by score
Job job = NutchJob.getInstance(getConf());
job.setJobName("generate: select from " + dbDir);
Job job = Job.getInstance(getConf(), "Nutch Generator: generate from " + dbDir);
Configuration conf = job.getConfiguration();
if (numLists == -1) {
/* for politeness create exactly one partition per fetch task */
@@ -942,8 +941,7 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
Path tempDir2 = new Path(dbDir,
"generate-temp-" + java.util.UUID.randomUUID().toString());

job = NutchJob.getInstance(getConf());
job.setJobName("generate: updatedb " + dbDir);
job = Job.getInstance(getConf(), "Nutch Generator: updatedb " + dbDir);
job.getConfiguration().setLong(Nutch.GENERATE_TIME_KEY, generateTime);
for (Path segmpaths : generatedSegments) {
Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
@@ -1001,8 +999,7 @@ private Path partitionSegment(Path segmentsDir, Path inputDir, int numLists)

LOG.info("Generator: segment: " + segment);

Job job = NutchJob.getInstance(getConf());
job.setJobName("generate: partition " + segment);
Job job = Job.getInstance(getConf(), "Nutch Generator: partition segment " + segment);
Configuration conf = job.getConfiguration();
conf.setInt("partition.url.seed", RANDOM.nextInt());

2 changes: 1 addition & 1 deletion src/java/org/apache/nutch/crawl/Injector.java
@@ -404,7 +404,7 @@ public void inject(Path crawlDb, Path urlDir, boolean overwrite,
Path lock = CrawlDb.lock(conf, crawlDb, false);

// configure job
Job job = Job.getInstance(conf, "inject " + urlDir);
Job job = Job.getInstance(conf, "Nutch Injector: " + urlDir);
job.setJarByClass(Injector.class);
job.setMapperClass(InjectMapper.class);
job.setReducerClass(InjectReducer.class);
3 changes: 1 addition & 2 deletions src/java/org/apache/nutch/crawl/LinkDb.java
@@ -270,9 +270,8 @@ private static Job createJob(Configuration config, Path linkDb,
Path newLinkDb = new Path(linkDb,
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

Job job = NutchJob.getInstance(config);
Job job = Job.getInstance(config, "Nutch LinkDb: " + linkDb);
Configuration conf = job.getConfiguration();
job.setJobName("linkdb " + linkDb);

job.setInputFormatClass(SequenceFileInputFormat.class);

3 changes: 1 addition & 2 deletions src/java/org/apache/nutch/crawl/LinkDbMerger.java
@@ -147,8 +147,7 @@ public static Job createMergeJob(Configuration config, Path linkDb,
Path newLinkDb = new Path(linkDb,
"merge-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

Job job = NutchJob.getInstance(config);
job.setJobName("linkdb merge " + linkDb);
Job job = Job.getInstance(config, "Nutch LinkDbMerger: " + linkDb);

Configuration conf = job.getConfiguration();
job.setInputFormatClass(SequenceFileInputFormat.class);
3 changes: 1 addition & 2 deletions src/java/org/apache/nutch/crawl/LinkDbReader.java
@@ -159,8 +159,7 @@ public void processDumpJob(String linkdb, String output, String regex)

Path outFolder = new Path(output);

Job job = NutchJob.getInstance(getConf());
job.setJobName("read " + linkdb);
Job job = Job.getInstance(getConf(), "Nutch LinkDbReader: " + linkdb);
job.setJarByClass(LinkDbReader.class);

Configuration conf = job.getConfiguration();
2 changes: 1 addition & 1 deletion src/java/org/apache/nutch/fetcher/Fetcher.java
@@ -502,7 +502,7 @@ public void fetch(Path segment, int threads) throws IOException,
totalOutlinksToFollow);
}

Job job = NutchJob.getInstance(getConf());
Job job = Job.getInstance(getConf(), "Nutch Fetcher: " + segment.getName());
job.setJobName("FetchData");
Configuration conf = job.getConfiguration();

3 changes: 1 addition & 2 deletions src/java/org/apache/nutch/hostdb/ReadHostDb.java
@@ -181,8 +181,7 @@ private void readHostDb(Path hostDb, Path output, boolean dumpHomepages, boolean
conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
conf.set("mapreduce.output.textoutputformat.separator", "\t");

Job job = Job.getInstance(conf);
job.setJobName("ReadHostDb");
Job job = Job.getInstance(conf, "Nutch ReadHostDb");
job.setJarByClass(ReadHostDb.class);

FileInputFormat.addInputPath(job, new Path(hostDb, "current"));
3 changes: 1 addition & 2 deletions src/java/org/apache/nutch/hostdb/UpdateHostDb.java
@@ -77,11 +77,10 @@ private void updateHostDb(Path hostDb, Path crawlDb, Path topHosts,
stopWatch.start();
LOG.info("UpdateHostDb: starting");

Job job = NutchJob.getInstance(getConf());
Job job = Job.getInstance(getConf(), "Nutch UpdateHostDb");
Configuration conf = job.getConfiguration();
boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);
job.setJarByClass(UpdateHostDb.class);
job.setJobName("UpdateHostDb");

FileSystem fs = hostDb.getFileSystem(conf);
Path old = new Path(hostDb, "old");
4 changes: 1 addition & 3 deletions src/java/org/apache/nutch/indexer/CleaningJob.java
@@ -144,7 +144,7 @@ public void delete(String crawldb, boolean noCommit)
stopWatch.start();
LOG.info("CleaningJob: starting");

Job job = NutchJob.getInstance(getConf());
Job job = Job.getInstance(getConf(), "Nutch CleaningJob: " + crawldb);
Configuration conf = job.getConfiguration();

FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
@@ -157,8 +157,6 @@ public void delete(String crawldb, boolean noCommit)
job.setReducerClass(DeleterReducer.class);
job.setJarByClass(CleaningJob.class);

job.setJobName("CleaningJob");

// need to explicitly allow deletions
conf.setBoolean(IndexerMapReduce.INDEXER_DELETE, true);

3 changes: 2 additions & 1 deletion src/java/org/apache/nutch/indexer/IndexingJob.java
@@ -108,7 +108,8 @@ public void index(Path crawlDb, Path linkDb, List<Path> segments,
stopWatch.start();
LOG.info("Indexer: starting");

final Job job = NutchJob.getInstance(getConf());
final Job job = Job.getInstance(getConf(),
"Nutch IndexingJob: crawldb: " + crawlDb + " segment(s): " + segments);
job.setJobName("Indexer");
Configuration conf = job.getConfiguration();

14 changes: 12 additions & 2 deletions src/java/org/apache/nutch/parse/ParseSegment.java
@@ -180,6 +180,17 @@ public static boolean isTruncated(Content content) {
if (metadata == null)
return false;

// Check for okhttp's (or another protocol's) truncated-content flag.
// If the flag is present, trust its value over the Content-Length check.
if (metadata.get(Response.TRUNCATED_CONTENT) != null) {
if ("true".equals(metadata.get(Response.TRUNCATED_CONTENT))) {
LOG.info(content.getUrl() + " skipped. Protocol metadata indicates truncated content, " +
"actualSize= " + content.getContent().length);
return true;
}
return false;
}

String lengthStr = metadata.get(Response.CONTENT_LENGTH);
if (lengthStr != null)
lengthStr = lengthStr.trim();
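
Condensed, the decision order of isTruncated() after this change looks like the sketch below. The trailing Content-Length comparison is reconstructed from Nutch's pre-existing behaviour, which the diff view truncates, so treat that part as an assumption:

// Hedged sketch, not the verbatim method. Assumed imports:
// org.apache.nutch.protocol.Content, org.apache.nutch.metadata.Metadata,
// org.apache.nutch.net.protocols.Response.
public static boolean isTruncated(Content content) {
  Metadata metadata = content.getMetadata();
  if (metadata == null) {
    return false;
  }
  // 1. A protocol plugin (e.g. protocol-okhttp) may record explicitly whether
  //    it truncated the content; if the flag is present, trust its value.
  String flag = metadata.get(Response.TRUNCATED_CONTENT);
  if (flag != null) {
    return "true".equals(flag);
  }
  // 2. Otherwise fall back to comparing the declared Content-Length with the
  //    number of bytes actually fetched (assumed pre-existing logic).
  String lengthStr = metadata.get(Response.CONTENT_LENGTH);
  if (lengthStr == null) {
    return false;
  }
  try {
    return content.getContent().length < Integer.parseInt(lengthStr.trim());
  } catch (NumberFormatException e) {
    return false; // unparsable header: cannot tell, assume not truncated
  }
}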
@@ -232,8 +243,7 @@ public void parse(Path segment) throws IOException,
LOG.info("ParseSegment: starting");
LOG.info("ParseSegment: segment: {}", segment);

Job job = NutchJob.getInstance(getConf());
job.setJobName("parse " + segment);
Job job = Job.getInstance(getConf(), "Nutch ParseSegment: " + segment);

Configuration conf = job.getConfiguration();
FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
6 changes: 2 additions & 4 deletions src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
@@ -341,8 +341,7 @@ public void dumpLinks(Path webGraphDb) throws IOException,
// run the inverter job
Path tempInverted = new Path(webGraphDb, "inverted-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
Job inverter = NutchJob.getInstance(conf);
inverter.setJobName("LinkDumper: inverter");
Job inverter = Job.getInstance(conf, "Nutch LinkDumper: invert " + webGraphDb);
FileInputFormat.addInputPath(inverter, nodeDb);
FileInputFormat.addInputPath(inverter, outlinkDb);
inverter.setInputFormatClass(SequenceFileInputFormat.class);
@@ -372,8 +371,7 @@ }
}

// run the merger job
Job merger = NutchJob.getInstance(conf);
merger.setJobName("LinkDumper: merger");
Job merger = Job.getInstance(conf, "Nutch LinkDumper: merge " + tempInverted);
FileInputFormat.addInputPath(merger, tempInverted);
merger.setJarByClass(Merger.class);
merger.setInputFormatClass(SequenceFileInputFormat.class);
(diff for the remaining changed files not shown)