diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 6543737cf0..0161a250ca 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1964,8 +1964,38 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
 <property>
   <name>urlfilter.fast.file</name>
   <value>fast-urlfilter.txt</value>
-  <description>Name of file on CLASSPATH containing regular expressions
-  used by urlfilter-fast (FastURLFilter) plugin.</description>
+  <description>Name of file containing rules and regular expressions
+  used by urlfilter-fast (FastURLFilter) plugin. If the filename
+  includes a scheme (for example, hdfs://) it is loaded using the
+  Hadoop FileSystem implementation supporting that scheme. If the
+  filename does not contain a scheme, the file is loaded from
+  CLASSPATH. If indicated by file extension (.gz, .bzip2, .zst),
+  the file is decompressed while reading using Hadoop-provided
+  compression codecs.</description>
 </property>
 
+<property>
+  <name>urlfilter.fast.url.max.length</name>
+  <value>-1</value>
+  <description>Filters URLs based on their overall length.
+  The default value of -1 means that it is deactivated.
+  </description>
+</property>
+
+<property>
+  <name>urlfilter.fast.url.path.max.length</name>
+  <value>-1</value>
+  <description>Filters URLs based on the length of their path element.
+  The default value of -1 means that it is deactivated.
+  </description>
+</property>
+
+<property>
+  <name>urlfilter.fast.url.query.max.length</name>
+  <value>-1</value>
+  <description>Filters URLs based on the length of their query element.
+  The default value of -1 means that it is deactivated.
+  </description>
+</property>
+
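
For illustration only (the limit values and the HDFS location below are made up), the new settings can equally be applied programmatically on a Hadoop `Configuration`, for example from a test or an embedded crawler:

```java
import org.apache.hadoop.conf.Configuration;

/** Illustrative only: all values and the HDFS path are hypothetical. */
public class FastUrlFilterConfigExample {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // -1 (the default) leaves a length check disabled
    conf.setInt("urlfilter.fast.url.max.length", 2048);
    conf.setInt("urlfilter.fast.url.path.max.length", 256);
    conf.setInt("urlfilter.fast.url.query.max.length", 1024);
    // a rules file named with a URI scheme is opened via the matching Hadoop
    // FileSystem; the .gz suffix makes it pass through the gzip codec on read
    conf.set("urlfilter.fast.file", "hdfs://namenode:8020/nutch/fast-urlfilter.txt.gz");
    System.out.println(conf.get("urlfilter.fast.file"));
  }
}
```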
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index 71464ed25c..8956742dbf 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -36,19 +36,21 @@
-
-
-
-
+
+
+
+
-
+
@@ -70,7 +72,7 @@
-
+
@@ -78,7 +80,7 @@
It is therefor not required in parse-tika. -->
-
+
diff --git a/src/java/org/apache/nutch/crawl/CrawlDb.java b/src/java/org/apache/nutch/crawl/CrawlDb.java
index 16394832bf..2b609c0a6e 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDb.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDb.java
@@ -165,8 +165,7 @@ public static Job createJob(Configuration config, Path crawlDb)
Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random()
.nextInt(Integer.MAX_VALUE)));
- Job job = NutchJob.getInstance(config);
- job.setJobName("crawldb " + crawlDb);
+ Job job = Job.getInstance(config, "Nutch CrawlDb: " + crawlDb);
Path current = new Path(crawlDb, CURRENT_NAME);
if (current.getFileSystem(job.getConfiguration()).exists(current)) {
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbMerger.java b/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
index 1bf7243d38..6ee4b43cd2 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
@@ -165,9 +165,8 @@ public static Job createMergeJob(Configuration conf, Path output,
Path newCrawlDb = new Path(output,
"merge-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
- Job job = NutchJob.getInstance(conf);
+ Job job = Job.getInstance(conf, "Nutch CrawlDbMerger: " + output);
conf = job.getConfiguration();
- job.setJobName("crawldb merge " + output);
job.setInputFormatClass(SequenceFileInputFormat.class);
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index bd3e6f38de..29e8efe173 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -564,9 +564,8 @@ private TreeMap processStatJobHelper(String crawlDb,
throws IOException, InterruptedException, ClassNotFoundException {
Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());
- Job job = NutchJob.getInstance(config);
+ Job job = Job.getInstance(config, "Nutch CrawlDbReader: " + crawlDb);
config = job.getConfiguration();
- job.setJobName("stats " + crawlDb);
config.setBoolean("db.reader.stats.sort", sort);
FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
@@ -812,7 +811,7 @@ public CrawlDatum get(String crawlDb, String url, Configuration config)
@Override
protected int process(String line, StringBuilder output) throws Exception {
- Job job = NutchJob.getInstance(getConf());
+ Job job = Job.getInstance(getConf(), "Nutch CrawlDbReader: process " + crawlDb);
Configuration config = job.getConfiguration();
readUrl(this.crawlDb, line, config, output);
return 0;
@@ -839,8 +838,7 @@ public void processDumpJob(String crawlDb, String output,
Path outFolder = new Path(output);
- Job job = NutchJob.getInstance(config);
- job.setJobName("dump " + crawlDb);
+ Job job = Job.getInstance(config, "Nutch CrawlDbReader: dump " + crawlDb);
Configuration jobConf = job.getConfiguration();
FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
@@ -958,18 +956,15 @@ public void processTopNJob(String crawlDb, long topN, float min,
String output, Configuration config)
throws IOException, ClassNotFoundException, InterruptedException {
- if (LOG.isInfoEnabled()) {
- LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")");
- LOG.info("CrawlDb db: {}", crawlDb);
- }
+ LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")");
+ LOG.info("CrawlDb db: {}", crawlDb);
Path outFolder = new Path(output);
Path tempDir = new Path(
config.get("mapreduce.cluster.temp.dir", ".") + "/readdb-topN-temp-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
- Job job = NutchJob.getInstance(config);
- job.setJobName("topN prepare " + crawlDb);
+ Job job = Job.getInstance(config, "Nutch CrawlDbReader: topN prepare " + crawlDb);
FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
job.setInputFormatClass(SequenceFileInputFormat.class);
@@ -1000,8 +995,7 @@ public void processTopNJob(String crawlDb, long topN, float min,
}
LOG.info("CrawlDb topN: collecting topN scores.");
- job = NutchJob.getInstance(config);
- job.setJobName("topN collect " + crawlDb);
+ job = Job.getInstance(config, "Nutch CrawlDbReader: topN collect " + crawlDb);
job.getConfiguration().setLong("db.reader.topn", topN);
FileInputFormat.addInputPath(job, tempDir);
diff --git a/src/java/org/apache/nutch/crawl/DeduplicationJob.java b/src/java/org/apache/nutch/crawl/DeduplicationJob.java
index 217005d415..e370013546 100644
--- a/src/java/org/apache/nutch/crawl/DeduplicationJob.java
+++ b/src/java/org/apache/nutch/crawl/DeduplicationJob.java
@@ -305,9 +305,8 @@ public int run(String[] args) throws IOException {
Path tempDir = new Path(crawlDb, "dedup-temp-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
- Job job = NutchJob.getInstance(getConf());
+ Job job = Job.getInstance(getConf(), "Nutch DeduplicationJob: " + crawlDb);
Configuration conf = job.getConfiguration();
- job.setJobName("Deduplication on " + crawlDb);
conf.set(DEDUPLICATION_GROUP_MODE, group);
conf.set(DEDUPLICATION_COMPARE_ORDER, compareOrder);
job.setJarByClass(DeduplicationJob.class);
diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java
index 1b62314e7a..33f743a37a 100644
--- a/src/java/org/apache/nutch/crawl/Generator.java
+++ b/src/java/org/apache/nutch/crawl/Generator.java
@@ -388,7 +388,7 @@ private JexlContext createContext(HostDatum datum) {
public void setup(Context context) throws IOException {
conf = context.getConfiguration();
mos = new MultipleOutputs(context);
- Job job = Job.getInstance(conf);
+ Job job = Job.getInstance(conf, "Nutch Generator.SelectorReducer");
limit = conf.getLong(GENERATOR_TOP_N, Long.MAX_VALUE)
/ job.getNumReduceTasks();
maxNumSegments = conf.getInt(GENERATOR_MAX_NUM_SEGMENTS, 1);
@@ -695,7 +695,7 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
long curTime)
throws IOException, InterruptedException, ClassNotFoundException {
- Job job = NutchJob.getInstance(getConf());
+ Job job = Job.getInstance(getConf(), "Nutch Generator: generate from " + dbDir);
Configuration conf = job.getConfiguration();
boolean filter = conf.getBoolean(GENERATOR_FILTER, true);
boolean normalise = conf.getBoolean(GENERATOR_NORMALISE, true);
@@ -839,8 +839,7 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
}
// map to inverted subset due for fetch, sort by score
- Job job = NutchJob.getInstance(getConf());
- job.setJobName("generate: select from " + dbDir);
+ Job job = Job.getInstance(getConf(), "Nutch Generator: generate from " + dbDir);
Configuration conf = job.getConfiguration();
if (numLists == -1) {
/* for politeness create exactly one partition per fetch task */
@@ -942,8 +941,7 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
Path tempDir2 = new Path(dbDir,
"generate-temp-" + java.util.UUID.randomUUID().toString());
- job = NutchJob.getInstance(getConf());
- job.setJobName("generate: updatedb " + dbDir);
+ job = Job.getInstance(getConf(), "Nutch Generator: updatedb " + dbDir);
job.getConfiguration().setLong(Nutch.GENERATE_TIME_KEY, generateTime);
for (Path segmpaths : generatedSegments) {
Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
@@ -1001,8 +999,7 @@ private Path partitionSegment(Path segmentsDir, Path inputDir, int numLists)
LOG.info("Generator: segment: " + segment);
- Job job = NutchJob.getInstance(getConf());
- job.setJobName("generate: partition " + segment);
+ Job job = Job.getInstance(getConf(), "Nutch Generator: partition segment " + segment);
Configuration conf = job.getConfiguration();
conf.setInt("partition.url.seed", RANDOM.nextInt());
diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java
index 9ece036538..2b07113343 100644
--- a/src/java/org/apache/nutch/crawl/Injector.java
+++ b/src/java/org/apache/nutch/crawl/Injector.java
@@ -404,7 +404,7 @@ public void inject(Path crawlDb, Path urlDir, boolean overwrite,
Path lock = CrawlDb.lock(conf, crawlDb, false);
// configure job
- Job job = Job.getInstance(conf, "inject " + urlDir);
+ Job job = Job.getInstance(conf, "Nutch Injector: " + urlDir);
job.setJarByClass(Injector.class);
job.setMapperClass(InjectMapper.class);
job.setReducerClass(InjectReducer.class);
diff --git a/src/java/org/apache/nutch/crawl/LinkDb.java b/src/java/org/apache/nutch/crawl/LinkDb.java
index 3c752ab1db..2f4a0dda42 100644
--- a/src/java/org/apache/nutch/crawl/LinkDb.java
+++ b/src/java/org/apache/nutch/crawl/LinkDb.java
@@ -270,9 +270,8 @@ private static Job createJob(Configuration config, Path linkDb,
Path newLinkDb = new Path(linkDb,
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
- Job job = NutchJob.getInstance(config);
+ Job job = Job.getInstance(config, "Nutch LinkDb: " + linkDb);
Configuration conf = job.getConfiguration();
- job.setJobName("linkdb " + linkDb);
job.setInputFormatClass(SequenceFileInputFormat.class);
diff --git a/src/java/org/apache/nutch/crawl/LinkDbMerger.java b/src/java/org/apache/nutch/crawl/LinkDbMerger.java
index d6a41ab48c..c3da2031e9 100644
--- a/src/java/org/apache/nutch/crawl/LinkDbMerger.java
+++ b/src/java/org/apache/nutch/crawl/LinkDbMerger.java
@@ -147,8 +147,7 @@ public static Job createMergeJob(Configuration config, Path linkDb,
Path newLinkDb = new Path(linkDb,
"merge-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
- Job job = NutchJob.getInstance(config);
- job.setJobName("linkdb merge " + linkDb);
+ Job job = Job.getInstance(config, "Nutch LinkDbMerger: " + linkDb);
Configuration conf = job.getConfiguration();
job.setInputFormatClass(SequenceFileInputFormat.class);
diff --git a/src/java/org/apache/nutch/crawl/LinkDbReader.java b/src/java/org/apache/nutch/crawl/LinkDbReader.java
index fa01f20bf3..9ae3566833 100644
--- a/src/java/org/apache/nutch/crawl/LinkDbReader.java
+++ b/src/java/org/apache/nutch/crawl/LinkDbReader.java
@@ -159,8 +159,7 @@ public void processDumpJob(String linkdb, String output, String regex)
Path outFolder = new Path(output);
- Job job = NutchJob.getInstance(getConf());
- job.setJobName("read " + linkdb);
+ Job job = Job.getInstance(getConf(), "Nutch LinkDbReader: " + linkdb);
job.setJarByClass(LinkDbReader.class);
Configuration conf = job.getConfiguration();
diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java b/src/java/org/apache/nutch/fetcher/Fetcher.java
index 23fd30d731..273168eb81 100644
--- a/src/java/org/apache/nutch/fetcher/Fetcher.java
+++ b/src/java/org/apache/nutch/fetcher/Fetcher.java
@@ -502,7 +502,7 @@ public void fetch(Path segment, int threads) throws IOException,
totalOutlinksToFollow);
}
- Job job = NutchJob.getInstance(getConf());
+ Job job = Job.getInstance(getConf(), "Nutch Fetcher: " + segment.getName());
job.setJobName("FetchData");
Configuration conf = job.getConfiguration();
diff --git a/src/java/org/apache/nutch/hostdb/ReadHostDb.java b/src/java/org/apache/nutch/hostdb/ReadHostDb.java
index 0321a8652c..036b786502 100644
--- a/src/java/org/apache/nutch/hostdb/ReadHostDb.java
+++ b/src/java/org/apache/nutch/hostdb/ReadHostDb.java
@@ -181,8 +181,7 @@ private void readHostDb(Path hostDb, Path output, boolean dumpHomepages, boolean
conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
conf.set("mapreduce.output.textoutputformat.separator", "\t");
- Job job = Job.getInstance(conf);
- job.setJobName("ReadHostDb");
+ Job job = Job.getInstance(conf, "Nutch ReadHostDb");
job.setJarByClass(ReadHostDb.class);
FileInputFormat.addInputPath(job, new Path(hostDb, "current"));
diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDb.java b/src/java/org/apache/nutch/hostdb/UpdateHostDb.java
index 65e45c55d8..5148a6be12 100644
--- a/src/java/org/apache/nutch/hostdb/UpdateHostDb.java
+++ b/src/java/org/apache/nutch/hostdb/UpdateHostDb.java
@@ -77,11 +77,10 @@ private void updateHostDb(Path hostDb, Path crawlDb, Path topHosts,
stopWatch.start();
LOG.info("UpdateHostDb: starting");
- Job job = NutchJob.getInstance(getConf());
+ Job job = Job.getInstance(getConf(), "Nutch UpdateHostDb");
Configuration conf = job.getConfiguration();
boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);
job.setJarByClass(UpdateHostDb.class);
- job.setJobName("UpdateHostDb");
FileSystem fs = hostDb.getFileSystem(conf);
Path old = new Path(hostDb, "old");
diff --git a/src/java/org/apache/nutch/indexer/CleaningJob.java b/src/java/org/apache/nutch/indexer/CleaningJob.java
index 04b9c2efa5..8334ac3537 100644
--- a/src/java/org/apache/nutch/indexer/CleaningJob.java
+++ b/src/java/org/apache/nutch/indexer/CleaningJob.java
@@ -144,7 +144,7 @@ public void delete(String crawldb, boolean noCommit)
stopWatch.start();
LOG.info("CleaningJob: starting");
- Job job = NutchJob.getInstance(getConf());
+ Job job = Job.getInstance(getConf(), "Nutch CleaningJob: " + crawldb);
Configuration conf = job.getConfiguration();
FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
@@ -157,8 +157,6 @@ public void delete(String crawldb, boolean noCommit)
job.setReducerClass(DeleterReducer.class);
job.setJarByClass(CleaningJob.class);
- job.setJobName("CleaningJob");
-
// need to expicitely allow deletions
conf.setBoolean(IndexerMapReduce.INDEXER_DELETE, true);
diff --git a/src/java/org/apache/nutch/indexer/IndexingJob.java b/src/java/org/apache/nutch/indexer/IndexingJob.java
index d2115230c8..c3ddb4ae94 100644
--- a/src/java/org/apache/nutch/indexer/IndexingJob.java
+++ b/src/java/org/apache/nutch/indexer/IndexingJob.java
@@ -108,7 +108,8 @@ public void index(Path crawlDb, Path linkDb, List segments,
stopWatch.start();
LOG.info("Indexer: starting");
- final Job job = NutchJob.getInstance(getConf());
+ final Job job = Job.getInstance(getConf(),
+ "Nutch IndexingJob: crawldb: " + crawlDb + " segment(s): " + segments);
job.setJobName("Indexer");
Configuration conf = job.getConfiguration();
diff --git a/src/java/org/apache/nutch/parse/ParseSegment.java b/src/java/org/apache/nutch/parse/ParseSegment.java
index de45c463b9..e9f041a5fb 100644
--- a/src/java/org/apache/nutch/parse/ParseSegment.java
+++ b/src/java/org/apache/nutch/parse/ParseSegment.java
@@ -180,6 +180,17 @@ public static boolean isTruncated(Content content) {
if (metadata == null)
return false;
+ //check for okhttp or other protocol's truncated flag
+ //if the flag is there, no matter the value, trust it.
+ if (metadata.get(Response.TRUNCATED_CONTENT) != null) {
+ if ("true".equals(metadata.get(Response.TRUNCATED_CONTENT))) {
+ LOG.info(content.getUrl() + " skipped. Protocol metadata indicates truncated content, " +
+ "actualSize= " + content.getContent().length);
+ return true;
+ }
+ return false;
+ }
+
String lengthStr = metadata.get(Response.CONTENT_LENGTH);
if (lengthStr != null)
lengthStr = lengthStr.trim();
@@ -232,8 +243,7 @@ public void parse(Path segment) throws IOException,
LOG.info("ParseSegment: starting");
LOG.info("ParseSegment: segment: {}", segment);
- Job job = NutchJob.getInstance(getConf());
- job.setJobName("parse " + segment);
+ Job job = Job.getInstance(getConf(), "Nutch ParseSegment: " + segment);
Configuration conf = job.getConfiguration();
FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
diff --git a/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java b/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
index 4831d73f38..439d7438c4 100644
--- a/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
+++ b/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
@@ -341,8 +341,7 @@ public void dumpLinks(Path webGraphDb) throws IOException,
// run the inverter job
Path tempInverted = new Path(webGraphDb, "inverted-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
- Job inverter = NutchJob.getInstance(conf);
- inverter.setJobName("LinkDumper: inverter");
+ Job inverter = Job.getInstance(conf, "Nutch LinkDumper: invert " + webGraphDb);
FileInputFormat.addInputPath(inverter, nodeDb);
FileInputFormat.addInputPath(inverter, outlinkDb);
inverter.setInputFormatClass(SequenceFileInputFormat.class);
@@ -372,8 +371,7 @@ public void dumpLinks(Path webGraphDb) throws IOException,
}
// run the merger job
- Job merger = NutchJob.getInstance(conf);
- merger.setJobName("LinkDumper: merger");
+ Job merger = Job.getInstance(conf, "Nutch LinkDumper: merge " + tempInverted);
FileInputFormat.addInputPath(merger, tempInverted);
merger.setJarByClass(Merger.class);
merger.setInputFormatClass(SequenceFileInputFormat.class);
diff --git a/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java b/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java
index c226ad130b..e48f04acdf 100644
--- a/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java
+++ b/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java
@@ -93,9 +93,8 @@ private int runCounter(FileSystem fs, Path webGraphDb) throws IOException,
// configure the counter job
Path numLinksPath = new Path(webGraphDb, NUM_NODES);
Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
- Job counter = NutchJob.getInstance(getConf());
+ Job counter = Job.getInstance(getConf(), "Nutch LinkRank: counter " + webGraphDb);
Configuration conf = counter.getConfiguration();
- counter.setJobName("LinkRank Counter");
FileInputFormat.addInputPath(counter, nodeDb);
FileOutputFormat.setOutputPath(counter, numLinksPath);
counter.setInputFormatClass(SequenceFileInputFormat.class);
@@ -194,9 +193,8 @@ private void runInitializer(Path nodeDb, Path output) throws IOException,
InterruptedException, ClassNotFoundException {
// configure the initializer
- Job initializer = NutchJob.getInstance(getConf());
+ Job initializer = Job.getInstance(getConf(), "Nutch LinkRank: initializer " + nodeDb);
Configuration conf = initializer.getConfiguration();
- initializer.setJobName("LinkAnalysis Initializer");
FileInputFormat.addInputPath(initializer, nodeDb);
FileOutputFormat.setOutputPath(initializer, output);
initializer.setJarByClass(Initializer.class);
@@ -245,9 +243,9 @@ private void runInverter(Path nodeDb, Path outlinkDb, Path output)
throws IOException, InterruptedException, ClassNotFoundException {
// configure the inverter
- Job inverter = NutchJob.getInstance(getConf());
+ Job inverter = Job.getInstance(getConf(),
+ "Nutch Linkrank: inverter nodedb: " + nodeDb + " outlinkdb: " + outlinkDb);
Configuration conf = inverter.getConfiguration();
- inverter.setJobName("LinkAnalysis Inverter");
FileInputFormat.addInputPath(inverter, nodeDb);
FileInputFormat.addInputPath(inverter, outlinkDb);
FileOutputFormat.setOutputPath(inverter, output);
@@ -305,11 +303,10 @@ private void runAnalysis(Path nodeDb, Path inverted, Path output,
int iteration, int numIterations, float rankOne)
throws IOException, InterruptedException, ClassNotFoundException {
- Job analyzer = NutchJob.getInstance(getConf());
+ Job analyzer = Job.getInstance(getConf(),
+ "Nutch LinkRank: analysis iteration" + (iteration + 1) + " of " + numIterations);
Configuration conf = analyzer.getConfiguration();
conf.set("link.analyze.iteration", String.valueOf(iteration + 1));
- analyzer.setJobName("LinkAnalysis Analyzer, iteration " + (iteration + 1)
- + " of " + numIterations);
FileInputFormat.addInputPath(analyzer, nodeDb);
FileInputFormat.addInputPath(analyzer, inverted);
FileOutputFormat.setOutputPath(analyzer, output);
diff --git a/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java b/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java
index dfccccc19e..9277df8f66 100644
--- a/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java
+++ b/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java
@@ -298,9 +298,8 @@ public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output,
LOG.info("NodeDumper: starting");
Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
- Job dumper = NutchJob.getInstance(getConf());
+ Job dumper = Job.getInstance(getConf(), "Nutch NodeDumper: " + webGraphDb);
Configuration conf = dumper.getConfiguration();
- dumper.setJobName("NodeDumper: " + webGraphDb);
FileInputFormat.addInputPath(dumper, nodeDb);
dumper.setInputFormatClass(SequenceFileInputFormat.class);
diff --git a/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java b/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java
index c10a6e37b0..bcd5342743 100644
--- a/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java
+++ b/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java
@@ -170,8 +170,7 @@ public void update(Path crawlDb, Path webGraphDb) throws IOException,
.nextInt(Integer.MAX_VALUE)));
// run the updater job outputting to the temp crawl database
- Job updater = NutchJob.getInstance(conf);
- updater.setJobName("Update CrawlDb from WebGraph");
+ Job updater = Job.getInstance(conf, "Nutch ScoreUpdater: " + crawlDb);
FileInputFormat.addInputPath(updater, crawlDbCurrent);
FileInputFormat.addInputPath(updater, nodeDb);
FileOutputFormat.setOutputPath(updater, newCrawlDb);
diff --git a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
index b98329d1e0..25e3cf2304 100644
--- a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
+++ b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
@@ -545,9 +545,8 @@ public void createWebGraph(Path webGraphDb, Path[] segments,
Path tempOutlinkDb = new Path(outlinkDb + "-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
- Job outlinkJob = NutchJob.getInstance(getConf());
+ Job outlinkJob = Job.getInstance(getConf(), "Nutch WebGraph: outlinkdb " + outlinkDb);
Configuration outlinkJobConf = outlinkJob.getConfiguration();
- outlinkJob.setJobName("Outlinkdb: " + outlinkDb);
boolean deleteGone = outlinkJobConf.getBoolean("link.delete.gone", false);
boolean preserveBackup = outlinkJobConf.getBoolean("db.preserve.backup", true);
@@ -625,9 +624,8 @@ public void createWebGraph(Path webGraphDb, Path[] segments,
Path tempInlinkDb = new Path(inlinkDb + "-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
- Job inlinkJob = NutchJob.getInstance(getConf());
+ Job inlinkJob = Job.getInstance(getConf(), "Nutch WebGraph: inlinkdb " + inlinkDb);
Configuration inlinkJobConf = inlinkJob.getConfiguration();
- inlinkJob.setJobName("Inlinkdb " + inlinkDb);
LOG.info("InlinkDb: adding input: " + outlinkDb);
FileInputFormat.addInputPath(inlinkJob, outlinkDb);
inlinkJob.setInputFormatClass(SequenceFileInputFormat.class);
@@ -669,9 +667,8 @@ public void createWebGraph(Path webGraphDb, Path[] segments,
Path tempNodeDb = new Path(nodeDb + "-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
- Job nodeJob = NutchJob.getInstance(getConf());
+ Job nodeJob = Job.getInstance(getConf(), "Nutch WebGraph: nodedb " + nodeDb);
Configuration nodeJobConf = nodeJob.getConfiguration();
- nodeJob.setJobName("NodeDb " + nodeDb);
LOG.info("NodeDb: adding input: " + outlinkDb);
LOG.info("NodeDb: adding input: " + inlinkDb);
FileInputFormat.addInputPath(nodeJob, outlinkDb);
diff --git a/src/java/org/apache/nutch/segment/SegmentMerger.java b/src/java/org/apache/nutch/segment/SegmentMerger.java
index c884dfedf6..53bdee22eb 100644
--- a/src/java/org/apache/nutch/segment/SegmentMerger.java
+++ b/src/java/org/apache/nutch/segment/SegmentMerger.java
@@ -625,9 +625,8 @@ public void merge(Path out, Path[] segs, boolean filter, boolean normalize,
long slice) throws IOException, ClassNotFoundException, InterruptedException {
String segmentName = Generator.generateSegmentName();
LOG.info("Merging {} segments to {}/{}", segs.length, out, segmentName);
- Job job = NutchJob.getInstance(getConf());
+ Job job = Job.getInstance(getConf(), "Nutch SegmentMerger: " + out + "/" + segmentName);
Configuration conf = job.getConfiguration();
- job.setJobName("mergesegs " + out + "/" + segmentName);
conf.setBoolean("segment.merger.filter", filter);
conf.setBoolean("segment.merger.normalizer", normalize);
conf.setLong("segment.merger.slice", slice);
diff --git a/src/java/org/apache/nutch/segment/SegmentReader.java b/src/java/org/apache/nutch/segment/SegmentReader.java
index ee5c266fd0..bef980060e 100644
--- a/src/java/org/apache/nutch/segment/SegmentReader.java
+++ b/src/java/org/apache/nutch/segment/SegmentReader.java
@@ -200,8 +200,7 @@ public void dump(Path segment, Path output) throws IOException,
LOG.info("SegmentReader: dump segment: {}", segment);
- Job job = NutchJob.getInstance(getConf());
- job.setJobName("read " + segment);
+ Job job = Job.getInstance(getConf(), "Nutch SegmentReader: " + segment);
Configuration conf = job.getConfiguration();
if (ge)
diff --git a/src/java/org/apache/nutch/tools/FreeGenerator.java b/src/java/org/apache/nutch/tools/FreeGenerator.java
index e9f5c87619..9ace8f192f 100644
--- a/src/java/org/apache/nutch/tools/FreeGenerator.java
+++ b/src/java/org/apache/nutch/tools/FreeGenerator.java
@@ -184,7 +184,7 @@ public int run(String[] args) throws Exception {
stopWatch.start();
LOG.info("FreeGenerator: starting");
- Job job = NutchJob.getInstance(getConf());
+ Job job = Job.getInstance(getConf(), "Nutch FreeGenerator: " + args[0]);
Configuration conf = job.getConfiguration();
conf.setBoolean(FILTER_KEY, filter);
conf.setBoolean(NORMALIZE_KEY, normalize);
diff --git a/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java b/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
index 825e752cc0..311675310a 100644
--- a/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
+++ b/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
@@ -371,14 +371,11 @@ public void createSegments(Path arcFiles, Path segmentsOutDir)
StopWatch stopWatch = new StopWatch();
stopWatch.start();
- if (LOG.isInfoEnabled()) {
- LOG.info("ArcSegmentCreator: starting");
- LOG.info("ArcSegmentCreator: arc files dir: " + arcFiles);
- }
+ LOG.info("ArcSegmentCreator: starting");
+ LOG.info("ArcSegmentCreator: arc files dir: " + arcFiles);
- Job job = NutchJob.getInstance(getConf());
+ Job job = Job.getInstance(getConf(), "Nutch ArcSegmentCreator: " + arcFiles);
Configuration conf = job.getConfiguration();
- job.setJobName("ArcSegmentCreator " + arcFiles);
String segName = generateSegmentName();
conf.set(Nutch.SEGMENT_NAME_KEY, segName);
FileInputFormat.addInputPath(job, arcFiles);
diff --git a/src/java/org/apache/nutch/tools/warc/WARCExporter.java b/src/java/org/apache/nutch/tools/warc/WARCExporter.java
index 6d8a385572..4e80aac5f6 100644
--- a/src/java/org/apache/nutch/tools/warc/WARCExporter.java
+++ b/src/java/org/apache/nutch/tools/warc/WARCExporter.java
@@ -433,8 +433,7 @@ public int generateWARC(String output, List segments,
stopWatch.start();
LOG.info("WARCExporter: starting");
- final Job job = NutchJob.getInstance(getConf());
- job.setJobName("warc-exporter " + output);
+ final Job job = Job.getInstance(getConf(), "Nutch WARCExporter: " + output);
job.getConfiguration().setBoolean(ONLY_SUCCESSFUL_RESPONSES,
onlySuccessfulResponses);
diff --git a/src/java/org/apache/nutch/util/CrawlCompletionStats.java b/src/java/org/apache/nutch/util/CrawlCompletionStats.java
index 8696d28221..e5ee5f6439 100644
--- a/src/java/org/apache/nutch/util/CrawlCompletionStats.java
+++ b/src/java/org/apache/nutch/util/CrawlCompletionStats.java
@@ -133,12 +133,12 @@ public int run(String[] args) throws Exception {
LOG.info("CrawlCompletionStats: starting");
int mode = 0;
- String jobName = "CrawlCompletionStats";
+ String jobName = "Nutch CrawlCompletionStats: ";
if (cli.getOptionValue("mode").equals("host")) {
- jobName = "Host CrawlCompletionStats";
+ jobName = jobName + "Host statistics";
mode = MODE_HOST;
} else if (cli.getOptionValue("mode").equals("domain")) {
- jobName = "Domain CrawlCompletionStats";
+ jobName = jobName + "Domain statistics";
mode = MODE_DOMAIN;
}
diff --git a/src/java/org/apache/nutch/util/NutchJob.java b/src/java/org/apache/nutch/util/NutchJob.java
index 068c64fefb..25b8945504 100644
--- a/src/java/org/apache/nutch/util/NutchJob.java
+++ b/src/java/org/apache/nutch/util/NutchJob.java
@@ -56,10 +56,6 @@ public NutchJob(Configuration conf, String jobName) throws IOException {
}
}
- public static Job getInstance(Configuration conf) throws IOException {
- return Job.getInstance(conf);
- }
-
/**
* Clean up the file system in case of a job failure.
* @param tempDir The temporary directory which needs to be
diff --git a/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java b/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
index 0fe6c57d03..f4e8a1b913 100644
--- a/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
+++ b/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
@@ -89,7 +89,7 @@ public int run(String[] args) throws Exception {
stopWatch.start();
LOG.info("ProtocolStatistics: starting");
- String jobName = "ProtocolStatistics";
+ String jobName = "Nutch ProtocolStatusStatistics: " + inputDir;
Configuration conf = getConf();
conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java
index 66fa9b0e7a..043e77f694 100644
--- a/src/java/org/apache/nutch/util/SitemapProcessor.java
+++ b/src/java/org/apache/nutch/util/SitemapProcessor.java
@@ -383,7 +383,7 @@ public void sitemap(Path crawldb, Path hostdb, Path sitemapUrlDir, boolean stric
conf.setBoolean(SITEMAP_URL_NORMALIZING, normalize);
conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
- Job job = Job.getInstance(conf, "SitemapProcessor_" + crawldb.toString());
+ Job job = Job.getInstance(conf, "Nutch SitemapProcessor: " + crawldb.toString());
job.setJarByClass(SitemapProcessor.class);
// add crawlDb, sitemap url directory and hostDb to input paths
@@ -431,23 +431,21 @@ public void sitemap(Path crawldb, Path hostdb, Path sitemapUrlDir, boolean stric
FSUtils.replace(fs, current, tempCrawlDb, true);
LockUtil.removeLockFile(fs, lock);
- if (LOG.isInfoEnabled()) {
- long filteredRecords = job.getCounters().findCounter("Sitemap", "filtered_records").getValue();
- long fromHostname = job.getCounters().findCounter("Sitemap", "sitemaps_from_hostname").getValue();
- long fromSeeds = job.getCounters().findCounter("Sitemap", "sitemap_seeds").getValue();
- long failedFetches = job.getCounters().findCounter("Sitemap", "failed_fetches").getValue();
- long newSitemapEntries = job.getCounters().findCounter("Sitemap", "new_sitemap_entries").getValue();
-
- LOG.info("SitemapProcessor: Total records rejected by filters: {}", filteredRecords);
- LOG.info("SitemapProcessor: Total sitemaps from host name: {}", fromHostname);
- LOG.info("SitemapProcessor: Total sitemaps from seed urls: {}", fromSeeds);
- LOG.info("SitemapProcessor: Total failed sitemap fetches: {}", failedFetches);
- LOG.info("SitemapProcessor: Total new sitemap entries added: {}", newSitemapEntries);
-
- stopWatch.stop();
- LOG.info("SitemapProcessor: finished, elapsed: {} ms", stopWatch.getTime(
- TimeUnit.MILLISECONDS));
- }
+ long filteredRecords = job.getCounters().findCounter("Sitemap", "filtered_records").getValue();
+ long fromHostname = job.getCounters().findCounter("Sitemap", "sitemaps_from_hostname").getValue();
+ long fromSeeds = job.getCounters().findCounter("Sitemap", "sitemap_seeds").getValue();
+ long failedFetches = job.getCounters().findCounter("Sitemap", "failed_fetches").getValue();
+ long newSitemapEntries = job.getCounters().findCounter("Sitemap", "new_sitemap_entries").getValue();
+
+ LOG.info("SitemapProcessor: Total records rejected by filters: {}", filteredRecords);
+ LOG.info("SitemapProcessor: Total sitemaps from host name: {}", fromHostname);
+ LOG.info("SitemapProcessor: Total sitemaps from seed urls: {}", fromSeeds);
+ LOG.info("SitemapProcessor: Total failed sitemap fetches: {}", failedFetches);
+ LOG.info("SitemapProcessor: Total new sitemap entries added: {}", newSitemapEntries);
+
+ stopWatch.stop();
+ LOG.info("SitemapProcessor: finished, elapsed: {} ms", stopWatch.getTime(
+ TimeUnit.MILLISECONDS));
} catch (IOException | InterruptedException | ClassNotFoundException e) {
LOG.error("SitemapProcessor_" + crawldb.toString(), e);
NutchJob.cleanupAfterFailure(tempCrawlDb, lock, fs);
diff --git a/src/java/org/apache/nutch/util/domain/DomainStatistics.java b/src/java/org/apache/nutch/util/domain/DomainStatistics.java
index f77b72bc5f..1843c424d1 100644
--- a/src/java/org/apache/nutch/util/domain/DomainStatistics.java
+++ b/src/java/org/apache/nutch/util/domain/DomainStatistics.java
@@ -97,18 +97,18 @@ public int run(String[] args) throws Exception {
LOG.info("DomainStatistics: starting");
int mode = 0;
- String jobName = "DomainStatistics";
+ String jobName = "Nutch DomainStatistics: ";
if (args[2].equals("host")) {
- jobName = "Host statistics";
+ jobName = jobName + "Host statistics";
mode = MODE_HOST;
} else if (args[2].equals("domain")) {
- jobName = "Domain statistics";
+ jobName = jobName + "Domain statistics";
mode = MODE_DOMAIN;
} else if (args[2].equals("suffix")) {
- jobName = "Suffix statistics";
+ jobName = jobName + "Suffix statistics";
mode = MODE_SUFFIX;
} else if (args[2].equals("tld")) {
- jobName = "TLD statistics";
+ jobName = jobName + "Top Level Directory statistics";
mode = MODE_TLD;
}
diff --git a/src/plugin/language-identifier/ivy.xml b/src/plugin/language-identifier/ivy.xml
index f64b97055b..e22284c03b 100644
--- a/src/plugin/language-identifier/ivy.xml
+++ b/src/plugin/language-identifier/ivy.xml
@@ -35,7 +35,7 @@
-
+
diff --git a/src/plugin/language-identifier/plugin.xml b/src/plugin/language-identifier/plugin.xml
index dab1a52f31..94929bdbfa 100644
--- a/src/plugin/language-identifier/plugin.xml
+++ b/src/plugin/language-identifier/plugin.xml
@@ -26,7 +26,7 @@
-
+
diff --git a/src/plugin/parse-tika/ivy.xml b/src/plugin/parse-tika/ivy.xml
index f16636d255..d1929ee0e2 100644
--- a/src/plugin/parse-tika/ivy.xml
+++ b/src/plugin/parse-tika/ivy.xml
@@ -35,7 +35,7 @@
-
+
diff --git a/src/plugin/parse-tika/plugin.xml b/src/plugin/parse-tika/plugin.xml
index a20fa7266a..9ec410182b 100644
--- a/src/plugin/parse-tika/plugin.xml
+++ b/src/plugin/parse-tika/plugin.xml
@@ -25,7 +25,7 @@
-
+
diff --git a/src/plugin/urlfilter-fast/README.md b/src/plugin/urlfilter-fast/README.md
index 2e58605752..b4b0dfcd96 100644
--- a/src/plugin/urlfilter-fast/README.md
+++ b/src/plugin/urlfilter-fast/README.md
@@ -73,3 +73,9 @@ the end of the line.
The rules file is defined via the property `urlfilter.fast.file`,
the default name is `fast-urlfilter.txt`.
+
+In addition, the filter checks that the length of the URL's path element and of its query
+do not exceed the limits set in the properties `urlfilter.fast.url.path.max.length` and
+`urlfilter.fast.url.query.max.length`, if configured. The overall length of the URL can also be
+limited via the property `urlfilter.fast.url.max.length`.
+
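
A minimal sketch of the length checks in action, modelled on the plugin's unit test further down in this change (class name is made up; the `Reader`/`Configuration` constructor is package-private, so the sketch lives in the plugin's package):

```java
package org.apache.nutch.urlfilter.fast;

import java.io.StringReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.URLFilter;

/** Illustrative only: exercises the new length checks with an empty rule set. */
public class LengthLimitExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.setInt(FastURLFilter.URLFILTER_FAST_MAX_LENGTH, 64);

    // empty rule set: only the length-based checks can reject a URL
    URLFilter filter = new FastURLFilter(new StringReader(""), conf);

    StringBuilder longUrl = new StringBuilder("http://nutch.apache.org/");
    for (int i = 0; i < 100; i++) {
      longUrl.append('a');
    }

    // filter() returns null when a URL is rejected, the URL itself otherwise
    System.out.println(filter.filter(longUrl.toString()));         // null (length > 64)
    System.out.println(filter.filter("http://nutch.apache.org/")); // printed unchanged
  }
}
```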
diff --git a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
index ffcd0138a9..b1e589a0e1 100644
--- a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
+++ b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
@@ -20,6 +20,10 @@
import com.google.common.collect.Multimap;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.CompressionCodecFactory;
+import org.apache.hadoop.fs.FileSystem;
import org.apache.nutch.net.URLFilter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -27,6 +31,8 @@
import java.lang.invoke.MethodHandles;
import java.io.BufferedReader;
import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.util.regex.Pattern;
@@ -89,6 +95,9 @@
*
 * The rules file is defined via the property <code>urlfilter.fast.file</code>,
 * the default name is <code>fast-urlfilter.txt</code>.
+ *
+ * In addition, it can filter based on the length of the whole URL, its path element or
+ * its query element. See the <code>urlfilter.fast.url.*</code> configurations.
*/
public class FastURLFilter implements URLFilter {
@@ -97,25 +106,49 @@ public class FastURLFilter implements URLFilter {
private Configuration conf;
public static final String URLFILTER_FAST_FILE = "urlfilter.fast.file";
+ public static final String URLFILTER_FAST_MAX_LENGTH = "urlfilter.fast.url.max.length";
+ public static final String URLFILTER_FAST_PATH_MAX_LENGTH = "urlfilter.fast.url.path.max.length";
+ public static final String URLFILTER_FAST_QUERY_MAX_LENGTH = "urlfilter.fast.url.query.max.length";
+
  private Multimap<String, Rule> hostRules = LinkedHashMultimap.create();
  private Multimap<String, Rule> domainRules = LinkedHashMultimap.create();
+ /** Max allowed size of the path of a URL **/
+ private int maxLengthPath = -1;
+ /** Max allowed size of the query of a URL **/
+ private int maxLengthQuery = -1;
+ /** Max allowed size for the whole URL **/
+ private int maxLength = -1;
+
private static final Pattern CATCH_ALL_RULE = Pattern
.compile("^\\s*DenyPath(?:Query)?\\s+\\.[*?]\\s*$");
public FastURLFilter() {}
+ /** Used by the tests so that the rules file doesn't have to be in the jar **/
FastURLFilter(Reader rules) throws IOException, PatternSyntaxException {
reloadRules(rules);
}
+
+ /** Used by the tests so that the rules file doesn't have to be in the jar AND
+ * we can set the conf for the length-based filtering **/
+ FastURLFilter(Reader rules, Configuration conf) throws IOException, PatternSyntaxException {
+ maxLengthPath = conf.getInt(URLFILTER_FAST_PATH_MAX_LENGTH, -1);
+ maxLengthQuery = conf.getInt(URLFILTER_FAST_QUERY_MAX_LENGTH, -1);
+ maxLength = conf.getInt(URLFILTER_FAST_MAX_LENGTH, -1);
+ reloadRules(rules);
+ }
@Override
public void setConf(Configuration conf) {
this.conf = conf;
+ maxLengthPath = conf.getInt(URLFILTER_FAST_PATH_MAX_LENGTH, -1);
+ maxLengthQuery = conf.getInt(URLFILTER_FAST_QUERY_MAX_LENGTH, -1);
+ maxLength = conf.getInt(URLFILTER_FAST_MAX_LENGTH, -1);
try {
reloadRules();
} catch (Exception e) {
- LOG.error(e.getMessage());
+ LOG.error("Failed to load rules: {}", e.getMessage() );
throw new RuntimeException(e.getMessage(), e);
}
}
@@ -128,6 +161,12 @@ public Configuration getConf() {
@Override
public String filter(String url) {
+ if (maxLength != -1 && url.length() > maxLength) {
+ LOG.debug("Rejected {} because URL length ({}) greater than limit {}", url,
+ url.length(), maxLength);
+ return null;
+ }
+
URL u;
try {
@@ -137,6 +176,22 @@ public String filter(String url) {
e.getMessage());
return null;
}
+
+ final String path = u.getPath();
+ if (maxLengthPath != -1 && path.length() > maxLengthPath)
+ {
+ LOG.debug("Rejected {} as path length {} is greater than {}", url,
+ path.length(), maxLengthPath);
+ return null;
+ }
+
+ final String query = u.getQuery();
+ if (maxLengthQuery != -1 && query != null && query.length() > maxLengthQuery)
+ {
+ LOG.debug("Rejected {} as query length {} is greater than {}", url,
+ query.length(), maxLengthQuery);
+ return null;
+ }
String hostname = u.getHost();
@@ -181,8 +236,33 @@ public String filter(String url) {
public void reloadRules() throws IOException {
String fileRules = conf.get(URLFILTER_FAST_FILE);
- try (Reader reader = conf.getConfResourceAsReader(fileRules)) {
- reloadRules(reader);
+ InputStream is;
+
+ Path fileRulesPath = new Path(fileRules);
+ if (fileRulesPath.toUri().getScheme() != null) {
+ FileSystem fs = fileRulesPath.getFileSystem(conf);
+ is = fs.open(fileRulesPath);
+ } else {
+ is = conf.getConfResourceAsInputStream(fileRules);
+ }
+
+ CompressionCodec codec = new CompressionCodecFactory(conf)
+ .getCodec(fileRulesPath);
+ if (codec != null && is != null) {
+ is = codec.createInputStream(is);
+ }
+
+ try {
+ reloadRules(new InputStreamReader(is));
+    } catch (Exception e) {
+      String message = "Couldn't load the rules from " + fileRules;
+      LOG.error(message, e);
+      throw new IOException(message, e);
+    } finally {
+ if (is != null) {
+ is.close();
+ }
}
}
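
For reference, a small standalone sketch of the codec lookup used above: the codec is resolved purely from the file suffix, so a plain file name yields no codec and the stream is read as-is (the file name here is hypothetical).

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

/** Illustrative only: shows how a codec is chosen from the file suffix. */
public class CodecLookupSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // a null codec means the stream is read uncompressed, matching the branch above
    CompressionCodec codec = new CompressionCodecFactory(conf)
        .getCodec(new Path("fast-urlfilter.txt.gz"));
    System.out.println(codec == null ? "uncompressed" : codec.getClass().getSimpleName());
  }
}
```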
diff --git a/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java b/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java
index 8e01d8d3cd..75b37250eb 100644
--- a/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java
+++ b/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java
@@ -19,13 +19,14 @@
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
+import java.io.StringReader;
+import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.URLFilter;
import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest;
import org.junit.Assert;
import org.junit.Test;
-
public class TestFastURLFilter extends RegexURLFilterBaseTest {
@Override
@@ -53,4 +54,39 @@ public void benchmark() {
bench(800, "fast-urlfilter-benchmark.txt", "Benchmarks.urls");
}
+ @Test
+ public void lengthQueryAndPath() throws Exception {
+ Configuration conf = new Configuration();
+ conf.setInt(FastURLFilter.URLFILTER_FAST_PATH_MAX_LENGTH, 50);
+ conf.setInt(FastURLFilter.URLFILTER_FAST_QUERY_MAX_LENGTH, 50);
+ // not interested in testing rules
+ URLFilter filter = new FastURLFilter(new StringReader(""), conf);
+
+ StringBuilder url = new StringBuilder("http://nutch.apache.org/");
+ for (int i = 0; i < 50; i++) {
+ url.append(i);
+ }
+ Assert.assertEquals(null, filter.filter(url.toString()));
+
+ url = new StringBuilder("http://nutch.apache.org/path?");
+ for (int i = 0; i < 50; i++) {
+ url.append(i);
+ }
+
+ Assert.assertEquals(null, filter.filter(url.toString()));
+ }
+
+ @Test
+  public void overallLengthTest() throws Exception {
+ Configuration conf = new Configuration();
+ conf.setInt(FastURLFilter.URLFILTER_FAST_MAX_LENGTH, 100);
+ // not interested in testing rules
+ URLFilter filter = new FastURLFilter(new StringReader(""), conf);
+
+ StringBuilder url = new StringBuilder("http://nutch.apache.org/");
+ for (int i = 0; i < 500; i++) {
+ url.append(i);
+ }
+ Assert.assertEquals(null, filter.filter(url.toString()));
+ }
}
diff --git a/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java b/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
index 82fefaf164..812d4a6a8f 100644
--- a/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
+++ b/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
@@ -31,7 +31,6 @@
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
-import org.apache.nutch.util.NutchJob;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
@@ -94,7 +93,7 @@ public void testUrl404Purging() throws Exception {
conf.setBoolean(CrawlDbFilter.URL_NORMALIZING, true);
conf.setBoolean(CrawlDbFilter.URL_FILTERING, false);
conf.setInt("urlnormalizer.loop.count", 2);
- Job job = NutchJob.getInstance(conf);
+ Job job = Job.getInstance(conf);
job.setJobName("Test CrawlDbFilter");
Path current = new Path(dbDir, "current");
if (FileSystem.get(conf).exists(current)) {
diff --git a/src/test/org/apache/nutch/parse/TestParseSegment.java b/src/test/org/apache/nutch/parse/TestParseSegment.java
new file mode 100644
index 0000000000..dd7f4f9202
--- /dev/null
+++ b/src/test/org/apache/nutch/parse/TestParseSegment.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+import static junit.framework.TestCase.assertFalse;
+import static junit.framework.TestCase.assertTrue;
+
+import java.nio.charset.StandardCharsets;
+
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Content;
+import org.junit.Test;
+
+public class TestParseSegment {
+ private static byte[] BYTES = "the quick brown fox".getBytes(StandardCharsets.UTF_8);
+
+ @Test
+ public void testMetadataFlag() throws Exception {
+
+ Content content = new Content();
+ Metadata metadata = new Metadata();
+ metadata.set(Response.TRUNCATED_CONTENT, "true");
+ content.setMetadata(metadata);
+ content.setContent(BYTES);
+ assertTrue(ParseSegment.isTruncated(content));
+
+ metadata.set(Response.TRUNCATED_CONTENT, "false");
+ assertFalse(ParseSegment.isTruncated(content));
+
+    //test that the truncated_content flag overrides the length field
+    metadata = new Metadata();
+    metadata.set(Response.TRUNCATED_CONTENT, "false");
+    metadata.set(Response.CONTENT_LENGTH, Integer.toString(BYTES.length + 10));
+    content.setMetadata(metadata);
+    assertFalse(ParseSegment.isTruncated(content));
+
+    //test that the truncated_content flag overrides the length field
+    metadata = new Metadata();
+    metadata.set(Response.TRUNCATED_CONTENT, "true");
+    metadata.set(Response.CONTENT_LENGTH, Integer.toString(BYTES.length));
+    content.setMetadata(metadata);
+    assertTrue(ParseSegment.isTruncated(content));
+
+ }
+
+ @Test
+ public void testLength() throws Exception {
+ Content content = new Content();
+ Metadata metadata = new Metadata();
+ metadata.set(Response.CONTENT_LENGTH, Integer.toString(BYTES.length));
+ content.setMetadata(metadata);
+ content.setContent(BYTES);
+ assertFalse(ParseSegment.isTruncated(content));
+
+ metadata.set(Response.CONTENT_LENGTH, Integer.toString(BYTES.length * 2));
+ assertTrue(ParseSegment.isTruncated(content));
+ }
+
+ @Test
+ public void testNoLengthField() {
+ //test return false if there is no "Length" header field
+ Content content = new Content();
+ Metadata metadata = new Metadata();
+ content.setMetadata(metadata);
+ content.setContent(BYTES);
+ assertFalse(ParseSegment.isTruncated(content));
+ }
+}
diff --git a/src/test/org/apache/nutch/plugin/TestPluginSystem.java b/src/test/org/apache/nutch/plugin/TestPluginSystem.java
index dba7c66066..7c1362aa56 100644
--- a/src/test/org/apache/nutch/plugin/TestPluginSystem.java
+++ b/src/test/org/apache/nutch/plugin/TestPluginSystem.java
@@ -28,7 +28,6 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.NutchJob;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
@@ -102,7 +101,7 @@ public void testLoadPlugins() {
public void testRepositoryCache() throws IOException {
Configuration config = NutchConfiguration.create();
PluginRepository repo = PluginRepository.get(config);
- Job job = NutchJob.getInstance(config);
+ Job job = Job.getInstance(config);
config = job.getConfiguration();
PluginRepository repo1 = PluginRepository.get(config);
Assert.assertTrue(repo == repo1);
@@ -111,7 +110,7 @@ public void testRepositoryCache() throws IOException {
config.addResource("nutch-default.xml");
config.addResource("nutch-site.xml");
repo = PluginRepository.get(config);
- job = NutchJob.getInstance(config);
+ job = Job.getInstance(config);
config = job.getConfiguration();
repo1 = PluginRepository.get(config);
Assert.assertTrue(repo1 != repo);