Commit bec481b

Merged from apache master

Signed-off-by: Julien Nioche <[email protected]>
jnioche committed Nov 9, 2023
2 parents f3f948e + 7ad382d commit bec481b
Showing 42 changed files with 337 additions and 134 deletions.
34 changes: 32 additions & 2 deletions conf/nutch-default.xml
@@ -1964,8 +1964,38 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
<property>
<name>urlfilter.fast.file</name>
<value>fast-urlfilter.txt</value>
<description>Name of file on CLASSPATH containing regular expressions
used by urlfilter-fast (FastURLFilter) plugin.</description>
<description>Name of file containing rules and regular expressions
used by urlfilter-fast (FastURLFilter) plugin. If the filename
includes a scheme (for example, hdfs://) it is loaded using the
Hadoop FileSystem implementation supporting that scheme. If the
filename does not contain a scheme, the file is loaded from
CLASSPATH. If indicated by file extension (.gz, .bzip2, .zst),
the file is decompressed while reading using Hadoop-provided
compression codecs.</description>
</property>
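
For example, the description above implies that a deployment can point the filter at a compressed rules file on HDFS. A minimal Java sketch, assuming a hypothetical namenode URI and file name (only the property name and the scheme/suffix behaviour come from the description above):

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;

public class FastFilterFileExample {
  public static void main(String[] args) {
    Configuration conf = NutchConfiguration.create();
    // The hdfs:// scheme selects a Hadoop FileSystem loader, and the .gz
    // suffix triggers decompression via Hadoop's compression codecs.
    conf.set("urlfilter.fast.file",
        "hdfs://namenode:8020/nutch/conf/fast-urlfilter.txt.gz"); // hypothetical path
    System.out.println(conf.get("urlfilter.fast.file"));
  }
}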

<property>
<name>urlfilter.fast.url.max.length</name>
<value>-1</value>
<description>Filters URLs based on their overall length.
The default value of -1 means that it is deactivated.
</description>
</property>

<property>
<name>urlfilter.fast.url.path.max.length</name>
<value>-1</value>
<description>Filters URLs based on the length of their path element.
The default value of -1 means that it is deactivated.
</description>
</property>

<property>
<name>urlfilter.fast.url.query.max.length</name>
<value>-1</value>
<description>Filters URLs based on the length of their query element.
The default value of -1 means that it is deactivated.
</description>
</property>
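
The three length checks are independent, and each stays off at the default of -1. A hedged sketch of enabling them programmatically (the numeric values are illustrative, not recommendations):

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;

public class UrlLengthLimitsExample {
  public static void main(String[] args) {
    Configuration conf = NutchConfiguration.create();
    conf.setInt("urlfilter.fast.url.max.length", 2048);       // whole URL
    conf.setInt("urlfilter.fast.url.path.max.length", 512);   // path element only
    conf.setInt("urlfilter.fast.url.query.max.length", 1024); // query element only
  }
}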

<property>
16 changes: 9 additions & 7 deletions ivy/ivy.xml
@@ -36,19 +36,21 @@
</publications>

<dependencies>
<dependency org="org.apache.logging.log4j" name="log4j-api" rev="2.20.0" conf="*->master" />
<dependency org="org.apache.logging.log4j" name="log4j-core" rev="2.20.0" conf="*->master" />
<dependency org="org.apache.logging.log4j" name="log4j-slf4j2-impl" rev="2.20.0" conf="*->master" />
<dependency org="org.slf4j" name="slf4j-api" rev="2.0.7" conf="*->master" />
<dependency org="org.apache.logging.log4j" name="log4j-api" rev="2.21.1" conf="*->master" />
<dependency org="org.apache.logging.log4j" name="log4j-core" rev="2.21.1" conf="*->master" />
<dependency org="org.apache.logging.log4j" name="log4j-slf4j2-impl" rev="2.21.1" conf="*->master" />
<dependency org="org.slf4j" name="slf4j-api" rev="2.0.9" conf="*->master" />

<dependency org="org.apache.commons" name="commons-lang3" rev="3.13.0" conf="*->default" />
<dependency org="org.apache.commons" name="commons-collections4" rev="4.4" conf="*->master" />
<dependency org="org.apache.httpcomponents" name="httpclient" rev="4.5.13" conf="*->master" />
<dependency org="commons-codec" name="commons-codec" rev="1.16.0" conf="*->default" />
<!-- hadoop 3.4.0 should have 2.11.0; Tika is broken in distributed mode until then;
We're currently relying on the hadoop-safe-tika shim that shades commons-io to upgrade
Tika
see https://github.com/apache/nutch/pull/776 -->
<dependency org="commons-io" name="commons-io" rev="2.11.0" conf="*->default" />
<dependency org="org.apache.commons" name="commons-compress" rev="1.23.0" conf="*->default" />
<dependency org="org.apache.commons" name="commons-compress" rev="1.24.0" conf="*->default" />
<dependency org="org.apache.commons" name="commons-jexl3" rev="3.2.1" conf="*->default" />
<dependency org="com.tdunning" name="t-digest" rev="3.3" />

@@ -70,15 +72,15 @@
<exclude org="org.slf4j" name="*" />
</dependency><!-- End of Hadoop Dependencies -->

<dependency org="org.tallison.tika" name="tika-core-shaded" rev="2.9.0.0" conf="*->default" transitive="false"/>
<dependency org="org.tallison.tika" name="tika-core-shaded" rev="2.9.1.0" conf="*->default" transitive="false"/>
<!--dependency org="org.apache.tika" name="tika-core" rev="2.9.0" /-->
<!-- Tika parser text and html modules (without transitive dependencies) are used to detect
the charset in text resp. HTML documents. -->
<!-- With the shaded packages, we need to use the tika-parsers-standard-package-shaded.
It is therefore not required in parse-tika. -->
<!--dependency org="org.apache.tika" name="tika-parser-text-module" rev="2.9.0" transitive="false" /-->
<!--dependency org="org.apache.tika" name="tika-parser-html-module" rev="2.9.0" transitive="false" /-->
<dependency org="org.tallison.tika" name="tika-parsers-standard-package-shaded" rev="2.9.0.0" conf="*->default" transitive="false"/>
<dependency org="org.tallison.tika" name="tika-parsers-standard-package-shaded" rev="2.9.1.0" conf="*->default" transitive="false"/>

<!-- language detection -->
<dependency org="org.commoncrawl" name="language-detection-cld2" rev="0.1-SNAPSHOT" />
3 changes: 1 addition & 2 deletions src/java/org/apache/nutch/crawl/CrawlDb.java
@@ -165,8 +165,7 @@ public static Job createJob(Configuration config, Path crawlDb)
Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random()
.nextInt(Integer.MAX_VALUE)));

Job job = NutchJob.getInstance(config);
job.setJobName("crawldb " + crawlDb);
Job job = Job.getInstance(config, "Nutch CrawlDb: " + crawlDb);

Path current = new Path(crawlDb, CURRENT_NAME);
if (current.getFileSystem(job.getConfiguration()).exists(current)) {
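
The same mechanical change repeats in most files below: the NutchJob.getInstance(...) plus setJobName(...) pair is collapsed into Hadoop's two-argument factory method, which names the job at creation and gives every job a uniform "Nutch <Tool>: ..." name. A condensed sketch of the pattern (a hypothetical helper, not code from the commit):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;

public class JobNamingPattern {
  // Before: create the job, then name it in a second call.
  static Job before(Configuration config, Path crawlDb) throws IOException {
    Job job = Job.getInstance(config); // what NutchJob.getInstance() effectively did
    job.setJobName("crawldb " + crawlDb);
    return job;
  }

  // After: Job.getInstance(Configuration, String) names the job up front.
  static Job after(Configuration config, Path crawlDb) throws IOException {
    return Job.getInstance(config, "Nutch CrawlDb: " + crawlDb);
  }
}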
3 changes: 1 addition & 2 deletions src/java/org/apache/nutch/crawl/CrawlDbMerger.java
@@ -165,9 +165,8 @@ public static Job createMergeJob(Configuration conf, Path output,
Path newCrawlDb = new Path(output,
"merge-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

Job job = NutchJob.getInstance(conf);
Job job = Job.getInstance(conf, "Nutch CrawlDbMerger: " + output);
conf = job.getConfiguration();
job.setJobName("crawldb merge " + output);

job.setInputFormatClass(SequenceFileInputFormat.class);

20 changes: 7 additions & 13 deletions src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -564,9 +564,8 @@ private TreeMap<String, Writable> processStatJobHelper(String crawlDb,
throws IOException, InterruptedException, ClassNotFoundException {
Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());

Job job = NutchJob.getInstance(config);
Job job = Job.getInstance(config, "Nutch CrawlDbReader: " + crawlDb);
config = job.getConfiguration();
job.setJobName("stats " + crawlDb);
config.setBoolean("db.reader.stats.sort", sort);

FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
@@ -812,7 +811,7 @@ public CrawlDatum get(String crawlDb, String url, Configuration config)

@Override
protected int process(String line, StringBuilder output) throws Exception {
Job job = NutchJob.getInstance(getConf());
Job job = Job.getInstance(getConf(), "Nutch CrawlDbReader: process " + crawlDb);
Configuration config = job.getConfiguration();
readUrl(this.crawlDb, line, config, output);
return 0;
@@ -839,8 +838,7 @@ public void processDumpJob(String crawlDb, String output,

Path outFolder = new Path(output);

Job job = NutchJob.getInstance(config);
job.setJobName("dump " + crawlDb);
Job job = Job.getInstance(config, "Nutch CrawlDbReader: dump " + crawlDb);
Configuration jobConf = job.getConfiguration();

FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
@@ -958,18 +956,15 @@ public void processTopNJob(String crawlDb, long topN, float min,
String output, Configuration config)
throws IOException, ClassNotFoundException, InterruptedException {

if (LOG.isInfoEnabled()) {
LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")");
LOG.info("CrawlDb db: {}", crawlDb);
}
LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")");
LOG.info("CrawlDb db: {}", crawlDb);

Path outFolder = new Path(output);
Path tempDir = new Path(
config.get("mapreduce.cluster.temp.dir", ".") + "/readdb-topN-temp-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

Job job = NutchJob.getInstance(config);
job.setJobName("topN prepare " + crawlDb);
Job job = Job.getInstance(config, "Nutch CrawlDbReader: topN prepare " + crawlDb);
FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
job.setInputFormatClass(SequenceFileInputFormat.class);

@@ -1000,8 +995,7 @@ public void processTopNJob(String crawlDb, long topN, float min,
}

LOG.info("CrawlDb topN: collecting topN scores.");
job = NutchJob.getInstance(config);
job.setJobName("topN collect " + crawlDb);
job = Job.getInstance(config, "Nutch CrawlDbReader: topN collect " + crawlDb);
job.getConfiguration().setLong("db.reader.topn", topN);

FileInputFormat.addInputPath(job, tempDir);
3 changes: 1 addition & 2 deletions src/java/org/apache/nutch/crawl/DeduplicationJob.java
@@ -305,9 +305,8 @@ public int run(String[] args) throws IOException {
Path tempDir = new Path(crawlDb, "dedup-temp-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

Job job = NutchJob.getInstance(getConf());
Job job = Job.getInstance(getConf(), "Nutch DeduplicationJob: " + crawlDb);
Configuration conf = job.getConfiguration();
job.setJobName("Deduplication on " + crawlDb);
conf.set(DEDUPLICATION_GROUP_MODE, group);
conf.set(DEDUPLICATION_COMPARE_ORDER, compareOrder);
job.setJarByClass(DeduplicationJob.class);
13 changes: 5 additions & 8 deletions src/java/org/apache/nutch/crawl/Generator.java
@@ -388,7 +388,7 @@ private JexlContext createContext(HostDatum datum) {
public void setup(Context context) throws IOException {
conf = context.getConfiguration();
mos = new MultipleOutputs<FloatWritable, SelectorEntry>(context);
Job job = Job.getInstance(conf);
Job job = Job.getInstance(conf, "Nutch Generator.SelectorReducer");
limit = conf.getLong(GENERATOR_TOP_N, Long.MAX_VALUE)
/ job.getNumReduceTasks();
maxNumSegments = conf.getInt(GENERATOR_MAX_NUM_SEGMENTS, 1);
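
The quota read in setup() splits the global topN budget evenly across reduce tasks, so the per-reducer limits sum to roughly the configured cap. A standalone sketch of that arithmetic (the "generate.topN" key is assumed to be what GENERATOR_TOP_N resolves to; the value is illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class TopNShareExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.setLong("generate.topN", 50000); // assumed GENERATOR_TOP_N key
    Job job = Job.getInstance(conf, "Nutch Generator.SelectorReducer");
    // Each reducer enforces an equal share of the global budget.
    long limit = conf.getLong("generate.topN", Long.MAX_VALUE)
        / job.getNumReduceTasks();
    System.out.println("per-reducer limit: " + limit);
  }
}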
@@ -695,7 +695,7 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
long curTime)
throws IOException, InterruptedException, ClassNotFoundException {

Job job = NutchJob.getInstance(getConf());
Job job = Job.getInstance(getConf(), "Nutch Generator: generate from " + dbDir);
Configuration conf = job.getConfiguration();
boolean filter = conf.getBoolean(GENERATOR_FILTER, true);
boolean normalise = conf.getBoolean(GENERATOR_NORMALISE, true);
@@ -839,8 +839,7 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
}

// map to inverted subset due for fetch, sort by score
Job job = NutchJob.getInstance(getConf());
job.setJobName("generate: select from " + dbDir);
Job job = Job.getInstance(getConf(), "Nutch Generator: generate from " + dbDir);
Configuration conf = job.getConfiguration();
if (numLists == -1) {
/* for politeness create exactly one partition per fetch task */
@@ -942,8 +941,7 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
Path tempDir2 = new Path(dbDir,
"generate-temp-" + java.util.UUID.randomUUID().toString());

job = NutchJob.getInstance(getConf());
job.setJobName("generate: updatedb " + dbDir);
job = Job.getInstance(getConf(), "Nutch Generator: updatedb " + dbDir);
job.getConfiguration().setLong(Nutch.GENERATE_TIME_KEY, generateTime);
for (Path segmpaths : generatedSegments) {
Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
@@ -1001,8 +999,7 @@ private Path partitionSegment(Path segmentsDir, Path inputDir, int numLists)

LOG.info("Generator: segment: " + segment);

Job job = NutchJob.getInstance(getConf());
job.setJobName("generate: partition " + segment);
Job job = Job.getInstance(getConf(), "Nutch Generator: partition segment " + segment);
Configuration conf = job.getConfiguration();
conf.setInt("partition.url.seed", RANDOM.nextInt());

2 changes: 1 addition & 1 deletion src/java/org/apache/nutch/crawl/Injector.java
@@ -404,7 +404,7 @@ public void inject(Path crawlDb, Path urlDir, boolean overwrite,
Path lock = CrawlDb.lock(conf, crawlDb, false);

// configure job
Job job = Job.getInstance(conf, "inject " + urlDir);
Job job = Job.getInstance(conf, "Nutch Injector: " + urlDir);
job.setJarByClass(Injector.class);
job.setMapperClass(InjectMapper.class);
job.setReducerClass(InjectReducer.class);
3 changes: 1 addition & 2 deletions src/java/org/apache/nutch/crawl/LinkDb.java
@@ -270,9 +270,8 @@ private static Job createJob(Configuration config, Path linkDb,
Path newLinkDb = new Path(linkDb,
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

Job job = NutchJob.getInstance(config);
Job job = Job.getInstance(config, "Nutch LinkDb: " + linkDb);
Configuration conf = job.getConfiguration();
job.setJobName("linkdb " + linkDb);

job.setInputFormatClass(SequenceFileInputFormat.class);

3 changes: 1 addition & 2 deletions src/java/org/apache/nutch/crawl/LinkDbMerger.java
@@ -147,8 +147,7 @@ public static Job createMergeJob(Configuration config, Path linkDb,
Path newLinkDb = new Path(linkDb,
"merge-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

Job job = NutchJob.getInstance(config);
job.setJobName("linkdb merge " + linkDb);
Job job = Job.getInstance(config, "Nutch LinkDbMerger: " + linkDb);

Configuration conf = job.getConfiguration();
job.setInputFormatClass(SequenceFileInputFormat.class);
3 changes: 1 addition & 2 deletions src/java/org/apache/nutch/crawl/LinkDbReader.java
@@ -159,8 +159,7 @@ public void processDumpJob(String linkdb, String output, String regex)

Path outFolder = new Path(output);

Job job = NutchJob.getInstance(getConf());
job.setJobName("read " + linkdb);
Job job = Job.getInstance(getConf(), "Nutch LinkDbReader: " + linkdb);
job.setJarByClass(LinkDbReader.class);

Configuration conf = job.getConfiguration();
2 changes: 1 addition & 1 deletion src/java/org/apache/nutch/fetcher/Fetcher.java
@@ -502,7 +502,7 @@ public void fetch(Path segment, int threads) throws IOException,
totalOutlinksToFollow);
}

Job job = NutchJob.getInstance(getConf());
Job job = Job.getInstance(getConf(), "Nutch Fetcher: " + segment.getName());
job.setJobName("FetchData");
Configuration conf = job.getConfiguration();

3 changes: 1 addition & 2 deletions src/java/org/apache/nutch/hostdb/ReadHostDb.java
@@ -181,8 +181,7 @@ private void readHostDb(Path hostDb, Path output, boolean dumpHomepages, boolean
conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
conf.set("mapreduce.output.textoutputformat.separator", "\t");

Job job = Job.getInstance(conf);
job.setJobName("ReadHostDb");
Job job = Job.getInstance(conf, "Nutch ReadHostDb");
job.setJarByClass(ReadHostDb.class);

FileInputFormat.addInputPath(job, new Path(hostDb, "current"));
3 changes: 1 addition & 2 deletions src/java/org/apache/nutch/hostdb/UpdateHostDb.java
@@ -77,11 +77,10 @@ private void updateHostDb(Path hostDb, Path crawlDb, Path topHosts,
stopWatch.start();
LOG.info("UpdateHostDb: starting");

Job job = NutchJob.getInstance(getConf());
Job job = Job.getInstance(getConf(), "Nutch UpdateHostDb");
Configuration conf = job.getConfiguration();
boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);
job.setJarByClass(UpdateHostDb.class);
job.setJobName("UpdateHostDb");

FileSystem fs = hostDb.getFileSystem(conf);
Path old = new Path(hostDb, "old");
4 changes: 1 addition & 3 deletions src/java/org/apache/nutch/indexer/CleaningJob.java
@@ -144,7 +144,7 @@ public void delete(String crawldb, boolean noCommit)
stopWatch.start();
LOG.info("CleaningJob: starting");

Job job = NutchJob.getInstance(getConf());
Job job = Job.getInstance(getConf(), "Nutch CleaningJob: " + crawldb);
Configuration conf = job.getConfiguration();

FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
@@ -157,8 +157,6 @@ public void delete(String crawldb, boolean noCommit)
job.setReducerClass(DeleterReducer.class);
job.setJarByClass(CleaningJob.class);

job.setJobName("CleaningJob");

// need to explicitly allow deletions
conf.setBoolean(IndexerMapReduce.INDEXER_DELETE, true);

3 changes: 2 additions & 1 deletion src/java/org/apache/nutch/indexer/IndexingJob.java
@@ -108,7 +108,8 @@ public void index(Path crawlDb, Path linkDb, List<Path> segments,
stopWatch.start();
LOG.info("Indexer: starting");

final Job job = NutchJob.getInstance(getConf());
final Job job = Job.getInstance(getConf(),
"Nutch IndexingJob: crawldb: " + crawlDb + " segment(s): " + segments);
job.setJobName("Indexer");
Configuration conf = job.getConfiguration();

14 changes: 12 additions & 2 deletions src/java/org/apache/nutch/parse/ParseSegment.java
@@ -180,6 +180,17 @@ public static boolean isTruncated(Content content) {
if (metadata == null)
return false;

// Check for okhttp's (or another protocol's) truncated-content flag.
// If the flag is present, trust its value over the Content-Length check.
if (metadata.get(Response.TRUNCATED_CONTENT) != null) {
if ("true".equals(metadata.get(Response.TRUNCATED_CONTENT))) {
LOG.info(content.getUrl() + " skipped. Protocol metadata indicates truncated content, " +
"actualSize= " + content.getContent().length);
return true;
}
return false;
}

String lengthStr = metadata.get(Response.CONTENT_LENGTH);
if (lengthStr != null)
lengthStr = lengthStr.trim();
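
Condensed, the decision order of isTruncated() after this change looks like the sketch below. The trailing Content-Length comparison is reconstructed from Nutch's pre-existing behaviour, which the diff view truncates, so treat that part as an assumption:

// Hedged sketch, not the verbatim method. Assumed imports:
// org.apache.nutch.protocol.Content, org.apache.nutch.metadata.Metadata,
// org.apache.nutch.net.protocols.Response.
public static boolean isTruncated(Content content) {
  Metadata metadata = content.getMetadata();
  if (metadata == null) {
    return false;
  }
  // 1. A protocol plugin (e.g. protocol-okhttp) may record explicitly whether
  //    it truncated the content; if the flag is present, trust its value.
  String flag = metadata.get(Response.TRUNCATED_CONTENT);
  if (flag != null) {
    return "true".equals(flag);
  }
  // 2. Otherwise fall back to comparing the declared Content-Length with the
  //    number of bytes actually fetched (assumed pre-existing logic).
  String lengthStr = metadata.get(Response.CONTENT_LENGTH);
  if (lengthStr == null) {
    return false;
  }
  try {
    return content.getContent().length < Integer.parseInt(lengthStr.trim());
  } catch (NumberFormatException e) {
    return false; // unparsable header: cannot tell, assume not truncated
  }
}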
@@ -232,8 +243,7 @@ public void parse(Path segment) throws IOException,
LOG.info("ParseSegment: starting");
LOG.info("ParseSegment: segment: {}", segment);

Job job = NutchJob.getInstance(getConf());
job.setJobName("parse " + segment);
Job job = Job.getInstance(getConf(), "Nutch ParseSegment: " + segment);

Configuration conf = job.getConfiguration();
FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
6 changes: 2 additions & 4 deletions src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
@@ -341,8 +341,7 @@ public void dumpLinks(Path webGraphDb) throws IOException,
// run the inverter job
Path tempInverted = new Path(webGraphDb, "inverted-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
Job inverter = NutchJob.getInstance(conf);
inverter.setJobName("LinkDumper: inverter");
Job inverter = Job.getInstance(conf, "Nutch LinkDumper: invert " + webGraphDb);
FileInputFormat.addInputPath(inverter, nodeDb);
FileInputFormat.addInputPath(inverter, outlinkDb);
inverter.setInputFormatClass(SequenceFileInputFormat.class);
@@ -372,8 +371,7 @@ }
}

// run the merger job
Job merger = NutchJob.getInstance(conf);
merger.setJobName("LinkDumper: merger");
Job merger = Job.getInstance(conf, "Nutch LinkDumper: merge " + tempInverted);
FileInputFormat.addInputPath(merger, tempInverted);
merger.setJarByClass(Merger.class);
merger.setInputFormatClass(SequenceFileInputFormat.class);
(diff for the remaining changed files not shown)