diff --git a/.github/workflows/dependency-check.yml b/.github/workflows/dependency-check.yml new file mode 100644 index 0000000000..f07f746a0d --- /dev/null +++ b/.github/workflows/dependency-check.yml @@ -0,0 +1,37 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: dependency check + +on: + schedule: + - cron: '0 0 * * *' # every day at midnight + +jobs: + dependency-check: + strategy: + matrix: + java: ['11'] + os: [ubuntu-latest] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: Set up JDK ${{ matrix.java }} + uses: actions/setup-java@v3 + with: + java-version: ${{ matrix.java }} + distribution: 'temurin' + - name: Dependency check + run: ant clean dependency-check -buildfile build.xml diff --git a/.github/workflows/master-build.yml b/.github/workflows/master-build.yml index e3ed11c869..e0af58df06 100644 --- a/.github/workflows/master-build.yml +++ b/.github/workflows/master-build.yml @@ -1,4 +1,3 @@ -# # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. @@ -13,29 +12,67 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
-# -name: master pr build +name: master pull request ci on: push: - branches: [ master ] + branches: [master] pull_request: - branches: [ master ] - + types: [opened, synchronize, reopened] + branches: [master] jobs: - build: - runs-on: ubuntu-latest + javadoc: strategy: matrix: - java: [ '11' ] - + java: ['11'] + os: [ubuntu-latest] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: Set up JDK ${{ matrix.java }} + uses: actions/setup-java@v3 + with: + java-version: ${{ matrix.java }} + distribution: 'temurin' + - name: Javadoc + run: ant clean javadoc -buildfile build.xml + rat: + strategy: + matrix: + java: ['11'] + os: [ubuntu-latest] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: Set up JDK ${{ matrix.java }} + uses: actions/setup-java@v3 + with: + java-version: ${{ matrix.java }} + distribution: 'temurin' + - name: Run Apache Rat + run: ant clean run-rat -buildfile build.xml + - name: Cache unknown licenses + run: echo "UNKNOWN_LICENSES=$(sed -n 18p /home/runner/work/nutch/nutch/build/apache-rat-report.txt)" >> $GITHUB_ENV + - name: Versions + run: | + echo $UNKNOWN_LICENSES + - name: Fail if any unknown licenses + if: ${{ env.UNKNOWN_LICENSES != '0 Unknown Licenses' }} + run: exit 1 + test: + strategy: + matrix: + java: ['11'] + os: [ubuntu-latest, macos-latest] + runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up JDK ${{ matrix.java }} - uses: actions/setup-java@v1 + uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} - - name: Build with Ant - run: ant clean nightly javadoc -buildfile build.xml + distribution: 'temurin' + - name: Test + run: ant clean test -buildfile build.xml diff --git a/.gitignore b/.gitignore index 0612a99c23..12365dd0d4 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,5 @@ naivebayes-model csvindexwriter lib/spotbugs-* ivy/dependency-check-ant/* +.gradle* +ivy/apache-rat-* diff --git a/LICENSE-binary b/LICENSE-binary index 8e24a728e2..b317945bc6 100644 --- a/LICENSE-binary +++ b/LICENSE-binary @@ -208,7 +208,6 @@ This product bundles some components that are also licensed under the Apache License Version 2.0: -ch.qos.reload4j:reload4j com.101tec:zkclient com.amazonaws:aws-java-sdk-cloudsearch com.amazonaws:aws-java-sdk-core @@ -327,11 +326,6 @@ net.sourceforge.owlapi:owlapi-impl net.sourceforge.owlapi:owlapi-parsers net.sourceforge.owlapi:owlapi-rio net.sourceforge.owlapi:owlapi-tools -org.apache.any23:apache-any23-api -org.apache.any23:apache-any23-core -org.apache.any23:apache-any23-csvutils -org.apache.any23:apache-any23-encoding -org.apache.any23:apache-any23-mime org.apache.avro:avro org.apache.commons:commons-collections4 org.apache.commons:commons-compress @@ -758,7 +752,6 @@ org.jsoup:jsoup org.rypt:f8 org.slf4j:jcl-over-slf4j org.slf4j:slf4j-api -org.slf4j:slf4j-reload4j Mozilla Public License 1.1 (MPL 1.1) diff --git a/NOTICE-binary b/NOTICE-binary index 1aab2cb411..5f1ff44bab 100644 --- a/NOTICE-binary +++ b/NOTICE-binary @@ -29,7 +29,7 @@ code and source code. The following provides more details on the included cryptographic software: -The plugins parse-tika and any23 use Apache Tika and the Bouncy Castle +The parse-tika plugin uses Apache Tika and the Bouncy Castle generic encryption libraries for extracting text content and metadata from encrypted PDF files. See for more details on Bouncy Castle and for details @@ -46,9 +46,6 @@ on Apache Tika. 
Apache projects --------------- -Apache Any23 (https://any23.apache.org/) - see https://github.com/apache/any23/blob/master/NOTICE.txt - Apache Avro (https://avro.apache.org) see https://github.com/apache/avro/blob/master/NOTICE.txt @@ -163,10 +160,6 @@ AOP alliance (http://aopalliance.sourceforge.net) - license: Public Domain (licenses-binary/LICENSE-public-domain.txt) -# ch.qos.reload4j:reload4j -reload4j (https://reload4j.qos.ch) -- license: The Apache Software License, Version 2.0 - # com.101tec:zkclient ZkClient (https://github.com/sgroschupf/zkclient) - license: The Apache Software License, Version 2.0 @@ -1100,10 +1093,6 @@ JCL 1.2 implemented over SLF4J (http://www.slf4j.org) (licenses-binary/LICENSE-mit-license.txt) # org.slf4j:slf4j-api SLF4J API Module (http://www.slf4j.org) -- license: MIT License - (licenses-binary/LICENSE-mit-license.txt) -# org.slf4j:slf4j-reload4j -SLF4J Reload4j Binding (http://reload4j.qos.ch) - license: MIT License (licenses-binary/LICENSE-mit-license.txt) diff --git a/NOTICE.txt b/NOTICE.txt index 939ddc8031..4fdd968ab0 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -29,7 +29,7 @@ code and source code. The following provides more details on the included cryptographic software: -The plugins parse-tika and any23 use Apache Tika and the Bouncy Castle +The parse-tika plugin uses Apache Tika and the Bouncy Castle generic encryption libraries for extracting text content and metadata from encrypted PDF files. See for more details on Bouncy Castle and for details diff --git a/build.xml b/build.xml index f2a1989fc2..ea3031c8fe 100644 --- a/build.xml +++ b/build.xml @@ -38,7 +38,7 @@ - + @@ -48,7 +48,7 @@ - + @@ -202,7 +202,6 @@ - @@ -642,13 +641,15 @@ - + + reportformat="ALL" + assemblyAnalyzerEnabled="false" + failBuildOnCVSS="1"> @@ -688,7 +689,6 @@ - @@ -774,7 +774,6 @@ - @@ -1030,7 +1029,7 @@ - - + @@ -1052,8 +1051,40 @@ - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1182,8 +1213,6 @@ - - diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index e6d4ab6791..6543737cf0 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -141,8 +141,9 @@ http.robots.503.defer.visits true Temporarily suspend fetching from a host if the - robots.txt response is HTTP 503 or any other 5xx server error. See - also http.robots.503.defer.visits.delay and + robots.txt response is HTTP 503 or any other 5xx server error + and HTTP 429 Too Many Requests. See also + http.robots.503.defer.visits.delay and http.robots.503.defer.visits.retries @@ -150,7 +151,7 @@ http.robots.503.defer.visits.delay 300000 Time in milliseconds to suspend crawling a host if the - robots.txt response is HTTP 5xx - see + robots.txt response is HTTP 5xx or 429 Too Many Requests - see http.robots.503.defer.visits. @@ -158,8 +159,18 @@ http.robots.503.defer.visits.retries 3 Number of retries crawling a host if the robots.txt - response is HTTP 5xx - see http.robots.503.defer.visits. After n - retries the host queue is dropped for this segment/cycle. + response is HTTP 5xx or 429 - see http.robots.503.defer.visits. + After n retries the host queue is dropped for this segment/cycle. + + + + + http.robots.redirect.max + 5 + Maximum number of redirects followed when fetching + a robots.txt file. RFC 9309 specifies that "crawlers SHOULD + follow at least five consecutive redirects, even across authorities + (for example, hosts in the case of HTTP)." 
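The new http.robots.redirect.max property above describes a redirect cap on the robots.txt fetch. As an illustrative, standalone sketch of that behavior only (plain java.net, not the Nutch protocol plugins; the class name RobotsRedirectSketch and the method fetchRobotsTxt are invented for this example):

    import java.io.IOException;
    import java.io.InputStream;
    import java.net.HttpURLConnection;
    import java.net.URL;

    public class RobotsRedirectSketch {

      /** Fetches a robots.txt URL, following at most maxRedirects redirects. */
      static InputStream fetchRobotsTxt(URL robotsUrl, int maxRedirects)
          throws IOException {
        URL current = robotsUrl;
        for (int hop = 0; hop <= maxRedirects; hop++) {
          HttpURLConnection conn = (HttpURLConnection) current.openConnection();
          // Disable automatic redirect handling so that every hop, including
          // hops across authorities (hosts), is counted against the limit.
          conn.setInstanceFollowRedirects(false);
          int status = conn.getResponseCode();
          if (status >= 300 && status < 400) {
            String location = conn.getHeaderField("Location");
            if (location == null) {
              throw new IOException("Redirect " + status + " without Location");
            }
            current = new URL(current, location); // resolves relative redirects
            continue;
          }
          if (status / 100 == 2) {
            return conn.getInputStream();
          }
          throw new IOException("robots.txt fetch failed with HTTP " + status);
        }
        throw new IOException(
            "robots.txt not reached within " + maxRedirects + " redirects");
      }
    }

With the default of 5, the loop performs the initial fetch plus up to five redirect hops, matching the RFC 9309 recommendation quoted in the property description.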
@@ -1445,22 +1456,6 @@ - - - - - - any23.extractors - html-microdata - Comma-separated list of Any23 extractors (a list of extractors is available here: http://any23.apache.org/getting-started.html) - - - - any23.content_types - text/html,application/xhtml+xml - Comma-separated list of content-types onto which Any23 extractors should be applied (see http://www.iana.org/assignments/media-types/). If empty, all content-types are supported. - - diff --git a/default.properties b/default.properties index f314d7f85e..15447c354d 100644 --- a/default.properties +++ b/default.properties @@ -44,7 +44,7 @@ test.junit.output.format = plain javadoc.proxy.host=-J-DproxyHost= javadoc.proxy.port=-J-DproxyPort= javadoc.link.java=https://docs.oracle.com/en/java/javase/11/docs/api/ -javadoc.link.hadoop=https://hadoop.apache.org/docs/r3.3.4/api/ +javadoc.link.hadoop=https://hadoop.apache.org/docs/r3.3.6/api/ javadoc.packages=org.apache.nutch.* dist.dir=./dist @@ -219,6 +219,4 @@ plugins.misc=\ org.apache.nutch.collection*:\ org.apache.nutch.analysis.lang*:\ org.creativecommons.nutch*:\ - org.apache.nutch.microformats.reltag*:\ - org.apache.nutch.any23* - + org.apache.nutch.microformats.reltag*: diff --git a/ivy/dependency-check-ant/dependency-check-suppressions.xml b/ivy/dependency-check-ant/dependency-check-suppressions.xml index e7de8febb2..a7f4ca16df 100644 --- a/ivy/dependency-check-ant/dependency-check-suppressions.xml +++ b/ivy/dependency-check-ant/dependency-check-suppressions.xml @@ -1,8 +1,3 @@ - - only applies to tika-server < 1.18 - ^org\.(apache\.tika:tika-(core|parsers)|gagravarr:vorbis-java-tika):.*$ - CVE-2018-1335 - diff --git a/ivy/ivy.xml b/ivy/ivy.xml index f4ad209b07..71464ed25c 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -36,35 +36,53 @@ - - - - + + + + - + - - + + + + - - - - - + + + + + + + + + + + + + + + + - + + - - + the charset in text resp. HTML documents. 
--> + + + + - + @@ -85,10 +103,10 @@ - - - - + + + + @@ -114,7 +132,10 @@ - + + + + diff --git a/ivy/ivysettings.xml b/ivy/ivysettings.xml index 2f1adedcf9..ee04195884 100644 --- a/ivy/ivysettings.xml +++ b/ivy/ivysettings.xml @@ -35,7 +35,14 @@ + + + + + values, Context context) (injected.getFetchInterval() + old.getFetchInterval()) / 2); } } - if (injectedSet && oldSet) { - context.getCounter("injector", "urls_merged").increment(1); + if (injectedSet) { + context.getCounter("injector", "urls_injected_unique").increment(1); + if (oldSet) { + context.getCounter("injector", "urls_merged").increment(1); + } } context.write(key, result); } @@ -369,10 +372,11 @@ public void inject(Path crawlDb, Path urlDir, boolean overwrite, boolean update, boolean normalize, boolean filter, boolean filterNormalizeAll) throws IOException, ClassNotFoundException, InterruptedException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("Injector: starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + + LOG.info("Injector: starting"); LOG.info("Injector: crawlDb: {}", crawlDb); LOG.info("Injector: urlDir: {}", urlDir); LOG.info("Injector: Converting injected urls to crawl db entries."); @@ -448,22 +452,24 @@ public void inject(Path crawlDb, Path urlDir, boolean overwrite, if (LOG.isInfoEnabled()) { long urlsInjected = job.getCounters() .findCounter("injector", "urls_injected").getValue(); + long urlsInjectedUniq = job.getCounters() + .findCounter("injector", "urls_injected_unique").getValue(); long urlsFiltered = job.getCounters() .findCounter("injector", "urls_filtered").getValue(); long urlsMerged = job.getCounters() .findCounter("injector", "urls_merged").getValue(); - long urlsPurged404= job.getCounters() + long urlsPurged404 = job.getCounters() .findCounter("injector", "urls_purged_404").getValue(); - long urlsPurgedFilter= job.getCounters() + long urlsPurgedFilter = job.getCounters() .findCounter("injector", "urls_purged_filter").getValue(); - LOG.info("Injector: Total urls rejected by filters: " + urlsFiltered); + LOG.info("Injector: Total urls rejected by filters: {}", urlsFiltered); LOG.info( - "Injector: Total urls injected after normalization and filtering: " - + urlsInjected); - LOG.info("Injector: Total urls injected but already in CrawlDb: " - + urlsMerged); - LOG.info("Injector: Total new urls injected: " - + (urlsInjected - urlsMerged)); + "Injector: Total urls injected after normalization and filtering: {} (unique URLs: {})", + urlsInjected, urlsInjectedUniq); + LOG.info("Injector: Total urls injected but already in CrawlDb: {}", + urlsMerged); + LOG.info("Injector: Total new urls injected: {}", + (urlsInjectedUniq - urlsMerged)); if (filterNormalizeAll) { LOG.info("Injector: Total urls removed from CrawlDb by filters: {}", urlsPurgedFilter); @@ -474,9 +480,8 @@ public void inject(Path crawlDb, Path urlDir, boolean overwrite, urlsPurged404); } - long end = System.currentTimeMillis(); - LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("Injector: finished, elapsed: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS)); } } catch (IOException | InterruptedException | ClassNotFoundException | NullPointerException e) { LOG.error("Injector job failed: {}", e.getMessage()); diff --git a/src/java/org/apache/nutch/crawl/LinkDb.java b/src/java/org/apache/nutch/crawl/LinkDb.java index 
2b3d2ed907..3c752ab1db 100644 --- a/src/java/org/apache/nutch/crawl/LinkDb.java +++ b/src/java/org/apache/nutch/crawl/LinkDb.java @@ -21,13 +21,14 @@ import java.lang.invoke.MethodHandles; import java.net.MalformedURLException; import java.net.URL; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.Map; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -54,7 +55,6 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; -import org.apache.nutch.util.TimingUtil; /** Maintains an inverted link map, listing incoming links for each url. */ public class LinkDb extends NutchTool implements Tool { @@ -196,9 +196,9 @@ public void invert(Path linkDb, Path[] segments, boolean normalize, Path currentLinkDb = new Path(linkDb, CURRENT_NAME); Configuration conf = job.getConfiguration(); - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("LinkDb: starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("LinkDb: starting"); LOG.info("LinkDb: linkdb: {}", linkDb); LOG.info("LinkDb: URL normalize: {}", normalize); LOG.info("LinkDb: URL filter: {}", filter); @@ -260,8 +260,9 @@ public void invert(Path linkDb, Path[] segments, boolean normalize, } LinkDb.install(job, linkDb); - long end = System.currentTimeMillis(); - LOG.info("LinkDb: finished at {}, elapsed: {}", sdf.format(end), TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("LinkDb: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } private static Job createJob(Configuration config, Path linkDb, diff --git a/src/java/org/apache/nutch/crawl/LinkDbMerger.java b/src/java/org/apache/nutch/crawl/LinkDbMerger.java index f696c599e8..d6a41ab48c 100644 --- a/src/java/org/apache/nutch/crawl/LinkDbMerger.java +++ b/src/java/org/apache/nutch/crawl/LinkDbMerger.java @@ -18,11 +18,12 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Iterator; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -41,7 +42,6 @@ import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; /** * This tool merges several LinkDb-s into one, optionally filtering URLs through @@ -112,9 +112,9 @@ public void reduce(Text key, Iterable values, Context context) public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("LinkDb merge: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("LinkDb merge: starting"); Job job = createMergeJob(getConf(), output, normalize, filter); for (int i = 0; i < dbs.length; i++) { @@ -137,9 +137,9 @@ public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) 
fs.rename(FileOutputFormat.getOutputPath(job), new Path(output, LinkDb.CURRENT_NAME)); - long end = System.currentTimeMillis(); - LOG.info("LinkDb merge: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("LinkDb merge: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static Job createMergeJob(Configuration config, Path linkDb, diff --git a/src/java/org/apache/nutch/crawl/LinkDbReader.java b/src/java/org/apache/nutch/crawl/LinkDbReader.java index c307b985d5..fa01f20bf3 100644 --- a/src/java/org/apache/nutch/crawl/LinkDbReader.java +++ b/src/java/org/apache/nutch/crawl/LinkDbReader.java @@ -16,13 +16,15 @@ */ package org.apache.nutch.crawl; +import java.io.Closeable; import java.io.IOException; - import java.lang.invoke.MethodHandles; +import java.util.concurrent.TimeUnit; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.Iterator; -// Commons Logging imports +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -46,11 +48,8 @@ import org.apache.nutch.util.AbstractChecker; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; -import java.text.SimpleDateFormat; -import java.util.Iterator; -import java.io.Closeable; + /** * Read utility for the LinkDb. @@ -153,10 +152,9 @@ public void map(Text key, Inlinks value, Context context) public void processDumpJob(String linkdb, String output, String regex) throws IOException, InterruptedException, ClassNotFoundException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - - LOG.info("LinkDb dump: starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("LinkDb dump: starting"); LOG.info("LinkDb dump: db: {}", linkdb); Path outFolder = new Path(output); @@ -192,9 +190,9 @@ public void processDumpJob(String linkdb, String output, String regex) throw e; } - long end = System.currentTimeMillis(); - LOG.info("LinkDb dump: finished at {}, elapsed: {}", - sdf.format(end), TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("LinkDb dump: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } @Override diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java b/src/java/org/apache/nutch/fetcher/Fetcher.java index 12727db4d5..23fd30d731 100644 --- a/src/java/org/apache/nutch/fetcher/Fetcher.java +++ b/src/java/org/apache/nutch/fetcher/Fetcher.java @@ -25,9 +25,11 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; @@ -458,11 +460,10 @@ public void fetch(Path segment, int threads) throws IOException, checkConfiguration(); - long start = System.currentTimeMillis(); - if (LOG.isInfoEnabled()) { - LOG.info("Fetcher: starting at {}", TimingUtil.logDateMillis(start)); - LOG.info("Fetcher: segment: {}", segment); - } + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("Fetcher: starting"); + LOG.info("Fetcher: segment: {}", segment); // set the actual time for the timelimit relative // to the beginning of the
whole job and not of a specific task @@ -534,9 +535,9 @@ public void fetch(Path segment, int threads) throws IOException, throw e; } - long end = System.currentTimeMillis(); - LOG.info("Fetcher: finished at {}, elapsed: {}", - TimingUtil.logDateMillis(end), TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("Fetcher: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } /** @@ -602,20 +603,11 @@ public Map run(Map args, String crawlId) throws Path segment = null; if(args.containsKey(Nutch.ARG_SEGMENTS)) { Object seg = args.get(Nutch.ARG_SEGMENTS); - if(seg instanceof Path) { + if (seg instanceof Path) { segment = (Path) seg; - } - else if(seg instanceof String){ + } else if (seg instanceof String) { segment = new Path(seg.toString()); } - else if(seg instanceof ArrayList) { - String[] segmentsArray = (String[])seg; - segment = new Path(segmentsArray[0].toString()); - - if(segmentsArray.length > 1){ - LOG.warn("Only the first segment of segments array is used."); - } - } } else { String segmentDir = crawlId+"/segments"; diff --git a/src/java/org/apache/nutch/hostdb/ReadHostDb.java b/src/java/org/apache/nutch/hostdb/ReadHostDb.java index ffddb18898..0321a8652c 100644 --- a/src/java/org/apache/nutch/hostdb/ReadHostDb.java +++ b/src/java/org/apache/nutch/hostdb/ReadHostDb.java @@ -18,9 +18,10 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.Map; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -42,7 +43,6 @@ import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.SegmentReaderUtil; import org.apache.commons.jexl3.JexlBuilder; @@ -168,9 +168,9 @@ public void map(Text key, HostDatum datum, Context context) throws IOException, // } private void readHostDb(Path hostDb, Path output, boolean dumpHomepages, boolean dumpHostnames, String expr) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("ReadHostDb: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("ReadHostDb: starting"); Configuration conf = getConf(); conf.setBoolean(HOSTDB_DUMP_HOMEPAGES, dumpHomepages); @@ -211,8 +211,9 @@ private void readHostDb(Path hostDb, Path output, boolean dumpHomepages, boolean throw e; } - long end = System.currentTimeMillis(); - LOG.info("ReadHostDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("ReadHostDb: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } private void getHostDbRecord(Path hostDb, String host) throws Exception { diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDb.java b/src/java/org/apache/nutch/hostdb/UpdateHostDb.java index ffa68d0963..65e45c55d8 100644 --- a/src/java/org/apache/nutch/hostdb/UpdateHostDb.java +++ b/src/java/org/apache/nutch/hostdb/UpdateHostDb.java @@ -17,9 +17,10 @@ package org.apache.nutch.hostdb; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; 
import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; @@ -40,7 +41,6 @@ import org.apache.nutch.util.LockUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -73,9 +73,9 @@ private void updateHostDb(Path hostDb, Path crawlDb, Path topHosts, boolean checkFailed, boolean checkNew, boolean checkKnown, boolean force, boolean filter, boolean normalize) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("UpdateHostDb: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("UpdateHostDb: starting"); Job job = NutchJob.getInstance(getConf()); Configuration conf = job.getConfiguration(); @@ -149,9 +149,9 @@ private void updateHostDb(Path hostDb, Path crawlDb, Path topHosts, } LockUtil.removeLockFile(fs, lock); - long end = System.currentTimeMillis(); - LOG.info("UpdateHostDb: finished at " + sdf.format(end) + - ", elapsed: " + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("UpdateHostDb: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void main(String args[]) throws Exception { diff --git a/src/java/org/apache/nutch/indexer/CleaningJob.java b/src/java/org/apache/nutch/indexer/CleaningJob.java index dc3ed69e4a..04b9c2efa5 100644 --- a/src/java/org/apache/nutch/indexer/CleaningJob.java +++ b/src/java/org/apache/nutch/indexer/CleaningJob.java @@ -18,7 +18,9 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; +import java.util.concurrent.TimeUnit; + +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.ByteWritable; @@ -36,7 +38,6 @@ import org.apache.nutch.crawl.CrawlDb; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -139,9 +140,9 @@ public void reduce(ByteWritable key, Iterable values, public void delete(String crawldb, boolean noCommit) throws IOException, InterruptedException, ClassNotFoundException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("CleaningJob: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("CleaningJob: starting"); Job job = NutchJob.getInstance(getConf()); Configuration conf = job.getConfiguration(); @@ -173,9 +174,8 @@ public void delete(String crawldb, boolean noCommit) throw e; } - long end = System.currentTimeMillis(); - LOG.info("CleaningJob: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("CleaningJob: finished, elapsed: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS)); } @Override diff --git a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java index 3aa7a05cba..1931c360d8 100644 --- a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java +++ b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java @@ -93,7 +93,7 @@ public int run(String[] args) throws Exception { // Print help when no args given if (args.length < 1) { 
System.err.println(usage); - System.exit(-1); + return -1; } // read property "doIndex" for backward compatibility @@ -126,7 +126,7 @@ public int run(String[] args) throws Exception { } else if (i != args.length - 1) { System.err.println("ERR: Not a recognized argument: " + args[i]); System.err.println(usage); - System.exit(-1); + return -1; } else { url = args[i]; } diff --git a/src/java/org/apache/nutch/indexer/IndexingJob.java b/src/java/org/apache/nutch/indexer/IndexingJob.java index ff46bc0eff..d2115230c8 100644 --- a/src/java/org/apache/nutch/indexer/IndexingJob.java +++ b/src/java/org/apache/nutch/indexer/IndexingJob.java @@ -19,7 +19,6 @@ import java.io.File; import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; @@ -27,7 +26,9 @@ import java.util.Locale; import java.util.Map; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.apache.nutch.metadata.Nutch; import org.apache.nutch.segment.SegmentChecker; import org.apache.hadoop.conf.Configuration; @@ -44,7 +45,6 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; -import org.apache.nutch.util.TimingUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -104,9 +104,9 @@ public void index(Path crawlDb, Path linkDb, List segments, boolean filter, boolean normalize, boolean addBinaryContent, boolean base64) throws IOException, InterruptedException, ClassNotFoundException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("Indexer: starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("Indexer: starting"); final Job job = NutchJob.getInstance(getConf()); job.setJobName("Indexer"); @@ -159,9 +159,9 @@ public void index(Path crawlDb, Path linkDb, List segments, String.format(Locale.ROOT, "%6d", counter.getValue()), counter.getName()); } - long end = System.currentTimeMillis(); - LOG.info("Indexer: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("Indexer: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } finally { tmp.getFileSystem(conf).delete(tmp, true); } diff --git a/src/plugin/any23/src/java/org/apache/nutch/any23/package-info.java b/src/java/org/apache/nutch/metadata/CaseInsensitiveMetadata.java similarity index 69% rename from src/plugin/any23/src/java/org/apache/nutch/any23/package-info.java rename to src/java/org/apache/nutch/metadata/CaseInsensitiveMetadata.java index 47010768c6..92e848ca2d 100644 --- a/src/plugin/any23/src/java/org/apache/nutch/any23/package-info.java +++ b/src/java/org/apache/nutch/metadata/CaseInsensitiveMetadata.java @@ -14,11 +14,20 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +package org.apache.nutch.metadata; + +import java.util.TreeMap; /** - * This packages uses the Apache Any23 library - * for parsing and extracting structured data in RDF format from a - * variety of Web documents. The supported formats can be found - * at Apache Any23. + * A decorator to Metadata that allows for case-insensitive lookup of keys.
*/ -package org.apache.nutch.any23; +public class CaseInsensitiveMetadata extends Metadata { + + /** + * Constructs a new, empty metadata. + */ + public CaseInsensitiveMetadata() { + metadata = new TreeMap<>(String.CASE_INSENSITIVE_ORDER); + } + +} diff --git a/src/java/org/apache/nutch/metadata/Metadata.java b/src/java/org/apache/nutch/metadata/Metadata.java index 5c37911fb9..7fa0bb12ce 100644 --- a/src/java/org/apache/nutch/metadata/Metadata.java +++ b/src/java/org/apache/nutch/metadata/Metadata.java @@ -36,7 +36,7 @@ public class Metadata implements Writable, CreativeCommons, DublinCore, /** * A map of all metadata attributes. */ - private Map metadata = null; + protected Map metadata = null; /** * Constructs a new, empty metadata. @@ -66,7 +66,7 @@ public String[] names() { } /** - * Get the value associated to a metadata name. If many values are assiociated + * Get the value associated to a metadata name. If many values are associated * to the specified name, then the first one is returned. * * @param name diff --git a/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java b/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java index fdbf1b62c8..be161440e2 100644 --- a/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java +++ b/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java @@ -25,7 +25,7 @@ /** * A decorator to Metadata that adds spellchecking capabilities to property - * names. Currently used spelling vocabulary contains just the httpheaders from + * names. Currently used spelling vocabulary contains just the HTTP headers from * {@link HttpHeaders} class. * */ @@ -94,7 +94,7 @@ private static String normalize(final String str) { /** * Get the normalized name of metadata attribute name. This method tries to * find a well-known metadata name (one of the metadata names defined in this - * class) that matches the specified name. The matching is error tolerent. For + * class) that matches the specified name. The matching is error tolerant. For * instance, *
* <ul>
* <li>content-type gives Content-Type</li>
  • @@ -105,8 +105,8 @@ private static String normalize(final String str) { * name is returned. * * @param name - * Name to normalize - * @return normalized name + * HTTP header name to normalize + * @return normalized HTTP header name */ public static String getNormalizedName(final String name) { String searched = normalize(name); diff --git a/src/java/org/apache/nutch/net/URLFilterChecker.java b/src/java/org/apache/nutch/net/URLFilterChecker.java index 7916cc5794..821f2e9267 100644 --- a/src/java/org/apache/nutch/net/URLFilterChecker.java +++ b/src/java/org/apache/nutch/net/URLFilterChecker.java @@ -41,7 +41,7 @@ public int run(String[] args) throws Exception { // Print help when no args given if (args.length < 1) { System.err.println(usage); - System.exit(-1); + return -1; } int numConsumed; @@ -53,7 +53,7 @@ public int run(String[] args) throws Exception { } else { System.err.println("ERROR: Not a recognized argument: " + args[i]); System.err.println(usage); - System.exit(-1); + return -1; } } diff --git a/src/java/org/apache/nutch/net/URLNormalizerChecker.java b/src/java/org/apache/nutch/net/URLNormalizerChecker.java index 586c7b2460..46fdd38cfb 100644 --- a/src/java/org/apache/nutch/net/URLNormalizerChecker.java +++ b/src/java/org/apache/nutch/net/URLNormalizerChecker.java @@ -44,7 +44,7 @@ public int run(String[] args) throws Exception { // Print help when no args given if (args.length < 1) { System.err.println(usage); - System.exit(-1); + return -1; } int numConsumed; @@ -58,7 +58,7 @@ public int run(String[] args) throws Exception { } else { System.err.println("ERROR: Not a recognized argument: " + args[i]); System.err.println(usage); - System.exit(-1); + return -1; } } diff --git a/src/java/org/apache/nutch/net/protocols/Response.java b/src/java/org/apache/nutch/net/protocols/Response.java index 0159358ec0..514ce85613 100644 --- a/src/java/org/apache/nutch/net/protocols/Response.java +++ b/src/java/org/apache/nutch/net/protocols/Response.java @@ -86,7 +86,7 @@ public static enum TruncatedContentReason { /** * Get the value of a named header. - * @param name key of the header you wish to retreive + * @param name key of the header you wish to retrieve * @return header value */ public String getHeader(String name); diff --git a/src/java/org/apache/nutch/parse/ParseSegment.java b/src/java/org/apache/nutch/parse/ParseSegment.java index 7e4707d399..de45c463b9 100644 --- a/src/java/org/apache/nutch/parse/ParseSegment.java +++ b/src/java/org/apache/nutch/parse/ParseSegment.java @@ -16,6 +16,7 @@ */ package org.apache.nutch.parse; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.nutch.crawl.CrawlDatum; @@ -25,7 +26,6 @@ import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; import org.apache.nutch.util.StringUtil; -import org.apache.nutch.util.TimingUtil; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; @@ -50,13 +50,12 @@ import java.io.File; import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; -import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Map.Entry; +import java.util.concurrent.TimeUnit; /* Parse content in a segment. 
*/ public class ParseSegment extends NutchTool implements Tool { @@ -228,12 +227,10 @@ public void parse(Path segment) throws IOException, return; } - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - if (LOG.isInfoEnabled()) { - LOG.info("ParseSegment: starting at {}", sdf.format(start)); - LOG.info("ParseSegment: segment: {}", segment); - } + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("ParseSegment: starting"); + LOG.info("ParseSegment: segment: {}", segment); Job job = NutchJob.getInstance(getConf()); job.setJobName("parse " + segment); @@ -263,9 +260,9 @@ public void parse(Path segment) throws IOException, throw e; } - long end = System.currentTimeMillis(); - LOG.info("ParseSegment: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("ParseSegment: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void main(String[] args) throws Exception { @@ -312,20 +309,11 @@ public Map run(Map args, String crawlId) throws Path segment = null; if(args.containsKey(Nutch.ARG_SEGMENTS)) { Object seg = args.get(Nutch.ARG_SEGMENTS); - if(seg instanceof Path) { + if (seg instanceof Path) { segment = (Path) seg; - } - else if(seg instanceof String){ + } else if (seg instanceof String) { segment = new Path(seg.toString()); } - else if(seg instanceof ArrayList) { - String[] segmentsArray = (String[])seg; - segment = new Path(segmentsArray[0].toString()); - - if(segmentsArray.length > 1){ - LOG.warn("Only the first segment of segments array is used."); - } - } } else { String segment_dir = crawlId+"/segments"; diff --git a/src/java/org/apache/nutch/parse/ParserChecker.java b/src/java/org/apache/nutch/parse/ParserChecker.java index 1533ab57cc..10eec4b244 100644 --- a/src/java/org/apache/nutch/parse/ParserChecker.java +++ b/src/java/org/apache/nutch/parse/ParserChecker.java @@ -104,7 +104,7 @@ public int run(String[] args) throws Exception { // Print help when no args given if (args.length < 1) { System.err.println(usage); - System.exit(-1); + return -1; } // initialize plugins early to register URL stream handlers to support @@ -138,7 +138,7 @@ public int run(String[] args) throws Exception { } else if (i != args.length - 1) { System.err.println("ERR: Not a recognized argument: " + args[i]); System.err.println(usage); - System.exit(-1); + return -1; } else { url = args[i]; } diff --git a/src/java/org/apache/nutch/plugin/Plugin.java b/src/java/org/apache/nutch/plugin/Plugin.java index b2e717d20e..3a0fb2e915 100644 --- a/src/java/org/apache/nutch/plugin/Plugin.java +++ b/src/java/org/apache/nutch/plugin/Plugin.java @@ -90,9 +90,7 @@ private void setDescriptor(PluginDescriptor descriptor) { } @Override - @SuppressWarnings("deprecation") protected void finalize() throws Throwable { - super.finalize(); shutDown(); } } diff --git a/src/java/org/apache/nutch/protocol/RobotRulesParser.java b/src/java/org/apache/nutch/protocol/RobotRulesParser.java index 562c2c694f..d73c075060 100644 --- a/src/java/org/apache/nutch/protocol/RobotRulesParser.java +++ b/src/java/org/apache/nutch/protocol/RobotRulesParser.java @@ -98,6 +98,7 @@ public abstract class RobotRulesParser implements Tool { protected Configuration conf; protected Set agentNames; + protected int maxNumRedirects = 5; /** set of host names or IPs to be explicitly excluded from robots.txt checking */ protected Set allowList = new HashSet<>(); @@ -149,6 +150,10 @@ public 
void setConf(Configuration conf) { } } } + LOG.info("Checking robots.txt for the following agent names: {}", agentNames); + + maxNumRedirects = conf.getInt("http.robots.redirect.max", 5); + LOG.info("Following max. {} robots.txt redirects", maxNumRedirects); String[] confAllowList = conf.getStrings("http.robot.rules.allowlist"); if (confAllowList == null) { @@ -294,8 +299,11 @@ public int run(String[] args) { "", "\tlocal file or URL parsed as robots.txt file", "\tIf starts with a protocol specification", - "\t(`http', `https', `ftp' or `file'), robots.txt it is fetched", - "\tusing the specified protocol. Otherwise, a local file is assumed.", + "\t(`http', `https', `ftp' or `file'), the URL is parsed, URL path", + "\tand query are removed and the path \"/robots.txt\" is appended.", + "\tThe resulting URL (the canonical robots.txt location) is then", + "\tfetched using the specified protocol.", + "\tIf the URL does not include a protocol, a local file is assumed.", "", "\tlocal file with URLs (one per line), for every URL", "\tthe path part (including the query) is checked whether", @@ -323,6 +331,16 @@ public int run(String[] args) { return -1; } + if (args.length > 2) { + // set agent name from command-line in configuration + // Note: when fetching via protocol this must be done + // before the protocol is configured + String agents = args[2]; + conf.set("http.robots.agents", agents); + conf.set("http.agent.name", agents.split(",")[0]); + setConf(conf); + } + Protocol protocol = null; URL robotsTxtUrl = null; if (args[0].matches("^(?:https?|ftp|file)://?.*")) { @@ -334,6 +352,7 @@ public int run(String[] args) { ProtocolFactory factory = new ProtocolFactory(conf); try { protocol = factory.getProtocol(robotsTxtUrl); + LOG.debug("Using protocol {} to fetch robots.txt", protocol.getClass()); } catch (ProtocolNotFound e) { LOG.error("No protocol found for {}: {}", args[0], StringUtils.stringifyException(e)); @@ -357,14 +376,6 @@ public int run(String[] args) { File urlFile = new File(args[1]); - if (args.length > 2) { - // set agent name from command-line in configuration and update parser - String agents = args[2]; - conf.set("http.robots.agents", agents); - conf.set("http.agent.name", agents.split(",")[0]); - setConf(conf); - } - List robotsTxtContent = null; if (getConf().getBoolean("fetcher.store.robotstxt", false)) { robotsTxtContent = new LinkedList<>(); @@ -373,6 +384,7 @@ public int run(String[] args) { try { BaseRobotRules rules = getRobotRulesSet(protocol, robotsTxtUrl, robotsTxtContent); + LOG.debug("Robots.txt rules:\n{}", rules); if (robotsTxtContent != null) { for (Content robotsTxt : robotsTxtContent) { diff --git a/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java b/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java index 54cd8b8ed1..4831d73f38 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java +++ b/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java @@ -20,10 +20,11 @@ import java.io.DataOutput; import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.List; import java.util.Random; +import java.util.concurrent.TimeUnit; + import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; @@ -31,6 +32,7 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; +import org.apache.commons.lang3.time.StopWatch; 
import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -57,7 +59,6 @@ import org.apache.nutch.util.FSUtils; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; /** * The LinkDumper tool creates a database of node to inlink information that can @@ -327,9 +328,9 @@ public void reduce(Text key, Iterable values, public void dumpLinks(Path webGraphDb) throws IOException, InterruptedException, ClassNotFoundException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("NodeDumper: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("LinkDumper: starting"); Configuration conf = getConf(); FileSystem fs = webGraphDb.getFileSystem(conf); @@ -400,9 +401,9 @@ public void dumpLinks(Path webGraphDb) throws IOException, } fs.delete(tempInverted, true); - long end = System.currentTimeMillis(); - LOG.info("LinkDumper: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("LinkDumper: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void main(String[] args) throws Exception { diff --git a/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java b/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java index 739fe6cec1..c226ad130b 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java +++ b/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java @@ -21,12 +21,12 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Random; import java.util.Set; +import java.util.concurrent.TimeUnit; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; @@ -35,6 +35,7 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -65,7 +66,6 @@ import org.apache.nutch.util.FSUtils; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.URLUtil; public class LinkRank extends Configured implements Tool { @@ -651,9 +651,9 @@ public LinkRank(Configuration conf) { public void analyze(Path webGraphDb) throws IOException, ClassNotFoundException, InterruptedException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("Analysis: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("LinkRank Analysis: starting"); // store the link rank under the webgraphdb temporarily, final scores get // updated into the nodedb @@ -714,9 +714,9 @@ public void analyze(Path webGraphDb) throws IOException, // remove the temporary link rank folder fs.delete(linkRank, true); - long end = System.currentTimeMillis(); - LOG.info("Analysis: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("LinkRank Analysis: finished, elapsed: {} ms", stopWatch.getTime( +
TimeUnit.MILLISECONDS)); } public static void main(String[] args) throws Exception { diff --git a/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java b/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java index ede9fa1c59..dfccccc19e 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java +++ b/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java @@ -18,7 +18,7 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; +import java.util.concurrent.TimeUnit; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; @@ -27,6 +27,7 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -48,7 +49,6 @@ import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.URLUtil; /** @@ -293,9 +293,9 @@ public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, boolean asEff, NameType nameType, AggrType aggrType, boolean asSequenceFile) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("NodeDumper: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("NodeDumper: starting"); Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR); Job dumper = NutchJob.getInstance(getConf()); @@ -357,9 +357,9 @@ public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, LOG.error("NodeDumper job failed:", e); throw e; } - long end = System.currentTimeMillis(); - LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("NodeDumper: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void main(String[] args) throws Exception { diff --git a/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java b/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java index 130e1b2a1c..c10a6e37b0 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java +++ b/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java @@ -18,8 +18,8 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.Random; +import java.util.concurrent.TimeUnit; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; @@ -28,6 +28,7 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -51,7 +52,6 @@ import org.apache.nutch.crawl.CrawlDb; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; /** * Updates the score from the WebGraph node database into the crawl database. 
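The ScoreUpdater hunk below applies the same conversion as every other tool in this patch: SimpleDateFormat/TimingUtil timestamp bookkeeping replaced by a commons-lang3 StopWatch. A minimal standalone sketch of just that pattern (the class name TimingDemo and the sleep are invented; StopWatch.getTime(TimeUnit) requires commons-lang3 3.5 or later):

    import java.util.concurrent.TimeUnit;
    import org.apache.commons.lang3.time.StopWatch;

    public class TimingDemo {
      public static void main(String[] args) throws InterruptedException {
        StopWatch stopWatch = new StopWatch();
        stopWatch.start();
        Thread.sleep(250); // stand-in for the MapReduce job being timed
        stopWatch.stop();
        // The elapsed value is passed as a logging argument; concatenating it
        // into the message string would leave an SLF4J {} placeholder unfilled.
        System.out.printf("finished, elapsed: %d ms%n",
            stopWatch.getTime(TimeUnit.MILLISECONDS));
      }
    }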
@@ -156,9 +156,9 @@ public void reduce(Text key, Iterable values, public void update(Path crawlDb, Path webGraphDb) throws IOException, ClassNotFoundException, InterruptedException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("ScoreUpdater: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("ScoreUpdater: starting"); Configuration conf = getConf(); @@ -213,9 +213,9 @@ public void update(Path crawlDb, Path webGraphDb) throws IOException, LOG.info("ScoreUpdater: installing new crawldb " + crawlDb); CrawlDb.install(updater, crawlDb); - long end = System.currentTimeMillis(); - LOG.info("ScoreUpdater: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("ScoreUpdater: finished, elapsed: {} ms ", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void main(String[] args) throws Exception { diff --git a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java index 63d0ead7da..b98329d1e0 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java +++ b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java @@ -18,7 +18,6 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.HashSet; import java.util.LinkedHashMap; @@ -26,6 +25,7 @@ import java.util.Map; import java.util.Random; import java.util.Set; +import java.util.concurrent.TimeUnit; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; @@ -34,6 +34,7 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -67,7 +68,6 @@ import org.apache.nutch.util.LockUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.URLUtil; /** @@ -518,14 +518,12 @@ public void createWebGraph(Path webGraphDb, Path[] segments, boolean normalize, boolean filter) throws IOException, InterruptedException, ClassNotFoundException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - if (LOG.isInfoEnabled()) { - LOG.info("WebGraphDb: starting at " + sdf.format(start)); - LOG.info("WebGraphDb: webgraphdb: " + webGraphDb); - LOG.info("WebGraphDb: URL normalize: " + normalize); - LOG.info("WebGraphDb: URL filter: " + filter); - } + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("WebGraphDb: starting"); + LOG.info("WebGraphDb: webgraphdb: " + webGraphDb); + LOG.info("WebGraphDb: URL normalize: " + normalize); + LOG.info("WebGraphDb: URL filter: " + filter); FileSystem fs = webGraphDb.getFileSystem(getConf()); @@ -715,9 +713,9 @@ public void createWebGraph(Path webGraphDb, Path[] segments, // remove the lock file for the webgraph LockUtil.removeLockFile(fs, lock); - long end = System.currentTimeMillis(); - LOG.info("WebGraphDb: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("WebGraphDb: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void 
main(String[] args) throws Exception { diff --git a/src/java/org/apache/nutch/segment/SegmentReader.java b/src/java/org/apache/nutch/segment/SegmentReader.java index 14546af543..ee5c266fd0 100644 --- a/src/java/org/apache/nutch/segment/SegmentReader.java +++ b/src/java/org/apache/nutch/segment/SegmentReader.java @@ -163,13 +163,16 @@ public void reduce(Text key, Iterable values, dump.append("\nRecno:: ").append(recNo++).append("\n"); dump.append("URL:: " + key.toString() + "\n"); Content content = null; - Charset charset = null; + // fall-back encoding for content of unparsed documents + Charset charset = StandardCharsets.UTF_8; for (NutchWritable val : values) { Writable value = val.get(); // unwrap if (value instanceof CrawlDatum) { dump.append("\nCrawlDatum::\n").append(((CrawlDatum) value).toString()); } else if (value instanceof Content) { if (recodeContent) { + // output recoded content later when charset is extracted from HTML + // metadata held in ParseData content = (Content) value; } else { dump.append("\nContent::\n").append(((Content) value).toString()); diff --git a/src/java/org/apache/nutch/tools/FreeGenerator.java b/src/java/org/apache/nutch/tools/FreeGenerator.java index 039bccaece..e9f5c87619 100644 --- a/src/java/org/apache/nutch/tools/FreeGenerator.java +++ b/src/java/org/apache/nutch/tools/FreeGenerator.java @@ -18,10 +18,11 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.HashMap; import java.util.Map.Entry; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.fs.Path; @@ -47,7 +48,6 @@ import org.apache.nutch.scoring.ScoringFilters; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; /** * This tool generates fetchlists (segments to be fetched) from plain text files @@ -180,9 +180,9 @@ public int run(String[] args) throws Exception { } } - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("FreeGenerator: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("FreeGenerator: starting"); Job job = NutchJob.getInstance(getConf()); Configuration conf = job.getConfiguration(); @@ -226,9 +226,9 @@ public int run(String[] args) throws Exception { LOG.error("FAILED: " + StringUtils.stringifyException(e)); return -1; } - long end = System.currentTimeMillis(); - LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("FreeGenerator: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); return 0; } diff --git a/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java b/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java index 4e916dbd50..825e752cc0 100644 --- a/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java +++ b/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java @@ -21,7 +21,9 @@ import java.text.SimpleDateFormat; import java.util.Date; import java.util.Map.Entry; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -56,7 +58,6 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.StringUtil; -import org.apache.nutch.util.TimingUtil; /** *

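Reviewer note on the pattern used throughout these hunks: timing now relies on commons-lang3's StopWatch combined with parameterized slf4j logging. A minimal, self-contained sketch of the idiom (assuming commons-lang3 >= 3.10 and an slf4j binding on the classpath; the class name is illustrative, not part of this patch):

```java
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang3.time.StopWatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class TimingPatternExample {
  private static final Logger LOG =
      LoggerFactory.getLogger(TimingPatternExample.class);

  public static void main(String[] args) throws InterruptedException {
    StopWatch stopWatch = new StopWatch();
    stopWatch.start();
    LOG.info("TimingPatternExample: starting");

    Thread.sleep(250); // stand-in for the job being timed

    stopWatch.stop();
    // The {} placeholder is substituted only when the value is passed as a
    // separate argument; concatenating it with '+' logs a literal "{} ms".
    LOG.info("TimingPatternExample: finished, elapsed: {} ms",
        stopWatch.getTime(TimeUnit.MILLISECONDS));
  }
}
```

Note the second point in particular: with slf4j, the elapsed time must be passed as an argument to `LOG.info`, not concatenated into the message string, for the `{}` placeholder to be filled in.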
@@ -368,10 +369,10 @@ public void map(Text key, BytesWritable bytes, public void createSegments(Path arcFiles, Path segmentsOutDir) throws IOException, InterruptedException, ClassNotFoundException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); if (LOG.isInfoEnabled()) { - LOG.info("ArcSegmentCreator: starting at " + sdf.format(start)); + LOG.info("ArcSegmentCreator: starting"); LOG.info("ArcSegmentCreator: arc files dir: " + arcFiles); } @@ -402,10 +403,9 @@ public void createSegments(Path arcFiles, Path segmentsOutDir) throw e; } - - long end = System.currentTimeMillis(); - LOG.info("ArcSegmentCreator: finished at " + sdf.format(end) - + ", elapsed: " + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("ArcSegmentCreator: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void main(String args[]) throws Exception { diff --git a/src/java/org/apache/nutch/tools/warc/WARCExporter.java b/src/java/org/apache/nutch/tools/warc/WARCExporter.java index cf000ba526..6d8a385572 100644 --- a/src/java/org/apache/nutch/tools/warc/WARCExporter.java +++ b/src/java/org/apache/nutch/tools/warc/WARCExporter.java @@ -29,8 +29,10 @@ import java.util.List; import java.util.Locale; import java.util.UUID; +import java.util.concurrent.TimeUnit; import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileStatus; @@ -58,7 +60,6 @@ import org.apache.nutch.util.HadoopFSUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -428,9 +429,9 @@ protected JsonObject metadataToJson(Metadata meta) { public int generateWARC(String output, List segments, boolean onlySuccessfulResponses, boolean includeParseData, boolean includeParseText) throws IOException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("WARCExporter: starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("WARCExporter: starting"); final Job job = NutchJob.getInstance(getConf()); job.setJobName("warc-exporter " + output); @@ -479,9 +480,9 @@ public int generateWARC(String output, List segments, throw new RuntimeException(message); } LOG.info(job.getCounters().toString()); - long end = System.currentTimeMillis(); - LOG.info("WARCExporter: finished at {}, elapsed: {}", sdf.format(end), - TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("WARCExporter: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } catch (IOException | InterruptedException | ClassNotFoundException e) { LOG.error("WARCExporter job failed: {}", e.getMessage()); return -1; diff --git a/src/java/org/apache/nutch/util/AbstractChecker.java b/src/java/org/apache/nutch/util/AbstractChecker.java index 3116ede146..1374812250 100644 --- a/src/java/org/apache/nutch/util/AbstractChecker.java +++ b/src/java/org/apache/nutch/util/AbstractChecker.java @@ -72,8 +72,7 @@ protected int parseArgs(String[] args, int i) { protected int run() throws Exception { // In listening mode?
if (tcpPort != -1) { - processTCP(tcpPort); - return 0; + return processTCP(tcpPort); } else if (stdin) { return processStdin(); } @@ -104,7 +103,7 @@ protected int processStdin() throws Exception { // Open TCP socket and process input @SuppressWarnings("resource") - protected void processTCP(int tcpPort) throws Exception { + protected int processTCP(int tcpPort) throws Exception { ServerSocket server = null; try { @@ -113,7 +112,7 @@ protected void processTCP(int tcpPort) throws Exception { LOG.info(server.toString()); } catch (Exception e) { LOG.error("Could not listen on port " + tcpPort, e); - System.exit(-1); + return -1; } while(true){ @@ -124,7 +123,7 @@ protected void processTCP(int tcpPort) throws Exception { thread.start(); } catch (Exception e) { LOG.error("Accept failed: " + tcpPort, e); - System.exit(-1); + return -1; } } } diff --git a/src/java/org/apache/nutch/util/CrawlCompletionStats.java b/src/java/org/apache/nutch/util/CrawlCompletionStats.java index 7210ee83af..8696d28221 100644 --- a/src/java/org/apache/nutch/util/CrawlCompletionStats.java +++ b/src/java/org/apache/nutch/util/CrawlCompletionStats.java @@ -20,7 +20,7 @@ import java.lang.invoke.MethodHandles; import java.net.MalformedURLException; import java.net.URL; -import java.text.SimpleDateFormat; +import java.util.concurrent.TimeUnit; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; @@ -30,6 +30,7 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; @@ -127,9 +128,9 @@ public int run(String[] args) throws Exception { numOfReducers = Integer.parseInt(args[3]); } - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("CrawlCompletionStats: starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("CrawlCompletionStats: starting"); int mode = 0; String jobName = "CrawlCompletionStats"; @@ -180,9 +181,9 @@ public int run(String[] args) throws Exception { throw e; } - long end = System.currentTimeMillis(); - LOG.info("CrawlCompletionStats: finished at {}, elapsed: {}", - sdf.format(end), TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("CrawlCompletionStats: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); return 0; } diff --git a/src/java/org/apache/nutch/util/NutchJob.java b/src/java/org/apache/nutch/util/NutchJob.java index 478b24f89e..068c64fefb 100644 --- a/src/java/org/apache/nutch/util/NutchJob.java +++ b/src/java/org/apache/nutch/util/NutchJob.java @@ -35,7 +35,18 @@ public class NutchJob extends Job { private static final String JOB_FAILURE_LOG_FORMAT = "%s job did not succeed, job id: %s, job status: %s, reason: %s"; - @SuppressWarnings("deprecation") + /** + * @deprecated, use instead {@link #getInstance(Configuration)} or + * {@link Job#getInstance(Configuration, String)}. 
+ * + * @param conf + * configuration for the job + * @param jobName + * name of the job + * @throws IOException + * see {@link Job#Job(Configuration, String)} + */ + @Deprecated public NutchJob(Configuration conf, String jobName) throws IOException { super(conf, jobName); if (conf != null) { diff --git a/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java b/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java index 2499da0bfb..0fe6c57d03 100644 --- a/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java +++ b/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java @@ -16,10 +16,11 @@ */ package org.apache.nutch.util; -import java.io.File; import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; +import java.util.concurrent.TimeUnit; + +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -37,8 +38,6 @@ import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.util.NutchConfiguration; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.metadata.Nutch; /** @@ -86,9 +85,9 @@ public int run(String[] args) throws Exception { numOfReducers = Integer.parseInt(args[2]); } - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("ProtocolStatistics: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("ProtocolStatistics: starting"); String jobName = "ProtocolStatistics"; @@ -130,9 +129,9 @@ public int run(String[] args) throws Exception { throw e; } - long end = System.currentTimeMillis(); - LOG.info("ProtocolStatistics: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("ProtocolStatistics: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); return 0; } diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java index 98f7df839d..66fa9b0e7a 100644 --- a/src/java/org/apache/nutch/util/SitemapProcessor.java +++ b/src/java/org/apache/nutch/util/SitemapProcessor.java @@ -22,7 +22,9 @@ import java.util.Collection; import java.util.List; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileSystem; @@ -359,8 +361,9 @@ else if(sitemapDatum != null) { public void sitemap(Path crawldb, Path hostdb, Path sitemapUrlDir, boolean strict, boolean filter, boolean normalize, int threads) throws Exception { - long start = System.currentTimeMillis(); - LOG.info("SitemapProcessor: Starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("SitemapProcessor: starting"); FileSystem fs = crawldb.getFileSystem(getConf()); Path old = new Path(crawldb, "old"); @@ -441,8 +444,9 @@ public void sitemap(Path crawldb, Path hostdb, Path sitemapUrlDir, boolean stric LOG.info("SitemapProcessor: Total failed sitemap fetches: {}", failedFetches); LOG.info("SitemapProcessor: Total new sitemap entries added: {}", newSitemapEntries); - long end = System.currentTimeMillis(); - LOG.info("SitemapProcessor: Finished at {}, elapsed: {}", sdf.format(end), 
TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("SitemapProcessor: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } } catch (IOException | InterruptedException | ClassNotFoundException e) { LOG.error("SitemapProcessor_" + crawldb.toString(), e); diff --git a/src/java/org/apache/nutch/util/domain/DomainStatistics.java b/src/java/org/apache/nutch/util/domain/DomainStatistics.java index 638b6c94f1..f77b72bc5f 100644 --- a/src/java/org/apache/nutch/util/domain/DomainStatistics.java +++ b/src/java/org/apache/nutch/util/domain/DomainStatistics.java @@ -20,8 +20,9 @@ import java.lang.invoke.MethodHandles; import java.net.MalformedURLException; import java.net.URL; -import java.text.SimpleDateFormat; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; @@ -39,7 +40,6 @@ import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.URLUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -92,9 +92,9 @@ public int run(String[] args) throws Exception { numOfReducers = Integer.parseInt(args[3]); } - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("DomainStatistics: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("DomainStatistics: starting"); int mode = 0; String jobName = "DomainStatistics"; @@ -151,9 +151,9 @@ public int run(String[] args) throws Exception { throw e; } - long end = System.currentTimeMillis(); - LOG.info("DomainStatistics: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("DomainStatistics: finished, elapsed: {} ms ", stopWatch.getTime( + TimeUnit.MILLISECONDS)); return 0; } diff --git a/src/java/overview.html b/src/java/overview.html index 11321417ba..3de53a7d28 100644 --- a/src/java/overview.html +++ b/src/java/overview.html @@ -1,3 +1,19 @@ + Apache Nutch diff --git a/src/plugin/any23/build-ivy.xml b/src/plugin/any23/build-ivy.xml deleted file mode 100644 index 6c7c6b906b..0000000000 --- a/src/plugin/any23/build-ivy.xml +++ /dev/null @@ -1,47 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/src/plugin/any23/howto_upgrade_any23.txt b/src/plugin/any23/howto_upgrade_any23.txt deleted file mode 100644 index 32f9162f41..0000000000 --- a/src/plugin/any23/howto_upgrade_any23.txt +++ /dev/null @@ -1,22 +0,0 @@ -1. Upgrade Any23 dependency in src/plugin/any23/ivy.xml - -2. Upgrade Any23's own dependencies in src/plugin/any23/plugin.xml - To get the list of dependencies and their versions execute: - $ cd src/plugin/any23/ - $ ant -f ./build-ivy.xml - $ ls lib | sed 's/^/ /g' - - In the plugin.xml replace all lines between - - and - - with the output of the command above. - -3. Remove the locally "installed" dependencies in src/plugin/any23/lib/: - - $ rm -rf lib/ - -4. 
Build Nutch and run all unit tests: - - $ cd ../../../ - $ ant clean runtime test diff --git a/src/plugin/any23/ivy.xml b/src/plugin/any23/ivy.xml deleted file mode 100644 index 3b755ee3fa..0000000000 --- a/src/plugin/any23/ivy.xml +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - Apache Nutch - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/src/plugin/any23/plugin.xml b/src/plugin/any23/plugin.xml deleted file mode 100644 index dae8c47aa3..0000000000 --- a/src/plugin/any23/plugin.xml +++ /dev/null @@ -1,216 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/src/plugin/any23/sample/BBC_News_Scotland.html b/src/plugin/any23/sample/BBC_News_Scotland.html deleted file mode 100644 index d7cb10a826..0000000000 --- a/src/plugin/any23/sample/BBC_News_Scotland.html +++ /dev/null @@ -1,3780 +0,0 @@ - - - - - - BBC News - Scotland - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

[... 3,780 deleted lines of src/plugin/any23/sample/BBC_News_Scotland.html (HTML test page for the removed any23 plugin); markup lost in extraction, unrecoverable text residue omitted ...]
\ No newline at end of file
diff --git a/src/plugin/any23/sample/microdata_basic.html b/src/plugin/any23/sample/microdata_basic.html
deleted file mode 100644
index 3ffca84251..0000000000
--- a/src/plugin/any23/sample/microdata_basic.html
+++ /dev/null
@@ -1,107 +0,0 @@
[... 107 deleted lines of the microdata sample page; markup lost in extraction, unrecoverable text residue omitted ...]
    - - - - \ No newline at end of file diff --git a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java deleted file mode 100644 index 09dc32e02d..0000000000 --- a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.any23; - -import java.util.HashMap; -import java.util.Map; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.indexer.IndexingException; -import org.apache.nutch.indexer.IndexingFilter; -import org.apache.nutch.indexer.NutchDocument; -import org.apache.nutch.parse.Parse; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - *

    This implementation of {@link org.apache.nutch.indexer.IndexingFilter} - * adds a triple(s) field to the {@link org.apache.nutch.indexer.NutchDocument}.

    - *

    Triples are extracted via Apache Any23.

    - * @see org.apache.nutch.any23.Any23ParseFilter - */ -public class Any23IndexingFilter implements IndexingFilter { - - /** Logging instance */ - private static final Logger LOG = LoggerFactory.getLogger(Any23IndexingFilter.class); - - public static final String STRUCTURED_DATA = "structured_data"; - - private Configuration conf; - - /** - * Get the {@link Configuration} object - * @see org.apache.hadoop.conf.Configurable#getConf() - */ - @Override - public Configuration getConf() { - return this.conf; - } - - /** - * Set the {@link Configuration} object - * @see org.apache.hadoop.conf.Configurable#setConf(org.apache.hadoop.conf.Configuration) - */ - @Override - public void setConf(Configuration conf) { - this.conf = conf; - } - - /** - * - * @param doc - * document instance for collecting fields - * @param parse - * parse data instance - * @param url - * page url - * @param datum - * crawl datum for the page (fetch datum from segment containing - * fetch status and fetch time) - * @param inlinks - * page inlinks - * @return filtered NutchDocument - * @see org.apache.nutch.indexer.IndexingFilter#filter(NutchDocument, Parse, Text, CrawlDatum, Inlinks) - * - * @throws IndexingException if there is a fatl error whilst indexing - */ - @Override - public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { - String[] metadata = parse.getData().getParseMeta().getValues(Any23ParseFilter.ANY23_TRIPLES); - - if (metadata != null) { - for (String triple : metadata) { - Pattern pattern = Pattern.compile("^([^ ]+) ([^ ]+) (.+) \\."); - Matcher matcher = pattern.matcher(triple); - if (matcher.find()) { - Map map = new HashMap<>(); - map.put("node", matcher.group(1)); - map.put("key", matcher.group(2)); - map.put("short_key", keyToShortKey(matcher.group(2))); - map.put("value", matcher.group(3)); - doc.add("structured_data", map); - } else { - LOG.warn("Unsupported triple format " + triple); - } - } - } - return doc; - } - - private static String keyToShortKey(String key) { - if (key.startsWith("<") && key.endsWith(">")) { - key = key.substring(1, key.length() - 1); - } - String[] keyParts = key.split("/"); - String[] keySubParts = keyParts[keyParts.length - 1].split("#"); - return keySubParts[keySubParts.length - 1]; - } -} diff --git a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java deleted file mode 100644 index bed659f352..0000000000 --- a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.nutch.any23; - -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.net.URISyntaxException; -import java.nio.charset.Charset; -import java.util.Arrays; -import java.util.Collections; -import java.util.Set; -import java.util.TreeSet; - -import org.apache.any23.Any23; -import org.apache.any23.extractor.ExtractionException; -import org.apache.any23.filter.IgnoreAccidentalRDFa; -import org.apache.any23.filter.IgnoreTitlesOfEmptyDocuments; -import org.apache.any23.mime.TikaMIMETypeDetector; -import org.apache.any23.mime.purifier.WhiteSpacesPurifier; -import org.apache.any23.writer.BenchmarkTripleHandler; -import org.apache.any23.writer.NTriplesWriter; -import org.apache.any23.writer.TripleHandler; -import org.apache.any23.writer.TripleHandlerException; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.parse.HTMLMetaTags; -import org.apache.nutch.parse.HtmlParseFilter; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseResult; -import org.apache.nutch.protocol.Content; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.w3c.dom.DocumentFragment; - -/** - *

    This implementation of {@link org.apache.nutch.parse.HtmlParseFilter} - * uses the Apache Any23 library - * for parsing and extracting structured data in RDF format from a - * variety of Web documents. The supported formats can be found at Apache Any23. - *

    In this implementation triples are written as Notation3 - * and triples are identified within output triple streams by the presence of '\n'. - * The presence of the '\n' is a characteristic specific to N3 serialization in Any23. - * In order to use another/other writers implementing the - * TripleHandler - * interface, we will most likely need to identify an alternative data characteristic - * which we can use to split triples streams.

    - */ -public class Any23ParseFilter implements HtmlParseFilter { - - /** Logging instance */ - private static final Logger LOG = LoggerFactory.getLogger(Any23ParseFilter.class); - - private Configuration conf = null; - - /** - * Constant identifier used as a Key for writing and reading - * triples to and from the metadata Map field. - */ - public static final String ANY23_TRIPLES = "Any23-Triples"; - - public static final String ANY_23_EXTRACTORS_CONF = "any23.extractors"; - public static final String ANY_23_CONTENT_TYPES_CONF = "any23.content_types"; - - private static class Any23Parser { - - Set triples = null; - - Any23Parser(String url, String htmlContent, String contentType, String... extractorNames) throws TripleHandlerException { - this.triples = new TreeSet<>(); - try { - parse(url, htmlContent, contentType, extractorNames); - } catch (URISyntaxException e) { - LOG.error("Error parsing URI: {}", url, e); - throw new RuntimeException(e.getReason()); - } catch (IOException e) { - e.printStackTrace(); - } - } - - /** - * Maintains a {@link java.util.Set} containing the triples - * @return a {@link java.util.Set} of triples. - */ - Set getTriples() { - return this.triples; - } - - private void parse(String url, String htmlContent, String contentType, String... extractorNames) - throws URISyntaxException, IOException, TripleHandlerException { - Any23 any23 = new Any23(extractorNames); - any23.setMIMETypeDetector(new TikaMIMETypeDetector(new WhiteSpacesPurifier())); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - try (TripleHandler tHandler = new IgnoreTitlesOfEmptyDocuments( - new IgnoreAccidentalRDFa( - new NTriplesWriter(baos))); - BenchmarkTripleHandler bHandler = new BenchmarkTripleHandler(tHandler)) { - try { - any23.extract(htmlContent, url, contentType, "UTF-8", bHandler); - } catch (IOException e) { - LOG.error("Error while reading the source", e); - } catch (ExtractionException e) { - LOG.error("Error while extracting structured data", e); - } - - LOG.debug("Any23 BenchmarkTripleHandler.report: " + bHandler.report()); - - String n3 = baos.toString("UTF-8"); - String[] triplesStrings = n3.split("\n"); - Collections.addAll(this.triples, triplesStrings); - } catch (IOException e) { - LOG.error("Unexpected IOException", e); - } - } - } - - @Override - public Configuration getConf() { - return this.conf; - } - - @Override - public void setConf(Configuration conf) { - this.conf = conf; - } - - /** - * @see org.apache.nutch.parse.HtmlParseFilter#filter(Content, ParseResult, HTMLMetaTags, DocumentFragment) - */ - @Override - public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { - String[] extractorNames = this.conf.getStrings(ANY_23_EXTRACTORS_CONF, "html-head-meta"); - String[] supportedContentTypes = this.conf.getStrings(ANY_23_CONTENT_TYPES_CONF, "text/html", "application/xhtml+xml"); - String contentType = content.getContentType(); - if (supportedContentTypes != null && !Arrays.asList(supportedContentTypes).contains(contentType)) { - LOG.debug("Ignoring document at {} because it has an unsupported Content-Type {}", content.getUrl(), contentType); - return parseResult; - } - - Any23Parser parser; - try { - String htmlContent = new String(content.getContent(), Charset.forName("UTF-8")); - parser = new Any23Parser(content.getUrl(), htmlContent, contentType, extractorNames); - } catch (TripleHandlerException e) { - throw new RuntimeException("Error running Any23 parser: " + e.getMessage()); - } - Set triples 
= parser.getTriples(); - - Parse parse = parseResult.get(content.getUrl()); - Metadata metadata = parse.getData().getParseMeta(); - - for (String triple : triples) { - metadata.add(ANY23_TRIPLES, triple); - } - - return parseResult; - } -} diff --git a/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23IndexingFilter.java b/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23IndexingFilter.java deleted file mode 100644 index 1367e19c46..0000000000 --- a/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23IndexingFilter.java +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.any23; - -import java.util.List; -import java.util.Map; - -import org.apache.hadoop.io.Text; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.indexer.NutchDocument; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseImpl; -import org.apache.nutch.parse.ParseStatus; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Test; - -public class TestAny23IndexingFilter { - @Test - public void testAny23TriplesFields() throws Exception { - Configuration conf = NutchConfiguration.create(); - Any23IndexingFilter filter = new Any23IndexingFilter(); - filter.setConf(conf); - Assert.assertNotNull(filter); - NutchDocument doc = new NutchDocument(); - ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "The Foo Page", - new Outlink[] { }, new Metadata()); - ParseImpl parse = new ParseImpl("test page", parseData); - String[] triples = new String[]{ - " .", - " \"77\" .", - " \"Zurique\"@pt ." 
- }; - for (String triple : triples) { - parse.getData().getParseMeta().add(Any23ParseFilter.ANY23_TRIPLES, triple); - } - try { - doc = filter.filter(doc, parse, new Text("http://nutch.apache.org/"), new CrawlDatum(), new Inlinks()); - } catch (Exception e) { - e.printStackTrace(); - Assert.fail(e.getMessage()); - } - List docTriples = doc.getField(Any23IndexingFilter.STRUCTURED_DATA).getValues(); - Assert.assertEquals(docTriples.size(), triples.length); - - Object triple = docTriples.get(0); - Assert.assertTrue(triple instanceof Map); - @SuppressWarnings("unchecked") - Map structuredData = (Map) triple; - Assert.assertEquals(structuredData.get("node"), ""); - Assert.assertEquals(structuredData.get("key"), ""); - Assert.assertEquals(structuredData.get("short_key"), "sameAs"); - Assert.assertEquals(structuredData.get("value"), ""); - - triple = docTriples.get(1); - Assert.assertTrue(triple instanceof Map); - structuredData = (Map) triple; - Assert.assertEquals(structuredData.get("node"), ""); - Assert.assertEquals(structuredData.get("key"), ""); - Assert.assertEquals(structuredData.get("short_key"), "yearHumidity"); - Assert.assertEquals(structuredData.get("value"), "\"77\""); - } -} diff --git a/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java b/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java deleted file mode 100644 index 09c253fbc5..0000000000 --- a/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.any23; - -import java.io.File; -import java.io.IOException; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseException; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.parse.ParserNotFound; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -public class TestAny23ParseFilter { - - - private Configuration conf; - - private String fileSeparator = System.getProperty("file.separator"); - - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); - - // Make sure sample files are copied to "test.data" as specified in - // ./src/plugin/any23/build.xml during plugin compilation. 
- private String file1 = "BBC_News_Scotland.html"; - - private String file2 = "microdata_basic.html"; - - private static final int EXPECTED_TRIPLES_1 = 79; - - private static final int EXPECTED_TRIPLES_2 = 40; - - @Before - public void setUp() { - this.conf = NutchConfiguration.create(); - conf.set("file.content.limit", "-1"); - conf.set("parser.timeout", "-1"); - conf.set(Any23ParseFilter.ANY_23_EXTRACTORS_CONF, "html-embedded-jsonld,html-head-icbm,html-head-links," - + "html-head-meta,html-head-title,html-mf-adr,html-mf-geo,html-mf-hcalendar,html-mf-hcard," - + "html-mf-hlisting,html-mf-hrecipe,html-mf-hresume,html-mf-hreview,html-mf-hreview-aggregate," - + "html-mf-license,html-mf-species,html-mf-xfn,html-microdata,html-rdfa11,html-xpath"); - conf.set(Any23ParseFilter.ANY_23_CONTENT_TYPES_CONF, "text/html"); - } - - @Test - public void testExtractTriplesFromHTML() throws IOException, ParserNotFound, ParseException { - String[] triplesArray = getTriples(file1); - - Assert.assertEquals("We expect 117 tab-separated triples extracted by the filter", - EXPECTED_TRIPLES_1, triplesArray.length); - } - - @Test - public void extractMicroDataFromHTML() throws ParserNotFound, IOException, ParseException { - String[] triplesArray = getTriples(file2); - - Assert.assertEquals("We expect 40 tab-separated triples extracted by the filter", - EXPECTED_TRIPLES_2, triplesArray.length); - } - - @Test - public void ignoreUnsupported() throws ParserNotFound, IOException, ParseException { - String[] triplesArray = getTriples(file1, "application/pdf"); - - Assert.assertEquals("We expect no triples extracted by the filter since content-type should be ignored", - 0, triplesArray.length); - } - - public String[] extract(String urlString, File file, String contentType) { - try { - System.out.println(urlString); - Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); - Content content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); - content.setContentType(contentType); - Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl()); - return parse.getData().getParseMeta().getValues(Any23ParseFilter.ANY23_TRIPLES); - } catch (Exception e) { - e.printStackTrace(); - Assert.fail(e.toString()); - } - return null; - } - - private String[] getTriples(String fileName) { - return getTriples(fileName, "text/html"); - } - - private String[] getTriples(String fileName, String contentType) { - String urlString = "file:" + sampleDir + fileSeparator + fileName; - - File file = new File(sampleDir + fileSeparator + fileName); - - return extract(urlString, file, contentType); - } -} diff --git a/src/plugin/build-plugin.xml b/src/plugin/build-plugin.xml index f6e87e8057..3f0d9ca44a 100755 --- a/src/plugin/build-plugin.xml +++ b/src/plugin/build-plugin.xml @@ -265,5 +265,7 @@ - + + + diff --git a/src/plugin/build.xml b/src/plugin/build.xml index ef5c6b5e81..0600944375 100755 --- a/src/plugin/build.xml +++ b/src/plugin/build.xml @@ -35,7 +35,6 @@ - @@ -115,7 +114,6 @@ - @@ -177,7 +175,6 @@ - diff --git a/src/plugin/creativecommons/conf/crawl-urlfilter.txt b/src/plugin/creativecommons/conf/crawl-urlfilter.txt index 324617f07a..eb6786e4b4 100644 --- a/src/plugin/creativecommons/conf/crawl-urlfilter.txt +++ b/src/plugin/creativecommons/conf/crawl-urlfilter.txt @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # Creative Commnons crawl filter # Each non-comment, non-blank line contains a regular expression diff --git a/src/plugin/creativecommons/conf/nutch-site.xml b/src/plugin/creativecommons/conf/nutch-site.xml index e28e12a9a8..4b343b2cc9 100644 --- a/src/plugin/creativecommons/conf/nutch-site.xml +++ b/src/plugin/creativecommons/conf/nutch-site.xml @@ -1,5 +1,21 @@ + diff --git a/src/plugin/creativecommons/data/anchor.html b/src/plugin/creativecommons/data/anchor.html index 90b522759d..3267bc9ea8 100755 --- a/src/plugin/creativecommons/data/anchor.html +++ b/src/plugin/creativecommons/data/anchor.html @@ -1,3 +1,19 @@ + diff --git a/src/plugin/creativecommons/data/rdf.html b/src/plugin/creativecommons/data/rdf.html index fb2c34dfe5..60c27cc541 100755 --- a/src/plugin/creativecommons/data/rdf.html +++ b/src/plugin/creativecommons/data/rdf.html @@ -1,3 +1,19 @@ + diff --git a/src/plugin/creativecommons/data/rel.html b/src/plugin/creativecommons/data/rel.html index 413d52f869..3d11572d82 100755 --- a/src/plugin/creativecommons/data/rel.html +++ b/src/plugin/creativecommons/data/rel.html @@ -1,3 +1,19 @@ + diff --git a/src/plugin/creativecommons/ivy.xml b/src/plugin/creativecommons/ivy.xml index 956fd25efc..5c2c5b77e1 100644 --- a/src/plugin/creativecommons/ivy.xml +++ b/src/plugin/creativecommons/ivy.xml @@ -1,5 +1,4 @@ - + exchange-jexl plugin for Nutch ============================== diff --git a/src/plugin/exchange-jexl/ivy.xml b/src/plugin/exchange-jexl/ivy.xml index 1275664e5d..cb5a0f1862 100644 --- a/src/plugin/exchange-jexl/ivy.xml +++ b/src/plugin/exchange-jexl/ivy.xml @@ -1,5 +1,4 @@ - + indexer-links plugin for Nutch ============================== diff --git a/src/plugin/index-links/ivy.xml b/src/plugin/index-links/ivy.xml index 624dcaf4a2..3d4fc905c3 100644 --- a/src/plugin/index-links/ivy.xml +++ b/src/plugin/index-links/ivy.xml @@ -1,5 +1,4 @@ - Testing the power of the index-replace plugin diff --git a/src/plugin/index-static/ivy.xml b/src/plugin/index-static/ivy.xml index 1275664e5d..cb5a0f1862 100644 --- a/src/plugin/index-static/ivy.xml +++ b/src/plugin/index-static/ivy.xml @@ -1,5 +1,4 @@ - + AWS CloudSearch plugin for Nutch ================================ diff --git a/src/plugin/indexer-cloudsearch/createCSDomain.sh b/src/plugin/indexer-cloudsearch/createCSDomain.sh index 24fb0156c6..1cb8481fe0 100644 --- a/src/plugin/indexer-cloudsearch/createCSDomain.sh +++ b/src/plugin/indexer-cloudsearch/createCSDomain.sh @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # example of domain configuration for CloudSearch DOMAIN="$1" diff --git a/src/plugin/indexer-cloudsearch/plugin.xml b/src/plugin/indexer-cloudsearch/plugin.xml index 5b4425359a..f18bc49eab 100644 --- a/src/plugin/indexer-cloudsearch/plugin.xml +++ b/src/plugin/indexer-cloudsearch/plugin.xml @@ -29,9 +29,9 @@ - - - + + + diff --git a/src/plugin/indexer-csv/README.md b/src/plugin/indexer-csv/README.md index 80220974a7..4d1288b198 100644 --- a/src/plugin/indexer-csv/README.md +++ b/src/plugin/indexer-csv/README.md @@ -1,3 +1,20 @@ + + indexer-csv plugin for Nutch ============================ diff --git a/src/plugin/indexer-csv/ivy.xml b/src/plugin/indexer-csv/ivy.xml index 75b5d54e55..e7bf875468 100644 --- a/src/plugin/indexer-csv/ivy.xml +++ b/src/plugin/indexer-csv/ivy.xml @@ -1,5 +1,4 @@ - + indexer-dummy plugin for Nutch ============================== diff --git a/src/plugin/indexer-dummy/ivy.xml b/src/plugin/indexer-dummy/ivy.xml index 956fd25efc..5c2c5b77e1 100644 --- a/src/plugin/indexer-dummy/ivy.xml +++ b/src/plugin/indexer-dummy/ivy.xml @@ -1,5 +1,4 @@ - + indexer-elastic plugin for Nutch ================================ diff --git a/src/plugin/indexer-elastic/howto_upgrade_es.txt b/src/plugin/indexer-elastic/howto_upgrade_es.md similarity index 61% rename from src/plugin/indexer-elastic/howto_upgrade_es.txt rename to src/plugin/indexer-elastic/howto_upgrade_es.md index a8156444c6..b57e0c02fa 100644 --- a/src/plugin/indexer-elastic/howto_upgrade_es.txt +++ b/src/plugin/indexer-elastic/howto_upgrade_es.md @@ -1,3 +1,20 @@ + + 1. Upgrade Elasticsearch dependency in src/plugin/indexer-elastic/ivy.xml 2. 
Upgrade the Elasticsearch specific dependencies in src/plugin/indexer-elastic/plugin.xml diff --git a/src/plugin/indexer-elastic/ivy.xml b/src/plugin/indexer-elastic/ivy.xml index abdcceae29..e5cdfdf656 100644 --- a/src/plugin/indexer-elastic/ivy.xml +++ b/src/plugin/indexer-elastic/ivy.xml @@ -40,7 +40,22 @@ + + + + + + + + + + + + + + + diff --git a/src/plugin/indexer-elastic/plugin.xml b/src/plugin/indexer-elastic/plugin.xml index 387a3ac664..fc3723a608 100644 --- a/src/plugin/indexer-elastic/plugin.xml +++ b/src/plugin/indexer-elastic/plugin.xml @@ -22,50 +22,51 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java index 053bfd68aa..290d9dfca2 100644 --- a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java +++ b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java @@ -210,6 +210,9 @@ public HttpAsyncClientBuilder customizeHttpClient( restClientBuilder.setHttpClientConfigCallback(new HttpClientConfigCallback() { @Override public HttpAsyncClientBuilder customizeHttpClient(HttpAsyncClientBuilder httpClientBuilder) { + if (auth) { + httpClientBuilder.setDefaultCredentialsProvider(credentialsProvider); + } // ignore issues with self-signed certificates httpClientBuilder.setSSLHostnameVerifier(NoopHostnameVerifier.INSTANCE); return httpClientBuilder.setSSLContext(sslContext); diff --git a/src/plugin/indexer-kafka/ivy.xml b/src/plugin/indexer-kafka/ivy.xml index 7bdd94324a..9d605c50b5 100644 --- a/src/plugin/indexer-kafka/ivy.xml +++ b/src/plugin/indexer-kafka/ivy.xml @@ -1,5 +1,4 @@ - + indexer-opensearch1x plugin for Nutch ================================ diff --git a/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.md b/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.md new file mode 100644 index 0000000000..c9b723ffcf --- /dev/null +++ b/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.md @@ -0,0 +1,50 @@ + + +1. Upgrade OpenSearch dependency in src/plugin/indexer-opensearch-1x/ivy.xml + +2. Upgrade the OpenSearch specific dependencies in src/plugin/indexer-opensearch-1x/plugin.xml + To get the list of dependencies and their versions execute: + $ cd src/plugin/indexer-opensearch-1x/ + $ ant -f ./build-ivy.xml + $ ls lib | sed 's/^/ /g' + + In the plugin.xml replace all lines between + + and + + with the output of the command above. + +4. (Optionally) remove overlapping dependencies between indexer-opensearch-1x and Nutch core dependencies: + - check for libs present both in + build/lib + and + build/plugins/indexer-opensearch-1x/ + (eventually with different versions) + - duplicated libs can be added to the exclusions of transitive dependencies in + build/plugins/indexer-opensearch-1x/ivy.xml + - but it should be made sure that the library versions in ivy/ivy.xml correspend to + those required by Tika + +5. Remove the locally "installed" dependencies in src/plugin/indexer-opensearch-1x/lib/: + + $ rm -rf lib/ + +6. 
Build Nutch and run all unit tests: + + $ cd ../../../ + $ ant clean runtime test \ No newline at end of file diff --git a/src/plugin/indexer-opensearch-1x/ivy.xml b/src/plugin/indexer-opensearch-1x/ivy.xml index 1505ad3c82..ae5d91e41e 100644 --- a/src/plugin/indexer-opensearch-1x/ivy.xml +++ b/src/plugin/indexer-opensearch-1x/ivy.xml @@ -40,7 +40,22 @@ + + + + + + + + + + + + + + + diff --git a/src/plugin/indexer-opensearch-1x/plugin.xml b/src/plugin/indexer-opensearch-1x/plugin.xml index 1bf5affc2f..ee0d45dc2a 100644 --- a/src/plugin/indexer-opensearch-1x/plugin.xml +++ b/src/plugin/indexer-opensearch-1x/plugin.xml @@ -22,50 +22,50 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/plugin/indexer-rabbit/README.md b/src/plugin/indexer-rabbit/README.md index 6ea09a9151..8040cd6c76 100644 --- a/src/plugin/indexer-rabbit/README.md +++ b/src/plugin/indexer-rabbit/README.md @@ -1,3 +1,20 @@ + + indexer-rabbit plugin for Nutch =============================== diff --git a/src/plugin/indexer-rabbit/ivy.xml b/src/plugin/indexer-rabbit/ivy.xml index dd450cf7f0..d2daf91dad 100644 --- a/src/plugin/indexer-rabbit/ivy.xml +++ b/src/plugin/indexer-rabbit/ivy.xml @@ -1,5 +1,4 @@ - + indexer-solr plugin for Nutch ============================= diff --git a/src/plugin/indexer-solr/howto_upgrade_solr.txt b/src/plugin/indexer-solr/howto_upgrade_solr.md similarity index 60% rename from src/plugin/indexer-solr/howto_upgrade_solr.txt rename to src/plugin/indexer-solr/howto_upgrade_solr.md index b2a7eb5c89..905fb84a9e 100644 --- a/src/plugin/indexer-solr/howto_upgrade_solr.txt +++ b/src/plugin/indexer-solr/howto_upgrade_solr.md @@ -1,3 +1,20 @@ + + 1. Upgrade Solr dependency in src/plugin/indexer-solr/ivy.xml 2. Upgrade the Solr specific dependencies in src/plugin/indexer-solr/plugin.xml diff --git a/src/plugin/indexer-solr/ivy.xml b/src/plugin/indexer-solr/ivy.xml index ce59942daf..ab5fd72c7a 100644 --- a/src/plugin/indexer-solr/ivy.xml +++ b/src/plugin/indexer-solr/ivy.xml @@ -1,15 +1,20 @@ + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> diff --git a/src/plugin/indexer-solr/plugin.xml b/src/plugin/indexer-solr/plugin.xml index d49641cf9c..21cc7d8bdf 100644 --- a/src/plugin/indexer-solr/plugin.xml +++ b/src/plugin/indexer-solr/plugin.xml @@ -1,14 +1,20 @@ - + @@ -17,7 +23,7 @@ - + diff --git a/src/plugin/language-identifier/ivy.xml b/src/plugin/language-identifier/ivy.xml index 395047c6fc..f64b97055b 100644 --- a/src/plugin/language-identifier/ivy.xml +++ b/src/plugin/language-identifier/ivy.xml @@ -1,5 +1,4 @@ - - - - - - + diff --git a/src/plugin/language-identifier/plugin.xml b/src/plugin/language-identifier/plugin.xml index 357c4a67cd..dab1a52f31 100644 --- a/src/plugin/language-identifier/plugin.xml +++ b/src/plugin/language-identifier/plugin.xml @@ -26,16 +26,7 @@ - - - - - - - - - - + diff --git a/src/plugin/lib-htmlunit/ivy.xml b/src/plugin/lib-htmlunit/ivy.xml index b03211667a..795e6b3358 100644 --- a/src/plugin/lib-htmlunit/ivy.xml +++ b/src/plugin/lib-htmlunit/ivy.xml @@ -1,5 +1,4 @@ - + # Updates * The use of phantomjs has been deprecated. 
Check [Wikipedia](https://en.wikipedia.org/wiki/PhantomJS) for more info. * The updated code for Safari webriver is under development as starting Safari 10 on OS X El Capitan and macOS Sierra, Safari comes bundled with a new driver implementation. diff --git a/src/plugin/any23/build.xml b/src/plugin/lib-selenium/howto_upgrade_selenium.md similarity index 51% rename from src/plugin/any23/build.xml rename to src/plugin/lib-selenium/howto_upgrade_selenium.md index 790b18548d..3071c74cbf 100644 --- a/src/plugin/any23/build.xml +++ b/src/plugin/lib-selenium/howto_upgrade_selenium.md @@ -1,4 +1,3 @@ - - - +1. Upgrade various driver versions dependency in src/plugin/lib-selenium/ivy.xml - - - - - +2. Upgrade Selenium's own dependencies in src/plugin/lib-selenium/plugin.xml - - - - - - - + To get a list of dependencies and their versions execute: + $ ant -f ./build-ivy.xml + $ ls lib | sed 's/^/ \n \n <\/library>/g' - + Note that all dependent libraries are exported for a "library" plugin ("lib-selenium"). + + N.B. The above Regex + Sed commands may not work if you are using MacOSX's Sed. In this instance you can instal GNU Sed as follows + + $ brew install gnu-sed --with-default-names + + You can then restart your terminal and the Regex + Sed command should work just fine! diff --git a/src/plugin/lib-selenium/howto_upgrade_selenium.txt b/src/plugin/lib-selenium/howto_upgrade_selenium.txt deleted file mode 100644 index 1892a6275e..0000000000 --- a/src/plugin/lib-selenium/howto_upgrade_selenium.txt +++ /dev/null @@ -1,15 +0,0 @@ -1. Upgrade various driver versions dependency in src/plugin/lib-selenium/ivy.xml - -2. Upgrade Selenium's own dependencies in src/plugin/lib-selenium/plugin.xml - - To get a list of dependencies and their versions execute: - $ ant -f ./build-ivy.xml - $ ls lib | sed 's/^/ \n \n <\/library>/g' - - Note that all dependent libraries are exported for a "library" plugin ("lib-selenium"). - - N.B. The above Regex + Sed commands may not work if you are using MacOSX's Sed. In this instance you can instal GNU Sed as follows - - $ brew install gnu-sed --with-default-names - - You can then restart your terminal and the Regex + Sed command should work just fine! diff --git a/src/plugin/lib-selenium/ivy.xml b/src/plugin/lib-selenium/ivy.xml index 7d3a2d6242..0d460cdb4d 100644 --- a/src/plugin/lib-selenium/ivy.xml +++ b/src/plugin/lib-selenium/ivy.xml @@ -1,5 +1,4 @@ - diff --git a/src/plugin/parse-js/sample/parse_pure_js_test.js b/src/plugin/parse-js/sample/parse_pure_js_test.js index f196313f85..0e486a8793 100644 --- a/src/plugin/parse-js/sample/parse_pure_js_test.js +++ b/src/plugin/parse-js/sample/parse_pure_js_test.js @@ -1,3 +1,18 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ // test data for link extraction from "pure" JavaScript function selectProvider(form) { diff --git a/src/plugin/parse-metatags/ivy.xml b/src/plugin/parse-metatags/ivy.xml index 956fd25efc..5c2c5b77e1 100644 --- a/src/plugin/parse-metatags/ivy.xml +++ b/src/plugin/parse-metatags/ivy.xml @@ -1,5 +1,4 @@ - diff --git a/src/plugin/parse-metatags/sample/testMultivalueMetatags.html b/src/plugin/parse-metatags/sample/testMultivalueMetatags.html index ca8b737c2b..36d2c8814a 100644 --- a/src/plugin/parse-metatags/sample/testMultivalueMetatags.html +++ b/src/plugin/parse-metatags/sample/testMultivalueMetatags.html @@ -1,3 +1,19 @@ + diff --git a/src/plugin/parse-tika/howto_upgrade_tika.md b/src/plugin/parse-tika/howto_upgrade_tika.md new file mode 100644 index 0000000000..8ed6c3f3cd --- /dev/null +++ b/src/plugin/parse-tika/howto_upgrade_tika.md @@ -0,0 +1,79 @@ + + +We are currently using a shim (https://github.com/tballison/hadoop-safe-tika) +because of binary conflicts in commons-io versions between what Hadoop supports and the more +modern commons-io features that Apache Tika and Apache POI use. + +For now, all you have to do is update the fat jar dependencies: + +1. tika-core-shaded in ivy/ivy.xml + +2. tika-parsers-standard-package-shaded in src/plugin/parse-tika/ivy.xml + +3. The version in the library name for tika-parsers-standard-package-shaded in src/plugin/parse-tika/plugin.xml (a sketch of steps 1-3 appears at the end of this document) + +4. Repeat steps 2 and 3 for the language-identifier plugin + +5. Build Nutch and run all unit tests: + + $ cd ../../../ + $ ant clean runtime test + +The following directions are what we used to do with thin jars. Hopefully, we'll +be able to get back to these directions once we have version harmony with Hadoop and Tika/POI. + +1. Upgrade the Tika dependency (tika-core) in ivy/ivy.xml + +2. Upgrade the Tika dependency in src/plugin/parse-tika/ivy.xml + +3. Upgrade Tika's own dependencies in src/plugin/parse-tika/plugin.xml + + To get the list of dependencies and their versions, execute: + $ cd src/plugin/parse-tika/ + $ ant -f ./build-ivy.xml + $ ls lib | sed 's/^/ /g' + + In the plugin.xml replace all lines between + + and + + with the output of the command above. + +4. (Optionally) remove overlapping dependencies between parse-tika and Nutch core dependencies: + - check for libs present both in + build/lib + and + build/plugins/parse-tika/ + (possibly with different versions) + - duplicated libs can be added to the exclusions of transitive dependencies in + build/plugins/parse-tika/ivy.xml + - but the library versions in ivy/ivy.xml MUST correspond to those required by Tika + +5. Remove the locally "installed" dependencies in src/plugin/parse-tika/lib/: + + $ rm -rf lib/ + +6. Repeat steps 2-5 for the language-identifier plugin, which also depends on Tika modules + + $ cd ../language-identifier/ + +7. Build Nutch and run all unit tests: + + $ cd ../../../ + $ ant clean runtime test + diff --git a/src/plugin/parse-tika/howto_upgrade_tika.txt b/src/plugin/parse-tika/howto_upgrade_tika.txt deleted file mode 100644 index cb3ed6be87..0000000000 --- a/src/plugin/parse-tika/howto_upgrade_tika.txt +++ /dev/null @@ -1,42 +0,0 @@ -1. Upgrade Tika dependency (tika-core) in ivy/ivy.xml - -2. Upgrade Tika dependency in src/plugin/parse-tika/ivy.xml - -3.
Upgrade Tika's own dependencies in src/plugin/parse-tika/plugin.xml - - To get the list of dependencies and their versions execute: - $ cd src/plugin/parse-tika/ - $ ant -f ./build-ivy.xml - $ ls lib | sed 's/^/ /g' - - In the plugin.xml replace all lines between - - and - - with the output of the command above. - -4. (Optionally) remove overlapping dependencies between parse-tika and Nutch core dependencies: - - check for libs present both in - build/lib - and - build/plugins/parse-tika/ - (eventually with different versions) - - duplicated libs can be added to the exclusions of transitive dependencies in - build/plugins/parse-tika/ivy.xml - - but the library versions in ivy/ivy.xml MUST correspond to those required by Tika - -5. Remove the locally "installed" dependencies in src/plugin/parse-tika/lib/: - - $ rm -rf lib/ - -6. Repeat steps 2-5 for the language-identifier plugin which also depends on Tika modules - - $ cd ../language-identifier/ - -It should be noted that Any23 also has a dependency on Tika so you may wish to check that there are no classpath conflicts in the any23 plugin as well. - -7. Build Nutch and run all unit tests: - - $ cd ../../../ - $ ant clean runtime test - diff --git a/src/plugin/parse-tika/ivy.xml b/src/plugin/parse-tika/ivy.xml index f0ec7a8d8c..f16636d255 100644 --- a/src/plugin/parse-tika/ivy.xml +++ b/src/plugin/parse-tika/ivy.xml @@ -1,5 +1,4 @@ - - - - - - - - - - - - - - - + diff --git a/src/plugin/parse-tika/plugin.xml b/src/plugin/parse-tika/plugin.xml index d88405bc1c..a20fa7266a 100644 --- a/src/plugin/parse-tika/plugin.xml +++ b/src/plugin/parse-tika/plugin.xml @@ -25,86 +25,7 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + diff --git a/src/plugin/parse-tika/sample/nutch.html b/src/plugin/parse-tika/sample/nutch.html index 0aa7c98959..8098535126 100644 --- a/src/plugin/parse-tika/sample/nutch.html +++ b/src/plugin/parse-tika/sample/nutch.html @@ -1,3 +1,19 @@ + diff --git a/src/plugin/parse-zip/ivy.xml b/src/plugin/parse-zip/ivy.xml index 956fd25efc..5c2c5b77e1 100644 --- a/src/plugin/parse-zip/ivy.xml +++ b/src/plugin/parse-zip/ivy.xml @@ -1,5 +1,4 @@ - + Nutch Interactive Selenium ========================== diff --git a/src/plugin/protocol-interactiveselenium/ivy.xml b/src/plugin/protocol-interactiveselenium/ivy.xml index 506be0aecb..112483bcdc 100644 --- a/src/plugin/protocol-interactiveselenium/ivy.xml +++ b/src/plugin/protocol-interactiveselenium/ivy.xml @@ -1,5 +1,4 @@ - + 1. Upgrade OkHttp dependency in src/plugin/protocol-okhttp/ivy.xml 2. Upgrade OkHttp's own dependencies in src/plugin/protocol-okhttp/plugin.xml diff --git a/src/plugin/protocol-okhttp/ivy.xml b/src/plugin/protocol-okhttp/ivy.xml index ead8232474..73b4fa6369 100644 --- a/src/plugin/protocol-okhttp/ivy.xml +++ b/src/plugin/protocol-okhttp/ivy.xml @@ -1,5 +1,4 @@ - + Nutch Selenium ============== diff --git a/src/plugin/protocol-selenium/ivy.xml b/src/plugin/protocol-selenium/ivy.xml index 506be0aecb..112483bcdc 100644 --- a/src/plugin/protocol-selenium/ivy.xml +++ b/src/plugin/protocol-selenium/ivy.xml @@ -1,5 +1,4 @@ - Filters URLs based on a file of regular expressions using host/domains matching first. 
The default policy is to accept a URL if no matches are found. diff --git a/src/plugin/urlfilter-fast/ivy.xml b/src/plugin/urlfilter-fast/ivy.xml index 956fd25efc..5c2c5b77e1 100644 --- a/src/plugin/urlfilter-fast/ivy.xml +++ b/src/plugin/urlfilter-fast/ivy.xml @@ -1,5 +1,4 @@ - + urlfilter-ignoreexempt ====================== This plugin allows certain URLs to be exempted when external links are configured to be ignored. diff --git a/src/plugin/urlfilter-ignoreexempt/ivy.xml b/src/plugin/urlfilter-ignoreexempt/ivy.xml index 956fd25efc..5c2c5b77e1 100644 --- a/src/plugin/urlfilter-ignoreexempt/ivy.xml +++ b/src/plugin/urlfilter-ignoreexempt/ivy.xml @@ -1,5 +1,4 @@ - diff --git a/src/test/filter-all.txt b/src/test/filter-all.txt index 4ed567ab1c..d738aec76a 100644 --- a/src/test/filter-all.txt +++ b/src/test/filter-all.txt @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # Config file for urlfilter-suffix plugin # Filter away all urls diff --git a/src/test/log4j.properties b/src/test/log4j.properties index 3ff115f46f..08e272c712 100644 --- a/src/test/log4j.properties +++ b/src/test/log4j.properties @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ # log4j configuration used during build and unit tests log4j.rootLogger=info,stdout diff --git a/src/test/nutch-site.xml b/src/test/nutch-site.xml index dd408739dc..0d6177e5e6 100644 --- a/src/test/nutch-site.xml +++ b/src/test/nutch-site.xml @@ -1,4 +1,20 @@ + diff --git a/src/testresources/fetch-test-site/dup_of_pagea.html b/src/testresources/fetch-test-site/dup_of_pagea.html index 6444c41225..63c4e61537 100644 --- a/src/testresources/fetch-test-site/dup_of_pagea.html +++ b/src/testresources/fetch-test-site/dup_of_pagea.html @@ -1,3 +1,19 @@ + page a diff --git a/src/testresources/fetch-test-site/exception.html b/src/testresources/fetch-test-site/exception.html index e1192a176b..66f134ee25 100644 --- a/src/testresources/fetch-test-site/exception.html +++ b/src/testresources/fetch-test-site/exception.html @@ -1,3 +1,19 @@ + diff --git a/src/testresources/fetch-test-site/index.html b/src/testresources/fetch-test-site/index.html index d73ff3f691..3fc6e61e5a 100644 --- a/src/testresources/fetch-test-site/index.html +++ b/src/testresources/fetch-test-site/index.html @@ -1,3 +1,19 @@ + front page diff --git a/src/testresources/fetch-test-site/nested_spider_trap.html b/src/testresources/fetch-test-site/nested_spider_trap.html index 5dcf7c2209..dd32ee2362 100644 --- a/src/testresources/fetch-test-site/nested_spider_trap.html +++ b/src/testresources/fetch-test-site/nested_spider_trap.html @@ -1,3 +1,19 @@ + nested spider trap diff --git a/src/testresources/fetch-test-site/pagea.html b/src/testresources/fetch-test-site/pagea.html index 6444c41225..63c4e61537 100644 --- a/src/testresources/fetch-test-site/pagea.html +++ b/src/testresources/fetch-test-site/pagea.html @@ -1,3 +1,19 @@ + page a diff --git a/src/testresources/fetch-test-site/pageb.html b/src/testresources/fetch-test-site/pageb.html index 66e3725ef0..cf77ff4f75 100644 --- a/src/testresources/fetch-test-site/pageb.html +++ b/src/testresources/fetch-test-site/pageb.html @@ -1,3 +1,19 @@ + bage b diff --git a/src/testresources/fetch-test-site/robots.txt b/src/testresources/fetch-test-site/robots.txt index e69de29bb2..fc590f9733 100644 --- a/src/testresources/fetch-test-site/robots.txt +++ b/src/testresources/fetch-test-site/robots.txt @@ -0,0 +1,14 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file
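A worked example of the ls lib | sed pipeline referenced in the lib-selenium upgrade notes above. This is a minimal sketch: it assumes the pipeline's purpose is to wrap each jar name from lib/ in a plugin.xml library entry; the jar name below is hypothetical, and the exact tag layout should be checked against an existing plugin.xml.

   $ ls lib
   selenium-java-4.14.1.jar
   $ ls lib | sed 's/^/<library name="/; s/$/"\/>/'
   <library name="selenium-java-4.14.1.jar"/>

Each generated entry is pasted into the plugin's runtime section in plugin.xml; for a "library" plugin such as lib-selenium, every listed jar is additionally exported so that dependent plugins can use it.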
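To illustrate steps 1-3 of the fat-jar update in howto_upgrade_tika.md above: the version chosen for the shaded Tika jars appears both in the ivy files and in the library name in plugin.xml, and the two must stay in sync. A minimal sketch; the org and rev values below are placeholders, not the shim's actual coordinates.

   <!-- src/plugin/parse-tika/ivy.xml (org and rev are placeholders) -->
   <dependency org="SHIM_GROUP_ID" name="tika-parsers-standard-package-shaded"
               rev="X.Y.Z" conf="*->default"/>

   <!-- src/plugin/parse-tika/plugin.xml: the version in the jar file name
        must match the rev declared in ivy.xml -->
   <library name="tika-parsers-standard-package-shaded-X.Y.Z.jar"/>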
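For the urlfilter-fast plugin described above, a hypothetical rules file is sketched below. It assumes a rule format in which regular expressions are grouped under host and domain lines with DenyPath and DenyPathQuery keywords; this should be verified against the plugin's documentation. Any URL matching no rule is accepted, per the default policy.

   # deny selected paths on one specific host
   host www.example.com
   DenyPath /private/.*
   DenyPathQuery /login\?.*

   # deny a path pattern for a domain and all of its subdomains
   domain example.org
   DenyPath .*/calendar/.*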
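Finally, only the log4j.rootLogger line of src/test/log4j.properties is visible in the hunk above. A minimal sketch of the stdout console appender that such a test configuration pairs with that line; the appender names and layout pattern are assumptions, not the file's actual contents.

   log4j.rootLogger=info,stdout
   # console appender used while running the unit tests
   log4j.appender.stdout=org.apache.log4j.ConsoleAppender
   log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
   log4j.appender.stdout.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} - %m%n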