Skip to content

Commit

Permalink
Merge branch 'master' into cc
Browse files Browse the repository at this point in the history
Notable changes:
- NUTCH-2959 upgrade Tika to 2.9.0
- NUTCH-2990 HttpRobotRulesParser to follow 5 redirects as specified by RFC 9309
- NUTCH-3011 HttpRobotRulesParser: handle HTTP 429 Too Many Requests same as server errors (HTTP 5xx)
- NUTCH-3002 Protocol-okhttp HttpResponse: HTTP header metadata lookup should be case-insensitive

Notes:
- the upgrade to Tika 2.9.0 is based on a shaded package
  to get around a conflict with the Hadoop-provided dependency
  on commons-io (Hadoop ships with 2.8.0 but Tika requires 2.11.0)
- because no module-level shaded Tika packages are available,
  Nutch core for now includes the Tika standard parsers package,
  and parse-tika relies on the package provided via Nutch core
- cf. the comments and modifications in ivy/ivy.xml and
  src/plugin/parse-tika/{ivy,plugin}.xml
  • Loading branch information
sebastian-nagel committed Oct 31, 2023
2 parents b76798b + 792ed28 commit f3f948e
Show file tree
Hide file tree
Showing 221 changed files with 2,149 additions and 5,612 deletions.
37 changes: 37 additions & 0 deletions .github/workflows/dependency-check.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Nightly scan of the project's dependencies for known security
# vulnerabilities (runs the Ant "dependency-check" target of build.xml).
name: dependency check

on:
  schedule:
    - cron: '0 0 * * *'  # every day at midnight (GitHub evaluates cron in UTC)

jobs:
  dependency-check:
    strategy:
      matrix:
        java: ['11']
        os: [ubuntu-latest]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v4
      - name: Set up JDK ${{ matrix.java }}
        uses: actions/setup-java@v3
        with:
          java-version: ${{ matrix.java }}
          distribution: 'temurin'
      - name: Dependency check
        run: ant clean dependency-check -buildfile build.xml
65 changes: 51 additions & 14 deletions .github/workflows/master-build.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
Expand All @@ -13,29 +12,67 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

name: master pr build
name: master pull request ci

on:
push:
branches: [ master ]
branches: [master]
pull_request:
branches: [ master ]
types: [opened, synchronize, reopened]
branches: [master]

jobs:
build:
runs-on: ubuntu-latest
javadoc:
strategy:
matrix:
java: [ '11' ]

java: ['11']
os: [ubuntu-latest]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
- name: Set up JDK ${{ matrix.java }}
uses: actions/setup-java@v3
with:
java-version: ${{ matrix.java }}
distribution: 'temurin'
- name: Javadoc
run: ant clean javadoc -buildfile build.xml
# Runs Apache Rat (Ant target "run-rat") and fails the job unless the report
# states "0 Unknown Licenses".
rat:
  strategy:
    matrix:
      java: ['11']
      os: [ubuntu-latest]
  runs-on: ${{ matrix.os }}
  steps:
    - uses: actions/checkout@v4
    - name: Set up JDK ${{ matrix.java }}
      uses: actions/setup-java@v3
      with:
        java-version: ${{ matrix.java }}
        distribution: 'temurin'
    - name: Run Apache Rat
      run: ant clean run-rat -buildfile build.xml
    # Exports line 18 of the Rat report to the job environment.
    # NOTE(review): presumably line 18 is the "<n> Unknown Licenses" summary
    # line -- confirm whenever the Apache Rat version is upgraded.
    # The report is located via $GITHUB_WORKSPACE instead of a hard-coded
    # /home/runner/work/nutch/nutch prefix, so the job also works in forks
    # or repositories with a different name and on self-hosted runners.
    - name: Cache unknown licenses
      run: |
        echo "UNKNOWN_LICENSES=$(sed -n 18p "$GITHUB_WORKSPACE/build/apache-rat-report.txt")" >> "$GITHUB_ENV"
    - name: Print unknown licenses summary
      run: |
        echo "$UNKNOWN_LICENSES"
    - name: Fail if any unknown licenses
      if: ${{ env.UNKNOWN_LICENSES != '0 Unknown Licenses' }}
      run: exit 1
test:
strategy:
matrix:
java: ['11']
os: [ubuntu-latest, macos-latest]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- name: Set up JDK ${{ matrix.java }}
uses: actions/setup-java@v1
uses: actions/setup-java@v3
with:
java-version: ${{ matrix.java }}
- name: Build with Ant
run: ant clean nightly javadoc -buildfile build.xml
distribution: 'temurin'
- name: Test
run: ant clean test -buildfile build.xml
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,5 @@ naivebayes-model
csvindexwriter
lib/spotbugs-*
ivy/dependency-check-ant/*
.gradle*
ivy/apache-rat-*
7 changes: 0 additions & 7 deletions LICENSE-binary
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,6 @@ This product bundles some components that are also licensed under
the Apache License Version 2.0:


ch.qos.reload4j:reload4j
com.101tec:zkclient
com.amazonaws:aws-java-sdk-cloudsearch
com.amazonaws:aws-java-sdk-core
Expand Down Expand Up @@ -327,11 +326,6 @@ net.sourceforge.owlapi:owlapi-impl
net.sourceforge.owlapi:owlapi-parsers
net.sourceforge.owlapi:owlapi-rio
net.sourceforge.owlapi:owlapi-tools
org.apache.any23:apache-any23-api
org.apache.any23:apache-any23-core
org.apache.any23:apache-any23-csvutils
org.apache.any23:apache-any23-encoding
org.apache.any23:apache-any23-mime
org.apache.avro:avro
org.apache.commons:commons-collections4
org.apache.commons:commons-compress
Expand Down Expand Up @@ -758,7 +752,6 @@ org.jsoup:jsoup
org.rypt:f8
org.slf4j:jcl-over-slf4j
org.slf4j:slf4j-api
org.slf4j:slf4j-reload4j


Mozilla Public License 1.1 (MPL 1.1)
Expand Down
13 changes: 1 addition & 12 deletions NOTICE-binary
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ code and source code.

The following provides more details on the included cryptographic software:

The plugins parse-tika and any23 use Apache Tika and the Bouncy Castle
The parse-tika plugin uses Apache Tika and the Bouncy Castle
generic encryption libraries for extracting text content and metadata
from encrypted PDF files. See <https://www.bouncycastle.org/> for more
details on Bouncy Castle and <https://tika.apache.org/> for details
Expand All @@ -46,9 +46,6 @@ on Apache Tika.
Apache projects
---------------

Apache Any23 (https://any23.apache.org/)
see https://github.com/apache/any23/blob/master/NOTICE.txt

Apache Avro (https://avro.apache.org)
see https://github.com/apache/avro/blob/master/NOTICE.txt

Expand Down Expand Up @@ -163,10 +160,6 @@ AOP alliance (http://aopalliance.sourceforge.net)
- license: Public Domain
(licenses-binary/LICENSE-public-domain.txt)

# ch.qos.reload4j:reload4j
reload4j (https://reload4j.qos.ch)
- license: The Apache Software License, Version 2.0

# com.101tec:zkclient
ZkClient (https://github.com/sgroschupf/zkclient)
- license: The Apache Software License, Version 2.0
Expand Down Expand Up @@ -1100,10 +1093,6 @@ JCL 1.2 implemented over SLF4J (http://www.slf4j.org)
(licenses-binary/LICENSE-mit-license.txt)
# org.slf4j:slf4j-api
SLF4J API Module (http://www.slf4j.org)
- license: MIT License
(licenses-binary/LICENSE-mit-license.txt)
# org.slf4j:slf4j-reload4j
SLF4J Reload4j Binding (http://reload4j.qos.ch)
- license: MIT License
(licenses-binary/LICENSE-mit-license.txt)

Expand Down
2 changes: 1 addition & 1 deletion NOTICE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ code and source code.

The following provides more details on the included cryptographic software:

The plugins parse-tika and any23 use Apache Tika and the Bouncy Castle
The parse-tika plugin uses Apache Tika and the Bouncy Castle
generic encryption libraries for extracting text content and metadata
from encrypted PDF files. See <https://www.bouncycastle.org/> for more
details on Bouncy Castle and <https://tika.apache.org/> for details
Expand Down
57 changes: 43 additions & 14 deletions build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
<property name="maven-javadoc-jar" value="${release.dir}/${artifactId}-${version}-javadoc.jar" />
<property name="maven-sources-jar" value="${release.dir}/${artifactId}-${version}-sources.jar" />

<property name="dependency-check-ant.version" value="7.1.1" />
<property name="dependency-check-ant.version" value="8.4.2" />
<property name="dependency-check-ant.home" value="${ivy.dir}/dependency-check-ant" />
<property name="dependency-check-ant.jar" value="${dependency-check-ant.home}/dependency-check-ant.jar" />

Expand All @@ -48,7 +48,7 @@
<property name="spotbugs.home" value="${ivy.dir}/spotbugs-${spotbugs.version}" />
<property name="spotbugs.jar" value="${spotbugs.home}/lib/spotbugs-ant.jar" />

<property name="apache-rat.version" value="0.14" />
<property name="apache-rat.version" value="0.15" />
<property name="apache-rat.home" value="${ivy.dir}/apache-rat-${apache-rat.version}" />
<property name="apache-rat.jar" value="${apache-rat.home}/apache-rat-${apache-rat.version}.jar" />

Expand Down Expand Up @@ -202,7 +202,6 @@
<arg value="--no-module-directories" if:set="using.jdk.11"/>

<packageset dir="${src.dir}"/>
<packageset dir="${plugins.dir}/any23/src/java/"/>
<packageset dir="${plugins.dir}/creativecommons/src/java"/>
<packageset dir="${plugins.dir}/feed/src/java"/>
<packageset dir="${plugins.dir}/headings/src/java"/>
Expand Down Expand Up @@ -642,13 +641,15 @@
</fileset>
</path>

<target name="report-vulnerabilities" depends="jar, compile-plugins, dependency-check-ant-download" description="--> check dependencies for security vulnerabilities">
<target name="dependency-check" depends="jar, compile-plugins, dependency-check-ant-download" description="--> check dependencies for security vulnerabilities">
<taskdef resource="dependency-check-taskdefs.properties">
<classpath refid="dependency-check-ant.path" />
</taskdef>
<dependency-check projectname="${name}"
reportoutputdirectory="${dependency-check-ant.home}"
reportformat="ALL">
reportformat="ALL"
assemblyAnalyzerEnabled="false"
failBuildOnCVSS="1">
<suppressionfile path="${dependency-check-ant.home}/dependency-check-suppressions.xml" />
<retirejsFilter regex="copyright.*jeremy long" />
<fileset dir="${build.dir}">
Expand Down Expand Up @@ -688,7 +689,6 @@
<arg value="--no-module-directories" if:set="using.jdk.11"/>

<packageset dir="${src.dir}"/>
<packageset dir="${plugins.dir}/any23/src/java/" />
<packageset dir="${plugins.dir}/creativecommons/src/java"/>
<packageset dir="${plugins.dir}/feed/src/java"/>
<packageset dir="${plugins.dir}/headings/src/java"/>
Expand Down Expand Up @@ -774,7 +774,6 @@
<classpath>
<fileset dir="${build.plugins}" >
<include name="**/*.jar"/>
<exclude name="any23/javax.annotation-api*.jar"/>
</fileset>
</classpath>

Expand Down Expand Up @@ -1030,7 +1029,7 @@

<target name="apache-rat-download-unchecked" unless="apache-rat.jar.found"
description="--> downloads the Apache Rat jar">
<get src="https://www.apache.org/dist/creadur/apache-rat-${apache-rat.version}/apache-rat-${apache-rat.version}-bin.tar.gz"
<get src="https://archive.apache.org/dist/creadur/apache-rat-${apache-rat.version}/apache-rat-${apache-rat.version}-bin.tar.gz"
dest="${ivy.dir}/apache-rat-${apache-rat.version}-bin.tar.gz" usetimestamp="false" />

<untar src="${ivy.dir}/apache-rat-${apache-rat.version}-bin.tar.gz"
Expand All @@ -1040,8 +1039,8 @@
<delete file="${ivy.dir}/apache-rat-${apache-rat.version}-bin.tar.gz" />
</target>

<target name="rat-sources" depends="init, apache-rat-download"
description="--> runs RAT tasks over src/java">
<target name="run-rat" depends="init, apache-rat-download"
description="--> runs Apache Rat on codebase">
<taskdef
uri="antlib:org.apache.rat.anttasks"
resource="org/apache/rat/anttasks/antlib.xml">
Expand All @@ -1052,8 +1051,40 @@
<rat:report
reportFile="${build.dir}/apache-rat-report.txt">
<fileset dir="src">
<include name="java/**/*"/>
<include name="plugin/**/src/**/*"/>
<include name="**"/>
<exclude name="plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/langmappings.properties"/>
<exclude name="plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/de.test"/>
<exclude name="plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/en.test"/>
<exclude name="plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/es.test"/>
<exclude name="plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/fi.test"/>
<exclude name="plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/fr.test"/>
<exclude name="plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/it.test"/>
<exclude name="plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/nl.test"/>
<exclude name="plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/pt.test"/>
<exclude name="plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/sv.test"/>
<exclude name="plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/test-referencial.txt"/>
<exclude name="plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/da.test"/>
<exclude name="plugin/parse-tika/sample/ootest.txt"/>
<exclude name="plugin/parse-tika/sample/test.rtf"/>
<exclude name="plugin/urlfilter-ignoreexempt/data/.donotdelete"/>
<exclude name="plugin/urlfilter-automaton/sample/Benchmarks.rules"/>
<exclude name="plugin/urlfilter-automaton/sample/Benchmarks.urls"/>
<exclude name="plugin/urlfilter-automaton/sample/IntranetCrawling.rules"/>
<exclude name="plugin/urlfilter-automaton/sample/IntranetCrawling.urls"/>
<exclude name="plugin/urlfilter-automaton/sample/WholeWebCrawling.rules"/>
<exclude name="plugin/urlfilter-automaton/sample/WholeWebCrawling.urls"/>
<exclude name="plugin/urlfilter-fast/sample/Benchmarks.urls"/>
<exclude name="plugin/urlfilter-fast/sample/fast-urlfilter-benchmark.txt"/>
<exclude name="plugin/urlfilter-fast/sample/fast-urlfilter-test.txt"/>
<exclude name="plugin/urlfilter-fast/sample/test.urls"/>
<exclude name="plugin/urlfilter-regex/sample/Benchmarks.rules"/>
<exclude name="plugin/urlfilter-regex/sample/Benchmarks.urls"/>
<exclude name="plugin/urlfilter-regex/sample/IntranetCrawling.rules"/>
<exclude name="plugin/urlfilter-regex/sample/IntranetCrawling.urls"/>
<exclude name="plugin/urlfilter-regex/sample/WholeWebCrawling.rules"/>
<exclude name="plugin/urlfilter-regex/sample/WholeWebCrawling.urls"/>
<exclude name="plugin/urlfilter-regex/sample/nutch1838.rules"/>
<exclude name="plugin/urlfilter-regex/sample/nutch1838.urls"/>
</fileset>
</rat:report>
</target>
Expand Down Expand Up @@ -1182,8 +1213,6 @@
<source path="${basedir}/src/java/" />
<source path="${basedir}/src/test/" output="build/test/classes" />

<source path="${plugins.dir}/any23/src/java/" />
<source path="${plugins.dir}/any23/src/test/" />
<source path="${plugins.dir}/creativecommons/src/java/" />
<source path="${plugins.dir}/creativecommons/src/test/" />
<source path="${plugins.dir}/feed/src/java/" />
Expand Down
Loading

0 comments on commit f3f948e

Please sign in to comment.