Hearts face the prospect of running out of money by the end
-of April if a deal to take the club out of administration is not agreed
-soon. BBC Sport
-
-
Does it matter that an unnamed minister of unknown status
- follows an undiscernible motivation and gives an off-the-record comment
- to The Guardian? Frankly, yes it does.
This
- page is best viewed in an up-to-date web browser with style sheets
-(CSS) enabled. While you will be able to view the content of this page
-in your current browser, you will not be able to get the full visual
-experience. Please consider upgrading your browser software or enabling
-style sheets (CSS) if you are able to do so.
Hedral is a male american domestic shorthair,
- with a fluffy black fur with white paws and belly.
-
-
-
-
-
-
Title
-
The Reality Dysfunction
-
Author
-
Peter F. Hamilton
-
Publication date
-
-
-
-
-
-
-
Hedral
-
Hedral is a male american domestic shorthair, with a fluffy
- black fur with
- white paws and belly.
-
-
-
-
-
-
\ No newline at end of file
diff --git a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java
deleted file mode 100644
index 09dc32e02d..0000000000
--- a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.any23;
-
-import java.util.HashMap;
-import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.IndexingFilter;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.parse.Parse;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- *
This implementation of {@link org.apache.nutch.indexer.IndexingFilter}
- * adds a triple(s) field to the {@link org.apache.nutch.indexer.NutchDocument}.
- * @see org.apache.nutch.any23.Any23ParseFilter
- */
-public class Any23IndexingFilter implements IndexingFilter {
-
- /** Logging instance */
- private static final Logger LOG = LoggerFactory.getLogger(Any23IndexingFilter.class);
-
- public static final String STRUCTURED_DATA = "structured_data";
-
- private Configuration conf;
-
- /**
- * Get the {@link Configuration} object
- * @see org.apache.hadoop.conf.Configurable#getConf()
- */
- @Override
- public Configuration getConf() {
- return this.conf;
- }
-
- /**
- * Set the {@link Configuration} object
- * @see org.apache.hadoop.conf.Configurable#setConf(org.apache.hadoop.conf.Configuration)
- */
- @Override
- public void setConf(Configuration conf) {
- this.conf = conf;
- }
-
- /**
- *
- * @param doc
- * document instance for collecting fields
- * @param parse
- * parse data instance
- * @param url
- * page url
- * @param datum
- * crawl datum for the page (fetch datum from segment containing
- * fetch status and fetch time)
- * @param inlinks
- * page inlinks
- * @return filtered NutchDocument
- * @see org.apache.nutch.indexer.IndexingFilter#filter(NutchDocument, Parse, Text, CrawlDatum, Inlinks)
- *
- * @throws IndexingException if there is a fatl error whilst indexing
- */
- @Override
- public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
- String[] metadata = parse.getData().getParseMeta().getValues(Any23ParseFilter.ANY23_TRIPLES);
-
- if (metadata != null) {
- for (String triple : metadata) {
- Pattern pattern = Pattern.compile("^([^ ]+) ([^ ]+) (.+) \\.");
- Matcher matcher = pattern.matcher(triple);
- if (matcher.find()) {
- Map map = new HashMap<>();
- map.put("node", matcher.group(1));
- map.put("key", matcher.group(2));
- map.put("short_key", keyToShortKey(matcher.group(2)));
- map.put("value", matcher.group(3));
- doc.add("structured_data", map);
- } else {
- LOG.warn("Unsupported triple format " + triple);
- }
- }
- }
- return doc;
- }
-
- private static String keyToShortKey(String key) {
- if (key.startsWith("<") && key.endsWith(">")) {
- key = key.substring(1, key.length() - 1);
- }
- String[] keyParts = key.split("/");
- String[] keySubParts = keyParts[keyParts.length - 1].split("#");
- return keySubParts[keySubParts.length - 1];
- }
-}
diff --git a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
deleted file mode 100644
index bed659f352..0000000000
--- a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.any23;
-
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.net.URISyntaxException;
-import java.nio.charset.Charset;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.Set;
-import java.util.TreeSet;
-
-import org.apache.any23.Any23;
-import org.apache.any23.extractor.ExtractionException;
-import org.apache.any23.filter.IgnoreAccidentalRDFa;
-import org.apache.any23.filter.IgnoreTitlesOfEmptyDocuments;
-import org.apache.any23.mime.TikaMIMETypeDetector;
-import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
-import org.apache.any23.writer.BenchmarkTripleHandler;
-import org.apache.any23.writer.NTriplesWriter;
-import org.apache.any23.writer.TripleHandler;
-import org.apache.any23.writer.TripleHandlerException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.HTMLMetaTags;
-import org.apache.nutch.parse.HtmlParseFilter;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.protocol.Content;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.w3c.dom.DocumentFragment;
-
-/**
- *
This implementation of {@link org.apache.nutch.parse.HtmlParseFilter}
- * uses the Apache Any23 library
- * for parsing and extracting structured data in RDF format from a
- * variety of Web documents. The supported formats can be found at Apache Any23.
- *
In this implementation triples are written as Notation3
- * and triples are identified within output triple streams by the presence of '\n'.
- * The presence of the '\n' is a characteristic specific to N3 serialization in Any23.
- * In order to use another/other writers implementing the
- * TripleHandler
- * interface, we will most likely need to identify an alternative data characteristic
- * which we can use to split triples streams.
- */
-public class Any23ParseFilter implements HtmlParseFilter {
-
- /** Logging instance */
- private static final Logger LOG = LoggerFactory.getLogger(Any23ParseFilter.class);
-
- private Configuration conf = null;
-
- /**
- * Constant identifier used as a Key for writing and reading
- * triples to and from the metadata Map field.
- */
- public static final String ANY23_TRIPLES = "Any23-Triples";
-
- public static final String ANY_23_EXTRACTORS_CONF = "any23.extractors";
- public static final String ANY_23_CONTENT_TYPES_CONF = "any23.content_types";
-
- private static class Any23Parser {
-
- Set triples = null;
-
- Any23Parser(String url, String htmlContent, String contentType, String... extractorNames) throws TripleHandlerException {
- this.triples = new TreeSet<>();
- try {
- parse(url, htmlContent, contentType, extractorNames);
- } catch (URISyntaxException e) {
- LOG.error("Error parsing URI: {}", url, e);
- throw new RuntimeException(e.getReason());
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
-
- /**
- * Maintains a {@link java.util.Set} containing the triples
- * @return a {@link java.util.Set} of triples.
- */
- Set getTriples() {
- return this.triples;
- }
-
- private void parse(String url, String htmlContent, String contentType, String... extractorNames)
- throws URISyntaxException, IOException, TripleHandlerException {
- Any23 any23 = new Any23(extractorNames);
- any23.setMIMETypeDetector(new TikaMIMETypeDetector(new WhiteSpacesPurifier()));
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- try (TripleHandler tHandler = new IgnoreTitlesOfEmptyDocuments(
- new IgnoreAccidentalRDFa(
- new NTriplesWriter(baos)));
- BenchmarkTripleHandler bHandler = new BenchmarkTripleHandler(tHandler)) {
- try {
- any23.extract(htmlContent, url, contentType, "UTF-8", bHandler);
- } catch (IOException e) {
- LOG.error("Error while reading the source", e);
- } catch (ExtractionException e) {
- LOG.error("Error while extracting structured data", e);
- }
-
- LOG.debug("Any23 BenchmarkTripleHandler.report: " + bHandler.report());
-
- String n3 = baos.toString("UTF-8");
- String[] triplesStrings = n3.split("\n");
- Collections.addAll(this.triples, triplesStrings);
- } catch (IOException e) {
- LOG.error("Unexpected IOException", e);
- }
- }
- }
-
- @Override
- public Configuration getConf() {
- return this.conf;
- }
-
- @Override
- public void setConf(Configuration conf) {
- this.conf = conf;
- }
-
- /**
- * @see org.apache.nutch.parse.HtmlParseFilter#filter(Content, ParseResult, HTMLMetaTags, DocumentFragment)
- */
- @Override
- public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
- String[] extractorNames = this.conf.getStrings(ANY_23_EXTRACTORS_CONF, "html-head-meta");
- String[] supportedContentTypes = this.conf.getStrings(ANY_23_CONTENT_TYPES_CONF, "text/html", "application/xhtml+xml");
- String contentType = content.getContentType();
- if (supportedContentTypes != null && !Arrays.asList(supportedContentTypes).contains(contentType)) {
- LOG.debug("Ignoring document at {} because it has an unsupported Content-Type {}", content.getUrl(), contentType);
- return parseResult;
- }
-
- Any23Parser parser;
- try {
- String htmlContent = new String(content.getContent(), Charset.forName("UTF-8"));
- parser = new Any23Parser(content.getUrl(), htmlContent, contentType, extractorNames);
- } catch (TripleHandlerException e) {
- throw new RuntimeException("Error running Any23 parser: " + e.getMessage());
- }
- Set triples = parser.getTriples();
-
- Parse parse = parseResult.get(content.getUrl());
- Metadata metadata = parse.getData().getParseMeta();
-
- for (String triple : triples) {
- metadata.add(ANY23_TRIPLES, triple);
- }
-
- return parseResult;
- }
-}
diff --git a/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23IndexingFilter.java b/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23IndexingFilter.java
deleted file mode 100644
index 1367e19c46..0000000000
--- a/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23IndexingFilter.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.any23;
-
-import java.util.List;
-import java.util.Map;
-
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-public class TestAny23IndexingFilter {
- @Test
- public void testAny23TriplesFields() throws Exception {
- Configuration conf = NutchConfiguration.create();
- Any23IndexingFilter filter = new Any23IndexingFilter();
- filter.setConf(conf);
- Assert.assertNotNull(filter);
- NutchDocument doc = new NutchDocument();
- ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "The Foo Page",
- new Outlink[] { }, new Metadata());
- ParseImpl parse = new ParseImpl("test page", parseData);
- String[] triples = new String[]{
- " .",
- " \"77\" .",
- " \"Zurique\"@pt ."
- };
- for (String triple : triples) {
- parse.getData().getParseMeta().add(Any23ParseFilter.ANY23_TRIPLES, triple);
- }
- try {
- doc = filter.filter(doc, parse, new Text("http://nutch.apache.org/"), new CrawlDatum(), new Inlinks());
- } catch (Exception e) {
- e.printStackTrace();
- Assert.fail(e.getMessage());
- }
- List
-
+
+
+
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index ef5c6b5e81..0600944375 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -35,7 +35,6 @@
-
@@ -115,7 +114,6 @@
-
@@ -177,7 +175,6 @@
-
diff --git a/src/plugin/creativecommons/conf/crawl-urlfilter.txt b/src/plugin/creativecommons/conf/crawl-urlfilter.txt
index 324617f07a..eb6786e4b4 100644
--- a/src/plugin/creativecommons/conf/crawl-urlfilter.txt
+++ b/src/plugin/creativecommons/conf/crawl-urlfilter.txt
@@ -1,3 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
# Creative Commnons crawl filter
# Each non-comment, non-blank line contains a regular expression
diff --git a/src/plugin/creativecommons/conf/nutch-site.xml b/src/plugin/creativecommons/conf/nutch-site.xml
index e28e12a9a8..4b343b2cc9 100644
--- a/src/plugin/creativecommons/conf/nutch-site.xml
+++ b/src/plugin/creativecommons/conf/nutch-site.xml
@@ -1,5 +1,21 @@
+
diff --git a/src/plugin/creativecommons/data/anchor.html b/src/plugin/creativecommons/data/anchor.html
index 90b522759d..3267bc9ea8 100755
--- a/src/plugin/creativecommons/data/anchor.html
+++ b/src/plugin/creativecommons/data/anchor.html
@@ -1,3 +1,19 @@
+
diff --git a/src/plugin/creativecommons/data/rdf.html b/src/plugin/creativecommons/data/rdf.html
index fb2c34dfe5..60c27cc541 100755
--- a/src/plugin/creativecommons/data/rdf.html
+++ b/src/plugin/creativecommons/data/rdf.html
@@ -1,3 +1,19 @@
+
diff --git a/src/plugin/creativecommons/data/rel.html b/src/plugin/creativecommons/data/rel.html
index 413d52f869..3d11572d82 100755
--- a/src/plugin/creativecommons/data/rel.html
+++ b/src/plugin/creativecommons/data/rel.html
@@ -1,3 +1,19 @@
+
diff --git a/src/plugin/creativecommons/ivy.xml b/src/plugin/creativecommons/ivy.xml
index 956fd25efc..5c2c5b77e1 100644
--- a/src/plugin/creativecommons/ivy.xml
+++ b/src/plugin/creativecommons/ivy.xml
@@ -1,5 +1,4 @@
-
+
exchange-jexl plugin for Nutch
==============================
diff --git a/src/plugin/exchange-jexl/ivy.xml b/src/plugin/exchange-jexl/ivy.xml
index 1275664e5d..cb5a0f1862 100644
--- a/src/plugin/exchange-jexl/ivy.xml
+++ b/src/plugin/exchange-jexl/ivy.xml
@@ -1,5 +1,4 @@
-
+
indexer-links plugin for Nutch
==============================
diff --git a/src/plugin/index-links/ivy.xml b/src/plugin/index-links/ivy.xml
index 624dcaf4a2..3d4fc905c3 100644
--- a/src/plugin/index-links/ivy.xml
+++ b/src/plugin/index-links/ivy.xml
@@ -1,5 +1,4 @@
-
Testing the power of the index-replace plugin
diff --git a/src/plugin/index-static/ivy.xml b/src/plugin/index-static/ivy.xml
index 1275664e5d..cb5a0f1862 100644
--- a/src/plugin/index-static/ivy.xml
+++ b/src/plugin/index-static/ivy.xml
@@ -1,5 +1,4 @@
-
+
AWS CloudSearch plugin for Nutch
================================
diff --git a/src/plugin/indexer-cloudsearch/createCSDomain.sh b/src/plugin/indexer-cloudsearch/createCSDomain.sh
index 24fb0156c6..1cb8481fe0 100644
--- a/src/plugin/indexer-cloudsearch/createCSDomain.sh
+++ b/src/plugin/indexer-cloudsearch/createCSDomain.sh
@@ -1,3 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
# example of domain configuration for CloudSearch
DOMAIN="$1"
diff --git a/src/plugin/indexer-cloudsearch/plugin.xml b/src/plugin/indexer-cloudsearch/plugin.xml
index 5b4425359a..f18bc49eab 100644
--- a/src/plugin/indexer-cloudsearch/plugin.xml
+++ b/src/plugin/indexer-cloudsearch/plugin.xml
@@ -29,9 +29,9 @@
-
-
-
+
+
+
diff --git a/src/plugin/indexer-csv/README.md b/src/plugin/indexer-csv/README.md
index 80220974a7..4d1288b198 100644
--- a/src/plugin/indexer-csv/README.md
+++ b/src/plugin/indexer-csv/README.md
@@ -1,3 +1,20 @@
+
+
indexer-csv plugin for Nutch
============================
diff --git a/src/plugin/indexer-csv/ivy.xml b/src/plugin/indexer-csv/ivy.xml
index 75b5d54e55..e7bf875468 100644
--- a/src/plugin/indexer-csv/ivy.xml
+++ b/src/plugin/indexer-csv/ivy.xml
@@ -1,5 +1,4 @@
-
+
indexer-dummy plugin for Nutch
==============================
diff --git a/src/plugin/indexer-dummy/ivy.xml b/src/plugin/indexer-dummy/ivy.xml
index 956fd25efc..5c2c5b77e1 100644
--- a/src/plugin/indexer-dummy/ivy.xml
+++ b/src/plugin/indexer-dummy/ivy.xml
@@ -1,5 +1,4 @@
-
+
indexer-elastic plugin for Nutch
================================
diff --git a/src/plugin/indexer-elastic/howto_upgrade_es.txt b/src/plugin/indexer-elastic/howto_upgrade_es.md
similarity index 61%
rename from src/plugin/indexer-elastic/howto_upgrade_es.txt
rename to src/plugin/indexer-elastic/howto_upgrade_es.md
index a8156444c6..b57e0c02fa 100644
--- a/src/plugin/indexer-elastic/howto_upgrade_es.txt
+++ b/src/plugin/indexer-elastic/howto_upgrade_es.md
@@ -1,3 +1,20 @@
+
+
1. Upgrade Elasticsearch dependency in src/plugin/indexer-elastic/ivy.xml
2. Upgrade the Elasticsearch specific dependencies in src/plugin/indexer-elastic/plugin.xml
diff --git a/src/plugin/indexer-elastic/ivy.xml b/src/plugin/indexer-elastic/ivy.xml
index abdcceae29..e5cdfdf656 100644
--- a/src/plugin/indexer-elastic/ivy.xml
+++ b/src/plugin/indexer-elastic/ivy.xml
@@ -40,7 +40,22 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/plugin/indexer-elastic/plugin.xml b/src/plugin/indexer-elastic/plugin.xml
index 387a3ac664..fc3723a608 100644
--- a/src/plugin/indexer-elastic/plugin.xml
+++ b/src/plugin/indexer-elastic/plugin.xml
@@ -22,50 +22,51 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
index 053bfd68aa..290d9dfca2 100644
--- a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
+++ b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
@@ -210,6 +210,9 @@ public HttpAsyncClientBuilder customizeHttpClient(
restClientBuilder.setHttpClientConfigCallback(new HttpClientConfigCallback() {
@Override
public HttpAsyncClientBuilder customizeHttpClient(HttpAsyncClientBuilder httpClientBuilder) {
+ if (auth) {
+ httpClientBuilder.setDefaultCredentialsProvider(credentialsProvider);
+ }
// ignore issues with self-signed certificates
httpClientBuilder.setSSLHostnameVerifier(NoopHostnameVerifier.INSTANCE);
return httpClientBuilder.setSSLContext(sslContext);
diff --git a/src/plugin/indexer-kafka/ivy.xml b/src/plugin/indexer-kafka/ivy.xml
index 7bdd94324a..9d605c50b5 100644
--- a/src/plugin/indexer-kafka/ivy.xml
+++ b/src/plugin/indexer-kafka/ivy.xml
@@ -1,5 +1,4 @@
-
+
indexer-opensearch1x plugin for Nutch
================================
diff --git a/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.md b/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.md
new file mode 100644
index 0000000000..c9b723ffcf
--- /dev/null
+++ b/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.md
@@ -0,0 +1,50 @@
+
+
+1. Upgrade OpenSearch dependency in src/plugin/indexer-opensearch-1x/ivy.xml
+
+2. Upgrade the OpenSearch specific dependencies in src/plugin/indexer-opensearch-1x/plugin.xml
+ To get the list of dependencies and their versions execute:
+ $ cd src/plugin/indexer-opensearch-1x/
+ $ ant -f ./build-ivy.xml
+ $ ls lib | sed 's/^/ /g'
+
+ In the plugin.xml replace all lines between
+
+ and
+
+ with the output of the command above.
+
+4. (Optionally) remove overlapping dependencies between indexer-opensearch-1x and Nutch core dependencies:
+ - check for libs present both in
+ build/lib
+ and
+ build/plugins/indexer-opensearch-1x/
+ (eventually with different versions)
+ - duplicated libs can be added to the exclusions of transitive dependencies in
+ build/plugins/indexer-opensearch-1x/ivy.xml
+ - but it should be made sure that the library versions in ivy/ivy.xml correspond to
+ those required by Tika
+
+5. Remove the locally "installed" dependencies in src/plugin/indexer-opensearch-1x/lib/:
+
+ $ rm -rf lib/
+
+6. Build Nutch and run all unit tests:
+
+ $ cd ../../../
+ $ ant clean runtime test
\ No newline at end of file
diff --git a/src/plugin/indexer-opensearch-1x/ivy.xml b/src/plugin/indexer-opensearch-1x/ivy.xml
index 1505ad3c82..ae5d91e41e 100644
--- a/src/plugin/indexer-opensearch-1x/ivy.xml
+++ b/src/plugin/indexer-opensearch-1x/ivy.xml
@@ -40,7 +40,22 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/plugin/indexer-opensearch-1x/plugin.xml b/src/plugin/indexer-opensearch-1x/plugin.xml
index 1bf5affc2f..ee0d45dc2a 100644
--- a/src/plugin/indexer-opensearch-1x/plugin.xml
+++ b/src/plugin/indexer-opensearch-1x/plugin.xml
@@ -22,50 +22,50 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/plugin/indexer-rabbit/README.md b/src/plugin/indexer-rabbit/README.md
index 6ea09a9151..8040cd6c76 100644
--- a/src/plugin/indexer-rabbit/README.md
+++ b/src/plugin/indexer-rabbit/README.md
@@ -1,3 +1,20 @@
+
+
indexer-rabbit plugin for Nutch
===============================
diff --git a/src/plugin/indexer-rabbit/ivy.xml b/src/plugin/indexer-rabbit/ivy.xml
index dd450cf7f0..d2daf91dad 100644
--- a/src/plugin/indexer-rabbit/ivy.xml
+++ b/src/plugin/indexer-rabbit/ivy.xml
@@ -1,5 +1,4 @@
-
+
indexer-solr plugin for Nutch
=============================
diff --git a/src/plugin/indexer-solr/howto_upgrade_solr.txt b/src/plugin/indexer-solr/howto_upgrade_solr.md
similarity index 60%
rename from src/plugin/indexer-solr/howto_upgrade_solr.txt
rename to src/plugin/indexer-solr/howto_upgrade_solr.md
index b2a7eb5c89..905fb84a9e 100644
--- a/src/plugin/indexer-solr/howto_upgrade_solr.txt
+++ b/src/plugin/indexer-solr/howto_upgrade_solr.md
@@ -1,3 +1,20 @@
+
+
1. Upgrade Solr dependency in src/plugin/indexer-solr/ivy.xml
2. Upgrade the Solr specific dependencies in src/plugin/indexer-solr/plugin.xml
diff --git a/src/plugin/indexer-solr/ivy.xml b/src/plugin/indexer-solr/ivy.xml
index ce59942daf..ab5fd72c7a 100644
--- a/src/plugin/indexer-solr/ivy.xml
+++ b/src/plugin/indexer-solr/ivy.xml
@@ -1,15 +1,20 @@
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
diff --git a/src/plugin/indexer-solr/plugin.xml b/src/plugin/indexer-solr/plugin.xml
index d49641cf9c..21cc7d8bdf 100644
--- a/src/plugin/indexer-solr/plugin.xml
+++ b/src/plugin/indexer-solr/plugin.xml
@@ -1,14 +1,20 @@
-
+
@@ -17,7 +23,7 @@
-
+
diff --git a/src/plugin/language-identifier/ivy.xml b/src/plugin/language-identifier/ivy.xml
index 395047c6fc..f64b97055b 100644
--- a/src/plugin/language-identifier/ivy.xml
+++ b/src/plugin/language-identifier/ivy.xml
@@ -1,5 +1,4 @@
-
-
-
-
-
-
+
diff --git a/src/plugin/language-identifier/plugin.xml b/src/plugin/language-identifier/plugin.xml
index 357c4a67cd..dab1a52f31 100644
--- a/src/plugin/language-identifier/plugin.xml
+++ b/src/plugin/language-identifier/plugin.xml
@@ -26,16 +26,7 @@
-
-
-
-
-
-
-
-
-
-
+
diff --git a/src/plugin/lib-htmlunit/ivy.xml b/src/plugin/lib-htmlunit/ivy.xml
index b03211667a..795e6b3358 100644
--- a/src/plugin/lib-htmlunit/ivy.xml
+++ b/src/plugin/lib-htmlunit/ivy.xml
@@ -1,5 +1,4 @@
-
+
# Updates
* The use of phantomjs has been deprecated. Check [Wikipedia](https://en.wikipedia.org/wiki/PhantomJS) for more info.
* The updated code for Safari webriver is under development as starting Safari 10 on OS X El Capitan and macOS Sierra, Safari comes bundled with a new driver implementation.
diff --git a/src/plugin/any23/build.xml b/src/plugin/lib-selenium/howto_upgrade_selenium.md
similarity index 51%
rename from src/plugin/any23/build.xml
rename to src/plugin/lib-selenium/howto_upgrade_selenium.md
index 790b18548d..3071c74cbf 100644
--- a/src/plugin/any23/build.xml
+++ b/src/plugin/lib-selenium/howto_upgrade_selenium.md
@@ -1,4 +1,3 @@
-
-
-
+1. Upgrade various driver versions dependency in src/plugin/lib-selenium/ivy.xml
-
-
-
-
-
+2. Upgrade Selenium's own dependencies in src/plugin/lib-selenium/plugin.xml
-
-
-
-
-
-
-
+ To get a list of dependencies and their versions execute:
+ $ ant -f ./build-ivy.xml
+ $ ls lib | sed 's/^/ \n \n <\/library>/g'
-
+ Note that all dependent libraries are exported for a "library" plugin ("lib-selenium").
+
+ N.B. The above Regex + Sed commands may not work if you are using MacOSX's Sed. In this instance you can install GNU Sed as follows
+
+ $ brew install gnu-sed --with-default-names
+
+ You can then restart your terminal and the Regex + Sed command should work just fine!
diff --git a/src/plugin/lib-selenium/howto_upgrade_selenium.txt b/src/plugin/lib-selenium/howto_upgrade_selenium.txt
deleted file mode 100644
index 1892a6275e..0000000000
--- a/src/plugin/lib-selenium/howto_upgrade_selenium.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-1. Upgrade various driver versions dependency in src/plugin/lib-selenium/ivy.xml
-
-2. Upgrade Selenium's own dependencies in src/plugin/lib-selenium/plugin.xml
-
- To get a list of dependencies and their versions execute:
- $ ant -f ./build-ivy.xml
- $ ls lib | sed 's/^/ \n \n <\/library>/g'
-
- Note that all dependent libraries are exported for a "library" plugin ("lib-selenium").
-
- N.B. The above Regex + Sed commands may not work if you are using MacOSX's Sed. In this instance you can instal GNU Sed as follows
-
- $ brew install gnu-sed --with-default-names
-
- You can then restart your terminal and the Regex + Sed command should work just fine!
diff --git a/src/plugin/lib-selenium/ivy.xml b/src/plugin/lib-selenium/ivy.xml
index 7d3a2d6242..0d460cdb4d 100644
--- a/src/plugin/lib-selenium/ivy.xml
+++ b/src/plugin/lib-selenium/ivy.xml
@@ -1,5 +1,4 @@
-
diff --git a/src/plugin/parse-js/sample/parse_pure_js_test.js b/src/plugin/parse-js/sample/parse_pure_js_test.js
index f196313f85..0e486a8793 100644
--- a/src/plugin/parse-js/sample/parse_pure_js_test.js
+++ b/src/plugin/parse-js/sample/parse_pure_js_test.js
@@ -1,3 +1,18 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
// test data for link extraction from "pure" JavaScript
function selectProvider(form) {
diff --git a/src/plugin/parse-metatags/ivy.xml b/src/plugin/parse-metatags/ivy.xml
index 956fd25efc..5c2c5b77e1 100644
--- a/src/plugin/parse-metatags/ivy.xml
+++ b/src/plugin/parse-metatags/ivy.xml
@@ -1,5 +1,4 @@
-
diff --git a/src/plugin/parse-metatags/sample/testMultivalueMetatags.html b/src/plugin/parse-metatags/sample/testMultivalueMetatags.html
index ca8b737c2b..36d2c8814a 100644
--- a/src/plugin/parse-metatags/sample/testMultivalueMetatags.html
+++ b/src/plugin/parse-metatags/sample/testMultivalueMetatags.html
@@ -1,3 +1,19 @@
+
diff --git a/src/plugin/parse-tika/howto_upgrade_tika.md b/src/plugin/parse-tika/howto_upgrade_tika.md
new file mode 100644
index 0000000000..8ed6c3f3cd
--- /dev/null
+++ b/src/plugin/parse-tika/howto_upgrade_tika.md
@@ -0,0 +1,79 @@
+
+
+We are currently using a shim (https://github.com/tballison/hadoop-safe-tika)
+because of binary conflicts in commons-io versions between what Hadoop supports and the more
+modern features that Apache Tika and Apache POI were using in commons-io.
+
+For now, all you have to do is update the fat jar dependencies:
+
+1. tika-core-shaded in ivy/ivy.xml
+
+2. tika-parsers-standard-package-shaded in src/plugin/parse-tika/ivy.xml
+
+3. The library name version for tika-parsers-standard-package-shaded in src/plugin/parse-tika/plugin.xml
+
+4. Repeat steps 2 and 3 for the language-identifier plugin
+
+5. Build Nutch and run all unit tests:
+
+ $ cd ../../../
+ $ ant clean runtime test
+
+The following directions are what we used to do with thin jars. Hopefully, we'll
+be able to get back to these directions once we have version harmony with Hadoop and Tika/POI.
+
+1. Upgrade Tika dependency (tika-core) in ivy/ivy.xml
+
+2. Upgrade Tika dependency in src/plugin/parse-tika/ivy.xml
+
+3. Upgrade Tika's own dependencies in src/plugin/parse-tika/plugin.xml
+
+ To get the list of dependencies and their versions execute:
+ $ cd src/plugin/parse-tika/
+ $ ant -f ./build-ivy.xml
+ $ ls lib | sed 's/^/ <library name="/g' | sed 's/$/"\/>/g'
+
+ In the plugin.xml replace all lines between
+ <!-- dependencies of Tika (tika-parsers) -->
+ and
+ <!-- end of dependencies of Tika (tika-parsers) -->
+ with the output of the command above.
+
+4. (Optionally) remove overlapping dependencies between parse-tika and Nutch core dependencies:
+ - check for libs present both in
+ build/lib
+ and
+ build/plugins/parse-tika/
+ (possibly with different versions)
+ - duplicated libs can be added to the exclusions of transitive dependencies in
+ build/plugins/parse-tika/ivy.xml
+ - but the library versions in ivy/ivy.xml MUST correspond to those required by Tika
+
+5. Remove the locally "installed" dependencies in src/plugin/parse-tika/lib/:
+
+ $ rm -rf lib/
+
+6. Repeat steps 2-5 for the language-identifier plugin which also depends on Tika modules
+
+ $ cd ../language-identifier/
+
+7. Build Nutch and run all unit tests:
+
+ $ cd ../../../
+ $ ant clean runtime test
+
diff --git a/src/plugin/parse-tika/howto_upgrade_tika.txt b/src/plugin/parse-tika/howto_upgrade_tika.txt
deleted file mode 100644
index cb3ed6be87..0000000000
--- a/src/plugin/parse-tika/howto_upgrade_tika.txt
+++ /dev/null
@@ -1,42 +0,0 @@
-1. Upgrade Tika dependency (tika-core) in ivy/ivy.xml
-
-2. Upgrade Tika dependency in src/plugin/parse-tika/ivy.xml
-
-3. Upgrade Tika's own dependencies in src/plugin/parse-tika/plugin.xml
-
- To get the list of dependencies and their versions execute:
- $ cd src/plugin/parse-tika/
- $ ant -f ./build-ivy.xml
- $ ls lib | sed 's/^/ /g'
-
- In the plugin.xml replace all lines between
-
- and
-
- with the output of the command above.
-
-4. (Optionally) remove overlapping dependencies between parse-tika and Nutch core dependencies:
- - check for libs present both in
- build/lib
- and
- build/plugins/parse-tika/
- (eventually with different versions)
- - duplicated libs can be added to the exclusions of transitive dependencies in
- build/plugins/parse-tika/ivy.xml
- - but the library versions in ivy/ivy.xml MUST correspond to those required by Tika
-
-5. Remove the locally "installed" dependencies in src/plugin/parse-tika/lib/:
-
- $ rm -rf lib/
-
-6. Repeat steps 2-5 for the language-identifier plugin which also depends on Tika modules
-
- $ cd ../language-identifier/
-
-It should be noted that Any23 also has a dependency on Tika so you may wish to check that there are no classpath conflicts in the any23 plugin as well.
-
-7. Build Nutch and run all unit tests:
-
- $ cd ../../../
- $ ant clean runtime test
-
diff --git a/src/plugin/parse-tika/ivy.xml b/src/plugin/parse-tika/ivy.xml
index f0ec7a8d8c..f16636d255 100644
--- a/src/plugin/parse-tika/ivy.xml
+++ b/src/plugin/parse-tika/ivy.xml
@@ -1,5 +1,4 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
diff --git a/src/plugin/parse-tika/plugin.xml b/src/plugin/parse-tika/plugin.xml
index d88405bc1c..a20fa7266a 100644
--- a/src/plugin/parse-tika/plugin.xml
+++ b/src/plugin/parse-tika/plugin.xml
@@ -25,86 +25,7 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
diff --git a/src/plugin/parse-tika/sample/nutch.html b/src/plugin/parse-tika/sample/nutch.html
index 0aa7c98959..8098535126 100644
--- a/src/plugin/parse-tika/sample/nutch.html
+++ b/src/plugin/parse-tika/sample/nutch.html
@@ -1,3 +1,19 @@
+
diff --git a/src/plugin/parse-zip/ivy.xml b/src/plugin/parse-zip/ivy.xml
index 956fd25efc..5c2c5b77e1 100644
--- a/src/plugin/parse-zip/ivy.xml
+++ b/src/plugin/parse-zip/ivy.xml
@@ -1,5 +1,4 @@
-
+
Nutch Interactive Selenium
==========================
diff --git a/src/plugin/protocol-interactiveselenium/ivy.xml b/src/plugin/protocol-interactiveselenium/ivy.xml
index 506be0aecb..112483bcdc 100644
--- a/src/plugin/protocol-interactiveselenium/ivy.xml
+++ b/src/plugin/protocol-interactiveselenium/ivy.xml
@@ -1,5 +1,4 @@
-
+
1. Upgrade OkHttp dependency in src/plugin/protocol-okhttp/ivy.xml
2. Upgrade OkHttp's own dependencies in src/plugin/protocol-okhttp/plugin.xml
diff --git a/src/plugin/protocol-okhttp/ivy.xml b/src/plugin/protocol-okhttp/ivy.xml
index ead8232474..73b4fa6369 100644
--- a/src/plugin/protocol-okhttp/ivy.xml
+++ b/src/plugin/protocol-okhttp/ivy.xml
@@ -1,5 +1,4 @@
-
+
Nutch Selenium
==============
diff --git a/src/plugin/protocol-selenium/ivy.xml b/src/plugin/protocol-selenium/ivy.xml
index 506be0aecb..112483bcdc 100644
--- a/src/plugin/protocol-selenium/ivy.xml
+++ b/src/plugin/protocol-selenium/ivy.xml
@@ -1,5 +1,4 @@
-
Filters URLs based on a file of regular expressions using host/domains
matching first. The default policy is to accept a URL if no matches
diff --git a/src/plugin/urlfilter-fast/ivy.xml b/src/plugin/urlfilter-fast/ivy.xml
index 956fd25efc..5c2c5b77e1 100644
--- a/src/plugin/urlfilter-fast/ivy.xml
+++ b/src/plugin/urlfilter-fast/ivy.xml
@@ -1,5 +1,4 @@
-
+
urlfilter-ignoreexempt
======================
This plugin allows certain urls to be exempted when the external links are configured to be ignored.
diff --git a/src/plugin/urlfilter-ignoreexempt/ivy.xml b/src/plugin/urlfilter-ignoreexempt/ivy.xml
index 956fd25efc..5c2c5b77e1 100644
--- a/src/plugin/urlfilter-ignoreexempt/ivy.xml
+++ b/src/plugin/urlfilter-ignoreexempt/ivy.xml
@@ -1,5 +1,4 @@
-
diff --git a/src/test/filter-all.txt b/src/test/filter-all.txt
index 4ed567ab1c..d738aec76a 100644
--- a/src/test/filter-all.txt
+++ b/src/test/filter-all.txt
@@ -1,3 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
# Config file for urlfilter-suffix plugin
# Filter away all urls
diff --git a/src/test/log4j.properties b/src/test/log4j.properties
index 3ff115f46f..08e272c712 100644
--- a/src/test/log4j.properties
+++ b/src/test/log4j.properties
@@ -1,3 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
# log4j configuration used during build and unit tests
log4j.rootLogger=info,stdout
diff --git a/src/test/nutch-site.xml b/src/test/nutch-site.xml
index dd408739dc..0d6177e5e6 100644
--- a/src/test/nutch-site.xml
+++ b/src/test/nutch-site.xml
@@ -1,4 +1,20 @@
+
diff --git a/src/testresources/fetch-test-site/dup_of_pagea.html b/src/testresources/fetch-test-site/dup_of_pagea.html
index 6444c41225..63c4e61537 100644
--- a/src/testresources/fetch-test-site/dup_of_pagea.html
+++ b/src/testresources/fetch-test-site/dup_of_pagea.html
@@ -1,3 +1,19 @@
+
page a
diff --git a/src/testresources/fetch-test-site/exception.html b/src/testresources/fetch-test-site/exception.html
index e1192a176b..66f134ee25 100644
--- a/src/testresources/fetch-test-site/exception.html
+++ b/src/testresources/fetch-test-site/exception.html
@@ -1,3 +1,19 @@
+
diff --git a/src/testresources/fetch-test-site/index.html b/src/testresources/fetch-test-site/index.html
index d73ff3f691..3fc6e61e5a 100644
--- a/src/testresources/fetch-test-site/index.html
+++ b/src/testresources/fetch-test-site/index.html
@@ -1,3 +1,19 @@
+
front page
diff --git a/src/testresources/fetch-test-site/nested_spider_trap.html b/src/testresources/fetch-test-site/nested_spider_trap.html
index 5dcf7c2209..dd32ee2362 100644
--- a/src/testresources/fetch-test-site/nested_spider_trap.html
+++ b/src/testresources/fetch-test-site/nested_spider_trap.html
@@ -1,3 +1,19 @@
+
nested spider trap
diff --git a/src/testresources/fetch-test-site/pagea.html b/src/testresources/fetch-test-site/pagea.html
index 6444c41225..63c4e61537 100644
--- a/src/testresources/fetch-test-site/pagea.html
+++ b/src/testresources/fetch-test-site/pagea.html
@@ -1,3 +1,19 @@
+
page a
diff --git a/src/testresources/fetch-test-site/pageb.html b/src/testresources/fetch-test-site/pageb.html
index 66e3725ef0..cf77ff4f75 100644
--- a/src/testresources/fetch-test-site/pageb.html
+++ b/src/testresources/fetch-test-site/pageb.html
@@ -1,3 +1,19 @@
+
bage b
diff --git a/src/testresources/fetch-test-site/robots.txt b/src/testresources/fetch-test-site/robots.txt
index e69de29bb2..fc590f9733 100644
--- a/src/testresources/fetch-test-site/robots.txt
+++ b/src/testresources/fetch-test-site/robots.txt
@@ -0,0 +1,14 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
\ No newline at end of file