diff --git a/build.xml b/build.xml
index 0a18682f88..49187d3ba4 100644
--- a/build.xml
+++ b/build.xml
@@ -203,6 +203,7 @@
+
@@ -646,6 +647,7 @@
+
@@ -1173,6 +1175,8 @@
+
+
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 8b24f092ae..edcaeb569f 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -2252,6 +2252,72 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
+
+
+ index.arbitrary.function.count
+
+ The count of arbitrary additions/edits to the document.
+ Specify the remaining properties (fieldName, className, constructorArgs,
+ methodName, and methodArgs) independently in this file by appending a
+ dot (.) followed by integer numerals (beginning with '0') to the property
+ names, e.g.:
+
+ index.arbitrary.fieldName.0
+ for the field to add/set with the first arbitrary addition or:
+
+ index.arbitrary.className.3
+ for the POJO class name to use in setting the fourth arbitrary addition.
+
+
+
+
+ index.arbitrary.fieldName.0
+
+ The name of the field to add to the document with the value
+ returned from the custom POJO.
+
+
+
+ index.arbitrary.className.0
+
+ The fully qualified name of the POJO class that will supply
+ values for the new field.
+
+
+
+ index.arbitrary.constructorArgs.0
+
+ The values (as strings) to pass into the POJO constructor.
+ The POJO must accept a String representation of the NutchDocument's URL
+ as the first parameter in the constructor. The values you specify here
+ will populate the constructor arguments 1,..,n-1 where n=the count of
+ arguments to the constructor. Argument #0 will be the NutchDocument's URL.
+
+
+
+
+ index.arbitrary.methodName.0
+
+ The name of the method to invoke on the instance of your custom
+ class in order to determine the value to add to the document.
+
+
+
+ index.arbitrary.methodArgs.0
+
+ The values (as strings) to pass into the named method on the POJO
+ instance. Unlike the constructor args, there is no required argument that this
+ method in the POJO must accept, i.e., the Arbitrary Indexer doesn't supply any
+ arguments taken from the NutchDocument values by default.
+
+
+
+ index.arbitrary.overwrite.0
+ Whether to overwrite any existing value in the doc
+ for fieldName. Default is false if not specified in config.
+
+
+
metatags.names
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index 34688ed566..498259a950 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -40,6 +40,7 @@
+
@@ -117,6 +118,7 @@
+
@@ -179,6 +181,7 @@
+
diff --git a/src/plugin/index-arbitrary/build.xml b/src/plugin/index-arbitrary/build.xml
new file mode 100644
index 0000000000..818020c848
--- /dev/null
+++ b/src/plugin/index-arbitrary/build.xml
@@ -0,0 +1,22 @@
+
+
+
+
+
+
+
diff --git a/src/plugin/index-arbitrary/ivy.xml b/src/plugin/index-arbitrary/ivy.xml
new file mode 100644
index 0000000000..9feb1e1b4a
--- /dev/null
+++ b/src/plugin/index-arbitrary/ivy.xml
@@ -0,0 +1,39 @@
+
+
+
+
+
+
+
+ Apache Nutch
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/plugin/index-arbitrary/plugin.xml b/src/plugin/index-arbitrary/plugin.xml
new file mode 100644
index 0000000000..f79188a51d
--- /dev/null
+++ b/src/plugin/index-arbitrary/plugin.xml
@@ -0,0 +1,42 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/ArbitraryIndexingFilter.java b/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/ArbitraryIndexingFilter.java
new file mode 100644
index 0000000000..7677ef7f81
--- /dev/null
+++ b/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/ArbitraryIndexingFilter.java
@@ -0,0 +1,286 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.arbitrary;
+
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+
+import org.apache.hadoop.io.Text;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.lang.invoke.MethodHandles;
+import java.lang.Class;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.Method;
+
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Adds arbitrary searchable fields to a document from the class and method
+ * the user identifies in the config. The user supplies the name of the field
+ * to add with the class and method names that supply the value.
+ *
+ * Example:
+ * <property>
+ * <name>index.arbitrary.function.count</name>
+ * <value>1</value>
+ * </property>
+ *
+ * <property>
+ * <name>index.arbitrary.fieldName.0</name>
+ * <value>advisors</value>
+ * </property>
+ *
+ * <property>
+ * <name>index.arbitrary.className.0</name>
+ * <value>com.example.arbitrary.AdvisorCalculator</value>
+ * </property>
+ *
+ * <property>
+ * <name>index.arbitrary.constructorArgs.0</name>
+ * <value>Kirk</value>
+ * </property>
+ *
+ * <property>
+ * <name>index.arbitrary.methodName.0</name>
+ * <value>countAdvisors</value>
+ * </property>
+ *
+ * <property>
+ * <name>index.arbitrary.methodArgs.0</name>
+ * <value>Spock,McCoy</value>
+ * </property>
+ *
+ * To set more than one arbitrary field value,
+ * increment {@code index.arbitrary.function.count} and
+ * repeat the rest of these blocks with successive int values
+ * appended to the property names, e.g. fieldName.1, methodName.1, etc.
+ */
+public class ArbitraryIndexingFilter implements IndexingFilter {
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MethodHandles.lookup().lookupClass());
+
+ /** How many arbitrary field definitions to set. */
+ private int arbitraryAddsCount = 0;
+
+ /** The name of the field to insert/overwrite in the NutchDocument */
+ private String fieldName;
+
+ /** The fully-qualified class name of the custom class to use for the
+ * new field. This class must be in the Nutch runtime classpath,
+ * e.g., nutch/lib/ dierctory. */
+ private String className;
+
+ /** The String values to pass to the custom class constructor. The plugin
+ * will add the document url as the first argument in className's
+ * String[] args. */
+ private String[] userConstrArgs;
+
+ /** The array where the plugin copies the url & the userConstrArgs
+ * to create the instance of className. */
+ private String[] constrArgs;
+
+ /** The name of the method in the custom class to call. Its return value
+ * will become the value of fieldName in the NutchDocument. */
+ private String methodName;
+
+ /** The String values of the arguments to methodName. It's up to the
+ * developer of className to do any casts/conversions from String to
+ * another class in the code of className. */
+ private String[] methodArgs;
+
+ /** The result that returns from methodName. The plugin will set the value
+ * of fieldName to this. */
+ private Object result;
+
+ /** Optional flag to determine whether to overwrite the existing value in the
+ * NutchDocument fieldName if this is set to true. Default behavior is to
+ * add the value from calling methodName to existing values for fieldName. */
+ private boolean overwrite = false;
+
+ /** Hadoop Configuration object to pass these values into the plugin. */
+ private Configuration conf;
+
+ /**
+ * The {@link ArbitraryIndexingFilter} filter object uses reflection
+ * to instantiate the configured class and invoke the configured method.
+ * It requires a few configuration settings for adding arbitrary fields
+ * and values to the NutchDocument as searchable fields.
+ * See {@code index.arbitrary.function.count}, and (possibly multiple
+ * instances when {@code index.arbitrary.function.count} > 1) of the following
+ * {@code index.arbitrary.fieldName}.index,
+ * {@code index.arbitrary.className}.index,
+ * {@code index.arbitrary.constructorArgs}.index,
+ * {@code index.arbitrary.methodName}.index, and
+ * {@code index.arbitrary.methodArgs}.index
+ * in nutch-default.xml or nutch-site.xml where index ranges from 0
+ * to {@code index.arbitrary.function.count} - 1.
+ *
+ * @param doc
+ * The {@link NutchDocument} object
+ * @param parse
+ * The relevant {@link Parse} object passing through the filter
+ * @param url
+ * URL to be filtered by the user-specified class
+ * @param datum
+ * The {@link CrawlDatum} entry
+ * @param inlinks
+ * The {@link Inlinks} containing anchor text
+ * @return filtered NutchDocument
+ */
+ @Override
+ public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+ CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+ Class theClass = null;
+ Method theMethod = null;
+ Constructor> theConstructor = null;
+ Object instance = null;
+
+ // This'll be quick
+ if (doc == null) {
+ LOG.debug("In filter() where doc is null for url == {}",
+ String.valueOf(url));
+ return doc;
+ } else if (url == null) {
+ LOG.debug("In filter() where url is null. Nothing to do.");
+ return doc;
+ }
+
+ int cfgCounter = 0;
+ while (cfgCounter < arbitraryAddsCount) {
+ setIndexedConf(conf,cfgCounter);
+ cfgCounter++;
+ try {
+ theClass = Class.forName(className);
+ if (methodArgs.length > 0) {
+ theMethod = theClass.getDeclaredMethod(methodName,String[].class);
+ } else {
+ theMethod = theClass.getMethod(methodName);
+ }
+ theConstructor = theClass.getDeclaredConstructor(String[].class);
+ } catch (Exception e) {
+ LOG.error("Exception preparing reflection tasks. className was {}",
+ String.valueOf(className));
+ e.printStackTrace();
+ }
+ try {
+ constrArgs = new String[userConstrArgs.length + 1];
+ constrArgs[0] = url.toString();
+ System.arraycopy(userConstrArgs,0,constrArgs,1,userConstrArgs.length);
+ instance = theConstructor.newInstance(new Object[]{constrArgs});
+ if (methodArgs.length > 0) {
+ result = theMethod.invoke(instance, new Object[]{methodArgs});
+ } else {
+ result = theMethod.invoke(instance);
+ }
+ } catch (Exception e) {
+ LOG.error("Exception in reflection trying to instantiate/invoke. "
+ + "url was {} & className was {}",
+ String.valueOf(url), String.valueOf(className));
+ if (constrArgs.length > 0) {
+ LOG.error("constrArgs[1] was {}", String.valueOf(constrArgs[1]));
+ }
+ LOG.error("methodName was {}", String.valueOf(className));
+ if (methodArgs.length > 0) {
+ LOG.error("methodArgs[0] was {}", String.valueOf(methodArgs[0]));
+ }
+ e.printStackTrace();
+ }
+
+ LOG.debug("{}.{}() returned {} for field {}.", className,
+ methodName, String.valueOf(result), String.valueOf(fieldName));
+
+ // If user chose to overwrite, remove existing value
+ if (overwrite) {
+ LOG.debug("overwrite == true for fieldName == {} ", fieldName);
+ if (doc.getFieldNames().contains(fieldName)) {
+ LOG.debug("Removing field '{}' from doc for overwrite", fieldName);
+ doc.removeField(fieldName);
+ }
+ }
+ if (result == null) {
+ LOG.debug("Call to {}.{} returned null", className, methodName);
+ if (overwrite) {
+ LOG.debug("{} has been cleared.", fieldName);
+ }
+ }
+ LOG.debug("Adding value '{}' for field '{}' to doc", result, fieldName);
+ doc.add(fieldName, result);
+ }
+ return doc;
+ }
+
+ /**
+ * Set the {@link Configuration} object
+ */
+ @Override
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ arbitraryAddsCount = conf.getInt("index.arbitrary.function.count",1);
+ LOG.info("Will process the first {} fieldName defs in config.", String.valueOf(arbitraryAddsCount));
+ }
+
+ /**
+ * Set the {@link Configuration} object for a specific set of values in the config
+ *
+ * @param conf
+ * The Configuration object holding values for the current arbitrary field.
+ * @param ndx
+ * The ordinal counter value for the current arbitrary field appended to the
+ * base property names in the xml configuration file.
+ */
+ public void setIndexedConf(Configuration conf, int ndx) {
+ LOG.debug("In setIndexedConf() where ndx was passed in as {}", String.valueOf(ndx));
+ fieldName = conf.get("index.arbitrary.fieldName.".concat(String.valueOf(ndx)));
+ LOG.debug("Looking now for index.arbitrary.fieldname.{} which was: {}",
+ String.valueOf(ndx),String.valueOf(fieldName));
+
+ if (fieldName == null || fieldName == "") {
+ throw new RuntimeException ("Problem in configuration where the index.arbitrary.fieldName."
+ + String.valueOf(ndx) + " is missing.");
+ }
+
+ className = conf.get("index.arbitrary.className.".concat(String.valueOf(ndx)));
+ if (className == null || className == "") {
+ throw new RuntimeException ("Problem in configuration where the index.arbitrary.className."
+ + String.valueOf(ndx) + " is missing.");
+ }
+
+ userConstrArgs = conf.getTrimmedStrings("index.arbitrary.constructorArgs.".concat(String.valueOf(ndx)));
+ methodName = conf.get("index.arbitrary.methodName.".concat(String.valueOf(ndx)),"");
+ methodArgs = conf.getTrimmedStrings("index.arbitrary.methodArgs.".concat(String.valueOf(ndx)));
+ overwrite = conf.getBoolean("index.arbitrary.overwrite.".concat(String.valueOf(ndx)),false);
+ if (overwrite) {
+ LOG.info("overwrite set == true for processing {}.", fieldName);
+ }
+ }
+
+ /**
+ * Get the {@link Configuration} object */
+ @Override
+ public Configuration getConf() {
+ return this.conf;
+ }
+}
diff --git a/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/package-info.java b/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/package-info.java
new file mode 100644
index 0000000000..6e6d47513d
--- /dev/null
+++ b/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Indexing filter to add arbitrary data to the indexed document
+ * from the output of a user-specified class.
+ */
+package org.apache.nutch.indexer.arbitrary;
+
diff --git a/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/Echo.java b/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/Echo.java
new file mode 100644
index 0000000000..52e8939ce5
--- /dev/null
+++ b/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/Echo.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.arbitrary;
+
+
+import java.io.PrintStream;
+
+public class Echo {
+  private static PrintStream out = System.out;
+  private String words;
+
+  /** Keeps args[1] as the text to echo; args[0] is ignored. */
+  public Echo(String args[]) {
+    this.words = String.valueOf(args[1]);
+  }
+
+  /** Returns the text captured by the constructor. */
+  public String getText() {
+    return this.words;
+  }
+
+  /** Command-line entry point: prints the second argument. */
+  public static void main(String[] args) {
+    out.println(new Echo(args).getText());
+  }
+}
diff --git a/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/Multiplier.java b/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/Multiplier.java
new file mode 100644
index 0000000000..38875d0daf
--- /dev/null
+++ b/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/Multiplier.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.arbitrary;
+
+import java.io.PrintStream;
+
+public class Multiplier {
+  private float product = 1;
+  private static PrintStream err = System.err;
+  private static PrintStream out = System.out;
+
+  public Multiplier(String args[]) {
+  }
+
+  /** Multiplies the numeric strings in args, last to first; each call restarts from 1. */
+  public String getProduct(String args[]) {
+    product = 1;
+    for (int i = args.length - 1; i >= 0; i--) {
+      try {
+        product = product * Float.parseFloat(args[i]);
+      } catch (NumberFormatException nfe) {
+        err.println("NumberFormatException while trying to parse " + String.valueOf(args[i]));
+        break;
+      }
+    }
+    return String.valueOf(product);
+  }
+
+  public static void main(String[] args) {
+    Multiplier mp = new Multiplier(args);
+    out.println(mp.getProduct(args));
+  }
+}
diff --git a/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/TestArbitraryIndexingFilter.java b/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/TestArbitraryIndexingFilter.java
new file mode 100644
index 0000000000..17f31b1839
--- /dev/null
+++ b/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/TestArbitraryIndexingFilter.java
@@ -0,0 +1,222 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.arbitrary;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.NutchField;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import java.lang.invoke.MethodHandles;
+
+/**
+ * Tests that the index-arbitrary filter can add a new field with an arbitrary
+ * value, supplement an existing field with an arbitrary value, and overwrite
+ * an existing field with an arbitrary value where it takes the arbitrary value
+ * from some POJO outside the normal Nutch codebase.
+ *
+ * @author Joe Gilvary
+ */
+
+public class TestArbitraryIndexingFilter {
+
+  private static final String ECHO = "org.apache.nutch.indexer.arbitrary.Echo";
+  private static final String MULTIPLIER = "org.apache.nutch.indexer.arbitrary.Multiplier";
+  private static final String BAR = "Arbitrary text to add - bar";
+
+  Configuration conf;
+  Inlinks inlinks;
+  ParseImpl parse;
+  CrawlDatum crawlDatum;
+  Text url;
+  ArbitraryIndexingFilter filter;
+  NutchDocument doc;
+
+  /** Builds the parse, url, datum and inlinks handed to every filter call. */
+  @Before
+  public void setUp() throws Exception {
+    parse = new ParseImpl();
+    url = new Text("http://nutch.apache.org/index.html");
+    crawlDatum = new CrawlDatum();
+    inlinks = new Inlinks();
+  }
+
+  /**
+   * Registers definition 0: Echo returning BAR for the field "foo".
+   */
+  private void defineEchoFoo() {
+    conf.set("index.arbitrary.fieldName.0", "foo");
+    conf.set("index.arbitrary.className.0", ECHO);
+    conf.set("index.arbitrary.constructorArgs.0", BAR);
+    conf.set("index.arbitrary.methodName.0", "getText");
+  }
+
+  /**
+   * Runs the filter over doc, failing the test on any exception.
+   *
+   * @param traceTarget stream that receives the stack trace of a failure
+   */
+  private void applyFilter(java.io.PrintStream traceTarget) {
+    try {
+      filter.filter(doc, parse, url, crawlDatum, inlinks);
+    } catch (Exception e) {
+      e.printStackTrace(traceTarget);
+      Assert.fail(e.getMessage());
+    }
+  }
+
+  /**
+   * Tells whether the named field of doc holds the given value.
+   */
+  private boolean hasValue(String field, Object value) {
+    return doc.getField(field).getValues().contains(value);
+  }
+
+  /** Counts the values held by the named field of doc. */
+  private int valueCount(String field) {
+    return doc.getField(field).getValues().size();
+  }
+
+  /**
+   * Test adding field with arbitrary content from POJO
+   *
+   * @throws Exception
+   */
+  @Test
+  public void testAddingNewField() throws Exception {
+    conf = NutchConfiguration.create();
+    conf.set("index.arbitrary.function.count", "1");
+    defineEchoFoo();
+
+    filter = new ArbitraryIndexingFilter();
+    Assert.assertNotNull("No filter exists for testAddingNewField", filter);
+    filter.setConf(conf);
+
+    doc = new NutchDocument();
+    applyFilter(System.err);
+
+    Assert.assertNotNull(doc);
+    Assert.assertFalse("test if doc is not empty", doc.getFieldNames()
+        .isEmpty());
+    Assert.assertTrue("test if doc has new field with arbitrary value",
+        hasValue("foo", BAR));
+  }
+
+  /**
+   * Test supplementing a doc field with arbitrary content from POJO
+   *
+   * @throws Exception
+   */
+  @Test
+  public void testSupplementExistingField() throws Exception {
+    conf = NutchConfiguration.create();
+    conf.set("index.arbitrary.function.count", "2");
+    defineEchoFoo();
+    conf.set("index.arbitrary.fieldName.1", "description");
+    conf.set("index.arbitrary.className.1", MULTIPLIER);
+    conf.set("index.arbitrary.constructorArgs.1", "");
+    conf.set("index.arbitrary.methodName.1", "getProduct");
+    conf.set("index.arbitrary.methodArgs.1", "-1,3.14");
+
+    filter = new ArbitraryIndexingFilter();
+    Assert.assertNotNull("No filter exists for testSupplementExistingField", filter);
+    filter.setConf(conf);
+
+    doc = new NutchDocument();
+    Assert.assertNotNull("doc doesn't exist", doc);
+    doc.add("description", "irrational");
+
+    // Preconditions: exactly the one seeded value is present.
+    Assert.assertFalse("doc is empty", doc.getFieldNames().isEmpty());
+    Assert.assertEquals("field description does not have exactly one value", 1,
+        valueCount("description"));
+    Assert.assertTrue("field description does not have initial value 'irrational'",
+        hasValue("description", "irrational"));
+
+    applyFilter(System.err);
+
+    // The new field appears and the seeded field gains a second value.
+    Assert.assertTrue("doc doesn't have new field with arbitrary value",
+        hasValue("foo", BAR));
+    Assert.assertEquals("field description does not have 2 values", 2,
+        valueCount("description"));
+    Assert.assertTrue("field description original value gone",
+        hasValue("description", "irrational"));
+    Assert.assertTrue("field description missing new value",
+        hasValue("description", "-3.14"));
+  }
+
+  /**
+   * Test overwriting a doc field with arbitrary content from POJO
+   *
+   * @throws Exception
+   */
+  @Test
+  public void testOverwritingExistingField() throws Exception {
+    conf = NutchConfiguration.create();
+    conf.set("index.arbitrary.function.count", "3");
+    defineEchoFoo();
+    conf.set("index.arbitrary.fieldName.1", "description");
+    conf.set("index.arbitrary.className.1", MULTIPLIER);
+    conf.set("index.arbitrary.methodArgs.1", "-1,3.14159265");
+    conf.set("index.arbitrary.methodName.1", "getProduct");
+    conf.set("index.arbitrary.fieldName.2", "philosopher");
+    conf.set("index.arbitrary.className.2", ECHO);
+    conf.set("index.arbitrary.constructorArgs.2", "Popeye");
+    conf.set("index.arbitrary.methodName.2", "getText");
+    conf.set("index.arbitrary.overwrite.2", "true");
+
+    filter = new ArbitraryIndexingFilter();
+    Assert.assertNotNull("No filter exists for testOverwritingExistingField", filter);
+    filter.setConf(conf);
+    Assert.assertNotNull("conf does not exist", conf);
+
+    doc = new NutchDocument();
+    Assert.assertNotNull("doc does not exist", doc);
+    doc.add("description", "irrational");
+    doc.add("philosopher", "Socrates");
+
+    // Preconditions: each seeded field holds exactly its one initial value.
+    Assert.assertEquals("field description does not have exactly one value", 1,
+        valueCount("description"));
+    Assert.assertEquals("field philosopher does not have exactly one value", 1,
+        valueCount("philosopher"));
+    Assert.assertTrue("field description does not have initial value 'irrational'",
+        hasValue("description", "irrational"));
+    Assert.assertTrue("field philosopher does not have initial value 'Socrates'",
+        hasValue("philosopher", "Socrates"));
+
+    applyFilter(System.out);
+
+    Assert.assertNotNull(doc);
+    // Overwrite replaces, rather than supplements, the seeded value.
+    Assert.assertEquals("field philosopher no longer has only one value", 1,
+        valueCount("philosopher"));
+    Assert.assertFalse("field philosopher's original value 'Socrates' NOT overwritten",
+        hasValue("philosopher", "Socrates"));
+    Assert.assertTrue("field philosopher does not have new value 'Popeye'",
+        hasValue("philosopher", "Popeye"));
+  }
+}