diff --git a/build.xml b/build.xml index 0a18682f88..49187d3ba4 100644 --- a/build.xml +++ b/build.xml @@ -203,6 +203,7 @@ + @@ -646,6 +647,7 @@ + @@ -1173,6 +1175,8 @@ + + diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 8b24f092ae..edcaeb569f 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -2252,6 +2252,72 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this + + + index.arbitrary.function.count + + The count of arbitrary additions/edits to the document. + Specify the remaining properties (fieldName, className, constructorArgs, + methodName, and methodArgs) independently in this file by appending a + dot (.) followed by integer numerals (beginning with '0') to the property + names, e.g.: + + index.arbitrary.fieldName.0 + for the field to add/set with the first arbitrary addition or: + + index.arbitrary.className.3 + for the POJO class name to use in setting the fourth arbitrary addition. + + + + + index.arbitrary.fieldName.0 + + The name of the field to add to the document with the value + returned from the custom POJO. + + + + index.arbitrary.className.0 + + The fully qualified name of the POJO class that will supply + values for the new field. + + + + index.arbitrary.constructorArgs.0 + + The values (as strings) to pass into the POJO constructor. + The POJO must accept a String representation of the NutchDocument's URL + as the first parameter in the constructor. The values you specify here + will populate the constructor arguments 1,..,n-1 where n=the count of + arguments to the constructor. Argument #0 will be the NutchDocument's URL. + + + + + index.arbitrary.methodName.0 + + The name of the method to invoke on the instance of your custom + class in order to determine the value to add to the document. + + + + index.arbitrary.methodArgs.0 + + The values (as strings) to pass into the named method on the POJO + instance. 
Unlike the constructor args, there is no required argument that this + method in the POJO must accept, i.e., the Arbitrary Indexer doesn't supply any + arguments taken from the NutchDocument values by default. + + + + index.arbitrary.overwrite.0 + Whether to overwrite any existing value in the doc for + fieldName. Default is false if not specified in config. + + + metatags.names diff --git a/src/plugin/build.xml b/src/plugin/build.xml index 34688ed566..498259a950 100755 --- a/src/plugin/build.xml +++ b/src/plugin/build.xml @@ -40,6 +40,7 @@ + @@ -117,6 +118,7 @@ + @@ -179,6 +181,7 @@ + diff --git a/src/plugin/index-arbitrary/build.xml b/src/plugin/index-arbitrary/build.xml new file mode 100644 index 0000000000..818020c848 --- /dev/null +++ b/src/plugin/index-arbitrary/build.xml @@ -0,0 +1,22 @@ + + + + + + + diff --git a/src/plugin/index-arbitrary/ivy.xml b/src/plugin/index-arbitrary/ivy.xml new file mode 100644 index 0000000000..9feb1e1b4a --- /dev/null +++ b/src/plugin/index-arbitrary/ivy.xml @@ -0,0 +1,39 @@ + + + + + + + + Apache Nutch + + + + + + + + + + + + + + + + diff --git a/src/plugin/index-arbitrary/plugin.xml b/src/plugin/index-arbitrary/plugin.xml new file mode 100644 index 0000000000..f79188a51d --- /dev/null +++ b/src/plugin/index-arbitrary/plugin.xml @@ -0,0 +1,42 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/ArbitraryIndexingFilter.java b/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/ArbitraryIndexingFilter.java new file mode 100644 index 0000000000..7677ef7f81 --- /dev/null +++ b/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/ArbitraryIndexingFilter.java @@ -0,0 +1,286 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexer.arbitrary; + +import org.apache.nutch.parse.Parse; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; + +import org.apache.hadoop.io.Text; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.lang.invoke.MethodHandles; +import java.lang.Class; +import java.lang.reflect.Constructor; +import java.lang.reflect.Method; + +import org.apache.hadoop.conf.Configuration; + +/** + * Adds arbitrary searchable fields to a document from the class and method + * the user identifies in the config. The user supplies the name of the field + * to add with the class and method names that supply the value. + * + * Example:

+ * <property>
+ * <name>index.arbitrary.function.count</name>
+ * <value>1</value>
+ * </property>
+ *
+ * <property>
+ * <name>index.arbitrary.fieldName.0</name>
+ * <value>advisors</value>
+ * </property>
+ *
+ * <property>
+ * <name>index.arbitrary.className.0</name>
+ * <value>com.example.arbitrary.AdvisorCalculator</value>
+ * </property>
+ *
+ * <property>
+ * <name>index.arbitrary.constructorArgs.0</name>
+ * <value>Kirk</value>
+ * </property>
+ *
+ * <property>
+ * <name>index.arbitrary.methodName.0</name>
+ * <value>countAdvisors</value>
+ * </property>
+ *
+ * <property>
+ * <name>index.arbitrary.methodArgs.0</name>
+ * <value>Spock,McCoy</value>
+ * </property>
+ *
+ * To set more than one arbitrary field value, + * increment {@code index.arbitrary.function.count} and + * repeat the rest of these blocks with successive int values + * appended to the property names, e.g. fieldName.1, methodName.1, etc. + */ +public class ArbitraryIndexingFilter implements IndexingFilter { + + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + /** How many arbitrary field definitions to set. */ + private int arbitraryAddsCount = 0; + + /** The name of the field to insert/overwrite in the NutchDocument */ + private String fieldName; + + /** The fully-qualified class name of the custom class to use for the + * new field. This class must be in the Nutch runtime classpath, + * e.g., nutch/lib/ directory. */ + private String className; + + /** The String values to pass to the custom class constructor. The plugin + * will add the document url as the first argument in className's + * String[] args. */ + private String[] userConstrArgs; + + /** The array where the plugin copies the url & the userConstrArgs + * to create the instance of className. */ + private String[] constrArgs; + + /** The name of the method in the custom class to call. Its return value + * will become the value of fieldName in the NutchDocument. */ + private String methodName; + + /** The String values of the arguments to methodName. It's up to the + * developer of className to do any casts/conversions from String to + * another class in the code of className. */ + private String[] methodArgs; + + /** The result that returns from methodName. The plugin will set the value + * of fieldName to this. */ + private Object result; + + /** Optional flag to determine whether to overwrite the existing value in the + * NutchDocument fieldName if this is set to true. Default behavior is to + * add the value from calling methodName to existing values for fieldName. 
*/ + private boolean overwrite = false; + + /** Hadoop Configuration object to pass these values into the plugin. */ + private Configuration conf; + + /** + * The {@link ArbitraryIndexingFilter} filter object uses reflection + * to instantiate the configured class and invoke the configured method. + * It requires a few configuration settings for adding arbitrary fields + * and values to the NutchDocument as searchable fields. + * See {@code index.arbitrary.function.count}, and (possibly multiple + * instances when {@code index.arbitrary.function.count} > 1) of the following + * {@code index.arbitrary.fieldName}.index, + * {@code index.arbitrary.className}.index, + * {@code index.arbitrary.constructorArgs}.index, + * {@code index.arbitrary.methodName}.index, and + * {@code index.arbitrary.methodArgs}.index + * in nutch-default.xml or nutch-site.xml where index ranges from 0 + * to {@code index.arbitrary.function.count} - 1. + * + * If the configured class cannot be loaded, instantiated or invoked for a + * definition, the failure is logged and that definition is skipped for this + * document; the remaining definitions are still applied. + * + * @param doc + * The {@link NutchDocument} object + * @param parse + * The relevant {@link Parse} object passing through the filter + * @param url + * URL to be filtered by the user-specified class + * @param datum + * The {@link CrawlDatum} entry + * @param inlinks + * The {@link Inlinks} containing anchor text + * @return the NutchDocument with the configured fields added, or {@code doc} + * unchanged when {@code doc} or {@code url} is null + */ + @Override + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { + + Class theClass = null; + Method theMethod = null; + Constructor theConstructor = null; + Object instance = null; + + // This'll be quick + if (doc == null) { + LOG.debug("In filter() where doc is null for url == {}", + String.valueOf(url)); + return doc; + } else if (url == null) { + LOG.debug("In filter() where url is null. 
Nothing to do."); + return doc; + } + + int cfgCounter = 0; + while (cfgCounter < arbitraryAddsCount) { + setIndexedConf(conf,cfgCounter); + cfgCounter++; + try { + theClass = Class.forName(className); + if (methodArgs.length > 0) { + theMethod = theClass.getDeclaredMethod(methodName,String[].class); + } else { + theMethod = theClass.getMethod(methodName); + } + theConstructor = theClass.getDeclaredConstructor(String[].class); + } catch (Exception e) { + // Without class, method and constructor nothing can be invoked; skip this + // definition rather than reuse reflection handles from a previous pass. + LOG.error("Exception preparing reflection tasks. className was {}", + String.valueOf(className), e); + continue; + } + try { + constrArgs = new String[userConstrArgs.length + 1]; + constrArgs[0] = url.toString(); + System.arraycopy(userConstrArgs,0,constrArgs,1,userConstrArgs.length); + instance = theConstructor.newInstance(new Object[]{constrArgs}); + if (methodArgs.length > 0) { + result = theMethod.invoke(instance, new Object[]{methodArgs}); + } else { + result = theMethod.invoke(instance); + } + } catch (Exception e) { + LOG.error("Exception in reflection trying to instantiate/invoke. 
" + + "url was {} & className was {}", + String.valueOf(url), String.valueOf(className), e); + // constrArgs[0] is always the url; user-supplied args start at index 1 + if (constrArgs != null && constrArgs.length > 1) { + LOG.error("constrArgs[1] was {}", String.valueOf(constrArgs[1])); + } + LOG.error("methodName was {}", String.valueOf(methodName)); + if (methodArgs.length > 0) { + LOG.error("methodArgs[0] was {}", String.valueOf(methodArgs[0])); + } + // Do not index a stale result left over from an earlier invocation + continue; + } + + LOG.debug("{}.{}() returned {} for field {}.", className, + methodName, String.valueOf(result), String.valueOf(fieldName)); + + // If user chose to overwrite, remove existing value + if (overwrite) { + LOG.debug("overwrite == true for fieldName == {} ", fieldName); + if (doc.getFieldNames().contains(fieldName)) { + LOG.debug("Removing field '{}' from doc for overwrite", fieldName); + doc.removeField(fieldName); + } + } + if (result == null) { + LOG.debug("Call to {}.{} returned null", className, methodName); + if (overwrite) { + LOG.debug("{} has been cleared.", fieldName); + } + } + LOG.debug("Adding value '{}' for field '{}' to doc", result, fieldName); + doc.add(fieldName, result); + } + return doc; + } + + /** + * Set the {@link Configuration} object and read + * {@code index.arbitrary.function.count} (default 1), the number of field + * definitions that filter() will process. + */ + @Override + public void setConf(Configuration conf) { + this.conf = conf; + arbitraryAddsCount = conf.getInt("index.arbitrary.function.count",1); + LOG.info("Will process the first {} fieldName defs in config.", String.valueOf(arbitraryAddsCount)); + } + + /** + * Set the {@link Configuration} object for a specific set of values in the config + * + * @param conf + * The Configuration object holding values for the current arbitrary field. + * @param ndx + * The ordinal counter value for the current arbitrary field appended to the + * base property names in the xml configuration file. 
+ * @throws RuntimeException + * if the fieldName or className property for ndx is missing or empty + */ + public void setIndexedConf(Configuration conf, int ndx) { + LOG.debug("In setIndexedConf() where ndx was passed in as {}", String.valueOf(ndx)); + fieldName = conf.get("index.arbitrary.fieldName.".concat(String.valueOf(ndx))); + LOG.debug("Looking now for index.arbitrary.fieldname.{} which was: {}", + String.valueOf(ndx),String.valueOf(fieldName)); + + // isEmpty() compares content; == on Strings only compares references + if (fieldName == null || fieldName.isEmpty()) { + throw new RuntimeException ("Problem in configuration where the index.arbitrary.fieldName." + + String.valueOf(ndx) + " is missing."); + } + + className = conf.get("index.arbitrary.className.".concat(String.valueOf(ndx))); + if (className == null || className.isEmpty()) { + throw new RuntimeException ("Problem in configuration where the index.arbitrary.className." + + String.valueOf(ndx) + " is missing."); + } + + userConstrArgs = conf.getTrimmedStrings("index.arbitrary.constructorArgs.".concat(String.valueOf(ndx))); + methodName = conf.get("index.arbitrary.methodName.".concat(String.valueOf(ndx)),""); + methodArgs = conf.getTrimmedStrings("index.arbitrary.methodArgs.".concat(String.valueOf(ndx))); + overwrite = conf.getBoolean("index.arbitrary.overwrite.".concat(String.valueOf(ndx)),false); + if (overwrite) { + LOG.info("overwrite set == true for processing {}.", fieldName); + } + } + + /** + * Get the {@link Configuration} object */ + @Override + public Configuration getConf() { + return this.conf; + } +} diff --git a/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/package-info.java b/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/package-info.java new file mode 100644 index 0000000000..6e6d47513d --- /dev/null +++ b/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Indexing filter to add document arbitrary data to the index + * from the output of a user-specified class. + */ +package org.apache.nutch.indexer.arbitrary; + diff --git a/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/Echo.java b/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/Echo.java new file mode 100644 index 0000000000..52e8939ce5 --- /dev/null +++ b/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/Echo.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexer.arbitrary; + + +import java.io.PrintStream; + +/** + * Minimal POJO that keeps the second element of its constructor argument + * array and returns it from {@link #getText()}. + */ +public class Echo { + + private static PrintStream out = System.out; + /** Text captured from {@code args[1]} at construction time. */ + private String words; + + /** + * @param args + * argument array; {@code args[1]} is stored via {@code String.valueOf}. + * Index 0 is not read here. An array with fewer than two elements + * makes the {@code args[1]} access throw. + */ + public Echo(String args[]) { + super(); + words = String.valueOf(args[1]); + } + + /** + * @return the text stored by the constructor + */ + public String getText() { + return words; + } + + /** + * Builds an Echo from the command-line arguments and prints its text to + * standard output. + */ + public static void main(String[] args) { + Echo echo = new Echo(args); + out.println(echo.getText()); + } +} diff --git a/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/Multiplier.java b/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/Multiplier.java new file mode 100644 index 0000000000..38875d0daf --- /dev/null +++ b/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/Multiplier.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ +package org.apache.nutch.indexer.arbitrary; + +import java.io.PrintStream; + +/** + * Minimal POJO that multiplies String arguments parsed as floats and returns + * the product as a String. + */ +public class Multiplier { + /** Running product; starts at 1 and is never reset, so repeated + * getProduct() calls on one instance keep accumulating. */ + private float product = 1; + private static PrintStream err = System.err; + private static PrintStream out = System.out; + + /** + * @param args + * accepted but not read by this constructor + */ + public Multiplier(String args[]) { + super(); + } + + /** + * Multiplies the running product by each element of args, walking from the + * last element to the first. + * + * @param args + * values to parse with {@code Float.parseFloat} + * @return the product as a String; if an element cannot be parsed, a message + * is printed to standard error and the product accumulated so far + * is returned + */ + public String getProduct(String args[]) { + int i = args.length - 1; + try { + while (i >= 0) { + product = product * Float.parseFloat(args[i]); + i--; + } + } catch (NumberFormatException nfe) { + err.println("NumberFormatException while trying to parse " + String.valueOf(args[i])); + } + return String.valueOf(product); + } + + /** + * Prints the product of the command-line arguments to standard output. + */ + public static void main(String[] args) { + Multiplier mp = new Multiplier(args); + out.println(mp.getProduct(args)); + } +} diff --git a/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/TestArbitraryIndexingFilter.java b/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/TestArbitraryIndexingFilter.java new file mode 100644 index 0000000000..17f31b1839 --- /dev/null +++ b/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/TestArbitraryIndexingFilter.java @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.indexer.arbitrary; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.indexer.NutchField; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.lang.invoke.MethodHandles; + +/** + * Tests that the index-arbitrary filter can add a new field with an arbitrary + * value, supplement an existing field with an arbitrary value, and overwrite + * an existing field with an arbitrary value where it takes the arbitrary value + * from some POJO outside the normal Nutch codebase. + * + * @author Joe Gilvary + */ + +public class TestArbitraryIndexingFilter { + + Configuration conf; + Inlinks inlinks; + ParseImpl parse; + CrawlDatum crawlDatum; + Text url; + ArbitraryIndexingFilter filter; + NutchDocument doc; + + /** + * Creates the arguments later passed to filter(): default-constructed + * ParseImpl, CrawlDatum and Inlinks objects plus a fixed URL. + */ + @Before + public void setUp() throws Exception { + parse = new ParseImpl(); + url = new Text("http://nutch.apache.org/index.html"); + crawlDatum = new CrawlDatum(); + inlinks = new Inlinks(); + } + + + /** + * Test adding field with arbitrary content from POJO + * + * @throws Exception + */ + @Test + public void testAddingNewField() throws Exception { + conf = NutchConfiguration.create(); + conf.set("index.arbitrary.function.count","1"); + conf.set("index.arbitrary.fieldName.0","foo"); + conf.set("index.arbitrary.className.0","org.apache.nutch.indexer.arbitrary.Echo"); + conf.set("index.arbitrary.constructorArgs.0","Arbitrary text to add - bar"); + conf.set("index.arbitrary.methodName.0","getText"); + + filter = new ArbitraryIndexingFilter(); + Assert.assertNotNull("No filter exists for testAddingNewField",filter); + + filter.setConf(conf); + doc = new NutchDocument(); + + 
try { + filter.filter(doc, parse, url, crawlDatum, inlinks); + } catch (Exception e) { + e.printStackTrace(); + Assert.fail(e.getMessage()); + } + + Assert.assertNotNull(doc); + Assert.assertFalse("test if doc is not empty", doc.getFieldNames() + .isEmpty()); + Assert.assertTrue("test if doc has new field with arbitrary value", doc.getField("foo") + .getValues().contains("Arbitrary text to add - bar")); + } + + /** + * Test supplementing a doc field with arbitrary content from POJO + * + * @throws Exception + */ + @Test + public void testSupplementExistingField() throws Exception { + + conf = NutchConfiguration.create(); + conf.set("index.arbitrary.function.count","2"); + conf.set("index.arbitrary.fieldName.0","foo"); + conf.set("index.arbitrary.className.0","org.apache.nutch.indexer.arbitrary.Echo"); + conf.set("index.arbitrary.constructorArgs.0","Arbitrary text to add - bar"); + conf.set("index.arbitrary.methodName.0","getText"); + conf.set("index.arbitrary.fieldName.1","description"); + conf.set("index.arbitrary.className.1","org.apache.nutch.indexer.arbitrary.Multiplier"); + conf.set("index.arbitrary.constructorArgs.1",""); + conf.set("index.arbitrary.methodName.1","getProduct"); + // Multiplier multiplies these two values; the test expects "-3.14" further down + conf.set("index.arbitrary.methodArgs.1","-1,3.14"); + + filter = new ArbitraryIndexingFilter(); + Assert.assertNotNull("No filter exists for testSupplementExistingField", filter); + + filter.setConf(conf); + + doc = new NutchDocument(); + Assert.assertNotNull("doc doesn't exist", doc); + + doc.add("description","irrational"); + + Assert.assertFalse("doc is empty", doc.getFieldNames().isEmpty()); + + Assert.assertEquals("field description does not have exactly one value", 1, + doc.getField("description").getValues().size()); + + Assert.assertTrue("field description does not have initial value 'irrational'", + doc.getField("description").getValues().contains("irrational")); + + try { + filter.filter(doc, parse, url, crawlDatum, inlinks); + } catch (Exception e) { + e.printStackTrace(); 
+ Assert.fail(e.getMessage()); + } + + Assert.assertTrue("doc doesn't have new field with arbitrary value", + doc.getField("foo").getValues() + .contains("Arbitrary text to add - bar")); + + Assert.assertEquals("field description does not have 2 values", 2, + doc.getField("description").getValues().size()); + + Assert.assertTrue("field description original value gone", doc.getField("description") + .getValues().contains("irrational")); + + Assert.assertTrue("field description missing new value", doc.getField("description") + .getValues().contains("-3.14")); + } + + + /** + * Test overwriting a doc field with arbitrary content from POJO + * + * @throws Exception + */ + @Test + public void testOverwritingExistingField() throws Exception { + conf = NutchConfiguration.create(); + conf.set("index.arbitrary.function.count","3"); + conf.set("index.arbitrary.fieldName.0","foo"); + conf.set("index.arbitrary.className.0","org.apache.nutch.indexer.arbitrary.Echo"); + conf.set("index.arbitrary.constructorArgs.0","Arbitrary text to add - bar"); + conf.set("index.arbitrary.methodName.0","getText"); + conf.set("index.arbitrary.fieldName.1","description"); + conf.set("index.arbitrary.className.1","org.apache.nutch.indexer.arbitrary.Multiplier"); + conf.set("index.arbitrary.methodArgs.1","-1,3.14159265"); + conf.set("index.arbitrary.methodName.1","getProduct"); + conf.set("index.arbitrary.fieldName.2","philosopher"); + conf.set("index.arbitrary.className.2","org.apache.nutch.indexer.arbitrary.Echo"); + conf.set("index.arbitrary.constructorArgs.2","Popeye"); + conf.set("index.arbitrary.methodName.2","getText"); + // Only definition 2 requests overwrite; the assertions after filter() check 'philosopher' + conf.set("index.arbitrary.overwrite.2","true"); + + filter = new ArbitraryIndexingFilter(); + Assert.assertNotNull("No filter exists for testOverwritingExistingField",filter); + + filter.setConf(conf); + Assert.assertNotNull("conf does not exist",conf); + + doc = new NutchDocument(); + + Assert.assertNotNull("doc does not exist",doc); + + 
doc.add("description","irrational"); + doc.add("philosopher","Socrates"); + + Assert.assertEquals("field description does not have exactly one value", 1, doc.getField("description") + .getValues().size()); + + Assert.assertEquals("field philosopher does not have exactly one value", 1, doc.getField("philosopher") + .getValues().size()); + + Assert.assertTrue("field description does not have initial value 'irrational'", doc.getField("description") + .getValues().contains("irrational")); + + Assert.assertTrue("field philosopher does not have initial value 'Socrates'", doc.getField("philosopher") + .getValues().contains("Socrates")); + + try { + filter.filter(doc, parse, url, crawlDatum, inlinks); + } catch (Exception e) { + e.printStackTrace(System.out); + Assert.fail(e.getMessage()); + } + + Assert.assertNotNull(doc); + + Assert.assertEquals("field philosopher no longer has only one value", 1, doc.getField("philosopher") + .getValues().size()); + + Assert.assertFalse("field philosopher's original value 'Socrates' NOT overwritten", doc.getField("philosopher") + .getValues().contains("Socrates")); + + Assert.assertTrue("field philosopher does not have new value 'Popeye'", doc.getField("philosopher") + .getValues().contains("Popeye")); + } +}