diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index b20afdfe3f..8b24f092ae 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1882,6 +1882,30 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
compression codecs.
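+<property>
+  <name>urlfilter.fast.url.max.length</name>
+  <value>-1</value>
+  <description>Maximum allowed overall length of a URL; longer URLs are rejected by urlfilter-fast.
+  The default value of -1 deactivates this check.
+  </description>
+</property>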
+
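+<property>
+  <name>urlfilter.fast.url.path.max.length</name>
+  <value>-1</value>
+  <description>Maximum allowed length of the path element of a URL; URLs with a longer path are
+  rejected by urlfilter-fast. The default value of -1 deactivates this check.
+  </description>
+</property>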
+
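+<property>
+  <name>urlfilter.fast.url.query.max.length</name>
+  <value>-1</value>
+  <description>Maximum allowed length of the query element of a URL; URLs with a longer query are
+  rejected by urlfilter-fast. The default value of -1 deactivates this check.
+  </description>
+</property>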
+
urlfilter.order
diff --git a/src/plugin/urlfilter-fast/README.md b/src/plugin/urlfilter-fast/README.md
index 2e58605752..b4b0dfcd96 100644
--- a/src/plugin/urlfilter-fast/README.md
+++ b/src/plugin/urlfilter-fast/README.md
@@ -73,3 +73,9 @@ the end of the line.
The rules file is defined via the property `urlfilter.fast.file`,
the default name is `fast-urlfilter.txt`.
+
+In addition to this, the filter checks that the lengths of the URL's path and query elements
+do not exceed the values set in the properties `urlfilter.fast.url.path.max.length` and
+`urlfilter.fast.url.query.max.length`, if set. The overall length of the URL can also be used for
+filtering through the property `urlfilter.fast.url.max.length`.
+
diff --git a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
index bb4a11b7cb..b1e589a0e1 100644
--- a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
+++ b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
@@ -95,6 +95,9 @@
*
* The rules file is defined via the property urlfilter.fast.file
,
* the default name is fast-urlfilter.txt
.
+ *
+ * In addition, it can filter based on the length of the whole URL, its path element or
+ * its query element. See the <code>urlfilter.fast.url.*</code> configuration properties.
*/
public class FastURLFilter implements URLFilter {
@@ -103,21 +106,45 @@ public class FastURLFilter implements URLFilter {
private Configuration conf;
public static final String URLFILTER_FAST_FILE = "urlfilter.fast.file";
+ public static final String URLFILTER_FAST_MAX_LENGTH = "urlfilter.fast.url.max.length";
+ public static final String URLFILTER_FAST_PATH_MAX_LENGTH = "urlfilter.fast.url.path.max.length";
+ public static final String URLFILTER_FAST_QUERY_MAX_LENGTH = "urlfilter.fast.url.query.max.length";
+
private Multimap hostRules = LinkedHashMultimap.create();
private Multimap domainRules = LinkedHashMultimap.create();
+ /** Max allowed size of the path of a URL **/
+ private int maxLengthPath = -1;
+ /** Max allowed size of the query of a URL **/
+ private int maxLengthQuery = -1;
+ /** Max allowed size for the whole URL **/
+ private int maxLength = -1;
+
private static final Pattern CATCH_ALL_RULE = Pattern
.compile("^\\s*DenyPath(?:Query)?\\s+\\.[*?]\\s*$");
public FastURLFilter() {}
+ /** Package-private constructor used by the tests: the rules are loaded from the given reader, so the rules file doesn't have to be in the jar **/
FastURLFilter(Reader rules) throws IOException, PatternSyntaxException {
reloadRules(rules);
}
+
+  /** Test-only: reads the rules from the reader; keeps conf and takes the length limits from it. **/
+  FastURLFilter(Reader rules, Configuration conf) throws IOException, PatternSyntaxException {
+    this.conf = conf;
+    maxLengthPath = conf.getInt(URLFILTER_FAST_PATH_MAX_LENGTH, -1);
+    maxLengthQuery = conf.getInt(URLFILTER_FAST_QUERY_MAX_LENGTH, -1);
+    maxLength = conf.getInt(URLFILTER_FAST_MAX_LENGTH, -1);
+    reloadRules(rules);
+  }
@Override
public void setConf(Configuration conf) {
this.conf = conf;
+ maxLengthPath = conf.getInt(URLFILTER_FAST_PATH_MAX_LENGTH, -1);
+ maxLengthQuery = conf.getInt(URLFILTER_FAST_QUERY_MAX_LENGTH, -1);
+ maxLength = conf.getInt(URLFILTER_FAST_MAX_LENGTH, -1);
try {
reloadRules();
} catch (Exception e) {
@@ -134,6 +161,12 @@ public Configuration getConf() {
@Override
public String filter(String url) {
+ if (maxLength != -1 && url.length() > maxLength) {
+ LOG.debug("Rejected {} because URL length ({}) greater than limit {}", url,
+ url.length(), maxLength);
+ return null;
+ }
+
URL u;
try {
@@ -143,6 +176,22 @@ public String filter(String url) {
e.getMessage());
return null;
}
+
+ final String path = u.getPath();
+ if (maxLengthPath != -1 && path.length() > maxLengthPath)
+ {
+ LOG.debug("Rejected {} as path length {} is greater than {}", url,
+ path.length(), maxLengthPath);
+ return null;
+ }
+
+ final String query = u.getQuery();
+ if (maxLengthQuery != -1 && query != null && query.length() > maxLengthQuery)
+ {
+ LOG.debug("Rejected {} as query length {} is greater than {}", url,
+ query.length(), maxLengthQuery);
+ return null;
+ }
String hostname = u.getHost();
@@ -187,7 +236,6 @@ public String filter(String url) {
public void reloadRules() throws IOException {
String fileRules = conf.get(URLFILTER_FAST_FILE);
-
InputStream is;
Path fileRulesPath = new Path(fileRules);
@@ -200,11 +248,22 @@ public void reloadRules() throws IOException {
CompressionCodec codec = new CompressionCodecFactory(conf)
.getCodec(fileRulesPath);
- if (codec != null) {
+ if (codec != null && is != null) {
is = codec.createInputStream(is);
}
- reloadRules(new InputStreamReader(is));
+    try {
+      reloadRules(new InputStreamReader(is));
+    } catch (Exception e) {
+      String message = "Couldn't load the rules from " + fileRules;
+      // keep the original exception as the cause so the root failure stays diagnosable
+      LOG.error(message, e);
+      throw new IOException(message, e);
+    } finally {
+      if (is != null) {
+        is.close();
+      }
+    }
}
private void reloadRules(Reader rules) throws IOException {
diff --git a/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java b/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java
index 8e01d8d3cd..75b37250eb 100644
--- a/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java
+++ b/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java
@@ -19,13 +19,14 @@
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
+import java.io.StringReader;
+import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.URLFilter;
import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest;
import org.junit.Assert;
import org.junit.Test;
-
public class TestFastURLFilter extends RegexURLFilterBaseTest {
@Override
@@ -53,4 +54,39 @@ public void benchmark() {
bench(800, "fast-urlfilter-benchmark.txt", "Benchmarks.urls");
}
+  @Test
+  public void lengthQueryAndPath() throws Exception {
+    // empty rule set: only the path and query length limits are exercised
+    Configuration lengthConf = new Configuration();
+    lengthConf.setInt(FastURLFilter.URLFILTER_FAST_QUERY_MAX_LENGTH, 50);
+    lengthConf.setInt(FastURLFilter.URLFILTER_FAST_PATH_MAX_LENGTH, 50);
+    URLFilter lengthFilter = new FastURLFilter(new StringReader(""), lengthConf);
+    // path: "/" followed by the concatenated numbers 0..49, longer than the limit of 50
+    StringBuilder longPath = new StringBuilder("http://nutch.apache.org/");
+    for (int n = 0; n < 50; n++) {
+      longPath.append(n);
+    }
+    Assert.assertNull(lengthFilter.filter(longPath.toString()));
+
+    // query: the same run of numbers after "?", again longer than the limit of 50
+    StringBuilder longQuery = new StringBuilder("http://nutch.apache.org/path?");
+    for (int n = 0; n < 50; n++) {
+      longQuery.append(n);
+    }
+    Assert.assertNull(lengthFilter.filter(longQuery.toString()));
+  }
+
+  @Test
+  public void overalLengthTest() throws Exception {
+    // empty rule set: only the overall URL length limit is exercised
+    Configuration lengthConf = new Configuration();
+    lengthConf.setInt(FastURLFilter.URLFILTER_FAST_MAX_LENGTH, 100);
+    URLFilter lengthFilter = new FastURLFilter(new StringReader(""), lengthConf);
+    // appending the numbers 0..499 makes the URL far longer than 100 characters
+    StringBuilder longUrl = new StringBuilder("http://nutch.apache.org/");
+    for (int n = 0; n < 500; n++) {
+      longUrl.append(n);
+    }
+    Assert.assertNull(lengthFilter.filter(longUrl.toString()));
+  }
}