diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index b20afdfe3f..8b24f092ae 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -1882,6 +1882,30 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this compression codecs. + + urlfilter.fast.url.max.length + -1 + Filters URLs based on their overall length. + The default value of -1 means that it is deactivated. + + + + + urlfilter.fast.url.path.max.length + -1 + Filters URLs based on the length of their path element. + The default value of -1 means that it is deactivated. + + + + + urlfilter.fast.url.query.max.length + -1 + Filters URLs based on the length of their query element. + The default value of -1 means that it is deactivated. + + + urlfilter.order diff --git a/src/plugin/urlfilter-fast/README.md b/src/plugin/urlfilter-fast/README.md index 2e58605752..b4b0dfcd96 100644 --- a/src/plugin/urlfilter-fast/README.md +++ b/src/plugin/urlfilter-fast/README.md @@ -73,3 +73,9 @@ the end of the line. The rules file is defined via the property `urlfilter.fast.file`, the default name is `fast-urlfilter.txt`. + +In addition to this, the filter checks that the length of the path element of the URL and its query +does not exceed the values set in the properties `urlfilter.fast.url.path.max.length` and +`urlfilter.fast.url.query.max.length` if set. The overall length of the URL can also be used for +filtering through the config `urlfilter.fast.url.max.length`. 
+ diff --git a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java index bb4a11b7cb..b1e589a0e1 100644 --- a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java +++ b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java @@ -95,6 +95,9 @@ * * The rules file is defined via the property urlfilter.fast.file, * the default name is fast-urlfilter.txt. + * + * In addition, it can filter based on the length of the whole URL, its path element or + * its query element. See urlfilter.fast.url.* configurations. */ public class FastURLFilter implements URLFilter { @@ -103,21 +106,45 @@ public class FastURLFilter implements URLFilter { private Configuration conf; public static final String URLFILTER_FAST_FILE = "urlfilter.fast.file"; + public static final String URLFILTER_FAST_MAX_LENGTH = "urlfilter.fast.url.max.length"; + public static final String URLFILTER_FAST_PATH_MAX_LENGTH = "urlfilter.fast.url.path.max.length"; + public static final String URLFILTER_FAST_QUERY_MAX_LENGTH = "urlfilter.fast.url.query.max.length"; + private Multimap hostRules = LinkedHashMultimap.create(); private Multimap domainRules = LinkedHashMultimap.create(); + /** Max allowed size of the path of a URL **/ + private int maxLengthPath = -1; + /** Max allowed size of the query of a URL **/ + private int maxLengthQuery = -1; + /** Max allowed size for the whole URL **/ + private int maxLength = -1; + private static final Pattern CATCH_ALL_RULE = Pattern .compile("^\\s*DenyPath(?:Query)?\\s+\\.[*?]\\s*$"); public FastURLFilter() {} + /** Used by the tests so that the rules file doesn't have to be in the jar **/ FastURLFilter(Reader rules) throws IOException, PatternSyntaxException { reloadRules(rules); } + + /** Used by the tests so that the rules file doesn't have to be in the jar AND + * we can set the conf for the 
length-based filtering **/ + FastURLFilter(Reader rules, Configuration conf) throws IOException, PatternSyntaxException { + maxLengthPath = conf.getInt(URLFILTER_FAST_PATH_MAX_LENGTH, -1); + maxLengthQuery = conf.getInt(URLFILTER_FAST_QUERY_MAX_LENGTH, -1); + maxLength = conf.getInt(URLFILTER_FAST_MAX_LENGTH, -1); + reloadRules(rules); + } @Override public void setConf(Configuration conf) { this.conf = conf; + maxLengthPath = conf.getInt(URLFILTER_FAST_PATH_MAX_LENGTH, -1); + maxLengthQuery = conf.getInt(URLFILTER_FAST_QUERY_MAX_LENGTH, -1); + maxLength = conf.getInt(URLFILTER_FAST_MAX_LENGTH, -1); try { reloadRules(); } catch (Exception e) { @@ -134,6 +161,12 @@ public Configuration getConf() { @Override public String filter(String url) { + if (maxLength != -1 && url.length() > maxLength) { + LOG.debug("Rejected {} because URL length ({}) greater than limit {}", url, + url.length(), maxLength); + return null; + } + URL u; try { @@ -143,6 +176,22 @@ public String filter(String url) { e.getMessage()); return null; } + + final String path = u.getPath(); + if (maxLengthPath != -1 && path.length() > maxLengthPath) + { + LOG.debug("Rejected {} as path length {} is greater than {}", url, + path.length(), maxLengthPath); + return null; + } + + final String query = u.getQuery(); + if (maxLengthQuery != -1 && query != null && query.length() > maxLengthQuery) + { + LOG.debug("Rejected {} as query length {} is greater than {}", url, + query.length(), maxLengthQuery); + return null; + } String hostname = u.getHost(); @@ -187,7 +236,6 @@ public String filter(String url) { public void reloadRules() throws IOException { String fileRules = conf.get(URLFILTER_FAST_FILE); - InputStream is; Path fileRulesPath = new Path(fileRules); @@ -200,11 +248,22 @@ public void reloadRules() throws IOException { CompressionCodec codec = new CompressionCodecFactory(conf) .getCodec(fileRulesPath); - if (codec != null) { + if (codec != null && is != null) { is = codec.createInputStream(is); } - 
reloadRules(new InputStreamReader(is)); + try { + reloadRules(new InputStreamReader(is)); + } catch (Exception e) { + String message = "Couldn't load the rules from " + fileRules; + LOG.error(message); + throw new IOException(message); + } + finally { + if (is != null) { + is.close(); + } + } } private void reloadRules(Reader rules) throws IOException { diff --git a/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java b/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java index 8e01d8d3cd..75b37250eb 100644 --- a/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java +++ b/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java @@ -19,13 +19,14 @@ import java.io.FileReader; import java.io.IOException; import java.io.Reader; +import java.io.StringReader; +import org.apache.hadoop.conf.Configuration; import org.apache.nutch.net.URLFilter; import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest; import org.junit.Assert; import org.junit.Test; - public class TestFastURLFilter extends RegexURLFilterBaseTest { @Override @@ -53,4 +54,39 @@ public void benchmark() { bench(800, "fast-urlfilter-benchmark.txt", "Benchmarks.urls"); } + @Test + public void lengthQueryAndPath() throws Exception { + Configuration conf = new Configuration(); + conf.setInt(FastURLFilter.URLFILTER_FAST_PATH_MAX_LENGTH, 50); + conf.setInt(FastURLFilter.URLFILTER_FAST_QUERY_MAX_LENGTH, 50); + // not interested in testing rules + URLFilter filter = new FastURLFilter(new StringReader(""), conf); + + StringBuilder url = new StringBuilder("http://nutch.apache.org/"); + for (int i = 0; i < 50; i++) { + url.append(i); + } + Assert.assertEquals(null, filter.filter(url.toString())); + + url = new StringBuilder("http://nutch.apache.org/path?"); + for (int i = 0; i < 50; i++) { + url.append(i); + } + + Assert.assertEquals(null, filter.filter(url.toString())); + } + 
+ @Test + public void overalLengthTest() throws Exception { + Configuration conf = new Configuration(); + conf.setInt(FastURLFilter.URLFILTER_FAST_MAX_LENGTH, 100); + // not interested in testing rules + URLFilter filter = new FastURLFilter(new StringReader(""), conf); + + StringBuilder url = new StringBuilder("http://nutch.apache.org/"); + for (int i = 0; i < 500; i++) { + url.append(i); + } + Assert.assertEquals(null, filter.filter(url.toString())); + } }