Add fuzzy/slop support to simple_query_string

dakrone · dakrone · commit d2078a5e287c · 2014-02-06T10:05:10.000-07:00
Ports the change from https://issues.apache.org/jira/browse/LUCENE-5410
diff --git a/docs/reference/query-dsl/queries/simple-query-string-query.asciidoc b/docs/reference/query-dsl/queries/simple-query-string-query.asciidoc
@@ -51,6 +51,8 @@ The `simple_query_string` supports the following special characters:
 * `"` wraps a number of tokens to signify a phrase for searching
 * `*` at the end of a term signifies a prefix query
 * `(` and `)` signify precedence
+* `~N` after a word signifies edit distance (fuzziness)
+* `~N` after a phrase signifies slop amount
 
 In order to search for any of these special characters, they will need to
 be escaped with `\`.
@@ -97,4 +99,4 @@ should be enabled. It is specified as a `|`-delimited string with the
 --------------------------------------------------
 
 The available flags are: `ALL`, `NONE`, `AND`, `OR`, `PREFIX`, `PHRASE`,
-`PRECEDENCE`, `ESCAPE`, and `WHITESPACE`.
+`PRECEDENCE`, `ESCAPE`, `WHITESPACE`, `FUZZY`, `NEAR`, and `SLOP`.
diff --git a/src/main/java/org/apache/lucene/queryparser/XSimpleQueryParser.java b/src/main/java/org/apache/lucene/queryparser/XSimpleQueryParser.java
@@ -19,16 +19,22 @@
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.search.*;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.FuzzyQuery;
+import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.Query;
 import org.apache.lucene.util.QueryBuilder;
 import org.apache.lucene.util.Version;
+import org.apache.lucene.util.automaton.LevenshteinAutomata;
 import org.elasticsearch.common.lucene.Lucene;
 
 import java.util.Collections;
 import java.util.Map;
 
 /**
- * XSimpleQueryParser is used to parse human readable query syntax.
+ * SimpleQueryParser is used to parse human readable query syntax.
  * <p>
  * The main idea behind this parser is that a person should be able to type
  * whatever they want to represent a query, and this parser will do its best
@@ -46,6 +52,8 @@
  *  <li>'{@code -}' negates a single token: <tt>-token0</tt>
  *  <li>'{@code "}' creates phrases of terms: <tt>"term1 term2 ..."</tt>
  *  <li>'{@code *}' at the end of terms specifies prefix query: <tt>term*</tt>
+ *  <li>'{@code ~}N' at the end of terms specifies fuzzy query: <tt>term~1</tt>
+ *  <li>'{@code ~}N' at the end of phrases specifies near query: <tt>"term1 term2"~5</tt>
  *  <li>'{@code (}' and '{@code )}' specifies precedence: <tt>token1 + (token2 | token3)</tt>
  * </ul>
  * <p>
@@ -114,6 +122,11 @@ public class XSimpleQueryParser extends QueryBuilder {
     public static final int ESCAPE_OPERATOR      = 1<<6;
     /** Enables {@code WHITESPACE} operators: ' ' '\n' '\r' '\t' */
     public static final int WHITESPACE_OPERATOR  = 1<<7;
+    /** Enables {@code FUZZY} operators: (~) on single terms */
+    public static final int FUZZY_OPERATOR       = 1<<8;
+    /** Enables {@code NEAR} operators: (~) on phrases */
+    public static final int NEAR_OPERATOR        = 1<<9;
+
 
     private BooleanClause.Occur defaultOperator = BooleanClause.Occur.SHOULD;
 
@@ -269,6 +282,7 @@ private void consumePhrase(State state) {
         int start = ++state.index;
         int copied = 0;
         boolean escaped = false;
+        boolean hasSlop = false;
 
         while (state.index < state.length) {
             if (!escaped) {
@@ -282,10 +296,23 @@ private void consumePhrase(State state) {
 
                     continue;
                 } else if (state.data[state.index] == '"') {
-                    // this should be the end of the phrase
-                    // all characters found will used for
-                    // creating the phrase query
-                    break;
+                    // if there are still characters after the closing ", check for a
+                    // tilde
+                    if (state.length > (state.index + 1) &&
+                            state.data[state.index+1] == '~' &&
+                            (flags & NEAR_OPERATOR) != 0) {
+                        state.index++;
+                        // check for characters after the tilde
+                        if (state.length > (state.index + 1)) {
+                            hasSlop = true;
+                        }
+                        break;
+                    } else {
+                        // this should be the end of the phrase
+                        // all characters found will be used for
+                        // creating the phrase query
+                        break;
+                    }
                 }
             }
 
@@ -308,7 +335,12 @@ private void consumePhrase(State state) {
             // a complete phrase has been found and is parsed through
             // through the analyzer from the given field
             String phrase = new String(state.buffer, 0, copied);
-            Query branch = newPhraseQuery(phrase);
+            Query branch;
+            if (hasSlop) {
+                branch = newPhraseQuery(phrase, parseFuzziness(state));
+            } else {
+                branch = newPhraseQuery(phrase, 0);
+            }
             buildQueryTree(state, branch);
 
             ++state.index;
@@ -319,6 +351,7 @@ private void consumeToken(State state) {
         int copied = 0;
         boolean escaped = false;
         boolean prefix = false;
+        boolean fuzzy = false;
 
         while (state.index < state.length) {
             if (!escaped) {
@@ -332,19 +365,14 @@ private void consumeToken(State state) {
                     ++state.index;
 
                     continue;
-                } else if ((state.data[state.index] == '"' && (flags & PHRASE_OPERATOR) != 0)
-                        || (state.data[state.index] == '|' && (flags & OR_OPERATOR) != 0)
-                        || (state.data[state.index] == '+' && (flags & AND_OPERATOR) != 0)
-                        || (state.data[state.index] == '(' && (flags & PRECEDENCE_OPERATORS) != 0)
-                        || (state.data[state.index] == ')' && (flags & PRECEDENCE_OPERATORS) != 0)
-                        || ((state.data[state.index] == ' '
-                        || state.data[state.index] == '\t'
-                        || state.data[state.index] == '\n'
-                        || state.data[state.index] == '\r') && (flags & WHITESPACE_OPERATOR) != 0)) {
+                } else if (tokenFinished(state)) {
                     // this should be the end of the term
                     // all characters found will used for
                     // creating the term query
                     break;
+                } else if (copied > 0 && state.data[state.index] == '~' && (flags & FUZZY_OPERATOR) != 0) {
+                    fuzzy = true;
+                    break;
                 }
 
                 // wildcard tracks whether or not the last character
@@ -361,7 +389,17 @@ private void consumeToken(State state) {
         if (copied > 0) {
             final Query branch;
 
-            if (prefix) {
+            if (fuzzy && (flags & FUZZY_OPERATOR) != 0) {
+                String token = new String(state.buffer, 0, copied);
+                int fuzziness = parseFuzziness(state);
+                // edit distance has a maximum, limit to the maximum supported
+                fuzziness = Math.min(fuzziness, LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
+                if (fuzziness == 0) {
+                    branch = newDefaultQuery(token);
+                } else {
+                    branch = newFuzzyQuery(token, fuzziness);
+                }
+            } else if (prefix) {
                 // if a term is found with a closing '*' it is considered to be a prefix query
                 // and will have prefix added as an option
                 String token = new String(state.buffer, 0, copied - 1);
@@ -423,6 +461,60 @@ private void buildQueryTree(State state, Query branch) {
         }
     }
 
+    /**
+     * Helper that parses the fuzziness (edit distance or slop) following a '~' at the current parsing position.
+     * @return slop/edit distance, or 0 if the slop/edit string is missing, negative, or not a valid integer
+     */
+    private int parseFuzziness(State state) {
+        char slopText[] = new char[state.length];
+        int slopLength = 0;
+
+        if (state.data[state.index] == '~') {
+            while (state.index < state.length) {
+                state.index++;
+                // it's possible that the ~ was at the end, so check after incrementing
+                // to make sure we don't go out of bounds
+                if (state.index < state.length) {
+                    if (tokenFinished(state)) {
+                        break;
+                    }
+                    slopText[slopLength] = state.data[state.index];
+                    slopLength++;
+                }
+            }
+            int fuzziness = 0;
+            try {
+                fuzziness = Integer.parseInt(new String(slopText, 0, slopLength));
+            } catch (NumberFormatException e) {
+                // swallow number format exceptions when parsing fuzziness; the value stays 0
+            }
+            // negative -> 0
+            if (fuzziness < 0) {
+                fuzziness = 0;
+            }
+            return fuzziness;
+        }
+        return 0;
+    }
+
+    /**
+     * Helper returning true if the state has reached the end of a token.
+     */
+    private boolean tokenFinished(State state) {
+        if ((state.data[state.index] == '"' && (flags & PHRASE_OPERATOR) != 0)
+                || (state.data[state.index] == '|' && (flags & OR_OPERATOR) != 0)
+                || (state.data[state.index] == '+' && (flags & AND_OPERATOR) != 0)
+                || (state.data[state.index] == '(' && (flags & PRECEDENCE_OPERATORS) != 0)
+                || (state.data[state.index] == ')' && (flags & PRECEDENCE_OPERATORS) != 0)
+                || ((state.data[state.index] == ' '
+                || state.data[state.index] == '\t'
+                || state.data[state.index] == '\n'
+                || state.data[state.index] == '\r') && (flags & WHITESPACE_OPERATOR) != 0)) {
+            return true;
+        }
+        return false;
+    }
+
     /**
      * Factory method to generate a standard query (no phrase or prefix operators).
      */
@@ -439,12 +531,27 @@ protected Query newDefaultQuery(String text) {
     }
 
     /**
-     * Factory method to generate a phrase query.
+     * Factory method to generate a fuzzy query.
+     */
+    protected Query newFuzzyQuery(String text, int fuzziness) {
+        BooleanQuery bq = new BooleanQuery(true);
+        for (Map.Entry<String,Float> entry : weights.entrySet()) {
+            Query q = new FuzzyQuery(new Term(entry.getKey(), text), fuzziness);
+            if (q != null) {
+                q.setBoost(entry.getValue());
+                bq.add(q, BooleanClause.Occur.SHOULD);
+            }
+        }
+        return simplify(bq);
+    }
+
+    /**
+     * Factory method to generate a phrase query with slop.
      */
-    protected Query newPhraseQuery(String text) {
+    protected Query newPhraseQuery(String text, int slop) {
         BooleanQuery bq = new BooleanQuery(true);
         for (Map.Entry<String,Float> entry : weights.entrySet()) {
-            Query q = createPhraseQuery(entry.getKey(), text);
+            Query q = createPhraseQuery(entry.getKey(), text, slop);
             if (q != null) {
                 q.setBoost(entry.getValue());
                 bq.add(q, BooleanClause.Occur.SHOULD);
@@ -518,4 +625,3 @@ static class State {
         }
     }
 }
-
diff --git a/src/main/java/org/elasticsearch/index/query/SimpleQueryStringFlag.java b/src/main/java/org/elasticsearch/index/query/SimpleQueryStringFlag.java
@@ -37,7 +37,11 @@ public enum SimpleQueryStringFlag {
     PHRASE(XSimpleQueryParser.PHRASE_OPERATOR),
     PRECEDENCE(XSimpleQueryParser.PRECEDENCE_OPERATORS),
     ESCAPE(XSimpleQueryParser.ESCAPE_OPERATOR),
-    WHITESPACE(XSimpleQueryParser.WHITESPACE_OPERATOR);
+    WHITESPACE(XSimpleQueryParser.WHITESPACE_OPERATOR),
+    FUZZY(XSimpleQueryParser.FUZZY_OPERATOR),
+    // NEAR and SLOP are synonymous, since "slop" is a more familiar term than "near"
+    NEAR(XSimpleQueryParser.NEAR_OPERATOR),
+    SLOP(XSimpleQueryParser.NEAR_OPERATOR);
 
     final int value;
 
diff --git a/src/main/java/org/elasticsearch/index/query/SimpleQueryStringParser.java b/src/main/java/org/elasticsearch/index/query/SimpleQueryStringParser.java
@@ -46,6 +46,8 @@
  * <li>'{@code "}' creates phrases of terms: <tt>"term1 term2 ..."</tt>
  * <li>'{@code *}' at the end of terms specifies prefix query: <tt>term*</tt>
  * <li>'{@code (}' and '{@code)}' specifies precedence: <tt>token1 + (token2 | token3)</tt>
+ * <li>'{@code ~}N' at the end of terms specifies fuzzy query: <tt>term~1</tt>
+ * <li>'{@code ~}N' at the end of phrases specifies near/slop query: <tt>"term1 term2"~5</tt>
  * </ul>
  * <p/>
  * See: {@link XSimpleQueryParser} for more information.
@@ -151,7 +153,7 @@ public Query parse(QueryParseContext parseContext) throws IOException, QueryPars
                 } else if ("flags".equals(currentFieldName)) {
                     if (parser.hasTextCharacters()) {
                         // Possible options are:
-                        // ALL, NONE, AND, OR, PREFIX, PHRASE, PRECEDENCE, ESCAPE, WHITESPACE
+                        // ALL, NONE, AND, OR, PREFIX, PHRASE, PRECEDENCE, ESCAPE, WHITESPACE, FUZZY, NEAR, SLOP
                         flags = SimpleQueryStringFlag.resolveFlags(parser.text());
                     } else {
                         flags = parser.intValue();