Skip to content

Commit d2078a5

Browse files
committed
Add fuzzy/slop support to simple_query_string
Ports the change from https://issues.apache.org/jira/browse/LUCENE-5410
1 parent f5a8de6 commit d2078a5

File tree

4 files changed

+138
-24
lines changed

4 files changed

+138
-24
lines changed

docs/reference/query-dsl/queries/simple-query-string-query.asciidoc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ The `simple_query_string` supports the following special characters:
5151
* `"` wraps a number of tokens to signify a phrase for searching
5252
* `*` at the end of a term signifies a prefix query
5353
* `(` and `)` signify precedence
54+
* `~N` after a word signifies edit distance (fuzziness)
55+
* `~N` after a phrase signifies slop amount
5456

5557
In order to search for any of these special characters, they will need to
5658
be escaped with `\`.
@@ -97,4 +99,4 @@ should be enabled. It is specified as a `|`-delimited string with the
9799
--------------------------------------------------
98100

99101
The available flags are: `ALL`, `NONE`, `AND`, `OR`, `PREFIX`, `PHRASE`,
100-
`PRECEDENCE`, `ESCAPE`, and `WHITESPACE`.
102+
`PRECEDENCE`, `ESCAPE`, `WHITESPACE`, `FUZZY`, `NEAR`, and `SLOP`.

src/main/java/org/apache/lucene/queryparser/XSimpleQueryParser.java

Lines changed: 127 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -19,16 +19,22 @@
1919

2020
import org.apache.lucene.analysis.Analyzer;
2121
import org.apache.lucene.index.Term;
22-
import org.apache.lucene.search.*;
22+
import org.apache.lucene.search.BooleanClause;
23+
import org.apache.lucene.search.BooleanQuery;
24+
import org.apache.lucene.search.FuzzyQuery;
25+
import org.apache.lucene.search.MatchAllDocsQuery;
26+
import org.apache.lucene.search.PrefixQuery;
27+
import org.apache.lucene.search.Query;
2328
import org.apache.lucene.util.QueryBuilder;
2429
import org.apache.lucene.util.Version;
30+
import org.apache.lucene.util.automaton.LevenshteinAutomata;
2531
import org.elasticsearch.common.lucene.Lucene;
2632

2733
import java.util.Collections;
2834
import java.util.Map;
2935

3036
/**
31-
* XSimpleQueryParser is used to parse human readable query syntax.
37+
* SimpleQueryParser is used to parse human readable query syntax.
3238
* <p>
3339
* The main idea behind this parser is that a person should be able to type
3440
* whatever they want to represent a query, and this parser will do its best
@@ -46,6 +52,8 @@
4652
* <li>'{@code -}' negates a single token: <tt>-token0</tt>
4753
* <li>'{@code "}' creates phrases of terms: <tt>"term1 term2 ..."</tt>
4854
* <li>'{@code *}' at the end of terms specifies prefix query: <tt>term*</tt>
55+
* <li>'{@code ~}N' at the end of terms specifies fuzzy query: <tt>term~1</tt>
56+
* <li>'{@code ~}N' at the end of phrases specifies near query: <tt>"term1 term2"~5</tt>
4957
* <li>'{@code (}' and '{@code )}' specifies precedence: <tt>token1 + (token2 | token3)</tt>
5058
* </ul>
5159
* <p>
@@ -114,6 +122,11 @@ public class XSimpleQueryParser extends QueryBuilder {
114122
public static final int ESCAPE_OPERATOR = 1<<6;
115123
/** Enables {@code WHITESPACE} operators: ' ' '\n' '\r' '\t' */
116124
public static final int WHITESPACE_OPERATOR = 1<<7;
125+
/** Enables {@code FUZZY} operators: (~) on single terms */
126+
public static final int FUZZY_OPERATOR = 1<<8;
127+
/** Enables {@code NEAR} operators: (~) on phrases */
128+
public static final int NEAR_OPERATOR = 1<<9;
129+
117130

118131
private BooleanClause.Occur defaultOperator = BooleanClause.Occur.SHOULD;
119132

@@ -269,6 +282,7 @@ private void consumePhrase(State state) {
269282
int start = ++state.index;
270283
int copied = 0;
271284
boolean escaped = false;
285+
boolean hasSlop = false;
272286

273287
while (state.index < state.length) {
274288
if (!escaped) {
@@ -282,10 +296,23 @@ private void consumePhrase(State state) {
282296

283297
continue;
284298
} else if (state.data[state.index] == '"') {
285-
// this should be the end of the phrase
286-
// all characters found will used for
287-
// creating the phrase query
288-
break;
299+
// if there are still characters after the closing ", check for a
300+
// tilde
301+
if (state.length > (state.index + 1) &&
302+
state.data[state.index+1] == '~' &&
303+
(flags & NEAR_OPERATOR) != 0) {
304+
state.index++;
305+
// check for characters after the tilde
306+
if (state.length > (state.index + 1)) {
307+
hasSlop = true;
308+
}
309+
break;
310+
} else {
311+
// this should be the end of the phrase
312+
// all characters found will be used for
313+
// creating the phrase query
314+
break;
315+
}
289316
}
290317
}
291318

@@ -308,7 +335,12 @@ private void consumePhrase(State state) {
308335
// a complete phrase has been found and is parsed through
309336
// the analyzer from the given field
310337
String phrase = new String(state.buffer, 0, copied);
311-
Query branch = newPhraseQuery(phrase);
338+
Query branch;
339+
if (hasSlop) {
340+
branch = newPhraseQuery(phrase, parseFuzziness(state));
341+
} else {
342+
branch = newPhraseQuery(phrase, 0);
343+
}
312344
buildQueryTree(state, branch);
313345

314346
++state.index;
@@ -319,6 +351,7 @@ private void consumeToken(State state) {
319351
int copied = 0;
320352
boolean escaped = false;
321353
boolean prefix = false;
354+
boolean fuzzy = false;
322355

323356
while (state.index < state.length) {
324357
if (!escaped) {
@@ -332,19 +365,14 @@ private void consumeToken(State state) {
332365
++state.index;
333366

334367
continue;
335-
} else if ((state.data[state.index] == '"' && (flags & PHRASE_OPERATOR) != 0)
336-
|| (state.data[state.index] == '|' && (flags & OR_OPERATOR) != 0)
337-
|| (state.data[state.index] == '+' && (flags & AND_OPERATOR) != 0)
338-
|| (state.data[state.index] == '(' && (flags & PRECEDENCE_OPERATORS) != 0)
339-
|| (state.data[state.index] == ')' && (flags & PRECEDENCE_OPERATORS) != 0)
340-
|| ((state.data[state.index] == ' '
341-
|| state.data[state.index] == '\t'
342-
|| state.data[state.index] == '\n'
343-
|| state.data[state.index] == '\r') && (flags & WHITESPACE_OPERATOR) != 0)) {
368+
} else if (tokenFinished(state)) {
344369
// this should be the end of the term
345370
// all characters found will be used for
346371
// creating the term query
347372
break;
373+
} else if (copied > 0 && state.data[state.index] == '~' && (flags & FUZZY_OPERATOR) != 0) {
374+
fuzzy = true;
375+
break;
348376
}
349377

350378
// wildcard tracks whether or not the last character
@@ -361,7 +389,17 @@ private void consumeToken(State state) {
361389
if (copied > 0) {
362390
final Query branch;
363391

364-
if (prefix) {
392+
if (fuzzy && (flags & FUZZY_OPERATOR) != 0) {
393+
String token = new String(state.buffer, 0, copied);
394+
int fuzziness = parseFuzziness(state);
395+
// edit distance has a maximum, limit to the maximum supported
396+
fuzziness = Math.min(fuzziness, LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
397+
if (fuzziness == 0) {
398+
branch = newDefaultQuery(token);
399+
} else {
400+
branch = newFuzzyQuery(token, fuzziness);
401+
}
402+
} else if (prefix) {
365403
// if a term is found with a closing '*' it is considered to be a prefix query
366404
// and will have prefix added as an option
367405
String token = new String(state.buffer, 0, copied - 1);
@@ -423,6 +461,60 @@ private void buildQueryTree(State state, Query branch) {
423461
}
424462
}
425463

464+
/**
465+
* Parses the slop/edit distance following a '~' at state.index, advancing state.index past the characters it reads
466+
* @return slop/edit distance; 0 if state.index is not at a '~', the text after it is not a valid integer, or the value is negative
467+
*/
468+
private int parseFuzziness(State state) {
469+
char slopText[] = new char[state.length];
470+
int slopLength = 0;
471+
472+
if (state.data[state.index] == '~') {
473+
while (state.index < state.length) {
474+
state.index++;
475+
// it's possible that the ~ was at the end, so check after incrementing
476+
// to make sure we don't go out of bounds
477+
if (state.index < state.length) {
478+
if (tokenFinished(state)) {
479+
break;
480+
}
481+
slopText[slopLength] = state.data[state.index];
482+
slopLength++;
483+
}
484+
}
485+
int fuzziness = 0;
486+
try {
487+
fuzziness = Integer.parseInt(new String(slopText, 0, slopLength));
488+
} catch (NumberFormatException e) {
489+
// not a valid integer: deliberately ignored so that fuzziness stays 0
490+
}
491+
// clamp negative values to 0
492+
if (fuzziness < 0) {
493+
fuzziness = 0;
494+
}
495+
return fuzziness;
496+
}
497+
return 0;
498+
}
499+
500+
/**
501+
* Returns true if the character at state.index ends a token: an enabled '"', '|', '+', '(' or ')' operator, or whitespace when WHITESPACE_OPERATOR is enabled.
502+
*/
503+
private boolean tokenFinished(State state) {
504+
if ((state.data[state.index] == '"' && (flags & PHRASE_OPERATOR) != 0)
505+
|| (state.data[state.index] == '|' && (flags & OR_OPERATOR) != 0)
506+
|| (state.data[state.index] == '+' && (flags & AND_OPERATOR) != 0)
507+
|| (state.data[state.index] == '(' && (flags & PRECEDENCE_OPERATORS) != 0)
508+
|| (state.data[state.index] == ')' && (flags & PRECEDENCE_OPERATORS) != 0)
509+
|| ((state.data[state.index] == ' '
510+
|| state.data[state.index] == '\t'
511+
|| state.data[state.index] == '\n'
512+
|| state.data[state.index] == '\r') && (flags & WHITESPACE_OPERATOR) != 0)) {
513+
return true;
514+
}
515+
return false;
516+
}
517+
426518
/**
427519
* Factory method to generate a standard query (no phrase or prefix operators).
428520
*/
@@ -439,12 +531,27 @@ protected Query newDefaultQuery(String text) {
439531
}
440532

441533
/**
442-
* Factory method to generate a phrase query.
534+
* Factory method to generate a fuzzy query: one FuzzyQuery per entry in weights (key used as the field, value as the boost), added as SHOULD clauses.
535+
*/
536+
protected Query newFuzzyQuery(String text, int fuzziness) {
537+
BooleanQuery bq = new BooleanQuery(true);
538+
for (Map.Entry<String,Float> entry : weights.entrySet()) {
539+
Query q = new FuzzyQuery(new Term(entry.getKey(), text), fuzziness);
540+
if (q != null) {
541+
q.setBoost(entry.getValue());
542+
bq.add(q, BooleanClause.Occur.SHOULD);
543+
}
544+
}
545+
return simplify(bq);
546+
}
547+
548+
/**
549+
* Factory method to generate a phrase query with slop.
443550
*/
444-
protected Query newPhraseQuery(String text) {
551+
protected Query newPhraseQuery(String text, int slop) {
445552
BooleanQuery bq = new BooleanQuery(true);
446553
for (Map.Entry<String,Float> entry : weights.entrySet()) {
447-
Query q = createPhraseQuery(entry.getKey(), text);
554+
Query q = createPhraseQuery(entry.getKey(), text, slop);
448555
if (q != null) {
449556
q.setBoost(entry.getValue());
450557
bq.add(q, BooleanClause.Occur.SHOULD);
@@ -518,4 +625,3 @@ static class State {
518625
}
519626
}
520627
}
521-

src/main/java/org/elasticsearch/index/query/SimpleQueryStringFlag.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,11 @@ public enum SimpleQueryStringFlag {
3737
PHRASE(XSimpleQueryParser.PHRASE_OPERATOR),
3838
PRECEDENCE(XSimpleQueryParser.PRECEDENCE_OPERATORS),
3939
ESCAPE(XSimpleQueryParser.ESCAPE_OPERATOR),
40-
WHITESPACE(XSimpleQueryParser.WHITESPACE_OPERATOR);
40+
WHITESPACE(XSimpleQueryParser.WHITESPACE_OPERATOR),
41+
FUZZY(XSimpleQueryParser.FUZZY_OPERATOR),
42+
// NEAR and SLOP are synonymous, since "slop" is a more familiar term than "near"
43+
NEAR(XSimpleQueryParser.NEAR_OPERATOR),
44+
SLOP(XSimpleQueryParser.NEAR_OPERATOR);
4145

4246
final int value;
4347

src/main/java/org/elasticsearch/index/query/SimpleQueryStringParser.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@
4646
* <li>'{@code "}' creates phrases of terms: <tt>"term1 term2 ..."</tt>
4747
* <li>'{@code *}' at the end of terms specifies prefix query: <tt>term*</tt>
4848
* <li>'{@code (}' and '{@code )}' specifies precedence: <tt>token1 + (token2 | token3)</tt>
49+
* <li>'{@code ~}N' at the end of terms specifies fuzzy query: <tt>term~1</tt>
50+
* <li>'{@code ~}N' at the end of phrases specifies near/slop query: <tt>"term1 term2"~5</tt>
4951
* </ul>
5052
* <p/>
5153
* See: {@link XSimpleQueryParser} for more information.
@@ -151,7 +153,7 @@ public Query parse(QueryParseContext parseContext) throws IOException, QueryPars
151153
} else if ("flags".equals(currentFieldName)) {
152154
if (parser.hasTextCharacters()) {
153155
// Possible options are:
154-
// ALL, NONE, AND, OR, PREFIX, PHRASE, PRECEDENCE, ESCAPE, WHITESPACE
156+
// ALL, NONE, AND, OR, PREFIX, PHRASE, PRECEDENCE, ESCAPE, WHITESPACE, FUZZY, NEAR, SLOP
155157
flags = SimpleQueryStringFlag.resolveFlags(parser.text());
156158
} else {
157159
flags = parser.intValue();

0 commit comments

Comments
 (0)