1919
2020import org .apache .lucene .analysis .Analyzer ;
2121import org .apache .lucene .index .Term ;
22- import org .apache .lucene .search .*;
22+ import org .apache .lucene .search .BooleanClause ;
23+ import org .apache .lucene .search .BooleanQuery ;
24+ import org .apache .lucene .search .FuzzyQuery ;
25+ import org .apache .lucene .search .MatchAllDocsQuery ;
26+ import org .apache .lucene .search .PrefixQuery ;
27+ import org .apache .lucene .search .Query ;
2328import org .apache .lucene .util .QueryBuilder ;
2429import org .apache .lucene .util .Version ;
30+ import org .apache .lucene .util .automaton .LevenshteinAutomata ;
2531import org .elasticsearch .common .lucene .Lucene ;
2632
2733import java .util .Collections ;
2834import java .util .Map ;
2935
3036/**
31- * XSimpleQueryParser is used to parse human readable query syntax.
37+ * SimpleQueryParser is used to parse human readable query syntax.
3238 * <p>
3339 * The main idea behind this parser is that a person should be able to type
3440 * whatever they want to represent a query, and this parser will do its best
4652 * <li>'{@code -}' negates a single token: <tt>-token0</tt>
4753 * <li>'{@code "}' creates phrases of terms: <tt>"term1 term2 ..."</tt>
4854 * <li>'{@code *}' at the end of terms specifies prefix query: <tt>term*</tt>
55+ * <li>'{@code ~}N' at the end of terms specifies fuzzy query: <tt>term~1</tt>
56+ * <li>'{@code ~}N' at the end of phrases specifies near query: <tt>"term1 term2"~5</tt>
4957 * <li>'{@code (}' and '{@code )}' specifies precedence: <tt>token1 + (token2 | token3)</tt>
5058 * </ul>
5159 * <p>
@@ -114,6 +122,11 @@ public class XSimpleQueryParser extends QueryBuilder {
114122 public static final int ESCAPE_OPERATOR = 1 <<6 ;
115123 /** Enables {@code WHITESPACE} operators: ' ' '\n' '\r' '\t' */
116124 public static final int WHITESPACE_OPERATOR = 1 <<7 ;
125+ /** Enables {@code FUZZY} operators: (~) on single terms */
126+ public static final int FUZZY_OPERATOR = 1 <<8 ;
127+ /** Enables {@code NEAR} operators: (~) on phrases */
128+ public static final int NEAR_OPERATOR = 1 <<9 ;
129+
117130
118131 private BooleanClause .Occur defaultOperator = BooleanClause .Occur .SHOULD ;
119132
@@ -269,6 +282,7 @@ private void consumePhrase(State state) {
269282 int start = ++state .index ;
270283 int copied = 0 ;
271284 boolean escaped = false ;
285+ boolean hasSlop = false ;
272286
273287 while (state .index < state .length ) {
274288 if (!escaped ) {
@@ -282,10 +296,23 @@ private void consumePhrase(State state) {
282296
283297 continue ;
284298 } else if (state .data [state .index ] == '"' ) {
285- // this should be the end of the phrase
286- // all characters found will used for
287- // creating the phrase query
288- break ;
299+ // if there are still characters after the closing ", check for a
300+ // tilde
301+ if (state .length > (state .index + 1 ) &&
302+ state .data [state .index +1 ] == '~' &&
303+ (flags & NEAR_OPERATOR ) != 0 ) {
304+ state .index ++;
305+ // check for characters after the tilde
306+ if (state .length > (state .index + 1 )) {
307+ hasSlop = true ;
308+ }
309+ break ;
310+ } else {
311+ // this should be the end of the phrase
312+ // all characters found will be used for
313+ // creating the phrase query
314+ break ;
315+ }
289316 }
290317 }
291318
@@ -308,7 +335,12 @@ private void consumePhrase(State state) {
308335 // a complete phrase has been found and is parsed through
309336 // the analyzer from the given field
310337 String phrase = new String (state .buffer , 0 , copied );
311- Query branch = newPhraseQuery (phrase );
338+ Query branch ;
339+ if (hasSlop ) {
340+ branch = newPhraseQuery (phrase , parseFuzziness (state ));
341+ } else {
342+ branch = newPhraseQuery (phrase , 0 );
343+ }
312344 buildQueryTree (state , branch );
313345
314346 ++state .index ;
@@ -319,6 +351,7 @@ private void consumeToken(State state) {
319351 int copied = 0 ;
320352 boolean escaped = false ;
321353 boolean prefix = false ;
354+ boolean fuzzy = false ;
322355
323356 while (state .index < state .length ) {
324357 if (!escaped ) {
@@ -332,19 +365,14 @@ private void consumeToken(State state) {
332365 ++state .index ;
333366
334367 continue ;
335- } else if ((state .data [state .index ] == '"' && (flags & PHRASE_OPERATOR ) != 0 )
336- || (state .data [state .index ] == '|' && (flags & OR_OPERATOR ) != 0 )
337- || (state .data [state .index ] == '+' && (flags & AND_OPERATOR ) != 0 )
338- || (state .data [state .index ] == '(' && (flags & PRECEDENCE_OPERATORS ) != 0 )
339- || (state .data [state .index ] == ')' && (flags & PRECEDENCE_OPERATORS ) != 0 )
340- || ((state .data [state .index ] == ' '
341- || state .data [state .index ] == '\t'
342- || state .data [state .index ] == '\n'
343- || state .data [state .index ] == '\r' ) && (flags & WHITESPACE_OPERATOR ) != 0 )) {
368+ } else if (tokenFinished (state )) {
344369 // this should be the end of the term
345370 // all characters found will be used for
346371 // creating the term query
347372 break ;
373+ } else if (copied > 0 && state .data [state .index ] == '~' && (flags & FUZZY_OPERATOR ) != 0 ) {
374+ fuzzy = true ;
375+ break ;
348376 }
349377
350378 // wildcard tracks whether or not the last character
@@ -361,7 +389,17 @@ private void consumeToken(State state) {
361389 if (copied > 0 ) {
362390 final Query branch ;
363391
364- if (prefix ) {
392+ if (fuzzy && (flags & FUZZY_OPERATOR ) != 0 ) {
393+ String token = new String (state .buffer , 0 , copied );
394+ int fuzziness = parseFuzziness (state );
395+ // edit distance has a maximum, limit to the maximum supported
396+ fuzziness = Math .min (fuzziness , LevenshteinAutomata .MAXIMUM_SUPPORTED_DISTANCE );
397+ if (fuzziness == 0 ) {
398+ branch = newDefaultQuery (token );
399+ } else {
400+ branch = newFuzzyQuery (token , fuzziness );
401+ }
402+ } else if (prefix ) {
365403 // if a term is found with a closing '*' it is considered to be a prefix query
366404 // and will have prefix added as an option
367405 String token = new String (state .buffer , 0 , copied - 1 );
@@ -423,6 +461,60 @@ private void buildQueryTree(State state, Query branch) {
423461 }
424462 }
425463
464+ /**
465+ * Helper parsing fuzziness from parsing state
466+ * @return slop/edit distance, 0 in the case of non-parsing slop/edit string
467+ */
468+ private int parseFuzziness (State state ) {
469+ char slopText [] = new char [state .length ];
470+ int slopLength = 0 ;
471+
472+ if (state .data [state .index ] == '~' ) {
473+ while (state .index < state .length ) {
474+ state .index ++;
475+ // it's possible that the ~ was at the end, so check after incrementing
476+ // to make sure we don't go out of bounds
477+ if (state .index < state .length ) {
478+ if (tokenFinished (state )) {
479+ break ;
480+ }
481+ slopText [slopLength ] = state .data [state .index ];
482+ slopLength ++;
483+ }
484+ }
485+ int fuzziness = 0 ;
486+ try {
487+ fuzziness = Integer .parseInt (new String (slopText , 0 , slopLength ));
488+ } catch (NumberFormatException e ) {
489+ // swallow number format exceptions parsing fuzziness
490+ }
491+ // negative -> 0
492+ if (fuzziness < 0 ) {
493+ fuzziness = 0 ;
494+ }
495+ return fuzziness ;
496+ }
497+ return 0 ;
498+ }
499+
500+ /**
501+ * Helper returning true if the state has reached the end of token.
502+ */
503+ private boolean tokenFinished (State state ) {
504+ if ((state .data [state .index ] == '"' && (flags & PHRASE_OPERATOR ) != 0 )
505+ || (state .data [state .index ] == '|' && (flags & OR_OPERATOR ) != 0 )
506+ || (state .data [state .index ] == '+' && (flags & AND_OPERATOR ) != 0 )
507+ || (state .data [state .index ] == '(' && (flags & PRECEDENCE_OPERATORS ) != 0 )
508+ || (state .data [state .index ] == ')' && (flags & PRECEDENCE_OPERATORS ) != 0 )
509+ || ((state .data [state .index ] == ' '
510+ || state .data [state .index ] == '\t'
511+ || state .data [state .index ] == '\n'
512+ || state .data [state .index ] == '\r' ) && (flags & WHITESPACE_OPERATOR ) != 0 )) {
513+ return true ;
514+ }
515+ return false ;
516+ }
517+
426518 /**
427519 * Factory method to generate a standard query (no phrase or prefix operators).
428520 */
@@ -439,12 +531,27 @@ protected Query newDefaultQuery(String text) {
439531 }
440532
441533 /**
442- * Factory method to generate a phrase query.
534+ * Factory method to generate a fuzzy query.
535+ */
536+ protected Query newFuzzyQuery (String text , int fuzziness ) {
537+ BooleanQuery bq = new BooleanQuery (true );
538+ for (Map .Entry <String ,Float > entry : weights .entrySet ()) {
539+ Query q = new FuzzyQuery (new Term (entry .getKey (), text ), fuzziness );
540+ if (q != null ) {
541+ q .setBoost (entry .getValue ());
542+ bq .add (q , BooleanClause .Occur .SHOULD );
543+ }
544+ }
545+ return simplify (bq );
546+ }
547+
548+ /**
549+ * Factory method to generate a phrase query with slop.
443550 */
444- protected Query newPhraseQuery (String text ) {
551+ protected Query newPhraseQuery (String text , int slop ) {
445552 BooleanQuery bq = new BooleanQuery (true );
446553 for (Map .Entry <String ,Float > entry : weights .entrySet ()) {
447- Query q = createPhraseQuery (entry .getKey (), text );
554+ Query q = createPhraseQuery (entry .getKey (), text , slop );
448555 if (q != null ) {
449556 q .setBoost (entry .getValue ());
450557 bq .add (q , BooleanClause .Occur .SHOULD );
@@ -518,4 +625,3 @@ static class State {
518625 }
519626 }
520627}
521-
0 commit comments