diff --git a/.gitignore b/.gitignore
index a53ac3d..90db62f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@
 *.iml
 \.*
 !.travis.yml
+*/target
diff --git a/README.md b/README.md
index 60eab6b..8bea758 100644
--- a/README.md
+++ b/README.md
@@ -143,6 +143,129 @@ Result
 }
 ```
+Elasticsearch's match_phrase query is very CPU-intensive, because it has to check the relative positions of the matched terms. To speed up such searches, this change optimizes the tokenization so that correct relative position information is preserved, which lets match_phrase queries run against word-tokenized text; in our tests, query cost dropped to below 10% of the original. The feature ships as a pair of analyzers, an index analyzer and a search analyzer, used for indexing and querying respectively.
+How it works: every term the tokenizer emits is assigned the position of its first character, so the inverted index keeps correct relative position information. The index analyzer cuts out every combination found in the dictionary, while the search analyzer cuts the text into the fewest terms that cover it, without overlap or duplicates. Because positions are character-anchored on both sides, the position gaps that match_phrase verifies still line up even though the two analyzers emit different token sets, as the sketch below demonstrates.
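+
+The position behavior can be reproduced at the Lucene level. The following is a minimal sketch based on the bundled FCPAnalyzerTest; `FcpDemo` is a hypothetical class name, `Configuration4Test` is the test stand-in for a real dictionary `Configuration`, and the expected output assumes 中国 and 平安 are in the main dictionary:
+
+```java
+import java.io.StringReader;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.wltea.analyzer.dic.Dictionary;
+import org.wltea.analyzer.fcp.Configuration4Test;
+import org.wltea.analyzer.fcp.FCPAnalyzer;
+
+public class FcpDemo {
+    public static void main(String[] args) throws Exception {
+        Dictionary.initial(new Configuration4Test()); // load the dictionary once
+        // true = index mode (all combinations); false = search mode (fewest terms)
+        try (FCPAnalyzer analyzer = new FCPAnalyzer(true);
+             TokenStream stream = analyzer.tokenStream("f", new StringReader("中国平安"))) {
+            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
+            PositionIncrementAttribute incr = stream.addAttribute(PositionIncrementAttribute.class);
+            stream.reset();
+            int position = -1;
+            while (stream.incrementToken()) {
+                position += incr.getPositionIncrement();
+                // index mode prints 中@0, 中国@0, 国@1, 平@2, 平安@2, 安@3:
+                // each term carries the position of its first character
+                System.out.println(term + "@" + position);
+            }
+            stream.end();
+        }
+    }
+}
+```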
+Usage:
+1. Define a text field with `analyzer` set to the index analyzer and `search_analyzer` set to the search analyzer; the analyzers accept optional settings, shown in the sketch after this list.
+2. Index your data.
+3. Query.
+4. Analyzers that anchor positions on the first character: fcp_index, fcp_search; on the last character: lcp_index, lcp_search.
+5. Limitation: the built-in highlighters do not currently support this tokenization.
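+
+The analyzers accept a few optional settings, read by FCPAnalyzerProvider: split_complete, max_token_length, useless_mapping, ignore_blank, use_first_position, show_offset (and index_mode for the generic provider). Below is a hypothetical index definition; the index and analyzer names are arbitrary, and the values shown are the defaults:
+
+```json
+PUT fcp_settings_demo
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_fcp_index": {
+          "type": "fcp_index",
+          "split_complete": false,
+          "max_token_length": 255,
+          "useless_mapping": true,
+          "ignore_blank": true
+        }
+      }
+    }
+  }
+}
+```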
+
+Principle
+ +```json +# 使用index 分词是,最细粒度的,按照字的position确定词的position,确定了position的取值标准 +POST /_analyze +{ + "analyzer": "fcp_index", + "text": "中国平安" +} +# response +{ + "tokens": [ + { + "token": "中", + "start_offset": 0, + "end_offset": 0, + "type": "", + "position": 0 + }, + { + "token": "中国", + "start_offset": 0, + "end_offset": 0, + "type": "", + "position": 0 + }, + { + "token": "国", + "start_offset": 0, + "end_offset": 0, + "type": "", + "position": 1 + }, + { + "token": "平", + "start_offset": 0, + "end_offset": 0, + "type": "", + "position": 2 + }, + { + "token": "平安", + "start_offset": 0, + "end_offset": 0, + "type": "", + "position": 2 + }, + { + "token": "安", + "start_offset": 0, + "end_offset": 0, + "type": "", + "position": 3 + } + ] +} +# 使用search 分词是粗粒度、无重叠分词,但仍按照字的position确定词的position,所以使用match_phrase有效 +POST /_analyze +{ + "analyzer": "fcp_search", + "text": "中国平安" +} +# response +{ + "tokens": [ + { + "token": "中国", + "start_offset": 0, + "end_offset": 2, + "type": "", + "position": 0 + }, + { + "token": "平安", + "start_offset": 2, + "end_offset": 4, + "type": "", + "position": 2 + } + ] +} +``` + +```json +PUT test_index +{ + "mappings": { + "properties": { + "content":{ + "type": "text", + "analyzer": "fcp_index", + "search_analyzer": "fcp_search" + } + } + } +} + +POST test_index/_doc/1 +{ + "content": "如果需要覆盖原来的配置" +} + +GET test_index/_search +{ + "query": { + "match_phrase": { + "content": { + "query": "要覆盖" + } + } + } +} +``` + # Dictionary Configuration Config file `IKAnalyzer.cfg.xml` can be located at `{conf}/analysis-ik/config/IKAnalyzer.cfg.xml` diff --git a/core/src/main/java/org/wltea/analyzer/dic/DictSegment.java b/core/src/main/java/org/wltea/analyzer/dic/DictSegment.java index 9e7b6fe..33e6013 100644 --- a/core/src/main/java/org/wltea/analyzer/dic/DictSegment.java +++ b/core/src/main/java/org/wltea/analyzer/dic/DictSegment.java @@ -32,7 +32,7 @@ /** * 词典树分段,表示词典树的一个分枝 */ -class DictSegment implements Comparable{ +public class DictSegment implements Comparable{ //公用字典表,存储汉字 private static final Map charMap = new ConcurrentHashMap(16 , 0.95f); @@ -55,7 +55,7 @@ class DictSegment implements Comparable{ private int nodeState = 0; - DictSegment(Character nodeChar){ + public DictSegment(Character nodeChar){ if(nodeChar == null){ throw new IllegalArgumentException("node char cannot be empty"); } @@ -78,7 +78,7 @@ boolean hasNextNode(){ * @param charArray * @return Hit */ - Hit match(char[] charArray){ + public Hit match(char[] charArray){ return this.match(charArray , 0 , charArray.length , null); } @@ -166,7 +166,7 @@ Hit match(char[] charArray , int begin , int length , Hit searchHit){ * 加载填充词典片段 * @param charArray */ - void fillSegment(char[] charArray){ + public void fillSegment(char[] charArray){ this.fillSegment(charArray, 0 , charArray.length , 1); } diff --git a/core/src/main/java/org/wltea/analyzer/dic/Dictionary.java b/core/src/main/java/org/wltea/analyzer/dic/Dictionary.java index 80a92da..a6d6027 100755 --- a/core/src/main/java/org/wltea/analyzer/dic/Dictionary.java +++ b/core/src/main/java/org/wltea/analyzer/dic/Dictionary.java @@ -126,6 +126,10 @@ private Dictionary(Configuration cfg) { } } + public DictSegment get_MainDict() { + return _MainDict; + } + private String getProperty(String key){ if(props!=null){ return props.getProperty(key); diff --git a/core/src/main/java/org/wltea/analyzer/fcp/CombineCharFilter.java b/core/src/main/java/org/wltea/analyzer/fcp/CombineCharFilter.java new file mode 100644 index 0000000..e3ad6c4 --- /dev/null +++ 
b/core/src/main/java/org/wltea/analyzer/fcp/CombineCharFilter.java @@ -0,0 +1,185 @@ +package org.wltea.analyzer.fcp; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.wltea.analyzer.fcp.util.CharacterUtil; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Queue; +import java.util.Set; + +/** + * combine continues english or number + */ +public class CombineCharFilter extends TokenFilter { + public static final int DEFAULT_MAX_WORD_LEN = 255; + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + + // used for saving upstream tokens , implemented by Arraylist + private List tokenBodies = null; + private Queue tokenResults = new LinkedList(); + // token 最大长度。防止过长English + private final int maxTokenLen; + + private static final Set numberDot; + static { + Set tmp = new HashSet<>(); + tmp.add("."); // 2.345 + tmp.add(","); // 1,234,567 + numberDot = Collections.unmodifiableSet(tmp); + } + + public CombineCharFilter(TokenStream input) { + super(input); + this.maxTokenLen = DEFAULT_MAX_WORD_LEN; + } + /** + * Construct a token stream filtering the given input. 
+ * + * @param input + * @param maxTokenLen + */ + public CombineCharFilter(TokenStream input, int maxTokenLen) { + super(input); + this.maxTokenLen = maxTokenLen; + } + + @Override + public final boolean incrementToken() throws IOException { + if (tokenBodies == null && input.incrementToken()) { + tokenBodies = new ArrayList<>(); + do { + TokenBody tb = new TokenBody( + termAtt.toString(), + offsetAtt.startOffset(), + offsetAtt.endOffset(), + typeAtt.type()); + tokenBodies.add(tb); + } while (input.incrementToken()); + + combineCharsByType(tokenBodies); + } + if (tokenResults.size() > 0) { + TokenBody body = tokenResults.poll(); + char[] chars = body.termBuffer.toCharArray(); + termAtt.copyBuffer(chars, 0, chars.length); + offsetAtt.setOffset(body.startOffset, body.endOffset); + typeAtt.setType(body.type); + posIncrAtt.setPositionIncrement(1); + return true; + } else { + tokenBodies = null; + } + return false; + } + + private void combineCharsByType(List tokenBodies) { + if (tokenBodies == null || tokenBodies.size() == 0) { + return; + } + // 处理合并 english number useless + List sameType = new ArrayList<>(); + for (int beginI = 0; beginI < tokenBodies.size();) { + int nextTypeIndex = getNextTypeIndex(tokenBodies, beginI); + TokenBody body = composeTokens(tokenBodies, beginI, nextTypeIndex, tokenBodies.get(beginI).type); + sameType.add(body); + beginI = nextTypeIndex; + } + // 继续处理 english number + for (int beginI = 0; beginI < sameType.size();) { + TokenBody current = sameType.get(beginI); + int nextI = beginI + 1; + if (CharacterUtil.CHAR_NUMBER.equals(current.type) || CharacterUtil.CHAR_ENGLISH.equals(current.type)) { + for(; nextI < sameType.size(); nextI++) { + TokenBody next = sameType.get(nextI); + if (CharacterUtil.CHAR_NUMBER.equals(next.type) + || CharacterUtil.CHAR_ENGLISH.equals(next.type)) { + current.type = CharacterUtil.ALPHANUM; + current.termBuffer = current.termBuffer + next.termBuffer; + current.endOffset = next.endOffset; + } else { + break; + } + } + } + beginI = nextI; + tokenResults.add(current); + } + + } + + private TokenBody composeTokens(List tokenBodies, int beginI, int nextTypeIndex, String type) { + StringBuffer buffer = new StringBuffer(); + int startOffset = tokenBodies.get(beginI).startOffset; + int endOffset = tokenBodies.get(nextTypeIndex - 1).endOffset; + for(int i = beginI; i < nextTypeIndex; i++) { + buffer.append(tokenBodies.get(i).termBuffer); + } + return new TokenBody(buffer.toString(), startOffset, endOffset, type); + } + + // 首 TokenBody 的 type 作为整体 + private int getNextTypeIndex(List tokenBodies,final int beginI) { + int currentIndex = beginI; + // 如果 currentIndex 为 tokenBodies 的最后一个位置,直接返回 + if (currentIndex == tokenBodies.size() - 1) { + return currentIndex + 1; + } + TokenBody current = tokenBodies.get(currentIndex); + final String currentWordType = current.type; + int maxIndex = Math.min(currentIndex + maxTokenLen, tokenBodies.size()); + if (CharacterUtil.CHAR_NUMBER.equals(currentWordType)) { + for (currentIndex++; currentIndex < maxIndex; currentIndex++) { + current = tokenBodies.get(currentIndex); + if (CharacterUtil.CHAR_USELESS.equals(current.type) && numberDot.contains(current.termBuffer)) { + if (currentIndex+1 < maxIndex && CharacterUtil.CHAR_NUMBER.equals(tokenBodies.get(currentIndex+1).type)) { + // 改变了整体的 type + tokenBodies.get(beginI).type = CharacterUtil.CHAR_NUMBER_DOT; + } else { + break; + } + } else if (!CharacterUtil.CHAR_NUMBER.equals(current.type)) { + break; + } + } + return currentIndex; + } else if 
(CharacterUtil.CHAR_ENGLISH.equals(currentWordType) || CharacterUtil.CHAR_USELESS.equals(currentWordType)) { + for (currentIndex++; currentIndex < maxIndex; currentIndex++) { + current = tokenBodies.get(currentIndex); + if (!currentWordType.equals(current.type)) { + break; + } + } + return currentIndex; + } else { + return currentIndex + 1; + } + } + + + private static class TokenBody { + String termBuffer; + int startOffset, endOffset; + String type; + + TokenBody(String termBuffer, int startOffset, int endOffset, String type){ + this.termBuffer = termBuffer; + this.startOffset = startOffset; + this.endOffset = endOffset; + this.type = type; + } + } +} diff --git a/core/src/main/java/org/wltea/analyzer/fcp/ExtendFilter.java b/core/src/main/java/org/wltea/analyzer/fcp/ExtendFilter.java new file mode 100644 index 0000000..be2451a --- /dev/null +++ b/core/src/main/java/org/wltea/analyzer/fcp/ExtendFilter.java @@ -0,0 +1,269 @@ +package org.wltea.analyzer.fcp; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.wltea.analyzer.dic.DictSegment; +import org.wltea.analyzer.dic.Dictionary; +import org.wltea.analyzer.dic.Hit; +import org.wltea.analyzer.fcp.util.CharacterUtil; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.PriorityQueue; + +/** + * use dict to extend terms + */ +public class ExtendFilter extends TokenFilter { + // 默认入库模式 + public static final boolean DEFAULT_INDEX_MODE = true; + // 默认对于特殊字符采用模糊搜索,扩大搜索范围 + public static final boolean DEFAULT_USELESS_MAPPING = true; + // 默认对于句子的空白进行忽略 + public static final boolean DEFAULT_IGNORE_BLANK = true; + // 默认使用 lcp 的模式,使用最后一个char的position + public static final boolean DEFAULT_USE_FIRST_POSITION = false; + // 在高亮的时候使用 offset + public static final boolean DEFAULT_SHOW_OFFSET = false; + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + + // used for saving upstream tokens , implemented by Arraylist + private List tokenBodies = null; + //use to save analyzed tokens ,use priority heap save order + PriorityQueue tokenResults = new PriorityQueue(new Comparator(){ + @Override + public int compare(TokenBody o1, TokenBody o2){ +// return o1.position != o2.position ? 
Integer.compare(o1.position, o2.position) : Integer.compare(o2.startOffset, o1.startOffset); + if(o1.position != o2.position) { + return Integer.compare(o1.position, o2.position); + } else if (o2.startOffset != o1.startOffset) { + return Integer.compare(o2.startOffset, o1.startOffset); + } else { + return Integer.compare(o1.endOffset-o1.startOffset, o2.endOffset-o2.startOffset); + } + } + }); + // 记录上一个 term 的position ,用于计算 positionIncrement + private int prePosition = -1; + + private final boolean indexMode; + // 对于上游的 分词结果 上个 end_offset 和 下一个 token的 start_offset 不相等。 像 “成 功” 之间有空格,该参数决定是否忽略空格组词, 默认为true,忽略之间的 空白 + private boolean ignoreBlank = true; + // 是否使用 first char position ,默认使用,如果为 false,则变为 lcp_analyzer + private boolean useFirstPos = true; + // 特殊字符的映射,默认为 true 表示模糊匹配特殊字符。如果设置为 false ,将会把原始的char放到最终分词结果中。 + private boolean uselessMapping = true; + // 入库模式下不显示,search 模式下显示offset,在 highlight 的时候也开启 + private boolean showOffset = false; + + + public ExtendFilter setIgnoreBlank(boolean ignoreBlank) { + this.ignoreBlank = ignoreBlank; + return this; + } + + + public ExtendFilter setUseFirstPos(boolean useFirstPos) { + this.useFirstPos = useFirstPos; + return this; + } + + public ExtendFilter setUselessMapping(boolean uselessMapping) { + this.uselessMapping = uselessMapping; + return this; + } + + public ExtendFilter setShowOffset(boolean showOffset) { + this.showOffset = showOffset; + return this; + } + + + /** + * Construct a token stream filtering the given input. + * + * @param input + */ + public ExtendFilter(TokenStream input) { + this(input, DEFAULT_INDEX_MODE); + } + + public ExtendFilter(TokenStream input, boolean indexMode) { + super(input); + this.indexMode = indexMode; + } + + @Override + public final boolean incrementToken() throws IOException { + if (tokenBodies == null && input.incrementToken()) { + tokenBodies = new ArrayList<>(); + int position = -1; + do { + TokenBody tb= new TokenBody(); + // TODO lcp analyzer 入库的特殊处理方式(不支持 offset 和 term_vector 存储方式),否则就要改变 lucene源码。 + tb.startOffset = showOffset ? offsetAtt.startOffset() : 0; + tb.endOffset = showOffset ? 
offsetAtt.endOffset() : 0; + // blank 类型会被舍弃,position不变 + tb.termBuffer = termAtt.toString(); + // 下面是处理 position 和 type的赋值 + if (CharacterUtil.CHAR_USELESS.equals(typeAtt.type())) { + if (isAllBlank(tb.termBuffer) && this.ignoreBlank) { + // 表示沿用上一个 position,下面将会被舍弃掉 + tb.position = position; + tb.type = CharacterUtil.CHAR_BLANK; + tb.termBuffer = ""; + } else { + position += posIncrAtt.getPositionIncrement(); + tb.position = position; + tb.type = typeAtt.type(); + if (uselessMapping) { + tb.termBuffer = "#"; // 无特殊含义,将特殊字符统一映射为 # 方便查询, 否则特殊字符也是需要精准匹配 + } + } + } else { + position += posIncrAtt.getPositionIncrement(); + tb.position = position; + tb.type = typeAtt.type(); + } + tokenBodies.add(tb); + } while (input.incrementToken()); + + extendTerms(tokenBodies, indexMode, ignoreBlank, useFirstPos); + } + if (tokenResults.size() > 0) { + TokenBody body = tokenResults.poll(); + + posIncrAtt.setPositionIncrement(body.position - prePosition); + prePosition = body.position; + char[] chars = body.termBuffer.toCharArray(); + termAtt.copyBuffer(chars, 0, chars.length); + offsetAtt.setOffset(body.startOffset, body.endOffset); + typeAtt.setType(body.type); + return true; + } else { + tokenBodies = null; + prePosition = -1; + } + return false; + } + + + /** + * 判断参数是否全部由空白字符(空格、制表符、换行……)组成 + * @param s + * @return + */ + private boolean isAllBlank(String s) { + return s.trim().length() == 0; + } + + private void extendTerms(List tokenBodies, boolean indexMode, boolean ignoreBlank, boolean useFirstPos) { + if (tokenBodies == null || tokenBodies.size() == 0) { + return; + } + for (int beginI = 0; beginI < tokenBodies.size(); beginI++) { + TokenBody tokenBody = tokenBodies.get(beginI); + if (!tokenBody.type.equals(CharacterUtil.CHAR_BLANK)) { + // 处理当前char, 但要考虑向后扩展,得到以当前位置开始 以 endList 中位置结束的一系列term, + List endList = getCurrentEndList(tokenBodies, beginI, ignoreBlank); + if (indexMode) { + tokenResults.add(tokenBody); + for (Integer endI : endList) { + TokenBody tb= new TokenBody(); + tb.termBuffer = combineTermBuffer(tokenBodies, beginI, endI); + tb.startOffset = tokenBodies.get(beginI).startOffset; + tb.endOffset = tokenBodies.get(endI).endOffset; + if (useFirstPos) { + tb.position = tokenBodies.get(beginI).position; + } else { + tb.position = tokenBodies.get(endI).position; + } + tb.type = CharacterUtil.COMBINE_WORD; + tokenResults.add(tb); + } + } else { + // 处理search analyzer 结果,贪婪向后匹配 + // 1,只有单字,加入单字 + // 2,有后缀匹配,采用最长的token结果(目的是找到个数最少的组合,非最优,但比较简单) + if (endList.isEmpty()) { + tokenResults.add(tokenBody); // 单字 + } else { + int lastEnd = endList.get(endList.size()-1); // 取最长token + tokenBody.termBuffer = combineTermBuffer(tokenBodies, beginI, lastEnd); + tokenBody.startOffset = tokenBodies.get(beginI).startOffset; + tokenBody.endOffset = tokenBodies.get(lastEnd).endOffset; + if (useFirstPos) { + tokenBody.position = tokenBodies.get(beginI).position; + } else { + tokenBody.position = tokenBodies.get(lastEnd).position; + } + tokenBody.type = CharacterUtil.COMBINE_WORD; + tokenResults.add(tokenBody); + + beginI = lastEnd; + } + } + } + } + } + + /** + * 以 begin 开始,但是不包含 begin + * @param tokenBodies + * @param begin + * @param ignoreBlank + * @return + */ + private List getCurrentEndList(List tokenBodies, int begin, boolean ignoreBlank) { + List endList = new ArrayList<>(); + DictSegment dict = Dictionary.getSingleton().get_MainDict(); + StringBuffer sb = new StringBuffer(tokenBodies.get(begin).termBuffer); + for (int j = begin+1; j < tokenBodies.size(); j++) { + TokenBody current = 
tokenBodies.get(j); + if (current.type.equals(CharacterUtil.CHAR_BLANK)) { + if(ignoreBlank) { + continue; + } else { + break; + } + } + // 处理 中文情况 + sb.append(current.termBuffer); + Hit hit = dict.match(sb.toString().toCharArray()); + if (hit.isUnmatch()) { + break; + } + if (hit.isMatch()) { + endList.add(j); + } + } +// System.out.println(endList); + return endList; + } + + /** + * 拼接 [begin, end] termBuffer + * @param tokenBodies + * @param begin + * @param end + * @return + */ + private String combineTermBuffer(List tokenBodies, int begin, int end) { + StringBuffer sb = new StringBuffer(tokenBodies.get(begin).termBuffer); + for(int i = begin+1; i <= end; i++) { + sb.append(tokenBodies.get(i).termBuffer); + } + return sb.toString(); + } + +} diff --git a/core/src/main/java/org/wltea/analyzer/fcp/FCPAnalyzer.java b/core/src/main/java/org/wltea/analyzer/fcp/FCPAnalyzer.java new file mode 100644 index 0000000..ec38154 --- /dev/null +++ b/core/src/main/java/org/wltea/analyzer/fcp/FCPAnalyzer.java @@ -0,0 +1,128 @@ +package org.wltea.analyzer.fcp; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.ngram.NGramTokenizer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; + + +public final class FCPAnalyzer extends Analyzer { + /** Default maximum allowed token length */ + public static final boolean DEFAULT_SPLIT_COMPLETE = false; + + // 决定分词时对 英文、数字 是否进行完全切分,默认为 false,表示数字和英文为一个整体,不会继续向下切分,完全切分的话 splitComplete = true + private boolean splitComplete = false; + // 默认为建立 索引模式, 如果为 查询模式 indexMode = false + private final boolean indexMode; + // 特殊字符的映射,默认为 true 表示模糊匹配特殊字符。如果设置为 false ,将会把原始的char放到最终分词结果中。 + private boolean uselessMapping = true; + // 默认文本是正确文本,其中的空白是有意义的,不能忽略空白。如果认为原文中的空白由于ETL错误引入,应该忽略空白。 + private boolean ignoreBlank = true; + // 是否使用 first char position ,默认使用,如果为 false,则变为 lcp_analyzer + private boolean useFirstPos = true; + // 是否显示 offset,默认随着 indexMode 变化 + private boolean showOffset; + + private int maxTokenLength = CombineCharFilter.DEFAULT_MAX_WORD_LEN; + + public FCPAnalyzer() { + this(ExtendFilter.DEFAULT_INDEX_MODE); + } + public FCPAnalyzer(boolean indexMode) { + this.indexMode = indexMode; + // 改变 showOffset 的默认值 + if (indexMode) { + showOffset = false; + } else { + showOffset = true; + } + } + + public FCPAnalyzer setIgnoreBlank(boolean ignoreBlank) { + this.ignoreBlank = ignoreBlank; + return this; + } + + public FCPAnalyzer setUselessMapping(boolean uselessMapping) { + this.uselessMapping = uselessMapping; + return this; + } + + public FCPAnalyzer setSplitComplete(boolean splitComplete) { + this.splitComplete = splitComplete; + return this; + } + + public FCPAnalyzer setShowOffset(boolean showOffset) { + this.showOffset = showOffset; + return this; + } + + public FCPAnalyzer setUseFirstPos(boolean useFirstPos) { + this.useFirstPos = useFirstPos; + return this; + } + + /** + * Set the max allowed token length. Tokens larger than this will be chopped + * up at this token length and emitted as multiple tokens. If you need to + * skip such large tokens, you could increase this max length, and then + * use {@code LengthFilter} to remove long tokens. The default is + * {@link StandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}. 
+ */ + public FCPAnalyzer setMaxTokenLength(int length) { + maxTokenLength = length; + return this; + } + + /** Returns the current maximum token length + * + * @see #setMaxTokenLength */ + public int getMaxTokenLength() { + return maxTokenLength; + } + + public boolean isIgnoreBlank() { + return ignoreBlank; + } + + + public boolean isIndexMode() { + return indexMode; + } + + public boolean isUseFirstPos() { + return useFirstPos; + } + + @Override + protected TokenStreamComponents createComponents(final String fieldName) { + final Tokenizer src = new NGramTokenizer(1, 1); + TokenStream tok = new FormatFilter(src); + if (!splitComplete) { + tok = new CombineCharFilter(tok, maxTokenLength); + } + + tok = new ExtendFilter(tok, indexMode) + .setShowOffset(showOffset) + .setIgnoreBlank(ignoreBlank) + .setUseFirstPos(useFirstPos) + .setUselessMapping(uselessMapping); + return new TokenStreamComponents(src, tok); + } + + @Override + public String toString() { + return "FCPAnalyzer{" + + "splitComplete=" + splitComplete + + ", indexMode=" + indexMode + + ", showOffset=" + showOffset + + ", uselessMapping=" + uselessMapping + + ", ignoreBlank=" + ignoreBlank + + ", useFirstPos=" + useFirstPos + + ", maxTokenLength=" + maxTokenLength + + '}'; + } + +} diff --git a/core/src/main/java/org/wltea/analyzer/fcp/FormatFilter.java b/core/src/main/java/org/wltea/analyzer/fcp/FormatFilter.java new file mode 100644 index 0000000..e85d6cb --- /dev/null +++ b/core/src/main/java/org/wltea/analyzer/fcp/FormatFilter.java @@ -0,0 +1,51 @@ +package org.wltea.analyzer.fcp; + +import org.apache.lucene.analysis.CharacterUtils; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.wltea.analyzer.fcp.util.CharacterUtil; + +import java.io.IOException; + +/** + * 英文转小写 + * 字符的类型处理 + */ +public class FormatFilter extends TokenFilter { + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + + /** + * Construct a token stream filtering the given input. 
+ * + * @param input + */ + public FormatFilter(TokenStream input) { + super(input); + } + + @Override + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { + String s = termAtt.toString(); + // 如果从 ngram 1 的 Tokenizer 得到的 token 应该length 都为 1 + if (s.length() == 1) { + int c = s.codePointAt(0); + typeAtt.setType(CharacterUtil.identifyCharType(c)); + c = CharacterUtil.regularize(c); + char[] chars = Character.toChars(c); + termAtt.copyBuffer(chars, 0, chars.length); + } else { + // 对英文进行 lower case + CharacterUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length()); + } + return true; + } else { + return false; + } + } + +} diff --git a/core/src/main/java/org/wltea/analyzer/fcp/OptionPath.java b/core/src/main/java/org/wltea/analyzer/fcp/OptionPath.java new file mode 100644 index 0000000..659469d --- /dev/null +++ b/core/src/main/java/org/wltea/analyzer/fcp/OptionPath.java @@ -0,0 +1,102 @@ +package org.wltea.analyzer.fcp; + +import java.util.Arrays; + +/** + * present a no conflict path for choose + */ +public class OptionPath implements Comparable { + private static final int DEFAULT_CAPACITY = 10; + int[] groups; + int size = 0; + int payloadLength = 0; + + OptionPath() { + groups = new int[DEFAULT_CAPACITY]; + } + + OptionPath(int capacity) { + assert capacity > 0; + groups = new int[capacity]; + } + + private OptionPath(int size, int[] groups) { + this.size = size; + int newCapacity = Math.max(size * 2, groups.length); + this.groups = Arrays.copyOf(groups, newCapacity); + } + + OptionPath copy() { + return new OptionPath(this.size, this.groups); + } + + void addElement(int startPosition, int endPosition) { + assert endPosition > startPosition; + this.size++; + if (this.size*2 >= this.groups.length) { + this.groups = Arrays.copyOf(this.groups, this.groups.length * 2); + } + this.payloadLength += (endPosition - startPosition + 1); + this.groups[size*2 - 2] = startPosition; + this.groups[size*2 - 1] = endPosition; + } + + int getValueByIndex(int index) { + assert -1 < index && index < this.groups.length; + return this.groups[index]; + } + + int getEndPosition(int startPosition) { + int endPosition = -1; + for(int i = 0; i < size && this.groups[2*i] <= startPosition; i++) { + if (startPosition == this.groups[2*i]) { + endPosition = this.groups[2*i + 1]; + } + } + return endPosition; + } + + int getPathLength() { + return this.groups[this.size*2+1] - this.groups[0]; + } + + int getPathEnd() { + return this.groups[size*2+1]; + } + + int getXWeight() { + int product = 1; + for(int i = 0; i < size; i++) { + product *= (this.groups[2*i+1] - this.groups[2*i]); + } + return product; + } + + int getPWeight() { + int pWeight = 0; + int p = 0; + for(int i = 0; i < size; i++) { + p++; + pWeight += p * (this.groups[2*i+1] - this.groups[2*i]); + } + return pWeight; + } + + // ik_smart 解决歧义问题的实现逻辑 + @Override + public int compareTo(OptionPath o) { + if (this.payloadLength != o.payloadLength) { + return Integer.compare(this.payloadLength, o.payloadLength); + } else if (this.size != o.size) { + return Integer.compare(this.size, o.size); + } else if (this.getPathLength() != o.getPathLength()) { + return Integer.compare(this.getPathLength(), o.getPathLength()); + } else if(this.getPathEnd() != o.getPathEnd()) { + return Integer.compare(this.getPathEnd(), o.getPathEnd()); + } else if (this.getXWeight() != o.getXWeight()) { + return Integer.compare(this.getXWeight(), o.getXWeight()); + } else { + return Integer.compare(this.getPWeight(), o.getPWeight()); + } + } +} diff 
--git a/core/src/main/java/org/wltea/analyzer/fcp/TokenBody.java b/core/src/main/java/org/wltea/analyzer/fcp/TokenBody.java new file mode 100644 index 0000000..6e9bcf4 --- /dev/null +++ b/core/src/main/java/org/wltea/analyzer/fcp/TokenBody.java @@ -0,0 +1,48 @@ +package org.wltea.analyzer.fcp; + +import java.util.List; + +/** + * compose term + */ +class TokenBody { + String termBuffer; + int startOffset, endOffset; + // position 用于表示在 elasticsearch 分词时得到的 position, 通过 curr.position - prev.position 得到 positionIncrement + int position; + // todo 未来startPosition、endPosition 用于收集 那些在 词库中 扩展出来的 token,主要给 ik_smart 使用 + int startPosition = -1, endPosition = -1; + String type; + + List child; + + TokenBody(){} + TokenBody(String termBuffer, int startOffset, int endOffset, int position, int startPosition, int endPosition, String type){ + this.termBuffer = termBuffer; + this.startOffset = startOffset; + this.endOffset = endOffset; + this.position = position; + this.startPosition = startPosition; + this.endPosition = endPosition; + this.type = type; + } + + + TokenBody copy() { + return new TokenBody(termBuffer, startOffset, endOffset, position, startPosition, endPosition, ""); + } + + @Override + public String toString() { + return "TokenBody{" + + "termBuffer='" + termBuffer + '\'' + + ", startOffset=" + startOffset + + ", endOffset=" + endOffset + + ", position=" + position + + ", startPosition=" + startPosition + + ", endPosition=" + endPosition + + ", type='" + type + '\'' + + ", child=" + child + + '}'; + } +} diff --git a/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttribute.java b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttribute.java new file mode 100644 index 0000000..2e0f632 --- /dev/null +++ b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttribute.java @@ -0,0 +1,34 @@ +package org.wltea.analyzer.fcp.tokenattributes; + +import org.apache.lucene.util.Attribute; + +/** Determines how many positions this + * token spans. Very few analyzer components actually + * produce this attribute, and indexing ignores it, but + * it's useful to express the graph structure naturally + * produced by decompounding, word splitting/joining, + * synonym filtering, etc. + * + *
NOTE: this is optional, and most analyzers + * don't change the default value (1). */ + +@Deprecated +public interface PositionLengthAttribute extends Attribute { + /** + * Set the position length of this Token. + *
+ * The default value is one. + * @param positionLength how many positions this token + * spans. + * @throws IllegalArgumentException if positionLength + * is zero or negative. + * @see #getPositionLength() + */ + public void setPositionLength(int positionLength); + + /** Returns the position length of this Token. + * @see #setPositionLength + */ + public int getPositionLength(); +} + diff --git a/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttributeImpl.java b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttributeImpl.java new file mode 100644 index 0000000..c4d5dff --- /dev/null +++ b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttributeImpl.java @@ -0,0 +1,63 @@ +package org.wltea.analyzer.fcp.tokenattributes; + + +import org.apache.lucene.util.AttributeImpl; +import org.apache.lucene.util.AttributeReflector; + +/** Default implementation of {@link PositionLengthAttribute}. */ +@Deprecated +public class PositionLengthAttributeImpl extends AttributeImpl implements PositionLengthAttribute, Cloneable { + private int positionLength = 1; + + /** Initializes this attribute with position length of 1. */ + public PositionLengthAttributeImpl() {} + + @Override + public void setPositionLength(int positionLength) { + if (positionLength < 1) { + throw new IllegalArgumentException("Position length must be 1 or greater; got " + positionLength); + } + this.positionLength = positionLength; + } + + @Override + public int getPositionLength() { + return positionLength; + } + + @Override + public void clear() { + this.positionLength = 1; + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } + + if (other instanceof PositionLengthAttributeImpl) { + PositionLengthAttributeImpl _other = (PositionLengthAttributeImpl) other; + return positionLength == _other.positionLength; + } + + return false; + } + + @Override + public int hashCode() { + return positionLength; + } + + @Override + public void copyTo(AttributeImpl target) { + PositionLengthAttribute t = (PositionLengthAttribute) target; + t.setPositionLength(positionLength); + } + + @Override + public void reflectWith(AttributeReflector reflector) { + reflector.reflect(PositionLengthAttribute.class, "positionLength", positionLength); + } +} + diff --git a/core/src/main/java/org/wltea/analyzer/fcp/util/CharacterUtil.java b/core/src/main/java/org/wltea/analyzer/fcp/util/CharacterUtil.java new file mode 100644 index 0000000..0f61896 --- /dev/null +++ b/core/src/main/java/org/wltea/analyzer/fcp/util/CharacterUtil.java @@ -0,0 +1,124 @@ +/** + * IK 中文分词 版本 5.0 + * IK Analyzer release 5.0 + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *
+ * Source code provided by Linliangyi (linliangyi2005@gmail.com)
+ * Copyright 2012, Oolong Studio
+ * provided by Linliangyi and copyright 2012 by Oolong studio
+ *
+ * Character type identification utility
+ */
+package org.wltea.analyzer.fcp.util;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Character type identification utility
+ */
+public class CharacterUtil {
+
+    // Token type labels: the angle-bracket spelling follows Lucene's token-type
+    // convention (e.g. "<ALPHANUM>"); any mutually distinct strings would work.
+    public static final String CHAR_USELESS = "<USELESS>";
+
+    public static final String CHAR_ENGLISH = "<ENGLISH>";
+
+    public static final String CHAR_NUMBER = "<NUMBER>";
+
+    public static final String CHAR_NUMBER_DOT = "<NUMBER_DOT>";
+
+    public static final String ALPHANUM = "<ALPHANUM>";
+
+    public static final String CHAR_CHINESE = "<CHINESE>";
+
+    public static final String COMBINE_WORD = "<COMBINE_WORD>";
+
+    public static final String CHAR_MAPPING = "<MAPPING>";
+
+    public static final String CHAR_BLANK = "<BLANK>";
+
+    // pinyin
+    public static final String CHAR_PINYIN = "<PINYIN>";
+    // pinyin prefix
+    public static final String CHAR_PINYIN_PRE = "<PINYIN_PRE>";
+
+    private static Map<String, Integer> order;
+    static {
+        // smaller values sort earlier; used to order different types that share the same position
+        order = new HashMap<>();
+        order.put(CHAR_CHINESE, 0);
+        order.put(CHAR_PINYIN_PRE, 5);
+        order.put(CHAR_PINYIN, 10);
+
+        order.put(CHAR_USELESS, 0);
+        order.put(CHAR_MAPPING, 10);
+    }
+
+    public static int getOrderByType(String type) {
+        return order.getOrDefault(type, 0);
+    }
+
+    /**
+     * Identify the type of a character
+     * @param input code point to classify
+     * @return one of the type constants defined by CharacterUtil
+     */
+    public static String identifyCharType(int input){
+        if (input >= '0' && input <= '9') {
+            return CHAR_NUMBER;
+        } else if ((input >= 'a' && input <= 'z')
+                || (input >= 'A' && input <= 'Z')) {
+            return CHAR_ENGLISH;
+        } else {
+            Character.UnicodeBlock ub = Character.UnicodeBlock.of(input);
+            if(ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
+                    || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
+                    || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A){
+                // currently recognized CJK ideograph blocks
+                return CHAR_CHINESE;
+            }
+        }
+        // all other characters are left unclassified
+        return CHAR_USELESS;
+    }
+
+    /**
+     * Normalize a character (full-width to half-width, upper case to lower case)
+     * @param input code point to normalize
+     * @return int the normalized code point
+     */
+    public static int regularize(int input){
+        if (input == 12288) {
+            input = 32;
+        }else if (input > 65280 && input < 65375) {
+            input = input - 65248;
+        }else if (input >= 'A' && input <= 'Z') {
+            input += 32;
+        }
+        return input;
+    }
+}
diff --git a/core/src/test/java/org/wltea/analyzer/fcp/Configuration4Test.java b/core/src/test/java/org/wltea/analyzer/fcp/Configuration4Test.java
new file mode 100644
index 0000000..34c6d8e
--- /dev/null
+++ b/core/src/test/java/org/wltea/analyzer/fcp/Configuration4Test.java
@@ -0,0 +1,29 @@
+package org.wltea.analyzer.fcp;
+
+import org.wltea.analyzer.cfg.Configuration;
+
+import java.io.File;
+import java.net.URI;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+/**
+ * @ClassName Configuration4Test
+ * @Description:
+ */
+public class Configuration4Test extends Configuration {
+    @Override
+    public Path getConfDir() {
+        return Paths.get("../", "config");
+    }
+
+    @Override
+    public Path getConfigInPluginDir() {
+        return Paths.get("../", "config");
+    }
+
+    @Override
+    public Path getPath(String first, String...
more) { + return Paths.get(first, more); + } +} diff --git a/core/src/test/java/org/wltea/analyzer/fcp/FCPAnalyzerTest.java b/core/src/test/java/org/wltea/analyzer/fcp/FCPAnalyzerTest.java new file mode 100644 index 0000000..b65507d --- /dev/null +++ b/core/src/test/java/org/wltea/analyzer/fcp/FCPAnalyzerTest.java @@ -0,0 +1,80 @@ +package org.wltea.analyzer.fcp; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.junit.Before; +import org.junit.Test; +import org.wltea.analyzer.dic.Dictionary; + +import java.io.IOException; +import java.io.StringReader; + +/** + * @ClassName FCPAnalyzerTest + * @Description: fcp test + */ +public class FCPAnalyzerTest { + + @Before + public void init() { + // 初始化词典 + Dictionary.initial(new Configuration4Test()); + } + + @Test + public void testFcpIndexAnalyzer() { + FCPAnalyzer fcpIndex = new FCPAnalyzer(true); + String str = "这里是中国, this is china #4.345^"; + TokenStream stream = null ; + try { + stream = fcpIndex.tokenStream( "any", new StringReader(str)) ; + PositionIncrementAttribute pia = stream.addAttribute(PositionIncrementAttribute.class ) ; //保存位置 + OffsetAttribute oa = stream.addAttribute(OffsetAttribute.class ) ; //保存辞与词之间偏移量 + CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class ) ;//保存响应词汇 + TypeAttribute ta = stream.addAttribute(TypeAttribute.class ) ; //保存类型 + stream.reset() ; + int position = -1; + while (stream.incrementToken()) { + position += pia.getPositionIncrement(); + System. out.println(position + ":[" + cta.toString() + "]:" + oa.startOffset() + "->" + oa.endOffset() + ":" + ta.type()); + } + stream.end() ; + } catch (IOException e) { + e.printStackTrace(); + } + + } + + @Test + public void testFcpSearchAnalyzer() { + FCPAnalyzer fcpSearch = new FCPAnalyzer(false); + String str = "这里是中国, this is china #4.345^"; + TokenStream stream = null ; + try { + stream = fcpSearch.tokenStream( "any", new StringReader(str)) ; + PositionIncrementAttribute pia = stream.addAttribute(PositionIncrementAttribute.class ) ; //保存位置 + OffsetAttribute oa = stream.addAttribute(OffsetAttribute.class ) ; //保存辞与词之间偏移量 + CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class ) ;//保存响应词汇 + TypeAttribute ta = stream.addAttribute(TypeAttribute.class ) ; //保存类型 + stream.reset() ; + int position = -1; + while (stream.incrementToken()) { + position += pia.getPositionIncrement(); + System. 
out.println(position + ":[" + cta.toString() + "]:" + oa.startOffset() + "->" + oa.endOffset() + ":" + ta.type()); + } + stream.end() ; + } catch (IOException e) { + e.printStackTrace(); + } + + } + + @Test + public void test03() { + String s = " \t \n"; + System.out.println(s.trim().length() == 0); + } +} diff --git a/elasticsearch/src/main/java/com/infinilabs/ik/elasticsearch/AnalysisIkPlugin.java b/elasticsearch/src/main/java/com/infinilabs/ik/elasticsearch/AnalysisIkPlugin.java index 54ee735..f906af6 100644 --- a/elasticsearch/src/main/java/com/infinilabs/ik/elasticsearch/AnalysisIkPlugin.java +++ b/elasticsearch/src/main/java/com/infinilabs/ik/elasticsearch/AnalysisIkPlugin.java @@ -33,6 +33,12 @@ public Map { + private final FCPAnalyzer analyzer; + + /** + * indexMode 作为重要的参数, + * @param indexSettings + * @param env + * @param name + * @param settings + * @param indexMode + */ + public FCPAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings, boolean indexMode) { + super(name, settings); + boolean splitComplete = settings.getAsBoolean("split_complete", FCPAnalyzer.DEFAULT_SPLIT_COMPLETE); + int maxTokenLength = settings.getAsInt("max_token_length", CombineCharFilter.DEFAULT_MAX_WORD_LEN); + boolean uselessMapping = settings.getAsBoolean("useless_mapping", ExtendFilter.DEFAULT_USELESS_MAPPING); + boolean ignoreBlank = settings.getAsBoolean("ignore_blank", ExtendFilter.DEFAULT_IGNORE_BLANK); + boolean useFirstPos = settings.getAsBoolean("use_first_position", ExtendFilter.DEFAULT_USE_FIRST_POSITION); + Boolean showOffset = settings.getAsBoolean("show_offset", null); + analyzer = new FCPAnalyzer(indexMode); + if (showOffset != null) { + analyzer.setShowOffset(showOffset); + } + analyzer.setSplitComplete(splitComplete); + analyzer.setUselessMapping(uselessMapping); + analyzer.setMaxTokenLength(maxTokenLength); + analyzer.setIgnoreBlank(ignoreBlank); + analyzer.setUseFirstPos(useFirstPos); + } + + public static FCPAnalyzerProvider getFCPAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { + boolean indexMode = settings.getAsBoolean("index_mode", ExtendFilter.DEFAULT_INDEX_MODE); + return new FCPAnalyzerProvider(indexSettings, env, name, settings, indexMode); + } + + public static FCPAnalyzerProvider getFCPIndexAnalyzer(IndexSettings indexSettings, Environment env, String name, Settings settings) { + boolean indexMode = true; + boolean useFirstPos = true; + FCPAnalyzerProvider provider = new FCPAnalyzerProvider(indexSettings, env, name, settings, indexMode); + FCPAnalyzer fcpAnalyzer = provider.get(); + fcpAnalyzer.setUseFirstPos(useFirstPos); + return provider; + } + + public static FCPAnalyzerProvider getFCPSearchAnalyzer(IndexSettings indexSettings, Environment env, String name, Settings settings) { + boolean indexMode = false; + boolean useFirstPos = true; + FCPAnalyzerProvider provider = new FCPAnalyzerProvider(indexSettings, env, name, settings, indexMode); + FCPAnalyzer fcpAnalyzer = provider.get(); + fcpAnalyzer.setUseFirstPos(useFirstPos); + return provider; + } + + public static FCPAnalyzerProvider getLCPIndexAnalyzer(IndexSettings indexSettings, Environment env, String name, Settings settings) { + boolean indexMode = true; + FCPAnalyzerProvider provider = new FCPAnalyzerProvider(indexSettings, env, name, settings, indexMode); + FCPAnalyzer fcpAnalyzer = provider.get(); + fcpAnalyzer.setUseFirstPos(false); + return provider; + } + + public static FCPAnalyzerProvider 
getLCPSearchAnalyzer(IndexSettings indexSettings, Environment env, String name, Settings settings) { + boolean indexMode = false; + FCPAnalyzerProvider provider = new FCPAnalyzerProvider(indexSettings, env, name, settings, indexMode); + FCPAnalyzer fcpAnalyzer = provider.get(); + fcpAnalyzer.setUseFirstPos(false); + return provider; + } + + @Override + public FCPAnalyzer get() { + return analyzer; + } +} \ No newline at end of file