Learning Solr 5 with Yida: Pinyin Tokenization [Improved Edition]


      An earlier post described how to implement a custom pinyin tokenizer, but it only handled full pinyin and still had a few bugs. Over the three-day holiday for the 70th-anniversary V-Day parade I picked the code up again and improved it. The improvements include: support for full pinyin, initial-letter (short) pinyin, and full + short pinyin combined; a switch controlling whether Chinese characters and digits go through NGram processing; and a configurable NGram length range. I am sharing the updated version here; if anything is off, corrections are welcome.

      Enough talk; straight to the code.
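      One note before the listings: the classes below reference three small helpers in org.apache.lucene.analysis.pinyin.utils that the post does not list — Constant (default values for the various switches), StringUtils (character-class checks) and Pinyin4jUtil (the actual Chinese-to-pinyin conversion). The sketch below shows roughly how Pinyin4jUtil could be written on top of the pinyin4j library; it is my reconstruction under that assumption, not the author's code — only the class name and the two method names come from the filter source.

package org.apache.lucene.analysis.pinyin.utils;

import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.Set;

import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

/**
 * Sketch (assumption, based on pinyin4j): converts a Chinese string into every full-pinyin
 * spelling (getPinyinCollection) or every first-letter spelling (getPinyinShortCollection),
 * expanding polyphonic characters into all combinations. Non-Chinese characters pass through.
 */
public class Pinyin4jUtil {
    private static final HanyuPinyinOutputFormat FORMAT = new HanyuPinyinOutputFormat();
    static {
        FORMAT.setCaseType(HanyuPinyinCaseType.LOWERCASE);
        FORMAT.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
        FORMAT.setVCharType(HanyuPinyinVCharType.WITH_V);
    }

    /** Full pinyin, e.g. "重庆" -> [zhongqing, chongqing]. */
    public static Collection<String> getPinyinCollection(String chinese) {
        return expand(chinese, false);
    }

    /** First letters only, e.g. "重庆" -> [zq, cq]. */
    public static Collection<String> getPinyinShortCollection(String chinese) {
        return expand(chinese, true);
    }

    private static Collection<String> expand(String chinese, boolean shortPinyin) {
        Set<String> results = new LinkedHashSet<String>();
        results.add("");
        for (char c : chinese.toCharArray()) {
            // Candidate readings for this character (several for polyphones)
            Set<String> candidates = new LinkedHashSet<String>();
            String[] pinyins = null;
            try {
                pinyins = PinyinHelper.toHanyuPinyinStringArray(c, FORMAT);
            } catch (BadHanyuPinyinOutputFormatCombination e) {
                // fall through and treat the character literally
            }
            if (pinyins == null || pinyins.length == 0) {
                candidates.add(String.valueOf(c)); // not a Chinese character
            } else {
                for (String py : pinyins) {
                    candidates.add(shortPinyin ? py.substring(0, 1) : py);
                }
            }
            // Cartesian product with the spellings built so far
            // (no cap on combinations here; real code may want to limit this)
            Set<String> next = new LinkedHashSet<String>();
            for (String prefix : results) {
                for (String candidate : candidates) {
                    next.add(prefix + candidate);
                }
            }
            results = next;
        }
        return results;
    }
}

      The first real listing, PinyinTokenFilter, is the filter that turns Chinese terms into pinyin: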

package org.apache.lucene.analysis.pinyin.lucene5;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.pinyin.utils.Constant;
import org.apache.lucene.analysis.pinyin.utils.Pinyin4jUtil;
import org.apache.lucene.analysis.pinyin.utils.StringUtils;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

/**
 * Pinyin filter: converts Chinese terms into pinyin.
 * @author Lanxiaowei
 */
public class PinyinTokenFilter extends TokenFilter {
    /** Whether to also emit the original Chinese term */
    private boolean isOutChinese;
    /** Whether to emit only the initial-letter (short) pinyin */
    private boolean shortPinyin;
    /** Whether to emit both full pinyin and short pinyin */
    private boolean pinyinAll;
    /** Minimum Chinese length; by default only terms with at least 2 Chinese characters are converted */
    private int minTermLength;
    /** Buffer for the current input term */
    private char[] curTermBuffer;
    /** Length of the current input term */
    private int curTermLength;
    private final CharTermAttribute termAtt = (CharTermAttribute) addAttribute(CharTermAttribute.class);
    /** Position increment attribute */
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    /** Whether the current input term has already been emitted */
    private boolean hasCurOut;
    /** Pinyin result collection */
    private Collection<String> terms;
    /** Iterator over the pinyin results */
    private Iterator<String> termIte;

    public PinyinTokenFilter(TokenStream input) {
        this(input, Constant.DEFAULT_MIN_TERM_LRNGTH);
    }

    public PinyinTokenFilter(TokenStream input, int minTermLength) {
        this(input, Constant.DEFAULT_SHORT_PINYIN, Constant.DEFAULT_PINYIN_ALL, minTermLength);
    }

    public PinyinTokenFilter(TokenStream input, boolean shortPinyin) {
        this(input, shortPinyin, Constant.DEFAULT_PINYIN_ALL);
    }

    public PinyinTokenFilter(TokenStream input, boolean shortPinyin, boolean pinyinAll) {
        this(input, shortPinyin, pinyinAll, Constant.DEFAULT_MIN_TERM_LRNGTH);
    }

    public PinyinTokenFilter(TokenStream input, boolean shortPinyin, boolean pinyinAll, int minTermLength) {
        this(input, shortPinyin, pinyinAll, Constant.DEFAULT_OUT_CHINESE, minTermLength);
    }

    public PinyinTokenFilter(TokenStream input, boolean shortPinyin, boolean pinyinAll,
            boolean isOutChinese, int minTermLength) {
        super(input);
        this.minTermLength = minTermLength;
        if (this.minTermLength < 1) {
            this.minTermLength = 1;
        }
        this.isOutChinese = isOutChinese;
        this.shortPinyin = shortPinyin;
        this.pinyinAll = pinyinAll;
        // Offset attribute
        addAttribute(OffsetAttribute.class);
    }

    @Override
    public final boolean incrementToken() throws IOException {
        while (true) {
            // Either we are just starting, or the previous input term has been fully processed
            if (this.curTermBuffer == null) {
                // Pull the next input term
                if (!this.input.incrementToken()) {
                    // No more input: we are done, return false to end the caller's loop
                    return false;
                }
                // Cache the input term
                this.curTermBuffer = ((char[]) this.termAtt.buffer().clone());
                this.curTermLength = this.termAtt.length();
            }
            String chinese = this.termAtt.toString();
            // Emit the original input term first
            if ((this.isOutChinese) && (!this.hasCurOut) && (this.termIte == null)) {
                // Emitting the original term is allowed, it has not been emitted yet,
                // and the pinyin results have not been processed yet.
                // Mark it so the next loop iteration does not emit it again
                this.hasCurOut = true;
                // Write the original input term
                this.termAtt.copyBuffer(this.curTermBuffer, 0, this.curTermLength);
                this.posIncrAtt.setPositionIncrement(this.posIncrAtt.getPositionIncrement());
                this.typeAtt.setType(StringUtils.isNumeric(chinese) ? "numeric_original"
                        : (StringUtils.containsChinese(chinese) ? "chinese_original" : "normal_word"));
                return true;
            }
            String type = this.typeAtt.type();
            // The term contains Chinese and its Chinese length is at least minTermLength
            if (StringUtils.chineseCharCount(chinese) >= this.minTermLength) {
                // Full pinyin + short pinyin
                if (this.pinyinAll) {
                    Collection<String> quanpinColl = Pinyin4jUtil.getPinyinCollection(chinese);
                    quanpinColl.addAll(Pinyin4jUtil.getPinyinShortCollection(chinese));
                    this.terms = quanpinColl;
                } else {
                    // Short pinyin or full pinyin, one of the two
                    this.terms = this.shortPinyin
                            ? Pinyin4jUtil.getPinyinShortCollection(chinese)
                            : Pinyin4jUtil.getPinyinCollection(chinese);
                }
                if (this.terms != null) {
                    this.termIte = this.terms.iterator();
                }
            } else {
                if (null != type && ("numeric_original".equals(type) || "normal_word".equals(type))) {
                    Collection<String> coll = new ArrayList<String>();
                    coll.add(chinese);
                    this.terms = coll;
                    if (this.terms != null) {
                        this.termIte = this.terms.iterator();
                    }
                }
            }
            if (this.termIte != null) {
                // There is a pinyin result collection that has not been fully emitted yet
                while (this.termIte.hasNext()) {
                    String pinyin = this.termIte.next();
                    this.termAtt.copyBuffer(pinyin.toCharArray(), 0, pinyin.length());
                    // Same trick as synonyms: zero position increment
                    this.posIncrAtt.setPositionIncrement(0);
                    this.typeAtt.setType(this.shortPinyin ? "short_pinyin" : "pinyin");
                    return true;
                }
            }
            // No Chinese, or the pinyin conversion failed: nothing to do,
            // clear the cache and fetch a new term next time
            this.curTermBuffer = null;
            this.termIte = null;
            this.hasCurOut = false;
        }
    }

    @Override
    public void reset() throws IOException {
        super.reset();
    }
}
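      For completeness, here is a minimal sketch of what the StringUtils helpers used above (isNumeric, containsChinese, chineseCharCount) might look like. It is an assumption on my part; the author's implementation may differ, for example in which Unicode ranges count as Chinese.

package org.apache.lucene.analysis.pinyin.utils;

/** Sketch of the character-class helpers used by the pinyin filters. */
public class StringUtils {
    /** true if every character is an ASCII digit */
    public static boolean isNumeric(String s) {
        if (s == null || s.length() == 0) {
            return false;
        }
        for (int i = 0; i < s.length(); i++) {
            if (s.charAt(i) < '0' || s.charAt(i) > '9') {
                return false;
            }
        }
        return true;
    }

    /** true if the string contains at least one CJK ideograph */
    public static boolean containsChinese(String s) {
        return chineseCharCount(s) > 0;
    }

    /** number of characters falling in the CJK Unified Ideographs block */
    public static int chineseCharCount(String s) {
        if (s == null) {
            return 0;
        }
        int count = 0;
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (c >= 0x4E00 && c <= 0x9FFF) {
                count++;
            }
        }
        return count;
    }
}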

   

package org.apache.lucene.analysis.pinyin.lucene5;

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;
import org.apache.lucene.analysis.pinyin.utils.Constant;
import org.apache.lucene.analysis.pinyin.utils.StringUtils;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;

/**
 * TokenFilter that applies NGram processing to the converted pinyin.
 * @author Lanxiaowei
 */
@SuppressWarnings("unused")
public class PinyinNGramTokenFilter extends TokenFilter {
    private char[] curTermBuffer;
    private int curTermLength;
    private int curCodePointCount;
    private int curGramSize;
    private int curPos;
    private int curPosInc, curPosLen;
    private int tokStart;
    private int tokEnd;
    private boolean hasIllegalOffsets;

    private int minGram;
    private int maxGram;
    /** Whether Chinese terms should be NGram-processed [default: false] */
    private final boolean nGramChinese;
    /** Whether purely numeric terms should be NGram-processed [default: false] */
    private final boolean nGramNumber;
    private final CharacterUtils charUtils;

    private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private PositionIncrementAttribute posIncAtt;
    private PositionLengthAttribute posLenAtt;
    private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private TypeAttribute typeAtt;

    public PinyinNGramTokenFilter(TokenStream input, int minGram, int maxGram,
            boolean nGramChinese, boolean nGramNumber) {
        super(new CodepointCountFilter(input, minGram, Integer.MAX_VALUE));
        this.charUtils = CharacterUtils.getInstance();
        if (minGram < 1) {
            throw new IllegalArgumentException("minGram must be greater than zero");
        }
        if (minGram > maxGram) {
            throw new IllegalArgumentException("minGram must not be greater than maxGram");
        }
        this.minGram = minGram;
        this.maxGram = maxGram;
        this.nGramChinese = nGramChinese;
        this.nGramNumber = nGramNumber;
        this.termAtt = addAttribute(CharTermAttribute.class);
        this.offsetAtt = addAttribute(OffsetAttribute.class);
        this.typeAtt = addAttribute(TypeAttribute.class);
        this.posIncAtt = addAttribute(PositionIncrementAttribute.class);
        this.posLenAtt = addAttribute(PositionLengthAttribute.class);
    }

    public PinyinNGramTokenFilter(TokenStream input, int minGram, int maxGram, boolean nGramChinese) {
        this(input, minGram, maxGram, nGramChinese, Constant.DEFAULT_NGRAM_NUMBER);
    }

    public PinyinNGramTokenFilter(TokenStream input, int minGram, int maxGram) {
        this(input, minGram, maxGram, Constant.DEFAULT_NGRAM_CHINESE);
    }

    public PinyinNGramTokenFilter(TokenStream input, int minGram) {
        this(input, minGram, Constant.DEFAULT_MAX_GRAM);
    }

    public PinyinNGramTokenFilter(TokenStream input) {
        this(input, Constant.DEFAULT_MIN_GRAM);
    }

    @Override
    public final boolean incrementToken() throws IOException {
        while (true) {
            if (curTermBuffer == null) {
                if (!input.incrementToken()) {
                    return false;
                }
                String type = this.typeAtt.type();
                // Original terms, plain words and (optionally) numbers/Chinese are passed through unchanged
                if (null != type && "normal_word".equals(type)) {
                    return true;
                }
                if (null != type && "numeric_original".equals(type)) {
                    return true;
                }
                if (null != type && "chinese_original".equals(type)) {
                    return true;
                }
                if ((!this.nGramNumber) && (StringUtils.isNumeric(this.termAtt.toString()))) {
                    return true;
                }
                if ((!this.nGramChinese) && (StringUtils.containsChinese(this.termAtt.toString()))) {
                    return true;
                }
                curTermBuffer = termAtt.buffer().clone();
                curTermLength = termAtt.length();
                curCodePointCount = charUtils.codePointCount(termAtt);
                curGramSize = minGram;
                curPos = 0;
                curPosInc = posIncAtt.getPositionIncrement();
                curPosLen = posLenAtt.getPositionLength();
                tokStart = offsetAtt.startOffset();
                tokEnd = offsetAtt.endOffset();
                hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
            }
            if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount) {
                ++curPos;
                curGramSize = minGram;
            }
            if ((curPos + curGramSize) <= curCodePointCount) {
                clearAttributes();
                final int start = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
                final int end = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                termAtt.copyBuffer(curTermBuffer, start, end - start);
                posIncAtt.setPositionIncrement(curPosInc);
                curPosInc = 0;
                posLenAtt.setPositionLength(curPosLen);
                offsetAtt.setOffset(tokStart, tokEnd);
                curGramSize++;
                return true;
            }
            curTermBuffer = null;
        }
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        curTermBuffer = null;
    }
}
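      As a quick illustration of what this filter produces: with minGram=2 and maxGram=3, the pinyin term "yida" is expanded into yi, yid, id, ida and da, all stacked at the same position, while tokens typed chinese_original, numeric_original or normal_word are passed through untouched.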

   

package org.apache.lucene.analysis.pinyin.lucene5;

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.pinyin.utils.Constant;
import org.apache.lucene.analysis.pinyin.utils.StringUtils;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;

/**
 * TokenFilter that applies EdgeNGram processing to the converted pinyin.
 * @author Lanxiaowei
 */
public class PinyinEdgeNGramTokenFilter extends TokenFilter {
    private final int minGram;
    private final int maxGram;
    /** Whether Chinese terms should be NGram-processed [default: false] */
    private final boolean nGramChinese;
    /** Whether purely numeric terms should be NGram-processed [default: false] */
    private final boolean nGramNumber;
    private final CharacterUtils charUtils;

    private char[] curTermBuffer;
    private int curTermLength;
    private int curCodePointCount;
    private int curGramSize;
    private int tokStart;
    private int tokEnd;
    private int savePosIncr;
    private int savePosLen;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
    private TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

    public PinyinEdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram,
            boolean nGramChinese, boolean nGramNumber) {
        super(input);
        if (minGram < 1) {
            throw new IllegalArgumentException("minGram must be greater than zero");
        }
        if (minGram > maxGram) {
            throw new IllegalArgumentException("minGram must not be greater than maxGram");
        }
        this.charUtils = CharacterUtils.getInstance();
        this.minGram = minGram;
        this.maxGram = maxGram;
        this.nGramChinese = nGramChinese;
        this.nGramNumber = nGramNumber;
    }

    public PinyinEdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram, boolean nGramChinese) {
        this(input, minGram, maxGram, nGramChinese, Constant.DEFAULT_NGRAM_NUMBER);
    }

    public PinyinEdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram) {
        this(input, minGram, maxGram, Constant.DEFAULT_NGRAM_CHINESE);
    }

    public PinyinEdgeNGramTokenFilter(TokenStream input, int minGram) {
        this(input, minGram, Constant.DEFAULT_MAX_GRAM);
    }

    public PinyinEdgeNGramTokenFilter(TokenStream input) {
        this(input, Constant.DEFAULT_MIN_GRAM);
    }

    @Override
    public final boolean incrementToken() throws IOException {
        while (true) {
            if (curTermBuffer == null) {
                if (!input.incrementToken()) {
                    return false;
                }
                String type = this.typeAtt.type();
                // Original terms, plain words and (optionally) numbers/Chinese are passed through unchanged
                if (null != type && "normal_word".equals(type)) {
                    return true;
                }
                if (null != type && "numeric_original".equals(type)) {
                    return true;
                }
                if (null != type && "chinese_original".equals(type)) {
                    return true;
                }
                if ((!this.nGramNumber) && (StringUtils.isNumeric(this.termAtt.toString()))) {
                    return true;
                }
                if ((!this.nGramChinese) && (StringUtils.containsChinese(this.termAtt.toString()))) {
                    return true;
                }
                curTermBuffer = termAtt.buffer().clone();
                curTermLength = termAtt.length();
                curCodePointCount = charUtils.codePointCount(termAtt);
                curGramSize = minGram;
                tokStart = offsetAtt.startOffset();
                tokEnd = offsetAtt.endOffset();
                savePosIncr += posIncrAtt.getPositionIncrement();
                savePosLen = posLenAtt.getPositionLength();
            }
            if (curGramSize <= maxGram) {
                if (curGramSize <= curCodePointCount) {
                    clearAttributes();
                    offsetAtt.setOffset(tokStart, tokEnd);
                    // The first gram carries the accumulated position increment, the rest are stacked
                    if (curGramSize == minGram) {
                        posIncrAtt.setPositionIncrement(savePosIncr);
                        savePosIncr = 0;
                    } else {
                        posIncrAtt.setPositionIncrement(0);
                    }
                    posLenAtt.setPositionLength(savePosLen);
                    final int charLength = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
                    termAtt.copyBuffer(curTermBuffer, 0, charLength);
                    curGramSize++;
                    return true;
                }
            }
            curTermBuffer = null;
        }
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        curTermBuffer = null;
        savePosIncr = 0;
    }
}
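      The edge variant keeps only the grams anchored at the start of the term, so the same "yida" with minGram=2 and maxGram=3 yields just yi and yid.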

   

package org.apache.lucene.analysis.pinyin.lucene5;

import java.io.BufferedReader;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.pinyin.utils.Constant;
import org.wltea.analyzer.lucene.IKTokenizer;

/**
 * Custom pinyin analyzer.
 * @author Lanxiaowei
 */
public class PinyinAnalyzer extends Analyzer {
    private int minGram;
    private int maxGram;
    private boolean useSmart;
    /** Whether Chinese terms should be NGram-processed [default: false] */
    private boolean nGramChinese;
    /** Whether purely numeric terms should be NGram-processed [default: false] */
    private boolean nGramNumber;
    /** Whether to use the edge NGram mode */
    private boolean edgesNGram;

    public PinyinAnalyzer() {
        this(Constant.DEFAULT_IK_USE_SMART);
    }

    public PinyinAnalyzer(boolean useSmart) {
        this(Constant.DEFAULT_MIN_GRAM, Constant.DEFAULT_MAX_GRAM, Constant.DEFAULT_EDGES_GRAM,
                useSmart, Constant.DEFAULT_NGRAM_CHINESE);
    }

    public PinyinAnalyzer(int minGram) {
        this(minGram, Constant.DEFAULT_MAX_GRAM, Constant.DEFAULT_EDGES_GRAM,
                Constant.DEFAULT_IK_USE_SMART, Constant.DEFAULT_NGRAM_CHINESE, Constant.DEFAULT_NGRAM_NUMBER);
    }

    public PinyinAnalyzer(int minGram, boolean useSmart) {
        this(minGram, Constant.DEFAULT_MAX_GRAM, Constant.DEFAULT_EDGES_GRAM,
                useSmart, Constant.DEFAULT_NGRAM_CHINESE);
    }

    public PinyinAnalyzer(int minGram, int maxGram) {
        this(minGram, maxGram, Constant.DEFAULT_EDGES_GRAM);
    }

    public PinyinAnalyzer(int minGram, int maxGram, boolean edgesNGram) {
        this(minGram, maxGram, edgesNGram, Constant.DEFAULT_IK_USE_SMART);
    }

    public PinyinAnalyzer(int minGram, int maxGram, boolean edgesNGram, boolean useSmart) {
        this(minGram, maxGram, edgesNGram, useSmart, Constant.DEFAULT_NGRAM_CHINESE);
    }

    public PinyinAnalyzer(int minGram, int maxGram, boolean edgesNGram, boolean useSmart, boolean nGramChinese) {
        this(minGram, maxGram, edgesNGram, useSmart, nGramChinese, Constant.DEFAULT_NGRAM_NUMBER);
    }

    public PinyinAnalyzer(int minGram, int maxGram, boolean edgesNGram, boolean useSmart,
            boolean nGramChinese, boolean nGramNumber) {
        super();
        this.minGram = minGram;
        this.maxGram = maxGram;
        this.edgesNGram = edgesNGram;
        this.useSmart = useSmart;
        this.nGramChinese = nGramChinese;
        this.nGramNumber = nGramNumber;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // In Lucene 5 the text to analyze is supplied later via setReader();
        // this reader only exists to satisfy the IKTokenizer constructor.
        Reader reader = new BufferedReader(new StringReader(fieldName));
        Tokenizer tokenizer = new IKTokenizer(reader, useSmart);
        // Convert to pinyin
        TokenStream tokenStream = new PinyinTokenFilter(tokenizer, Constant.DEFAULT_SHORT_PINYIN,
                Constant.DEFAULT_PINYIN_ALL, Constant.DEFAULT_MIN_TERM_LRNGTH);
        // Apply NGram processing to the pinyin
        if (edgesNGram) {
            tokenStream = new PinyinEdgeNGramTokenFilter(tokenStream, this.minGram, this.maxGram,
                    this.nGramChinese, this.nGramNumber);
        } else {
            tokenStream = new PinyinNGramTokenFilter(tokenStream, this.minGram, this.maxGram,
                    this.nGramChinese, this.nGramNumber);
        }
        return new Analyzer.TokenStreamComponents(tokenizer, tokenStream);
    }
}

   Here is example code showing how to use the PinyinAnalyzer in Lucene 5:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.pinyin.lucene5.PinyinAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

@SuppressWarnings("resource")
public class AnalyzerTest {
    public static void main(String[] args) throws IOException {
        String s = "京华时报2009年1月23日报道 the this that welcome to beijing 虽然我很丑,但是我很温柔,昨天,受一股来自中西伯利亚的强冷空气影响,本市出现大风降温天气,白天最高气温只有零下7摄氏度,同时伴有6到7级的偏北风。";
        //Analyzer analyzer = new IKAnalyzer();
        Analyzer analyzer = new PinyinAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text", s);
        displayTokens(tokenStream);
    }

    public static void displayTokens(TokenStream tokenStream) throws IOException {
        OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);

        tokenStream.reset();
        int position = 0;
        while (tokenStream.incrementToken()) {
            int increment = positionIncrementAttribute.getPositionIncrement();
            if (increment > 0) {
                position = position + increment;
                System.out.print(position + ":");
            }
            int startOffset = offsetAttribute.startOffset();
            int endOffset = offsetAttribute.endOffset();
            String term = charTermAttribute.toString();
            System.out.println("[" + term + "]" + ":(" + startOffset + "-->" + endOffset + "):" + typeAttribute.type());
        }
        tokenStream.end();
        tokenStream.close();
    }
}

    

package org.apache.lucene.analysis.pinyin.solr5;

import java.util.Map;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.pinyin.lucene5.PinyinTokenFilter;
import org.apache.lucene.analysis.pinyin.utils.Constant;
import org.apache.lucene.analysis.util.TokenFilterFactory;

/**
 * Factory for PinyinTokenFilter.
 * @author Lanxiaowei
 */
public class PinyinTokenFilterFactory extends TokenFilterFactory {
    /** Whether to also emit the original Chinese term */
    private boolean outChinese;
    /** Whether to emit only the initial-letter (short) pinyin */
    private boolean shortPinyin;
    /** Whether to emit both full pinyin and short pinyin */
    private boolean pinyinAll;
    /** Minimum Chinese length; by default only Chinese terms longer than minTermLength are converted */
    private int minTermLength;

    public PinyinTokenFilterFactory(Map<String, String> args) {
        super(args);
        this.outChinese = getBoolean(args, "outChinese", Constant.DEFAULT_OUT_CHINESE);
        this.shortPinyin = getBoolean(args, "shortPinyin", Constant.DEFAULT_SHORT_PINYIN);
        this.pinyinAll = getBoolean(args, "pinyinAll", Constant.DEFAULT_PINYIN_ALL);
        this.minTermLength = getInt(args, "minTermLength", Constant.DEFAULT_MIN_TERM_LRNGTH);
    }

    public TokenFilter create(TokenStream input) {
        // Map the factory options onto the 5-argument constructor:
        // (input, shortPinyin, pinyinAll, isOutChinese, minTermLength)
        return new PinyinTokenFilter(input, this.shortPinyin, this.pinyinAll, this.outChinese, this.minTermLength);
    }

    public boolean isOutChinese() {
        return outChinese;
    }

    public void setOutChinese(boolean outChinese) {
        this.outChinese = outChinese;
    }

    public boolean isShortPinyin() {
        return shortPinyin;
    }

    public void setShortPinyin(boolean shortPinyin) {
        this.shortPinyin = shortPinyin;
    }

    public boolean isPinyinAll() {
        return pinyinAll;
    }

    public void setPinyinAll(boolean pinyinAll) {
        this.pinyinAll = pinyinAll;
    }

    public int getMinTermLength() {
        return minTermLength;
    }

    public void setMinTermLength(int minTermLength) {
        this.minTermLength = minTermLength;
    }
}

   

package org.apache.lucene.analysis.pinyin.solr5;

import java.util.Map;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.pinyin.lucene5.PinyinEdgeNGramTokenFilter;
import org.apache.lucene.analysis.pinyin.lucene5.PinyinNGramTokenFilter;
import org.apache.lucene.analysis.pinyin.utils.Constant;
import org.apache.lucene.analysis.util.TokenFilterFactory;

/**
 * Factory for PinyinNGramTokenFilter.
 * @author Lanxiaowei
 */
public class PinyinNGramTokenFilterFactory extends TokenFilterFactory {
    private int minGram;
    private int maxGram;
    /** Whether Chinese terms should be NGram-processed [default: false] */
    private boolean nGramChinese;
    /** Whether purely numeric terms should be NGram-processed [default: false] */
    private boolean nGramNumber;
    /** Whether to use the edge NGram mode */
    private boolean edgesNGram;

    public PinyinNGramTokenFilterFactory(Map<String, String> args) {
        super(args);
        this.minGram = getInt(args, "minGram", Constant.DEFAULT_MIN_GRAM);
        this.maxGram = getInt(args, "maxGram", Constant.DEFAULT_MAX_GRAM);
        this.edgesNGram = getBoolean(args, "edgesNGram", Constant.DEFAULT_EDGES_GRAM);
        this.nGramChinese = getBoolean(args, "nGramChinese", Constant.DEFAULT_NGRAM_CHINESE);
        this.nGramNumber = getBoolean(args, "nGramNumber", Constant.DEFAULT_NGRAM_NUMBER);
    }

    public TokenFilter create(TokenStream input) {
        if (edgesNGram) {
            return new PinyinEdgeNGramTokenFilter(input, this.minGram, this.maxGram,
                    this.nGramChinese, this.nGramNumber);
        }
        return new PinyinNGramTokenFilter(input, this.minGram, this.maxGram,
                this.nGramChinese, this.nGramNumber);
    }
}

     I have packaged these classes into two jars, lucene-analyzer-pinyin.5.1.0.jar and solr-analyzer-pinyin.5.1.0.jar (both are attached at the bottom of the original post). All you need to do is drop the two jars into your core's lib directory, roughly as in the layout sketched below:
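 The layout below is only an illustration (the core name is made up, and the exact set of dependency jars depends on how the two jars were packaged); the key point is that the jars end up on the core's classpath:

solr_home/
  collection1/                      (the core being configured; name is illustrative)
    conf/schema.xml
    lib/
      lucene-analyzer-pinyin.5.1.0.jar
      solr-analyzer-pinyin.5.1.0.jar
      (plus the IK Analyzer and pinyin4j jars, if they are not already bundled)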

 Next, add a pinyin field type to schema.xml. The original post shows this as a screenshot.
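 A plausible reconstruction of that field type is shown below. The filter attribute names (outChinese, shortPinyin, pinyinAll, minTermLength, minGram, maxGram, edgesNGram, nGramChinese, nGramNumber) come straight from the factory code above; the IK tokenizer factory class name and the concrete attribute values are my assumptions, so adjust them to your setup:

<fieldType name="text_pinyin" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="org.wltea.analyzer.lucene.IKTokenizerFactory" useSmart="true"/>
    <filter class="org.apache.lucene.analysis.pinyin.solr5.PinyinTokenFilterFactory"
            outChinese="true" shortPinyin="false" pinyinAll="false" minTermLength="2"/>
    <filter class="org.apache.lucene.analysis.pinyin.solr5.PinyinNGramTokenFilterFactory"
            minGram="2" maxGram="20" edgesNGram="false" nGramChinese="false" nGramNumber="false"/>
  </analyzer>
</fieldType>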

 Then apply the text_pinyin field type you just defined to the fields you want to search by pinyin.
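 For example (the field name here is only an illustration):

<field name="title" type="text_pinyin" indexed="true" stored="true"/>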

 Now you can start Tomcat, deploy Solr, and test the pinyin tokenization.

 If the analysis result looks as expected, the pinyin tokenizer has been deployed and is working. If you have any questions, feel free to contact me; my contact details are in my earlier posts. That's a wrap, thanks, and good night!

 

       

    
