本文主要是介绍pinyin4j获取多音字首字母同时保留非中文字符,希望对大家解决编程问题提供一定的参考价值,需要的开发者们随着小编来一起学习吧!
pinyin4j获取多音字首字母同时保留非中文字符
- 前情:获取中文的首字母,要求正确识别多音字(例:重庆,重启,重量,成长等),同时需要保留非中文字符
- 要求项目中导入com.belerweb.pinyin4j.2.5.1包,然后将下面的类放入项目中即可使用
- 以下内容暂时还未经过大量数据测试,后续若发现问题会及时修改
前情:获取中文的首字母,要求正确识别多音字(例:重庆,重启,重量,成长等),同时需要保留非中文字符
当前pinyin4j的最新版2.5.1里面不支持多音字的正确获取首字母(网上找的解决方案大多数也是当遇到多音字时只取第一个拼音),于是扩展了下它的部分源码,支持多音字的首字母获取。
要求项目中引入 pinyin4j 2.5.1 依赖(Maven 坐标:com.belerweb:pinyin4j:2.5.1),然后将下面的类放入项目中即可使用
以下内容暂时还未经过大量数据测试,后续若发现问题会及时修改
以下表格为修改记录
修改时间 | 修改内容 |
---|---|
2019-05-28 | 发布 |
2020-04-23 | 修改部分获取首字母异常,加了py.length() > 0判断 |
2022-07-05 | 支持classpath下自定义拼音扩展库 |
如下是重新定义的**PinyinHelper.toHanYuPinyinString()**方法,命名、使用方式与源码一致,使用时需注意正确地导入类名
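multi_pinyin.txt 是 pinyin4j 自带的多音字词库(pinyin4j 源码包里有,位于 /pinyindb/ 目录下),但收录并不完整。如需扩展,可以在 classpath 下新建一个扩展文件(下面代码中由 MULTI_PINYIN_APPENDER 常量指定,默认名为 multi_pinyin_appender.txt,文件名和存储路径都可以自行修改),按相同格式追加词条。比如“重启”需要添加一行“重启 (chong2,qi3)”才能被正确识别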
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import net.sourceforge.pinyin4j.multipinyin.Trie;
import org.springframework.core.io.ClassPathResource;
import org.springframework.util.StringUtils;import java.io.BufferedInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;public class PinyinHelper {/*** 自定义拼音扩展库,从classpath下查找*/private static final String MULTI_PINYIN_APPENDER = "multi_pinyin_appender.txt";public static String toHanYuPinyinString(String str, HanyuPinyinOutputFormat outputFormat, String separate, boolean retain) throws BadHanyuPinyinOutputFormatCombination {ChineseToPinyinResource resource = ChineseToPinyinResource.getInstance();StringBuilder resultPinyinStrBuf = new StringBuilder();char[] chars = str.toCharArray();for (int i = 0; i < chars.length; i++) {// 匹配到的最长的结果String result = null;char ch = chars[i];Trie currentTrie = resource.getUnicodeToHanyuPinyinTable();int success = i;int current = i;do {String hexStr = Integer.toHexString((int) ch).toUpperCase();currentTrie = currentTrie.get(hexStr);if (currentTrie != null) {if (currentTrie.getPinyin() != null) {result = currentTrie.getPinyin();success = current;}currentTrie = currentTrie.getNextTire();} else {}current++;if (current < chars.length) {ch = chars[current];} else {break;}} while (currentTrie != null);// 如果在前缀树中没有匹配到,那么它就不能转换为拼音,直接输出或者去掉if (result == null) {if (retain) {if (i != 0 && current != success && resultPinyinStrBuf.lastIndexOf(separate) != resultPinyinStrBuf.length() - 1) {resultPinyinStrBuf.append(separate);}resultPinyinStrBuf.append(chars[i]);}} else {String[] pinyinStrArray = resource.parsePinyinString(result);if (pinyinStrArray != null) {for (int j = 0; j < pinyinStrArray.length; j++) {if (i != 0 && current != success && resultPinyinStrBuf.lastIndexOf(separate) != resultPinyinStrBuf.length() - 1) {resultPinyinStrBuf.append(separate);}resultPinyinStrBuf.append(PinyinFormatter.formatHanyuPinyin(pinyinStrArray[j], outputFormat));// 不是最后一个,(也不是拼音的最后一个,并且不是最后匹配成功的)if (current < chars.length || (j < pinyinStrArray.length - 1 && i != success)) {resultPinyinStrBuf.append(separate);}if (i == success) {break;}}}}i = success;}return resultPinyinStrBuf.toString();}static class PinyinFormatter {static String formatHanyuPinyin(String 
pinyinStr, HanyuPinyinOutputFormat outputFormat)throws BadHanyuPinyinOutputFormatCombination {if ((HanyuPinyinToneType.WITH_TONE_MARK == outputFormat.getToneType())&& ((HanyuPinyinVCharType.WITH_V == outputFormat.getVCharType()) || (HanyuPinyinVCharType.WITH_U_AND_COLON == outputFormat.getVCharType()))) {throw new BadHanyuPinyinOutputFormatCombination("tone marks cannot be added to v or u:");}if (HanyuPinyinToneType.WITHOUT_TONE == outputFormat.getToneType()) {pinyinStr = pinyinStr.replaceAll("[1-5]", "");} else if (HanyuPinyinToneType.WITH_TONE_MARK == outputFormat.getToneType()) {pinyinStr = pinyinStr.replaceAll("u:", "v");pinyinStr = convertToneNumber2ToneMark(pinyinStr);}if (HanyuPinyinVCharType.WITH_V == outputFormat.getVCharType()) {pinyinStr = pinyinStr.replaceAll("u:", "v");} else if (HanyuPinyinVCharType.WITH_U_UNICODE == outputFormat.getVCharType()) {pinyinStr = pinyinStr.replaceAll("u:", "ü");}if (HanyuPinyinCaseType.UPPERCASE == outputFormat.getCaseType()) {pinyinStr = pinyinStr.toUpperCase();}return pinyinStr;}/*** Convert tone numbers to tone marks using Unicode <br/><br/>** <b>Algorithm for determining location of tone mark</b><br/>* <p>* A simple algorithm for determining the vowel on which the tone mark* appears is as follows:<br/>** <ol>* <li>First, look for an "a" or an "e". If either vowel appears, it takes* the tone mark. There are no possible pinyin syllables that contain both* an "a" and an "e".** <li>If there is no "a" or "e", look for an "ou". 
If "ou" appears, then* the "o" takes the tone mark.** <li>If none of the above cases hold, then the last vowel in the syllable* takes the tone mark.** </ol>** @param pinyinStr the ascii represention with tone numbers* @return the unicode represention with tone marks*/private static String convertToneNumber2ToneMark(final String pinyinStr) {String lowerCasePinyinStr = pinyinStr.toLowerCase();if (lowerCasePinyinStr.matches("[a-z]*[1-5]?")) {final char defautlCharValue = '$';final int defautlIndexValue = -1;char unmarkedVowel = defautlCharValue;int indexOfUnmarkedVowel = defautlIndexValue;final char charA = 'a';final char charE = 'e';final String ouStr = "ou";final String allUnmarkedVowelStr = "aeiouv";final String allMarkedVowelStr = "āáăàaēéĕèeīíĭìiōóŏòoūúŭùuǖǘǚǜü";if (lowerCasePinyinStr.matches("[a-z]*[1-5]")) {int tuneNumber =Character.getNumericValue(lowerCasePinyinStr.charAt(lowerCasePinyinStr.length() - 1));int indexOfA = lowerCasePinyinStr.indexOf(charA);int indexOfE = lowerCasePinyinStr.indexOf(charE);int ouIndex = lowerCasePinyinStr.indexOf(ouStr);if (-1 != indexOfA) {indexOfUnmarkedVowel = indexOfA;unmarkedVowel = charA;} else if (-1 != indexOfE) {indexOfUnmarkedVowel = indexOfE;unmarkedVowel = charE;} else if (-1 != ouIndex) {indexOfUnmarkedVowel = ouIndex;unmarkedVowel = ouStr.charAt(0);} else {for (int i = lowerCasePinyinStr.length() - 1; i >= 0; i--) {if (String.valueOf(lowerCasePinyinStr.charAt(i)).matches("[" + allUnmarkedVowelStr + "]")) {indexOfUnmarkedVowel = i;unmarkedVowel = lowerCasePinyinStr.charAt(i);break;}}}if ((defautlCharValue != unmarkedVowel) && (defautlIndexValue != indexOfUnmarkedVowel)) {int rowIndex = allUnmarkedVowelStr.indexOf(unmarkedVowel);int columnIndex = tuneNumber - 1;int vowelLocation = rowIndex * 5 + columnIndex;char markedVowel = allMarkedVowelStr.charAt(vowelLocation);return lowerCasePinyinStr.substring(0, indexOfUnmarkedVowel).replaceAll("v", "ü")+ markedVowel+ lowerCasePinyinStr.substring(indexOfUnmarkedVowel + 
1,lowerCasePinyinStr.length() - 1).replaceAll("v", "ü");} else// error happens in the procedure of locating vowel{return lowerCasePinyinStr;}} else// input string has no any tune number{// only replace v with ü (umlat) characterreturn lowerCasePinyinStr.replaceAll("v", "ü");}} else// bad format{return lowerCasePinyinStr;}}}static class ChineseToPinyinResource {/*** A hash table contains <Unicode, HanyuPinyin> pairs*/private Trie unicodeToHanyuPinyinTable = null;/*** @param unicodeToHanyuPinyinTable The unicodeToHanyuPinyinTable to set.*/private void setUnicodeToHanyuPinyinTable(Trie unicodeToHanyuPinyinTable) {this.unicodeToHanyuPinyinTable = unicodeToHanyuPinyinTable;}/*** @return Returns the unicodeToHanyuPinyinTable.*/Trie getUnicodeToHanyuPinyinTable() {return unicodeToHanyuPinyinTable;}/*** Private constructor as part of the singleton pattern.*/private ChineseToPinyinResource() {initializeResource();}/*** Initialize a hash-table contains <Unicode, HanyuPinyin> pairs*/private void initializeResource() {try {final String resourceName = "/pinyindb/unicode_to_hanyu_pinyin.txt";final String resourceMultiName = "/pinyindb/multi_pinyin.txt";setUnicodeToHanyuPinyinTable(new Trie());getUnicodeToHanyuPinyinTable().load(ResourceHelper.getResourceInputStream(resourceName));getUnicodeToHanyuPinyinTable().loadMultiPinyin(ResourceHelper.getResourceInputStream(resourceMultiName));// 新增classpath下拼音扩展库if (StringUtils.hasLength(MULTI_PINYIN_APPENDER)) {ClassPathResource pathResource = new ClassPathResource(MULTI_PINYIN_APPENDER);if (pathResource.exists()) {getUnicodeToHanyuPinyinTable().loadMultiPinyin(pathResource.getInputStream());}}// 原始拼音扩展库,仅支持绝对路径getUnicodeToHanyuPinyinTable().loadMultiPinyinExtend();} catch (FileNotFoundException ex) {ex.printStackTrace();} catch (IOException ex) {ex.printStackTrace();}}Trie getHanyuPinyinTrie(char ch) {String codepointHexStr = Integer.toHexString((int) ch).toUpperCase();// fetch from hashtablereturn 
getUnicodeToHanyuPinyinTable().get(codepointHexStr);}/*** Get the unformatted Hanyu Pinyin representations of the given Chinese* character in array format.** @param ch given Chinese character in Unicode* @return The Hanyu Pinyin strings of the given Chinese character in array* format; return null if there is no corresponding Pinyin string.*/String[] getHanyuPinyinStringArray(char ch) {String pinyinRecord = getHanyuPinyinRecordFromChar(ch);return parsePinyinString(pinyinRecord);}String[] parsePinyinString(String pinyinRecord) {if (null != pinyinRecord) {int indexOfLeftBracket = pinyinRecord.indexOf(Field.LEFT_BRACKET);int indexOfRightBracket = pinyinRecord.lastIndexOf(Field.RIGHT_BRACKET);String stripedString =pinyinRecord.substring(indexOfLeftBracket + Field.LEFT_BRACKET.length(),indexOfRightBracket);return stripedString.split(Field.COMMA);} else {// no record found or mal-formatted recordreturn null;}}/*** @param record given record string of Hanyu Pinyin* @return return true if record is not null and record is not "none0" and* record is not mal-formatted, else return false*/private boolean isValidRecord(String record) {final String noneStr = "(none0)";return (null != record) && !record.equals(noneStr) && record.startsWith(Field.LEFT_BRACKET)&& record.endsWith(Field.RIGHT_BRACKET);}/*** @param ch given Chinese character in Unicode* @return corresponding Hanyu Pinyin Record in Properties file; null if no* record found*/private String getHanyuPinyinRecordFromChar(char ch) {// convert Chinese character to code point (integer)// please refer to http://www.unicode.org/glossary/#code_point// Another reference: http://en.wikipedia.org/wiki/Unicodeint codePointOfChar = ch;String codepointHexStr = Integer.toHexString(codePointOfChar).toUpperCase();// fetch from hashtableTrie trie = getUnicodeToHanyuPinyinTable().get(codepointHexStr);String foundRecord = null;if (trie != null) {foundRecord = trie.getPinyin();}return isValidRecord(foundRecord) ? 
foundRecord : null;}/*** Singleton factory method.** @return the one and only MySingleton.*/static ChineseToPinyinResource getInstance() {return ChineseToPinyinResourceHolder.THE_INSTANCE;}/*** Singleton implementation helper.*/private static class ChineseToPinyinResourceHolder {static final ChineseToPinyinResource THE_INSTANCE = new ChineseToPinyinResource();}/*** A class encloses common string constants used in Properties files** @author Li Min (xmlerlimin@gmail.com)*/class Field {static final String LEFT_BRACKET = "(";static final String RIGHT_BRACKET = ")";static final String COMMA = ",";}}static class ResourceHelper {/*** @param resourceName* @return resource (mainly file in file system or file in compressed* package) as BufferedInputStream*/static BufferedInputStream getResourceInputStream(String resourceName) {return new BufferedInputStream(ResourceHelper.class.getResourceAsStream(resourceName));}}
}
下面是使用方式: 里面用到了google的guava包的部分内容
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;import java.util.List;/*** 拼音工具类*/
public class PinyinUtil {

    /** Separator requested from {@code PinyinHelper} and split on again below. */
    private static final String SEPARATE = "#";

    /** Fixed output format: no tone, {@code v} for ü. */
    private static final HanyuPinyinOutputFormat outputFormat;

    static {
        outputFormat = new HanyuPinyinOutputFormat();
        outputFormat.setVCharType(HanyuPinyinVCharType.WITH_V);
        outputFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
    }

    /**
     * Converts text to pinyin, or to the first letter of each syllable.
     *
     * <p>Every piece returned by {@code PinyinHelper} is handled the same way. A retained
     * non-Chinese character arrives as a one-character piece, so taking its first
     * character keeps it intact; no separate bookkeeping of non-Chinese runs is needed,
     * and a syllable can no longer be mistaken for such a run (e.g. "wo我").
     *
     * <p>NOTE(review): a literal '#' in the input appears to be dropped because it
     * collides with the separator — confirm whether that matters for callers.
     *
     * @param str     text to convert; {@code null} or empty yields an empty string
     * @param retain  {@code true} to keep characters other than Chinese ones
     * @param initial {@code true} to return only the first letter of each syllable
     * @return the pinyin string
     */
    public static String toPinYinString(String str, boolean retain, boolean initial) {
        if (str == null || str.isEmpty()) {
            return "";
        }

        final String pinyin;
        try {
            pinyin = PinyinHelper.toHanYuPinyinString(str, outputFormat, SEPARATE, retain);
        } catch (BadHanyuPinyinOutputFormatCombination e) {
            // Cannot happen with the fixed format above; surface it instead of hiding it.
            throw new IllegalStateException("Invalid pinyin output format", e);
        }

        StringBuilder sb = new StringBuilder(pinyin.length());
        for (String py : Splitter.on(SEPARATE).split(pinyin)) {
            if (py.isEmpty()) {
                continue;
            }
            if (initial) {
                sb.append(py.charAt(0));
            } else {
                sb.append(py);
            }
        }
        return sb.toString();
    }
}
下面是临时测试结果:
String str = "成长,重启,重量,长大了,角色,角落,呼啦啦,1我2,3爱4,5你6";

// Keep non-Chinese characters, initials only.
// Output reported by the author: cz,cq,zl,zdl,js,jl,hll,1w2,3a4,5n6
System.out.println(PinyinUtil.toPinYinString(str, true, true));

// Drop non-Chinese characters, initials only.
// Output reported by the author: czcqzlzdljsjlhllwan
System.out.println(PinyinUtil.toPinYinString(str, false, true));

// Keep non-Chinese characters, full pinyin.
// Output reported by the author: chengzhang,chongqi,zhongliang,zhangdale,juese,jiaoluo,hulala,1wo2,3ai4,5ni6
System.out.println(PinyinUtil.toPinYinString(str, true, false));

// Drop non-Chinese characters, full pinyin.
// Output reported by the author: chengzhangchongqizhongliangzhangdalejuesejiaoluohulalawoaini
System.out.println(PinyinUtil.toPinYinString(str, false, false));
这篇关于pinyin4j获取多音字首字母同时保留非中文字符的文章就介绍到这儿,希望我们推荐的文章对开发者们有所帮助!