最近参与一个小project,需要编写一个针对英文单词的stem 算法。
1. 最为常见的stem 算法 就是The English (Porter2) stemming algorithm http://snowball.tartarus.org/algorithms/english/stemmer.html
// This file was generated automatically by the Snowball to Java compilerpackage org.tartarus.snowball.ext;import org.tartarus.snowball.Among;/*** This class was automatically generated by a Snowball to Java compiler * It implements the stemming algorithm defined by a snowball script.*/public class englishStemmer extends org.tartarus.snowball.SnowballStemmer {private static final long serialVersionUID = 1L;private final static englishStemmer methodObject = new englishStemmer ();private final static Among a_0[] = {new Among ( "arsen", -1, -1, "", methodObject ),new Among ( "commun", -1, -1, "", methodObject ),new Among ( "gener", -1, -1, "", methodObject )};private final static Among a_1[] = {new Among ( "'", -1, 1, "", methodObject ),new Among ( "'s'", 0, 1, "", methodObject ),new Among ( "'s", -1, 1, "", methodObject )};private final static Among a_2[] = {new Among ( "ied", -1, 2, "", methodObject ),new Among ( "s", -1, 3, "", methodObject ),new Among ( "ies", 1, 2, "", methodObject ),new Among ( "sses", 1, 1, "", methodObject ),new Among ( "ss", 1, -1, "", methodObject ),new Among ( "us", 1, -1, "", methodObject )};private final static Among a_3[] = {new Among ( "", -1, 3, "", methodObject ),new Among ( "bb", 0, 2, "", methodObject ),new Among ( "dd", 0, 2, "", methodObject ),new Among ( "ff", 0, 2, "", methodObject ),new Among ( "gg", 0, 2, "", methodObject ),new Among ( "bl", 0, 1, "", methodObject ),new Among ( "mm", 0, 2, "", methodObject ),new Among ( "nn", 0, 2, "", methodObject ),new Among ( "pp", 0, 2, "", methodObject ),new Among ( "rr", 0, 2, "", methodObject ),new Among ( "at", 0, 1, "", methodObject ),new Among ( "tt", 0, 2, "", methodObject ),new Among ( "iz", 0, 1, "", methodObject )};private final static Among a_4[] = {new Among ( "ed", -1, 2, "", methodObject ),new Among ( "eed", 0, 1, "", methodObject ),new Among ( "ing", -1, 2, "", methodObject ),new Among ( "edly", -1, 2, "", methodObject ),new Among ( "eedly", 3, 1, "", methodObject ),new Among ( "ingly", -1, 2, "", methodObject )};private final static Among a_5[] = {new Among ( "anci", -1, 3, "", methodObject ),new Among ( "enci", -1, 2, "", methodObject ),new Among ( "ogi", -1, 13, "", methodObject ),new Among ( "li", -1, 16, "", methodObject ),new Among ( "bli", 3, 12, "", methodObject ),new Among ( "abli", 4, 4, "", methodObject ),new Among ( "alli", 3, 8, "", methodObject ),new Among ( "fulli", 3, 14, "", methodObject ),new Among ( "lessli", 3, 15, "", methodObject ),new Among ( "ousli", 3, 10, "", methodObject ),new Among ( "entli", 3, 5, "", methodObject ),new Among ( "aliti", -1, 8, "", methodObject ),new Among ( "biliti", -1, 12, "", methodObject ),new Among ( "iviti", -1, 11, "", methodObject ),new Among ( "tional", -1, 1, "", methodObject ),new Among ( "ational", 14, 7, "", methodObject ),new Among ( "alism", -1, 8, "", methodObject ),new Among ( "ation", -1, 7, "", methodObject ),new Among ( "ization", 17, 6, "", methodObject ),new Among ( "izer", -1, 6, "", methodObject ),new Among ( "ator", -1, 7, "", methodObject ),new Among ( "iveness", -1, 11, "", methodObject ),new Among ( "fulness", -1, 9, "", methodObject ),new Among ( "ousness", -1, 10, "", methodObject )};private final static Among a_6[] = {new Among ( "icate", -1, 4, "", methodObject ),new Among ( "ative", -1, 6, "", methodObject ),new Among ( "alize", -1, 3, "", methodObject ),new Among ( "iciti", -1, 4, "", methodObject ),new Among ( "ical", -1, 4, "", methodObject ),new Among ( "tional", -1, 1, "", methodObject ),new Among ( "ational", 5, 2, "", methodObject ),new Among ( "ful", -1, 5, "", methodObject ),new Among ( "ness", -1, 5, "", methodObject )};private final static Among a_7[] = {new Among ( "ic", -1, 1, "", methodObject ),new Among ( "ance", -1, 1, "", methodObject ),new Among ( "ence", -1, 1, "", methodObject ),new Among ( "able", -1, 1, "", methodObject ),new Among ( "ible", -1, 1, "", methodObject ),new Among ( "ate", -1, 1, "", methodObject ),new Among ( "ive", -1, 1, "", methodObject ),new Among ( "ize", -1, 1, "", methodObject ),new Among ( "iti", -1, 1, "", methodObject ),new Among ( "al", -1, 1, "", methodObject ),new Among ( "ism", -1, 1, "", methodObject ),new Among ( "ion", -1, 2, "", methodObject ),new Among ( "er", -1, 1, "", methodObject ),new Among ( "ous", -1, 1, "", methodObject ),new Among ( "ant", -1, 1, "", methodObject ),new Among ( "ent", -1, 1, "", methodObject ),new Among ( "ment", 15, 1, "", methodObject ),new Among ( "ement", 16, 1, "", methodObject )};private final static Among a_8[] = {new Among ( "e", -1, 1, "", methodObject ),new Among ( "l", -1, 2, "", methodObject )};private final static Among a_9[] = {new Among ( "succeed", -1, -1, "", methodObject ),new Among ( "proceed", -1, -1, "", methodObject ),new Among ( "exceed", -1, -1, "", methodObject ),new Among ( "canning", -1, -1, "", methodObject ),new Among ( "inning", -1, -1, "", methodObject ),new Among ( "earring", -1, -1, "", methodObject ),new Among ( "herring", -1, -1, "", methodObject ),new Among ( "outing", -1, -1, "", methodObject )};private final static Among a_10[] = {new Among ( "andes", -1, -1, "", methodObject ),new Among ( "atlas", -1, -1, "", methodObject ),new Among ( "bias", -1, -1, "", methodObject ),new Among ( "cosmos", -1, -1, "", methodObject ),new Among ( "dying", -1, 3, "", methodObject ),new Among ( "early", -1, 9, "", methodObject ),new Among ( "gently", -1, 7, "", methodObject ),new Among ( "howe", -1, -1, "", methodObject ),new Among ( "idly", -1, 6, "", methodObject ),new Among ( "lying", -1, 4, "", methodObject ),new Among ( "news", -1, -1, "", methodObject ),new Among ( "only", -1, 10, "", methodObject ),new Among ( "singly", -1, 11, "", methodObject ),new Among ( "skies", -1, 2, "", methodObject ),new Among ( "skis", -1, 1, "", methodObject ),new Among ( "sky", -1, -1, "", methodObject ),new Among ( "tying", -1, 5, "", methodObject ),new Among ( "ugly", -1, 8, "", methodObject )};private static final char g_v[] = {17, 65, 16, 1 };private static final char g_v_WXY[] = {1, 17, 65, 208, 1 };private static final char g_valid_LI[] = {55, 141, 2 };private boolean B_Y_found;private int I_p2;private int I_p1;private void copy_from(englishStemmer other) {B_Y_found = other.B_Y_found;I_p2 = other.I_p2;I_p1 = other.I_p1;super.copy_from(other);}private boolean r_prelude() {int v_1;int v_2;int v_3;int v_4;int v_5;// (, line 25// unset Y_found, line 26B_Y_found = false;// do, line 27v_1 = cursor;lab0: do {// (, line 27// [, line 27bra = cursor;// literal, line 27if (!(eq_s(1, "'"))){break lab0;}// ], line 27ket = cursor;// delete, line 27 slice_del();} while (false);cursor = v_1;// do, line 28v_2 = cursor;lab1: do {// (, line 28// [, line 28bra = cursor;// literal, line 28if (!(eq_s(1, "y"))){break lab1;}// ], line 28ket = cursor;// <-, line 28slice_from("Y");// set Y_found, line 28B_Y_found = true;} while (false);cursor = v_2;// do, line 29v_3 = cursor;lab2: do {// repeat, line 29replab3: while(true){v_4 = cursor;lab4: do {// (, line 29// goto, line 29golab5: while(true){v_5 = cursor;lab6: do {// (, line 29if (!(in_grouping(g_v, 97, 121))){break lab6;}// [, line 29bra = cursor;// literal, line 29if (!(eq_s(1, "y"))){break lab6;}// ], line 29ket = cursor;cursor = v_5;break golab5;} while (false);cursor = v_5;if (cursor >= limit){break lab4;}cursor++;}// <-, line 29slice_from("Y");// set Y_found, line 29B_Y_found = true;continue replab3;} while (false);cursor = v_4;break replab3;}} while (false);cursor = v_3;return true;}private boolean r_mark_regions() {int v_1;int v_2;// (, line 32I_p1 = limit;I_p2 = limit;// do, line 35v_1 = cursor;lab0: do {// (, line 35// or, line 41lab1: do {v_2 = cursor;lab2: do {// among, line 36if (find_among(a_0, 3) == 0){break lab2;}break lab1;} while (false);cursor = v_2;// (, line 41// gopast, line 41golab3: while(true){lab4: do {if (!(in_grouping(g_v, 97, 121))){break lab4;}break golab3;} while (false);if (cursor >= limit){break lab0;}cursor++;}// gopast, line 41golab5: while(true){lab6: do {if (!(out_grouping(g_v, 97, 121))){break lab6;}break golab5;} while (false);if (cursor >= limit){break lab0;}cursor++;}} while (false);// setmark p1, line 42I_p1 = cursor;// gopast, line 43golab7: while(true){lab8: do {if (!(in_grouping(g_v, 97, 121))){break lab8;}break golab7;} while (false);if (cursor >= limit){break lab0;}cursor++;}// gopast, line 43golab9: while(true){lab10: do {if (!(out_grouping(g_v, 97, 121))){break lab10;}break golab9;} while (false);if (cursor >= limit){break lab0;}cursor++;}// setmark p2, line 43I_p2 = cursor;} while (false);cursor = v_1;return true;}private boolean r_shortv() {int v_1;// (, line 49// or, line 51lab0: do {v_1 = limit - cursor;lab1: do {// (, line 50if (!(out_grouping_b(g_v_WXY, 89, 121))){break lab1;}if (!(in_grouping_b(g_v, 97, 121))){break lab1;}if (!(out_grouping_b(g_v, 97, 121))){break lab1;}break lab0;} while (false);cursor = limit - v_1;// (, line 52if (!(out_grouping_b(g_v, 97, 121))){return false;}if (!(in_grouping_b(g_v, 97, 121))){return false;}// atlimit, line 52if (cursor > limit_backward){return false;}} while (false);return true;}private boolean r_R1() {if (!(I_p1 <= cursor)){return false;}return true;}private boolean r_R2() {if (!(I_p2 <= cursor)){return false;}return true;}private boolean r_Step_1a() {int among_var;int v_1;int v_2;// (, line 58// try, line 59v_1 = limit - cursor;lab0: do {// (, line 59// [, line 60ket = cursor;// substring, line 60among_var = find_among_b(a_1, 3);if (among_var == 0){cursor = limit - v_1;break lab0;}// ], line 60bra = cursor;switch(among_var) {case 0:cursor = limit - v_1;break lab0;case 1:// (, line 62// delete, line 62 slice_del();break;}} while (false);// [, line 65ket = cursor;// substring, line 65among_var = find_among_b(a_2, 6);if (among_var == 0){return false;}// ], line 65bra = cursor;switch(among_var) {case 0:return false;case 1:// (, line 66// <-, line 66slice_from("ss");break;case 2:// (, line 68// or, line 68lab1: do {v_2 = limit - cursor;lab2: do {// (, line 68// hop, line 68 {int c = cursor - 2;if (limit_backward > c || c > limit){break lab2;}cursor = c;}// <-, line 68slice_from("i");break lab1;} while (false);cursor = limit - v_2;// <-, line 68slice_from("ie");} while (false);break;case 3:// (, line 69// next, line 69if (cursor <= limit_backward){return false;}cursor--;// gopast, line 69golab3: while(true){lab4: do {if (!(in_grouping_b(g_v, 97, 121))){break lab4;}break golab3;} while (false);if (cursor <= limit_backward){return false;}cursor--;}// delete, line 69 slice_del();break;}return true;}private boolean r_Step_1b() {int among_var;int v_1;int v_3;int v_4;// (, line 74// [, line 75ket = cursor;// substring, line 75among_var = find_among_b(a_4, 6);if (among_var == 0){return false;}// ], line 75bra = cursor;switch(among_var) {case 0:return false;case 1:// (, line 77// call R1, line 77if (!r_R1()){return false;}// <-, line 77slice_from("ee");break;case 2:// (, line 79// test, line 80v_1 = limit - cursor;// gopast, line 80golab0: while(true){lab1: do {if (!(in_grouping_b(g_v, 97, 121))){break lab1;}break golab0;} while (false);if (cursor <= limit_backward){return false;}cursor--;}cursor = limit - v_1;// delete, line 80 slice_del();// test, line 81v_3 = limit - cursor;// substring, line 81among_var = find_among_b(a_3, 13);if (among_var == 0){return false;}cursor = limit - v_3;switch(among_var) {case 0:return false;case 1:// (, line 83// <+, line 83 {int c = cursor;insert(cursor, cursor, "e");cursor = c;}break;case 2:// (, line 86// [, line 86ket = cursor;// next, line 86if (cursor <= limit_backward){return false;}cursor--;// ], line 86bra = cursor;// delete, line 86 slice_del();break;case 3:// (, line 87// atmark, line 87if (cursor != I_p1){return false;}// test, line 87v_4 = limit - cursor;// call shortv, line 87if (!r_shortv()){return false;}cursor = limit - v_4;// <+, line 87 {int c = cursor;insert(cursor, cursor, "e");cursor = c;}break;}break;}return true;}private boolean r_Step_1c() {int v_1;int v_2;// (, line 93// [, line 94ket = cursor;// or, line 94lab0: do {v_1 = limit - cursor;lab1: do {// literal, line 94if (!(eq_s_b(1, "y"))){break lab1;}break lab0;} while (false);cursor = limit - v_1;// literal, line 94if (!(eq_s_b(1, "Y"))){return false;}} while (false);// ], line 94bra = cursor;if (!(out_grouping_b(g_v, 97, 121))){return false;}// not, line 95 {v_2 = limit - cursor;lab2: do {// atlimit, line 95if (cursor > limit_backward){break lab2;}return false;} while (false);cursor = limit - v_2;}// <-, line 96slice_from("i");return true;}private boolean r_Step_2() {int among_var;// (, line 99// [, line 100ket = cursor;// substring, line 100among_var = find_among_b(a_5, 24);if (among_var == 0){return false;}// ], line 100bra = cursor;// call R1, line 100if (!r_R1()){return false;}switch(among_var) {case 0:return false;case 1:// (, line 101// <-, line 101slice_from("tion");break;case 2:// (, line 102// <-, line 102slice_from("ence");break;case 3:// (, line 103// <-, line 103slice_from("ance");break;case 4:// (, line 104// <-, line 104slice_from("able");break;case 5:// (, line 105// <-, line 105slice_from("ent");break;case 6:// (, line 107// <-, line 107slice_from("ize");break;case 7:// (, line 109// <-, line 109slice_from("ate");break;case 8:// (, line 111// <-, line 111slice_from("al");break;case 9:// (, line 112// <-, line 112slice_from("ful");break;case 10:// (, line 114// <-, line 114slice_from("ous");break;case 11:// (, line 116// <-, line 116slice_from("ive");break;case 12:// (, line 118// <-, line 118slice_from("ble");break;case 13:// (, line 119// literal, line 119if (!(eq_s_b(1, "l"))){return false;}// <-, line 119slice_from("og");break;case 14:// (, line 120// <-, line 120slice_from("ful");break;case 15:// (, line 121// <-, line 121slice_from("less");break;case 16:// (, line 122if (!(in_grouping_b(g_valid_LI, 99, 116))){return false;}// delete, line 122 slice_del();break;}return true;}private boolean r_Step_3() {int among_var;// (, line 126// [, line 127ket = cursor;// substring, line 127among_var = find_among_b(a_6, 9);if (among_var == 0){return false;}// ], line 127bra = cursor;// call R1, line 127if (!r_R1()){return false;}switch(among_var) {case 0:return false;case 1:// (, line 128// <-, line 128slice_from("tion");break;case 2:// (, line 129// <-, line 129slice_from("ate");break;case 3:// (, line 130// <-, line 130slice_from("al");break;case 4:// (, line 132// <-, line 132slice_from("ic");break;case 5:// (, line 134// delete, line 134 slice_del();break;case 6:// (, line 136// call R2, line 136if (!r_R2()){return false;}// delete, line 136 slice_del();break;}return true;}private boolean r_Step_4() {int among_var;int v_1;// (, line 140// [, line 141ket = cursor;// substring, line 141among_var = find_among_b(a_7, 18);if (among_var == 0){return false;}// ], line 141bra = cursor;// call R2, line 141if (!r_R2()){return false;}switch(among_var) {case 0:return false;case 1:// (, line 144// delete, line 144 slice_del();break;case 2:// (, line 145// or, line 145lab0: do {v_1 = limit - cursor;lab1: do {// literal, line 145if (!(eq_s_b(1, "s"))){break lab1;}break lab0;} while (false);cursor = limit - v_1;// literal, line 145if (!(eq_s_b(1, "t"))){return false;}} while (false);// delete, line 145 slice_del();break;}return true;}private boolean r_Step_5() {int among_var;int v_1;int v_2;// (, line 149// [, line 150ket = cursor;// substring, line 150among_var = find_among_b(a_8, 2);if (among_var == 0){return false;}// ], line 150bra = cursor;switch(among_var) {case 0:return false;case 1:// (, line 151// or, line 151lab0: do {v_1 = limit - cursor;lab1: do {// call R2, line 151if (!r_R2()){break lab1;}break lab0;} while (false);cursor = limit - v_1;// (, line 151// call R1, line 151if (!r_R1()){return false;}// not, line 151 {v_2 = limit - cursor;lab2: do {// call shortv, line 151if (!r_shortv()){break lab2;}return false;} while (false);cursor = limit - v_2;}} while (false);// delete, line 151 slice_del();break;case 2:// (, line 152// call R2, line 152if (!r_R2()){return false;}// literal, line 152if (!(eq_s_b(1, "l"))){return false;}// delete, line 152 slice_del();break;}return true;}private boolean r_exception2() {// (, line 156// [, line 158ket = cursor;// substring, line 158if (find_among_b(a_9, 8) == 0){return false;}// ], line 158bra = cursor;// atlimit, line 158if (cursor > limit_backward){return false;}return true;}private boolean r_exception1() {int among_var;// (, line 168// [, line 170bra = cursor;// substring, line 170among_var = find_among(a_10, 18);if (among_var == 0){return false;}// ], line 170ket = cursor;// atlimit, line 170if (cursor < limit){return false;}switch(among_var) {case 0:return false;case 1:// (, line 174// <-, line 174slice_from("ski");break;case 2:// (, line 175// <-, line 175slice_from("sky");break;case 3:// (, line 176// <-, line 176slice_from("die");break;case 4:// (, line 177// <-, line 177slice_from("lie");break;case 5:// (, line 178// <-, line 178slice_from("tie");break;case 6:// (, line 182// <-, line 182slice_from("idl");break;case 7:// (, line 183// <-, line 183slice_from("gentl");break;case 8:// (, line 184// <-, line 184slice_from("ugli");break;case 9:// (, line 185// <-, line 185slice_from("earli");break;case 10:// (, line 186// <-, line 186slice_from("onli");break;case 11:// (, line 187// <-, line 187slice_from("singl");break;}return true;}private boolean r_postlude() {int v_1;int v_2;// (, line 203// Boolean test Y_found, line 203if (!(B_Y_found)){return false;}// repeat, line 203replab0: while(true){v_1 = cursor;lab1: do {// (, line 203// goto, line 203golab2: while(true){v_2 = cursor;lab3: do {// (, line 203// [, line 203bra = cursor;// literal, line 203if (!(eq_s(1, "Y"))){break lab3;}// ], line 203ket = cursor;cursor = v_2;break golab2;} while (false);cursor = v_2;if (cursor >= limit){break lab1;}cursor++;}// <-, line 203slice_from("y");continue replab0;} while (false);cursor = v_1;break replab0;}return true;}public boolean stem() {int v_1;int v_2;int v_3;int v_4;int v_5;int v_6;int v_7;int v_8;int v_9;int v_10;int v_11;int v_12;int v_13;// (, line 205// or, line 207lab0: do {v_1 = cursor;lab1: do {// call exception1, line 207if (!r_exception1()){break lab1;}break lab0;} while (false);cursor = v_1;lab2: do {// not, line 208 {v_2 = cursor;lab3: do {// hop, line 208 {int c = cursor + 3;if (0 > c || c > limit){break lab3;}cursor = c;}break lab2;} while (false);cursor = v_2;}break lab0;} while (false);cursor = v_1;// (, line 208// do, line 209v_3 = cursor;lab4: do {// call prelude, line 209if (!r_prelude()){break lab4;}} while (false);cursor = v_3;// do, line 210v_4 = cursor;lab5: do {// call mark_regions, line 210if (!r_mark_regions()){break lab5;}} while (false);cursor = v_4;// backwards, line 211limit_backward = cursor; cursor = limit;// (, line 211// do, line 213v_5 = limit - cursor;lab6: do {// call Step_1a, line 213if (!r_Step_1a()){break lab6;}} while (false);cursor = limit - v_5;// or, line 215lab7: do {v_6 = limit - cursor;lab8: do {// call exception2, line 215if (!r_exception2()){break lab8;}break lab7;} while (false);cursor = limit - v_6;// (, line 215// do, line 217v_7 = limit - cursor;lab9: do {// call Step_1b, line 217if (!r_Step_1b()){break lab9;}} while (false);cursor = limit - v_7;// do, line 218v_8 = limit - cursor;lab10: do {// call Step_1c, line 218if (!r_Step_1c()){break lab10;}} while (false);cursor = limit - v_8;// do, line 220v_9 = limit - cursor;lab11: do {// call Step_2, line 220if (!r_Step_2()){break lab11;}} while (false);cursor = limit - v_9;// do, line 221v_10 = limit - cursor;lab12: do {// call Step_3, line 221if (!r_Step_3()){break lab12;}} while (false);cursor = limit - v_10;// do, line 222v_11 = limit - cursor;lab13: do {// call Step_4, line 222if (!r_Step_4()){break lab13;}} while (false);cursor = limit - v_11;// do, line 224v_12 = limit - cursor;lab14: do {// call Step_5, line 224if (!r_Step_5()){break lab14;}} while (false);cursor = limit - v_12;} while (false);cursor = limit_backward; // do, line 227v_13 = cursor;lab15: do {// call postlude, line 227if (!r_postlude()){break lab15;}} while (false);cursor = v_13;} while (false);return true;}public boolean equals( Object o ) {return o instanceof englishStemmer;}public int hashCode() {return englishStemmer.class.getName().hashCode();}}
然而,porter stemming 仅仅是一个基于后缀的词干提取技术,它仅仅定义了一些基本的后缀规则,能识别出"books"->"book"等. 然而针对一些诸如 "bought"->"buy","brought"->"bring"等异常形式并不能识别出来。
2. The dragon toolkit (http://dragon.ischool.drexel.edu/download.asp)
然后发现上面nlp 处理工具,其中的EngLemmatiser 类就是stem类,能提取出单词的词干。
它首先定义一些基本点后缀规则(只有十几条),然后定义一些独立于这些规则的异常词库(master slave 的形式,这样就能基本实现单词词干的正确提取,解决了porter stemming 存在的问题。
String dictionaryPath = "lemmatiser";EngLemmatiser lemmatiser = new EngLemmatiser(dictionaryPath, false, true);String a = "brought";String lemmatizedWord = lemmatiser.lemmatize(a);System.out.println(lemmatizedWord);
然而我还是觉得,在规则基础之上附加词典的技术过于死板,不够灵活。
3. Stanford CoreNLP
后来发现斯坦福大学的一个NLP工具,其中提取词干的技术:针对大量语料库进行机器学习,利用有限自动机提炼并生成规则(不必附加词典)。能完美解决词干的提取问题,准确率很高。它对地名、人名等专有词识别不出来,但达到了基本的需求。
String word="magnificus";Morphology morph=new Morphology();System.out.println(morph.stem(word));