本文主要是介绍北大天网搜索引擎TSE分析及完全注释[3]来到关键字分词及相关性分析程序,希望对大家解决编程问题提供一定的参考价值,需要的开发者们随着小编来一起学习吧!
由前面的注释我们可以知道，查询关键字和字典文件准备好后，将进入用户关键字分词阶段。
//TSESearch.cpp中:
- CHzSeg iHzSeg; //include ChSeg/HzSeg.h
- //
- iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery); //将get到的查询变量分词分成 "我/ 爱/ 你们/ 的/ 格式"
- vector<STRING></STRING> vecTerm;
- iQuery.ParseQuery(vecTerm); //将以"/"划分开的关键字一一顺序放入一个向量容器中
- set<STRING></STRING> setRelevantRst;
- iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);
- gettimeofday(&end_tv,&tz);
- // search end
- //搜索完毕
// Excerpt from TSESearch.cpp: segment the submitted query, split it into
// terms, and collect the matching document ids.
CHzSeg iHzSeg; // declared in ChSeg/HzSeg.h

// Segment the query string; the result joins the terms with the segmenter's
// separator, e.g. "我/ 爱/ 你们/ 的/ 格式".
iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery);

// Split the segmented query on the separator and store the terms in order.
vector<string> vecTerm;
iQuery.ParseQuery(vecTerm);

// Look the terms up in the inverted index and collect the document ids.
set<string> setRelevantRst;
iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);

gettimeofday(&end_tv,&tz);
// search end
- 看CHzSeg 中的这个方法
看CHzSeg 中的这个方法
- //ChSeg/HzSeg.h
//ChSeg/HzSeg.h
- /**
- * 程序翻译说明
- * 进一步净化数据,转换汉字
- * @access public
- * @param CDict, string 参数的汉字说明:字典,查询字符串
- * @return string 0
- */
- // process a sentence before segmentation
- //在分词前处理句子
- string CHzSeg::SegmentSentenceMM (CDict &dict, string s1) const
- {
- string s2="";
- unsigned int i,len;
- while (!s1.empty())
- {
- unsigned char ch=(unsigned char) s1[0];
- if(ch<128)
- { // deal with ASCII
- i=1;
- len = s1.size();
- while (i<LEN len="s1.length();" i="0;" 中文标点等非汉字字符="" if="" else="" yhf="" s1="s1.substr(i);" by="" added="" ch="=13)" s2="" cr=""></LEN>=161)
- && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=162 && (unsigned char)s1[i+1]<=168)))
- && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=171 && (unsigned char)s1[i+1]<=191)))
- && (!((unsigned char)s1[i]==163 && ((unsigned char)s1[i+1]==172 || (unsigned char)s1[i+1]==161)
- || (unsigned char)s1[i+1]==168 || (unsigned char)s1[i+1]==169 || (unsigned char)s1[i+1]==186
- || (unsigned char)s1[i+1]==187 || (unsigned char)s1[i+1]==191)))
- {
- ii=i+2; // 假定没有半个汉字
- }
- if (i==0) ii=i+2;
- // 不处理中文空格
- if (!(ch==161 && (unsigned char)s1[1]==161))
- {
- if (i <= s1.size()) // yhf
- // 其他的非汉字双字节字符可能连续输出
- s2 += s1.substr(0, i) + SEPARATOR;
- else break; // yhf
- }
- if (i <= s1.size()) // yhf
- s1s1=s1.substr(i);
- else break; //yhf
- continue;
- }
- }
- // 以下处理汉字串
- i = 2;
- len = s1.length();
- while(i<LEN></LEN>=176)
- // while(i<LEN></LEN>=128 && (unsigned char)s1[i]!=161)
- i+=2;
- s2+=SegmentHzStrMM(dict, s1.substr(0,i));
- if (i <= len) // yhf
- s1s1=s1.substr(i);
- else break; // yhf
- }
- return s2;
- }
/**
 * Pre-process a sentence before dictionary segmentation.
 *
 * The input is consumed from the front, one run at a time, and each run is
 * classified by its leading byte:
 *   - lead byte < 128  : single-byte (ASCII) text,
 *   - lead byte < 176  : double-byte non-Hanzi characters (punctuation etc.),
 *   - otherwise        : Hanzi, which is handed to SegmentHzStrMM().
 * Double-byte characters are assumed to be complete (no dangling half).
 * The byte ranges look like a GB2312-style encoding — TODO confirm.
 *
 * NOTE(review): in the published listing the header of the first inner loop
 * and the bound of the Hanzi loop were eaten by HTML tag stripping
 * ("while (i=161)", "while(i=176)"), taking the whole ASCII branch with them.
 * The missing lines below were rebuilt from the surviving statements and the
 * token residue left in the garbled copy; verify them against the upstream
 * HzSeg.cpp before relying on the exact whitespace handling.
 *
 * @param dict  dictionary forwarded to SegmentHzStrMM()
 * @param s1    sentence to segment (passed by value and consumed)
 * @return the processed sentence; non-Hanzi runs are followed by SEPARATOR
 */
string CHzSeg::SegmentSentenceMM (CDict &dict, string s1) const
{
    string s2="";
    unsigned int i,len;

    while (!s1.empty())
    {
        unsigned char ch=(unsigned char) s1[0];
        if(ch<128)
        { // deal with ASCII
            i=1;
            len = s1.size();
            // Extend the run up to the next non-ASCII byte or line break.
            while (i<len && ((unsigned char)s1[i]<128)
                && (s1[i]!=10) && (s1[i]!=13)) // LF, CR
            {
                i++;
            }

            if ((ch!=32) && (ch!=10) && (ch!=13)) // SP, LF, CR
            {
                s2 += s1.substr(0, i) + SEPARATOR;
            }
            else
            {
                // NOTE(review): a run that starts with a space is dropped
                // here; only line breaks are copied through. Reconstructed
                // from the residue "ch==13)" — confirm against upstream.
                if (ch==10 || ch==13) // added by yhf
                {
                    s2 += s1.substr(0, i);
                }
            }

            if (i <= s1.size()) // added by yhf
                s1=s1.substr(i);
            else break; // yhf

            continue;
        }
        else
        {
            if (ch<176)
            { // Chinese punctuation and other non-Hanzi characters
                i=0;
                len = s1.length();
                // Extend the run over double-byte symbols, stopping at the
                // listed lead/second byte pairs so that each of those is
                // emitted on its own (presumably sentence punctuation).
                // NOTE(review): && binds tighter than ||, so the second-byte
                // tests for 168/169/186/187/191 apply to ANY lead byte, not
                // only 163. Kept exactly as published.
                while (i<len && ((unsigned char)s1[i]<176) && ((unsigned char)s1[i]>=161)
                    && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=162 && (unsigned char)s1[i+1]<=168)))
                    && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=171 && (unsigned char)s1[i+1]<=191)))
                    && (!((unsigned char)s1[i]==163 && ((unsigned char)s1[i+1]==172 || (unsigned char)s1[i+1]==161)
                    || (unsigned char)s1[i+1]==168 || (unsigned char)s1[i+1]==169 || (unsigned char)s1[i+1]==186
                    || (unsigned char)s1[i+1]==187 || (unsigned char)s1[i+1]==191)))
                {
                    i=i+2; // assume there is no half character
                }

                // The first character itself stopped the loop: take it alone.
                if (i==0) i=i+2;

                // Do not emit the Chinese (double-byte) space 161,161.
                if (!(ch==161 && (unsigned char)s1[1]==161))
                {
                    if (i <= s1.size()) // yhf
                        // other non-Hanzi double-byte characters may be output together
                        s2 += s1.substr(0, i) + SEPARATOR;
                    else break; // yhf
                }

                if (i <= s1.size()) // yhf
                    s1=s1.substr(i);
                else break; // yhf

                continue;
            }
        }

        // From here on: a run of Hanzi.
        i = 2;
        len = s1.length();
        while(i<len && (unsigned char)s1[i]>=176)
        // while(i<len && (unsigned char)s1[i]>=128 && (unsigned char)s1[i]!=161)
            i+=2;

        s2+=SegmentHzStrMM(dict, s1.substr(0,i));

        if (i <= len) // yhf
            s1=s1.substr(i);
        else break; // yhf
    }

    return s2;
}
- //Query.cpp
//Query.cpp
- <PRE class=csharp name="code">/**
- * 程序翻译说明
- * 将以"/"划分开的关键字一一顺序放入一个向量容器中
- *
- * @access public
- * @param vector<STRING></STRING> 参数的汉字说明:向量容器
- * @return void
- */
- void CQuery::ParseQuery(vector<STRING></STRING> &vecTerm)
- {
- string::size_type idx;
- while ( (idx = m_sSegQuery.find("/ ")) != string::npos ) {
- vecTerm.push_back(m_sSegQuery.substr(0,idx));
- m_sSegQuerym_sSegQuery = m_sSegQuery.substr(idx+3);
- }
- }
- </PRE>
- <PRE class=csharp name="code"> </PRE>
- <PRE class=csharp name="code"><PRE class=csharp name="code">/**
- * 程序翻译说明
- * 相关性分析查询,构造结果集合setRelevantRst //瓶颈所在
- *
- * @access public
- * @param vector<STRING></STRING> map set<STRING></STRING> 参数的汉字说明: 用户提交关键字的分词组,倒排索引映射,相关性结果集合
- * @return string 0
- */
- bool CQuery::GetRelevantRst
- (
- vector<STRING></STRING> &vecTerm,
- map &mapBuckets,
- set<STRING></STRING> &setRelevantRst
- ) const
- {
- set<STRING></STRING> setSRst;
- bool bFirst=true;
- vector<STRING></STRING>::iterator itTerm = vecTerm.begin();
- for ( ; itTerm != vecTerm.end(); ++itTerm )
- {
- setSRst.clear();
- copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));
- map mapRstDoc;
- string docid;
- int doccnt;
- map::iterator itBuckets = mapBuckets.find(*itTerm);
- if (itBuckets != mapBuckets.end())
- {
- string strBucket = (*itBuckets).second;
- string::size_type idx;
- idx = strBucket.find_first_not_of(" ");
- strBucketstrBucket = strBucket.substr(idx);
- while ( (idx = strBucket.find(" ")) != string::npos )
- {
- docid = strBucket.substr(0,idx);
- doccnt = 0;
- if (docid.empty()) continue;
- map::iterator it = mapRstDoc.find(docid);
- if ( it != mapRstDoc.end() )
- {
- doccnt = (*it).second + 1;
- mapRstDoc.erase(it);
- }
- mapRstDoc.insert( pair(docid,doccnt) );
- strBucketstrBucket = strBucket.substr(idx+1);
- }
- // remember the last one
- docid = strBucket;
- doccnt = 0;
- map::iterator it = mapRstDoc.find(docid);
- if ( it != mapRstDoc.end() )
- {
- doccnt = (*it).second + 1;
- mapRstDoc.erase(it);
- }
- mapRstDoc.insert( pair(docid,doccnt) );
- }
- // sort by term frequencty
- multimap > newRstDoc;
- map::iterator it0 = mapRstDoc.begin();
- for ( ; it0 != mapRstDoc.end(); ++it0 ){
- newRstDoc.insert( pair((*it0).second,(*it0).first) );
- }
- multimap::iterator itNewRstDoc = newRstDoc.begin();
- setRelevantRst.clear();
- for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){
- string docid = (*itNewRstDoc).second;
- if (bFirst==true) {
- setRelevantRst.insert(docid);
- continue;
- }
- if ( setSRst.find(docid) != setSRst.end() ){
- setRelevantRst.insert(docid);
- }
- }
- //cout << "setRelevantRst.size(): " << setRelevantRst.size() << "<BR>";
- bFirst = false;
- }
- return true;
- }</PRE>
- </PRE>
- 接下来就是显示了。前面都只是处理数据，得到 setRelevantRst 这个查询结果集合，这里就不多说了；下面就和 PHP 之类的脚本语言差不多：格式化结果集合并显示出来。
- /**
- * 程序翻译说明
- * 将以"/"划分开的关键字一一顺序放入一个向量容器中
- *
- * @access public
- * @param vector<STRING></STRING> 参数的汉字说明:向量容器
- * @return void
- */
- void CQuery::ParseQuery(vector<STRING></STRING> &vecTerm)
- {
- string::size_type idx;
- while ( (idx = m_sSegQuery.find("/ ")) != string::npos ) {
- vecTerm.push_back(m_sSegQuery.substr(0,idx));
- m_sSegQuery = m_sSegQuery.substr(idx+3);
- }
- }
- <PRE class=csharp name="code">/**
- * 程序翻译说明
- * 相关性分析查询,构造结果集合setRelevantRst //瓶颈所在
- *
- * @access public
- * @param vector<STRING></STRING> map set<STRING></STRING> 参数的汉字说明: 用户提交关键字的分词组,倒排索引映射,相关性结果集合
- * @return string 0
- */
- bool CQuery::GetRelevantRst
- (
- vector<STRING></STRING> &vecTerm,
- map &mapBuckets,
- set<STRING></STRING> &setRelevantRst
- ) const
- {
- set<STRING></STRING> setSRst;
- bool bFirst=true;
- vector<STRING></STRING>::iterator itTerm = vecTerm.begin();
- for ( ; itTerm != vecTerm.end(); ++itTerm )
- {
- setSRst.clear();
- copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));
- map mapRstDoc;
- string docid;
- int doccnt;
- map::iterator itBuckets = mapBuckets.find(*itTerm);
- if (itBuckets != mapBuckets.end())
- {
- string strBucket = (*itBuckets).second;
- string::size_type idx;
- idx = strBucket.find_first_not_of(" ");
- strBucket = strBucket.substr(idx);
- while ( (idx = strBucket.find(" ")) != string::npos )
- {
- docid = strBucket.substr(0,idx);
- doccnt = 0;
- if (docid.empty()) continue;
- map::iterator it = mapRstDoc.find(docid);
- if ( it != mapRstDoc.end() )
- {
- doccnt = (*it).second + 1;
- mapRstDoc.erase(it);
- }
- mapRstDoc.insert( pair(docid,doccnt) );
- strBucket = strBucket.substr(idx+1);
- }
- // remember the last one
- docid = strBucket;
- doccnt = 0;
- map::iterator it = mapRstDoc.find(docid);
- if ( it != mapRstDoc.end() )
- {
- doccnt = (*it).second + 1;
- mapRstDoc.erase(it);
- }
- mapRstDoc.insert( pair(docid,doccnt) );
- }
- // sort by term frequencty
- multimap > newRstDoc;
- map::iterator it0 = mapRstDoc.begin();
- for ( ; it0 != mapRstDoc.end(); ++it0 ){
- newRstDoc.insert( pair((*it0).second,(*it0).first) );
- }
- multimap::iterator itNewRstDoc = newRstDoc.begin();
- setRelevantRst.clear();
- for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){
- string docid = (*itNewRstDoc).second;
- if (bFirst==true) {
- setRelevantRst.insert(docid);
- continue;
- }
- if ( setSRst.find(docid) != setSRst.end() ){
- setRelevantRst.insert(docid);
- }
- }
- //cout << "setRelevantRst.size(): " << setRelevantRst.size() << "<BR>";
- bFirst = false;
- }
- return true;
- }</PRE>
接下来就是显示了。前面都只是处理数据，得到 setRelevantRst 这个查询结果集合，这里就不多说了；下面就和 PHP 之类的脚本语言差不多：格式化结果集合并显示出来。
- /**
- * 程序翻译说明
- * 相关性分析查询,构造结果集合setRelevantRst //瓶颈所在
- *
- * @access public
- * @param vector<STRING></STRING> map set<STRING></STRING> 参数的汉字说明: 用户提交关键字的分词组,倒排索引映射,相关性结果集合
- * @return string 0
- */
- bool CQuery::GetRelevantRst
- (
- vector<STRING></STRING> &vecTerm,
- map &mapBuckets,
- set<STRING></STRING> &setRelevantRst
- ) const
- {
- set<STRING></STRING> setSRst;
- bool bFirst=true;
- vector<STRING></STRING>::iterator itTerm = vecTerm.begin();
- for ( ; itTerm != vecTerm.end(); ++itTerm )
- {
- setSRst.clear();
- copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));
- map mapRstDoc;
- string docid;
- int doccnt;
- map::iterator itBuckets = mapBuckets.find(*itTerm);
- if (itBuckets != mapBuckets.end())
- {
- string strBucket = (*itBuckets).second;
- string::size_type idx;
- idx = strBucket.find_first_not_of(" ");
- strBucket = strBucket.substr(idx);
- while ( (idx = strBucket.find(" ")) != string::npos )
- {
- docid = strBucket.substr(0,idx);
- doccnt = 0;
- if (docid.empty()) continue;
- map::iterator it = mapRstDoc.find(docid);
- if ( it != mapRstDoc.end() )
- {
- doccnt = (*it).second + 1;
- mapRstDoc.erase(it);
- }
- mapRstDoc.insert( pair(docid,doccnt) );
- strBucket = strBucket.substr(idx+1);
- }
- // remember the last one
- docid = strBucket;
- doccnt = 0;
- map::iterator it = mapRstDoc.find(docid);
- if ( it != mapRstDoc.end() )
- {
- doccnt = (*it).second + 1;
- mapRstDoc.erase(it);
- }
- mapRstDoc.insert( pair(docid,doccnt) );
- }
- // sort by term frequencty
- multimap > newRstDoc;
- map::iterator it0 = mapRstDoc.begin();
- for ( ; it0 != mapRstDoc.end(); ++it0 ){
- newRstDoc.insert( pair((*it0).second,(*it0).first) );
- }
- multimap::iterator itNewRstDoc = newRstDoc.begin();
- setRelevantRst.clear();
- for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){
- string docid = (*itNewRstDoc).second;
- if (bFirst==true) {
- setRelevantRst.insert(docid);
- continue;
- }
- if ( setSRst.find(docid) != setSRst.end() ){
- setRelevantRst.insert(docid);
- }
- }
- //cout << "setRelevantRst.size(): " << setRelevantRst.size() << "<BR>";
- bFirst = false;
- }
- return true;
- }
//TSESearch.cpp
- //下面开始显示
- CDisplayRst iDisplayRst;
- iDisplayRst.ShowTop();
- float used_msec = (end_tv.tv_sec-begin_tv.tv_sec)*1000
- +((float)(end_tv.tv_usec-begin_tv.tv_usec))/(float)1000;
- iDisplayRst.ShowMiddle(iQuery.m_sQuery,used_msec,
- setRelevantRst.size(), iQuery.m_iStart);
- iDisplayRst.ShowBelow(vecTerm,setRelevantRst,vecDocIdx,iQuery.m_iStart);
这篇关于北大天网搜索引擎TSE分析及完全注释[3]来到关键字分词及相关性分析程序的文章就介绍到这儿,希望我们推荐的文章对编程师们有所帮助!