本文主要介绍【7-1】实验——实体统一和歧义消除,希望能为大家解决编程问题提供一定的参考,需要的开发者可以跟随小编一起学习!
一、使用jieba完成公司名的实体统一
#核心代码:建立main_extract,输入公司名,返回统一后的简称
def main_extract(company_name, d_4_delete, stop_word, d_city_province):
    """Return the unified short form of a company name.

    The name is segmented with ``pseg.cut`` and then passed through three
    helper steps, each of which receives the previous step's result.

    Args:
        company_name: The company name to normalize.
        d_4_delete: Company suffixes, handed to ``delete_suffix``.
        stop_word: Stop words. Accepted for interface compatibility; this
            function body does not read it.
        d_city_province: City/province names, handed to ``city_prov_ahead``.

    Returns:
        The pieces returned by the last helper, concatenated with no
        separator.
    """
    segmented = pseg.cut(company_name)
    # Move the detected place names to the front.
    with_region_first = city_prov_ahead(segmented, d_city_province)
    # Remove generic company suffixes.
    without_suffix = delete_suffix(with_region_first, d_4_delete)
    # Any other custom processing.
    final_parts = my_function(without_suffix)
    return ''.join(final_parts)
#核心代码:初始化加载步骤,输出需要使用的词典
def my_initial():
    """Load the dictionaries used for company-name unification.

    Reads four UTF-8 text files (one entry per line) from ``../data/dict``,
    resolved relative to the current working directory: city names,
    province names, company suffixes and stop words.

    Returns:
        A tuple ``(d_4_delete, stop_word, d_city_province)`` of sets:
        company suffixes, stop words, and the union of city and province
        names.

    Raises:
        OSError: If any of the dictionary files cannot be opened.
    """
    # City and province names share one lookup set.
    d_city_province = _load_word_set("../data/dict/co_City_Dim.txt")
    d_city_province |= _load_word_set("../data/dict/co_Province_Dim.txt")
    # Company suffixes.
    d_4_delete = _load_word_set(r"../data/dict/company_suffix.txt")
    # Stop words.
    stop_word = _load_word_set(r"../data/dict/stopwords.txt")
    return d_4_delete, stop_word, d_city_province


def _load_word_set(path):
    """Return the set of lines in the UTF-8 text file at *path*."""
    with open(path, encoding='utf-8') as handle:
        # Strip only the line terminator. Slicing with [:-1] would cut a real
        # character off the last entry when the file has no trailing newline.
        return {line.rstrip('\n') for line in handle}
二、使用tf-idf完成实体消歧
#建立关键词组,将需要进行实体消歧的实体存进keyword_list
import collections

# Concatenate every entity_name value, each followed by '|', into one string.
s = ''.join(name + '|' for name in entity_data['entity_name'].values.tolist())

# Split the combined string on '|' and keep every token that occurs more than
# once; these are the names that need disambiguation.
keyword_list = [
    token
    for token, occurrences in collections.Counter(s.split('|')).items()
    if occurrences > 1
]
#生成tfidf矩阵
from sklearn.feature_extraction.text import TfidfVectorizer

# Segment each entity description with jieba and re-join the tokens with
# spaces so the vectorizer can split on whitespace.
train_sentence = [
    ' '.join(jieba.cut(description))
    for description in entity_data['desc'].values
]

# NOTE(review): TfidfVectorizer's default token_pattern keeps only tokens of
# two or more word characters, so single-character segments are dropped --
# confirm that is intended for this corpus.
vectorizer = TfidfVectorizer()
# Fit on all descriptions; X has one TF-IDF row per description.
X = vectorizer.fit_transform(train_sentence)
#获取包含关键词的句子中关键词所属的entity_id
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def get_entityid(sentence):
    """Return the entity id whose description best matches *sentence*.

    The sentence is segmented with jieba, projected with the module-level
    ``vectorizer`` and compared against every row of the module-level
    TF-IDF matrix ``X`` by cosine similarity.

    Args:
        sentence: Text containing the keyword to disambiguate.

    Returns:
        ``1001`` plus the index of the row of ``X`` that sorts last by
        similarity (the highest score).
    """
    # presumably row i of X belongs to entity id 1001 + i -- verify against
    # the entity table.
    first_entity_id = 1001
    segmented = ' '.join(jieba.cut(sentence))
    query_vector = vectorizer.transform([segmented])
    similarities = cosine_similarity(query_vector, X)[0]
    # argsort is ascending, so the last position holds the best match.
    best_row = np.argsort(similarities)[-1]
    return first_entity_id + best_row
这篇关于【7-1】实验——实体统一和歧义消除的文章就介绍到这儿,希望我们推荐的文章对程序员们有所帮助!