This post is a detailed walkthrough of word2vec (CBOW, skip-gram, hierarchical softmax, and negative sampling), intended as a practical reference for developers working with these techniques.
- For an explanation of what word2vec is used for, see this blog post.
- The derivations here initially just followed the paper "word2vec Parameter Learning Explained"; my understanding deepened as I went, so the notation in the later formulas is not fully consistent with the earlier ones.
- Python code (my understanding of it is still shallow): link. The gradient that ties the formulas to the code is sketched right after this list.
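To bridge the formulas and the code below: both negative sampling and hierarchical softmax reduce each (target, label) pair to an independent logistic (binary) classification, and the script's update `g = alpha * (label - p)` is the resulting SGD step. In the paper's notation ($\mathbf{h}$ is the hidden vector, $\mathbf{v}'$ the output vector stored in `syn1[target]`, $t \in \{0,1\}$ the label), this is just the standard gradient, nothing specific to this post:

$$
E = -\bigl[t\log\sigma(\mathbf{v}'^{\top}\mathbf{h}) + (1-t)\log\bigl(1-\sigma(\mathbf{v}'^{\top}\mathbf{h})\bigr)\bigr],
\qquad
\frac{\partial E}{\partial \mathbf{v}'} = \bigl(\sigma(\mathbf{v}'^{\top}\mathbf{h}) - t\bigr)\mathbf{h},
\qquad
\frac{\partial E}{\partial \mathbf{h}} = \bigl(\sigma(\mathbf{v}'^{\top}\mathbf{h}) - t\bigr)\mathbf{v}'.
$$

The SGD step $\mathbf{v}' \leftarrow \mathbf{v}' + \alpha\,\bigl(t - \sigma(\mathbf{v}'^{\top}\mathbf{h})\bigr)\,\mathbf{h}$ is exactly `syn1[target] += g * neu1` in the CBOW branch (in the skip-gram branch `syn0[context_word]` plays the role of $\mathbf{h}$), and the error accumulated for the input vectors is `neu1e += g * syn1[target]`.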
import argparse
import math
import struct
import sys
import time
import warnings

import numpy as np
from multiprocessing import Pool, Value, Array


class VocabItem:
    def __init__(self, word):
        self.word = word
        self.count = 0
        self.path = None  # Path (list of indices) from the root to the word (leaf) in the Huffman tree
        self.code = None  # Huffman encoding of the word


class Vocab:
    def __init__(self, fi, min_count):
        vocab_items = []
        vocab_hash = {}
        word_count = 0
        fi = open(fi, 'r')

        # Add special tokens <bol> (beginning of line) and <eol> (end of line)
        for token in ['<bol>', '<eol>']:
            vocab_hash[token] = len(vocab_items)
            vocab_items.append(VocabItem(token))

        # Scan the training file line by line
        for line in fi:
            tokens = line.split()
            # Scan every token in the line
            for token in tokens:
                # Add unseen tokens to the vocabulary
                if token not in vocab_hash:
                    vocab_hash[token] = len(vocab_items)
                    vocab_items.append(VocabItem(token))
                #assert vocab_items[vocab_hash[token]].word == token, 'Wrong vocab_hash index'
                # Count occurrences of each token
                vocab_items[vocab_hash[token]].count += 1
                # Running total of words read
                word_count += 1

                if word_count % 10000 == 0:
                    sys.stdout.write("\rReading word %d\n" % word_count)
                    # sys.stdout.flush()

            # Count one <bol> and one <eol> per line, so word_count grows by 2
            vocab_items[vocab_hash['<bol>']].count += 1
            vocab_items[vocab_hash['<eol>']].count += 1
            word_count += 2

        self.bytes = fi.tell()          # Current file position, i.e. the size of the training file in bytes
        self.vocab_items = vocab_items  # List of VocabItem objects
        self.vocab_hash = vocab_hash    # Mapping from each token to its index in vocab
        self.word_count = word_count    # Total number of words in train file

        # Add special token <unk> (unknown),
        # merge words occurring less than min_count into <unk>, and
        # sort vocab in descending order by frequency in train file
        self.__sort(min_count)

        #assert self.word_count == sum([t.count for t in self.vocab_items]), 'word_count and sum of t.count do not agree'
        print('Total words in training file: %d' % self.word_count)
        print('Total bytes in training file: %d' % self.bytes)
        print('Vocab size: %d' % len(self))

    def __getitem__(self, i):
        return self.vocab_items[i]

    def __len__(self):
        return len(self.vocab_items)

    def __iter__(self):
        return iter(self.vocab_items)

    def __contains__(self, key):
        return key in self.vocab_hash

    def __sort(self, min_count):
        tmp = []
        tmp.append(VocabItem('<unk>'))
        unk_hash = 0

        count_unk = 0
        for token in self.vocab_items:
            if token.count < min_count:
                count_unk += 1
                tmp[unk_hash].count += token.count
            else:
                tmp.append(token)

        tmp.sort(key=lambda token: token.count, reverse=True)  # Sort by frequency, descending

        # Update vocab_hash
        vocab_hash = {}
        for i, token in enumerate(tmp):
            vocab_hash[token.word] = i

        self.vocab_items = tmp
        self.vocab_hash = vocab_hash

        print('Unknown vocab size:', count_unk)

    def indices(self, tokens):
        return [self.vocab_hash[token] if token in self else self.vocab_hash['<unk>'] for token in tokens]

    def encode_huffman(self):
        # Build a Huffman tree
        vocab_size = len(self)
        # The vocab is already sorted by descending frequency. The first vocab_size entries of
        # count hold the word (leaf) frequencies; the remaining vocab_size - 1 entries hold the
        # internal nodes created while merging. 1e15 is larger than any real frequency, so an
        # unfilled internal slot can never be picked by mistake during a merge.
        count = [t.count for t in self] + [1e15] * (vocab_size - 1)
        parent = [0] * (2 * vocab_size - 2)  # parent index of every node except the root
        binary = [0] * (2 * vocab_size - 2)  # 0/1 label on the edge from each node to its parent

        pos1 = vocab_size - 1  # pointer into the leaf section; starts at the least frequent word (vocab is sorted descending, 0-based)
        pos2 = vocab_size      # pointer into the internal-node section, i.e. the [1e15] * (vocab_size - 1) part

        for i in range(vocab_size - 1):
            # Find min1, the remaining node with the smallest count
            if pos1 >= 0:
                if count[pos1] < count[pos2]:
                    min1 = pos1
                    pos1 -= 1  # this leaf becomes a child of the new internal node; the next lookup sees the next-smallest leaf
                else:
                    min1 = pos2
                    pos2 += 1
            else:
                min1 = pos2
                pos2 += 1

            # Find min2, the remaining node with the second-smallest count
            if pos1 >= 0:
                if count[pos1] < count[pos2]:
                    min2 = pos1
                    pos1 -= 1
                else:
                    min2 = pos2
                    pos2 += 1
            else:
                min2 = pos2
                pos2 += 1

            count[vocab_size + i] = count[min1] + count[min2]  # frequency of the newly created parent node
            parent[min1] = vocab_size + i  # record the parent of node min1
            parent[min2] = vocab_size + i  # record the parent of node min2
            binary[min2] = 1               # the edge from min2 to the new parent is labelled 1 (the other edge stays 0)

        # The Huffman tree is now complete. Instead of pointers (as a C/C++ version would use),
        # it is stored in flat arrays: parent (parent index of each node), binary (0/1 label on
        # the edge to the parent) and count (only needed for picking minima during construction).

        # Assign binary code and path pointers to each vocab word.
        # Note: hierarchical softmax only requires *some* binary tree over the words; building a
        # Huffman tree from corpus frequencies is simply a convenient, well-defined construction
        # that also gives frequent words short codes.
        root_idx = 2 * vocab_size - 2  # index of the root node
        for i, token in enumerate(self):
            path = []  # internal nodes on the path from the root to this token
            code = []  # binary code along that path, read off from binary[]

            node_idx = i
            while node_idx < root_idx:
                if node_idx >= vocab_size:
                    path.append(node_idx)
                code.append(binary[node_idx])  # collected leaf-to-root here, reversed below
                node_idx = parent[node_idx]
            path.append(root_idx)

            # Store path and code in root-to-leaf order
            token.path = [j - vocab_size for j in path[::-1]]  # [::-1] reverses the list; j - vocab_size maps
            # internal-node indices onto 0..vocab_size-2, i.e. onto rows of syn1
            token.code = code[::-1]


'''
unigram, bigram and trigram are terms from natural language processing (NLP); the parent concept is the n-gram.
unigram: a single unit (word or character)
bigram: two adjacent units
trigram: three adjacent units
For example, for 西安交通大学:
unigram: 西/安/交/通/大/学
bigram: 西安/安交/交通/通大/大学
trigram: 西安交/安交通/交通大/通大学
'''
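
# Illustrative aside (added for this post, not part of the original script): the character
# n-grams listed in the string above can be produced with simple slicing. The names `s`,
# `bigrams` and `trigrams` are introduced here purely for the example.
s = '西安交通大学'
bigrams = [s[i:i + 2] for i in range(len(s) - 1)]   # ['西安', '安交', '交通', '通大', '大学']
trigrams = [s[i:i + 3] for i in range(len(s) - 2)]  # ['西安交', '安交通', '交通大', '通大学']
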
class UnigramTable:
    """
    A list of indices of tokens in the vocab following a power law distribution,
    used to draw negative samples.
    """
    def __init__(self, vocab):
        vocab_size = len(vocab)
        power = 0.75
        norm = sum([math.pow(t.count, power) for t in vocab])  # Normalizing constant

        table_size = int(1e8)  # Length of the unigram table
        table = np.zeros(table_size, dtype=np.uint32)

        print('Filling unigram table')
        p = 0  # Cumulative probability
        i = 0
        for j, unigram in enumerate(vocab):
            p += float(math.pow(unigram.count, power)) / norm
            while i < table_size and float(i) / table_size < p:
                table[i] = j
                i += 1
        self.table = table

    def sample(self, count):
        # Draw `count` random entries from the table
        indices = np.random.randint(low=0, high=len(self.table), size=count)
        return [self.table[i] for i in indices]


def sigmoid(z):
    if z > 6:
        return 1.0
    elif z < -6:
        return 0.0
    else:
        return 1 / (1 + math.exp(-z))


def init_net(dim, vocab_size):
    # Init syn0 with random numbers from a uniform distribution on the interval [-0.5, 0.5]/dim
    # np.random.uniform(a, b, size) draws `size` samples uniformly from [a, b)
    # vocab_size: size of the vocabulary
    tmp = np.random.uniform(low=-0.5/dim, high=0.5/dim, size=(vocab_size, dim))
    # Create a ctypes object from the numpy array so it can be shared across processes
    syn0 = np.ctypeslib.as_ctypes(tmp)
    syn0 = Array(syn0._type_, syn0, lock=False)

    # Init syn1 with zeros
    tmp = np.zeros(shape=(vocab_size, dim))
    syn1 = np.ctypeslib.as_ctypes(tmp)
    syn1 = Array(syn1._type_, syn1, lock=False)

    return (syn0, syn1)


def train_process(pid):
    # Set fi to point to the right chunk of the training file
    # (integer division so that fi.seek() receives an int under Python 3)
    start = vocab.bytes // num_processes * pid
    end = vocab.bytes if pid == num_processes - 1 else vocab.bytes // num_processes * (pid + 1)
    fi.seek(start)
    #print 'Worker %d beginning training at %d, ending at %d' % (pid, start, end)

    alpha = starting_alpha

    word_count = 0
    last_word_count = 0

    while fi.tell() < end:
        line = fi.readline().strip()
        # Skip blank lines
        if not line:
            continue

        # Init sent, a list of indices of words in line
        sent = vocab.indices(['<bol>'] + line.split() + ['<eol>'])

        for sent_pos, token in enumerate(sent):
            if word_count % 10000 == 0:
                global_word_count.value += (word_count - last_word_count)
                last_word_count = word_count

                # Recalculate alpha
                alpha = starting_alpha * (1 - float(global_word_count.value) / vocab.word_count)
                if alpha < starting_alpha * 0.0001:
                    alpha = starting_alpha * 0.0001

                # Print progress info
                sys.stdout.write("\rAlpha: %f Progress: %d of %d (%.2f%%)" %
                                 (alpha, global_word_count.value, vocab.word_count,
                                  float(global_word_count.value) / vocab.word_count * 100))
                sys.stdout.flush()

            # Randomize window size, where win is the max window size,
            # then collect the context words inside the window
            current_win = np.random.randint(low=1, high=win+1)
            context_start = max(sent_pos - current_win, 0)
            context_end = min(sent_pos + current_win + 1, len(sent))
            context = sent[context_start:sent_pos] + sent[sent_pos+1:context_end]  # Turn into an iterator?

            # CBOW
            if cbow:
                # Compute neu1, the hidden-layer vector (mean of the context word vectors)
                neu1 = np.mean(np.array([syn0[c] for c in context]), axis=0)
                assert len(neu1) == dim, 'neu1 and dim do not agree'

                # Init neu1e with zeros
                neu1e = np.zeros(dim)

                # Compute neu1e and update syn1
                if neg > 0:
                    # The true target word is labelled 1, negative samples are labelled 0
                    classifiers = [(token, 1)] + [(target, 0) for target in table.sample(neg)]
                else:
                    classifiers = zip(vocab[token].path, vocab[token].code)
                for target, label in classifiers:
                    # Dot product of the hidden layer with the output vector of this sample
                    z = np.dot(neu1, syn1[target])
                    # Squash it with the sigmoid
                    p = sigmoid(z)
                    # (p - label) is the prediction error; the gradient w.r.t. syn1[target] is (p - label) * neu1,
                    # and alpha is the learning rate
                    g = alpha * (label - p)
                    neu1e += g * syn1[target]  # Error to backpropagate to syn0 (computed before syn1 is updated)
                    syn1[target] += g * neu1   # Update syn1

                # Update syn0
                for context_word in context:
                    syn0[context_word] += neu1e

            # Skip-gram
            else:
                for context_word in context:
                    # Init neu1e with zeros
                    neu1e = np.zeros(dim)

                    # Compute neu1e and update syn1
                    if neg > 0:
                        classifiers = [(token, 1)] + [(target, 0) for target in table.sample(neg)]
                    else:
                        classifiers = zip(vocab[token].path, vocab[token].code)
                    for target, label in classifiers:
                        z = np.dot(syn0[context_word], syn1[target])
                        p = sigmoid(z)
                        g = alpha * (label - p)
                        neu1e += g * syn1[target]               # Error to backpropagate to syn0
                        syn1[target] += g * syn0[context_word]  # Update syn1

                    # Update syn0
                    syn0[context_word] += neu1e

            word_count += 1

    # Print progress info
    global_word_count.value += (word_count - last_word_count)
    sys.stdout.write("\rAlpha: %f Progress: %d of %d (%.2f%%)" %
                     (alpha, global_word_count.value, vocab.word_count,
                      float(global_word_count.value) / vocab.word_count * 100))
    sys.stdout.flush()
    fi.close()


def save(vocab, syn0, fo, binary):
    print('Saving model to', fo)
    dim = len(syn0[0])
    xxx = binary  # the binary flag is unused here; binary output is disabled below
    # if binary:
    #     fo = open(fo, 'wb')
    #     fo.write("%d %d\n" % (len(syn0), dim))
    #     fo.write('\n')
    #     for token, vector in zip(vocab, syn0):
    #         fo.write('%s ' % token.word)
    #         for s in vector:
    #             fo.write(struct.pack('f', s))
    #         fo.write('\n')
    # else:
    fo = open(fo, 'w')
    fo.write('%d %d\n' % (len(syn0), dim))
    for token, vector in zip(vocab, syn0):
        word = token.word
        vector_str = ' '.join([str(s) for s in vector])
        fo.write('%s %s\n' % (word, vector_str))
    fo.close()


def __init_process(*args):
    global vocab, syn0, syn1, table, cbow, neg, dim, starting_alpha
    global win, num_processes, global_word_count, fi

    vocab, syn0_tmp, syn1_tmp, table, cbow, neg, dim, starting_alpha, win, num_processes, global_word_count = args[:-1]
    fi = open(args[-1], 'r')
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', RuntimeWarning)
        syn0 = np.ctypeslib.as_array(syn0_tmp)
        syn1 = np.ctypeslib.as_array(syn1_tmp)


def train(fi, fo, cbow, neg, dim, alpha, win, min_count, num_processes, binary):
    # Read train file to init vocab
    vocab = Vocab(fi, min_count)

    # Init net
    # dim: dimensionality of the word vectors; len(vocab): vocabulary size
    # syn0 and syn1 are the two weight matrices of word2vec, i.e. the word vector matrices
    syn0, syn1 = init_net(dim, len(vocab))

    global_word_count = Value('i', 0)
    table = None
    if neg > 0:
        print('Initializing unigram table')
        table = UnigramTable(vocab)
    else:
        print('Initializing Huffman tree')
        vocab.encode_huffman()

    # Begin training using num_processes workers
    t0 = time.time()
    pool = Pool(processes=num_processes, initializer=__init_process,
                initargs=(vocab, syn0, syn1, table, cbow, neg, dim, alpha,
                          win, num_processes, global_word_count, fi))
    pool.map(train_process, range(num_processes))
    t1 = time.time()
    print('Completed training. Training took', (t1 - t0) / 60, 'minutes')

    # Save model to file
    save(vocab, syn0, fo, binary)


if __name__ == '__main__':
    # parser = argparse.ArgumentParser()
    # parser.add_argument('-train', help='Training file', dest='fi', required=True)
    # parser.add_argument('-model', help='Output model file', dest='fo', required=True)
    # parser.add_argument('-cbow', help='1 for CBOW, 0 for skip-gram', dest='cbow', default=1, type=int)
    # parser.add_argument('-negative', help='Number of negative examples (>0) for negative sampling, 0 for hierarchical softmax', dest='neg', default=5, type=int)
    # parser.add_argument('-dim', help='Dimensionality of word embeddings', dest='dim', default=100, type=int)
    # parser.add_argument('-alpha', help='Starting alpha', dest='alpha', default=0.025, type=float)
    # parser.add_argument('-window', help='Max window length', dest='win', default=5, type=int)
    # parser.add_argument('-min-count', help='Min count for words used to learn <unk>', dest='min_count', default=5, type=int)
    # parser.add_argument('-processes', help='Number of processes', dest='num_processes', default=1, type=int)
    # parser.add_argument('-binary', help='1 for output model in binary format, 0 otherwise', dest='binary', default=0, type=int)
    # #TO DO: parser.add_argument('-epoch', help='Number of training epochs', dest='epoch', default=1, type=int)
    # args = parser.parse_args()

    # train(args.fi, args.fo, bool(args.cbow), args.neg, args.dim, args.alpha, args.win,
    #       args.min_count, args.num_processes, bool(args.binary))
    train('questions-words.txt', 'result.txt', 0, 0, 300, 0.01, 1, 5, 1, 0)
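The text format written by save() is one header line ("vocab_size dim") followed by one "word v1 ... v_dim" line per vocabulary entry, so the trained vectors can be inspected without any extra tooling. Below is a minimal sketch of reading them back and comparing two words; the file name result.txt matches the call above, while load_vectors and cosine are helper names introduced here and are not part of the script.

import numpy as np

def load_vectors(path):
    # Parse the plain-text format written by save(): a header line, then "word v1 ... v_dim" per line
    vectors = {}
    with open(path, 'r') as f:
        f.readline()  # skip the "vocab_size dim" header
        for line in f:
            parts = line.rstrip().split(' ')
            vectors[parts[0]] = np.array([float(x) for x in parts[1:]])
    return vectors

def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

vecs = load_vectors('result.txt')
# e.g. cosine(vecs['king'], vecs['queen']) -- assuming both words survived the min_count cutoff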
That concludes this walkthrough of word2vec (CBOW, SG, hierarchical softmax, negative sampling); I hope it is of some help.