word2vec详解（CBOW、SG、hierarchical softmax、negative sampling）

本文主要是介绍word2vec详解（CBOW、SG、hierarchical softmax、negative sampling），希望对大家解决编程问题提供一定的参考价值，需要的开发者们随着小编来一起学习吧！
word2vec用来干什么的解释，参见这篇博客
一开始也是纯照论文《word2vec Parameter Learning Explained》推公式，到后来理解深了一点，所以后面和前面的公式间有点乱。
在这里插入图片描述
python代码：理解不深刻。链接
import argparse
import math
import struct
import sys
import time
import warningsimport numpy as npfrom multiprocessing import Pool, Value, Arrayclass VocabItem:def __init__(self, word):self.word = wordself.count = 0self.path = None # Path (list of indices) from the root to the word (leaf) 每一个词哈夫曼树的路径self.code = None # Huffman encoding 每个词的哈夫曼编码class Vocab:def __init__(self, fi, min_count):vocab_items = []vocab_hash = {}word_count = 0fi = open(fi, 'r')# Add special tokens <bol> (beginning of line) and <eol> (end of line)# 添加特殊表示符（开始、停止）信息for token in ['<bol>', '<eol>']:vocab_hash[token] = len(vocab_items)vocab_items.append(VocabItem(token))# 查看文本# 查看文本的每一行for line in fi:tokens = line.split()# 查看每一行的每一个词for token in tokens:# 添加新词（不要重复）if token not in vocab_hash:vocab_hash[token] = len(vocab_items)vocab_items.append(VocabItem(token))#assert vocab_items[vocab_hash[token]].word == token, 'Wrong vocab_hash index'# 计算每个词的数量vocab_items[vocab_hash[token]].count += 1# 总词数word_count += 1if word_count % 10000 == 0:sys.stdout.write("\rReading word %d \n" % word_count)# sys.stdout.flush()# Add special tokens <bol> (beginning of line) and <eol> (end of line)# 每执行完一行，加一个开头结尾标识符，词数也加2个vocab_items[vocab_hash['<bol>']].count += 1vocab_items[vocab_hash['<eol>']].count += 1word_count += 2self.bytes = fi.tell()# 返回当前文件位置self.vocab_items = vocab_items         # List of VocabItem objectsself.vocab_hash = vocab_hash           # Mapping from each token to its index in vocabself.word_count = word_count           # Total number of words in train file# Add special token <unk> (unknown),# merge words occurring less than min_count into <unk>, and# sort vocab in descending order by frequency in train file# 按照给定的min_count，以每个词出现的次数为标准进行排序self.__sort(min_count)#assert self.word_count == sum([t.count for t in self.vocab_items]), 'word_count and sum of t.count do not agree'print ('Total words in training file: %d' % self.word_count)print ('Total bytes in training file: %d' % self.bytes)print ('Vocab size: %d' % len(self))def __getitem__(self, i):return self.vocab_items[i]def __len__(self):return len(self.vocab_items)def __iter__(self):return iter(self.vocab_items)def __contains__(self, key):return key in self.vocab_hashdef __sort(self, min_count):tmp = []tmp.append(VocabItem('<unk>'))unk_hash = 0count_unk = 0for token in self.vocab_items:if token.count < min_count:count_unk += 1tmp[unk_hash].count += token.countelse:tmp.append(token)tmp.sort(key=lambda token : token.count, reverse=True)# 降序排列# Update vocab_hashvocab_hash = {}for i, token in enumerate(tmp):vocab_hash[token.word] = iself.vocab_items = tmpself.vocab_hash = vocab_hashprint ('Unknown vocab size:', count_unk)def indices(self, tokens):return [self.vocab_hash[token] if token in self else self.vocab_hash['<unk>'] for token in tokens]def encode_huffman(self):# Building a Huffman treevocab_size = len(self)# count已经是由大到小排好序的词频count = [t.count for t in self] + [1e15] * (vocab_size - 1)  # 创建单词叶结点以及Huffman的中间节点# 后面部分的目的是储存中间节点，比如将两个单词构造一个树结构的情况,1e15=10^15，目的是构造出来的小数中的频次的值一定会大于所有单词# 出现的频次，否则会出现将两个树进行合并时出现错位的问题parent = [0] * (2 * vocab_size - 2)  # [0]*2=[0,0] parent是Huffman树中节点的个数，parent节点个数貌似和Huffman边的个数一样？？binary = [0] * (2 * vocab_size - 2)  # Huffman树中边的个数pos1 = vocab_size - 1  # 和单词数目对应，表示叶结点[t.count for t in self]部分,并且因为# 单词表是根据降序排列的，vocab_size表示最小的频次的单词，之所以是-1是因为数组从0开始索引，而不是像字典可以根据具体单词索引pos2 = vocab_size  # 和中间节点对应表示[1e15]*(vocab_size-1)这部分的标号for i in range(vocab_size - 1):# Find min1 第一个频次最小的节点if pos1 >= 0:if count[pos1] < count[pos2]:min1 = pos1pos1 -= 1  # 单词表中频次最小的单词挑选出来作为新构建树的左孩子节点，因此要将索引减一，再在单词表中查找只能查到次最小，以此类推else:min1 = pos2pos2 += 1else:min1 = pos2pos2 += 1# Find min2 第二个频次最小的节点if pos1 >= 0:if count[pos1] < count[pos2]:min2 = pos1pos1 -= 1else:min2 = pos2pos2 += 1else:min2 = pos2pos2 += 1count[vocab_size + i] = count[min1] + count[min2]  # 构造出的第一个树的父节点，更新频次parent[min1] = vocab_size + i  # 记录下单词索引为min1的父节点parent[min2] = vocab_size + i  # 记录下单词索引为min2的父节点binary[min2] = 1  # 设置单词索引为min2与索引为vocab_size+i的父节点的连边为1# 到这里为止Huffman树构造完成，不过并不像C++或C语言中使用指针构造，而是将这些节点以及他们的索引保存起来，# 记录为parent、binary、count分别记录单词索引下的父节点索引、与父节点连边的编码值、count用于比较最小值# Assign binary code and path pointers to each vocab word 构造好Huffman树后为单词设置编码############################         理       解          ############################     注意这里之所以用语料库中词语的频次构造，只是为了迁移Huffman树构造方法，更加方便#     我猜只要满足Huffman树结构的树，单词可以任意放入这个树结构中，并随机编码  我猜是这样子#     因为如果不用频次构造Huffman树，构造这种树结构比较混乱，不能形式化去用############################         理       解          ###########################root_idx = 2 * vocab_size - 2  # 根结点的索引for i, token in enumerate(self):path = []  # 记录从根节点到该单词token的路径code = []  # 根据path以及binary[]设置编码node_idx = iwhile node_idx < root_idx:if node_idx >= vocab_size: path.append(node_idx)code.append(binary[node_idx])  # 只是为了便于保存，更新单词中的编码的时候是倒序放入的node_idx = parent[node_idx]path.append(root_idx)# these are path and code from root to the leaftoken.path = [j - vocab_size for j in path[::-1]]  # [::-1]表示从最后一个开始便利当前list列表# j根据Huffman树结构j-vocab_size刚好可以表示从根结点到当前单词所经过几个中间节点，也就是# 代表了编码长度token.code = code[::-1]'''
unigram,bigram,trigram,是自然语言处理（NLP）中的问题。父词条：n-gram.
unigram: 单个word
bigram: 双word
trigram:3 word
比如：
西安交通大学：
unigram 形式为：西/安/交/通/大/学
bigram形式为： 西安/安交/交通/通大/大学
trigram形式为：西安交/安交通/交通大/通大学
'''
class UnigramTable:"""A list of indices of tokens in the vocab following a power law distribution,used to draw negative samples."""def __init__(self, vocab):vocab_size = len(vocab)power = 0.75norm = sum([math.pow(t.count, power) for t in vocab]) # Normalizing constanttable_size = int(1e8) # Length of the unigram tabletable = np.zeros(table_size, dtype=np.uint32)print ('Filling unigram table')p = 0 # Cumulative probabilityi = 0for j, unigram in enumerate(vocab):p += float(math.pow(unigram.count, power))/normwhile i < table_size and float(i) / table_size < p:table[i] = ji += 1self.table = tabledef sample(self, count):# 随机采样count条数据indices = np.random.randint(low=0, high=len(self.table), size=count)return [self.table[i] for i in indices]def sigmoid(z):if z > 6:return 1.0elif z < -6:return 0.0else:return 1 / (1 + math.exp(-z))def init_net(dim, vocab_size):# Init syn0 with random numbers from a uniform distribution on the interval [-0.5, 0.5]/dim# 产生随机均匀分布：np.random.uniform(a,b,size)-->产生size个大小在a到b之间的随机数# vocab_size:词表大小tmp = np.random.uniform(low=-0.5/dim, high=0.5/dim, size=(vocab_size, dim))# 从numpy数组创建并返回ctypes对象syn0 = np.ctypeslib.as_ctypes(tmp)syn0 = Array(syn0._type_, syn0, lock=False)# Init syn1 with zerostmp = np.zeros(shape=(vocab_size, dim))syn1 = np.ctypeslib.as_ctypes(tmp)syn1 = Array(syn1._type_, syn1, lock=False)return (syn0, syn1)def train_process(pid):# Set fi to point to the right chunk of training filestart = vocab.bytes / num_processes * pidend = vocab.bytes if pid == num_processes - 1 else vocab.bytes / num_processes * (pid + 1)fi.seek(start)#print 'Worker %d beginning training at %d, ending at %d' % (pid, start, end)alpha = starting_alphaword_count = 0last_word_count = 0while fi.tell() < end:line = fi.readline().strip()# Skip blank linesif not line:continue# Init sent, a list of indices of words in linesent = vocab.indices(['<bol>'] + line.split() + ['<eol>'])for sent_pos, token in enumerate(sent):if word_count % 10000 == 0:global_word_count.value += (word_count - last_word_count)last_word_count = word_count# Recalculate alphaalpha = starting_alpha * (1 - float(global_word_count.value) / vocab.word_count)if alpha < starting_alpha * 0.0001: alpha = starting_alpha * 0.0001# Print progress infosys.stdout.write("\rAlpha: %f Progress: %d of %d (%.2f%%)" %(alpha, global_word_count.value, vocab.word_count,float(global_word_count.value) / vocab.word_count * 100))sys.stdout.flush()# Randomize window size, where win is the max window size# 获取到窗内的单词current_win = np.random.randint(low=1, high=win+1)context_start = max(sent_pos - current_win, 0)context_end = min(sent_pos + current_win + 1, len(sent))context = sent[context_start:sent_pos] + sent[sent_pos+1:context_end] # Turn into an iterator?# CBOWif cbow:# Compute neu1# 隐藏层向量neu1neu1 = np.mean(np.array([syn0[c] for c in context]), axis=0)assert len(neu1) == dim, 'neu1 and dim do not agree'# Init neu1e with zerosneu1e = np.zeros(dim)# Compute neu1e and update syn1if neg > 0:# 正样本标记为1，负样本标记为0classifiers = [(token, 1)] + [(target, 0) for target in table.sample(neg)]else:classifiers = zip(vocab[token].path, vocab[token].code)for target, label in classifiers:# 隐藏层与样本词向量2的内积z = np.dot(neu1, syn1[target])# 得到的结果sigmoidp = sigmoid(z)# （p-label）为关于syn2中某词向量的偏导数，alpha为学习率g = alpha * (label - p)# 更新syn1syn1[target] += g * neu1  # Update syn1# 关于隐藏层的偏导数neu1e += g * syn1[target] # Error to backpropagate to syn0# Update syn0for context_word in context:syn0[context_word] += neu1e# Skip-gramelse:for context_word in context:# Init neu1e with zerosneu1e = np.zeros(dim)# Compute neu1e and update syn1if neg > 0:classifiers = [(token, 1)] + [(target, 0) for target in table.sample(neg)]else:classifiers = zip(vocab[token].path, vocab[token].code)for target, label in classifiers:z = np.dot(syn0[context_word], syn1[target])p = sigmoid(z)g = alpha * (label - p)neu1e += g * syn1[target]              # Error to backpropagate to syn0syn1[target] += g * syn0[context_word] # Update syn1# Update syn0syn0[context_word] += neu1eword_count += 1# Print progress infoglobal_word_count.value += (word_count - last_word_count)sys.stdout.write("\rAlpha: %f Progress: %d of %d (%.2f%%)" %(alpha, global_word_count.value, vocab.word_count,float(global_word_count.value)/vocab.word_count * 100))sys.stdout.flush()fi.close()def save(vocab, syn0, fo, binary):print ('Saving model to', fo)dim = len(syn0[0])xxx = binary# if binary:#     fo = open(fo, 'wb')#     fo.write("%d %d\n" % (len(syn0), dim))#     fo.write('\n')#     for token, vector in zip(vocab, syn0):#         fo.write('%s ' % token.word)#         for s in vector:#             fo.write(struct.pack('f', s))#         fo.write('\n')# else:fo = open(fo, 'w')fo.write('%d %d\n' % (len(syn0), dim))for token, vector in zip(vocab, syn0):word = token.wordvector_str = ' '.join([str(s) for s in vector])fo.write('%s %s\n' % (word, vector_str))fo.close()def __init_process(*args):global vocab, syn0, syn1, table, cbow, neg, dim, starting_alphaglobal win, num_processes, global_word_count, fivocab, syn0_tmp, syn1_tmp, table, cbow, neg, dim, starting_alpha, win, num_processes, global_word_count = args[:-1]fi = open(args[-1], 'r')with warnings.catch_warnings():warnings.simplefilter('ignore', RuntimeWarning)syn0 = np.ctypeslib.as_array(syn0_tmp)syn1 = np.ctypeslib.as_array(syn1_tmp)def train(fi, fo, cbow, neg, dim, alpha, win, min_count, num_processes, binary):# Read train file to init vocabvocab = Vocab(fi, min_count)# Init net# 词向量定义为dim维# vocab:词表大小# 初始化word2vec中的两个权重矩阵，也就是词向量矩阵syn0, syn1 = init_net(dim, len(vocab))global_word_count = Value('i', 0)table = Noneif neg > 0:print( 'Initializing unigram table')table = UnigramTable(vocab)else:print ('Initializing Huffman tree')vocab.encode_huffman()# Begin training using num_processes workerst0 = time.time()pool = Pool(processes=num_processes, initializer=__init_process,initargs=(vocab, syn0, syn1, table, cbow, neg, dim, alpha,win, num_processes, global_word_count, fi))pool.map(train_process, range(num_processes))t1 = time.time()print ('Completed training. Training took', (t1 - t0) / 60, 'minutes')# Save model to filesave(vocab, syn0, fo, binary)if __name__ == '__main__':# parser = argparse.ArgumentParser()# parser.add_argument('-train', help='Training file', dest='fi', required=True)# parser.add_argument('-model', help='Output model file', dest='fo', required=True)# parser.add_argument('-cbow', help='1 for CBOW, 0 for skip-gram', dest='cbow', default=1, type=int)# parser.add_argument('-negative', help='Number of negative examples (>0) for negative sampling, 0 for hierarchical softmax', dest='neg', default=5, type=int)# parser.add_argument('-dim', help='Dimensionality of word embeddings', dest='dim', default=100, type=int)# parser.add_argument('-alpha', help='Starting alpha', dest='alpha', default=0.025, type=float)# parser.add_argument('-window', help='Max window length', dest='win', default=5, type=int)# parser.add_argument('-min-count', help='Min count for words used to learn <unk>', dest='min_count', default=5, type=int)# parser.add_argument('-processes', help='Number of processes', dest='num_processes', default=1, type=int)# parser.add_argument('-binary', help='1 for output model in binary format, 0 otherwise', dest='binary', default=0, type=int)# #TO DO: parser.add_argument('-epoch', help='Number of training epochs', dest='epoch', default=1, type=int)# args = parser.parse_args()# train(args.fi, args.fo, bool(args.cbow), args.neg, args.dim, args.alpha, args.win,#       args.min_count, args.num_processes, bool(args.binary))train('questions-words.txt','result.txt', 0, 0, 300, 0.01, 1, 5, 1, 0)