-
Word2Vec尝试
作者: Azure 发布: 2018-6-24 分类: 开发之路 阅读: 次 查看评论
-
ExportZhWiki.py
"""Export Chinese Wikipedia articles as jieba-segmented plain text.

Streams the zhwiki XML dump, strips wiki markup and every printable ASCII
character, converts the text with OpenCC ('t2s'), segments it with jieba and
writes the space-separated tokens to ``jieba-zhwiki-articles.txt``.
"""
import codecs
import re

import bz2file
import jieba
from gensim.corpora.wikicorpus import extract_pages, filter_wiki
from opencc import OpenCC
from tqdm import tqdm

DUMP_PATH = 'zhwiki-20180620-pages-articles-multistream.xml.bz2'
OUTPUT_PATH = 'jieba-zhwiki-articles.txt'

CC = OpenCC('t2s')


def wiki_replace(d):
    """Return the cleaned plain text of one wiki page.

    Args:
        d: A page tuple as yielded by ``extract_pages``; ``d[1]`` is the raw
            wiki markup of the page.

    Returns:
        The page text with markup and all printable ASCII characters removed,
        a line break inserted after every full-width comma and full stop,
        converted with ``CC`` and stripped of surrounding whitespace.
    """
    s = d[1]
    # Drop wiki tables ({| ... |}) and <gallery> blocks entirely.
    s = re.sub(r':*{\|[\s\S]*?\|}', '', s)
    s = re.sub(r'<gallery>[\s\S]*?</gallery>', '', s)
    # Rewrite {{a|b}} as [[a|b]] before filtering (presumably so that
    # filter_wiki treats it as a link instead of discarding the template).
    s = re.sub(r'(.){{([^{}\n]*?\|[^{}\n]*?)}}', r'\1[[\2]]', s)
    s = filter_wiki(s)
    # Remove empty list bullets and runs of two or more apostrophes.
    s = re.sub(r"\* *\n|'{2,}", '', s)
    s = re.sub(r'\n+', '\n', s)
    s = re.sub(r'\n[:;]|\n +', '\n', s)
    s = re.sub(r'\n==', '\n\n==', s)
    # Remove every printable ASCII character (space through '~').
    s = re.sub(r'[ -~]', '', s)
    # Break after each clause. The comma must be the full-width one: ASCII
    # commas were already removed by the previous step, so matching ','
    # here could never succeed.
    s = re.sub(',', ',\n', s)
    s = re.sub('。', '。\n', s)
    # Optional punctuation stripping, left disabled by the author:
    #punctuation = """!?。"#$%&'()〈〉*+-/《:∶²;<·=>@[\]^_`{|〡}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘'‛“”„‟…‧﹏"""
    #re_punctuation = "[{}]+".format(punctuation)
    #s = re.sub(re_punctuation, "", s)
    # Collapse runs of blank lines of any length in a single pass.
    s = re.sub(r'\n+', '\n', s)
    s = re.sub(r'\t', '', s)
    s = re.sub(' ', '', s)
    return CC.convert(s).strip()


def main():
    """Read the dump at DUMP_PATH and write segmented articles to OUTPUT_PATH."""
    # Both handles are closed even if processing fails part-way through.
    with bz2file.open(DUMP_PATH) as dump, \
            codecs.open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        w = tqdm(extract_pages(dump), desc=u'已获取0篇文章')
        for i, d in enumerate(w, start=1):
            # Keep only pages that have a title and text, whose title has no
            # "Prefix:" and whose text does not start with '#'.
            keep = (
                d[0]
                and d[1]
                and not re.match(r'[a-zA-Z]+:', d[0])
                and not d[1].startswith('#')
            )
            if keep:
                s = wiki_replace(d)
                if s != '':
                    tokens = jieba.cut(s, cut_all=False)
                    # Terminate every article with a newline; otherwise its
                    # last token is glued to the first token of the next one.
                    f.write(' '.join(tokens) + '\n')
                if i % 100 == 0:
                    w.set_description(u'已获取%s篇文章' % i)
            elif i % 100 == 0:
                w.set_description(u'当前文章%s被跳过' % i)


if __name__ == '__main__':
    main()
BuildWord2Vec.py
"""Train a Word2Vec model on the jieba-segmented corpus and save it."""
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import gensim
import multiprocessing
import logging
import os.path
import sys


def _size_and_epoch_kwargs():
    """Return the vector-size/epoch keyword arguments for the installed gensim.

    gensim 4 renamed ``size`` to ``vector_size`` and ``iter`` to ``epochs``;
    passing the old names there raises ``TypeError``.
    """
    major = int(gensim.__version__.split('.')[0])
    if major >= 4:
        return {'epochs': 10, 'vector_size': 100}
    return {'iter': 10, 'size': 100}


if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))
    print('Start...')
    rawdata = 'jieba-zhwiki-articles.txt'
    modelpath = 'word2vec.model'
    # For the meaning of each parameter see the gensim Word2Vec documentation.
    model = Word2Vec(
        LineSentence(rawdata),
        window=10,
        min_count=5,
        workers=multiprocessing.cpu_count(),
        **_size_and_epoch_kwargs()
    )
    model.save(modelpath)
    print("Finished!")
TestWord2Vec.py
"""Interactively query a trained Word2Vec model for the most similar words."""
from gensim.models import Word2Vec


def main():
    """Prompt for words in a loop and print the ten nearest neighbours of each.

    The loop ends when input is closed (EOF) or interrupted with Ctrl-C.
    """
    model = Word2Vec.load('word2vec.model')
    while True:
        try:
            word = input('输入一个词查询相似度:')
        except (EOFError, KeyboardInterrupt):
            # Leave the prompt cleanly instead of ending with a traceback.
            print()
            break
        try:
            semi = model.wv.most_similar(word, topn=10)
        except KeyError:
            print('The word not in vocabulary!')
            # Skip printing; otherwise the previous query's results would be
            # shown again for the unknown word.
            continue
        # print(model[u'日本'])  # print the word vector
        for term in semi:
            print('%s,%s' % (term[0], term[1]))


if __name__ == '__main__':
    main()
中文wiki的语料库的下载地址为:https://dumps.wikimedia.org/zhwiki/20180620/zhwiki-20180620-pages-articles-multistream.xml.bz2
原创文章,转载请注明出处!标签: Word2Vec
- 控制面板
-
- 网站分类
-
- 搜索
-
- 最新留言
-
- 友情链接
-