Word2Vec尝试
ExportZhWiki.py
"""Export Chinese Wikipedia articles as jieba-segmented plain text.

Reads a zhwiki XML dump, strips wiki markup and all printable ASCII,
converts Traditional Chinese to Simplified Chinese, segments every article
with jieba and writes the space-separated tokens to a UTF-8 text file, one
article per write. The output is intended as a Word2Vec training corpus.
"""
import codecs
import re

import bz2file
import jieba
from gensim.corpora.wikicorpus import extract_pages, filter_wiki
from opencc import OpenCC
from tqdm import tqdm

DUMP_PATH = 'zhwiki-20180620-pages-articles-multistream.xml.bz2'
OUTPUT_PATH = 'jieba-zhwiki-articles.txt'

# Traditional -> Simplified Chinese converter.
CC = OpenCC('t2s')

# Patterns are compiled once; raw strings avoid invalid-escape warnings.
_TABLE_RE = re.compile(r':*{\|[\s\S]*?\|}')            # {| ... |} wiki tables
_GALLERY_RE = re.compile(r'<gallery>[\s\S]*?</gallery>')
_TEMPLATE_RE = re.compile(r'(.){{([^{}\n]*?\|[^{}\n]*?)}}')
_BULLET_OR_QUOTES_RE = re.compile(r"\* *\n|'{2,}")     # empty bullets, ''/''' markup
_NEWLINES_RE = re.compile(r'\n+')
_LINE_PREFIX_RE = re.compile(r'\n[:;]|\n +')           # leading ':' / ';' / spaces
_PRINTABLE_ASCII_RE = re.compile(r'[ -~]')
_NAMESPACE_RE = re.compile(r'^[a-zA-Z]+:')             # e.g. "Category:", "Help:"


def wiki_replace(d):
    """Return the cleaned, Simplified-Chinese text of one dump page.

    Args:
        d: A page tuple as yielded by ``extract_pages``; only ``d[1]`` (the
            raw wiki markup) is read.

    Returns:
        The page text with tables, galleries, wiki markup, printable ASCII
        characters and tabs removed, a line break after every ``。``, runs of
        newlines collapsed, converted with OpenCC ``t2s`` and stripped.
    """
    s = d[1]
    s = _TABLE_RE.sub('', s)
    s = _GALLERY_RE.sub('', s)
    # Rewrite single-line piped templates "{{a|b}}" as "[[a|b]]" links before
    # filter_wiki runs; presumably so their text survives markup removal.
    s = _TEMPLATE_RE.sub(r'\1[[\2]]', s)
    s = filter_wiki(s)
    s = _BULLET_OR_QUOTES_RE.sub('', s)
    s = _NEWLINES_RE.sub('\n', s)
    s = _LINE_PREFIX_RE.sub('\n', s)
    s = s.replace('\n==', '\n\n==')
    # Drop every printable ASCII character (letters, digits, punctuation,
    # spaces), leaving only non-ASCII text, newlines and tabs.
    s = _PRINTABLE_ASCII_RE.sub('', s)
    # NOTE(review): the ASCII comma was already removed by the line above, so
    # this replacement never matches as written. It was presumably a
    # full-width comma in the original script - TODO confirm before changing.
    s = s.replace(',', ',\n')
    s = s.replace('。', '。\n')
    # Punctuation stripping was disabled (commented out) in the original
    # script and is intentionally not performed here.
    # One pass collapses newline runs of any length (the original repeated a
    # "\n\n" -> "\n" substitution nine times).
    s = _NEWLINES_RE.sub('\n', s)
    s = s.replace('\t', '')
    # NOTE(review): ASCII spaces are already gone at this point; presumably
    # this was a full-width or non-breaking space originally - TODO confirm.
    s = s.replace(' ', '')
    return CC.convert(s).strip()


def main():
    """Stream the dump, clean and segment each article, write the corpus."""
    i = 0  # counts every page seen, including skipped ones
    # Context managers guarantee both files are closed even if a page fails.
    with bz2file.open(DUMP_PATH) as dump, \
            codecs.open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        w = tqdm(extract_pages(dump), desc=u'已获取0篇文章')
        for d in w:
            i += 1
            title, text = d[0], d[1]
            # Keep only pages with a title and text, whose title has no
            # "Namespace:" prefix and whose text does not start with '#'
            # (presumably redirect pages).
            is_article = bool(
                title
                and text
                and not _NAMESPACE_RE.match(title)
                and not text.startswith(u'#')
            )
            if is_article:
                s = wiki_replace(d)
                if s:
                    f.write(' '.join(jieba.cut(s, cut_all=False)))
                    # Terminate the article; without this the last token of
                    # one article is fused with the first token of the next.
                    f.write('\n')
                if i % 100 == 0:
                    w.set_description(u'已获取%s篇文章' % i)
            elif i % 100 == 0:
                w.set_description(u'当前文章%s被跳过' % i)


if __name__ == '__main__':
    main()
发布: 2018/6/24 分类: 开发之路 阅读: 次 评论: 0次