# 语料与向量空间

Reading time ~4 minutes

 1 2 >>> import logging >>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 

# 1.将String映射到Vector

 1 2 3 4 5 6 7 8 9 10 11 >>> from gensim import corpora, models, similarities >>> >>> documents = ["Human machine interface for lab abc computer applications", >>> "A survey of user opinion of computer system response time", >>> "The EPS user interface management system", >>> "System and human system engineering testing of EPS", >>> "Relation of user perceived response time to error measurement", >>> "The generation of random binary unordered trees", >>> "The intersection graph of paths in trees", >>> "Graph minors IV Widths of trees and well quasi ordering", >>> "Graph minors A survey"] 

 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 >>> # remove common words and tokenize >>> stoplist = set('for a of the and to in'.split()) >>> texts = [[word for word in document.lower().split() if word not in stoplist] >>> for document in documents] >>> >>> # remove words that appear only once >>> all_tokens = sum(texts, []) >>> tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1) >>> texts = [[word for word in text if word not in tokens_once] >>> for text in texts] >>> >>> print(texts) [['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']] 

 1 "How many times does the word system appear in the document? Once." 

 1 2 3 4 >>> dictionary = corpora.Dictionary(texts) >>> dictionary.save('/tmp/deerwester.dict') # store the dictionary, for future reference >>> print(dictionary) Dictionary(12 unique tokens) 

 1 2 3 >>> print(dictionary.token2id) {'minors': 11, 'graph': 10, 'system': 5, 'trees': 9, 'eps': 8, 'computer': 0, 'survey': 4, 'user': 7, 'human': 1, 'time': 6, 'interface': 2, 'response': 3} 

 1 2 3 4 >>> new_doc = "Human computer interaction" >>> new_vec = dictionary.doc2bow(new_doc.lower().split()) >>> print(new_vec) # the word "interaction" does not appear in the dictionary and is ignored [(0, 1), (1, 1)] 

 1 2 3 4 5 6 7 8 9 10 11 12 >>> corpus = [dictionary.doc2bow(text) for text in texts] >>> corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus) # store to disk, for later use >>> print(corpus) [(0, 1), (1, 1), (2, 1)] [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)] [(2, 1), (5, 1), (7, 1), (8, 1)] [(1, 1), (5, 2), (8, 1)] [(3, 1), (6, 1), (7, 1)] [(9, 1)] [(9, 1), (10, 1)] [(9, 1), (10, 1), (11, 1)] [(4, 1), (10, 1), (11, 1)] 

# 2.语料流 — 一次一个文档

 1 2 3 4 5 >>> class MyCorpus(object): >>> def __iter__(self): >>> for line in open('mycorpus.txt'): >>> # assume there's one document per line, tokens separated by whitespace >>> yield dictionary.doc2bow(line.lower().split()) 

Corpus是一个对象。我们没有定义任何方法来打印它，因此它只打印内存对象的地址。不是非常有用。为了查看相应的矢量，可以迭代corpus对象来打印每个文档向量(一次一个) ：

 1 2 3 4 5 6 7 8 9 10 11 12 >>> corpus_memory_friendly = MyCorpus() # doesn't load the corpus into memory! >>> for vector in corpus_memory_friendly: # load one vector into memory at a time ... print(vector) [(0, 1), (1, 1), (2, 1)] [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)] [(2, 1), (5, 1), (7, 1), (8, 1)] [(1, 1), (5, 2), (8, 1)] [(3, 1), (6, 1), (7, 1)] [(9, 1)] [(9, 1), (10, 1)] [(9, 1), (10, 1), (11, 1)] [(4, 1), (10, 1), (11, 1)] 

 1 2 3 4 5 6 7 8 9 10 >>> # collect statistics about all tokens >>> dictionary = corpora.Dictionary(line.lower().split() for line in open('mycorpus.txt')) >>> # remove stop words and words that appear only once >>> stop_ids = [dictionary.token2id[stopword] for stopword in stoplist >>> if stopword in dictionary.token2id] >>> once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1] >>> dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once >>> dictionary.compactify() # remove gaps in id sequence after words that were removed >>> print(dictionary) Dictionary(12 unique tokens) 

# 3.语料格式

 1 2 3 4 5 >>> from gensim import corpora >>> # create a toy corpus of 2 documents, as a plain Python list >>> corpus = [[(1, 0.5)], []] # make one document empty, for the heck of it >>> >>> corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus) 

 1 2 3 >>> corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus) >>> corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus) >>> corpora.LowCorpus.serialize('/tmp/corpus.low', corpus) 

 1 >>> corpus = corpora.MmCorpus('/tmp/corpus.mm') 

 1 2 >>> print(corpus) MmCorpus(2 documents, 2 features, 1 non-zero entries) 

 1 2 3 >>> # one way of printing a corpus: load it entirely into memory >>> print(list(corpus)) # calling list() will convert any sequence to a plain Python list [[(1, 0.5)], []] 

 1 2 3 4 5 >>> # another way of doing it: print one document at a time, making use of the streaming interface >>> for doc in corpus: ... print(doc) [(1, 0.5)] [] 

 1 >>> corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus) 

# 4.兼容NumPy和SciPy

gensim也包含了有效的工具函数，来帮助转换numpy矩阵：

 1 2 >>> corpus = gensim.matutils.Dense2Corpus(numpy_matrix) >>> numpy_matrix = gensim.matutils.corpus2dense(corpus, num_terms=number_of_corpus_features) 

 1 2 >>> corpus = gensim.matutils.Sparse2Corpus(scipy_sparse_matrix) >>> scipy_csc_matrix = gensim.matutils.corpus2csc(corpus)