Created
March 20, 2018 20:23
-
-
Save persiyanov/5aed5165d7945c176a0f557a473ef848 to your computer and use it in GitHub Desktop.
python_memmap.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <[email protected]>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""Corpus stored as a sparse term-document matrix in flat binary files.

The binary data is read and written with numpy (see :class:`numpy.memmap`), not with python's struct library.
"""
import logging

import numpy as np
from scipy.sparse import csc_matrix

# Module-level logger shared by everything defined in this file.
logger = logging.getLogger(__name__)
class MemmapReaderArray(object):
    """Bag-of-words corpus stored as a CSC term-document matrix in flat binary files.

    A corpus saved under the path ``fname`` consists of four files:

    * ``fname`` : three int32 values ``(num_docs, num_terms, num_nnz)``
    * ``fname + '.data'`` : float32 values of the non-zero entries
    * ``fname + '.indices'`` : int32 term ids of the non-zero entries
    * ``fname + '.indptr'`` : int32 offsets; entries of the i-th document live in
      ``indptr[i]:indptr[i + 1]`` of the two arrays above

    Instances support iteration over the documents and ``len()``.

    Attributes
    ----------
    num_docs : int
        number of documents stored in the corpus
    num_terms : int
        number of terms (rows of the term-document matrix)
    num_nnz : int
        number of non-zero entries

    Notes
    ----------
    The arrays are accessed through :class:`numpy.memmap` while iterating, so documents are sliced
    out of the mapped files one at a time instead of loading the whole matrix up front.

    """

    def __init__(self, input, transposed=True):
        """

        Parameters
        ----------
        input : str
            Path of the header file written by :meth:`save_corpus`. The companion files are located by
            appending ``'.data'``, ``'.indices'`` and ``'.indptr'`` to this string, so a file-like
            object is not usable here.
        transposed : bool, optional
            Stored on the instance as ``self.transposed``; none of the methods of this class consult it.

        """
        logger.info("initializing corpus reader from %s", input)

        self.input, self.transposed = input, transposed
        self.num_docs, self.num_terms, self.num_nnz = self.read_headers()

        logger.info(
            "accepted corpus with %i documents, %i features, %i non-zero entries",
            self.num_docs, self.num_terms, self.num_nnz
        )

    def __len__(self):
        """Get size of corpus (number of documents)."""
        return self.num_docs

    def __str__(self):
        return ("MmCorpus(%i documents, %i features, %i non-zero entries)" %
                (self.num_docs, self.num_terms, self.num_nnz))

    def read_headers(self):
        """Read the three int32 metadata values from the header file ``self.input``.

        Returns
        ----------
        num_docs : int
        num_terms : int
        num_nnz : int

        """
        header = np.memmap(self.input, dtype='int32', mode='r', shape=(3,))
        # Copy into plain Python ints so the returned values do not keep referring to the mapped file.
        num_docs, num_terms, num_nnz = (int(value) for value in header)
        return num_docs, num_terms, num_nnz

    @staticmethod
    def construct_csc(corpus):
        """Build a float32 term-document CSC matrix from a corpus.

        Parameters
        ----------
        corpus : iterable of iterable of (int, number)
            Documents as sequences of ``(term_id, value)`` pairs.

        Returns
        -------
        :class:`scipy.sparse.csc_matrix`
            Matrix of shape ``(num_terms, num_docs)``; ``indptr[i]:indptr[i + 1]`` is the slice of
            the i-th document. ``num_terms`` is the largest term id plus one (0 if there are no entries).

        Notes
        -----
        The shape is passed explicitly. Letting scipy infer it from the indices would drop documents
        that are empty at the end of the corpus (and fail outright when there are no entries at all).
        As documented for scipy's COO input format, repeated ``(term_id, doc_id)`` pairs are summed.

        """
        values, term_ids, doc_ids = [], [], []
        num_docs = 0
        for doc_id, doc in enumerate(corpus):
            num_docs = doc_id + 1  # count every document, including empty ones
            for term_id, value in doc:
                values.append(value)
                term_ids.append(term_id)
                doc_ids.append(doc_id)

        num_terms = max(term_ids) + 1 if term_ids else 0
        return csc_matrix((values, (term_ids, doc_ids)), shape=(num_terms, num_docs), dtype='float32')

    @staticmethod
    def _write_array(path, values, dtype):
        """Write `values` to `path` as a flat binary array of the given dtype."""
        # ndarray.tofile emits the raw array bytes that np.memmap reads back, and it also handles
        # zero-length arrays, which cannot always be created through np.memmap.
        np.asarray(values, dtype=dtype).tofile(path)

    @staticmethod
    def save_corpus(fname, corpus):
        """Store a corpus on disk in the layout read by this class.

        Parameters
        ----------
        fname : str
            Path of the header file; the arrays go to ``fname + '.data'``, ``fname + '.indices'``
            and ``fname + '.indptr'``.
        corpus : iterable of iterable of (int, number)
            Documents as sequences of ``(term_id, value)`` pairs.

        """
        logger.info("storing corpus in memmap Matrix Market format to %s", fname)

        csc = MemmapReaderArray.construct_csc(corpus)
        num_terms, num_docs = csc.shape
        num_nnz = csc.nnz

        logger.info(
            "storing corpus with %i documents, %i features, %i non-zero entries",
            num_docs, num_terms, num_nnz
        )

        # write out header info
        MemmapReaderArray._write_array(fname, [num_docs, num_terms, num_nnz], 'int32')

        # write out the three arrays of the CSC representation
        MemmapReaderArray._write_array(fname + '.data', csc.data, 'float32')
        MemmapReaderArray._write_array(fname + '.indices', csc.indices, 'int32')
        MemmapReaderArray._write_array(fname + '.indptr', csc.indptr, 'int32')

    def __iter__(self):
        """Iterate through corpus.

        Notes
        ------
        The number of documents yielded always equals ``self.num_docs``; a document without
        entries is yielded as an empty list.

        Yields
        ------
        list of (int, float)
            Document in BoW format, i.e. ``(term_id, value)`` pairs.

        Raises
        ------
        ValueError
            If the length of the ``.indptr`` file does not match ``self.num_docs``.

        """
        indptr_fp = np.memmap(self.input + '.indptr', dtype='int32', mode='r')
        if self.num_docs != indptr_fp.shape[0] - 1:
            raise ValueError(
                "header declares %i documents but %s holds offsets for %i"
                % (self.num_docs, self.input + '.indptr', indptr_fp.shape[0] - 1)
            )

        if self.num_nnz:
            data_fp = np.memmap(self.input + '.data', dtype='float32', mode='r')
            indices_fp = np.memmap(self.input + '.indices', dtype='int32', mode='r')
        else:
            # Nothing is stored, so the (empty) files are not mapped at all.
            data_fp = np.empty(0, dtype='float32')
            indices_fp = np.empty(0, dtype='int32')

        for doc_id in range(self.num_docs):
            start, end = int(indptr_fp[doc_id]), int(indptr_fp[doc_id + 1])
            yield list(zip(indices_fp[start:end].tolist(), data_fp[start:end].tolist()))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment