Created
June 4, 2018 15:49
-
-
Save moretea/4b8620e6ce49849788956db0660c8b68 to your computer and use it in GitHub Desktop.
Memory mapped file example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from annoy import AnnoyIndex | |
import struct | |
# Number of components per word vector; every dataset line must carry
# exactly this many values.
VECTOR_LENGTH = 300

# "angular" has always been Annoy's default metric. It is spelled out because
# newer Annoy releases emit a warning when the metric is omitted.
annoy_index = AnnoyIndex(VECTOR_LENGTH, metric="angular")

nr_lines = 0
words = []  # (word, vector) tuples in file order

print("Reading dataset")
# Decode explicitly as UTF-8 (the encoding the GloVe text files are
# distributed in) instead of relying on the platform default encoding.
with open("glove.6B.300d.txt", encoding="utf-8") as dataset:
    for line in dataset:
        # Line format: "<word> <v1> <v2> ... <vN>", single-space separated.
        parts = line.rstrip("\n").split(" ")
        word = parts[0]
        vector = [float(x) for x in parts[1:]]
        # Raise instead of assert: asserts are stripped under `python -O`,
        # which would let malformed rows slip into the index.
        if len(vector) != VECTOR_LENGTH:
            raise ValueError(
                "line %d: expected %d vector components, got %d"
                % (nr_lines + 1, VECTOR_LENGTH, len(vector))
            )
        words.append((word, vector))
        nr_lines += 1
        # Uncomment to experiment with only the first 5000 entries:
        # if nr_lines >= 5000:
        #     break
# Order the entries by word so that the word lookup table written later can
# be searched with a binary search.
print("Sorting dataset")
words.sort(key=lambda entry: entry[0])

# Feed every vector into the k-NN structure. An item's id is its position in
# the sorted list, so it doubles as the index into the word lookup table.
print("Build k-NN datastructure")
for item_id, entry in enumerate(words):
    annoy_index.add_item(item_id, entry[1])

# The argument to build() still needs testing/tweaking.
annoy_index.build(20)
annoy_index.save("vectors.ann")
# Write the lookup file that maps an index to its word.
#
# Layout (every integer is a little-endian unsigned 32-bit value):
#   [number of words]
#   [file offset of word 0] ... [file offset of word N-1]
#   [word 0 \0] [word 1 \0] ... [word N-1 \0]
#
# The fixed-width offset table makes index lookups cheap, and the
# null-terminated strings can be read directly as C strings. The strings are
# packed back to back; no alignment padding is written.
print("Build lookup table")
encoded_words = [entry[0].encode("UTF-8") for entry in words]

with open("vectors.idx", "wb") as vi:
    # Header: how many entries the offset table holds.
    vi.write(struct.pack("<I", len(words)))

    # The string area starts right after the header word and the offset table.
    word_idx = (1 + len(words)) * 4

    # Offset table: one entry per word, pointing at its first byte.
    for encoded in encoded_words:
        vi.write(struct.pack("<I", word_idx))
        word_idx += len(encoded) + 1  # word bytes plus the terminating null

    # String area: each word followed by its null terminator.
    for encoded in encoded_words:
        vi.write(encoded + b"\x00")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from annoy import AnnoyIndex | |
import struct | |
import mmap | |
import ctypes | |
# Dimensionality passed to AnnoyIndex when the saved index is loaded.
VECTOR_LENGTH = 300

# Start-up marker ("INGELADEN" is Dutch for "loaded").
print("INGELADEN")
class WordIndex:
    """Read-only, memory-mapped table of words that supports binary search.

    Expected file layout (every integer is a little-endian unsigned 32-bit
    value):

    * the number of words,
    * one offset per word, measured from the start of the file,
    * the words themselves, stored as null-terminated UTF-8 strings.

    The words must be stored in ascending order of their UTF-8 bytes,
    otherwise ``find_index`` cannot locate them.

    The instance can be used as a context manager; ``close()`` releases the
    mapping and the underlying file.
    """

    def __init__(self, index_file):
        """Open and memory-map *index_file* and read the word count."""
        self.file = open(index_file, "rb")
        try:
            self.mm = mmap.mmap(self.file.fileno(), 0, access=mmap.ACCESS_READ)
        except Exception:
            # Do not leak the file handle when the mapping cannot be created
            # (e.g. the file is empty).
            self.file.close()
            raise
        self.nr_words = self._get_long(0)

    def close(self):
        """Release the memory mapping and close the underlying file."""
        self.mm.close()
        self.file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def find_index(self, word):
        """Return the index of *word* (a ``str``), or ``None`` if it is absent.

        Performs a binary search over the stored words, comparing their
        UTF-8 encoded bytes.
        """
        target = word.encode("utf-8")
        low = 0
        # The last valid index is nr_words - 1; starting at nr_words would
        # let the search probe one slot past the end of the offset table.
        high = self.nr_words - 1
        while low <= high:
            midpoint = (low + high) // 2
            midword = self.get_word(midpoint)
            if midword == target:
                return midpoint
            if target < midword:
                high = midpoint - 1
            else:
                low = midpoint + 1
        return None

    def get_word(self, idx):
        """Return the word stored at position *idx* as ``bytes``.

        Raises:
            IndexError: if *idx* is outside ``0 <= idx < nr_words``.
        """
        if not 0 <= idx < self.nr_words:
            raise IndexError("word index out of range: %r" % (idx,))
        # Slot 0 of the table holds the word count, so entry idx lives in
        # slot idx + 1.
        start = self._get_long((1 + idx) * 4)
        # Read up to the null terminator straight from the mapping. This
        # replaces the former ctypes pointer trick, which depended on
        # PyObject_AsReadBuffer (removed from the C API in Python 3.10).
        end = self.mm.find(b"\x00", start)
        if end == -1:
            # No terminator found: treat the rest of the file as the word.
            end = len(self.mm)
        return self.mm[start:end]

    def _get_long(self, addr):
        """Read the little-endian unsigned 32-bit integer at offset *addr*."""
        return struct.unpack_from("<I", self.mm, addr)[0]
# Open the word lookup table and the k-NN index.
word_index = WordIndex("vectors.idx")

# "angular" has always been Annoy's default metric. It is spelled out because
# newer Annoy releases emit a warning when the metric is omitted.
annoy_index = AnnoyIndex(VECTOR_LENGTH, metric="angular")
annoy_index.load("vectors.ann")

# Resolve a few sample words to their item indices.
apple_nr = word_index.find_index("apple")
company_nr = word_index.find_index("company")
fruit_nr = word_index.find_index("fruit")
def print_close(what_nr):
    """Print the words of the (up to) ten items nearest to item *what_nr*.

    Args:
        what_nr: item index as returned by ``word_index.find_index``. A
            value of ``None`` (word not found) is reported instead of being
            handed to Annoy, which would raise on it.
    """
    if what_nr is None:
        print("close to: word not found in index")
        return
    indices = annoy_index.get_nns_by_item(what_nr, 10)
    close_words = [word_index.get_word(x) for x in indices]
    print("close to", close_words)
# Show the nearest neighbours of each sample word.
for sample_nr in (apple_nr, fruit_nr, company_nr):
    print_close(sample_nr)

# Debug aid: dump every word stored in the lookup table.
# for i in range(0, word_index.nr_words):
#     print(word_index.get_word(i).decode("utf-8"))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
annoy>=1.12.0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment