Last active
July 25, 2021 18:06
-
-
Save rominf/fd6545c659689d0b899b99af14636cbe to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pony import orm
import lmdb

# LMDB environment used as an on-disk key/value scratch store.
# map_size caps the memory map at 16 GiB; max_dbs=2 allows up to two named
# sub-databases inside this environment.
env = lmdb.open('../conceptnet-lite-data/conceptnet-lmdb.db', map_size=16*1024*1024*1024, max_dbs=2)

# Named sub-database 'start'; the cells below write and read it.
start_db = env.open_db(b'start')

# Pony ORM database object. It is bound to a concrete SQLite file further
# down, after the entities have been declared.
db = orm.Database()
class Label(db.Entity):
    """Pony ORM entity holding one piece of text and its language.

    Both attributes are required strings.
    """

    text = orm.Required(str)
    language = orm.Required(str)
from pathlib import Path

# Bind the Pony database to an SQLite file, creating the file if it does not
# exist. The path is resolved to an absolute one before being handed to Pony
# (presumably so it does not depend on how Pony interprets relative
# filenames -- TODO confirm).
db.bind(filename=str(Path('../conceptnet-lite-data/normalization-test.db').resolve()), provider='sqlite', create_db=True)

# Map the declared entities onto tables, creating any that are missing.
db.generate_mapping(create_tables=True)
import csv
from itertools import islice
from pathlib import Path
from typing import Generator, Optional, Tuple, Union
PathOrStr = Union[Path, str]


def edges_from_dump_by_parts_generator(
    path: PathOrStr,
    count: Optional[int] = None,
) -> Generator[Tuple[str, str, str, str], None, None]:
    """Yield columns 1-4 of each row of a tab-separated dump file.

    The caller in this file unpacks each item as
    ``(relation_name, start_uri, end_uri, edge_etc_json)``.

    Args:
        path: Location of the tab-separated file.
        count: Maximum number of rows to yield; ``None`` (the default) reads
            the whole file.

    Yields:
        A tuple with the fields at column indices 1 to 4 of the row. Column 0
        is skipped.

    Raises:
        ValueError: If ``count`` is negative. Raised on first iteration,
            because this is a generator.
    """
    if count is not None and count < 0:
        raise ValueError(f'count must be non-negative or None, got {count!r}')
    # Read as UTF-8 explicitly instead of relying on the platform's locale
    # encoding; the caller re-encodes the fields as UTF-8.
    with open(str(path), newline='', encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t')
        # islice stops after exactly `count` rows. The previous check ran
        # after the yield and therefore let one extra row through.
        for row in islice(reader, count):
            yield tuple(row[1:5])
%%time
# Fill the 'start' LMDB sub-database from the ConceptNet assertions dump.
# For every edge, path segments 2 and 3 of the start URI are taken as the
# language and the term (presumably URIs look like /c/<language>/<term>/... --
# TODO confirm). The term is the key and the language is the value, so a later
# edge with the same term but a different language overwrites the earlier one.
# Everything runs inside a single write transaction.
with env.begin(start_db, write=True) as txn:
    edges = edges_from_dump_by_parts_generator('../conceptnet-lite-data/conceptnet-assertions-5.7.0.csv')
    for i, (relation_name, start_uri, end_uri, edge_etc_json) in enumerate(edges, start=1):
        language_b, start_b = [x.encode('utf8') for x in start_uri.split('/', maxsplit=4)[2:4]]
        existing_start_language_b = txn.get(start_b)
        # Write only when the stored value is missing or different.
        if existing_start_language_b != language_b:
            txn.put(start_b, language_b)
        # Progress report every million edges.
        if i % 1000000 == 0:
            print(i)
%%time
# Copy every key/value pair of the 'start' LMDB sub-database into the SQLite
# database as Label rows. The work is split into batches of one million
# entries, each batch in its own Pony db_session.
with env.begin(start_db) as txn:
    cursor = txn.cursor()
    total_count = txn.stat(start_db)['entries']
    i = 0
    while i < total_count:
        # A fresh db_session per batch; per the Pony documentation, leaving
        # the block commits the objects created inside it.
        with orm.db_session:
            # Per the py-lmdb documentation, iterating the cursor resumes
            # from its current position once it has been positioned.
            for key, value in cursor:
                i += 1
                text = key.decode('utf8')
                language = value.decode('utf8')
                Label(text=text, language=language)
                if i % 1000000 == 0:
                    print(i)
                    # Step past the entry just stored before leaving the
                    # loop, so the next batch does not start on it again.
                    cursor.next()
                    break
# Sanity check: show the number of entries in the 'start' LMDB sub-database
# and the number of Label rows so the two can be compared.
with env.begin(start_db) as txn:
    print(txn.stat(start_db)['entries'])
# Bare expression: in a notebook this displays the row count as cell output.
Label.select().count()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment