Build an infini-gram index (and store other relevant info) for use in OLMoTrace.
import os
import json
import tempfile
from collections import Counter

from transformers import AutoTokenizer

# load the Llama-2 tokenizer (no BOS/EOS special tokens) and the raw training data
enc = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-hf", add_bos_token=False, add_eos_token=False
)
data_rows = [{'text': 'here is some training data'}, ...]  # replace with real records

# compute unigram probabilities over the tokenized corpus
all_toks = []
for x in data_rows:
    all_toks.extend(enc.encode(x['text']))
total_toks = len(all_toks)
tok_count = Counter(all_toks)
unigram_probs = {tid: cnt / total_toks for tid, cnt in tok_count.items()}

# save unigram probabilities as JSON (token ids are serialized as string keys)
with open('<save path>', 'w') as json_file:
    json.dump(unigram_probs, json_file, indent=4)
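
# sketch (an assumption, not part of the original gist): OLMoTrace-style span
# scoring is one natural use of these unigram probabilities -- the product of
# per-token probabilities for a matched span, where a lower product marks a
# rarer, more distinctive span worth surfacing.
def span_unigram_prob(token_ids):
    prob = 1.0
    for tid in token_ids:
        prob *= unigram_probs.get(tid, 0.0)
    return prob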
# build the infini-gram index; data_dir should hold the corpus as JSONL
# file(s) with one {"text": ...} record per line
data_dir = '<path to data>'
save_dir = '<save index here>'
temp_dir = tempfile.TemporaryDirectory()
command = (
    f"python -m infini_gram.indexing --data_dir {data_dir} "
    f"--temp_dir {temp_dir.name} --save_dir {save_dir} "
    f"--tokenizer llama --cpus 12 --mem 64 --shards 1 "
    f"--add_metadata --ulimit 100000"
)
print(command)
os.system(command)
temp_dir.cleanup()
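
# optional sanity check (a sketch, assuming the infini_gram package's Python
# engine is available; eos_token_id=2 is Llama-2's EOS id): count the exact
# occurrences of a token span in the freshly built index -- the lookup
# primitive that OLMoTrace builds on.
from infini_gram.engine import InfiniGramEngine

engine = InfiniGramEngine(index_dir=save_dir, eos_token_id=2)
span_ids = enc.encode("here is some training data")
result = engine.count(input_ids=span_ids)
print(result)  # e.g. {'count': 1, 'approx': False}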