Build an infini-gram index (and store other relevant info) for use in OLMoTrace.
import os
import json
from collections import Counter
import tempfile
from transformers import AutoTokenizer
# load tokenizer / data
enc = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", add_bos_token=False, add_eos_token=False)
data_rows = [{'text': 'here is some training data'}, ...]  # placeholder; replace with your corpus
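# The indexing step below reads training documents from files on disk rather
# than from `data_rows` in memory. A minimal sketch of materializing the data
# for the indexer, assuming (check the infini-gram docs) that it expects
# .jsonl files with one {"text": ...} object per line inside --data_dir:
with open(os.path.join("<path to data>", "data.jsonl"), 'w') as f:
    for row in data_rows:
        f.write(json.dumps(row) + "\n")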
# compute / save unigram probabilities over the training tokens
all_toks = []
for x in data_rows:
    all_toks.extend(enc.encode(x['text']))
total_toks = len(all_toks)
tok_count = Counter(all_toks)

# map each token id to its empirical unigram probability
unigram_probs = {tid: cnt / total_toks for tid, cnt in tok_count.items()}
with open("<save path>", 'w') as json_file:  # fill in: where to write unigram probs
    json.dump(unigram_probs, json_file, indent=4)
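# json serializes the integer token ids above as strings, so convert the keys
# back to int when reloading the file:
with open("<save path>") as json_file:  # same placeholder path as above
    unigram_probs = {int(k): float(v) for k, v in json.load(json_file).items()}

# OLMoTrace ranks matched spans by their span unigram probability -- the
# product of the unigram probabilities of the span's tokens -- keeping the
# lowest-probability (most distinctive) spans. A hypothetical helper (not part
# of any library) sketching that score:
def span_unigram_prob(token_ids, probs):
    prob = 1.0
    for tid in token_ids:
        prob *= probs.get(tid, 1.0 / total_toks)  # fallback for unseen ids is an assumption
    return prob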
# build infini-gram index by shelling out to the infini_gram indexing CLI
data_dir = "<path to data>"     # fill in: directory holding the .jsonl training files
save_dir = "<save index here>"  # fill in: where the finished index is written
temp_dir = tempfile.TemporaryDirectory()
command = (
    f"python -m infini_gram.indexing --data_dir {data_dir} "
    f"--temp_dir {temp_dir.name} --save_dir {save_dir} "
    f"--tokenizer llama --cpus 12 --mem 64 --shards 1 "
    f"--add_metadata --ulimit 100000 "
)
print(command)
os.system(command)
temp_dir.cleanup()
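# Once built, the index can be queried in-process. A minimal sketch, assuming
# the infini_gram package's engine API (InfiniGramEngine and its count()
# method, per the infini-gram docs); verify names/signatures before relying
# on this:
from infini_gram.engine import InfiniGramEngine

engine = InfiniGramEngine(index_dir="<save index here>", eos_token_id=enc.eos_token_id)
query_ids = enc.encode("here is some training data")
print(engine.count(input_ids=query_ids))  # e.g. {'count': ..., 'approx': ...}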