Build an infini-gram index (and store other relevant info) for use in OLMoTrace.
import os
import json
import tempfile
from collections import Counter

from transformers import AutoTokenizer

# load the Llama-2 tokenizer (no BOS/EOS special tokens) and the raw training data
enc = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-hf", add_bos_token=False, add_eos_token=False
)
data_rows = [{'text': 'here is some training data'}, ...]  # replace with real records

# compute unigram probabilities over the tokenized corpus
all_toks = []
for x in data_rows:
    all_toks.extend(enc.encode(x['text']))
total_toks = len(all_toks)
tok_count = Counter(all_toks)
unigram_probs = {tid: cnt / total_toks for tid, cnt in tok_count.items()}

# save unigram probabilities as JSON (token ids are serialized as string keys)
with open('<save path>', 'w') as json_file:
    json.dump(unigram_probs, json_file, indent=4)
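
# sketch (an assumption, not part of the original gist): OLMoTrace-style span
# scoring is one natural use of these unigram probabilities -- the product of
# per-token probabilities for a matched span, where a lower product marks a
# rarer, more distinctive span worth surfacing.
def span_unigram_prob(token_ids):
    prob = 1.0
    for tid in token_ids:
        prob *= unigram_probs.get(tid, 0.0)
    return prob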
# build the infini-gram index; data_dir should hold the corpus as JSONL
# file(s) with one {"text": ...} record per line
data_dir = '<path to data>'
save_dir = '<save index here>'
temp_dir = tempfile.TemporaryDirectory()
command = (
    f"python -m infini_gram.indexing --data_dir {data_dir} "
    f"--temp_dir {temp_dir.name} --save_dir {save_dir} "
    f"--tokenizer llama --cpus 12 --mem 64 --shards 1 "
    f"--add_metadata --ulimit 100000"
)
print(command)
os.system(command)
temp_dir.cleanup()
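
# optional sanity check (a sketch, assuming the infini_gram package's Python
# engine is available; eos_token_id=2 is Llama-2's EOS id): count the exact
# occurrences of a token span in the freshly built index -- the lookup
# primitive that OLMoTrace builds on.
from infini_gram.engine import InfiniGramEngine

engine = InfiniGramEngine(index_dir=save_dir, eos_token_id=2)
span_ids = enc.encode("here is some training data")
result = engine.count(input_ids=span_ids)
print(result)  # e.g. {'count': 1, 'approx': False}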