macleginn · June 2, 2026 11:20
diff --git a/llama_for_political_classification.py b/llama_for_political_classification.py
 import pandas as pd
 from vllm import LLM, SamplingParams
 from transformers import AutoTokenizer
 from tqdm import tqdm

 def llama3_call(user_prompt, temperature=0.0, num_logprobs=1):
     """Classify one document with the module-level vLLM model.

     The document is wrapped in ``### DOCUMENT:`` / ``### ANSWER`` markers,
     paired with the module-level ``system_prompt`` and rendered through the
     tokenizer's chat template before generation (at most 10 new tokens).

     Relies on the module-level names ``system_prompt``, ``tokenizer`` and
     ``llm`` being defined before the first call.

     Args:
         user_prompt: Raw text of the document to classify.
         temperature: Sampling temperature passed to vLLM.
         num_logprobs: Number of log probabilities requested per generated
             token. Pass ``None`` to disable them, in which case the second
             return value is ``None``.

     Returns:
         A ``(text, logprobs)`` tuple taken from the first completion of the
         first request output.
     """
     user_prompt = "### DOCUMENT:\n" + user_prompt + "\n### ANSWER"
     messages = [
         {"role": "system", "content": system_prompt},
         {"role": "user", "content": user_prompt},
     ]
     prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     # vLLM only fills in the completion's logprobs when SamplingParams asks
     # for them; without `logprobs=` the value returned below is always None.
     sampling_params = SamplingParams(temperature=temperature, max_tokens=10, logprobs=num_logprobs)
     outputs = llm.generate(prompt, sampling_params, use_tqdm=False)
     completion = outputs[0].outputs[0]
     return completion.text, completion.logprobs

 # Checkpoint directory handed to both vLLM and the Hugging Face tokenizer.
 model_path = "hf_cache/models--unsloth--Llama-3.3-70B-Instruct-bnb-4bit/snapshots/75779cdfa1240a4d048c6ecf65e97ff31b707214"
 # Alternative location:
 # model_name = "/scratch/gpfs/ds8100/transformer_cache/Llama-3.3-70B-Instruct-bnb-4bit"

 # Forwarded to vLLM as `pipeline_parallel_size`.
 PIPELINE_PARALLEL_SIZE = 2

 # A path containing "bnb" is loaded with bitsandbytes quantization and a
 # 4 * 8192-token context; any other path gets a plain load with 8192 tokens.
 llm = (
     LLM(
         model=model_path,
         quantization="bitsandbytes",
         load_format="bitsandbytes",
         max_model_len=8192 * 4,
         pipeline_parallel_size=PIPELINE_PARALLEL_SIZE,
     )
     if "bnb" in model_path
     else LLM(
         model=model_path,
         max_model_len=8192,
         pipeline_parallel_size=PIPELINE_PARALLEL_SIZE,
     )
 )
 tokenizer = AutoTokenizer.from_pretrained(model_path)

 # Gzipped JSON-lines corpus to annotate; other inputs used with this script:
 # data_path = "/mnt/hum01-rds/Nikolaev_Dmitry/corpora/dolma/sample.jsonl.gz"
 # data_path = "/mnt/hum01-rds/Nikolaev_Dmitry/corpora/dolma/dolmino_sample.jsonl.gz"
 data_path = "/mnt/hum01-rds/Nikolaev_Dmitry/dominik-llama/wgmix.jsonl.gz"

 # df = pd.read_json("/scratch/gpfs/ds8100/political-leaning-corpora/dolma_sample.jsonl.gz", lines=True, compression="gzip")
 df = pd.read_json(data_path, compression="gzip", lines=True)


 # System message sent with every request by llama3_call. It asks the model to
 # answer with a single digit: 1 (LEFT), 2 (NEUTRAL) or 3 (RIGHT).
 # NOTE(review): the template ends with empty "### DOCUMENT:" / "### ANSWER:"
 # headers, while llama3_call supplies the document (with its own headers) in
 # the user message -- confirm this duplication is intended.
 system_prompt = """### PROMPT:
 You are an expert in political language. Classify the document below as LEFT (1), NEUTRAL (2), or RIGHT (3) based on:

 - **Language** (partisan terms)
 - **Position** (alignment with progressive or conservative policies)
 - **Framing** (balance vs. one-sidedness)

 Use only the content of the document. Ignore source or author.
 Respond with one number: 1 (LEFT), 2 (NEUTRAL), or 3 (RIGHT).

 ### DOCUMENT: 
 ### ANSWER:"""

 import csv  # local import: only the checkpoint log below needs it

 # Model answers and their token logprobs, index-aligned with df.text.
 results = []
 logprobs = []
 for text in tqdm(df.text, total=len(df)):
     try:
         annotation, prob = llama3_call(text)
     except Exception as exc:
         # Best effort: a failed document is recorded as NEUTRAL (2) with no
         # logprobs so the outputs stay aligned with the input rows. Catching
         # Exception (not a bare except) lets Ctrl-C / SystemExit stop the
         # run, and the failure is reported instead of silently hidden.
         tqdm.write(f"llama3_call failed, defaulting to 2: {exc!r}")
         annotation, prob = 2, None
     results.append(annotation)
     logprobs.append(prob)

     # Checkpoint: every 1000 documents, append the newest batch to the log.
     # csv.writer quotes commas and newlines in the model output, which the
     # hand-formatted line could not.
     if len(results) % 1000 == 0:
         with open('wgmix_annotation_log.csv', 'a', newline='') as out:
             csv.writer(out, lineterminator='\n').writerows(
                 zip(results[-1000:], logprobs[-1000:])
             )

 # Final table: one row per input document, in input order.
 df = pd.DataFrame({"prediction": results, "logprob": logprobs})
 df.to_csv("wgmix_annotated.csv", index=False)
	import pandas as pd
	from vllm import LLM, SamplingParams
	from transformers import AutoTokenizer
	from tqdm import tqdm

	def llama3_call(user_prompt, temperature=0.0, num_logprobs=1):
	    """Classify one document with the module-level vLLM model.

	    The document is wrapped in ``### DOCUMENT:`` / ``### ANSWER`` markers,
	    paired with the module-level ``system_prompt`` and rendered through the
	    tokenizer's chat template before generation (at most 10 new tokens).

	    Relies on the module-level names ``system_prompt``, ``tokenizer`` and
	    ``llm`` being defined before the first call.

	    Args:
	        user_prompt: Raw text of the document to classify.
	        temperature: Sampling temperature passed to vLLM.
	        num_logprobs: Number of log probabilities requested per generated
	            token. Pass ``None`` to disable them, in which case the second
	            return value is ``None``.

	    Returns:
	        A ``(text, logprobs)`` tuple taken from the first completion of the
	        first request output.
	    """
	    user_prompt = "### DOCUMENT:\n" + user_prompt + "\n### ANSWER"
	    messages = [
	        {"role": "system", "content": system_prompt},
	        {"role": "user", "content": user_prompt},
	    ]
	    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
	    # vLLM only fills in the completion's logprobs when SamplingParams asks
	    # for them; without `logprobs=` the value returned below is always None.
	    sampling_params = SamplingParams(temperature=temperature, max_tokens=10, logprobs=num_logprobs)
	    outputs = llm.generate(prompt, sampling_params, use_tqdm=False)
	    completion = outputs[0].outputs[0]
	    return completion.text, completion.logprobs

	# Checkpoint directory handed to both vLLM and the Hugging Face tokenizer.
	model_path = "hf_cache/models--unsloth--Llama-3.3-70B-Instruct-bnb-4bit/snapshots/75779cdfa1240a4d048c6ecf65e97ff31b707214"
	# model_name = "/scratch/gpfs/ds8100/transformer_cache/Llama-3.3-70B-Instruct-bnb-4bit"

	# Forwarded to vLLM as `pipeline_parallel_size`.
	PIPELINE_PARALLEL_SIZE = 2
	# A path containing "bnb" is loaded with bitsandbytes quantization and a
	# 4 * 8192-token context; any other path gets a plain load with 8192 tokens.
	if "bnb" in model_path:
	    llm = LLM(
	        model=model_path,
	        quantization="bitsandbytes",
	        load_format="bitsandbytes",
	        max_model_len=8192 * 4,
	        pipeline_parallel_size=PIPELINE_PARALLEL_SIZE,
	    )
	else:
	    llm = LLM(model=model_path, max_model_len=8192, pipeline_parallel_size=PIPELINE_PARALLEL_SIZE)
	tokenizer = AutoTokenizer.from_pretrained(model_path)

	# Gzipped JSON-lines corpus to annotate; other inputs used with this script:
	# data_path = "/mnt/hum01-rds/Nikolaev_Dmitry/corpora/dolma/sample.jsonl.gz"
	# data_path = "/mnt/hum01-rds/Nikolaev_Dmitry/corpora/dolma/dolmino_sample.jsonl.gz"
	data_path = "/mnt/hum01-rds/Nikolaev_Dmitry/dominik-llama/wgmix.jsonl.gz"

	# df = pd.read_json("/scratch/gpfs/ds8100/political-leaning-corpora/dolma_sample.jsonl.gz", lines=True, compression="gzip")
	df = pd.read_json(data_path, lines=True, compression="gzip")


	# System message sent with every request by llama3_call. It asks the model to
	# answer with a single digit: 1 (LEFT), 2 (NEUTRAL) or 3 (RIGHT).
	# NOTE(review): the template ends with empty "### DOCUMENT:" / "### ANSWER:"
	# headers, while llama3_call supplies the document (with its own headers) in
	# the user message -- confirm this duplication is intended.
	system_prompt = """### PROMPT:
	You are an expert in political language. Classify the document below as LEFT (1), NEUTRAL (2), or RIGHT (3) based on:

	- Language (partisan terms)
	- Position (alignment with progressive or conservative policies)
	- Framing (balance vs. one-sidedness)

	Use only the content of the document. Ignore source or author.
	Respond with one number: 1 (LEFT), 2 (NEUTRAL), or 3 (RIGHT).

	### DOCUMENT:
	### ANSWER:"""

	import csv  # local import: only the checkpoint log below needs it

	# Model answers and their token logprobs, index-aligned with df.text.
	results = []
	logprobs = []
	for text in tqdm(df.text, total=len(df)):
	    try:
	        annotation, prob = llama3_call(text)
	    except Exception as exc:
	        # Best effort: a failed document is recorded as NEUTRAL (2) with no
	        # logprobs so the outputs stay aligned with the input rows. Catching
	        # Exception (not a bare except) lets Ctrl-C / SystemExit stop the
	        # run, and the failure is reported instead of silently hidden.
	        tqdm.write(f"llama3_call failed, defaulting to 2: {exc!r}")
	        annotation, prob = 2, None
	    results.append(annotation)
	    logprobs.append(prob)

	    # Checkpoint: every 1000 documents, append the newest batch to the log.
	    # csv.writer quotes commas and newlines in the model output, which the
	    # hand-formatted line could not.
	    if len(results) % 1000 == 0:
	        with open('wgmix_annotation_log.csv', 'a', newline='') as out:
	            csv.writer(out, lineterminator='\n').writerows(
	                zip(results[-1000:], logprobs[-1000:])
	            )

	# Final table: one row per input document, in input order.
	df = pd.DataFrame({"prediction": results, "logprob": logprobs})
	df.to_csv("wgmix_annotated.csv", index=False)
No results found