macleginn · June 2, 2026 11:21
diff --git a/llms_for_extremism_classification.py b/llms_for_extremism_classification.py
 import argparse
 import os

 # Keep every Hugging Face / vLLM cache under ./hf_cache (resolved against the
 # current working directory). These are set ahead of the third-party imports
 # below, presumably because those libraries read the variables at import time.
 cache_root = os.path.abspath("./hf_cache")
 os.environ.update(
     {
         "HF_HOME": cache_root,
         "HF_HUB_CACHE": os.path.join(cache_root, "hub"),
         "VLLM_CACHE_ROOT": os.path.join(cache_root, "vllm"),
     }
 )

 import torch
 import pandas as pd
 from tqdm import tqdm
 from transformers import AutoTokenizer
 from vllm import LLM, SamplingParams

 # Alternative definitions of extremism, keyed by the short name accepted as the
 # `defn_key` command-line argument (see parse_args). The selected text is
 # substituted into SYSTEM_PROMPT_TEMPLATE by get_system_prompt.
 # NOTE(review): the keys presumably name the source of each definition
 # (UK government, J. M. Berger, A. P. Schmid) - confirm and cite.
 EXTREMISM_DEFINITIONS = {
    "uk": """Extremism is the promotion or advancement of an ideology based on violence, hatred or intolerance, that aims to:
 1. negate or destroy the fundamental rights and freedoms of others; or
 2. undermine, overturn or replace the system of liberal democracy and democratic rights; or
 3. intentionally create a permissive environment for others to achieve the results in (1) or (2).""",
    "berger": """Extremism refers to the belief that the success or survival of the speaker's or author's in-group can never be separated from the need for hostile action against an out-group. The hostile action must be part of the in-group's definition of success. Hostile acts can range from verbal attacks and diminishment to discriminatory behavior, violence, and even genocide.""",
    "schmid": """In the context of democratic societies, extremism is a form of political expression (usually on the far left or the far right of the political spectrum) that is not acceptable to the more moderate mainstream of political life. Extremist groups and parties tend to be anti-constitutional, antidemocratic, anti-pluralistic, fanatical, intolerant, non-compromising, single-minded, authoritarian and adhering to an ends-justify-means philosophy, wanting to realise their goals by any means, including the use of political violence against opponents. Extremists on the political left or right and those of a religious-fundamentalist orientation favour violence over persuasion, uniformity over diversity, unity over pluralism and orders over dialogue.""",
 }

 # System prompt sent with every document. The single placeholder
 # `{extremism_defn}` is filled by get_system_prompt via str.format, and the
 # model is told to answer with one word ("Yes" or "No"); main() keeps a
 # document when the lower-cased answer starts with "yes".
 SYSTEM_PROMPT_TEMPLATE = """You are an expert in political language. Check if the document below satisfies the following definition of extremist speech:

 {extremism_defn}

 Use only the content of the document. Ignore source or author.
 Respond with one word: "Yes" or "No".
 """


 def make_tsv_safe(text):
     """Return ``str(text)`` flattened onto a single tab-free line.

     Every tab, line feed and carriage return is replaced by one space, so the
     result can be written as a single line of a tab-separated file. Each
     character is replaced individually (``"\\r\\n"`` becomes two spaces).
     Non-string input is converted with ``str`` first.
     """
     separator_to_space = str.maketrans("\t\n\r", "   ")
     return str(text).translate(separator_to_space)


 def load_tokenizer_with_fallback(model_tag: str):
     """Load the tokenizer for *model_tag*, falling back to the slow variant.

     The first attempt is ``AutoTokenizer.from_pretrained(model_tag)``. If that
     raises an ``AttributeError`` whose message contains
     ``'list' object has no attribute 'keys'``, a notice is printed and the
     load is retried with ``use_fast=False``. Any other ``AttributeError`` is
     re-raised unchanged; other exception types are not intercepted.
     """
     known_failure = "'list' object has no attribute 'keys'"
     try:
         tokenizer = AutoTokenizer.from_pretrained(model_tag)
     except AttributeError as exc:
         # Some model/tokenizer combinations hit a fast-tokenizer special-token
         # parsing bug where extra_special_tokens is read as a list. Anything
         # else is not ours to handle.
         if known_failure not in str(exc):
             raise
         print(
             "Fast tokenizer initialization failed due to special token config. "
             "Retrying with use_fast=False."
         )
         tokenizer = AutoTokenizer.from_pretrained(model_tag, use_fast=False)
     return tokenizer


 def get_system_prompt(extremism_defn):
     """Return SYSTEM_PROMPT_TEMPLATE with *extremism_defn* filled in.

     *extremism_defn* is the definition text that replaces the
     ``{extremism_defn}`` placeholder of the template.
     """
     substitutions = {"extremism_defn": extremism_defn}
     return SYSTEM_PROMPT_TEMPLATE.format_map(substitutions)


 def parse_args():
     """Parse the command line and return the resulting namespace.

     Two positional arguments are expected, in this order:

     * ``defn_key`` - must be one of the keys of EXTREMISM_DEFINITIONS;
     * ``model_tag`` - free-form string, passed on unchanged.
     """
     positional_specs = (
         ("defn_key", {"choices": EXTREMISM_DEFINITIONS.keys()}),
         ("model_tag", {}),
     )
     cli = argparse.ArgumentParser()
     for arg_name, options in positional_specs:
         cli.add_argument(arg_name, **options)
     return cli.parse_args()


 def main():
     """Classify each document of the Dolma sample and save the flagged ones.

     Steps:

     1. Read ``defn_key`` and ``model_tag`` from the command line and build the
        system prompt for the chosen definition.
     2. Load the tokenizer and a vLLM engine for ``model_tag``, spread over all
        CUDA devices reported by torch.
     3. Ask the model about every non-null, non-blank ``text`` entry of the
        gzipped JSON-lines sample, one document per call.
     4. Write each document whose lower-cased answer starts with ``"yes"`` to
        ``../extremism_clf_outputs/dolma_extremist_sample_<model>_<defn_key>.txt``
        (one document per line, tabs and line breaks replaced by spaces).

     The output directory is created if it does not exist. A document whose
     model call raises is skipped (best effort), but every such failure is
     reported and the total is printed at the end. Errors while writing the
     output file are no longer swallowed and propagate to the caller.
     """
     args = parse_args()

     defn_key = args.defn_key
     model_tag = args.model_tag
     extremism_defn = EXTREMISM_DEFINITIONS[defn_key]
     system_prompt = get_system_prompt(extremism_defn)
     tokenizer = load_tokenizer_with_fallback(model_tag)
     # temperature=0.0 requests greedy decoding; the prompt asks for a single
     # word, so the completion is capped at 3 tokens.
     sampling_params = SamplingParams(temperature=0.0, max_tokens=3)
     num_gpus = torch.cuda.device_count()
     max_model_len = 8192
     # Model tags containing "bnb" are split with pipeline parallelism, all
     # others with tensor parallelism.
     # NOTE(review): presumably because bitsandbytes-quantised checkpoints
     # cannot be tensor-parallelised - confirm.
     if "bnb" in model_tag:
         llm = LLM(
             model=model_tag,
             pipeline_parallel_size=num_gpus,
             max_model_len=max_model_len,
         )
     else:
         llm = LLM(
             model=model_tag, tensor_parallel_size=num_gpus, max_model_len=max_model_len
         )

     # The chat-template options do not depend on the document: build them once
     # instead of on every call.
     chat_template_kwargs = {"tokenize": False, "add_generation_prompt": True}
     if "Qwen3.5" in model_tag:
         # NOTE(review): only tags containing "Qwen3.5" get thinking disabled -
         # confirm whether other models with a thinking mode need it as well.
         chat_template_kwargs["enable_thinking"] = False

     def model_call(user_prompt):
         """Return the model's whitespace-stripped answer for one document."""
         messages = [
             {"role": "system", "content": system_prompt},
             {"role": "user", "content": "Document:\n\n" + user_prompt},
         ]
         chat_input = tokenizer.apply_chat_template(messages, **chat_template_kwargs)
         outputs = llm.generate([chat_input], sampling_params, use_tqdm=False)
         return outputs[0].outputs[0].text.strip()

     data_path = "/mnt/hum01-rds/Nikolaev_Dmitry/corpora/dolma/sample.jsonl.gz"
     # data_path = "/mnt/hum01-rds/Nikolaev_Dmitry/corpora/dolma/dolmino_sample.jsonl.gz"
     # data_path = "/mnt/hum01-rds/Nikolaev_Dmitry/dominik-llama/wgmix.jsonl.gz"
     df = pd.read_json(data_path, lines=True, compression="gzip")
     model_name_safe = model_tag.replace("/", "_")
     extremist_sample_out_path = f"../extremism_clf_outputs/dolma_extremist_sample_{model_name_safe}_{defn_key}.txt"
     # Without this, a missing output directory crashes the run only after the
     # model has been loaded.
     os.makedirs(os.path.dirname(extremist_sample_out_path), exist_ok=True)

     num_failed = 0
     with open(extremist_sample_out_path, "w", encoding="utf-8") as out:
         for doc_index, text in enumerate(tqdm(df.text, total=len(df))):
             if pd.isnull(text) or not str(text).strip():
                 continue
             # Only the model call is best effort; keeping the write outside
             # the try lets output I/O errors surface.
             try:
                 annotation = model_call(str(text)).lower()
             except Exception as exc:
                 num_failed += 1
                 # tqdm.write prints without corrupting the progress bar.
                 tqdm.write(
                     f"Skipping document {doc_index}: model call failed: {exc!r}"
                 )
                 continue
             if annotation.startswith("yes"):
                 out.write(make_tsv_safe(text))
                 out.write("\n")

     if num_failed:
         print(
             f"{num_failed} of {len(df)} documents were skipped because the model call failed."
         )


 # Run the classification pipeline only when this file is executed as a script,
 # not when it is imported as a module.
 if __name__ == "__main__":
    main()
	import argparse
	import os

	# Keep every Hugging Face / vLLM cache under ./hf_cache (resolved against the
	# current working directory). These are set ahead of the third-party imports
	# below, presumably because those libraries read the variables at import time.
	cache_root = os.path.abspath("./hf_cache")
	os.environ.update(
	    {
	        "HF_HOME": cache_root,
	        "HF_HUB_CACHE": os.path.join(cache_root, "hub"),
	        "VLLM_CACHE_ROOT": os.path.join(cache_root, "vllm"),
	    }
	)

	import torch
	import pandas as pd
	from tqdm import tqdm
	from transformers import AutoTokenizer
	from vllm import LLM, SamplingParams

	# Alternative definitions of extremism, keyed by the short name accepted as the
	# `defn_key` command-line argument (see parse_args). The selected text is
	# substituted into SYSTEM_PROMPT_TEMPLATE by get_system_prompt.
	# NOTE(review): the keys presumably name the source of each definition
	# (UK government, J. M. Berger, A. P. Schmid) - confirm and cite.
	EXTREMISM_DEFINITIONS = {
	"uk": """Extremism is the promotion or advancement of an ideology based on violence, hatred or intolerance, that aims to:
	1. negate or destroy the fundamental rights and freedoms of others; or
	2. undermine, overturn or replace the system of liberal democracy and democratic rights; or
	3. intentionally create a permissive environment for others to achieve the results in (1) or (2).""",
	"berger": """Extremism refers to the belief that the success or survival of the speaker's or author's in-group can never be separated from the need for hostile action against an out-group. The hostile action must be part of the in-group's definition of success. Hostile acts can range from verbal attacks and diminishment to discriminatory behavior, violence, and even genocide.""",
	"schmid": """In the context of democratic societies, extremism is a form of political expression (usually on the far left or the far right of the political spectrum) that is not acceptable to the more moderate mainstream of political life. Extremist groups and parties tend to be anti-constitutional, antidemocratic, anti-pluralistic, fanatical, intolerant, non-compromising, single-minded, authoritarian and adhering to an ends-justify-means philosophy, wanting to realise their goals by any means, including the use of political violence against opponents. Extremists on the political left or right and those of a religious-fundamentalist orientation favour violence over persuasion, uniformity over diversity, unity over pluralism and orders over dialogue.""",
	}

	# System prompt sent with every document. The single placeholder
	# `{extremism_defn}` is filled by get_system_prompt via str.format, and the
	# model is told to answer with one word ("Yes" or "No"); main() keeps a
	# document when the lower-cased answer starts with "yes".
	SYSTEM_PROMPT_TEMPLATE = """You are an expert in political language. Check if the document below satisfies the following definition of extremist speech:

	{extremism_defn}

	Use only the content of the document. Ignore source or author.
	Respond with one word: "Yes" or "No".
	"""


	def make_tsv_safe(text):
	    """Return ``str(text)`` with tabs, line feeds and carriage returns turned into spaces.

	    Each of those characters is replaced by exactly one space, so the result
	    fits on a single line of a tab-separated file. Non-string input is
	    converted with ``str`` first.
	    """
	    # The body had lost its indentation, which made the function a syntax error.
	    return str(text).replace("\t", " ").replace("\n", " ").replace("\r", " ")


	def load_tokenizer_with_fallback(model_tag: str):
	    """Load the tokenizer for *model_tag*, falling back to the slow variant.

	    The first attempt is ``AutoTokenizer.from_pretrained(model_tag)``. If that
	    raises an ``AttributeError`` whose message contains
	    ``'list' object has no attribute 'keys'``, a notice is printed and the
	    load is retried with ``use_fast=False``. Any other ``AttributeError`` is
	    re-raised unchanged; other exception types are not intercepted.
	    """
	    # Nesting restored: the flattened original was not valid Python.
	    try:
	        return AutoTokenizer.from_pretrained(model_tag)
	    except AttributeError as exc:
	        # Some model/tokenizer combinations hit a fast-tokenizer special-token
	        # parsing bug where extra_special_tokens is read as a list.
	        if "'list' object has no attribute 'keys'" in str(exc):
	            print(
	                "Fast tokenizer initialization failed due to special token config. "
	                "Retrying with use_fast=False."
	            )
	            return AutoTokenizer.from_pretrained(model_tag, use_fast=False)
	        raise


	def get_system_prompt(extremism_defn):
	    """Return SYSTEM_PROMPT_TEMPLATE with *extremism_defn* filled in.

	    *extremism_defn* is the definition text that replaces the
	    ``{extremism_defn}`` placeholder of the template.
	    """
	    # The body had lost its indentation, which made the function a syntax error.
	    return SYSTEM_PROMPT_TEMPLATE.format(extremism_defn=extremism_defn)


	def parse_args():
	    """Parse the command line and return the resulting namespace.

	    Two positional arguments are expected, in this order:

	    * ``defn_key`` - must be one of the keys of EXTREMISM_DEFINITIONS;
	    * ``model_tag`` - free-form string, passed on unchanged.
	    """
	    # The body had lost its indentation, which made the function a syntax error.
	    parser = argparse.ArgumentParser()
	    parser.add_argument("defn_key", choices=EXTREMISM_DEFINITIONS.keys())
	    parser.add_argument("model_tag")
	    return parser.parse_args()


	def main():
	    """Classify each document of the Dolma sample and save the flagged ones.

	    Steps:

	    1. Read ``defn_key`` and ``model_tag`` from the command line and build the
	       system prompt for the chosen definition.
	    2. Load the tokenizer and a vLLM engine for ``model_tag``, spread over all
	       CUDA devices reported by torch.
	    3. Ask the model about every non-null, non-blank ``text`` entry of the
	       gzipped JSON-lines sample, one document per call.
	    4. Write each document whose lower-cased answer starts with ``"yes"`` to
	       ``../extremism_clf_outputs/dolma_extremist_sample_<model>_<defn_key>.txt``
	       (one document per line, tabs and line breaks replaced by spaces).

	    The output directory is created if it does not exist. A document whose
	    model call raises is skipped (best effort), but every such failure is
	    reported and the total is printed at the end. Errors while writing the
	    output file are not swallowed and propagate to the caller.
	    """
	    # Nesting restored throughout: the flattened original was not valid Python.
	    args = parse_args()

	    defn_key = args.defn_key
	    model_tag = args.model_tag
	    extremism_defn = EXTREMISM_DEFINITIONS[defn_key]
	    system_prompt = get_system_prompt(extremism_defn)
	    tokenizer = load_tokenizer_with_fallback(model_tag)
	    # temperature=0.0 requests greedy decoding; the prompt asks for a single
	    # word, so the completion is capped at 3 tokens.
	    sampling_params = SamplingParams(temperature=0.0, max_tokens=3)
	    num_gpus = torch.cuda.device_count()
	    max_model_len = 8192
	    # Model tags containing "bnb" are split with pipeline parallelism, all
	    # others with tensor parallelism.
	    # NOTE(review): presumably because bitsandbytes-quantised checkpoints
	    # cannot be tensor-parallelised - confirm.
	    if "bnb" in model_tag:
	        llm = LLM(
	            model=model_tag,
	            pipeline_parallel_size=num_gpus,
	            max_model_len=max_model_len,
	        )
	    else:
	        llm = LLM(
	            model=model_tag, tensor_parallel_size=num_gpus, max_model_len=max_model_len
	        )

	    # The chat-template options do not depend on the document: build them once
	    # instead of on every call.
	    chat_template_kwargs = {"tokenize": False, "add_generation_prompt": True}
	    if "Qwen3.5" in model_tag:
	        # NOTE(review): only tags containing "Qwen3.5" get thinking disabled -
	        # confirm whether other models with a thinking mode need it as well.
	        chat_template_kwargs["enable_thinking"] = False

	    def model_call(user_prompt):
	        """Return the model's whitespace-stripped answer for one document."""
	        messages = [
	            {"role": "system", "content": system_prompt},
	            {"role": "user", "content": "Document:\n\n" + user_prompt},
	        ]
	        chat_input = tokenizer.apply_chat_template(messages, **chat_template_kwargs)
	        outputs = llm.generate([chat_input], sampling_params, use_tqdm=False)
	        return outputs[0].outputs[0].text.strip()

	    data_path = "/mnt/hum01-rds/Nikolaev_Dmitry/corpora/dolma/sample.jsonl.gz"
	    # data_path = "/mnt/hum01-rds/Nikolaev_Dmitry/corpora/dolma/dolmino_sample.jsonl.gz"
	    # data_path = "/mnt/hum01-rds/Nikolaev_Dmitry/dominik-llama/wgmix.jsonl.gz"
	    df = pd.read_json(data_path, lines=True, compression="gzip")
	    model_name_safe = model_tag.replace("/", "_")
	    extremist_sample_out_path = f"../extremism_clf_outputs/dolma_extremist_sample_{model_name_safe}_{defn_key}.txt"
	    # Without this, a missing output directory crashes the run only after the
	    # model has been loaded.
	    os.makedirs(os.path.dirname(extremist_sample_out_path), exist_ok=True)

	    num_failed = 0
	    with open(extremist_sample_out_path, "w", encoding="utf-8") as out:
	        for doc_index, text in enumerate(tqdm(df.text, total=len(df))):
	            if pd.isnull(text) or not str(text).strip():
	                continue
	            # Only the model call is best effort; keeping the write outside
	            # the try lets output I/O errors surface.
	            try:
	                annotation = model_call(str(text)).lower()
	            except Exception as exc:
	                num_failed += 1
	                # tqdm.write prints without corrupting the progress bar.
	                tqdm.write(
	                    f"Skipping document {doc_index}: model call failed: {exc!r}"
	                )
	                continue
	            if annotation.startswith("yes"):
	                out.write(make_tsv_safe(text))
	                out.write("\n")

	    if num_failed:
	        print(
	            f"{num_failed} of {len(df)} documents were skipped because the model call failed."
	        )


	# Run the classification pipeline only when this file is executed as a script,
	# not when it is imported as a module. The call is indented again: left at the
	# same level as the `if`, it was a syntax error.
	if __name__ == "__main__":
	    main()
No results found