Skip to content

Instantly share code, notes, and snippets.

View macleginn's full-sized avatar

Dmitry Nikolayev macleginn

View GitHub Profile
@macleginn
macleginn / sample_from_dolma.py
Created June 2, 2026 11:28
A script for applying reservoir sampling to the Dolma dataset
import random
import gzip
import json
import os
import requests
import pickle
from typing import List, Dict, Any, Tuple
from datetime import datetime
@macleginn
macleginn / llms_for_extremism_classification.py
Created June 2, 2026 11:21
Using LLMs for extremist speech detection (tested with Qwen and Gemma)
import argparse
import os
# Route the HF_HOME, HF_HUB_CACHE and VLLM_CACHE_ROOT environment variables
# to a local ./hf_cache directory (resolved to an absolute path).
# NOTE(review): these assignments sit before the torch/pandas imports below,
# presumably so the libraries see them when first loaded; keep this order
# unless confirmed otherwise.
cache_root = os.path.abspath("./hf_cache")
os.environ["HF_HOME"] = cache_root
os.environ["HF_HUB_CACHE"] = os.path.join(cache_root, "hub")
os.environ["VLLM_CACHE_ROOT"] = os.path.join(cache_root, "vllm")
import torch
import pandas as pd
@macleginn
macleginn / llama_for_political_classification.py
Last active June 2, 2026 11:20
Llama for political classification
import pandas as pd
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
from tqdm import tqdm
def llama3_call(user_prompt, temperature=0.0):
user_prompt = "### DOCUMENT:\n" + user_prompt + "\n### ANSWER"
messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
messages = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt = True)
sampling_params = SamplingParams(temperature=temperature, max_tokens=10)
from math import pi
import numpy as np
# Example 3-D vectors, each described by its direction relative to x.
x = np.array([1, 0, 0])
y = np.array([1, 0, 0.001]) # very similar to x
z = np.array([0, 1, 0]) # orthogonal to x
q = np.array([-1, 0, 0]) # looks in the opposite direction to x
def angular_distance(v1, v2):
cos_of_angle = np.dot(v1, v2) / np.linalg.norm(v1) / np.linalg.norm(v2)
"""
Conceptual pseudocode explaining how sliding-window attention works
using loops. This is NOT meant to be efficient or runnable in a real
model. It simply illustrates what PyTorch operations like:
unfold
unsqueeze
squeeze
batched matrix multiplication
a
abandon
ability
able
abortion
about
above
abroad
absence
absolute
class Counter:
    """A minimal counter that starts at zero and counts up by one."""

    def __init__(self) -> None:
        # Every new counter begins at zero.
        self.count: int = 0

    def get_value(self) -> int:
        """Return the current count."""
        return self.count

    def increment(self) -> None:
        """Increase the count by one."""
        self.count += 1
class Counter:
    """A minimal counter that starts at zero and counts up by one."""

    def __init__(self) -> None:
        # Every new counter begins at zero.
        self.count: int = 0

    def get_value(self) -> int:
        """Return the current count."""
        return self.count

    def increment(self) -> None:
        """Increase the count by one."""
        self.count += 1
class Counter:
    """A minimal counter that starts at zero and counts up by one."""

    def __init__(self) -> None:
        # Every new counter begins at zero.
        self.count: int = 0

    def get_value(self) -> int:
        """Return the current count."""
        return self.count

    def increment(self) -> None:
        """Increase the count by one."""
        self.count += 1
@macleginn
macleginn / dolma_sum_probability.py
Created July 9, 2025 11:31
Estimate the total probability mass of a corpus using importance sampling
import json
import random
from math import ceil
import numpy as np
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm.auto import tqdm