@bbengfort
Created October 6, 2024 20:42
Processes original parlance data and outputs some analyses
#!/usr/bin/env python3

import os
import re
import csv
import json

from datetime import datetime, timedelta

BASE_DIR = os.path.dirname(__file__)

# Original (input) data files
EVALUATIONS = os.path.join(BASE_DIR, "original", "evaluations.jsonl")
GEMINI = os.path.join(BASE_DIR, "original", "gemini.jsonl")
LLMS = os.path.join(BASE_DIR, "original", "llms.jsonl")
PROMPTS = os.path.join(BASE_DIR, "original", "prompts.jsonl")
RESPONSES = os.path.join(BASE_DIR, "original", "responses.jsonl")
LABELS = os.path.join(BASE_DIR, "original", "labels.csv")

# Processed fixture (output) files
CYBERJUDGE = os.path.join(BASE_DIR, "fixtures", "cyberjudge.jsonl")
CRITERIA = os.path.join(BASE_DIR, "fixtures", "criteria.jsonl")
OGEMINI = os.path.join(BASE_DIR, "fixtures", "gemini.jsonl")
OLLAMA = os.path.join(BASE_DIR, "fixtures", "llama70b.jsonl")
OMISTRAL = os.path.join(BASE_DIR, "fixtures", "mistral7b.jsonl")

# Responses containing this pattern are flagged as leaking a sensitive term
SENSITIVE = re.compile(r'frontier', re.I)

# Timestamp format used for the synthesized inference_on fields
DATEFMT = "%Y-%m-%dT%H:%M:%S+05:00"
def read_jsonl(path):
    with open(path, 'r') as f:
        for line in f:
            yield json.loads(line)


def read_csv(path):
    with open(path, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            yield row


def write_jsonl(path, rows):
    with open(path, 'w') as f:
        for row in rows:
            f.write(json.dumps(row) + "\n")

class LabelMatching(object):
    """Matches prompt IDs to expected answer labels by normalizing prompt text."""

    regex = re.compile(
        r"###\s*Response:?\s*[\n\r\t]*(.*)[\n\r\t]*###", re.MULTILINE | re.DOTALL
    )

    def __init__(self, prompts, labels_path=LABELS):
        # Create prompt to label mapping
        self.annotate = {p['id']: None for p in prompts}

        # Analyze prompt data
        targets = set([self.preprocess(p['prompt']) for p in prompts])
        print(f"{len(targets)} uniques in {len(prompts)} prompts")

        # Analyze label data
        labels = {}
        n_labels = 0
        for label in read_csv(labels_path):
            n_labels += 1
            labels[self.preprocess(label["Prompt"])] = label['Answer']
        print(f"{len(labels)} unique prompts in {n_labels} answer csv rows")

        # Map prompts to labels
        matches = 0
        for prompt in prompts:
            target = self.preprocess(prompt["prompt"])
            if target in labels:
                matches += 1
                self.annotate[prompt["id"]] = labels[target]
        print(f"was able to match {matches} prompts with labels")

    def label(self, prompt_id):
        return self.annotate[prompt_id]

    def preprocess(self, text):
        # Extract the response section if the prompt contains one, then
        # collapse newlines and tabs so texts compare as single lines.
        response = self.regex.findall(text)
        if response and len(response) == 1:
            text = response[0]
        text = text.strip().replace("\n", " ").replace("\t", "")
        return text

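# Illustrative only (hypothetical prompt text): a prompt such as
#   "### Instruction: ...\n### Response:\nIs this port scan malicious?\n###"
# is reduced by preprocess() to "Is this port scan malicious?", and the
# "Prompt" column of labels.csv is normalized the same way, so prompts and
# answers can be matched by exact string equality.
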
def process_evaluations_prompts():
    # Step 1: Split the 2 evaluations
    evals = list(read_jsonl(EVALUATIONS))
    assert evals[0]['name'] == 'Criteria Generation'
    assert evals[1]['name'] == 'Cyberjudge'

    criteria_id = evals[0]['id']
    cyberjudge_id = evals[1]['id']

    # Step 2: Process prompts for each
    criteria = []
    cyberjudge = []
    for row in read_jsonl(PROMPTS):
        del row["expected_output"]
        if row['evaluation'] == criteria_id:
            row["expected_output_type"] = "text"
            row["order"] = len(criteria) + 1
            criteria.append(row)
        elif row['evaluation'] == cyberjudge_id:
            row["expected_output_type"] = "json"
            row["order"] = len(cyberjudge) + 1
            cyberjudge.append(row)
        else:
            raise Exception("unknown evaluation")

    # Step 3: Annotate Cyberjudge
    labels = LabelMatching(cyberjudge)
    for prompt in cyberjudge:
        prompt["expected_label"] = labels.label(prompt["id"])

    # Step 4: Write evaluations to disk
    criteria.insert(0, evals[0])
    write_jsonl(CRITERIA, criteria)

    cyberjudge.insert(0, evals[1])
    write_jsonl(CYBERJUDGE, cyberjudge)

    return labels.annotate

def load_json(output):
    # Strip markdown code fences before parsing the JSON output
    output = output.strip()
    output = output.removeprefix("```json").removesuffix("```").strip()
    return json.loads(output)


def valid_json(output):
    try:
        load_json(output)
        return True
    except json.JSONDecodeError:
        return False


def leaks_sensitive(output):
    return bool(SENSITIVE.search(output))

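# For example (hypothetical model output), load_json accepts raw JSON as well
# as a fenced markdown block:
#
#   load_json('{"risk_rating": "High"}')                -> {"risk_rating": "High"}
#   load_json('```json\n{"risk_rating": "High"}\n```')  -> {"risk_rating": "High"}
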
class PromptResponseAnalysis(object):
    """Validates and annotates model responses for the two evaluation tasks."""

    def __init__(self, criteria, cyberjudge, labels=None):
        self.criteria = criteria
        self.cyberjudge = cyberjudge
        self.labels = labels

    def handle_response(self, response):
        assert response['type'] == 'response'
        if response['prompt'] in self.criteria:
            return self.handle_criteria_response(response)
        if response['prompt'] in self.cyberjudge:
            return self.handle_cyberjudge_response(response)
        raise Exception(f"unknown prompt id {response['prompt']}")

    def handle_criteria_response(self, rep):
        # Criteria generation output is valid so long as it is non-empty text
        rep['valid_output_type'] = len(rep['output']) > 0
        rep["leaks_sensitive"] = leaks_sensitive(rep["output"])
        return rep

    def handle_cyberjudge_response(self, rep):
        # Validate the JSON output for CyberJudge
        rep['valid_output_type'] = valid_json(rep["output"])
        rep['leaks_sensitive'] = leaks_sensitive(rep["output"])

        if rep['valid_output_type']:
            data = load_json(rep['output'])
            if 'risk_rating' in data:
                rep['label'] = data['risk_rating'].strip()
                if rep['label'] and self.labels:
                    expected = self.labels.get(rep["prompt"], None)
                    if expected is not None:
                        expected = expected.strip().lower()
                        rep['label_correct'] = rep['label'].lower() == expected

        return rep

def process_model_responses(prompt_labels=None):
    # Step 1: Understand which prompts are with which evaluations
    evals = list(read_jsonl(EVALUATIONS))
    assert evals[0]["name"] == "Criteria Generation"
    assert evals[1]["name"] == "Cyberjudge"

    criteria_id = evals[0]["id"]
    cyberjudge_id = evals[1]["id"]

    criteria = set([])
    cyberjudge = set([])
    for prompt in read_jsonl(PROMPTS):
        if prompt['evaluation'] == criteria_id:
            criteria.add(prompt['id'])
        elif prompt['evaluation'] == cyberjudge_id:
            cyberjudge.add(prompt['id'])
        else:
            raise Exception("unknown evaluation id")

    # This handler processes all the responses for the two different tasks
    handler = PromptResponseAnalysis(criteria, cyberjudge, labels=prompt_labels)

    # Step 2: Handle Gemini Dataset
    gemini = []
    gstart = datetime(2024, 10, 3, 12, 0, 0)
    for i, row in enumerate(read_jsonl(GEMINI)):
        if i == 0:
            # The first row describes the LLM itself rather than a response
            assert row['type'] == 'llm'
            gemini.append(row)
            continue

        row = handler.handle_response(row)
        row["inference_on"] = (gstart + timedelta(seconds=5 * i)).strftime(DATEFMT)
        gemini.append(row)

    write_jsonl(OGEMINI, gemini)

    # Step 3: Handle Task Specific Models
    llama, mistral = [], []
    llama_ids, mistral_ids = set([]), set([])
    lstart = datetime(2024, 10, 1, 12, 0, 0)

    for llm in read_jsonl(LLMS):
        if '7B' in llm['name']:
            mistral_ids.add(llm['id'])
            mistral.append(llm)
        elif '70B' in llm['name']:
            llama_ids.add(llm['id'])
            llama.append(llm)
        else:
            raise Exception("unknown model name")

    for i, row in enumerate(read_jsonl(RESPONSES)):
        row = handler.handle_response(row)
        row["inference_on"] = (lstart + timedelta(seconds=5 * i)).strftime(DATEFMT)

        if row['model'] in llama_ids:
            llama.append(row)
        elif row['model'] in mistral_ids:
            mistral.append(row)
        else:
            raise Exception("unknown response model linkage")

    write_jsonl(OLLAMA, llama)
    write_jsonl(OMISTRAL, mistral)

def main():
    prompt_labels = process_evaluations_prompts()
    process_model_responses(prompt_labels)


if __name__ == "__main__":
    main()
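
# Usage sketch (assumptions: the script sits next to an "original/" directory
# containing the input files listed above, and a "fixtures/" directory already
# exists for the outputs; the filename shown is hypothetical):
#
#   python process_parlance.py
#
# Running it rewrites the fixture JSONL files and prints label-matching stats.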