Skip to content

Instantly share code, notes, and snippets.

@NisugaJ
Created March 6, 2025 15:59
Show Gist options
  • Save NisugaJ/dba4cb05d7638501da45d95f71c042a0 to your computer and use it in GitHub Desktop.
Save NisugaJ/dba4cb05d7638501da45d95f71c042a0 to your computer and use it in GitHub Desktop.
import pandas as pd
import random
# Stratified train/test split of question-answer pairs.
#
# Reads the Q&A pairs from an Excel workbook, shuffles the rows inside each
# category, and writes an 80/20 train/test split to two CSV files. Splitting
# per category (instead of over the whole table) keeps the category mix of
# the two output files close to that of the input.

INPUT_PATH = "qa_pairs.xlsx"
TRAIN_PATH = "training_data.csv"
TEST_PATH = "testing_data.csv"
CATEGORY_COLUMN = "Category"
TRAIN_FRACTION = 0.8  # 80% for training, 20% for testing


def split_by_category(
    df: "pd.DataFrame",
    category_column: str = CATEGORY_COLUMN,
    train_fraction: float = TRAIN_FRACTION,
    seed=None,
) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Split *df* into train and test frames, category by category.

    The rows of each category are shuffled and the first
    ``int(len(rows) * train_fraction)`` of them go to the training set; the
    remainder go to the test set. Because the count is truncated, small
    categories favour the test set: with the default fraction a category
    holding a single row lands entirely in the test set.

    Rows whose category value is missing (NaN) are not part of any group
    produced by ``DataFrame.groupby`` with its default settings and therefore
    appear in neither output.

    Args:
        df: Table to split; must contain ``category_column``.
        category_column: Name of the column that defines the groups.
        train_fraction: Share of each category assigned to training,
            between 0 and 1 inclusive.
        seed: Optional seed for the shuffle. ``None`` (the default) gives a
            different split on every run; pass a fixed value to make the
            split reproducible.

    Returns:
        A ``(train_df, test_df)`` pair. Both frames carry the columns of
        ``df`` even when they contain no rows.

    Raises:
        ValueError: If ``train_fraction`` is outside ``[0, 1]``.
        KeyError: If ``category_column`` is not a column of ``df``.
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError(
            f"train_fraction must be between 0 and 1, got {train_fraction!r}"
        )

    # A private generator keeps seeding local to this call instead of
    # touching the interpreter-wide `random` state.
    rng = random.Random(seed)

    train_rows = []
    test_rows = []
    for _, group in df.groupby(category_column):
        rows = group.to_dict("records")
        rng.shuffle(rows)
        split_point = int(len(rows) * train_fraction)
        train_rows.extend(rows[:split_point])
        test_rows.extend(rows[split_point:])

    # Passing the columns explicitly keeps the header when one side is empty;
    # a DataFrame built from an empty list would otherwise have no columns.
    train_df = pd.DataFrame(train_rows, columns=df.columns)
    test_df = pd.DataFrame(test_rows, columns=df.columns)
    return train_df, test_df


def main():
    """Load the Excel file, split it by category, and save both splits."""
    df = pd.read_excel(INPUT_PATH)
    train_df, test_df = split_by_category(df)
    train_df.to_csv(TRAIN_PATH, index=False)
    test_df.to_csv(TEST_PATH, index=False)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment