#!/usr/bin/env python
# coding: utf-8

"""
Open1: An Open-Source Implementation of O1 with Advanced Features

This script implements a simplified version of the OpenAI O1 model,
integrating the following components:

- Transformer-based language model using GPT-2 Small
- Chain-of-thought reasoning mechanism
- Tree-based trajectory exploration using Monte Carlo Tree Search (MCTS)
- Reinforcement Learning from Human Feedback (RLHF) using Proximal Policy Optimization (PPO)
- Self-play environments for LLMs
- Function-calling practice in the training data
- Progressive training pipeline with increasing complexity
- Visualization using TensorBoard
- Hexagon (radar) chart rating of LLM capabilities

Instructions:
- Ensure the required packages are installed
  (torch, transformers, datasets, tensorboard, numpy, matplotlib).
- Run the script: `python open1.py`
- Use command-line arguments to enable or disable features (see the example below).
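
Example (an illustrative run combining several of the optional flags defined
in parse_args):
    python open1.py --epochs 1 --batch_size 2 --use_mcts --visualize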

Author: OpenAI Assistant
Date: 2023
"""

import os
import sys
import argparse
import logging
import math
import random
import numpy as np
from collections import defaultdict
from datetime import datetime
import warnings

warnings.filterwarnings("ignore")

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
# transformers' AdamW optimizer is deprecated in recent releases;
# torch.optim.AdamW is a drop-in replacement with the same call signature.
from torch.optim import AdamW

from datasets import load_dataset

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# =======
# Command-Line Arguments
# =======
def parse_args():
    parser = argparse.ArgumentParser(description="Open1 Model Training and Evaluation")
    parser.add_argument('--epochs', type=int, default=1, help='Number of training epochs')
    parser.add_argument('--max_length', type=int, default=128, help='Maximum sequence length')
    parser.add_argument('--batch_size', type=int, default=2, help='Batch size for training')
    parser.add_argument('--log_dir', type=str, default='runs', help='Directory for TensorBoard logs')
    parser.add_argument('--use_mcts', action='store_true', help='Enable MCTS for trajectory exploration')
    parser.add_argument('--multi_agent', action='store_true', help='Enable multi-agent training')
    parser.add_argument('--progressive_training', action='store_true', help='Enable progressive training pipeline')
    parser.add_argument('--kl_coef', type=float, default=0.2, help='KL Coefficient for PPO')
    parser.add_argument('--n_simulations', type=int, default=5, help='Number of MCTS simulations')
    parser.add_argument('--visualize', action='store_true', help='Enable visualization of LLM capabilities')
    parser.add_argument('--reward_model_epochs', type=int, default=1, help='Number of epochs to train the reward model')
    args = parser.parse_args()
    return args


# =======
# Logging Configuration
# =======
def setup_logging():
    logging.basicConfig(
        level=logging.INFO,
        format='[%(asctime)s] %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(sys.stdout)
        ]
    )


# =======
# Data Classes and Functions
# =======
class CustomDataset(Dataset):
    def __init__(self, tokenizer, data, max_length):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        prompt = self.data[idx]['prompt']
        response = self.data[idx]['response']
        # Concatenate prompt and response for training
        # (a separating space keeps the two turns from running together).
        input_text = prompt + " " + response
        input_ids = self.tokenizer.encode(input_text, truncation=True, max_length=self.max_length)
        return torch.tensor(input_ids, dtype=torch.long)


def collate_fn(batch, tokenizer):
    input_ids = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = (input_ids != tokenizer.pad_token_id).long()
    return {'input_ids': input_ids, 'attention_mask': attention_mask}


# =======
# Model Classes
# =======

# Transformer-based Language Model with Chain-of-Thought
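# Note: "chain-of-thought" here simply means sampling a free-form continuation
# of the prompt; a reasoning prefix such as "Let's think step by step." could be
# prepended to the prompt to encourage more explicit intermediate reasoning.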
class Open1Model(nn.Module):
    def __init__(self, model_name='gpt2'):
        super(Open1Model, self).__init__()
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        # Ensure a unique pad token
        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            self.model.resize_token_embeddings(len(self.tokenizer))
        self.model.to(device)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        return outputs

    def generate_chain_of_thought(self, prompt, max_length=50, num_return_sequences=1):
        input_ids = self.tokenizer.encode(prompt, return_tensors='pt').to(device)
        # Generate with chain-of-thought reasoning
        outputs = self.model.generate(
            input_ids,
            max_length=max_length,
            num_return_sequences=num_return_sequences,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.pad_token_id,
        )
        generated_texts = [self.tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        return generated_texts


# Reward Model for RLHF
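# A scalar reward head is attached on top of GPT-2's final hidden state; it is
# trained with MSE against heuristic reward targets in train_reward_model below.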
class RewardModel(nn.Module):
    def __init__(self, model_name='gpt2', tokenizer=None):
        super(RewardModel, self).__init__()
        config = GPT2Config.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        self.tokenizer = tokenizer
        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        # Resize unconditionally: the shared tokenizer may already contain the extra [PAD] token.
        self.model.resize_token_embeddings(len(self.tokenizer))
        self.model.to(device)
        self.reward_head = nn.Linear(config.n_embd, 1)
        self.reward_head.to(device)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.model.transformer(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state  # [batch_size, seq_length, hidden_size]
        if attention_mask is not None:
            # With right padding, position -1 would point at a [PAD] token, so
            # index the last non-padding token of each sequence instead.
            last_token_idx = attention_mask.sum(dim=1) - 1  # [batch_size]
            batch_idx = torch.arange(hidden_states.size(0), device=hidden_states.device)
            last_hidden = hidden_states[batch_idx, last_token_idx]  # [batch_size, hidden_size]
        else:
            last_hidden = hidden_states[:, -1, :]  # [batch_size, hidden_size]
        reward = self.reward_head(last_hidden)  # [batch_size, 1]
        return reward.squeeze(-1)  # [batch_size]


# Proximal Policy Optimization (PPO) Trainer
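# PPO maximizes the clipped surrogate objective
#     L(theta) = E[ min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t) ] - beta * KL_penalty
# where r_t = pi_theta(a_t | s_t) / pi_old(a_t | s_t) is the policy ratio. Here
# eps = 0.2 (the 0.8/1.2 clamp in ppo_step) and beta = kl_coef; the KL penalty is
# approximated at the token level from the gathered log-probabilities.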
class PPOTrainer:
    def __init__(self, policy_model, reward_model, tokenizer, kl_coef=0.2):
        self.policy_model = policy_model
        self.reward_model = reward_model
        self.tokenizer = tokenizer
        self.optimizer = AdamW(self.policy_model.model.parameters(), lr=1e-5)
        self.kl_coef = kl_coef

    def compute_advantages(self, rewards, values):
        # Compute advantages (simplified: no value baseline or GAE)
        advantages = rewards - values
        return advantages

    def ppo_step(self, batch):
        # Unpack batch
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        old_log_probs = batch['old_log_probs'].to(device)
        rewards = batch['rewards'].to(device)

        # Get current policy outputs
        outputs = self.policy_model.model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # [batch_size, seq_length, vocab_size]
        # Compute log probabilities
        log_probs = nn.functional.log_softmax(logits, dim=-1)  # [batch_size, seq_length, vocab_size]
        # Gather log probs for the selected tokens. Logits at position t predict
        # token t + 1, so shift the logits and labels by one position.
        selected_log_probs = log_probs[:, :-1, :].gather(
            2, input_ids[:, 1:].unsqueeze(-1)
        ).squeeze(-1)  # [batch_size, seq_length - 1]

        # Compute policy ratio
        ratios = torch.exp(selected_log_probs - old_log_probs)  # [batch_size, seq_length - 1]

        # Compute advantages
        values = torch.zeros_like(rewards).to(device)  # Placeholder for value estimates
        advantages = self.compute_advantages(rewards, values)  # [batch_size]
        advantages = advantages.unsqueeze(1)  # [batch_size, 1]

        # Compute PPO loss (clipped surrogate objective with eps = 0.2)
        surr1 = ratios * advantages  # [batch_size, seq_length - 1]
        surr2 = torch.clamp(ratios, 0.8, 1.2) * advantages  # [batch_size, seq_length - 1]
        policy_loss = -torch.min(surr1, surr2).mean()

        # Compute KL divergence for regularization (token-level approximation)
        kl_div = torch.nn.functional.kl_div(old_log_probs, selected_log_probs, log_target=True, reduction='batchmean')

        # Total loss
        loss = policy_loss + self.kl_coef * kl_div

        # Backpropagation
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item(), policy_loss.item(), kl_div.item()


# Monte Carlo Tree Search Node
class MCTSNode:
    def __init__(self, state, parent=None, prior_prob=1.0):
        self.state = state  # Sequence of tokens (input_ids)
        self.parent = parent
        self.children = {}
        self.visits = 0
        self.value = 0.0
        self.prior_prob = prior_prob

    def is_leaf(self):
        return len(self.children) == 0


# Monte Carlo Tree Search
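# Child selection follows a PUCT-style rule: each child is scored as Q + U, where
#     Q = value / visits   and   U = c_puct * P * sqrt(sum of sibling visits + 1) / (1 + visits),
# and P is the prior probability assigned to the token by the current policy.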
class MCTS:
    def __init__(self, model, tokenizer, reward_model, c_puct=1.0, n_simulations=50, max_depth=20):
        self.model = model
        self.tokenizer = tokenizer
        self.reward_model = reward_model
        self.c_puct = c_puct
        self.n_simulations = n_simulations
        self.max_depth = max_depth

    def search(self, state):
        root = MCTSNode(state)
        for _ in range(self.n_simulations):
            node = root
            # Selection
            while not node.is_leaf():
                node = self.select_child(node)
            # Expansion
            if len(node.state) < self.max_depth:
                self.expand_node(node)
            # Simulation
            reward = self.simulate(node.state)
            # Backpropagation
            self.backpropagate(node, reward)
        # Choose the best action
        if root.children:
            best_child = max(root.children.values(), key=lambda n: n.visits)
            return best_child.state
        else:
            # If no children, return the root state
            return root.state

    def select_child(self, node):
        total_visits = sum(child.visits for child in node.children.values())
        best_score = -float('inf')
        best_child = None
        for child in node.children.values():
            q_value = child.value / (child.visits + 1e-4)
            u_value = self.c_puct * child.prior_prob * math.sqrt(total_visits + 1) / (1 + child.visits)
            score = q_value + u_value
            if score > best_score:
                best_score = score
                best_child = child
        return best_child

    def expand_node(self, node):
        input_ids = torch.tensor(node.state).unsqueeze(0).to(device)
        with torch.no_grad():  # inference only; avoid building an autograd graph
            outputs = self.model.model(input_ids=input_ids)
        logits = outputs.logits[:, -1, :]  # [1, vocab_size]
        probs = nn.functional.softmax(logits, dim=-1).squeeze()  # [vocab_size]
        top_k_probs, top_k_indices = torch.topk(probs, k=10)
        for idx, prob in zip(top_k_indices, top_k_probs):
            token_id = idx.item()
            prior_prob = prob.item()
            child_state = node.state + [token_id]
            node.children[token_id] = MCTSNode(child_state, parent=node, prior_prob=prior_prob)

    def simulate(self, state):
        # Simulate to the end using the model
        input_ids = torch.tensor(state).unsqueeze(0).to(device)
        max_length = len(state) + 20
        outputs = self.model.model.generate(
            input_ids=input_ids,
            max_length=max_length,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.pad_token_id,
        )
        generated_ids = outputs[0].tolist()
        generated_text = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
        # Calculate reward using the reward model
        reward = self.evaluate(generated_text)
        return reward

    def backpropagate(self, node, reward):
        while node is not None:
            node.visits += 1
            node.value += reward
            node = node.parent

    def evaluate(self, text):
        # Use the reward model to evaluate the text
        input_ids = self.tokenizer.encode(text, return_tensors='pt').to(device)
        attention_mask = (input_ids != self.tokenizer.pad_token_id).long().to(device)
        with torch.no_grad():
            reward = self.reward_model(input_ids=input_ids, attention_mask=attention_mask)
        return reward.item()


# =======
# Self-Play Environment
# =======
class SelfPlayEnvironment:
    def __init__(self, agent):
        self.agent = agent

    def step(self, prompt):
        # Agent interacts with itself
        response = self.agent.generate_chain_of_thought(prompt, max_length=100)[0]
        # The agent then takes its own response as the next prompt
        next_prompt = response
        return next_prompt, response

    def play(self, initial_prompt, max_turns=5):
        prompt = initial_prompt
        for _ in range(max_turns):
            prompt, response = self.step(prompt)
        return response


# =======
# Visualization Functions
# =======
def plot_capabilities(capabilities, epoch, writer):
    import matplotlib.pyplot as plt

    labels = list(capabilities.keys())
    values = list(capabilities.values())

    angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
    values += values[:1]
    angles += angles[:1]

    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
    ax.plot(angles, values, 'o-', linewidth=2)
    ax.fill(angles, values, alpha=0.25)
    ax.set_thetagrids(np.degrees(angles[:-1]), labels)
    ax.set_ylim(0, 1)
    ax.set_title(f"LLM Capabilities at Step {epoch}")
    ax.grid(True)

    # Save the plot to TensorBoard
    writer.add_figure('Capabilities', fig, epoch)
    plt.close(fig)


# =======
# Training Functions
# =======
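# Training proceeds in two phases: the reward model is fitted first (on simple
# heuristic targets), and its scores then drive the PPO updates of the policy
# model inside train_open1.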

def train_reward_model(reward_model, tokenizer, data_samples, args):
    logging.info("Starting training of the Reward Model.")
    reward_model.train()
    optimizer = AdamW(list(reward_model.model.parameters()) + list(reward_model.reward_head.parameters()), lr=1e-5)

    # Create a dataset for the reward model
    class RewardDataset(Dataset):
        def __init__(self, tokenizer, data, max_length):
            self.tokenizer = tokenizer
            self.data = data
            self.max_length = max_length

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            input_text = self.data[idx]['input_text']
            reward = self.data[idx]['reward']
            input_ids = self.tokenizer.encode(input_text, truncation=True, max_length=self.max_length)
            return torch.tensor(input_ids, dtype=torch.long), torch.tensor(reward, dtype=torch.float)

    def reward_collate_fn(batch):
        input_ids, rewards = zip(*batch)
        input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
        attention_mask = (input_ids != tokenizer.pad_token_id).long()
        rewards = torch.stack(rewards)
        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'rewards': rewards}

    dataset = RewardDataset(tokenizer, data_samples, max_length=args.max_length)
    data_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True, collate_fn=reward_collate_fn)

    for epoch in range(args.reward_model_epochs):
        logging.info(f"Reward Model Epoch {epoch+1}/{args.reward_model_epochs}")
        total_loss = 0
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            rewards = batch['rewards'].to(device)

            # Forward pass
            predicted_rewards = reward_model(input_ids=input_ids, attention_mask=attention_mask)

            # Loss computation
            loss = nn.functional.mse_loss(predicted_rewards, rewards)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(data_loader)
        logging.info(f"Average Reward Model Loss: {avg_loss:.4f}")

    # Save the trained reward model
    torch.save(reward_model.state_dict(), 'reward_model.pth')
    logging.info("Reward Model training completed and saved.")


def train_open1(args):
    setup_logging()
    logging.info("Starting training of Open1 model.")

    # Initialize models and tokenizer
    policy_model = Open1Model(model_name='gpt2')
    tokenizer = policy_model.tokenizer
    reward_model = RewardModel(model_name='gpt2', tokenizer=tokenizer)
    # The embeddings were already resized when the pad token was added inside the
    # model constructors; this extra resize is a defensive no-op.
    if tokenizer.pad_token is not None:
        policy_model.model.resize_token_embeddings(len(tokenizer))
        reward_model.model.resize_token_embeddings(len(tokenizer))

    # Initialize PPO trainer
    ppo_trainer = PPOTrainer(policy_model, reward_model, tokenizer, kl_coef=args.kl_coef)

    # Initialize TensorBoard writer
    writer = SummaryWriter(log_dir=args.log_dir)
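    # The logged scalars and capability charts can be inspected with:
    #     tensorboard --logdir <log_dir>   (default: runs)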

    # Load a real dialogue dataset from the Hugging Face Hub
    dataset_name = 'daily_dialog'  # You can choose another dialogue dataset
    raw_dataset = load_dataset(dataset_name, split='train')

    # Prepare data samples: each consecutive pair of dialogue turns becomes a
    # (prompt, response) example.
    data_samples = []
    for entry in raw_dataset:
        if 'dialog' in entry:
            dialog = entry['dialog']
            for i in range(len(dialog) - 1):
                prompt = dialog[i]
                response = dialog[i + 1]
                data_samples.append({'prompt': prompt, 'response': response})

    # Limit dataset size for demonstration purposes
    data_samples = data_samples[:1000]

    # Prepare data for reward model training
    reward_data_samples = []
    for sample in data_samples:
        input_text = sample['prompt'] + " " + sample['response']
        reward = len(sample['response'].split()) / 50.0  # Simple word-count heuristic
        reward_data_samples.append({'input_text': input_text, 'reward': reward})
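    # NOTE: the word-count reward above is only a stand-in so the pipeline runs
    # end to end; in a real RLHF setup these targets would typically come from
    # human preference labels or comparisons.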

    # Train the reward model first
    train_reward_model(reward_model, tokenizer, reward_data_samples, args)
    reward_model.eval()  # Set reward model to evaluation mode

    # Create dataset and data loader for the policy model
    dataset = CustomDataset(tokenizer, data_samples, max_length=args.max_length)
    data_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True, collate_fn=lambda x: collate_fn(x, tokenizer))

    # Training loop
    for epoch in range(args.epochs):
        logging.info(f"Epoch {epoch+1}/{args.epochs}")
        for batch_idx, batch in enumerate(data_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Generate initial response with chain-of-thought
            generated_texts = []
            for input_id in input_ids:
                prompt = tokenizer.decode(input_id, skip_special_tokens=True)
                if args.use_mcts:
                    # Use MCTS for trajectory exploration
                    mcts = MCTS(policy_model, tokenizer, reward_model, n_simulations=args.n_simulations, max_depth=args.max_length)
                    initial_state = input_id.tolist()
                    best_state = mcts.search(initial_state)
                    generated_text = tokenizer.decode(best_state, skip_special_tokens=True)
                else:
                    # Regular generation
                    generated_text = policy_model.generate_chain_of_thought(prompt, max_length=args.max_length)[0]
                generated_texts.append(generated_text)

            # Prepare inputs for reward model
            generated_ids = [tokenizer.encode(text, return_tensors='pt').squeeze(0) for text in generated_texts]
            input_ids_gen = torch.nn.utils.rnn.pad_sequence(generated_ids, batch_first=True, padding_value=tokenizer.pad_token_id).to(device)
            attention_mask_gen = (input_ids_gen != tokenizer.pad_token_id).long().to(device)

            # Get old log probabilities under the current (pre-update) policy.
            # Logits at position t predict token t + 1, so the gather is shifted
            # to match PPOTrainer.ppo_step.
            with torch.no_grad():
                outputs = policy_model.model(input_ids=input_ids_gen, attention_mask=attention_mask_gen)
                logits = outputs.logits
                log_probs = nn.functional.log_softmax(logits, dim=-1)
                old_log_probs = log_probs[:, :-1, :].gather(
                    2, input_ids_gen[:, 1:].unsqueeze(-1)
                ).squeeze(-1)  # [batch_size, seq_length - 1]

            # Get rewards from reward model
            with torch.no_grad():
                rewards = reward_model(input_ids=input_ids_gen, attention_mask=attention_mask_gen)

            # Prepare batch
            batch_data = {
                'input_ids': input_ids_gen,
                'attention_mask': attention_mask_gen,
                'old_log_probs': old_log_probs,
                'rewards': rewards,
            }

            # PPO optimization step
            loss, policy_loss, kl_div = ppo_trainer.ppo_step(batch_data)

            # Logging
            logging.info(f"Batch {batch_idx+1}/{len(data_loader)}, Loss: {loss:.4f}, Policy Loss: {policy_loss:.4f}, KL Divergence: {kl_div:.4f}")
            writer.add_scalar('Loss/Total', loss, epoch * len(data_loader) + batch_idx)
            writer.add_scalar('Loss/Policy', policy_loss, epoch * len(data_loader) + batch_idx)
            writer.add_scalar('Loss/KL_Divergence', kl_div, epoch * len(data_loader) + batch_idx)

            # Visualize LLM capabilities on a hexagon (radar) chart.
            # The scores below are random placeholders; replace them with real
            # evaluation metrics when they become available.
            if args.visualize:
                capabilities = {
                    'Reasoning': random.uniform(0, 1),
                    'Creativity': random.uniform(0, 1),
                    'Memory': random.uniform(0, 1),
                    'Planning': random.uniform(0, 1),
                    'Problem Solving': random.uniform(0, 1),
                    'Social Intelligence': random.uniform(0, 1),
                }
                plot_capabilities(capabilities, epoch * len(data_loader) + batch_idx, writer)

            # (Optional) Self-Play Environment
            if args.multi_agent:
                env = SelfPlayEnvironment(policy_model)
                initial_prompt = "Hello, how are you?"
                final_response = env.play(initial_prompt)
                # Use the final response to further train the model if desired,
                # e.g. append it to the training data as an additional example.

            # Include function-calling practice
            if args.progressive_training:
                function_call_prompt = "def add(a, b): return a + b\nadd("
                generated_code = policy_model.generate_chain_of_thought(function_call_prompt, max_length=args.max_length)[0]
                # Optionally, evaluate the model on code generation tasks, e.g.
                # compare generated_code to the expected code and adjust rewards accordingly.

        # Save the trained policy model (and tokenizer) after each epoch
        policy_model.model.save_pretrained('open1_policy_model')
        tokenizer.save_pretrained('open1_policy_model')
        logging.info(f"Epoch {epoch+1} completed and model saved.")

    writer.close()
    logging.info("Training completed and model saved.")


# =======
# Main Function
# =======
if __name__ == "__main__":
    args = parse_args()
    setup_logging()
    train_open1(args)