SimpleTransformer
import math
import random


class SimpleTransformer:
    def __init__(self, d_model):
        """Initializes the transformer with a given embedding size (d_model)."""
        self.d_model = d_model
        # Random projection matrices for queries, keys, and values (d_model x d_model)
        self.W_q = [[random.uniform(-0.5, 0.5) for _ in range(d_model)] for _ in range(d_model)]
        self.W_k = [[random.uniform(-0.5, 0.5) for _ in range(d_model)] for _ in range(d_model)]
        self.W_v = [[random.uniform(-0.5, 0.5) for _ in range(d_model)] for _ in range(d_model)]

    def matmul(self, A, B):
        """Performs matrix multiplication of A and B."""
        return [[sum(A[i][k] * B[k][j] for k in range(len(B))) for j in range(len(B[0]))] for i in range(len(A))]

    def softmax(self, x):
        """Computes the softmax of a list of values (max-shifted for numerical stability)."""
        max_x = max(x)
        exp_x = [math.exp(i - max_x) for i in x]
        sum_exp = sum(exp_x)
        return [i / sum_exp for i in exp_x]

    def attention(self, Q, K, V):
        """Computes scaled dot-product attention: softmax(Q K^T / sqrt(d_model)) V."""
        scores = self.matmul(Q, list(map(list, zip(*K))))  # Q @ K^T
        scores = [[score / math.sqrt(self.d_model) for score in row] for row in scores]
        weights = [self.softmax(row) for row in scores]
        return self.matmul(weights, V)

    def forward(self, X):
        """Projects the input embeddings to Q, K, V and applies attention."""
        Q = self.matmul(X, self.W_q)
        K = self.matmul(X, self.W_k)
        V = self.matmul(X, self.W_v)
        return self.attention(Q, K, V)
# Tokenization & Embedding | |
def tokenize(sentence): | |
"""Splits a sentence into words.""" | |
return sentence.lower().split() | |
def generate_embeddings(tokens, d_model): | |
"""Creates random embeddings for tokens.""" | |
embedding_dict = {token: [random.uniform(-1, 1) for _ in range(d_model)] for token in set(tokens)} | |
return [embedding_dict[token] for token in tokens], embedding_dict | |
# Training Data (Input → Expected Response)
training_data = [
    ("hello", "hi"),
    ("how are you", "i am fine"),
    ("what is your name", "i am a chatbot"),
    ("bye", "goodbye"),
]


# Mean Squared Error Loss
def mse_loss(y_pred, y_true):
    """Computes Mean Squared Error (MSE) loss between two vectors."""
    return sum((yp - yt) ** 2 for yp, yt in zip(y_pred, y_true)) / len(y_pred)
# Training Function
def train(transformer, embedding_dict, learning_rate=0.01, epochs=100):
    """Trains the model by nudging input-word embeddings toward their target-word embeddings."""
    d_model = transformer.d_model
    for epoch in range(epochs):
        total_loss = 0
        for input_text, target_text in training_data:
            # Convert the first word of each sentence to its embedding
            input_tokens = tokenize(input_text)
            target_tokens = tokenize(target_text)
            if input_tokens[0] not in embedding_dict or target_tokens[0] not in embedding_dict:
                continue  # Skip unknown words
            input_embedding = [embedding_dict[input_tokens[0]]]
            target_embedding = embedding_dict[target_tokens[0]]
            # Forward pass through the transformer
            output_embedding = transformer.forward(input_embedding)[0]
            # Compute loss
            loss = mse_loss(output_embedding, target_embedding)
            total_loss += loss
            # Simple update: move the input embedding toward the target
            # (a heuristic nudge, not a true backpropagated gradient)
            for i in range(d_model):
                embedding_dict[input_tokens[0]][i] -= learning_rate * (output_embedding[i] - target_embedding[i])
        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss: {total_loss:.4f}")
# Find Closest Word
def closest_word(vector, embedding_dict):
    """Finds the word whose embedding is most similar to the given vector (cosine similarity)."""
    def cosine_similarity(v1, v2):
        dot = sum(a * b for a, b in zip(v1, v2))
        norm1 = math.sqrt(sum(a * a for a in v1))
        norm2 = math.sqrt(sum(b * b for b in v2))
        return dot / (norm1 * norm2 + 1e-9)  # Avoid division by zero

    best_match = None
    best_score = -float('inf')
    for word, embed in embedding_dict.items():
        score = cosine_similarity(vector, embed)
        if score > best_score:
            best_match = word
            best_score = score
    return best_match
# Chatbot Interaction
def chatbot():
    """Runs a simple chatbot loop."""
    d_model = 4
    transformer = SimpleTransformer(d_model)
    # Collect all words from the training data and build their embeddings
    all_words = set(word for pair in training_data for sentence in pair for word in tokenize(sentence))
    _, embedding_dict = generate_embeddings(list(all_words), d_model)
    # Train the model
    train(transformer, embedding_dict)
    print("Chatbot: Hello! Type something to chat. Type 'exit' to stop.")
    while True:
        user_input = input("You: ").strip().lower()
        if user_input == "exit":
            print("Chatbot: Goodbye!")
            break
        tokens = tokenize(user_input)
        if not tokens or tokens[0] not in embedding_dict:
            print("Chatbot: I don't understand.")
            continue
        input_embedding = [embedding_dict[tokens[0]]]
        transformer_output = transformer.forward(input_embedding)
        # Pick the response word whose embedding is closest to the transformer output
        response_word = closest_word(transformer_output[0], embedding_dict)
        print(f"Chatbot: {response_word}")


# Start the chatbot when run as a script
if __name__ == "__main__":
    chatbot()
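
A minimal, non-interactive sanity check can be handy before launching the chat loop. The snippet below is an illustrative sketch rather than part of the original script; the sentence "hello bye" and the dimension 4 are arbitrary choices. It embeds two tokens, runs one forward pass, and checks that attention returns one d_model-sized row per token.

# Illustrative sanity check (sketch only, reuses the functions defined above)
demo_model = SimpleTransformer(4)                 # arbitrary d_model for the demo
demo_tokens = tokenize("hello bye")               # arbitrary two-word input
demo_embeds, demo_dict = generate_embeddings(demo_tokens, 4)
demo_out = demo_model.forward(demo_embeds)
assert len(demo_out) == len(demo_tokens) and all(len(row) == 4 for row in demo_out)
print("Closest word to the first output row:", closest_word(demo_out[0], demo_dict))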