Skip to content

Instantly share code, notes, and snippets.

@tripolskypetr
Created April 6, 2025 12:31
Show Gist options
  • Save tripolskypetr/65e4fbfdd4ae6d14a3f7417b82a6337c to your computer and use it in GitHub Desktop.
Save tripolskypetr/65e4fbfdd4ae6d14a3f7417b82a6337c to your computer and use it in GitHub Desktop.
create-word2vec-embedding.ts
/**
 * Builds a deterministic, unit-length embedding for a piece of text.
 *
 * No model is trained here: every distinct word gets a vector derived purely
 * from its character codes, and the document vector is the sum, over every
 * word position, of the averaged word vectors inside that position's context
 * window.
 *
 * @param text - Input text. It is lower-cased and every character other than
 *   Cyrillic `а-я` / `ё`, Latin `a-z` and whitespace is removed before the
 *   text is split on whitespace.
 * @param vectorSize - Number of dimensions of the returned vector.
 * @param windowSize - Number of neighbouring words taken on each side of a
 *   position when forming its context window.
 * @returns A vector of length `vectorSize` scaled to unit Euclidean length, or
 *   the unscaled (all-zero) vector when its magnitude is zero, e.g. for text
 *   that contains no words.
 */
const createWord2VecEmbedding = (text: string, vectorSize = 128, windowSize = 2): number[] => {
  // Text preprocessing. `ё` must be listed explicitly: it sits outside the
  // contiguous `а-я` code-point range and would otherwise be stripped.
  const words = text
    .toLowerCase()
    .replace(/[^а-яёa-z\s]/g, '')
    .trim()
    .split(/\s+/)
    .filter(word => word.length > 0);

  // One deterministic vector per distinct word, derived from its characters.
  const wordVectors = new Map<string, Float32Array>();
  for (const word of words) {
    if (wordVectors.has(word)) {
      continue;
    }
    const vector = new Float32Array(vectorSize);
    for (let i = 0; i < word.length; i++) {
      const charCode = word.charCodeAt(i);
      // The char code picks the slot; the sign alternates with the character
      // position inside the word.
      const seed = charCode % vectorSize;
      vector[seed] += (charCode / 1000) * (i % 2 === 0 ? 1 : -1);
    }
    wordVectors.set(word, vector);
  }

  // Context windows: each position contributes the mean of the word vectors
  // found in its window (the word itself included).
  const documentVector = new Float32Array(vectorSize);
  words.forEach((_currentWord, position) => {
    const contextWords = words.slice(
      Math.max(0, position - windowSize),
      Math.min(words.length, position + windowSize + 1)
    );
    for (const contextWord of contextWords) {
      const contextVector = wordVectors.get(contextWord);
      if (contextVector === undefined) {
        // Cannot happen: every word was registered in the loop above.
        continue;
      }
      contextVector.forEach((value, index) => {
        documentVector[index] += value / contextWords.length;
      });
    }
  });

  // Normalisation to unit length; a zero vector is returned as-is to avoid
  // dividing by zero.
  const magnitude = Math.sqrt(documentVector.reduce((sum, val) => sum + val * val, 0));
  const result = Array.from(documentVector);
  return magnitude !== 0 ? result.map(v => v / magnitude) : result;
};
/**
 * Computes the cosine similarity of two vectors using tensor operations:
 * `sum(a * b) / (norm(a) * norm(b))`.
 *
 * Every intermediate tensor is tracked as soon as it is created and disposed
 * in a `finally` block, so tensors are released even when one of the
 * operations or the asynchronous read-back throws.
 *
 * NOTE(review): there is no guard for zero-norm inputs; the result in that
 * case is whatever `div` yields for a zero denominator — confirm this is
 * acceptable to callers.
 *
 * @param a - First vector.
 * @param b - Second vector; presumably expected to have the same length as
 *   `a` — verify against callers.
 * @returns Promise resolving to the first element read back from the
 *   resulting cosine tensor.
 */
const calculateSimilarity = async (a: number[], b: number[]) => {
  const allocated: Array<{ dispose(): void }> = [];
  // Registers a tensor for disposal and hands it back unchanged.
  const track = <T extends { dispose(): void }>(tensor: T): T => {
    allocated.push(tensor);
    return tensor;
  };

  try {
    const tensorA = track(tensor1d(a));
    const tensorB = track(tensor1d(b));
    const dotProduct = track(sum(mul(tensorA, tensorB)));
    const normA = track(norm(tensorA));
    const normB = track(norm(tensorB));
    const normProduct = track(mul(normA, normB));
    const cosineTensor = track(div(dotProduct, normProduct));
    const [similarity] = await cosineTensor.data();
    return similarity;
  } finally {
    // Released in creation order, on both the success and the failure path.
    for (const tensor of allocated) {
      tensor.dispose();
    }
  }
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment