Created
June 2, 2025 16:35
-
-
Save jacobsapps/7bac61f11e91b3b9978b199e5f279041 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Encode each category label with the text model and collect its
# L2-normalized embedding.
# NOTE(review): `categories`, `tokenizer`, `model` and `entries` are defined
# outside this excerpt -- presumably an iterable of label strings, a CLIP
# tokenizer, a loaded CoreML model and a list; confirm against the full file.
for text in categories:
    try:
        # Tokenize text using CLIP tokenizer, padded/truncated to 77 tokens
        encoded = tokenizer(
            text,
            return_tensors="np",
            padding="max_length",
            truncation=True,
            max_length=77,
        )
        input_ids = encoded["input_ids"].astype("int32")

        # CoreML model expects the tokens under "text"
        output = model.predict({"text": input_ids})
        print(f"✅ Output keys for '{text}': {output.keys()}")
        if not output:
            raise ValueError("model returned no outputs")

        # The first output is taken to be the embedding
        embedding_key = next(iter(output))
        vector = np.asarray(output[embedding_key], dtype=np.float64).ravel()

        # Refuse to normalize a zero or non-finite vector: dividing would
        # yield NaN/inf, which json.dump would write as invalid JSON tokens.
        norm = np.linalg.norm(vector)
        if not np.isfinite(norm) or norm == 0.0:
            raise ValueError(f"cannot normalize embedding with norm {norm}")
        normalized = (vector / norm).tolist()
    except Exception as e:
        # Best-effort: report the failing label and move on to the next one
        print(f"⚠️ Failed to process '{text}': {e}")
    else:
        # Save normalized embedding with label
        entries.append({
            "label": text,
            "embedding": normalized,
        })

# Persist every successfully processed label/embedding pair as JSON
output_filename = "text_embeddings.json"
with open(output_filename, "w", encoding="utf-8") as f:
    json.dump(entries, f, indent=2)
print(f"✅ Saved embeddings to {output_filename}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment