Skip to content

Instantly share code, notes, and snippets.

Show Gist options
Save jacobsapps/7bac61f11e91b3b9978b199e5f279041 to your computer and use it in GitHub Desktop.
# Encode each category string with the CLIP tokenizer, run it through the
# CoreML text encoder, and collect its L2-normalized embedding in `entries`.
for text in categories:
    try:
        # Tokenize to a fixed-length sequence of 77 token IDs — the context
        # length CLIP's text encoder expects.
        encoded = tokenizer(
            text,
            return_tensors="np",
            padding="max_length",
            truncation=True,
            max_length=77,
        )
        # CoreML expects int32 token IDs under the input name "text".
        input_ids = encoded["input_ids"].astype("int32")
        input_data = {
            "text": input_ids
        }
        # Run prediction using the CoreML model.
        output = model.predict(input_data)
        print(f"✅ Output keys for '{text}': {output.keys()}")
        # The model appears to expose a single output; take its first key.
        # NOTE(review): if the model ever grows multiple outputs, replace this
        # arbitrary first-key pick with an explicit output name.
        embedding_key = next(iter(output))
        # Flatten to 1-D directly as an ndarray — avoids the original
        # ndarray -> Python list -> ndarray round trip before normalizing.
        embedding = np.asarray(output[embedding_key]).flatten()
        norm = np.linalg.norm(embedding)
        # Guard against a zero vector: dividing by 0 would put NaN into the
        # result, and json.dump would then emit non-standard NaN tokens,
        # producing a file strict JSON parsers reject.
        if norm == 0:
            raise ValueError("embedding has zero norm")
        normalized = (embedding / norm).tolist()
        # Save the normalized embedding together with its label.
        entries.append({
            "label": text,
            "embedding": normalized
        })
    except Exception as e:
        # Best-effort per-category processing: log and continue so one bad
        # category does not abort the whole run.
        print(f"⚠️ Failed to process '{text}': {e}")
        continue
# Persist every {label, embedding} entry as pretty-printed JSON.
output_filename = "text_embeddings.json"
serialized = json.dumps(entries, indent=2)
with open(output_filename, "w") as f:
    f.write(serialized)
print(f"✅ Saved embeddings to {output_filename}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment