import os
import json
import numpy as np
import coremltools as ct
from transformers import CLIPTokenizer
# 1. Load labels
with open("stats.txt", "r") as f:
    categories = [line.strip() for line in f if line.strip()]
# 2. Load CLIP-compatible tokenizer
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
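# MobileCLIP's text encoder shares OpenAI CLIP's BPE vocabulary and 77-token
# context length, so the standard CLIP tokenizer should produce compatible IDs.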
# 3. Build the path to the Core ML text encoder inside the .mlpackage
model_path = os.path.join(
    "mobileclip_blt_text.mlpackage",
    "Data",
    "com.apple.CoreML",
    "model.mlmodel"
)
# 4. Validate model path
if not os.path.isfile(model_path):
    raise FileNotFoundError(f"❌ model.mlmodel not found at: {model_path}")
print(f"📦 Loading model: {model_path}")
model = ct.models.MLModel(model_path)
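# Note: model.predict() runs through the Core ML runtime, so prediction requires
# macOS; on Linux, coremltools can convert models but not execute them.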
entries = []
# Encode text and extract embeddings
for text in categories:
    try:
        # Tokenize the label with the CLIP tokenizer
        encoded = tokenizer(
            text,
            return_tensors="np",
            padding="max_length",
            truncation=True,
            max_length=77
        )
        # Rank-2 shape (1, 77), int32 dtype
        input_ids = encoded["input_ids"].astype("int32")
        # The Core ML model expects the tokens under the "text" input
        input_data = {
            "text": input_ids
        }
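        # If your conversion used a different input name, inspect it with:
        #   print(model.get_spec().description.input)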
        # Run prediction with the Core ML model
        output = model.predict(input_data)
        print(f"✅ Output keys for '{text}': {output.keys()}")
        # Extract the embedding from the first output key
        embedding_key = list(output.keys())[0]
        embedding = output[embedding_key]
        embedding = embedding.flatten().tolist() if hasattr(embedding, "flatten") else list(embedding)
        # L2-normalize so cosine similarity can later be computed as a plain dot product
        norm = np.linalg.norm(embedding)
        normalized = (np.array(embedding) / norm).tolist()
        # Save the normalized embedding with its label
        entries.append({
            "label": text,
            "embedding": normalized
        })
    except Exception as e:
        print(f"⚠️ Failed to process '{text}': {e}")
        continue
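# Write all label/embedding pairs out as a single JSON array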
output_filename = "text_embeddings.json"
with open(output_filename, "w") as f:
    json.dump(entries, f, indent=2)
print(f"✅ Saved embeddings to {output_filename}")