Created
June 2, 2025 16:39
-
-
Save jacobsapps/d6c4b98bed861728e0ffedd2cf4b5361 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import json | |
import numpy as np | |
import coremltools as ct | |
from transformers import CLIPTokenizer | |
# 1. Load labels
# One label per line; lines that are empty after stripping are skipped.
# The encoding is pinned to UTF-8 so non-ASCII labels decode the same way
# on every platform instead of depending on the locale default.
with open("stats.txt", "r", encoding="utf-8") as f:
    categories = [line.strip() for line in f if line.strip()]
# 2. Load CLIP-compatible tokenizer
tokenizer = CLIPTokenizer.from_pretrained(
    "openai/clip-vit-base-patch32"
)

# 3. Build the path to the model spec stored inside the .mlpackage bundle
_MODEL_PATH_PARTS = (
    "mobileclip_blt_text.mlpackage",
    "Data",
    "com.apple.CoreML",
    "model.mlmodel",
)
model_path = os.path.join(*_MODEL_PATH_PARTS)

# 4. Load the model, stopping early when the spec file is not on disk
if os.path.isfile(model_path):
    print(f"📦 Loading model: {model_path}")
    model = ct.models.MLModel(model_path)
else:
    raise FileNotFoundError(f"❌ model.mlmodel not found at: {model_path}")
def _unit_vector(values):
    """Return *values* flattened to 1-D and scaled to unit L2 norm.

    Args:
        values: Array-like of numbers (any shape).

    Returns:
        A flat list of Python floats whose L2 norm is 1.

    Raises:
        ValueError: If the L2 norm is zero or not finite. Dividing by such a
            norm would yield NaN/inf values, which ``json.dump`` writes as
            bare ``NaN``/``Infinity`` tokens that strict JSON parsers reject.
    """
    # float64 keeps the arithmetic identical to normalizing a list of
    # Python floats.
    vector = np.asarray(values, dtype=np.float64).ravel()
    norm = np.linalg.norm(vector)
    if not np.isfinite(norm) or norm == 0.0:
        raise ValueError(f"cannot normalize embedding with norm {norm}")
    return (vector / norm).tolist()


entries = []
# Encode text and extract embeddings
for text in categories:
    # Best-effort per label: a failure is reported and the label is skipped
    # so one bad entry does not abort the whole run.
    try:
        # Tokenize text using CLIP tokenizer, padded/truncated to 77 tokens
        encoded = tokenizer(
            text,
            return_tensors="np",
            padding="max_length",
            truncation=True,
            max_length=77,
        )
        # Rank-2 shape (1, 77) and int32 type
        input_ids = encoded["input_ids"].astype("int32")

        # CoreML model expects the tokens under "text"
        output = model.predict({"text": input_ids})
        print(f"✅ Output keys for '{text}': {output.keys()}")
        if not output:
            raise ValueError("model returned no outputs")

        # NOTE(review): assumes the first output is the text embedding --
        # confirm against the model's output description.
        embedding_key = next(iter(output))

        # Save normalized embedding with label
        entries.append({
            "label": text,
            "embedding": _unit_vector(output[embedding_key]),
        })
    except Exception as e:
        print(f"⚠️ Failed to process '{text}': {e}")
# Persist all collected label/embedding entries as indented JSON
output_filename = "text_embeddings.json"
with open(output_filename, "w") as out_file:
    out_file.write(json.dumps(entries, indent=2))
print(f"✅ Saved embeddings to {output_filename}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment