Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save jacobsapps/78a32c6257db42af773bb75e432fb7b7 to your computer and use it in GitHub Desktop.
Save jacobsapps/78a32c6257db42af773bb75e432fb7b7 to your computer and use it in GitHub Desktop.
import os
import json
import numpy as np
import coremltools as ct
from transformers import CLIPTokenizer
# Separator between the display name and its keyword on each names.txt line.
# Written as an escape (U+2013, en dash) so the literal cannot be corrupted
# when this file is saved or copied with the wrong text encoding.
NAME_KEYWORD_SEPARATOR = "\u2013"

NAMES_PATH = "names.txt"
TOKENIZER_NAME = "openai/clip-vit-base-patch32"
# Every keyword is padded/truncated to exactly this many tokens.
MAX_TOKEN_LENGTH = 77
MODEL_PATH = os.path.join(
    "mobileclip_blt_text.mlpackage",
    "Data",
    "com.apple.CoreML",
    "model.mlmodel",
)
OUTPUT_FILENAME = "text_embeddings_blt.json"


def parse_title_line(line, separator=NAME_KEYWORD_SEPARATOR):
    """Split one ``names.txt`` line into its name and keyword.

    Args:
        line: Raw line text, with or without a trailing newline.
        separator: Text dividing the name from the keyword. Only the first
            occurrence splits the line; later ones stay in the keyword.

    Returns:
        A dict with ``"fullNameCopy"`` and ``"keyword"`` (both stripped of
        surrounding whitespace), or ``None`` when the line has no separator.
    """
    if separator not in line:
        return None
    full_name, keyword = (
        part.strip() for part in line.strip().split(separator, 1)
    )
    return {"fullNameCopy": full_name, "keyword": keyword}


def load_titles(path=NAMES_PATH):
    """Read *path* as UTF-8 and return the parsed entry of every line that
    contains the name/keyword separator, in file order."""
    with open(path, "r", encoding="utf-8") as f:
        parsed = (parse_title_line(line) for line in f)
        return [entry for entry in parsed if entry is not None]


def normalize_embedding(raw):
    """Flatten *raw* and scale it to unit Euclidean length.

    Args:
        raw: Array-like of numbers of any shape.

    Returns:
        A flat list of Python floats whose L2 norm is 1.

    Raises:
        ValueError: If the norm is zero or not finite. Dividing anyway would
            yield NaN/inf values, which ``json.dump`` writes as non-standard
            tokens that strict JSON parsers reject.
    """
    vector = np.asarray(raw, dtype=np.float64).ravel()
    norm = np.linalg.norm(vector)
    if not np.isfinite(norm) or norm == 0.0:
        raise ValueError(
            "embedding has a zero or non-finite norm and cannot be normalized"
        )
    return (vector / norm).tolist()


def run_text_model(tokenizer, model, keyword):
    """Tokenize *keyword* and return the model's raw prediction dict.

    The token ids are cast to int32 and passed under the input name
    ``"text"``.
    """
    encoded = tokenizer(
        keyword,
        return_tensors="np",
        padding="max_length",
        truncation=True,
        max_length=MAX_TOKEN_LENGTH,
    )
    input_ids = encoded["input_ids"].astype("int32")
    return model.predict({"text": input_ids})


def build_entries(titles, tokenizer, model):
    """Embed the keyword of every title and return the successful results.

    Processing is best effort: an item whose tokenization, prediction or
    normalization fails is reported on stdout and skipped, so one bad entry
    does not abort the whole run.

    Returns:
        A list of dicts with ``"fullNameCopy"``, ``"keyword"`` and the
        unit-length ``"embedding"``.
    """
    entries = []
    for item in titles:
        full_name = item["fullNameCopy"]
        keyword = item["keyword"]
        try:
            output = run_text_model(tokenizer, model, keyword)
            print(f"✅ Output keys for '{full_name}': {output.keys()}")
            # The first output of the model is taken as the embedding.
            embedding_key = next(iter(output))
            normalized = normalize_embedding(output[embedding_key])
        except Exception as e:  # deliberate: report and move on to the next item
            print(f"⚠️ Failed to process '{full_name}': {e}")
            continue
        entries.append({
            "fullNameCopy": full_name,
            "keyword": keyword,
            "embedding": normalized,
        })
    return entries


def main():
    """Load the names, embed each keyword, and write the results as JSON.

    Raises:
        FileNotFoundError: If the model file is missing at ``MODEL_PATH``.
    """
    # 1. Load titles list with descriptions
    titles = load_titles()

    # 2. Load CLIP-compatible tokenizer
    tokenizer = CLIPTokenizer.from_pretrained(TOKENIZER_NAME)

    # 3. Load BLT model
    if not os.path.isfile(MODEL_PATH):
        raise FileNotFoundError(f"❌ model.mlmodel not found at: {MODEL_PATH}")
    print(f"📦 Loading model: {MODEL_PATH}")
    model = ct.models.MLModel(MODEL_PATH)

    # 4. Encode keywords and extract embeddings
    entries = build_entries(titles, tokenizer, model)

    # 5. Save results
    with open(OUTPUT_FILENAME, "w", encoding="utf-8") as f:
        json.dump(entries, f, indent=2, ensure_ascii=False)
    print(f"✅ Saved embeddings to {OUTPUT_FILENAME}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment