import os
import json
import numpy as np
import coremltools as ct
from transformers import CLIPTokenizer

# 1. Load titles list with descriptions
# Each line of names.txt holds a full name and a keyword separated by the delimiter below.
titles = []
with open("names.txt", "r", encoding="utf-8") as f:
    for line in f:
        if "β" in line:
            fullNameCopy, keyword = map(str.strip, line.strip().split("β", 1))
            titles.append({
                "fullNameCopy": fullNameCopy,
                "keyword": keyword
            })

# 2. Load CLIP-compatible tokenizer
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")

# 3. Load the MobileCLIP BLT text encoder (Core ML)
model_path = os.path.join(
    "mobileclip_blt_text.mlpackage",
    "Data",
    "com.apple.CoreML",
    "model.mlmodel"
)
if not os.path.isfile(model_path):
    raise FileNotFoundError(f"model.mlmodel not found at: {model_path}")

print(f"Loading model: {model_path}")
model = ct.models.MLModel(model_path)

entries = []

# 4. Encode keywords and extract embeddings
for item in titles:
    fullNameCopy = item["fullNameCopy"]
    keyword = item["keyword"]
    try:
        # Tokenize the keyword using the CLIP tokenizer (77-token context length)
        encoded = tokenizer(
            keyword,
            return_tensors="np",
            padding="max_length",
            truncation=True,
            max_length=77
        )
        input_ids = encoded["input_ids"].astype("int32")
        input_data = {"text": input_ids}

        # Run the Core ML model
        output = model.predict(input_data)
        print(f"Output keys for '{fullNameCopy}': {output.keys()}")

        # Extract and L2-normalize the embedding
        embedding_key = list(output.keys())[0]
        embedding = np.array(output[embedding_key]).flatten()
        normalized = (embedding / np.linalg.norm(embedding)).tolist()

        entries.append({
            "fullNameCopy": fullNameCopy,
            "keyword": keyword,
            "embedding": normalized
        })
    except Exception as e:
        print(f"Failed to process '{fullNameCopy}': {e}")
        continue

# 5. Save results
output_filename = "text_embeddings_blt.json"
with open(output_filename, "w", encoding="utf-8") as f:
    json.dump(entries, f, indent=2, ensure_ascii=False)

print(f"Saved embeddings to {output_filename}")