jacobsapps · June 2, 2025 16:47
diff --git a/gistfile1.txt b/gistfile1.txt
 import os
 import json
 import numpy as np
 import coremltools as ct
 from transformers import CLIPTokenizer

 # Build a JSON file of L2-normalized text embeddings: read "name – keyword"
 # pairs from names.txt, run each keyword through the Core ML text encoder,
 # and write one record per keyword that was encoded successfully.

 # Sequence length used for tokenizer padding and truncation.
 # NOTE(review): presumably the fixed input length the model expects — confirm.
 MAX_TOKENS = 77

 # 1. Load titles list with descriptions
 # A usable line has the form "<fullNameCopy> – <keyword>" (en dash, U+2013).
 # Lines without that separator are skipped.
 titles = []
 with open("names.txt", "r", encoding="utf-8") as f:
     for line in f:
         if "–" not in line:
             continue
         # Split on the first dash only so the keyword may itself contain one.
         fullNameCopy, keyword = map(str.strip, line.split("–", 1))
         titles.append({
             "fullNameCopy": fullNameCopy,
             "keyword": keyword,
         })

 # 2. Load CLIP-compatible tokenizer
 tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")

 # 3. Load BLT model
 # NOTE(review): this loads the model.mlmodel file inside the package rather
 # than the .mlpackage directory itself — confirm this is what coremltools
 # expects for this model type.
 model_path = os.path.join(
     "mobileclip_blt_text.mlpackage",
     "Data",
     "com.apple.CoreML",
     "model.mlmodel",
 )

 if not os.path.isfile(model_path):
     raise FileNotFoundError(f"❌ model.mlmodel not found at: {model_path}")

 print(f"📦 Loading model: {model_path}")
 model = ct.models.MLModel(model_path)

 entries = []

 # 4. Encode keywords and extract embeddings
 for item in titles:
     fullNameCopy = item["fullNameCopy"]
     keyword = item["keyword"]

     # Best effort: a keyword that fails is reported and skipped so the
     # remaining keywords are still processed.
     try:
         # Tokenize keyword using CLIP tokenizer
         encoded = tokenizer(
             keyword,
             return_tensors="np",
             padding="max_length",
             truncation=True,
             max_length=MAX_TOKENS,
         )
         input_ids = encoded["input_ids"].astype("int32")

         # Run CoreML model
         output = model.predict({"text": input_ids})
         print(f"✅ Output keys for '{fullNameCopy}': {output.keys()}")

         # The first output is taken to be the embedding.
         # NOTE(review): assumes the model's first output key is the text
         # embedding — verify against the model description.
         embedding_key = next(iter(output))
         embedding = np.asarray(output[embedding_key], dtype=np.float64).ravel()

         # Reject a zero or non-finite norm: dividing by it would produce
         # NaN/Infinity, which json.dump writes as tokens that are not
         # valid JSON.
         norm = np.linalg.norm(embedding)
         if not np.isfinite(norm) or norm == 0.0:
             raise ValueError(f"embedding norm is {norm}; cannot normalize")
         normalized = (embedding / norm).tolist()
     except Exception as e:
         print(f"⚠️ Failed to process '{fullNameCopy}': {e}")
         continue

     entries.append({
         "fullNameCopy": fullNameCopy,
         "keyword": keyword,
         "embedding": normalized,
     })

 # 5. Save results
 output_filename = "text_embeddings_blt.json"
 with open(output_filename, "w", encoding="utf-8") as f:
     # ensure_ascii=False keeps non-ASCII names readable in the output file.
     json.dump(entries, f, indent=2, ensure_ascii=False)

 print(f"✅ Saved embeddings to {output_filename}")
	import os
	import json
	import numpy as np
	import coremltools as ct
	from transformers import CLIPTokenizer

	# Build a JSON file of L2-normalized text embeddings: read "name – keyword"
	# pairs from names.txt, run each keyword through the Core ML text encoder,
	# and write one record per keyword that was encoded successfully.

	# Sequence length used for tokenizer padding and truncation.
	# NOTE(review): presumably the fixed input length the model expects — confirm.
	MAX_TOKENS = 77

	# 1. Load titles list with descriptions
	# A usable line has the form "<fullNameCopy> – <keyword>" (en dash, U+2013).
	# Lines without that separator are skipped.
	titles = []
	with open("names.txt", "r", encoding="utf-8") as f:
	    for line in f:
	        if "–" not in line:
	            continue
	        # Split on the first dash only so the keyword may itself contain one.
	        fullNameCopy, keyword = map(str.strip, line.split("–", 1))
	        titles.append({
	            "fullNameCopy": fullNameCopy,
	            "keyword": keyword,
	        })

	# 2. Load CLIP-compatible tokenizer
	tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")

	# 3. Load BLT model
	# NOTE(review): this loads the model.mlmodel file inside the package rather
	# than the .mlpackage directory itself — confirm this is what coremltools
	# expects for this model type.
	model_path = os.path.join(
	    "mobileclip_blt_text.mlpackage",
	    "Data",
	    "com.apple.CoreML",
	    "model.mlmodel",
	)

	if not os.path.isfile(model_path):
	    raise FileNotFoundError(f"❌ model.mlmodel not found at: {model_path}")

	print(f"📦 Loading model: {model_path}")
	model = ct.models.MLModel(model_path)

	entries = []

	# 4. Encode keywords and extract embeddings
	for item in titles:
	    fullNameCopy = item["fullNameCopy"]
	    keyword = item["keyword"]

	    # Best effort: a keyword that fails is reported and skipped so the
	    # remaining keywords are still processed.
	    try:
	        # Tokenize keyword using CLIP tokenizer
	        encoded = tokenizer(
	            keyword,
	            return_tensors="np",
	            padding="max_length",
	            truncation=True,
	            max_length=MAX_TOKENS,
	        )
	        input_ids = encoded["input_ids"].astype("int32")

	        # Run CoreML model
	        output = model.predict({"text": input_ids})
	        print(f"✅ Output keys for '{fullNameCopy}': {output.keys()}")

	        # The first output is taken to be the embedding.
	        # NOTE(review): assumes the model's first output key is the text
	        # embedding — verify against the model description.
	        embedding_key = next(iter(output))
	        embedding = np.asarray(output[embedding_key], dtype=np.float64).ravel()

	        # Reject a zero or non-finite norm: dividing by it would produce
	        # NaN/Infinity, which json.dump writes as tokens that are not
	        # valid JSON.
	        norm = np.linalg.norm(embedding)
	        if not np.isfinite(norm) or norm == 0.0:
	            raise ValueError(f"embedding norm is {norm}; cannot normalize")
	        normalized = (embedding / norm).tolist()
	    except Exception as e:
	        print(f"⚠️ Failed to process '{fullNameCopy}': {e}")
	        continue

	    entries.append({
	        "fullNameCopy": fullNameCopy,
	        "keyword": keyword,
	        "embedding": normalized,
	    })

	# 5. Save results
	output_filename = "text_embeddings_blt.json"
	with open(output_filename, "w", encoding="utf-8") as f:
	    # ensure_ascii=False keeps non-ASCII names readable in the output file.
	    json.dump(entries, f, indent=2, ensure_ascii=False)

	print(f"✅ Saved embeddings to {output_filename}")