jacobsapps · June 2, 2025 16:39
diff --git a/gistfile1.txt b/gistfile1.txt
 import os
 import json
 import numpy as np
 import coremltools as ct
 from transformers import CLIPTokenizer

 # Export L2-normalized text embeddings for every label in stats.txt.
 #
 # Each non-blank line of stats.txt is tokenized with the CLIP tokenizer, run
 # through the Core ML text encoder, scaled to unit length and written to
 # text_embeddings.json as a list of {"label": ..., "embedding": [...]} records.
 # Labels that fail at any step are reported and skipped (best effort).

 # 1. Load labels (one per line, blank lines ignored).
 # UTF-8 is requested explicitly so decoding does not depend on the platform default.
 with open("stats.txt", "r", encoding="utf-8") as f:
     categories = [label for label in map(str.strip, f) if label]

 # 2. Load CLIP-compatible tokenizer
 tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")

 # 3. Build the path to the model file inside the .mlpackage bundle
 model_path = os.path.join(
     "mobileclip_blt_text.mlpackage",
     "Data",
     "com.apple.CoreML",
     "model.mlmodel",
 )

 # 4. Validate model path before handing it to coremltools
 if not os.path.isfile(model_path):
     raise FileNotFoundError(f"❌ model.mlmodel not found at: {model_path}")

 print(f"📦 Loading model: {model_path}")
 model = ct.models.MLModel(model_path)

 entries = []

 # 5. Encode every label and collect its unit-length embedding
 for text in categories:
     try:
         # Pad/truncate to exactly 77 tokens so every label has the same length.
         encoded = tokenizer(
             text,
             return_tensors="np",
             padding="max_length",
             truncation=True,
             max_length=77,
         )

         # Token ids are cast to int32 and passed under the "text" input name.
         input_ids = encoded["input_ids"].astype("int32")
         output = model.predict({"text": input_ids})
         print(f"✅ Output keys for '{text}': {output.keys()}")

         # NOTE(review): the first output is taken as the embedding; this
         # assumes the model exposes a single output — confirm against the model spec.
         embedding_key = next(iter(output))
         embedding = np.asarray(output[embedding_key], dtype=np.float64).ravel()

         # Dividing by a zero or non-finite norm would put NaN/inf into the
         # JSON file; raise instead so the label is skipped by the handler below.
         norm = np.linalg.norm(embedding)
         if not np.isfinite(norm) or norm == 0.0:
             raise ValueError(f"embedding norm is {norm}; cannot normalize")

         # Save normalized embedding with label
         entries.append({
             "label": text,
             "embedding": (embedding / norm).tolist(),
         })

     except Exception as e:
         # Deliberate best effort: one bad label must not abort the whole export.
         print(f"⚠️ Failed to process '{text}': {e}")

 # 6. Write all collected embeddings in one JSON document
 output_filename = "text_embeddings.json"
 with open(output_filename, "w", encoding="utf-8") as f:
     json.dump(entries, f, indent=2)

 print(f"✅ Saved embeddings to {output_filename}")
	import os
	import json
	import numpy as np
	import coremltools as ct
	from transformers import CLIPTokenizer

	# Export L2-normalized text embeddings for every label in stats.txt.
	#
	# Each non-blank line of stats.txt is tokenized with the CLIP tokenizer, run
	# through the Core ML text encoder, scaled to unit length and written to
	# text_embeddings.json as a list of {"label": ..., "embedding": [...]} records.
	# Labels that fail at any step are reported and skipped (best effort).

	# 1. Load labels (one per line, blank lines ignored).
	# UTF-8 is requested explicitly so decoding does not depend on the platform default.
	with open("stats.txt", "r", encoding="utf-8") as f:
	    categories = [label for label in map(str.strip, f) if label]

	# 2. Load CLIP-compatible tokenizer
	tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")

	# 3. Build the path to the model file inside the .mlpackage bundle
	model_path = os.path.join(
	    "mobileclip_blt_text.mlpackage",
	    "Data",
	    "com.apple.CoreML",
	    "model.mlmodel",
	)

	# 4. Validate model path before handing it to coremltools
	if not os.path.isfile(model_path):
	    raise FileNotFoundError(f"❌ model.mlmodel not found at: {model_path}")

	print(f"📦 Loading model: {model_path}")
	model = ct.models.MLModel(model_path)

	entries = []

	# 5. Encode every label and collect its unit-length embedding
	for text in categories:
	    try:
	        # Pad/truncate to exactly 77 tokens so every label has the same length.
	        encoded = tokenizer(
	            text,
	            return_tensors="np",
	            padding="max_length",
	            truncation=True,
	            max_length=77,
	        )

	        # Token ids are cast to int32 and passed under the "text" input name.
	        input_ids = encoded["input_ids"].astype("int32")
	        output = model.predict({"text": input_ids})
	        print(f"✅ Output keys for '{text}': {output.keys()}")

	        # NOTE(review): the first output is taken as the embedding; this
	        # assumes the model exposes a single output — confirm against the model spec.
	        embedding_key = next(iter(output))
	        embedding = np.asarray(output[embedding_key], dtype=np.float64).ravel()

	        # Dividing by a zero or non-finite norm would put NaN/inf into the
	        # JSON file; raise instead so the label is skipped by the handler below.
	        norm = np.linalg.norm(embedding)
	        if not np.isfinite(norm) or norm == 0.0:
	            raise ValueError(f"embedding norm is {norm}; cannot normalize")

	        # Save normalized embedding with label
	        entries.append({
	            "label": text,
	            "embedding": (embedding / norm).tolist(),
	        })

	    except Exception as e:
	        # Deliberate best effort: one bad label must not abort the whole export.
	        print(f"⚠️ Failed to process '{text}': {e}")

	# 6. Write all collected embeddings in one JSON document
	output_filename = "text_embeddings.json"
	with open(output_filename, "w", encoding="utf-8") as f:
	    json.dump(entries, f, indent=2)

	print(f"✅ Saved embeddings to {output_filename}")