@rahulunair
Created July 12, 2024 22:42
A simple script for embedding text using the bge-m3 model on Intel XPUs
import logging
import warnings

# Silence noisy library logging before importing the heavy dependencies.
logging.basicConfig(level=logging.ERROR)
st_logger = logging.getLogger("sentence_transformers")
st_logger.setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

import argparse
import time

import torch
import intel_extension_for_pytorch as ipex  # noqa: F401  registers the torch.xpu backend
from accelerate import Accelerator
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer


def display_xpu_memory(device):
    print(
        "You can use `xpu-smi dump -m18` in another shell to monitor the GPU memory utilization in realtime."
    )
    if str(device).startswith("xpu"):
        print(
            "XPU Memory Allocated:",
            float(torch.xpu.memory_allocated(device)) / 1024 / 1024,
            " MB.",
        )
        print(
            "XPU Memory Reserved:",
            float(torch.xpu.memory_reserved(device)) / 1024 / 1024,
            " MB.",
        )


def parse_arguments():
    parser = argparse.ArgumentParser(
        description="Compute embeddings on Intel XPUs with optional Accelerate multi-XPU support."
    )
    parser.add_argument(
        "--use_accelerate",
        action="store_true",
        help="Enable running on multiple XPUs using Accelerate. Use 'accelerate launch script.py --use_accelerate' to run.",
    )
    return parser.parse_args()


def compute_transformer_embeddings(model, tokenizer, sentences, device):
    start_time = time.time()
    encoded_input = tokenizer(
        sentences, padding=True, truncation=True, return_tensors="pt"
    ).to(device)
    with torch.inference_mode():
        with torch.xpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
            # Take the [CLS] token of the last hidden state as the sentence embedding.
            output = model(**encoded_input)[0][:, 0]
    embeddings = torch.nn.functional.normalize(output, p=2, dim=1)
    duration = time.time() - start_time
    print(f"Transformer embeddings computed in {duration:.2f} seconds on {device}")
    return embeddings


def compute_st_embeddings(st_model, sentences, device):
    start_time = time.time()
    with torch.inference_mode():
        with torch.xpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
            embeddings = st_model.encode(
                sentences,
                convert_to_tensor=True,
                device=device,
                normalize_embeddings=True,
            )
    duration = time.time() - start_time
    print(
        f"\nSentence Transformer embeddings computed in {duration:.2f} seconds on {device}"
    )
    return embeddings


def device_selection(args, use_only_cpu=False):
    """If using Accelerate, use accelerator.device for multi-XPU; otherwise use the first XPU."""
    if use_only_cpu:
        return "cpu"
    if args.use_accelerate:
        accelerator = Accelerator()
        device = accelerator.device  # each launched process is assigned one of the available XPUs
    else:
        device = (
            "xpu:0" if torch.xpu.is_available() else "cpu"
        )  # can use just "xpu", or "xpu:0", "xpu:1", ... up to "xpu:7"
    print(f"Using device: {device}")
    return device


def main():
    args = parse_arguments()
    device = device_selection(args)
    embedding_model = "BAAI/bge-m3"
    tokenizer = AutoTokenizer.from_pretrained(embedding_model)
    model = AutoModel.from_pretrained(embedding_model).to(device)
    st_model = SentenceTransformer(embedding_model).to(device)
    sentences = [
        "In the year 2029, the residents of the distant planet Gliese were on the brink of a discovery.",
        "Known for their advanced computing, the Gliesians had developed a network of supercomputers.",
        "These machines were not just tools but companions, integrated with AI that possessed emotional intelligence.",
        "As Earth's signal finally reached Gliese, the computers detected the anomaly almost instantly.",
        "The computers simulated countless scenarios, optimizing the message for clarity and warmth.",
        "Finally, the response was ready, directing the powerful beams of data toward Earth.",
    ]
    transformer_embeddings = compute_transformer_embeddings(
        model, tokenizer, sentences, device
    )
    display_xpu_memory(device)
    st_embeddings = compute_st_embeddings(st_model, sentences, device)
    display_xpu_memory(device)


if __name__ == "__main__":
    main()
@rahulunair (Author)

This script is designed to run on Intel GPUs (XPUs) and demonstrates embedding computations using PyTorch and the Intel Extension for PyTorch (IPEX). There are two primary ways to launch the script, depending on whether you're using a single device or multiple devices.
Prerequisites

Ensure your environment is set up correctly before running the script. Activate the pytorch_xpu environment, which should have PyTorch and the Intel Extension for PyTorch (IPEX) installed. You can do this by running:

conda activate pytorch_xpu
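
To quickly verify that the XPU backend is visible (an optional sanity check, not part of the gist itself), you can run:

python -c "import torch, intel_extension_for_pytorch; print(torch.xpu.is_available())"

This should print True on a correctly configured machine.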

Running the Script

Single Device Execution:

To run the script on a single device, use the standard Python command:

python embed_xpu.py

This command runs the script on the default device: the first Intel GPU (which the script selects explicitly as xpu:0) when one is available, falling back to the CPU otherwise.
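
If you are unsure how many XPUs the machine exposes, a short snippet like the following (assuming the same pytorch_xpu environment) lists them:

import torch
import intel_extension_for_pytorch  # registers the torch.xpu backend

for i in range(torch.xpu.device_count()):
    print(f"xpu:{i} -> {torch.xpu.get_device_name(i)}")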

Multiple XPUs Execution:

To leverage multiple XPUs, use the accelerate toolkit:

accelerate launch embed_xpu.py --use_accelerate
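
Under accelerate launch, one process is started per device and each process gets its own accelerator.device. A minimal sketch (illustrative only; the sentence list and sharding pattern below are not part of the gist) of how each process could embed a different slice of the input:

from accelerate import Accelerator

accelerator = Accelerator()
print(f"process {accelerator.process_index}/{accelerator.num_processes} on {accelerator.device}")

# Shard the inputs so each XPU embeds a different slice.
sentences = ["first text", "second text", "third text", "fourth text"]
my_sentences = sentences[accelerator.process_index :: accelerator.num_processes]
print(f"rank {accelerator.process_index} embeds {my_sentences}")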

Other Details

Memory Utilization Functions: the script includes a helper (display_xpu_memory) that reports XPU memory allocated and reserved via torch.xpu.memory_allocated and torch.xpu.memory_reserved. This is useful for keeping an eye on resource consumption while the models run.
Device Allocation: the script explicitly moves the model, the tokenized inputs, and the SentenceTransformer to an XPU device, so all computation happens on the XPU; the sketch below shows the same pattern in isolation.
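
A minimal, self-contained sketch of that allocation-plus-monitoring pattern (assuming an XPU-enabled PyTorch build with IPEX installed; the tensor shape is arbitrary):

import torch
import intel_extension_for_pytorch  # registers the torch.xpu backend

device = "xpu:0" if torch.xpu.is_available() else "cpu"
x = torch.randn(4096, 4096).to(device)  # explicit allocation on the XPU
if str(device).startswith("xpu"):
    print("allocated:", torch.xpu.memory_allocated(device) / 1024 / 1024, "MB")
    print("reserved:", torch.xpu.memory_reserved(device) / 1024 / 1024, "MB")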
