A simple script for embedding text using the bge-m3 model on Intel XPUs
import logging
import warnings

logging.basicConfig(level=logging.ERROR)
st_logger = logging.getLogger("sentence_transformers")
st_logger.setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

import argparse
import time

import torch
import intel_extension_for_pytorch as ipex  # noqa: F401  # side-effect import: registers the "xpu" device with PyTorch
from accelerate import Accelerator
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
def display_xpu_memory(device):
    print(
        "You can use `xpu-smi dump -m18` in another shell to monitor GPU memory utilization in real time."
    )
    if str(device).startswith("xpu"):
        # memory_allocated / memory_reserved return bytes; divide twice by 1024 for MB
        print(
            "XPU Memory Allocated:",
            float(torch.xpu.memory_allocated(device)) / 1024 / 1024,
            " MB.",
        )
        print(
            "XPU Memory Reserved:",
            float(torch.xpu.memory_reserved(device)) / 1024 / 1024,
            " MB.",
        )
def parse_arguments():
    parser = argparse.ArgumentParser(
        description="Run embeddings compute on Intel XPUs with optional Accelerate multi-XPU support."
    )
    parser.add_argument(
        "--use_accelerate",
        action="store_true",
        help="Enable running on multiple XPUs using Accelerate. Use 'accelerate launch script.py --use_accelerate' to run.",
    )
    return parser.parse_args()
def compute_transformer_embeddings(model, tokenizer, sentences, device):
    start_time = time.time()
    encoded_input = tokenizer(
        sentences, padding=True, truncation=True, return_tensors="pt"
    ).to(device)
    with torch.inference_mode():
        with torch.xpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
            # Use the hidden state of the first ([CLS]) token as the sentence embedding
            output = model(**encoded_input)[0][:, 0]
    embeddings = torch.nn.functional.normalize(output, p=2, dim=1)
    duration = time.time() - start_time
    print(f"Transformer embeddings computed in {duration:.2f} seconds on {device}")
    return embeddings
def compute_st_embeddings(st_model, sentences, device):
    # SentenceTransformer handles tokenization internally, so no tokenizer argument is needed
    start_time = time.time()
    with torch.inference_mode():
        with torch.xpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
            embeddings = st_model.encode(
                sentences,
                convert_to_tensor=True,
                device=device,
                normalize_embeddings=True,
            )
    duration = time.time() - start_time
    print(
        f"\nSentence Transformer embeddings computed in {duration:.2f} seconds on {device}"
    )
    return embeddings
def device_selection(args, use_only_cpu=False):
    """If using Accelerate, use accelerator.device for multi-XPU; otherwise use the first XPU."""
    if use_only_cpu:
        return "cpu"
    accelerator = Accelerator()
    if args.use_accelerate:
        # Accelerate automatically assigns each launched process to one of the available 8 XPUs
        device = accelerator.device
    else:
        # Can also use just "xpu", or "xpu:0", "xpu:1", etc., up to "xpu:7"
        device = "xpu:0" if torch.xpu.is_available() else "cpu"
    print(f"Using device: {device}")
    return device
def main():
    args = parse_arguments()
    device = device_selection(args)
    embedding_model = "BAAI/bge-m3"
    tokenizer = AutoTokenizer.from_pretrained(embedding_model)
    model = AutoModel.from_pretrained(embedding_model).to(device)
    st_model = SentenceTransformer(embedding_model).to(device)
    sentences = [
        "In the year 2029, the residents of the distant planet Gliese were on the brink of a discovery.",
        "Known for their advanced computing, the Gliesians had developed a network of supercomputers.",
        "These machines were not just tools but companions, integrated with AI that possessed emotional intelligence.",
        "As Earth's signal finally reached Gliese, the computers detected the anomaly almost instantly.",
        "The computers simulated countless scenarios, optimizing the message for clarity and warmth.",
        "Finally, the response was ready, directing the powerful beams of data toward Earth.",
    ]
    transformer_embeddings = compute_transformer_embeddings(
        model, tokenizer, sentences, device
    )
    display_xpu_memory(device)
    st_embeddings = compute_st_embeddings(st_model, sentences, device)
    display_xpu_memory(device)


if __name__ == "__main__":
    main()
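A natural follow-up (not part of the gist): both helpers return L2-normalized embeddings, so pairwise cosine similarity reduces to a plain matrix product. For example, at the end of main():

# Hypothetical addition: rows of st_embeddings are unit length, so this
# matrix product yields the 6x6 cosine-similarity matrix of the sentences.
similarities = st_embeddings @ st_embeddings.T
print(similarities.cpu())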
This script is designed to run on Intel GPUs (XPUs) and demonstrates embedding computations using PyTorch and the Intel Extension for PyTorch (IPEX). There are two primary ways to launch the script, depending on whether you're using a single device or multiple devices.
Prerequisites
Ensure your environment is set up correctly before running the script. Activate the pytorch_xpu environment, which should have PyTorch and the Intel Extension for PyTorch installed; a typical activation command is shown below.
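Assuming the pytorch_xpu environment was created with conda (the exact activation command is not preserved here), activation would look like:

conda activate pytorch_xpu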
Running the Script
Single Device Execution:
To run the script on a single device, use the standard Python command shown below. This runs the script on the default device, which will be the first Intel GPU (xpu:0) unless specified otherwise.
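Assuming the file is saved as script.py (the name used in the script's own help text):

python script.py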
Multiple XPUs Execution:
To leverage multiple XPUs, launch the script with the Accelerate toolkit and the --use_accelerate flag, as shown below.
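This is the launch command given in the script's help text; depending on your Accelerate configuration, you may also want options such as --num_processes:

accelerate launch script.py --use_accelerate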
Other details
Memory Utilization Functions: The script includes a helper, display_xpu_memory, to report XPU memory usage; you can also watch utilization live with xpu-smi, as shown below. Monitoring memory is crucial for managing resources effectively and ensuring your application runs efficiently on the XPU.
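The script itself suggests this monitoring command; run it from another shell while the script is working:

xpu-smi dump -m18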
Device Allocation: The script demonstrates how to explicitly move the models and the tokenized inputs to an XPU device, which ensures that all computations are performed on the XPU; see the sketch below.
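A minimal sketch of that pattern, mirroring what main() and compute_transformer_embeddings() do (model name and device string taken from the script):

import torch
import intel_extension_for_pytorch as ipex  # registers the "xpu" device with PyTorch
from transformers import AutoModel, AutoTokenizer

device = "xpu:0" if torch.xpu.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-m3")  # the tokenizer itself runs on CPU
model = AutoModel.from_pretrained("BAAI/bge-m3").to(device)  # move the weights to the XPU
inputs = tokenizer(["hello xpu"], return_tensors="pt").to(device)  # inputs must be on the same device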