@Naman-ntc · Created January 23, 2024 16:42
import subprocess
import tempfile
from time import sleep

from openai import OpenAI


def start_model_server(model_name: str, client: OpenAI):
    def get_model():
        # Query the OpenAI-compatible endpoint for the id of the served model;
        # this doubles as a health check while the server boots.
        models = client.models.list()
        model = models.data[0].id
        return model

    import torch

    num_gpus = torch.cuda.device_count()
    command_args = [
        "python",
        "-m",
        "vllm.entrypoints.openai.api_server",
        "--model",
        model_name,
        "--tensor-parallel-size",
        f"{num_gpus}",
        "--max-model-len",
        "4096",
        "--host",
        "localhost",
        "--port",
        "8000",
        "--enforce-eager",
    ]
    # Capture the server's stdout/stderr in temp files so its logs survive the run.
    stdout_tempfile = tempfile.NamedTemporaryFile("w", delete=False)
    stderr_tempfile = tempfile.NamedTemporaryFile("w", delete=False)
    print(f"Logging model outputs at {stdout_tempfile.name} and {stderr_tempfile.name}")
    process = subprocess.Popen(command_args, stdout=stdout_tempfile, stderr=stderr_tempfile)

    def wait_for_server():
        # Poll until the server answers, retrying every 10 seconds
        # (a loop rather than recursion, to avoid hitting the recursion limit).
        while True:
            try:
                model = get_model()
                assert model
                print("Model server started successfully!")
                return True
            except Exception:
                sleep(10)

    wait_for_server()
    sleep(10)
    return process
## once the model server starts
client = OpenAI(api_key="EMPTY", base_url="http://127.0.0.1:8000/v1")
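
## A minimal sketch of launching the server with the helper above; reusing
## args.model assumes the same argparse-style namespace as the completion call below.
process = start_model_server(args.model, client)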
## use it as an openai server (code reuse!)
## args (an argparse-style namespace) and prompt are defined elsewhere in the script
response = client.completions.create(
    model=args.model,
    prompt=prompt,
    echo=False,
    n=args.n,
    max_tokens=args.max_tokens,
    temperature=args.temperature,
    top_p=args.top_p,
    frequency_penalty=0,
    presence_penalty=0,
    stream=False,
)
outputs = [c.text for c in response.choices]
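
## A sketch of shutting the server down once generation is done, assuming the
## `process` handle from the launch sketch above; terminate() signals the vLLM
## subprocess and wait() reaps it.
process.terminate()
process.wait()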