Ollama command line interface with Markdown rendering
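The script below is a small command line client for a locally running Ollama server. It posts the prompt to the /api/generate endpoint, reads the streamed response, and renders it live in the terminal as Markdown using the rich library; it also supports running the same prompt against several models, a --raw mode that writes plain text to stdout, and a --stats summary of the generation. The endpoint streams newline-delimited JSON, one object per generated chunk; illustrative (not verbatim) lines look roughly like this:

    {"model": "llama3", "created_at": "2025-01-28T10:00:00Z", "response": "Hello", "done": false}
    {"model": "llama3", "created_at": "2025-01-28T10:00:05Z", "response": "", "done": true, "total_duration": 5000000000, "load_duration": 400000000, "prompt_eval_count": 12, "prompt_eval_duration": 200000000, "eval_count": 48, "eval_duration": 4000000000}

Each intermediate object carries a "response" fragment; the final object has "done": true and the timing counters (in nanoseconds) that the --stats table reports.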
#!/usr/bin/env python3
import argparse
import io
import json
import os
import sys
import urllib.parse
import urllib.request
from typing import Optional, NamedTuple

import rich.console
import rich.live
import rich.markdown
import rich.panel
import rich.spinner
import rich.text
import rich.table
import rich.pretty

# Ollama endpoint; OLLAMA_HOST can point at a non-default server.
query_url = urllib.parse.urljoin(
    os.environ.get("OLLAMA_HOST", "http://localhost:11434"), "/api/generate"
)


class ExecutionParams(NamedTuple):
    raw: bool
    stats: bool
    seed: Optional[int] = None
    temperature: Optional[float] = None


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--models",
        "--model",
        default="llama3",
        help="The name of the models to run. Multiple models can be specified separated by commas",
    )
    parser.add_argument(
        "--raw",
        action="store_true",
        help="Do not format the results with markdown. Output the results to stdout",
    )
    parser.add_argument(
        "--stats",
        action="store_true",
        help="Output some stats about the generation at the end",
    )
    parser.add_argument(
        "--seed",
        type=int,
        help="The seed to use in the model evaluation. "
        "For deterministic output you must also set temperature=0.",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        help="The temperature to use in the model evaluation. A value of 0 means the model"
        " will always pick the highest probability token. Values above 0 add some"
        " randomness to token selection and generally make the model more creative."
        " For deterministic output, set this to 0 and set a seed to a non-zero constant."
        " Default in ollama is 0.8.",
    )
    parser.add_argument("prompt", nargs="*")
    args = vars(parser.parse_args())

    raw = args["raw"]
    # The prompt comes from the positional arguments, falling back to stdin.
    if args["prompt"]:
        prompt = " ".join(args["prompt"])
    else:
        prompt = sys.stdin.read()

    # In raw mode all rich output goes to stderr so stdout carries only the model text.
    console = rich.console.Console(stderr=raw)
    if not raw:
        console.print(rich.panel.Panel.fit("[bold]Prompt"))
        console.print(rich.text.Text(prompt))

    models = args["models"].split(",")
    for model in models:
        run_ollama(
            model,
            prompt,
            console,
            ExecutionParams(
                raw=raw,
                stats=args["stats"],
                seed=args["seed"],
                temperature=args["temperature"],
            ),
        )


def run_ollama(model, prompt, console: rich.console.Console, params: ExecutionParams):
    raw = params.raw
    if not raw:
        console.print()
        console.print(rich.panel.Panel.fit("[bold]" + model))

    req_data = {
        "model": model,
        "prompt": prompt,
        "options": {},
    }
    if params.temperature is not None:
        req_data["options"]["temperature"] = params.temperature
    if params.seed is not None:
        req_data["options"]["seed"] = params.seed

    request = urllib.request.Request(
        query_url,
        data=json.dumps(req_data).encode("utf-8"),
        headers={
            "Content-Type": "application/json; charset=utf-8",
        },
    )

    output = io.StringIO()
    spinner = rich.spinner.Spinner(
        "dots", text="Loading model...", style="status.spinner", speed=1.0
    )
    stats_data: Optional[dict] = None

    with rich.live.Live(
        spinner,
        console=console,
        vertical_overflow="ellipsis",
        refresh_per_second=12.5,
        transient=raw,
    ) as live:
        response = urllib.request.urlopen(request)
        with response:
            # The endpoint streams one JSON object per line.
            response_buf = io.TextIOWrapper(response, encoding="utf-8")
            while True:
                buf = response_buf.readline()
                if not buf:
                    break
                data = json.loads(buf)
                if data["done"]:
                    # The final message carries the generation statistics.
                    stats_data = data
                    if raw:
                        sys.stdout.write("\n")
                    break
                if raw:
                    live.stop()
                    sys.stdout.write(data["response"])
                else:
                    output.write(data["response"])
                    live.update(rich.markdown.Markdown(output.getvalue()))

    if params.stats and stats_data:
        console.print()
        table = rich.table.Table("Generation Stats", box=None)
        table.add_column()
        table.add_column()

        def format_int(val):
            return format(val, "n")

        def format_float(val):
            return format(val, ".2f")

        # Durations reported by the API are in nanoseconds.
        table.add_row("[bold]Model", stats_data.get("model"))
        table.add_row("[bold]Created At", stats_data.get("created_at"))
        table.add_row(
            "[bold]Total Duration (s)",
            format_float(int(stats_data.get("total_duration")) / 10**9),
        )
        table.add_row(
            "[bold]Load Duration (s)",
            format_float(int(stats_data.get("load_duration")) / 10**9),
        )
        if "prompt_eval_count" in stats_data:
            table.add_row(
                "[bold]Input Tokens", format_int(int(stats_data.get("prompt_eval_count")))
            )
            table.add_row(
                "[bold]Prompt Evaluation Duration (s)",
                format_float(int(stats_data.get("prompt_eval_duration")) / 10**9),
            )
        table.add_row(
            "[bold]Response Tokens", format_int(int(stats_data.get("eval_count")))
        )
        table.add_row(
            "[bold]Response Evaluation Duration (s)",
            format_float(int(stats_data.get("eval_duration")) / 10**9),
        )
        table.add_row(
            "[bold]Tokens per second",
            format_float(
                int(stats_data.get("eval_count"))
                / int(stats_data.get("eval_duration"))
                * 10**9
            ),
        )
        console.print(table)


if __name__ == "__main__":
    main()
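Assuming the script is saved as, say, ollama_md.py and made executable (the filename is not part of the gist), typical invocations might look like:

    ./ollama_md.py --model llama3 --stats "Summarize the CAP theorem in one paragraph"
    cat prompt.txt | ./ollama_md.py --models llama3,mistral
    ./ollama_md.py --raw "Write release notes for v1.2" > notes.md

The first form renders the answer as live-updating Markdown and prints the generation statistics table; the second reads the prompt from stdin and runs it against each listed model in turn; the third skips Markdown rendering and writes the plain response to stdout so it can be redirected to a file. Setting the OLLAMA_HOST environment variable points the script at a non-default server.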