## nv_gpu_stat.py - monitor GPU usage in jupyter notebook -- [email protected]
## https://gist.github.com/rhee-elten/1a1070e3a812ca863c3b937b5180b2f8
# pylint: disable=invalid-name
# pylint: disable=using-constant-test
# pylint: disable=wrong-import-position
# pylint: disable=missing-class-docstring
# pylint: disable=missing-function-docstring
if True:
    import os

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # make sure device indices follow PCI bus order
    os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"

    # disable numpy warnings
    import warnings

    warnings.filterwarnings("ignore")
import sys
import io
import os
from os.path import expanduser, dirname
from time import sleep, perf_counter, time
from socket import gethostname
from datetime import datetime
import threading
import subprocess
import numpy as np
import pandas as pd
import h5py
import matplotlib.pyplot as plt
def nv_gpu_stat():
    """
    example output:

    print(nv_gpu_stat().to_string(index=False))

     index            name  temperature.gpu  utilization.gpu [%]  power.draw [W]  memory.used [MiB]  memory.total [MiB]
         0  Tesla M40 24GB               17                    0           17.46                  0               24478
         1  Tesla M40 24GB               20                    0           18.23                  0               24478
         2  Tesla M40 24GB               19                    0           18.53                  0               24478
         3  Tesla M40 24GB               20                    0           18.83                  0               24478
    """
    query_gpu = "--query-gpu=index,name,temperature.gpu,utilization.gpu,power.draw,memory.used,memory.total"
    query_format = "--format=csv,nounits"
    # try the common nvidia-smi locations: PATH, the Windows install dir, and WSL
    for nvidia_smi in [
        "nvidia-smi",
        "C:\\Program Files\\NVIDIA Corporation\\NVSMI\\nvidia-smi.exe",
        "/usr/lib/wsl/lib/nvidia-smi",
    ]:
        try:
            proc = subprocess.run(
                [nvidia_smi, query_gpu, query_format], capture_output=True
            )
            break
        except FileNotFoundError:
            continue  # keep trying the other candidate paths
    else:
        raise RuntimeError("nvidia-smi not found")
    assert proc.returncode == 0, f"proc failure exitcode: {proc.returncode}"
    return pd.read_csv(io.StringIO(proc.stdout.decode("ascii")))
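

## Illustrative usage sketch (not part of the original gist): pull one metric out of
## the DataFrame returned by nv_gpu_stat(). Note that pandas keeps the leading space
## that nvidia-smi's csv output puts after each comma, so most column names start
## with a space (e.g. " memory.used [MiB]").
def _example_print_memory_used():
    stat = nv_gpu_stat()
    for idx, used in zip(stat["index"], stat[" memory.used [MiB]"]):
        print("GPU {}: {} MiB used".format(idx, used))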


NV_GPU_STAT_COLLECT_FILE = expanduser("~/.nv_gpu_stat/collect.h5")


def nv_gpu_stat_collect(
    collect_file=None, collect_interval=0.5, collect_length=2048, verbose=0
):
    collect_file = collect_file or NV_GPU_STAT_COLLECT_FILE
    os.makedirs(dirname(collect_file), exist_ok=True)
    with h5py.File(collect_file, "a") as f:
        # call nv_gpu_stat to get num_gpus
        gpu_stat = nv_gpu_stat()
        num_gpus = gpu_stat.values.shape[0]
        if "timestamp" not in f:
            f.create_dataset("timestamp", (collect_length,), np.float64)
            f["timestamp"][:] = np.nan
        if "temperature_gpu" not in f:
            f.create_dataset("temperature_gpu", (collect_length, num_gpus), np.float64)
            f["temperature_gpu"][:] = np.nan
        if "utilization_gpu" not in f:
            f.create_dataset("utilization_gpu", (collect_length, num_gpus), np.float64)
            f["utilization_gpu"][:] = np.nan
        if "power_draw" not in f:
            f.create_dataset("power_draw", (collect_length, num_gpus), np.float64)
            f["power_draw"][:] = np.nan
        if "memory_used" not in f:
            f.create_dataset("memory_used", (collect_length, num_gpus), np.float64)
            f["memory_used"][:] = np.nan
        timestamp = f["timestamp"][:]
        temperature_gpu = f["temperature_gpu"][:]
        utilization_gpu = f["utilization_gpu"][:]
        power_draw = f["power_draw"][:]
        memory_used = f["memory_used"][:]
    with h5py.File(collect_file, "a") as f:
        while True:
            t_now = time()
            if verbose:
                print(">>> nv_gpu_stat: time:", t_now)
            gpu_stat = nv_gpu_stat()
            # roll
            timestamp = np.roll(timestamp, -1, axis=0)
            temperature_gpu = np.roll(temperature_gpu, -1, axis=0)
            utilization_gpu = np.roll(utilization_gpu, -1, axis=0)
            power_draw = np.roll(power_draw, -1, axis=0)
            memory_used = np.roll(memory_used, -1, axis=0)
            # put new measure
            timestamp[-1] = t_now
            temperature_gpu[-1, :] = gpu_stat[" temperature.gpu"].values
            utilization_gpu[-1, :] = gpu_stat[" utilization.gpu [%]"].values
            power_draw[-1, :] = gpu_stat[" power.draw [W]"].values
            memory_used[-1, :] = gpu_stat[" memory.used [MiB]"].values
            # write back to h5 file
            f["timestamp"][:] = timestamp
            f["temperature_gpu"][:] = temperature_gpu
            f["utilization_gpu"][:] = utilization_gpu
            f["power_draw"][:] = power_draw
            f["memory_used"][:] = memory_used
            t_next = t_now + collect_interval
            t_sleep = t_next - time()
            if t_sleep > 0.0:
                sleep(t_sleep)
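

## Illustrative sketch (not part of the original gist): start the collector in a
## background daemon thread so sampling continues while other notebook cells run.
## The daemon flag is an assumption here; nv_gpu_stat_collect() itself never returns.
def _example_start_collector(interval=1.0):
    thr = threading.Thread(
        target=nv_gpu_stat_collect,
        kwargs=dict(collect_interval=interval),
        daemon=True,  # do not block interpreter shutdown
    )
    thr.start()
    return thr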


def nv_gpu_stat_query(time_up_to=None, collect_file=None):
    time_up_to = time_up_to or time()
    collect_file = collect_file or NV_GPU_STAT_COLLECT_FILE
    try:
        with h5py.File(collect_file, "r") as f:
            timestamp = f["timestamp"][:]
            temperature_gpu = f["temperature_gpu"][:]
            utilization_gpu = f["utilization_gpu"][:]
            power_draw = f["power_draw"][:]
            memory_used = f["memory_used"][:]
    except Exception:
        _, exc_value, _ = sys.exc_info()
        print("*** ", exc_value)
        return None
    if time_up_to:
        in_range = timestamp <= time_up_to
        timestamp = timestamp[in_range]
        temperature_gpu = temperature_gpu[in_range]
        utilization_gpu = utilization_gpu[in_range]
        power_draw = power_draw[in_range]
        memory_used = memory_used[in_range]
    return dict(
        timestamp=timestamp,
        temperature_gpu=temperature_gpu,
        utilization_gpu=utilization_gpu,
        power_draw=power_draw,
        memory_used=memory_used,
    )
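

## Illustrative sketch (assumes a collector has already been populating collect.h5):
## average the per-GPU utilization over the most recent `seconds` of samples.
def _example_mean_utilization(seconds=60.0):
    collect = nv_gpu_stat_query()
    if collect is None:
        return None
    ts = collect["timestamp"]
    util = collect["utilization_gpu"]
    recent = ts >= (time() - seconds)  # boolean mask over the sample axis
    if not np.any(recent):
        return None
    return np.nanmean(util[recent], axis=0)  # one mean value per GPU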


def nv_gpu_plot_values(
    time_series,
    values,
    t_now=None,
    ax=None,
    plot_seconds=300.0,
    stat_seconds=60.0,
    plot_config=None,
    min_ymax=10.0,
    plot_kws=None,
):
    plot_kws = plot_kws or dict()
    ax = ax or plt.gca()
    time_series = np.asarray(time_series)
    t_now = t_now or time_series[-1]
    values = np.asarray(values)
    plot_config = plot_config or dict(title="values", value_fmt="{:>5.1f}", ylim=None)
    title = plot_config["title"]
    value_fmt = plot_config["value_fmt"]
    if not callable(value_fmt):
        value_fmt = lambda x, _fmt=value_fmt: _fmt.format(x)
    ylim = plot_config["ylim"]
    if time_series[0] + stat_seconds > time_series[-1]:
        stat_seconds = time_series[-1] - time_series[0]
    stat_select = (t_now - stat_seconds <= time_series) & (time_series <= t_now)
    for i, ser in enumerate(np.transpose(values)):
        y_stat = ser[stat_select]
        # current value (last sample)
        val_last = ser[-1]
        if len(y_stat) > 0:
            val_mean = np.nanmean(y_stat)
            val_max = np.nanmax(y_stat)
            # do plot
            label = "G{:d} {:s}, avg={:s}, max={:s}".format(
                i, value_fmt(val_last), value_fmt(val_mean), value_fmt(val_max)
            )
        else:
            # do plot
            label = "G{:d} {:s}".format(i, value_fmt(val_last))
        ax.plot(time_series - t_now, ser, label=label, **plot_kws)
    ax.set_title(title)
    ax.legend(loc="upper left", prop={"size": 8}, bbox_to_anchor=(1, 1))
    # auto-scale the ylim upper bound, with a 10% margin on each side
    max_win = np.amax(values)
    max_win = max(min_ymax, max_win)
    ylim = ylim or [-max_win * 0.1, max_win * 1.1]
    ax.set_ylim(ylim)
    # dashed vertical line marking the stat window, clipped to ylim
    dyn_y_lim = ax.get_ylim()
    ax.vlines(-stat_seconds, *dyn_y_lim, ls="dashed", lw=1.0, color="k", alpha=0.5)
    ax.set_xlim([-plot_seconds, 0])
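

## Illustrative sketch (assumed usage, not in the original gist): plot a single metric
## with nv_gpu_plot_values() on its own axes, outside nv_gpu_stat_draw().
def _example_plot_utilization(plot_seconds=120.0):
    collect = nv_gpu_stat_query()
    if collect is None or len(collect["timestamp"]) == 0:
        return
    fig, ax = plt.subplots(figsize=(6.0, 2.5), dpi=100)
    nv_gpu_plot_values(
        collect["timestamp"],
        collect["utilization_gpu"],
        ax=ax,
        plot_seconds=plot_seconds,
        plot_config=dict(title="utilization.gpu [%]", value_fmt="{:>3.0f}%", ylim=None),
    )
    fig.tight_layout()
    plt.show()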


def nv_gpu_stat_draw(
    t_now=None,
    collect_file=None,
    values_dict=None,
    plot_configs=None,
    plot_context=None,
    hostname=None,
    figsize=(5.4, 7.4),
    dpi=100,
    plot_seconds=300.0,
    clear_output=None,
    **kwargs
):
    t_now = t_now or time()
    hostname = hostname or gethostname()
    if values_dict is None:
        values_dict = nv_gpu_stat_query(collect_file=collect_file)
    if plot_context is None:
        plot_context = dict()
    if plot_configs is None:
        plot_configs = dict(
            temperature_gpu=dict(
                title="temperature.gpu", value_fmt="{:>3.0f}", ylim=None
            ),
            utilization_gpu=dict(
                title="utilization.gpu [%]", value_fmt="{:>3.0f}%", ylim=None
            ),
            power_draw=dict(title="power.draw [W]", value_fmt="{:>3.0f}", ylim=None),
            memory_used=dict(title="memory.used [GiB]", value_fmt="{:>4.1f}", ylim=None),
        )
    timestamp = np.asarray(values_dict["timestamp"])
    temperature_gpu = np.asarray(values_dict["temperature_gpu"])
    utilization_gpu = np.asarray(values_dict["utilization_gpu"])
    power_draw = np.asarray(values_dict["power_draw"])
    memory_used = np.asarray(values_dict["memory_used"])
    memory_used = memory_used / 1024  # MiB -> GiB
    plot_select = (t_now - plot_seconds <= timestamp) & (timestamp <= t_now)
    if not np.any(plot_select):
        return
    timestamp = timestamp[plot_select]
    temperature_gpu = temperature_gpu[plot_select, :]
    utilization_gpu = utilization_gpu[plot_select, :]
    power_draw = power_draw[plot_select, :]
    memory_used = memory_used[plot_select, :]
    new_fig_created = False
    if "fig" in plot_context and "axs" in plot_context:
        fig = plot_context["fig"]
        axs = plot_context["axs"]
        for ax in axs:
            ax.clear()  # ax.cla()
    else:
        if callable(clear_output):
            clear_output(wait=True)
        fig, axs = plt.subplots(4, 1, figsize=figsize, dpi=dpi)
        axs = axs.flatten()
        plot_context["fig"] = fig
        plot_context["axs"] = axs
        new_fig_created = True
    plot_kws = dict(lw=1, alpha=0.55)
    nv_gpu_plot_values(
        timestamp,
        utilization_gpu,
        ax=axs[0],
        t_now=t_now,
        plot_seconds=plot_seconds,
        plot_config=plot_configs["utilization_gpu"],
        plot_kws=plot_kws,
        **kwargs
    )
    nv_gpu_plot_values(
        timestamp,
        memory_used,
        ax=axs[1],
        t_now=t_now,
        plot_seconds=plot_seconds,
        plot_config=plot_configs["memory_used"],
        plot_kws=plot_kws,
        **kwargs
    )
    nv_gpu_plot_values(
        timestamp,
        power_draw,
        ax=axs[2],
        t_now=t_now,
        plot_seconds=plot_seconds,
        plot_config=plot_configs["power_draw"],
        plot_kws=plot_kws,
        **kwargs
    )
    nv_gpu_plot_values(
        timestamp,
        temperature_gpu,
        ax=axs[3],
        t_now=t_now,
        plot_seconds=plot_seconds,
        plot_config=plot_configs["temperature_gpu"],
        plot_kws=plot_kws,
        **kwargs
    )
    if new_fig_created:
        dtstr = datetime.fromtimestamp(t_now).strftime("%H:%M:%S")
        fig_title = "{:s}\n{:s}".format(hostname, dtstr)
        fig.suptitle(fig_title, fontsize=12)
        fig.tight_layout(rect=[0, 0.03, 1, 0.92])  # fig.tight_layout()
        plt.show()
    else:
        fig.canvas.draw()
        fig.canvas.flush_events()
    print(t_now, end="\r")
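

## Illustrative sketch (assumed usage): render one snapshot of the four panels
## without running the monitor loop, e.g. to check utilization after a long cell.
def _example_draw_once():
    collect = nv_gpu_stat_query()
    if collect is not None:
        nv_gpu_stat_draw(values_dict=collect)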


def nv_gpu_stat_monitor(collect_file=None, interval=2.0, out=None, **kwargs):
    """
    usage:

    %matplotlib inline
    from nv_gpu_stat import nv_gpu_stat_monitor
    nv_gpu_stat_monitor()
    """
    from IPython.display import display, clear_output

    thr = threading.Thread(target=nv_gpu_stat_collect)
    thr.start()
    try:
        # plot_context = dict()  # does not work. why? ##
        plot_context = None
        while True:
            t_now = time()
            collect = nv_gpu_stat_query(time_up_to=t_now, collect_file=collect_file)
            if collect:
                # while collect_file does not exist yet, just wait for the next round
                nv_gpu_stat_draw(
                    t_now=t_now,
                    values_dict=collect,
                    clear_output=clear_output,
                    plot_context=plot_context,
                    **kwargs
                )
            t_next = t_now + interval
            t_sleep = t_next - time()
            if t_sleep > 0.0:
                sleep(t_sleep)
    finally:
        thr.join()


if __name__ == "__main__":
    get_ipython().run_line_magic("matplotlib", "inline")
    try:
        nv_gpu_stat_monitor()
    finally:
        get_ipython().system("rm -fvr .??*.ipynb .ipynb_checkpoints __pycache__")