-
-
Save rhee-elten/1a1070e3a812ca863c3b937b5180b2f8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## nv_gpu_stat.py - monitor GPU usage in a Jupyter notebook -- [email protected]
## https://gist.github.com/rhee-elten/1a1070e3a812ca863c3b937b5180b2f8
# pylint: disable=invalid-name
# pylint: disable=using-constant-test
# pylint: disable=wrong-import-position
# pylint: disable=missing-class-docstring
# pylint: disable=missing-function-docstring
# Process-wide setup executed at import time, ahead of the main import list.
# The constant `if True:` wrapper is kept as in the original; the
# `using-constant-test` pylint waiver in the file header covers it.
if True:
    import os
    import warnings

    # Set before h5py is imported further down -- presumably so both values
    # are already in the environment when those libraries initialise.
    os.environ.update(
        CUDA_DEVICE_ORDER="PCI_BUS_ID",
        HDF5_USE_FILE_LOCKING="FALSE",
    )

    # NOTE: this silences every Python warning, not only the ones from numpy.
    warnings.filterwarnings("ignore")
import sys | |
import io | |
import os | |
from os.path import expanduser, dirname | |
from time import sleep, perf_counter, time | |
from socket import gethostname | |
from datetime import datetime | |
import threading | |
import subprocess | |
import numpy as np | |
import pandas as pd | |
import h5py | |
import matplotlib.pyplot as plt | |
def nv_gpu_stat():
    """Run ``nvidia-smi`` once and return its per-GPU report as a DataFrame.

    Three candidate executables are tried in order (the bare command name,
    a Windows install path and a WSL path); the first one that can be
    launched is used.

    Returns:
        pandas.DataFrame: the CSV printed by ``nvidia-smi`` parsed as-is.
        Column names are not stripped, so callers in this file address them
        with a leading space (e.g. ``" temperature.gpu"``).

    Raises:
        RuntimeError: if none of the candidates could be started, or if
            ``nvidia-smi`` exited with a non-zero status.

    Example output::

        print(nv_gpu_stat().to_string(index=False))
        index            name  temperature.gpu  utilization.gpu [%]  power.draw [W]  memory.used [MiB]  memory.total [MiB]
            0  Tesla M40 24GB               17                    0           17.46                  0               24478
            1  Tesla M40 24GB               20                    0           18.23                  0               24478
            2  Tesla M40 24GB               19                    0           18.53                  0               24478
            3  Tesla M40 24GB               20                    0           18.83                  0               24478
    """
    query_gpu = "--query-gpu=index,name,temperature.gpu,utilization.gpu,power.draw,memory.used,memory.total"
    query_format = "--format=csv,nounits"
    candidates = (
        "nvidia-smi",
        "C:\\Program Files\\NVIDIA Corporation\\NVSMI\\nvidia-smi.exe",
        "/usr/lib/wsl/lib/nvidia-smi",
    )
    for nvidia_smi in candidates:
        try:
            proc = subprocess.run(
                [nvidia_smi, query_gpu, query_format], capture_output=True
            )
            break
        except FileNotFoundError:
            continue  # keep trying with the next candidate name
    else:
        # The loop finished without a successful launch.
        raise RuntimeError("nvidia-smi not found")

    # Explicit check instead of `assert`, which disappears under `python -O`.
    if proc.returncode != 0:
        detail = (proc.stderr or proc.stdout).decode("ascii", errors="replace")
        raise RuntimeError(
            f"proc failure exitcode: {proc.returncode}: {detail.strip()}"
        )
    return pd.read_csv(io.StringIO(proc.stdout.decode("ascii")))
# Default HDF5 file used when a function below is called without an explicit
# `collect_file`.
NV_GPU_STAT_COLLECT_FILE = expanduser("~/.nv_gpu_stat/collect.h5")


def nv_gpu_stat_collect(
    collect_file=None, collect_interval=0.5, collect_length=2048, verbose=0
):
    """Sample the GPUs in an endless loop and store the samples in HDF5.

    The file holds a ``timestamp`` dataset plus one dataset per metric
    (``temperature_gpu``, ``utilization_gpu``, ``power_draw``,
    ``memory_used``).  Each iteration shifts every buffer up by one row,
    puts the new sample in the last row and writes the buffers back, so the
    newest sample is always at the end.

    Args:
        collect_file: path of the HDF5 file; ``None`` selects
            ``NV_GPU_STAT_COLLECT_FILE``.  Its directory is created if
            missing.
        collect_interval: target number of seconds between two samples.
        collect_length: number of rows given to datasets that have to be
            created.  Datasets already present in the file are reused
            unchanged.
        verbose: when truthy, print the timestamp of every sample.

    This function has no exit condition: it only stops when an exception
    (for example from ``nv_gpu_stat``) propagates out of the loop.
    """
    collect_file = collect_file or NV_GPU_STAT_COLLECT_FILE
    os.makedirs(dirname(collect_file), exist_ok=True)

    # HDF5 dataset name -> column of the nvidia-smi report (the leading space
    # is part of the column name as parsed by nv_gpu_stat()).
    columns = {
        "temperature_gpu": " temperature.gpu",
        "utilization_gpu": " utilization.gpu [%]",
        "power_draw": " power.draw [W]",
        "memory_used": " memory.used [MiB]",
    }

    with h5py.File(collect_file, "a") as f:
        # One probe call, only to learn the number of GPUs.
        num_gpus = nv_gpu_stat().values.shape[0]

        # Create whatever is missing, filled with NaN to mark "no sample yet".
        if "timestamp" not in f:
            f.create_dataset("timestamp", (collect_length,), np.float64)
            f["timestamp"][:] = np.nan
        for name in columns:
            if name not in f:
                f.create_dataset(name, (collect_length, num_gpus), np.float64)
                f[name][:] = np.nan

        # In-memory working copies of everything stored in the file.
        timestamp = f["timestamp"][:]
        buffers = {name: f[name][:] for name in columns}

        while True:
            t_now = time()
            if verbose:
                print(">>> nv_gpu_stat: time:", t_now)
            gpu_stat = nv_gpu_stat()

            # Readings that nvidia-smi cannot provide arrive as text (e.g.
            # "[N/A]"); store them as NaN instead of failing on the float
            # assignment below.
            new_rows = {
                name: pd.to_numeric(gpu_stat[column], errors="coerce").values
                for name, column in columns.items()
            }

            # Shift by one row and put the new measurement in the last row.
            timestamp = np.roll(timestamp, -1, axis=0)
            timestamp[-1] = t_now
            for name, row in new_rows.items():
                buffers[name] = np.roll(buffers[name], -1, axis=0)
                buffers[name][-1, :] = row

            # Write back to the HDF5 file.
            f["timestamp"][:] = timestamp
            for name, buf in buffers.items():
                f[name][:] = buf
            # The file is never closed while this loop runs, so ask HDF5 to
            # push the sample to disk explicitly.
            f.flush()

            t_sleep = t_now + collect_interval - time()
            if t_sleep > 0.0:
                sleep(t_sleep)
def nv_gpu_stat_query(time_up_to=None, collect_file=None):
    """Read the stored GPU samples back from the HDF5 file.

    Args:
        time_up_to: only samples whose timestamp is ``<= time_up_to`` are
            returned.  A falsy value (``None`` or ``0``) means "now".
        collect_file: path of the HDF5 file; ``None`` selects
            ``NV_GPU_STAT_COLLECT_FILE``.

    Returns:
        dict | None: arrays keyed ``timestamp``, ``temperature_gpu``,
        ``utilization_gpu``, ``power_draw`` and ``memory_used``, all
        filtered with the same row mask.  Rows whose timestamp is NaN never
        satisfy the comparison and are therefore left out.  ``None`` is
        returned when the file cannot be read; the error is printed rather
        than raised.
    """
    time_up_to = time_up_to or time()
    collect_file = collect_file or NV_GPU_STAT_COLLECT_FILE
    names = (
        "timestamp",
        "temperature_gpu",
        "utilization_gpu",
        "power_draw",
        "memory_used",
    )
    try:
        with h5py.File(collect_file, "r") as f:
            data = {name: f[name][:] for name in names}
    except Exception as exc:
        # Deliberately best effort (the file may not exist yet), but unlike
        # a bare `except:` this lets KeyboardInterrupt / SystemExit through.
        print("*** ", exc)
        return None

    in_range = data["timestamp"] <= time_up_to
    return {name: series[in_range] for name, series in data.items()}
def nv_gpu_plot_values(
    time_series,
    values,
    t_now=None,
    ax=None,
    plot_seconds=300.0,
    stat_seconds=60.0,
    plot_config=None,
    min_ymax=10.0,
    plot_kws=None,
):
    """Plot one metric for every GPU on a single axes.

    Each column of ``values`` becomes one line, drawn against time relative
    to ``t_now`` (so "now" is x = 0).  The legend entry of a line shows its
    last value and, when the statistics window contains samples, the
    NaN-ignoring mean and maximum over that window.  A dashed vertical line
    marks the start of the statistics window.

    Args:
        time_series: non-empty 1-D sequence of timestamps.
        values: array with one row per timestamp and one column per GPU.
            A 1-D array is treated as a single GPU.
        t_now: reference time; a falsy value selects the last timestamp.
        ax: axes to draw on; ``None`` selects ``plt.gca()``.
        plot_seconds: width of the x range, ``[-plot_seconds, 0]``.
        stat_seconds: length of the statistics window; shortened to the
            span of ``time_series`` when that span is smaller.
        plot_config: dict with any of ``title``, ``value_fmt`` (format
            string or callable returning a string) and ``ylim``; missing
            keys fall back to defaults.
        min_ymax: lower bound for the automatically chosen upper y value.
        plot_kws: extra keyword arguments for ``ax.plot``.
    """
    plot_kws = plot_kws or {}
    if ax is None:
        ax = plt.gca()
    time_series = np.asarray(time_series)
    t_now = t_now or time_series[-1]
    values = np.asarray(values)
    if values.ndim == 1:
        # A single series: make it a one-column table so the per-GPU loop
        # below sees exactly one line.
        values = values[:, np.newaxis]

    # Merge over the defaults so a partial config does not raise KeyError.
    config = dict(title="values", value_fmt="{:>5.1f}", ylim=None)
    config.update(plot_config or {})
    title = config["title"]
    value_fmt = config["value_fmt"]
    if not callable(value_fmt):
        value_fmt = value_fmt.format  # format string -> callable
    ylim = config["ylim"]

    # The statistics window cannot be longer than the data itself.
    data_span = time_series[-1] - time_series[0]
    if stat_seconds > data_span:
        stat_seconds = data_span
    stat_select = (t_now - stat_seconds <= time_series) & (time_series <= t_now)

    for gpu_index, ser in enumerate(np.transpose(values)):
        y_stat = ser[stat_select]
        val_last = ser[-1]  # current (most recent) value
        if len(y_stat) > 0:
            label = "G{:d} {:s}, avg={:s}, max={:s}".format(
                gpu_index,
                value_fmt(val_last),
                value_fmt(np.nanmean(y_stat)),
                value_fmt(np.nanmax(y_stat)),
            )
        else:
            label = "G{:d} {:s}".format(gpu_index, value_fmt(val_last))
        ax.plot(time_series - t_now, ser, label=label, **plot_kws)

    ax.set_title(title)
    ax.legend(loc="upper left", prop={"size": 8}, bbox_to_anchor=(1, 1))

    # Automatic y range: 10% of the maximum as margin below zero and above
    # the maximum.  The maximum ignores NaN samples; with plain np.amax a
    # single NaN would silently reduce the range to `min_ymax`.
    if np.any(~np.isnan(values)):
        max_win = max(min_ymax, np.nanmax(values))
    else:
        max_win = min_ymax
    ylim = ylim or [-max_win * 0.1, max_win * 1.1]
    ax.set_ylim(ylim)

    # Vertical marker spanning the final y range.
    dyn_y_lim = ax.get_ylim()
    ax.vlines(-stat_seconds, *dyn_y_lim, ls="dashed", lw=1.0, color="k", alpha=0.5)
    ax.set_xlim([-plot_seconds, 0])
def nv_gpu_stat_draw(
    t_now=None,
    collect_file=None,
    values_dict=None,
    plot_configs=None,
    plot_context=None,
    hostname=None,
    figsize=(5.4, 7.4),
    dpi=100,
    plot_seconds=300.0,
    clear_output=None,
    **kwargs
):
    """Draw the four-panel GPU dashboard for the last ``plot_seconds``.

    Panels from top to bottom: utilization, memory used, power draw and
    temperature.  Nothing is drawn (and ``None`` is returned) when no data
    is available or no sample falls inside the plotted time range.

    Args:
        t_now: right edge of the plotted range; falsy selects the current
            time.
        collect_file: passed to ``nv_gpu_stat_query`` when ``values_dict``
            is not given.
        values_dict: dict with ``timestamp``, ``temperature_gpu``,
            ``utilization_gpu``, ``power_draw`` and ``memory_used``; when
            ``None`` it is obtained from ``nv_gpu_stat_query``.
        plot_configs: per-metric config dicts keyed like ``values_dict``
            (without ``timestamp``), handed to ``nv_gpu_plot_values``.
        plot_context: dict that may carry ``fig`` and ``axs`` from an
            earlier call; if both are present they are cleared and reused,
            otherwise a new figure is created and stored in it.
        hostname: first line of the figure title; defaults to
            ``gethostname()``.
        figsize, dpi: passed to ``plt.subplots`` for a new figure.
        plot_seconds: length of the plotted time range in seconds.
        clear_output: optional callable, invoked as
            ``clear_output(wait=True)`` right before a new figure is made.
        **kwargs: forwarded to every ``nv_gpu_plot_values`` call.
    """
    t_now = t_now or time()
    hostname = hostname or gethostname()
    if values_dict is None:
        values_dict = nv_gpu_stat_query(collect_file=collect_file)
        if values_dict is None:
            # The query reports an unreadable file by returning None.
            return
    if plot_context is None:
        plot_context = {}
    if plot_configs is None:
        plot_configs = dict(
            temperature_gpu=dict(
                title="temperature.gpu", value_fmt="{:>3.0f}", ylim=None
            ),
            utilization_gpu=dict(
                title="utilization.gpu [%]", value_fmt="{:>3.0f}%", ylim=None
            ),
            power_draw=dict(title="power.draw [W]", value_fmt="{:>3.0f}", ylim=None),
            memory_used=dict(title="memory.used [GB]", value_fmt="{:>4.1f}", ylim=None),
        )

    # Metric shown on each of the four axes, top to bottom.
    panel_order = ("utilization_gpu", "memory_used", "power_draw", "temperature_gpu")

    timestamp = np.asarray(values_dict["timestamp"])
    series = {name: np.asarray(values_dict[name]) for name in panel_order}
    series["memory_used"] = series["memory_used"] / 1024  # MiB / 1024 (i.e. GiB)

    plot_select = (t_now - plot_seconds <= timestamp) & (timestamp <= t_now)
    if not np.any(plot_select):
        return
    timestamp = timestamp[plot_select]
    series = {name: data[plot_select, :] for name, data in series.items()}

    new_fig_created = not ("fig" in plot_context and "axs" in plot_context)
    if new_fig_created:
        if callable(clear_output):
            clear_output(wait=True)
        fig, axs = plt.subplots(4, 1, figsize=figsize, dpi=dpi)
        axs = axs.flatten()
        plot_context["fig"] = fig
        plot_context["axs"] = axs
    else:
        fig = plot_context["fig"]
        axs = plot_context["axs"]
        for ax in axs:
            ax.clear()

    plot_kws = dict(lw=1, alpha=0.55)
    for ax, name in zip(axs, panel_order):
        nv_gpu_plot_values(
            timestamp,
            series[name],
            ax=ax,
            t_now=t_now,
            plot_seconds=plot_seconds,
            plot_config=plot_configs[name],
            plot_kws=plot_kws,
            **kwargs
        )

    # The title carries the sample time, so refresh it on reused figures
    # too instead of leaving the time of the first draw in place.
    dtstr = datetime.fromtimestamp(t_now).strftime("%H:%M:%S")
    fig.suptitle("{:s}\n{:s}".format(hostname, dtstr), fontsize=12)

    if new_fig_created:
        fig.tight_layout(rect=[0, 0.03, 1, 0.92])
        # pyplot.show() takes no figure argument (only keyword `block`).
        plt.show()
    else:
        fig.canvas.draw()
        fig.canvas.flush_events()
    print(t_now, end="\r")
def nv_gpu_stat_monitor(collect_file=None, interval=2.0, out=None, **kwargs):
    """Collect GPU statistics in a background thread and redraw them forever.

    Usage::

        %matplotlib inline
        from nv_gpu_stat import nv_gpu_stat_monitor
        nv_gpu_stat_monitor()

    Args:
        collect_file: HDF5 file used by both the background collector and
            the plots; ``None`` selects the default of the called functions.
        interval: target number of seconds between two redraws.
        out: not used by this function; kept so existing calls keep working.
        **kwargs: forwarded to ``nv_gpu_stat_draw``.

    The redraw loop has no exit condition; it ends only through an
    exception such as ``KeyboardInterrupt``.
    """
    from IPython.display import clear_output

    # The collector must write the same file the loop below reads, so the
    # path is forwarded.  It is a daemon thread because the collector loop
    # never finishes: joining it would block forever, and a non-daemon
    # thread would keep the interpreter from exiting.
    collector = threading.Thread(
        target=nv_gpu_stat_collect,
        kwargs=dict(collect_file=collect_file),
        daemon=True,
    )
    collector.start()

    # A fresh figure is drawn on every pass.  The original author noted that
    # sharing one `plot_context` dict across passes did not work here.
    plot_context = None
    while True:
        t_now = time()
        collect = nv_gpu_stat_query(time_up_to=t_now, collect_file=collect_file)
        if collect:
            # Falsy means the collect file could not be read yet: just wait.
            nv_gpu_stat_draw(
                t_now=t_now,
                values_dict=collect,
                clear_output=clear_output,
                plot_context=plot_context,
                **kwargs
            )
        t_sleep = t_now + interval - time()
        if t_sleep > 0.0:
            sleep(t_sleep)
# Entry point when the file is executed directly.  `get_ipython` is not
# defined in this file -- presumably it is supplied by an IPython/Jupyter
# session, so this section is expected to run only there.
if __name__ == "__main__":
    shell = get_ipython()
    shell.run_line_magic("matplotlib", "inline")
    try:
        nv_gpu_stat_monitor()
    finally:
        # Shell cleanup in the current directory once monitoring ends.
        shell.system("rm -fvr .??*.ipynb .ipynb_checkpoints __pycache__")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment