Created
January 25, 2022 10:00
-
-
Save dniku/c901b9a459696d0969a2c78b51878a63 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
#######
# USAGE
#
# nvidia-htop.py [-l [length]] [-c]
# nvidia-smi | nvidia-htop.py --stdin [-l [length]] [-c]
#   print GPU utilization with usernames and CPU stats for each GPU-utilizing process
#
#   -l|--command-length [length]   Print longer part of the commandline. If `length'
#                                  is provided, use it as the commandline length,
#                                  otherwise print first 100 characters. Without -l,
#                                  the first 20 characters are printed.
#   -c|--color                     Colorize the output (green - free GPU, yellow -
#                                  moderately used GPU, red - fully used GPU)
#   --stdin                        Read the `nvidia-smi` output from standard input
#                                  instead of running `nvidia-smi`.
#
# It is also useful to run this under `watch`, i.e.:
#
#   watch -d -n 1 python3 nvidia-htop.py
######
import argparse | |
import os | |
import re | |
import subprocess | |
import sys | |
# termcolor is an optional dependency: when it cannot be imported, a stand-in with the
# same call shape is defined so the rest of the script can call `colored` regardless.
try:
    from termcolor import colored
except ImportError:
    def colored(x, _):
        """Return ``x`` unchanged; the second (colour) argument is ignored."""
        return x
# Colouring thresholds, expressed as ratios in [0, 1]. A GPU is shown as fully used
# (red) once its utilization or memory ratio reaches the *_MODERATE_RATIO value, as
# moderately used (yellow) once either reaches the *_FREE_RATIO value, and as free
# (green) otherwise.
MEMORY_FREE_RATIO = 0.05
MEMORY_MODERATE_RATIO = 0.9
GPU_FREE_RATIO = 0.05
GPU_MODERATE_RATIO = 0.75

parser = argparse.ArgumentParser(
    description="Print GPU utilization with usernames and CPU stats for each "
                "GPU-utilizing process.",
)
parser.add_argument(
    '-l', '--command-length', default=20, const=100, type=int, nargs='?',
    metavar='LENGTH',
    help="number of characters of each command line to print per row; "
         "100 when the option is given without a value (default: 20)",
)
parser.add_argument(
    '-c', '--color', action='store_true',
    help="colorize the output (green - free GPU, yellow - moderately used GPU, "
         "red - fully used GPU)",
)
parser.add_argument(
    '--stdin', action='store_true',
    help="read the nvidia-smi output from standard input instead of running nvidia-smi",
)
args = parser.parse_args()

# Short aliases for the parsed options used throughout the script.
command_length = args.command_length
color = args.color
# Obtain the nvidia-smi report as a list of lines (line endings kept).
# Precedence: a file named by the FAKE_STDIN_PATH environment variable (for testing),
# then standard input when --stdin was given, otherwise nvidia-smi is run directly.
fake_stdin_path = os.getenv("FAKE_STDIN_PATH", None)
if fake_stdin_path is None and not args.stdin:
    from io import StringIO
    raw_report = subprocess.check_output(['nvidia-smi'])
    nvidia_smi_output = StringIO(raw_report.decode('utf-8')).readlines()
elif fake_stdin_path is None:
    nvidia_smi_output = sys.stdin.readlines()
else:
    with open(fake_stdin_path, 'rt') as f:
        nvidia_smi_output = f.readlines()
def colorize(_lines):
    """Colour the per-GPU status rows of the nvidia-smi summary in place.

    A status row is a line carrying a fan reading (a percentage or ``N/A``), a
    temperature, a ``<used>MiB / <total>MiB`` memory pair and a utilization
    percentage. Each such row and the line directly above it (in nvidia-smi's
    layout, the first row of the same GPU entry) are passed through ``colored``:

    * red    - utilization or memory ratio reached its ``*_MODERATE_RATIO``;
    * yellow - otherwise, utilization or memory ratio reached its ``*_FREE_RATIO``;
    * green  - otherwise.

    Args:
        _lines: list of report lines; matching entries are replaced in place.

    Returns:
        The same list object that was passed in.
    """
    # The fan column is accepted as "N/A" or as a 2-3 character percentage, so GPUs
    # without a fan reading and fans running at 100% are recognised as well.
    status_row = re.compile(
        r"\|\s*(?:N/A|.{2,3}%)\s+[0-9]{2,3}C.*\s([0-9]+)MiB\s+\/\s+([0-9]+)MiB.*\s([0-9]+)%"
    )
    for i, line in enumerate(_lines):
        m = status_row.match(line)
        if m is None:
            continue
        used_mem = int(m.group(1))
        total_mem = int(m.group(2))
        gpu_util = int(m.group(3)) / 100.0
        # Guard against a reported total of 0 MiB instead of dividing by zero.
        mem_util = used_mem / float(total_mem) if total_mem else 0.0
        if gpu_util >= GPU_MODERATE_RATIO or mem_util >= MEMORY_MODERATE_RATIO:
            c = 'red'
        elif gpu_util >= GPU_FREE_RATIO or mem_util >= MEMORY_FREE_RATIO:
            c = 'yellow'
        else:
            c = 'green'
        _lines[i] = colored(line, c)
        # Without this guard a match on the very first line would colour the
        # *last* line through the negative index.
        if i > 0:
            _lines[i - 1] = colored(_lines[i - 1], c)
    return _lines
# Copy the utilization upper part verbatim, up to (not including) the header of the
# process table.
lines_to_print = []
for i, raw_line in enumerate(nvidia_smi_output):
    if raw_line.startswith("| Processes:"):
        # Step over the header rows of the process table. `i` is deliberately left
        # set: the process parsing further down continues from this index.
        i += 4
        break
    lines_to_print.append(raw_line.rstrip())

# We set the width of the block with processes manually: the border copied last is
# replaced by one sized for the chosen command column width.
assert lines_to_print[-1].startswith('+--')
del lines_to_print[-1]
processes_delimeter_line = '+{}+'.format('-' * (args.command_length + 59))
lines_to_print.append(processes_delimeter_line)

if color:
    lines_to_print = colorize(lines_to_print)

print('\n'.join(lines_to_print))
# Parse the PIDs from the lower part of the report into
#   pids[pid] = {'gpu_num': [...], 'gpu_mem': [...]}
# with one list entry per GPU the process appears on. `i` was left by the loop that
# copied the upper part at the first row after the process-table header.
pids = {}
for table_row in nvidia_smi_output[i:]:
    if table_row.startswith("+--"):
        # Closing border of the process table.
        break
    if "Not Supported" in table_row or "No running processes found" in table_row:
        continue
    fields = re.split(r'\s+', table_row)
    # A process row splits into at least '|', GPU, two more columns and the PID.
    # Anything else (blank line, unexpected message) is not a process and would
    # only make the ps query below fail.
    if len(fields) < 5 or not fields[4].isdigit():
        continue
    process = pids.setdefault(fields[4], {'gpu_num': [], 'gpu_mem': []})
    process['gpu_num'].append(fields[1])
    # Counting from the end keeps this correct for process names containing spaces.
    process['gpu_mem'].append(fields[-3])

# Query the PIDs using ps
ps_format = "pid,user,%cpu,%mem,etime,command"
ps_command = ["ps", "-ww", "-o", ps_format, "-p", ",".join(pids)]
ps_output = b''
if pids:  # ps rejects an empty PID list, so skip the call entirely
    try:
        ps_output = subprocess.check_output(ps_command)
    except subprocess.CalledProcessError as ps_error:
        # ps reported failure (for instance when it can no longer see the
        # processes). Keep whatever it printed; processes without ps data are
        # shown with '?' placeholders when the table is printed.
        ps_output = ps_error.output or b''

# Parse ps output. Undecodable bytes are replaced rather than aborting on a
# non-ASCII user name or command line.
for ps_row in ps_output.decode('utf-8', errors='replace').split("\n"):
    ps_row = ps_row.strip()
    if not ps_row or ps_row.startswith("PID"):
        continue
    parts = re.split(r'\s+', ps_row, 5)
    if len(parts) < 5 or parts[0] not in pids:
        continue
    process = pids[parts[0]]
    process['user'] = parts[1]
    process['cpu'] = parts[2]
    process['mem'] = parts[3]
    elapsed = parts[4]
    # An elapsed time containing '-' carries a leading day count; show only the days.
    process['time'] = (elapsed.split("-")[0] + " days") if "-" in elapsed else elapsed
    # The command column is missing from the split when ps printed it empty.
    process['command'] = parts[5] if len(parts) > 5 else ''


def _gpu_order(entry):
    """Sort key for a (gpu_num, gpu_mem) pair: numeric GPU index first, so 2 precedes 10."""
    gpu_num, gpu_mem = entry
    return (int(gpu_num) if gpu_num.isdigit() else -1, gpu_num, gpu_mem)


# Order every process's GPU entries by GPU index, keeping index and memory paired.
for process_info in pids.values():
    process_info['gpu_num'], process_info['gpu_mem'] = zip(
        *sorted(zip(process_info['gpu_num'], process_info['gpu_mem']), key=_gpu_order))
def parse_mem(mem):
    """Convert a memory figure such as ``"1234MiB"`` to the integer ``1234``.

    Args:
        mem: string made of an integer immediately followed by ``MiB``.

    Returns:
        The integer in front of the ``MiB`` suffix.

    Raises:
        ValueError: if ``mem`` does not end in ``MiB`` or the text before the
            suffix is not an integer.
    """
    suffix = 'MiB'
    # An explicit check instead of `assert`: asserts disappear under `python -O`,
    # which would let a value in another unit be read as MiB.
    if not mem.endswith(suffix):
        raise ValueError("expected a memory size ending in %r, got %r" % (suffix, mem))
    return int(mem[:-len(suffix)])
def sort_key(pid):
    """Sort key for a PID: largest total GPU memory first, ties broken by ascending PID."""
    total_gpu_mem = 0
    for mem in pids[pid]['gpu_mem']:
        total_gpu_mem += parse_mem(mem)
    return (-total_gpu_mem, int(pid))
# Row layout of the process table. The COMMAND column is left-aligned and both padded
# and truncated to `command_length` characters. (Named `row_format` so the builtin
# `format` is not shadowed.)
row_format = ("| %3s %7s %8s %8s %5s %5s %9s %-" + str(command_length) + "." + str(command_length) + "s |")

print(row_format % (
    "GPU", "PID", "USER", "GPU MEM", "%CPU", "%MEM", "TIME", "COMMAND"
))

indent = 4
# Characters of the command that fit on each indented continuation row.
wrap_width = command_length - indent

for pid in sorted(pids, key=sort_key):
    info = pids[pid]
    # Processes for which ps returned nothing are shown with '?' placeholders.
    command = info.get('command', '?')
    print(row_format % (
        info['gpu_num'][0],
        pid,
        info.get('user', '?'),
        info['gpu_mem'][0],
        info.get('cpu', '?'),
        info.get('mem', '?'),
        info.get('time', '?'),
        command[:command_length],
    ))

    # Continue an over-long command on indented rows. With no room left after the
    # indent there is nothing to wrap, and a zero step would make range() raise
    # ValueError.
    if wrap_width > 0:
        for start in range(command_length, len(command), wrap_width):
            command_part = command[start:start + wrap_width]
            print(row_format % ('', '', '', '', '', '', '', ' ' * indent + command_part))

    # One extra row per additional GPU used by the same process.
    for gpu_num, gpu_mem in zip(info['gpu_num'][1:], info['gpu_mem'][1:]):
        print(row_format % (gpu_num, '', '', gpu_mem, '', '', '', ''))

# Sanity check: the report is expected to end with a table border.
assert nvidia_smi_output[-1].startswith('+--')
print(processes_delimeter_line)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment