Skip to content

Instantly share code, notes, and snippets.

@greyhoundforty
Created May 7, 2025 20:21
Show Gist options
  • Save greyhoundforty/0cead3155cc441fb726f83355528a656 to your computer and use it in GitHub Desktop.
Save greyhoundforty/0cead3155cc441fb726f83355528a656 to your computer and use it in GitHub Desktop.
RVTools Python Parser
import os
import pandas as pd
import glob
from typing import Dict, List, Optional, Union, Tuple
import statistics
import click
class RVToolsAnalyzer:
def __init__(self, directory_path: str = None):
"""
Initialize the RVTools analyzer with a directory containing RVTools CSV exports.
Args:
directory_path: Path to directory containing RVTools CSV files
"""
self.directory_path = directory_path
self.dataframes = {}
self.file_mapping = {
'vinfo': None,
'vhost': None,
'vcluster': None,
'vcpu': None,
'vmemory': None,
'vnetwork': None,
'vdisk': None
}
if directory_path:
self.load_csv_files()
def load_csv_files(self) -> None:
    """
    Locate and load the RVTools CSV exports from self.directory_path.

    Populates self.file_mapping with the paths that were found and
    self.dataframes with one DataFrame per successfully parsed file.
    Column names are normalized (spaces -> underscores, parentheses
    stripped, '%' -> 'pct', '#' -> 'num') so later keyword lookups work.

    Raises:
        ValueError: If the directory was never set or does not exist.
    """
    if not self.directory_path or not os.path.exists(self.directory_path):
        raise ValueError(f"Directory path {self.directory_path} does not exist")
    # Exact filenames RVTools uses for each export tab.
    exact_file_mapping = {
        'vinfo': 'RVTools_tabvInfo.csv',
        'vhost': 'RVTools_tabvHost.csv',
        'vcluster': 'RVTools_tabvCluster.csv',
        'vcpu': 'RVTools_tabvCPU.csv',
        'vmemory': 'RVTools_tabvMemory.csv',
        'vnetwork': 'RVTools_tabvNetwork.csv',
        'vdisk': 'RVTools_tabvDisk.csv'
    }
    # Find the exact filenames in the directory.
    for key, filename in exact_file_mapping.items():
        filepath = os.path.join(self.directory_path, filename)
        if os.path.exists(filepath):
            self.file_mapping[key] = filepath
            # FIX: these two messages previously printed a literal
            # placeholder instead of the filename.
            print(f"Found {key} file: {filename}")
        else:
            print(f"Warning: {filename} not found in {self.directory_path}")
    # Load each discovered file into a pandas DataFrame.
    for key, file_path in self.file_mapping.items():
        if file_path and os.path.exists(file_path):
            try:
                df = pd.read_csv(file_path, encoding='utf-8', low_memory=False)
                # Normalize headers so downstream keyword matching is uniform.
                df.columns = [
                    c.strip().replace(' ', '_').replace('(', '').replace(')', '')
                     .replace('%', 'pct').replace('#', 'num')
                    for c in df.columns
                ]
                self.dataframes[key] = df
                print(f"Loaded {key} data from {os.path.basename(file_path)} with {len(df)} rows and {len(df.columns)} columns")
                print(f"Sample columns: {', '.join(df.columns[:5])}")
            except Exception as e:
                # Best-effort: report the failure and keep loading the rest.
                print(f"Error loading {file_path}: {e}")
        else:
            print(f"File for {key} not found or not valid")
def get_cluster_utilization(self) -> pd.DataFrame:
    """
    Build a per-cluster utilization view from the vCluster export.

    Returns:
        DataFrame holding the cluster-name column plus every CPU and
        memory percentage column; empty DataFrame when no suitable name
        column is present.

    Raises:
        ValueError: If the vCluster data was never loaded.
    """
    if 'vcluster' not in self.dataframes:
        raise ValueError("vCluster data not found in loaded files")
    df = self.dataframes['vcluster']
    # Header names vary across RVTools versions, so match by keyword.
    name_candidates = [c for c in df.columns if 'cluster' in c.lower() or 'name' in c.lower()]
    if not name_candidates:
        return pd.DataFrame()
    summary = df[[name_candidates[0]]].copy()
    # Only percentage-style columns are treated as utilization metrics;
    # CPU columns first, then memory, preserving the original layout.
    cpu_pct = [c for c in df.columns if 'cpu' in c.lower() and 'pct' in c.lower()]
    mem_pct = [c for c in df.columns if ('mem' in c.lower() or 'ram' in c.lower()) and 'pct' in c.lower()]
    for column in cpu_pct + mem_pct:
        summary[column] = df[column]
    return summary
def get_host_utilization(self) -> pd.DataFrame:
    """
    Build a per-host utilization view from the vHost export.

    Returns:
        DataFrame with the host-name column, an optional cluster column,
        and any CPU / memory usage columns; empty DataFrame when no host
        column can be identified.

    Raises:
        ValueError: If the vHost data was never loaded.
    """
    if 'vhost' not in self.dataframes:
        raise ValueError("vHost data not found in loaded files")
    df = self.dataframes['vhost']

    def usage_like(lowered):
        # RVTools labels utilization columns inconsistently across versions.
        return 'usage' in lowered or 'util' in lowered or 'pct' in lowered

    host_candidates = [c for c in df.columns if any(k in c.lower() for k in ('host', 'name', 'esx'))]
    if not host_candidates:
        return pd.DataFrame()
    selected = [host_candidates[0]]
    cluster_candidates = [c for c in df.columns if 'cluster' in c.lower()]
    if cluster_candidates:
        selected.append(cluster_candidates[0])
    selected += [c for c in df.columns if 'cpu' in c.lower() and usage_like(c.lower())]
    selected += [c for c in df.columns if ('mem' in c.lower() or 'ram' in c.lower()) and usage_like(c.lower())]
    # Every name above came straight from df.columns, so no existence check needed.
    return df[selected].copy()
def get_vm_cpu_utilization(self) -> pd.DataFrame:
    """
    Extract VM-level CPU columns from the vInfo export.

    Returns:
        DataFrame with the VM name, cluster/host context, and CPU
        columns; empty DataFrame when vInfo is missing or no VM column
        can be identified.
    """
    if 'vinfo' not in self.dataframes:
        print("vInfo data not found in loaded files")
        return pd.DataFrame()
    df = self.dataframes['vinfo']
    # Prefer the canonical 'VM' header; otherwise fall back to fuzzy matching.
    if 'VM' in df.columns:
        vm_candidates = ['VM']
    else:
        vm_candidates = [c for c in df.columns if 'vm' in c.lower() and 'name' not in c.lower()]
        if not vm_candidates:
            vm_candidates = [c for c in df.columns if 'name' in c.lower() and 'dns' not in c.lower()]
    if not vm_candidates:
        return pd.DataFrame()
    # Canonical CPU headers when present, otherwise a fuzzy scan.
    if 'CPUs' in df.columns:
        cpu_candidates = ['CPUs', 'Overall_Cpu_Readiness']
    else:
        cpu_candidates = [c for c in df.columns if 'cpu' in c.lower() and 'num' not in c.lower()]
    cluster_candidates = ['Cluster'] if 'Cluster' in df.columns else [c for c in df.columns if 'cluster' in c.lower()]
    host_candidates = ['Host'] if 'Host' in df.columns else [c for c in df.columns if 'host' in c.lower()]
    chosen = [vm_candidates[0]]
    chosen.extend(cluster_candidates)
    chosen.extend(host_candidates)
    chosen.extend(c for c in cpu_candidates if c in df.columns)
    # Guard against canonical names that are not actually present.
    chosen = [c for c in chosen if c in df.columns]
    if not chosen:
        return pd.DataFrame()
    print(f"Using columns for VM CPU utilization: {', '.join(chosen)}")
    return df[chosen].copy()
def get_vm_memory_utilization(self) -> pd.DataFrame:
    """
    Extract VM-level memory columns from the vInfo export.

    NOTE(review): a later definition in this file re-declares this method
    name (vMemory-based); at runtime that later definition wins.

    Returns:
        DataFrame with the VM name, cluster/host context, and memory
        columns; empty DataFrame when vInfo is missing or no VM column
        can be identified.
    """
    if 'vinfo' not in self.dataframes:
        print("vInfo data not found in loaded files")
        return pd.DataFrame()
    df = self.dataframes['vinfo']
    # Prefer the canonical 'VM' header; otherwise fall back to fuzzy matching.
    if 'VM' in df.columns:
        vm_candidates = ['VM']
    else:
        vm_candidates = [c for c in df.columns if 'vm' in c.lower() and 'name' not in c.lower()]
        if not vm_candidates:
            vm_candidates = [c for c in df.columns if 'name' in c.lower() and 'dns' not in c.lower()]
    if not vm_candidates:
        return pd.DataFrame()
    # Canonical memory headers when present, otherwise a fuzzy scan.
    if 'Memory' in df.columns:
        mem_candidates = ['Memory', 'Active_Memory']
    else:
        mem_candidates = [c for c in df.columns if 'mem' in c.lower() or 'ram' in c.lower()]
    cluster_candidates = ['Cluster'] if 'Cluster' in df.columns else [c for c in df.columns if 'cluster' in c.lower()]
    host_candidates = ['Host'] if 'Host' in df.columns else [c for c in df.columns if 'host' in c.lower()]
    chosen = [vm_candidates[0]]
    chosen.extend(cluster_candidates)
    chosen.extend(host_candidates)
    chosen.extend(c for c in mem_candidates if c in df.columns)
    # Guard against canonical names that are not actually present.
    chosen = [c for c in chosen if c in df.columns]
    if not chosen:
        return pd.DataFrame()
    print(f"Using columns for VM memory utilization: {', '.join(chosen)}")
    return df[chosen].copy()
def print_aggregated_vm_report(self, top_n: int = 10, by: str = 'cpu') -> None:
"""
Print a formatted VM utilization report, showing the top N VMs by utilization.

NOTE(review): a second def with this exact name appears later in the file;
at runtime the later definition replaces this one, so this body is dead
code unless the duplicate is removed.

Args:
top_n: Number of top VMs to show per cluster/host grouping
by: Metric to sort by ('cpu' or 'memory')
"""
# If we have vinfo but not vcpu or vmemory, use vinfo for everything
if 'vinfo' in self.dataframes and ('vcpu' not in self.dataframes or 'vmemory' not in self.dataframes):
print("Using vInfo data for VM utilization metrics")
vm_cpu_df = self.get_vm_cpu_utilization()
vm_memory_df = self.get_vm_memory_utilization()
else:
# Otherwise use the standard methods
vm_cpu_df = self.get_vm_cpu_utilization() if 'vcpu' in self.dataframes else None
vm_memory_df = self.get_vm_memory_utilization() if 'vmemory' in self.dataframes else None
# NOTE(review): this dict is built but never read afterwards - dead local.
report = {'vms_by_cluster': {}, 'vms_by_host': {}}
# Process VM CPU data
if vm_cpu_df is not None and not vm_cpu_df.empty:
# Try to find cluster and host columns
cluster_cols = [col for col in vm_cpu_df.columns if 'cluster' in col.lower()]
host_cols = [col for col in vm_cpu_df.columns if 'host' in col.lower()]
vm_col = vm_cpu_df.columns[0] # Assume first column is VM name
# Identify CPU utilization metric columns (anything CPU-ish except the name column)
cpu_cols = [col for col in vm_cpu_df.columns if 'cpu' in col.lower() and col != vm_col]
# Group by cluster if available
if cluster_cols and vm_col and cpu_cols:
cluster_col = cluster_cols[0]
print(f"Grouping VMs by cluster using column: {cluster_col}")
for cluster in vm_cpu_df[cluster_col].dropna().unique():
cluster_vms = vm_cpu_df[vm_cpu_df[cluster_col] == cluster]
# Create a list of (vm_name, cpu_value) tuples for sorting
vm_cpu_values = []
for _, row in cluster_vms.iterrows():
vm_name = row[vm_col]
# Use first CPU column as the metric for sorting; NaN counts as 0
cpu_value = row[cpu_cols[0]] if pd.notna(row[cpu_cols[0]]) else 0
vm_cpu_values.append((vm_name, cpu_value))
# Sort by CPU value (descending) and take top N
vm_cpu_values.sort(key=lambda x: x[1], reverse=True)
top_vms = vm_cpu_values[:top_n]
# Only print the CPU view when the caller asked for it
if by.lower() == 'cpu' and top_vms:
print(f"\nCluster: {cluster}")
print(f" Top {len(top_vms)} VMs by CPU:")
for vm_name, cpu_value in top_vms:
print(f" {vm_name}: {cpu_value} CPUs")
# Group by host if available
if host_cols and vm_col and cpu_cols:
host_col = host_cols[0]
print(f"Grouping VMs by host using column: {host_col}")
for host in vm_cpu_df[host_col].dropna().unique():
host_vms = vm_cpu_df[vm_cpu_df[host_col] == host]
# Create a list of (vm_name, cpu_value) tuples for sorting
vm_cpu_values = []
for _, row in host_vms.iterrows():
vm_name = row[vm_col]
# Use first CPU column as the metric for sorting; NaN counts as 0
cpu_value = row[cpu_cols[0]] if pd.notna(row[cpu_cols[0]]) else 0
vm_cpu_values.append((vm_name, cpu_value))
# Sort by CPU value (descending) and take top N
vm_cpu_values.sort(key=lambda x: x[1], reverse=True)
top_vms = vm_cpu_values[:top_n]
if by.lower() == 'cpu' and top_vms:
print(f"\nHost: {host}")
print(f" Top {len(top_vms)} VMs by CPU:")
for vm_name, cpu_value in top_vms:
print(f" {vm_name}: {cpu_value} CPUs")
# Process VM Memory data (only when the caller asked for the memory view)
if vm_memory_df is not None and not vm_memory_df.empty and by.lower() == 'memory':
# Try to find cluster and host columns
cluster_cols = [col for col in vm_memory_df.columns if 'cluster' in col.lower()]
host_cols = [col for col in vm_memory_df.columns if 'host' in col.lower()]
vm_col = vm_memory_df.columns[0] # Assume first column is VM name
# Identify memory utilization metric columns
mem_cols = [col for col in vm_memory_df.columns if ('mem' in col.lower() or 'ram' in col.lower()) and col != vm_col]
# Group by cluster if available
if cluster_cols and vm_col and mem_cols:
cluster_col = cluster_cols[0]
print(f"Grouping VMs by cluster using column: {cluster_col}")
for cluster in vm_memory_df[cluster_col].dropna().unique():
cluster_vms = vm_memory_df[vm_memory_df[cluster_col] == cluster]
# Create a list of (vm_name, mem_value) tuples for sorting
vm_mem_values = []
for _, row in cluster_vms.iterrows():
vm_name = row[vm_col]
# Use first memory column as the metric for sorting; NaN counts as 0
mem_value = row[mem_cols[0]] if pd.notna(row[mem_cols[0]]) else 0
vm_mem_values.append((vm_name, mem_value))
# Sort by memory value (descending) and take top N
vm_mem_values.sort(key=lambda x: x[1], reverse=True)
top_vms = vm_mem_values[:top_n]
if top_vms:
print(f"\nCluster: {cluster}")
print(f" Top {len(top_vms)} VMs by Memory:")
for vm_name, mem_value in top_vms:
print(f" {vm_name}: {mem_value} MB")
# Group by host if available
if host_cols and vm_col and mem_cols:
host_col = host_cols[0]
print(f"Grouping VMs by host using column: {host_col}")
for host in vm_memory_df[host_col].dropna().unique():
host_vms = vm_memory_df[vm_memory_df[host_col] == host]
# Create a list of (vm_name, mem_value) tuples for sorting
vm_mem_values = []
for _, row in host_vms.iterrows():
vm_name = row[vm_col]
# Use first memory column as the metric for sorting; NaN counts as 0
mem_value = row[mem_cols[0]] if pd.notna(row[mem_cols[0]]) else 0
vm_mem_values.append((vm_name, mem_value))
# Sort by memory value (descending) and take top N
vm_mem_values.sort(key=lambda x: x[1], reverse=True)
top_vms = vm_mem_values[:top_n]
if top_vms:
print(f"\nHost: {host}")
print(f" Top {len(top_vms)} VMs by Memory:")
for vm_name, mem_value in top_vms:
print(f" {vm_name}: {mem_value} MB")
# Nothing at all to report
if (vm_cpu_df is None or vm_cpu_df.empty) and (vm_memory_df is None or vm_memory_df.empty):
print("No VM data available.")
def get_vm_memory_utilization(self) -> pd.DataFrame:
    """
    Build a per-VM memory view from the vMemory export.

    NOTE(review): this overrides the earlier vInfo-based method of the
    same name defined above; at runtime this version wins.

    Returns:
        DataFrame keyed by VM name with memory usage/size columns plus
        any cluster/host context columns; empty DataFrame when no
        VM-name column exists.

    Raises:
        ValueError: If the vMemory data was never loaded.
    """
    if 'vmemory' not in self.dataframes:
        raise ValueError("vMemory data not found in loaded files")
    df = self.dataframes['vmemory']
    # VM-name column: prefer an explicit vm+name header, then any name.
    name_candidates = [c for c in df.columns if 'vm' in c.lower() and 'name' in c.lower()]
    if not name_candidates:
        name_candidates = [c for c in df.columns if 'name' in c.lower()]
    if not name_candidates:
        return pd.DataFrame()
    mem_like = [c for c in df.columns if 'mem' in c.lower() or 'ram' in c.lower()]
    util_like = [c for c in mem_like if any(k in c.lower() for k in ('usage', 'util', 'pct'))]
    size_like = [c for c in mem_like if any(k in c.lower() for k in ('size', 'total', 'capacity'))]
    view = df[[name_candidates[0]]].copy()
    # Utilization columns first, then capacity columns.
    for column in util_like + size_like:
        view[column] = df[column]
    # Carry along cluster/host identity so callers can group the rows.
    context = [c for c in df.columns if 'cluster' in c.lower()]
    context += [c for c in df.columns if 'host' in c.lower() or 'esx' in c.lower()]
    for column in context:
        view[column] = df[column]
    return view
def get_network_utilization(self) -> pd.DataFrame:
    """
    Build a per-VM network inventory view from the vNetwork export.

    Returns:
        DataFrame with the VM name, all network-related columns
        (NIC/adapter/MAC/connection state), and any cluster/host context
        columns; empty DataFrame when no VM-name column exists.

    Raises:
        ValueError: If the vNetwork data was never loaded.
    """
    if 'vnetwork' not in self.dataframes:
        raise ValueError("vNetwork data not found in loaded files")
    df = self.dataframes['vnetwork']
    # VM-name column: prefer an explicit vm+name header, then any name.
    name_candidates = [c for c in df.columns if 'vm' in c.lower() and 'name' in c.lower()]
    if not name_candidates:
        name_candidates = [c for c in df.columns if 'name' in c.lower()]
    if not name_candidates:
        return pd.DataFrame()
    view = df[[name_candidates[0]]].copy()
    # Anything whose header hints at networking gets carried over verbatim.
    keywords = ('network', 'nic', 'net', 'adapter', 'connected', 'mac')
    for column in df.columns:
        if any(k in column.lower() for k in keywords):
            view[column] = df[column]
    # Cluster/host identity lets callers group the rows later.
    context = [c for c in df.columns if 'cluster' in c.lower()]
    context += [c for c in df.columns if 'host' in c.lower() or 'esx' in c.lower()]
    for column in context:
        view[column] = df[column]
    return view
def get_aggregated_cluster_report(self) -> Dict:
"""
Generate an aggregated report of cluster-level utilization.

Returns:
Dict shaped as {'clusters': {name: {'cpu': {...}, 'memory': {...},
'hosts': {host: {'cpu': {...}, 'memory': {...}}}}}}; empty dict when
neither vCluster nor vHost data is loaded.
"""
report = {}
# Get relevant dataframes; the getters raise when their tab is missing,
# hence the membership guards.
cluster_df = self.get_cluster_utilization() if 'vcluster' in self.dataframes else None
host_df = self.get_host_utilization() if 'vhost' in self.dataframes else None
if cluster_df is not None and not cluster_df.empty:
# Get cluster names
cluster_col = cluster_df.columns[0] # Assuming first column is cluster name
clusters = cluster_df[cluster_col].unique()
report['clusters'] = {}
# Process each cluster
for cluster in clusters:
cluster_data = cluster_df[cluster_df[cluster_col] == cluster]
# CPU metrics: percentage columns only; first matching row wins
cpu_cols = [col for col in cluster_data.columns if 'cpu' in col.lower() and 'pct' in col.lower()]
cpu_metrics = {}
for col in cpu_cols:
if not cluster_data[col].empty:
cpu_metrics[col] = cluster_data[col].iloc[0]
# Memory metrics: same percentage-only rule
mem_cols = [col for col in cluster_data.columns if ('mem' in col.lower() or 'ram' in col.lower()) and 'pct' in col.lower()]
mem_metrics = {}
for col in mem_cols:
if not cluster_data[col].empty:
mem_metrics[col] = cluster_data[col].iloc[0]
report['clusters'][cluster] = {
'cpu': cpu_metrics,
'memory': mem_metrics,
'hosts': {} # Initialize the hosts dictionary for each cluster
}
# Add host-level aggregated data if available
if host_df is not None and not host_df.empty:
# Identify cluster column if it exists
cluster_cols = [col for col in host_df.columns if 'cluster' in col.lower()]
if cluster_cols:
cluster_col = cluster_cols[0]
# Group hosts by cluster
clusters = host_df[cluster_col].unique()
if 'clusters' not in report:
report['clusters'] = {}
for cluster in clusters:
if pd.notna(cluster): # Skip NaN values
cluster_hosts = host_df[host_df[cluster_col] == cluster]
# CPU metrics
cpu_cols = [col for col in cluster_hosts.columns if 'cpu' in col.lower() and ('usage' in col.lower() or 'util' in col.lower() or 'pct' in col.lower())]
# Memory metrics
mem_cols = [col for col in cluster_hosts.columns if ('mem' in col.lower() or 'ram' in col.lower()) and ('usage' in col.lower() or 'util' in col.lower() or 'pct' in col.lower())]
# Clusters seen only in vHost (absent from vCluster) get a fresh entry
if cluster not in report['clusters']:
report['clusters'][cluster] = {
'cpu': {},
'memory': {},
'hosts': {} # Initialize the hosts dictionary if this is a new cluster
}
elif 'hosts' not in report['clusters'][cluster]:
# Add the hosts key if it doesn't exist
report['clusters'][cluster]['hosts'] = {}
# Add host-level details
host_col = [col for col in cluster_hosts.columns if 'host' in col.lower() or 'name' in col.lower() or 'esx' in col.lower()][0]
for _, host_row in cluster_hosts.iterrows():
host_name = host_row[host_col]
host_cpu = {}
for col in cpu_cols:
if pd.notna(host_row[col]):
host_cpu[col] = host_row[col]
host_mem = {}
for col in mem_cols:
if pd.notna(host_row[col]):
host_mem[col] = host_row[col]
report['clusters'][cluster]['hosts'][host_name] = {
'cpu': host_cpu,
'memory': host_mem
}
return report
def get_aggregated_vm_report(self) -> Dict:
    """
    Generate an aggregated report of VM-level utilization.

    Merges the per-VM CPU, memory, and network views into one nested
    structure so each VM entry carries every available metric. The
    original implementation repeated the same merge logic six times
    (3 metrics x cluster/host) with subtly inconsistent NaN / empty-name
    handling; this version uses one helper with uniform skip rules.

    Returns:
        Dict with keys 'vms_by_cluster' and 'vms_by_host'; each maps
        group name -> VM name -> {'cpu': {...}, 'memory': {...},
        'network': {...}} with metric-column -> value entries.
    """
    report = {'vms_by_cluster': {}, 'vms_by_host': {}}

    def _usage_like(name: str) -> bool:
        # RVTools labels utilization columns inconsistently across versions.
        low = name.lower()
        return 'usage' in low or 'util' in low or 'pct' in low

    def _merge(dest: Dict, df: pd.DataFrame, group_col: str, vm_col: str,
               metric: str, metric_cols: List[str]) -> None:
        # Fold each row's metric values into dest[group][vm][metric].
        # NaN groups and empty/NaN VM names are skipped uniformly.
        for _, row in df.iterrows():
            group = row[group_col]
            vm_name = row[vm_col]
            if pd.isna(group) or pd.isna(vm_name) or not vm_name:
                continue
            vm_entry = dest.setdefault(group, {}).setdefault(
                vm_name, {'cpu': {}, 'memory': {}, 'network': {}})
            for col in metric_cols:
                if pd.notna(row[col]):
                    vm_entry[metric][col] = row[col]

    # (metric key, source DataFrame or None, predicate picking its columns)
    sources = [
        ('cpu',
         self.get_vm_cpu_utilization() if 'vcpu' in self.dataframes else None,
         lambda c: 'cpu' in c.lower() and _usage_like(c)),
        ('memory',
         self.get_vm_memory_utilization() if 'vmemory' in self.dataframes else None,
         lambda c: ('mem' in c.lower() or 'ram' in c.lower()) and (_usage_like(c) or 'size' in c.lower())),
        ('network',
         self.get_network_utilization() if 'vnetwork' in self.dataframes else None,
         lambda c: 'network' in c.lower() or 'nic' in c.lower() or 'adapters' in c.lower()),
    ]
    for metric, df, wanted in sources:
        if df is None or df.empty:
            continue
        # VM-name column: prefer an explicit vm+name header, then any name.
        vm_cols = [c for c in df.columns if 'vm' in c.lower() and 'name' in c.lower()]
        if not vm_cols:
            vm_cols = [c for c in df.columns if 'name' in c.lower()]
        if not vm_cols:
            continue
        vm_col = vm_cols[0]
        metric_cols = [c for c in df.columns if wanted(c)]
        cluster_cols = [c for c in df.columns if 'cluster' in c.lower()]
        host_cols = [c for c in df.columns if 'host' in c.lower() or 'esx' in c.lower()]
        if cluster_cols:
            _merge(report['vms_by_cluster'], df, cluster_cols[0], vm_col, metric, metric_cols)
        if host_cols:
            _merge(report['vms_by_host'], df, host_cols[0], vm_col, metric, metric_cols)
    return report
def print_aggregated_cluster_report(self) -> None:
    """Print a formatted cluster utilization report to the console."""
    report = self.get_aggregated_cluster_report()
    clusters = report.get('clusters') if report else None
    if not clusters:
        print("No cluster data available.")
        return
    print("\n===== CLUSTER UTILIZATION REPORT =====\n")
    for cluster_name, cluster_data in clusters.items():
        print(f"Cluster: {cluster_name}")
        # Cluster-level CPU metrics
        cpu_metrics = cluster_data.get('cpu')
        if cpu_metrics:
            print(" CPU Utilization:")
            for metric, value in cpu_metrics.items():
                print(f" {metric}: {value}")
        # Cluster-level memory metrics
        mem_metrics = cluster_data.get('memory')
        if mem_metrics:
            print(" Memory Utilization:")
            for metric, value in mem_metrics.items():
                print(f" {metric}: {value}")
        # Per-host breakdown, when present
        host_entries = cluster_data.get('hosts')
        if host_entries:
            print(" Hosts:")
            for host_name, host_data in host_entries.items():
                print(f" Host: {host_name}")
                if host_data.get('cpu'):
                    print(" CPU Utilization:")
                    for metric, value in host_data['cpu'].items():
                        print(f" {metric}: {value}")
                if host_data.get('memory'):
                    print(" Memory Utilization:")
                    for metric, value in host_data['memory'].items():
                        print(f" {metric}: {value}")
        print()  # blank line between clusters
def print_aggregated_vm_report(self, top_n: int = 10, by: str = 'cpu') -> None:
    """
    Print a formatted VM utilization report straight from the vInfo
    export, showing the top N VMs per cluster and per host.

    Args:
        top_n: Number of top VMs to show in each grouping.
        by: Metric to sort by; 'cpu' uses the 'CPUs' column, anything
            else uses the 'Memory' column.
    """
    if 'vinfo' not in self.dataframes:
        print("No VM data available - vInfo data missing.")
        return
    df = self.dataframes['vinfo']
    # Debug output
    print(f"Found {len(df)} VMs in vInfo data")
    vm_col, cluster_col, host_col = 'VM', 'Cluster', 'Host'
    missing_cols = [col for col in (vm_col, cluster_col, host_col) if col not in df.columns]
    if missing_cols:
        print(f"Missing required columns: {', '.join(missing_cols)}")
        return
    # Pick the metric column and its display label.
    if by.lower() == 'cpu':
        metric_col, metric_label = 'CPUs', 'CPU Count'
    else:  # memory
        metric_col, metric_label = 'Memory', 'Memory (MB)'
    if metric_col not in df.columns:
        print(f"Metric column '{metric_col}' not found in vInfo data")
        return
    # FIX: coerce to numeric on a local Series; the original assigned the
    # coerced column back into self.dataframes['vinfo'], silently mutating
    # shared state as a side effect of printing a report.
    try:
        metric_values = pd.to_numeric(df[metric_col], errors='coerce')
    except Exception as e:
        print(f"Error converting {metric_col} to numeric: {e}")
        return

    def _print_top(group_col: str, prefix: str) -> None:
        # Print the top-N VMs for every distinct value of group_col.
        for group in df[group_col].dropna().unique():
            mask = df[group_col] == group
            pairs = []
            for vm_name, value in zip(df.loc[mask, vm_col], metric_values[mask]):
                # Treat missing metrics as 0 so such VMs sort last.
                pairs.append((vm_name, value if pd.notna(value) else 0))
            if not pairs:
                print(f"\n{prefix}: {group} - No VM data available")
                continue
            pairs.sort(key=lambda p: p[1], reverse=True)
            top_vms = pairs[:top_n]
            print(f"\n{prefix}: {group}")
            print(f" Top {len(top_vms)} VMs by {by.upper()}:")
            for vm_name, value in top_vms:
                print(f" {vm_name}: {value} {metric_label}")

    print(f"\n===== TOP {top_n} VMs BY {by.upper()} =====\n")
    print("BY CLUSTER:")
    _print_top(cluster_col, "Cluster")
    print("\nBY HOST:")
    _print_top(host_col, "Host")
def get_overall_utilization_summary(self) -> Dict:
    """
    Create a summary of overall utilization across the environment.

    CPU and memory figures are taken from host-level data when the vHost
    export was loaded; otherwise they are aggregated up from per-VM data.
    Network figures count NICs and how many report a connected state.

    Returns:
        Dictionary with 'cpu', 'memory' and 'network' sections.  The cpu
        and memory sections hold 'overall_avg', 'by_cluster' and 'by_host';
        the network section holds 'overall_stats', 'by_cluster' and
        'by_host' NIC-count dicts.
    """
    summary = {
        'cpu': {'overall_avg': None, 'by_cluster': {}, 'by_host': {}},
        'memory': {'overall_avg': None, 'by_cluster': {}, 'by_host': {}},
        'network': {'overall_stats': {}, 'by_cluster': {}, 'by_host': {}},
    }

    # Pull in whichever source dataframes were loaded from the CSV exports.
    host_df = self.get_host_utilization() if 'vhost' in self.dataframes else None
    vm_cpu_df = self.get_vm_cpu_utilization() if 'vcpu' in self.dataframes else None
    vm_memory_df = self.get_vm_memory_utilization() if 'vmemory' in self.dataframes else None
    vm_network_df = self.get_network_utilization() if 'vnetwork' in self.dataframes else None

    def util_cols(df, keywords):
        # Fuzzy-match utilization columns: the name must mention the
        # resource (any keyword) and a usage/util/pct qualifier.
        return [c for c in df.columns
                if any(k in c.lower() for k in keywords)
                and ('usage' in c.lower() or 'util' in c.lower() or 'pct' in c.lower())]

    def first_col(df, keywords):
        # First column whose name mentions any of the keywords, or None.
        matches = [c for c in df.columns if any(k in c.lower() for k in keywords)]
        return matches[0] if matches else None

    def summarize_from_hosts(target, cols):
        # Host-level rollup: overall and per-cluster means, per-host
        # first-row value (host exports are expected to carry one row per
        # host -- TODO confirm against the vHost CSV layout).
        if not cols:
            return
        metric_col = cols[0]
        overall = host_df[metric_col].mean()
        if pd.notna(overall):
            target['overall_avg'] = overall
        cluster_col = first_col(host_df, ('cluster',))
        if cluster_col is not None:
            for cluster in host_df[cluster_col].dropna().unique():
                avg = host_df[host_df[cluster_col] == cluster][metric_col].mean()
                if pd.notna(avg):
                    target['by_cluster'][cluster] = avg
        host_col = first_col(host_df, ('host', 'name', 'esx'))
        if host_col is not None:
            for host in host_df[host_col].dropna().unique():
                rows = host_df[host_df[host_col] == host]
                value = rows[metric_col].iloc[0] if not rows.empty else None
                if pd.notna(value):
                    target['by_host'][host] = value

    def summarize_from_vms(target, vm_df, cols):
        # VM-level rollup: every figure is a mean over the VMs in the group.
        if not cols:
            return
        metric_col = cols[0]
        overall = vm_df[metric_col].mean()
        if pd.notna(overall):
            target['overall_avg'] = overall
        for section, keywords in (('by_cluster', ('cluster',)),
                                  ('by_host', ('host', 'esx'))):
            group_col = first_col(vm_df, keywords)
            if group_col is None:
                continue
            for group in vm_df[group_col].dropna().unique():
                avg = vm_df[vm_df[group_col] == group][metric_col].mean()
                if pd.notna(avg):
                    target[section][group] = avg

    # CPU: prefer host-level data; fall back to VM-level aggregation.
    if host_df is not None and not host_df.empty:
        summarize_from_hosts(summary['cpu'], util_cols(host_df, ('cpu',)))
    elif vm_cpu_df is not None and not vm_cpu_df.empty:
        summarize_from_vms(summary['cpu'], vm_cpu_df, util_cols(vm_cpu_df, ('cpu',)))

    # Memory: same preference order as CPU.
    if host_df is not None and not host_df.empty:
        summarize_from_hosts(summary['memory'], util_cols(host_df, ('mem', 'ram')))
    elif vm_memory_df is not None and not vm_memory_df.empty:
        summarize_from_vms(summary['memory'], vm_memory_df,
                           util_cols(vm_memory_df, ('mem', 'ram')))

    # Network: count NICs and connected NICs overall and per group.
    if vm_network_df is not None and not vm_network_df.empty:
        def nic_stats(df, conn_col):
            # A NIC counts as connected when its status stringifies to one
            # of the accepted truthy markers.
            total = df[conn_col].count()
            connected = df[df[conn_col].astype(str).str.lower()
                           .isin(['true', 'connected', 'yes', '1'])].shape[0]
            return {
                'total_nics': total,
                'connected_nics': connected,
                'connection_rate': connected / total if total > 0 else None,
            }

        nic_cols = [c for c in vm_network_df.columns
                    if 'nic' in c.lower() or 'adapter' in c.lower() or 'connected' in c.lower()]
        connected_cols = [c for c in nic_cols
                          if 'connected' in c.lower() or 'status' in c.lower()]
        # BUG FIX: the original referenced connected_cols in the per-cluster
        # and per-host branches even when nic_cols was empty (the name was
        # only bound inside "if nic_cols:"), which raised NameError.  Both
        # lists are now always defined and every use is guarded.
        if connected_cols:
            conn_col = connected_cols[0]
            summary['network']['overall_stats'] = nic_stats(vm_network_df, conn_col)
            cluster_col = first_col(vm_network_df, ('cluster',))
            if cluster_col is not None:
                for cluster in vm_network_df[cluster_col].dropna().unique():
                    subset = vm_network_df[vm_network_df[cluster_col] == cluster]
                    summary['network']['by_cluster'][cluster] = nic_stats(subset, conn_col)
            host_col = first_col(vm_network_df, ('host', 'esx'))
            if host_col is not None:
                for host in vm_network_df[host_col].dropna().unique():
                    subset = vm_network_df[vm_network_df[host_col] == host]
                    summary['network']['by_host'][host] = nic_stats(subset, conn_col)

    return summary
def print_overall_utilization_summary(self) -> None:
    """Print a formatted summary of overall utilization to the console."""
    summary = self.get_overall_utilization_summary()
    print("\n===== OVERALL UTILIZATION SUMMARY =====\n")

    # CPU and memory share an identical layout, so render both in one pass.
    for title, key in (("CPU", 'cpu'), ("MEMORY", 'memory')):
        section = summary[key]
        print(f"{title} UTILIZATION:")
        overall = section.get('overall_avg')
        if overall is None:
            print(" Overall Average: N/A")
        else:
            print(f" Overall Average: {overall:.2f}%")
        for heading, group_key in ((" By Cluster:", 'by_cluster'),
                                   (" By Host:", 'by_host')):
            entries = section.get(group_key)
            if entries:
                print(heading)
                for name, value in entries.items():
                    print(f" {name}: {value:.2f}%")
        print()  # spacer between sections

    # Network section has its own layout: NIC counts plus connection rates.
    print("NETWORK CONNECTIVITY:")
    network = summary.get('network') or {}
    overall_stats = network.get('overall_stats')
    if overall_stats:
        print(f" Total NICs: {overall_stats.get('total_nics', 'N/A')}")
        print(f" Connected NICs: {overall_stats.get('connected_nics', 'N/A')}")
        rate = overall_stats.get('connection_rate')
        if rate is None:
            print(" Connection Rate: N/A")
        else:
            print(f" Connection Rate: {rate * 100:.2f}%")
    else:
        print(" Network information not available")

    for heading, group_key in ((" By Cluster:", 'by_cluster'),
                               (" By Host:", 'by_host')):
        entries = network.get(group_key)
        if entries:
            print(heading)
            for name, stats in entries.items():
                rate = stats.get('connection_rate')
                rate_str = f"{rate * 100:.2f}%" if rate is not None else "N/A"
                print(f" {name}: {stats.get('connected_nics', 'N/A')}/{stats.get('total_nics', 'N/A')} NICs connected ({rate_str})")
@click.command()
@click.option('--directory', '-d', type=click.Path(exists=True, file_okay=False, dir_okay=True),
              help='Directory containing RVTools CSV files')
@click.option('--top-n', '-n', type=int, default=10,
              help='Number of top VMs to display in the VM utilization report')
def main(directory, top_n):
    """
    Parse and analyze RVTools CSV files for infrastructure utilization metrics.
    If no directory is provided, the script will prompt for one.
    """
    # Fall back to an interactive prompt when --directory was not given.
    if not directory:
        directory = click.prompt(
            "Please enter the path to the directory containing RVTools CSV files",
            type=click.Path(exists=True, file_okay=False, dir_okay=True)
        )
    click.echo(f"Analyzing RVTools CSV files in: {directory}")

    analyzer = RVToolsAnalyzer(directory)

    # Show which of the expected RVTools exports were actually found.
    click.echo("\nLoaded RVTools files:")
    for key, file_path in analyzer.file_mapping.items():
        status = os.path.basename(file_path) if file_path else "Not found"
        click.echo(f" {key}: {status}")

    # Run each report in turn.
    click.echo("\nGenerating overall utilization summary...")
    analyzer.print_overall_utilization_summary()

    click.echo("\nGenerating cluster utilization report...")
    analyzer.print_aggregated_cluster_report()

    # Top-N VM reports: label casing matches the original output ("CPU" vs
    # "memory") while the metric key is what the analyzer expects.
    for label, metric in (('CPU', 'cpu'), ('memory', 'memory')):
        click.echo(f"\nGenerating top {top_n} VMs by {label} utilization...")
        analyzer.print_aggregated_vm_report(top_n=top_n, by=metric)

    click.echo("\nAnalysis complete!")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment