Skip to content

Instantly share code, notes, and snippets.

@greyhoundforty
Created May 7, 2025 20:21
Show Gist options
  • Save greyhoundforty/0cead3155cc441fb726f83355528a656 to your computer and use it in GitHub Desktop.
Save greyhoundforty/0cead3155cc441fb726f83355528a656 to your computer and use it in GitHub Desktop.
RVTools Python Parser
import os
import pandas as pd
import glob
from typing import Dict, List, Optional, Union, Tuple
import statistics
import click
class RVToolsAnalyzer:
def __init__(self, directory_path: str = None):
"""
Initialize the RVTools analyzer with a directory containing RVTools CSV exports.
Args:
directory_path: Path to directory containing RVTools CSV files
"""
self.directory_path = directory_path
self.dataframes = {}
self.file_mapping = {
'vinfo': None,
'vhost': None,
'vcluster': None,
'vcpu': None,
'vmemory': None,
'vnetwork': None,
'vdisk': None
}
if directory_path:
self.load_csv_files()
def load_csv_files(self) -> None:
    """
    Locate and load the RVTools CSV exports from self.directory_path.

    Populates self.file_mapping with the paths that were found and
    self.dataframes with one DataFrame per successfully parsed file.
    Column names are normalized (spaces -> underscores, parentheses
    stripped, '%' -> 'pct', '#' -> 'num') so later keyword lookups work.

    Raises:
        ValueError: If the directory was never set or does not exist.
    """
    if not self.directory_path or not os.path.exists(self.directory_path):
        raise ValueError(f"Directory path {self.directory_path} does not exist")
    # Exact filenames RVTools uses for each export tab.
    exact_file_mapping = {
        'vinfo': 'RVTools_tabvInfo.csv',
        'vhost': 'RVTools_tabvHost.csv',
        'vcluster': 'RVTools_tabvCluster.csv',
        'vcpu': 'RVTools_tabvCPU.csv',
        'vmemory': 'RVTools_tabvMemory.csv',
        'vnetwork': 'RVTools_tabvNetwork.csv',
        'vdisk': 'RVTools_tabvDisk.csv'
    }
    # Find the exact filenames in the directory.
    for key, filename in exact_file_mapping.items():
        filepath = os.path.join(self.directory_path, filename)
        if os.path.exists(filepath):
            self.file_mapping[key] = filepath
            # FIX: these two messages previously printed a literal
            # placeholder instead of the filename.
            print(f"Found {key} file: {filename}")
        else:
            print(f"Warning: {filename} not found in {self.directory_path}")
    # Load each discovered file into a pandas DataFrame.
    for key, file_path in self.file_mapping.items():
        if file_path and os.path.exists(file_path):
            try:
                df = pd.read_csv(file_path, encoding='utf-8', low_memory=False)
                # Normalize headers so downstream keyword matching is uniform.
                df.columns = [
                    c.strip().replace(' ', '_').replace('(', '').replace(')', '')
                     .replace('%', 'pct').replace('#', 'num')
                    for c in df.columns
                ]
                self.dataframes[key] = df
                print(f"Loaded {key} data from {os.path.basename(file_path)} with {len(df)} rows and {len(df.columns)} columns")
                print(f"Sample columns: {', '.join(df.columns[:5])}")
            except Exception as e:
                # Best-effort: report the failure and keep loading the rest.
                print(f"Error loading {file_path}: {e}")
        else:
            print(f"File for {key} not found or not valid")
def get_cluster_utilization(self) -> pd.DataFrame:
    """
    Build a per-cluster utilization view from the vCluster export.

    Returns:
        DataFrame holding the cluster-name column plus every CPU and
        memory percentage column; empty DataFrame when no suitable name
        column is present.

    Raises:
        ValueError: If the vCluster data was never loaded.
    """
    if 'vcluster' not in self.dataframes:
        raise ValueError("vCluster data not found in loaded files")
    df = self.dataframes['vcluster']
    # Header names vary across RVTools versions, so match by keyword.
    name_candidates = [c for c in df.columns if 'cluster' in c.lower() or 'name' in c.lower()]
    if not name_candidates:
        return pd.DataFrame()
    summary = df[[name_candidates[0]]].copy()
    # Only percentage-style columns are treated as utilization metrics;
    # CPU columns first, then memory, preserving the original layout.
    cpu_pct = [c for c in df.columns if 'cpu' in c.lower() and 'pct' in c.lower()]
    mem_pct = [c for c in df.columns if ('mem' in c.lower() or 'ram' in c.lower()) and 'pct' in c.lower()]
    for column in cpu_pct + mem_pct:
        summary[column] = df[column]
    return summary
def get_host_utilization(self) -> pd.DataFrame:
    """
    Build a per-host utilization view from the vHost export.

    Returns:
        DataFrame with the host-name column, an optional cluster column,
        and any CPU / memory usage columns; empty DataFrame when no host
        column can be identified.

    Raises:
        ValueError: If the vHost data was never loaded.
    """
    if 'vhost' not in self.dataframes:
        raise ValueError("vHost data not found in loaded files")
    df = self.dataframes['vhost']

    def usage_like(lowered):
        # RVTools labels utilization columns inconsistently across versions.
        return 'usage' in lowered or 'util' in lowered or 'pct' in lowered

    host_candidates = [c for c in df.columns if any(k in c.lower() for k in ('host', 'name', 'esx'))]
    if not host_candidates:
        return pd.DataFrame()
    selected = [host_candidates[0]]
    cluster_candidates = [c for c in df.columns if 'cluster' in c.lower()]
    if cluster_candidates:
        selected.append(cluster_candidates[0])
    selected += [c for c in df.columns if 'cpu' in c.lower() and usage_like(c.lower())]
    selected += [c for c in df.columns if ('mem' in c.lower() or 'ram' in c.lower()) and usage_like(c.lower())]
    # Every name above came straight from df.columns, so no existence check needed.
    return df[selected].copy()
def get_vm_cpu_utilization(self) -> pd.DataFrame:
    """
    Extract VM-level CPU columns from the vInfo export.

    Returns:
        DataFrame with the VM name, cluster/host context, and CPU
        columns; empty DataFrame when vInfo is missing or no VM column
        can be identified.
    """
    if 'vinfo' not in self.dataframes:
        print("vInfo data not found in loaded files")
        return pd.DataFrame()
    df = self.dataframes['vinfo']
    # Prefer the canonical 'VM' header; otherwise fall back to fuzzy matching.
    if 'VM' in df.columns:
        vm_candidates = ['VM']
    else:
        vm_candidates = [c for c in df.columns if 'vm' in c.lower() and 'name' not in c.lower()]
        if not vm_candidates:
            vm_candidates = [c for c in df.columns if 'name' in c.lower() and 'dns' not in c.lower()]
    if not vm_candidates:
        return pd.DataFrame()
    # Canonical CPU headers when present, otherwise a fuzzy scan.
    if 'CPUs' in df.columns:
        cpu_candidates = ['CPUs', 'Overall_Cpu_Readiness']
    else:
        cpu_candidates = [c for c in df.columns if 'cpu' in c.lower() and 'num' not in c.lower()]
    cluster_candidates = ['Cluster'] if 'Cluster' in df.columns else [c for c in df.columns if 'cluster' in c.lower()]
    host_candidates = ['Host'] if 'Host' in df.columns else [c for c in df.columns if 'host' in c.lower()]
    chosen = [vm_candidates[0]]
    chosen.extend(cluster_candidates)
    chosen.extend(host_candidates)
    chosen.extend(c for c in cpu_candidates if c in df.columns)
    # Guard against canonical names that are not actually present.
    chosen = [c for c in chosen if c in df.columns]
    if not chosen:
        return pd.DataFrame()
    print(f"Using columns for VM CPU utilization: {', '.join(chosen)}")
    return df[chosen].copy()
def get_vm_memory_utilization(self) -> pd.DataFrame:
    """
    Extract VM-level memory columns from the vInfo export.

    NOTE(review): a later definition in this file re-declares this method
    name (vMemory-based); at runtime that later definition wins.

    Returns:
        DataFrame with the VM name, cluster/host context, and memory
        columns; empty DataFrame when vInfo is missing or no VM column
        can be identified.
    """
    if 'vinfo' not in self.dataframes:
        print("vInfo data not found in loaded files")
        return pd.DataFrame()
    df = self.dataframes['vinfo']
    # Prefer the canonical 'VM' header; otherwise fall back to fuzzy matching.
    if 'VM' in df.columns:
        vm_candidates = ['VM']
    else:
        vm_candidates = [c for c in df.columns if 'vm' in c.lower() and 'name' not in c.lower()]
        if not vm_candidates:
            vm_candidates = [c for c in df.columns if 'name' in c.lower() and 'dns' not in c.lower()]
    if not vm_candidates:
        return pd.DataFrame()
    # Canonical memory headers when present, otherwise a fuzzy scan.
    if 'Memory' in df.columns:
        mem_candidates = ['Memory', 'Active_Memory']
    else:
        mem_candidates = [c for c in df.columns if 'mem' in c.lower() or 'ram' in c.lower()]
    cluster_candidates = ['Cluster'] if 'Cluster' in df.columns else [c for c in df.columns if 'cluster' in c.lower()]
    host_candidates = ['Host'] if 'Host' in df.columns else [c for c in df.columns if 'host' in c.lower()]
    chosen = [vm_candidates[0]]
    chosen.extend(cluster_candidates)
    chosen.extend(host_candidates)
    chosen.extend(c for c in mem_candidates if c in df.columns)
    # Guard against canonical names that are not actually present.
    chosen = [c for c in chosen if c in df.columns]
    if not chosen:
        return pd.DataFrame()
    print(f"Using columns for VM memory utilization: {', '.join(chosen)}")
    return df[chosen].copy()
def print_aggregated_vm_report(self, top_n: int = 10, by: str = 'cpu') -> None:
"""
Print a formatted VM utilization report, showing the top N VMs by utilization.

NOTE(review): a second def with this exact name appears later in the file;
at runtime the later definition replaces this one, so this body is dead
code unless the duplicate is removed.

Args:
top_n: Number of top VMs to show per cluster/host grouping
by: Metric to sort by ('cpu' or 'memory')
"""
# If we have vinfo but not vcpu or vmemory, use vinfo for everything
if 'vinfo' in self.dataframes and ('vcpu' not in self.dataframes or 'vmemory' not in self.dataframes):
print("Using vInfo data for VM utilization metrics")
vm_cpu_df = self.get_vm_cpu_utilization()
vm_memory_df = self.get_vm_memory_utilization()
else:
# Otherwise use the standard methods
vm_cpu_df = self.get_vm_cpu_utilization() if 'vcpu' in self.dataframes else None
vm_memory_df = self.get_vm_memory_utilization() if 'vmemory' in self.dataframes else None
# NOTE(review): this dict is built but never read afterwards - dead local.
report = {'vms_by_cluster': {}, 'vms_by_host': {}}
# Process VM CPU data
if vm_cpu_df is not None and not vm_cpu_df.empty:
# Try to find cluster and host columns
cluster_cols = [col for col in vm_cpu_df.columns if 'cluster' in col.lower()]
host_cols = [col for col in vm_cpu_df.columns if 'host' in col.lower()]
vm_col = vm_cpu_df.columns[0] # Assume first column is VM name
# Identify CPU utilization metric columns (anything CPU-ish except the name column)
cpu_cols = [col for col in vm_cpu_df.columns if 'cpu' in col.lower() and col != vm_col]
# Group by cluster if available
if cluster_cols and vm_col and cpu_cols:
cluster_col = cluster_cols[0]
print(f"Grouping VMs by cluster using column: {cluster_col}")
for cluster in vm_cpu_df[cluster_col].dropna().unique():
cluster_vms = vm_cpu_df[vm_cpu_df[cluster_col] == cluster]
# Create a list of (vm_name, cpu_value) tuples for sorting
vm_cpu_values = []
for _, row in cluster_vms.iterrows():
vm_name = row[vm_col]
# Use first CPU column as the metric for sorting; NaN counts as 0
cpu_value = row[cpu_cols[0]] if pd.notna(row[cpu_cols[0]]) else 0
vm_cpu_values.append((vm_name, cpu_value))
# Sort by CPU value (descending) and take top N
vm_cpu_values.sort(key=lambda x: x[1], reverse=True)
top_vms = vm_cpu_values[:top_n]
# Only print the CPU view when the caller asked for it
if by.lower() == 'cpu' and top_vms:
print(f"\nCluster: {cluster}")
print(f" Top {len(top_vms)} VMs by CPU:")
for vm_name, cpu_value in top_vms:
print(f" {vm_name}: {cpu_value} CPUs")
# Group by host if available
if host_cols and vm_col and cpu_cols:
host_col = host_cols[0]
print(f"Grouping VMs by host using column: {host_col}")
for host in vm_cpu_df[host_col].dropna().unique():
host_vms = vm_cpu_df[vm_cpu_df[host_col] == host]
# Create a list of (vm_name, cpu_value) tuples for sorting
vm_cpu_values = []
for _, row in host_vms.iterrows():
vm_name = row[vm_col]
# Use first CPU column as the metric for sorting; NaN counts as 0
cpu_value = row[cpu_cols[0]] if pd.notna(row[cpu_cols[0]]) else 0
vm_cpu_values.append((vm_name, cpu_value))
# Sort by CPU value (descending) and take top N
vm_cpu_values.sort(key=lambda x: x[1], reverse=True)
top_vms = vm_cpu_values[:top_n]
if by.lower() == 'cpu' and top_vms:
print(f"\nHost: {host}")
print(f" Top {len(top_vms)} VMs by CPU:")
for vm_name, cpu_value in top_vms:
print(f" {vm_name}: {cpu_value} CPUs")
# Process VM Memory data (only when the caller asked for the memory view)
if vm_memory_df is not None and not vm_memory_df.empty and by.lower() == 'memory':
# Try to find cluster and host columns
cluster_cols = [col for col in vm_memory_df.columns if 'cluster' in col.lower()]
host_cols = [col for col in vm_memory_df.columns if 'host' in col.lower()]
vm_col = vm_memory_df.columns[0] # Assume first column is VM name
# Identify memory utilization metric columns
mem_cols = [col for col in vm_memory_df.columns if ('mem' in col.lower() or 'ram' in col.lower()) and col != vm_col]
# Group by cluster if available
if cluster_cols and vm_col and mem_cols:
cluster_col = cluster_cols[0]
print(f"Grouping VMs by cluster using column: {cluster_col}")
for cluster in vm_memory_df[cluster_col].dropna().unique():
cluster_vms = vm_memory_df[vm_memory_df[cluster_col] == cluster]
# Create a list of (vm_name, mem_value) tuples for sorting
vm_mem_values = []
for _, row in cluster_vms.iterrows():
vm_name = row[vm_col]
# Use first memory column as the metric for sorting; NaN counts as 0
mem_value = row[mem_cols[0]] if pd.notna(row[mem_cols[0]]) else 0
vm_mem_values.append((vm_name, mem_value))
# Sort by memory value (descending) and take top N
vm_mem_values.sort(key=lambda x: x[1], reverse=True)
top_vms = vm_mem_values[:top_n]
if top_vms:
print(f"\nCluster: {cluster}")
print(f" Top {len(top_vms)} VMs by Memory:")
for vm_name, mem_value in top_vms:
print(f" {vm_name}: {mem_value} MB")
# Group by host if available
if host_cols and vm_col and mem_cols:
host_col = host_cols[0]
print(f"Grouping VMs by host using column: {host_col}")
for host in vm_memory_df[host_col].dropna().unique():
host_vms = vm_memory_df[vm_memory_df[host_col] == host]
# Create a list of (vm_name, mem_value) tuples for sorting
vm_mem_values = []
for _, row in host_vms.iterrows():
vm_name = row[vm_col]
# Use first memory column as the metric for sorting; NaN counts as 0
mem_value = row[mem_cols[0]] if pd.notna(row[mem_cols[0]]) else 0
vm_mem_values.append((vm_name, mem_value))
# Sort by memory value (descending) and take top N
vm_mem_values.sort(key=lambda x: x[1], reverse=True)
top_vms = vm_mem_values[:top_n]
if top_vms:
print(f"\nHost: {host}")
print(f" Top {len(top_vms)} VMs by Memory:")
for vm_name, mem_value in top_vms:
print(f" {vm_name}: {mem_value} MB")
# Nothing at all to report
if (vm_cpu_df is None or vm_cpu_df.empty) and (vm_memory_df is None or vm_memory_df.empty):
print("No VM data available.")
def get_vm_memory_utilization(self) -> pd.DataFrame:
    """
    Build a per-VM memory view from the vMemory export.

    NOTE(review): this overrides the earlier vInfo-based method of the
    same name defined above; at runtime this version wins.

    Returns:
        DataFrame keyed by VM name with memory usage/size columns plus
        any cluster/host context columns; empty DataFrame when no
        VM-name column exists.

    Raises:
        ValueError: If the vMemory data was never loaded.
    """
    if 'vmemory' not in self.dataframes:
        raise ValueError("vMemory data not found in loaded files")
    df = self.dataframes['vmemory']
    # VM-name column: prefer an explicit vm+name header, then any name.
    name_candidates = [c for c in df.columns if 'vm' in c.lower() and 'name' in c.lower()]
    if not name_candidates:
        name_candidates = [c for c in df.columns if 'name' in c.lower()]
    if not name_candidates:
        return pd.DataFrame()
    mem_like = [c for c in df.columns if 'mem' in c.lower() or 'ram' in c.lower()]
    util_like = [c for c in mem_like if any(k in c.lower() for k in ('usage', 'util', 'pct'))]
    size_like = [c for c in mem_like if any(k in c.lower() for k in ('size', 'total', 'capacity'))]
    view = df[[name_candidates[0]]].copy()
    # Utilization columns first, then capacity columns.
    for column in util_like + size_like:
        view[column] = df[column]
    # Carry along cluster/host identity so callers can group the rows.
    context = [c for c in df.columns if 'cluster' in c.lower()]
    context += [c for c in df.columns if 'host' in c.lower() or 'esx' in c.lower()]
    for column in context:
        view[column] = df[column]
    return view
def get_network_utilization(self) -> pd.DataFrame:
    """
    Build a per-VM network inventory view from the vNetwork export.

    Returns:
        DataFrame with the VM name, all network-related columns
        (NIC/adapter/MAC/connection state), and any cluster/host context
        columns; empty DataFrame when no VM-name column exists.

    Raises:
        ValueError: If the vNetwork data was never loaded.
    """
    if 'vnetwork' not in self.dataframes:
        raise ValueError("vNetwork data not found in loaded files")
    df = self.dataframes['vnetwork']
    # VM-name column: prefer an explicit vm+name header, then any name.
    name_candidates = [c for c in df.columns if 'vm' in c.lower() and 'name' in c.lower()]
    if not name_candidates:
        name_candidates = [c for c in df.columns if 'name' in c.lower()]
    if not name_candidates:
        return pd.DataFrame()
    view = df[[name_candidates[0]]].copy()
    # Anything whose header hints at networking gets carried over verbatim.
    keywords = ('network', 'nic', 'net', 'adapter', 'connected', 'mac')
    for column in df.columns:
        if any(k in column.lower() for k in keywords):
            view[column] = df[column]
    # Cluster/host identity lets callers group the rows later.
    context = [c for c in df.columns if 'cluster' in c.lower()]
    context += [c for c in df.columns if 'host' in c.lower() or 'esx' in c.lower()]
    for column in context:
        view[column] = df[column]
    return view
def get_aggregated_cluster_report(self) -> Dict:
"""
Generate an aggregated report of cluster-level utilization.

Returns:
Dict shaped as {'clusters': {name: {'cpu': {...}, 'memory': {...},
'hosts': {host: {'cpu': {...}, 'memory': {...}}}}}}; empty dict when
neither vCluster nor vHost data is loaded.
"""
report = {}
# Get relevant dataframes; the getters raise when their tab is missing,
# hence the membership guards.
cluster_df = self.get_cluster_utilization() if 'vcluster' in self.dataframes else None
host_df = self.get_host_utilization() if 'vhost' in self.dataframes else None
if cluster_df is not None and not cluster_df.empty:
# Get cluster names
cluster_col = cluster_df.columns[0] # Assuming first column is cluster name
clusters = cluster_df[cluster_col].unique()
report['clusters'] = {}
# Process each cluster
for cluster in clusters:
cluster_data = cluster_df[cluster_df[cluster_col] == cluster]
# CPU metrics: percentage columns only; first matching row wins
cpu_cols = [col for col in cluster_data.columns if 'cpu' in col.lower() and 'pct' in col.lower()]
cpu_metrics = {}
for col in cpu_cols:
if not cluster_data[col].empty:
cpu_metrics[col] = cluster_data[col].iloc[0]
# Memory metrics: same percentage-only rule
mem_cols = [col for col in cluster_data.columns if ('mem' in col.lower() or 'ram' in col.lower()) and 'pct' in col.lower()]
mem_metrics = {}
for col in mem_cols:
if not cluster_data[col].empty:
mem_metrics[col] = cluster_data[col].iloc[0]
report['clusters'][cluster] = {
'cpu': cpu_metrics,
'memory': mem_metrics,
'hosts': {} # Initialize the hosts dictionary for each cluster
}
# Add host-level aggregated data if available
if host_df is not None and not host_df.empty:
# Identify cluster column if it exists
cluster_cols = [col for col in host_df.columns if 'cluster' in col.lower()]
if cluster_cols:
cluster_col = cluster_cols[0]
# Group hosts by cluster
clusters = host_df[cluster_col].unique()
if 'clusters' not in report:
report['clusters'] = {}
for cluster in clusters:
if pd.notna(cluster): # Skip NaN values
cluster_hosts = host_df[host_df[cluster_col] == cluster]
# CPU metrics
cpu_cols = [col for col in cluster_hosts.columns if 'cpu' in col.lower() and ('usage' in col.lower() or 'util' in col.lower() or 'pct' in col.lower())]
# Memory metrics
mem_cols = [col for col in cluster_hosts.columns if ('mem' in col.lower() or 'ram' in col.lower()) and ('usage' in col.lower() or 'util' in col.lower() or 'pct' in col.lower())]
# Clusters seen only in vHost (absent from vCluster) get a fresh entry
if cluster not in report['clusters']:
report['clusters'][cluster] = {
'cpu': {},
'memory': {},
'hosts': {} # Initialize the hosts dictionary if this is a new cluster
}
elif 'hosts' not in report['clusters'][cluster]:
# Add the hosts key if it doesn't exist
report['clusters'][cluster]['hosts'] = {}
# Add host-level details
host_col = [col for col in cluster_hosts.columns if 'host' in col.lower() or 'name' in col.lower() or 'esx' in col.lower()][0]
for _, host_row in cluster_hosts.iterrows():
host_name = host_row[host_col]
host_cpu = {}
for col in cpu_cols:
if pd.notna(host_row[col]):
host_cpu[col] = host_row[col]
host_mem = {}
for col in mem_cols:
if pd.notna(host_row[col]):
host_mem[col] = host_row[col]
report['clusters'][cluster]['hosts'][host_name] = {
'cpu': host_cpu,
'memory': host_mem
}
return report
def get_aggregated_vm_report(self) -> Dict:
    """
    Generate an aggregated report of VM-level utilization.

    Merges the per-VM CPU, memory, and network views into one nested
    structure so each VM entry carries every available metric. The
    original implementation repeated the same merge logic six times
    (3 metrics x cluster/host) with subtly inconsistent NaN / empty-name
    handling; this version uses one helper with uniform skip rules.

    Returns:
        Dict with keys 'vms_by_cluster' and 'vms_by_host'; each maps
        group name -> VM name -> {'cpu': {...}, 'memory': {...},
        'network': {...}} with metric-column -> value entries.
    """
    report = {'vms_by_cluster': {}, 'vms_by_host': {}}

    def _usage_like(name: str) -> bool:
        # RVTools labels utilization columns inconsistently across versions.
        low = name.lower()
        return 'usage' in low or 'util' in low or 'pct' in low

    def _merge(dest: Dict, df: pd.DataFrame, group_col: str, vm_col: str,
               metric: str, metric_cols: List[str]) -> None:
        # Fold each row's metric values into dest[group][vm][metric].
        # NaN groups and empty/NaN VM names are skipped uniformly.
        for _, row in df.iterrows():
            group = row[group_col]
            vm_name = row[vm_col]
            if pd.isna(group) or pd.isna(vm_name) or not vm_name:
                continue
            vm_entry = dest.setdefault(group, {}).setdefault(
                vm_name, {'cpu': {}, 'memory': {}, 'network': {}})
            for col in metric_cols:
                if pd.notna(row[col]):
                    vm_entry[metric][col] = row[col]

    # (metric key, source DataFrame or None, predicate picking its columns)
    sources = [
        ('cpu',
         self.get_vm_cpu_utilization() if 'vcpu' in self.dataframes else None,
         lambda c: 'cpu' in c.lower() and _usage_like(c)),
        ('memory',
         self.get_vm_memory_utilization() if 'vmemory' in self.dataframes else None,
         lambda c: ('mem' in c.lower() or 'ram' in c.lower()) and (_usage_like(c) or 'size' in c.lower())),
        ('network',
         self.get_network_utilization() if 'vnetwork' in self.dataframes else None,
         lambda c: 'network' in c.lower() or 'nic' in c.lower() or 'adapters' in c.lower()),
    ]
    for metric, df, wanted in sources:
        if df is None or df.empty:
            continue
        # VM-name column: prefer an explicit vm+name header, then any name.
        vm_cols = [c for c in df.columns if 'vm' in c.lower() and 'name' in c.lower()]
        if not vm_cols:
            vm_cols = [c for c in df.columns if 'name' in c.lower()]
        if not vm_cols:
            continue
        vm_col = vm_cols[0]
        metric_cols = [c for c in df.columns if wanted(c)]
        cluster_cols = [c for c in df.columns if 'cluster' in c.lower()]
        host_cols = [c for c in df.columns if 'host' in c.lower() or 'esx' in c.lower()]
        if cluster_cols:
            _merge(report['vms_by_cluster'], df, cluster_cols[0], vm_col, metric, metric_cols)
        if host_cols:
            _merge(report['vms_by_host'], df, host_cols[0], vm_col, metric, metric_cols)
    return report
def print_aggregated_cluster_report(self) -> None:
    """Print a formatted cluster utilization report to the console."""
    report = self.get_aggregated_cluster_report()
    clusters = report.get('clusters') if report else None
    if not clusters:
        print("No cluster data available.")
        return
    print("\n===== CLUSTER UTILIZATION REPORT =====\n")
    for cluster_name, cluster_data in clusters.items():
        print(f"Cluster: {cluster_name}")
        # Cluster-level CPU metrics
        cpu_metrics = cluster_data.get('cpu')
        if cpu_metrics:
            print(" CPU Utilization:")
            for metric, value in cpu_metrics.items():
                print(f" {metric}: {value}")
        # Cluster-level memory metrics
        mem_metrics = cluster_data.get('memory')
        if mem_metrics:
            print(" Memory Utilization:")
            for metric, value in mem_metrics.items():
                print(f" {metric}: {value}")
        # Per-host breakdown, when present
        host_entries = cluster_data.get('hosts')
        if host_entries:
            print(" Hosts:")
            for host_name, host_data in host_entries.items():
                print(f" Host: {host_name}")
                if host_data.get('cpu'):
                    print(" CPU Utilization:")
                    for metric, value in host_data['cpu'].items():
                        print(f" {metric}: {value}")
                if host_data.get('memory'):
                    print(" Memory Utilization:")
                    for metric, value in host_data['memory'].items():
                        print(f" {metric}: {value}")
        print()  # blank line between clusters
def print_aggregated_vm_report(self, top_n: int = 10, by: str = 'cpu') -> None:
    """
    Print a formatted VM utilization report straight from the vInfo
    export, showing the top N VMs per cluster and per host.

    Args:
        top_n: Number of top VMs to show in each grouping.
        by: Metric to sort by; 'cpu' uses the 'CPUs' column, anything
            else uses the 'Memory' column.
    """
    if 'vinfo' not in self.dataframes:
        print("No VM data available - vInfo data missing.")
        return
    df = self.dataframes['vinfo']
    # Debug output
    print(f"Found {len(df)} VMs in vInfo data")
    vm_col, cluster_col, host_col = 'VM', 'Cluster', 'Host'
    missing_cols = [col for col in (vm_col, cluster_col, host_col) if col not in df.columns]
    if missing_cols:
        print(f"Missing required columns: {', '.join(missing_cols)}")
        return
    # Pick the metric column and its display label.
    if by.lower() == 'cpu':
        metric_col, metric_label = 'CPUs', 'CPU Count'
    else:  # memory
        metric_col, metric_label = 'Memory', 'Memory (MB)'
    if metric_col not in df.columns:
        print(f"Metric column '{metric_col}' not found in vInfo data")
        return
    # FIX: coerce to numeric on a local Series; the original assigned the
    # coerced column back into self.dataframes['vinfo'], silently mutating
    # shared state as a side effect of printing a report.
    try:
        metric_values = pd.to_numeric(df[metric_col], errors='coerce')
    except Exception as e:
        print(f"Error converting {metric_col} to numeric: {e}")
        return

    def _print_top(group_col: str, prefix: str) -> None:
        # Print the top-N VMs for every distinct value of group_col.
        for group in df[group_col].dropna().unique():
            mask = df[group_col] == group
            pairs = []
            for vm_name, value in zip(df.loc[mask, vm_col], metric_values[mask]):
                # Treat missing metrics as 0 so such VMs sort last.
                pairs.append((vm_name, value if pd.notna(value) else 0))
            if not pairs:
                print(f"\n{prefix}: {group} - No VM data available")
                continue
            pairs.sort(key=lambda p: p[1], reverse=True)
            top_vms = pairs[:top_n]
            print(f"\n{prefix}: {group}")
            print(f" Top {len(top_vms)} VMs by {by.upper()}:")
            for vm_name, value in top_vms:
                print(f" {vm_name}: {value} {metric_label}")

    print(f"\n===== TOP {top_n} VMs BY {by.upper()} =====\n")
    print("BY CLUSTER:")
    _print_top(cluster_col, "Cluster")
    print("\nBY HOST:")
    _print_top(host_col, "Host")
def get_overall_utilization_summary(self) -> Dict:
    """
    Create a summary of overall utilization across the environment.

    CPU and memory figures are taken from host-level data when the vHost
    export was loaded; otherwise they are aggregated up from per-VM data.
    Network figures count NICs and how many report a connected state.

    Returns:
        Dictionary with 'cpu', 'memory' and 'network' sections.  The cpu
        and memory sections hold 'overall_avg', 'by_cluster' and 'by_host';
        the network section holds 'overall_stats', 'by_cluster' and
        'by_host' NIC-count dicts.
    """
    summary = {
        'cpu': {'overall_avg': None, 'by_cluster': {}, 'by_host': {}},
        'memory': {'overall_avg': None, 'by_cluster': {}, 'by_host': {}},
        'network': {'overall_stats': {}, 'by_cluster': {}, 'by_host': {}},
    }

    # Pull in whichever source dataframes were loaded from the CSV exports.
    host_df = self.get_host_utilization() if 'vhost' in self.dataframes else None
    vm_cpu_df = self.get_vm_cpu_utilization() if 'vcpu' in self.dataframes else None
    vm_memory_df = self.get_vm_memory_utilization() if 'vmemory' in self.dataframes else None
    vm_network_df = self.get_network_utilization() if 'vnetwork' in self.dataframes else None

    def util_cols(df, keywords):
        # Fuzzy-match utilization columns: the name must mention the
        # resource (any keyword) and a usage/util/pct qualifier.
        return [c for c in df.columns
                if any(k in c.lower() for k in keywords)
                and ('usage' in c.lower() or 'util' in c.lower() or 'pct' in c.lower())]

    def first_col(df, keywords):
        # First column whose name mentions any of the keywords, or None.
        matches = [c for c in df.columns if any(k in c.lower() for k in keywords)]
        return matches[0] if matches else None

    def summarize_from_hosts(target, cols):
        # Host-level rollup: overall and per-cluster means, per-host
        # first-row value (host exports are expected to carry one row per
        # host -- TODO confirm against the vHost CSV layout).
        if not cols:
            return
        metric_col = cols[0]
        overall = host_df[metric_col].mean()
        if pd.notna(overall):
            target['overall_avg'] = overall
        cluster_col = first_col(host_df, ('cluster',))
        if cluster_col is not None:
            for cluster in host_df[cluster_col].dropna().unique():
                avg = host_df[host_df[cluster_col] == cluster][metric_col].mean()
                if pd.notna(avg):
                    target['by_cluster'][cluster] = avg
        host_col = first_col(host_df, ('host', 'name', 'esx'))
        if host_col is not None:
            for host in host_df[host_col].dropna().unique():
                rows = host_df[host_df[host_col] == host]
                value = rows[metric_col].iloc[0] if not rows.empty else None
                if pd.notna(value):
                    target['by_host'][host] = value

    def summarize_from_vms(target, vm_df, cols):
        # VM-level rollup: every figure is a mean over the VMs in the group.
        if not cols:
            return
        metric_col = cols[0]
        overall = vm_df[metric_col].mean()
        if pd.notna(overall):
            target['overall_avg'] = overall
        for section, keywords in (('by_cluster', ('cluster',)),
                                  ('by_host', ('host', 'esx'))):
            group_col = first_col(vm_df, keywords)
            if group_col is None:
                continue
            for group in vm_df[group_col].dropna().unique():
                avg = vm_df[vm_df[group_col] == group][metric_col].mean()
                if pd.notna(avg):
                    target[section][group] = avg

    # CPU: prefer host-level data; fall back to VM-level aggregation.
    if host_df is not None and not host_df.empty:
        summarize_from_hosts(summary['cpu'], util_cols(host_df, ('cpu',)))
    elif vm_cpu_df is not None and not vm_cpu_df.empty:
        summarize_from_vms(summary['cpu'], vm_cpu_df, util_cols(vm_cpu_df, ('cpu',)))

    # Memory: same preference order as CPU.
    if host_df is not None and not host_df.empty:
        summarize_from_hosts(summary['memory'], util_cols(host_df, ('mem', 'ram')))
    elif vm_memory_df is not None and not vm_memory_df.empty:
        summarize_from_vms(summary['memory'], vm_memory_df,
                           util_cols(vm_memory_df, ('mem', 'ram')))

    # Network: count NICs and connected NICs overall and per group.
    if vm_network_df is not None and not vm_network_df.empty:
        def nic_stats(df, conn_col):
            # A NIC counts as connected when its status stringifies to one
            # of the accepted truthy markers.
            total = df[conn_col].count()
            connected = df[df[conn_col].astype(str).str.lower()
                           .isin(['true', 'connected', 'yes', '1'])].shape[0]
            return {
                'total_nics': total,
                'connected_nics': connected,
                'connection_rate': connected / total if total > 0 else None,
            }

        nic_cols = [c for c in vm_network_df.columns
                    if 'nic' in c.lower() or 'adapter' in c.lower() or 'connected' in c.lower()]
        connected_cols = [c for c in nic_cols
                          if 'connected' in c.lower() or 'status' in c.lower()]
        # BUG FIX: the original referenced connected_cols in the per-cluster
        # and per-host branches even when nic_cols was empty (the name was
        # only bound inside "if nic_cols:"), which raised NameError.  Both
        # lists are now always defined and every use is guarded.
        if connected_cols:
            conn_col = connected_cols[0]
            summary['network']['overall_stats'] = nic_stats(vm_network_df, conn_col)
            cluster_col = first_col(vm_network_df, ('cluster',))
            if cluster_col is not None:
                for cluster in vm_network_df[cluster_col].dropna().unique():
                    subset = vm_network_df[vm_network_df[cluster_col] == cluster]
                    summary['network']['by_cluster'][cluster] = nic_stats(subset, conn_col)
            host_col = first_col(vm_network_df, ('host', 'esx'))
            if host_col is not None:
                for host in vm_network_df[host_col].dropna().unique():
                    subset = vm_network_df[vm_network_df[host_col] == host]
                    summary['network']['by_host'][host] = nic_stats(subset, conn_col)

    return summary
def print_overall_utilization_summary(self) -> None:
    """Print a formatted summary of overall utilization to the console."""
    summary = self.get_overall_utilization_summary()
    print("\n===== OVERALL UTILIZATION SUMMARY =====\n")

    # CPU and memory share an identical layout, so render both in one pass.
    for title, key in (("CPU", 'cpu'), ("MEMORY", 'memory')):
        section = summary[key]
        print(f"{title} UTILIZATION:")
        overall = section.get('overall_avg')
        if overall is None:
            print(" Overall Average: N/A")
        else:
            print(f" Overall Average: {overall:.2f}%")
        for heading, group_key in ((" By Cluster:", 'by_cluster'),
                                   (" By Host:", 'by_host')):
            entries = section.get(group_key)
            if entries:
                print(heading)
                for name, value in entries.items():
                    print(f" {name}: {value:.2f}%")
        print()  # spacer between sections

    # Network section has its own layout: NIC counts plus connection rates.
    print("NETWORK CONNECTIVITY:")
    network = summary.get('network') or {}
    overall_stats = network.get('overall_stats')
    if overall_stats:
        print(f" Total NICs: {overall_stats.get('total_nics', 'N/A')}")
        print(f" Connected NICs: {overall_stats.get('connected_nics', 'N/A')}")
        rate = overall_stats.get('connection_rate')
        if rate is None:
            print(" Connection Rate: N/A")
        else:
            print(f" Connection Rate: {rate * 100:.2f}%")
    else:
        print(" Network information not available")

    for heading, group_key in ((" By Cluster:", 'by_cluster'),
                               (" By Host:", 'by_host')):
        entries = network.get(group_key)
        if entries:
            print(heading)
            for name, stats in entries.items():
                rate = stats.get('connection_rate')
                rate_str = f"{rate * 100:.2f}%" if rate is not None else "N/A"
                print(f" {name}: {stats.get('connected_nics', 'N/A')}/{stats.get('total_nics', 'N/A')} NICs connected ({rate_str})")
@click.command()
@click.option('--directory', '-d', type=click.Path(exists=True, file_okay=False, dir_okay=True),
              help='Directory containing RVTools CSV files')
@click.option('--top-n', '-n', type=int, default=10,
              help='Number of top VMs to display in the VM utilization report')
def main(directory, top_n):
    """
    Parse and analyze RVTools CSV files for infrastructure utilization metrics.
    If no directory is provided, the script will prompt for one.
    """
    # Fall back to an interactive prompt when --directory was not given.
    if not directory:
        directory = click.prompt(
            "Please enter the path to the directory containing RVTools CSV files",
            type=click.Path(exists=True, file_okay=False, dir_okay=True)
        )
    click.echo(f"Analyzing RVTools CSV files in: {directory}")

    analyzer = RVToolsAnalyzer(directory)

    # Show which of the expected RVTools exports were actually found.
    click.echo("\nLoaded RVTools files:")
    for key, file_path in analyzer.file_mapping.items():
        status = os.path.basename(file_path) if file_path else "Not found"
        click.echo(f" {key}: {status}")

    # Run each report in turn.
    click.echo("\nGenerating overall utilization summary...")
    analyzer.print_overall_utilization_summary()

    click.echo("\nGenerating cluster utilization report...")
    analyzer.print_aggregated_cluster_report()

    # Top-N VM reports: label casing matches the original output ("CPU" vs
    # "memory") while the metric key is what the analyzer expects.
    for label, metric in (('CPU', 'cpu'), ('memory', 'memory')):
        click.echo(f"\nGenerating top {top_n} VMs by {label} utilization...")
        analyzer.print_aggregated_vm_report(top_n=top_n, by=metric)

    click.echo("\nAnalysis complete!")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment