llm_lab_perso/utils/system_monitor.py
2025-04-21 17:36:30 +02:00

590 lines
27 KiB
Python

"""
System resource monitor for Ollama and NVIDIA GPU resources
"""
import tkinter as tk
from tkinter import ttk
import psutil
import threading
import time
import subprocess
import json
import os
import platform
import requests
from datetime import datetime
class SystemMonitor:
    """Tkinter dashboard that polls system, Ollama and NVIDIA GPU stats.

    A daemon thread re-reads the metrics every ``update_interval`` seconds
    and pushes them into the widgets until the window is closed.
    """

    def __init__(self, root):
        """Build the UI inside *root* and start the background poller.

        Args:
            root: the ``tk.Tk`` top-level window to populate.
        """
        self.root = root
        self.root.title("LLM Lab Monitor")
        self.root.geometry("800x600")
        self.root.minsize(700, 500)
        # Style
        self.style = ttk.Style()
        self.style.theme_use('alt') # 'clam', 'alt', 'default', 'classic'
        # Variables
        self.update_interval = 2 # seconds between refresh cycles
        self.running = True  # cleared by _on_close to stop the update loop
        self.ollama_models = []  # model names last reported by the Ollama API
        self.active_model = None  # last model observed running on the GPU
        self.gpu_available = self._check_gpu_available()
        # Create UI
        self._create_widgets()
        # Start update thread
        # NOTE(review): the worker thread updates Tk widgets directly;
        # Tkinter is not guaranteed thread-safe — confirm, or route updates
        # through root.after() on the main loop.
        self.update_thread = threading.Thread(target=self._update_loop)
        self.update_thread.daemon = True
        self.update_thread.start()
        # Intercept window close
        self.root.protocol("WM_DELETE_WINDOW", self._on_close)
    def _create_widgets(self):
        """Build all widgets: a four-tab notebook (System, Ollama, GPU, Logs)
        and a bottom status bar.

        Widgets that the update loop later refreshes are stored on ``self``;
        purely static labels and frames stay local.
        """
        # Create notebook (tabs)
        self.notebook = ttk.Notebook(self.root)
        self.notebook.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)
        # Tab 1: System monitoring
        self.system_frame = ttk.Frame(self.notebook)
        self.notebook.add(self.system_frame, text="System")
        # Tab 2: Ollama
        self.ollama_frame = ttk.Frame(self.notebook)
        self.notebook.add(self.ollama_frame, text="Ollama")
        # Tab 3: GPU
        self.gpu_frame = ttk.Frame(self.notebook)
        self.notebook.add(self.gpu_frame, text="GPU")
        # Tab 4: Logs
        self.logs_frame = ttk.Frame(self.notebook)
        self.notebook.add(self.logs_frame, text="Logs")
        # === System tab configuration ===
        system_label = ttk.Label(self.system_frame, text="System Resources", font=("Arial", 14, "bold"))
        system_label.pack(pady=10)
        # System info
        system_info_frame = ttk.LabelFrame(self.system_frame, text="System Information")
        system_info_frame.pack(fill=tk.X, padx=10, pady=5)
        # OS
        os_frame = ttk.Frame(system_info_frame)
        os_frame.pack(fill=tk.X, padx=5, pady=2)
        ttk.Label(os_frame, text="Operating System:").pack(side=tk.LEFT, padx=5)
        self.os_label = ttk.Label(os_frame, text="")
        self.os_label.pack(side=tk.LEFT, padx=5)
        # CPU
        cpu_frame = ttk.Frame(system_info_frame)
        cpu_frame.pack(fill=tk.X, padx=5, pady=2)
        ttk.Label(cpu_frame, text="Processor:").pack(side=tk.LEFT, padx=5)
        self.cpu_label = ttk.Label(cpu_frame, text="")
        self.cpu_label.pack(side=tk.LEFT, padx=5)
        # RAM
        ram_frame = ttk.Frame(system_info_frame)
        ram_frame.pack(fill=tk.X, padx=5, pady=2)
        ttk.Label(ram_frame, text="RAM Memory:").pack(side=tk.LEFT, padx=5)
        self.ram_label = ttk.Label(ram_frame, text="")
        self.ram_label.pack(side=tk.LEFT, padx=5)
        # Progress bars
        progress_frame = ttk.LabelFrame(self.system_frame, text="Resource Usage")
        progress_frame.pack(fill=tk.X, padx=10, pady=5)
        # CPU Usage
        cpu_usage_frame = ttk.Frame(progress_frame)
        cpu_usage_frame.pack(fill=tk.X, padx=5, pady=5)
        ttk.Label(cpu_usage_frame, text="CPU:").pack(side=tk.LEFT, padx=5)
        self.cpu_progress = ttk.Progressbar(cpu_usage_frame, orient=tk.HORIZONTAL, length=300, mode='determinate')
        self.cpu_progress.pack(side=tk.LEFT, padx=5, fill=tk.X, expand=True)
        self.cpu_percent = ttk.Label(cpu_usage_frame, text="0%")
        self.cpu_percent.pack(side=tk.LEFT, padx=5)
        # RAM Usage
        ram_usage_frame = ttk.Frame(progress_frame)
        ram_usage_frame.pack(fill=tk.X, padx=5, pady=5)
        ttk.Label(ram_usage_frame, text="RAM:").pack(side=tk.LEFT, padx=5)
        self.ram_progress = ttk.Progressbar(ram_usage_frame, orient=tk.HORIZONTAL, length=300, mode='determinate')
        self.ram_progress.pack(side=tk.LEFT, padx=5, fill=tk.X, expand=True)
        self.ram_percent = ttk.Label(ram_usage_frame, text="0%")
        self.ram_percent.pack(side=tk.LEFT, padx=5)
        # Disk Usage
        disk_usage_frame = ttk.Frame(progress_frame)
        disk_usage_frame.pack(fill=tk.X, padx=5, pady=5)
        ttk.Label(disk_usage_frame, text="Disk:").pack(side=tk.LEFT, padx=5)
        self.disk_progress = ttk.Progressbar(disk_usage_frame, orient=tk.HORIZONTAL, length=300, mode='determinate')
        self.disk_progress.pack(side=tk.LEFT, padx=5, fill=tk.X, expand=True)
        self.disk_percent = ttk.Label(disk_usage_frame, text="0%")
        self.disk_percent.pack(side=tk.LEFT, padx=5)
        # === Ollama tab configuration ===
        ollama_label = ttk.Label(self.ollama_frame, text="Ollama Server", font=("Arial", 14, "bold"))
        ollama_label.pack(pady=10)
        # Server status
        server_frame = ttk.LabelFrame(self.ollama_frame, text="Server Status")
        server_frame.pack(fill=tk.X, padx=10, pady=5)
        # Status
        status_frame = ttk.Frame(server_frame)
        status_frame.pack(fill=tk.X, padx=5, pady=2)
        ttk.Label(status_frame, text="Status:").pack(side=tk.LEFT, padx=5)
        self.status_label = ttk.Label(status_frame, text="Checking...")
        self.status_label.pack(side=tk.LEFT, padx=5)
        # URL
        url_frame = ttk.Frame(server_frame)
        url_frame.pack(fill=tk.X, padx=5, pady=2)
        ttk.Label(url_frame, text="URL:").pack(side=tk.LEFT, padx=5)
        self.url_label = ttk.Label(url_frame, text="http://localhost:11434")
        self.url_label.pack(side=tk.LEFT, padx=5)
        # Available models
        models_frame = ttk.LabelFrame(self.ollama_frame, text="Available Models")
        models_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=5)
        # Models list.  Internal column identifiers are French ("Nom",
        # "Taille", "Modifié"); the user-visible headings are English.
        # The ids are referenced again below, so keep them unchanged.
        self.tree = ttk.Treeview(models_frame, columns=("Nom", "Taille", "Modifié"), show='headings')
        self.tree.heading("Nom", text="LLM Model")
        self.tree.heading("Taille", text="Size")
        self.tree.heading("Modifié", text="Modified")
        self.tree.column("Nom", width=150)
        self.tree.column("Taille", width=100)
        self.tree.column("Modifié", width=150)
        self.tree.pack(fill=tk.BOTH, expand=True, padx=5, pady=5)
        # Buttons
        button_frame = ttk.Frame(self.ollama_frame)
        button_frame.pack(fill=tk.X, padx=10, pady=5)
        refresh_button = ttk.Button(button_frame, text="Refresh", command=self._refresh_ollama)
        refresh_button.pack(side=tk.LEFT, padx=5)
        # === GPU tab configuration ===
        gpu_label = ttk.Label(self.gpu_frame, text="GPU Resources", font=("Arial", 14, "bold"))
        gpu_label.pack(pady=10)
        # GPU widgets are only built when nvidia-smi was detected at startup;
        # _update_gpu_info relies on that same flag before touching them.
        if self.gpu_available:
            # GPU Info
            gpu_info_frame = ttk.LabelFrame(self.gpu_frame, text="GPU Information")
            gpu_info_frame.pack(fill=tk.X, padx=10, pady=5)
            # GPU Model
            gpu_model_frame = ttk.Frame(gpu_info_frame)
            gpu_model_frame.pack(fill=tk.X, padx=5, pady=2)
            ttk.Label(gpu_model_frame, text="Model:").pack(side=tk.LEFT, padx=5)
            self.gpu_model_label = ttk.Label(gpu_model_frame, text="")
            self.gpu_model_label.pack(side=tk.LEFT, padx=5)
            # GPU Memory
            gpu_memory_frame = ttk.Frame(gpu_info_frame)
            gpu_memory_frame.pack(fill=tk.X, padx=5, pady=2)
            ttk.Label(gpu_memory_frame, text="Memory:").pack(side=tk.LEFT, padx=5)
            self.gpu_memory_label = ttk.Label(gpu_memory_frame, text="")
            self.gpu_memory_label.pack(side=tk.LEFT, padx=5)
            # GPU Usage
            gpu_usage_frame = ttk.LabelFrame(self.gpu_frame, text="Utilization")
            gpu_usage_frame.pack(fill=tk.X, padx=10, pady=5)
            # GPU Compute
            gpu_compute_frame = ttk.Frame(gpu_usage_frame)
            gpu_compute_frame.pack(fill=tk.X, padx=5, pady=5)
            ttk.Label(gpu_compute_frame, text="Compute:").pack(side=tk.LEFT, padx=5)
            self.gpu_compute_progress = ttk.Progressbar(gpu_compute_frame, orient=tk.HORIZONTAL, length=300, mode='determinate')
            self.gpu_compute_progress.pack(side=tk.LEFT, padx=5, fill=tk.X, expand=True)
            self.gpu_compute_percent = ttk.Label(gpu_compute_frame, text="0%")
            self.gpu_compute_percent.pack(side=tk.LEFT, padx=5)
            # GPU Memory
            gpu_mem_usage_frame = ttk.Frame(gpu_usage_frame)
            gpu_mem_usage_frame.pack(fill=tk.X, padx=5, pady=5)
            ttk.Label(gpu_mem_usage_frame, text="Memory:").pack(side=tk.LEFT, padx=5)
            self.gpu_mem_progress = ttk.Progressbar(gpu_mem_usage_frame, orient=tk.HORIZONTAL, length=300, mode='determinate')
            self.gpu_mem_progress.pack(side=tk.LEFT, padx=5, fill=tk.X, expand=True)
            self.gpu_mem_percent = ttk.Label(gpu_mem_usage_frame, text="0%")
            self.gpu_mem_percent.pack(side=tk.LEFT, padx=5)
            # Temperature
            gpu_temp_frame = ttk.Frame(gpu_usage_frame)
            gpu_temp_frame.pack(fill=tk.X, padx=5, pady=5)
            ttk.Label(gpu_temp_frame, text="Temperature:").pack(side=tk.LEFT, padx=5)
            self.gpu_temp_progress = ttk.Progressbar(gpu_temp_frame, orient=tk.HORIZONTAL, length=300, mode='determinate')
            self.gpu_temp_progress.pack(side=tk.LEFT, padx=5, fill=tk.X, expand=True)
            self.gpu_temp_label = ttk.Label(gpu_temp_frame, text="0°C")
            self.gpu_temp_label.pack(side=tk.LEFT, padx=5)
            # Active processes graphs
            gpu_processes_frame = ttk.LabelFrame(self.gpu_frame, text="GPU Processes")
            gpu_processes_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=5)
            # Process list
            self.gpu_process_tree = ttk.Treeview(gpu_processes_frame,
                                                 columns=("PID", "Nom", "Mémoire"),
                                                 show='headings')
            self.gpu_process_tree.heading("PID", text="PID")
            self.gpu_process_tree.heading("Nom", text="Process")
            self.gpu_process_tree.heading("Mémoire", text="Memory")
            self.gpu_process_tree.column("PID", width=50)
            self.gpu_process_tree.column("Nom", width=200)
            self.gpu_process_tree.column("Mémoire", width=100)
            self.gpu_process_tree.pack(fill=tk.BOTH, expand=True, padx=5, pady=5)
        else:
            no_gpu_label = ttk.Label(self.gpu_frame,
                                     text="No NVIDIA GPU detected.",
                                     font=("Arial", 12))
            no_gpu_label.pack(pady=50)
            install_label = ttk.Label(self.gpu_frame,
                                      text="To monitor an NVIDIA GPU, install nvidia-smi and nvitop.",
                                      font=("Arial", 10))
            install_label.pack(pady=10)
        # === Logs tab configuration ===
        logs_label = ttk.Label(self.logs_frame, text="Activity Logs", font=("Arial", 14, "bold"))
        logs_label.pack(pady=10)
        # Log area
        log_area_frame = ttk.Frame(self.logs_frame)
        log_area_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=5)
        # Scrollbar
        scrollbar = ttk.Scrollbar(log_area_frame)
        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
        # Text area (written to by _log from the worker thread)
        self.log_text = tk.Text(log_area_frame, yscrollcommand=scrollbar.set)
        self.log_text.pack(fill=tk.BOTH, expand=True)
        scrollbar.config(command=self.log_text.yview)
        # Buttons
        log_button_frame = ttk.Frame(self.logs_frame)
        log_button_frame.pack(fill=tk.X, padx=10, pady=5)
        clear_log_button = ttk.Button(log_button_frame, text="Clear Logs",
                                      command=lambda: self.log_text.delete(1.0, tk.END))
        clear_log_button.pack(side=tk.LEFT, padx=5)
        # Status bar at bottom
        self.status_bar = ttk.Label(self.root, text="LLM Lab Monitor - Last update: Never",
                                    relief=tk.SUNKEN, anchor=tk.W)
        self.status_bar.pack(side=tk.BOTTOM, fill=tk.X)
    def _update_loop(self):
        """Main update thread.

        Loops until ``_on_close`` clears ``self.running``.  The cheap system
        probe runs every cycle; the expensive probes (Ollama HTTP call,
        nvidia-smi) run only when their tab is currently selected.

        NOTE(review): this thread updates Tk widgets directly; Tkinter is
        not guaranteed thread-safe — confirm, or marshal the updates to the
        main loop via ``root.after()``.
        """
        while self.running:
            try:
                # System update
                self._update_system_info()
                # Ollama update — only when its tab (index 1) is visible
                if self.notebook.index(self.notebook.select()) == 1: # Ollama tab
                    self._update_ollama_info()
                # GPU update — only with a GPU and its tab (index 2) visible
                if self.gpu_available and self.notebook.index(self.notebook.select()) == 2: # GPU tab
                    self._update_gpu_info()
                # Status bar update
                now = datetime.now().strftime("%H:%M:%S")
                self.status_bar.config(text=f"LLM Lab Monitor - Last update: {now}")
            except Exception as e:
                # Broad catch keeps the loop alive across transient failures;
                # the error is surfaced in the Logs tab.
                self._log(f"Update error: {str(e)}")
            time.sleep(self.update_interval)
def _update_system_info(self):
"""Updates system information"""
# System information
self.os_label.config(text=f"{platform.system()} {platform.release()}")
self.cpu_label.config(text=f"{psutil.cpu_count(logical=False)} cores ({psutil.cpu_count()} threads)")
# Advanced RAM detection
try:
ram = psutil.virtual_memory()
total_ram = ram.total / (1024 * 1024 * 1024) # GB
# Additional check for Linux
if platform.system() == "Linux":
try:
# Use /proc/meminfo for more accurate detection
with open('/proc/meminfo', 'r') as f:
for line in f:
if 'MemTotal' in line:
# MemTotal is in kB
mem_kb = int(line.split()[1])
linux_ram = mem_kb / (1024 * 1024) # GB
# Use the higher value
total_ram = max(total_ram, linux_ram)
break
except Exception as e:
self._log(f"Error reading /proc/meminfo: {str(e)}")
self.ram_label.config(text=f"{total_ram:.1f} GB")
except Exception as e:
self._log(f"Error detecting RAM: {str(e)}")
self.ram_label.config(text="Detection failed")
# CPU Usage
cpu_percent = psutil.cpu_percent()
self.cpu_progress["value"] = cpu_percent
self.cpu_percent.config(text=f"{cpu_percent:.1f}%")
# RAM Usage
ram_percent = ram.percent
self.ram_progress["value"] = ram_percent
self.ram_percent.config(text=f"{ram_percent:.1f}%")
# Disk usage
disk = psutil.disk_usage('/')
disk_percent = disk.percent
self.disk_progress["value"] = disk_percent
self.disk_percent.config(text=f"{disk_percent:.1f}%")
    def _update_ollama_info(self):
        """Updates Ollama information.

        Polls the local Ollama HTTP API for the installed model list,
        repopulates the Treeview, and — when a GPU is present —
        cross-references nvidia-smi compute processes with psutil command
        lines to guess which model is currently active.  Connection
        failures flip the status label to "Offline".
        """
        try:
            # Check if server is running
            response = requests.get("http://localhost:11434/api/tags", timeout=2)
            if response.status_code == 200:
                self.status_label.config(text="Online", foreground="green")
                # Update model list
                data = response.json()
                models = data.get("models", [])
                # Clear current list
                for item in self.tree.get_children():
                    self.tree.delete(item)
                # Add models
                for model in models:
                    model_name = model.get("name", "")
                    model_size = self._format_size(model.get("size", 0))
                    modified = model.get("modified_at", "")
                    # Convert date format
                    if modified:
                        try:
                            modified_dt = datetime.fromisoformat(modified.replace('Z', '+00:00'))
                            modified = modified_dt.strftime("%d/%m/%Y %H:%M")
                        except:
                            # NOTE(review): bare except — an unparsable timestamp
                            # is shown verbatim; narrowing to ValueError would
                            # avoid masking unrelated errors.
                            pass
                    # The model name doubles as the Treeview item id (iid) so
                    # the row can be addressed by name when highlighting below.
                    self.tree.insert("", tk.END, text=model_name, values=(model_name, model_size, modified), iid=model_name)
                # Update global list and count active models
                self.ollama_models = [model.get("name", "") for model in models]
                model_count = len(models)
                # Check if there's an active model via nvidia-smi if GPU available
                active_models = []
                if self.gpu_available:
                    try:
                        # Check processes using GPU
                        result_processes = subprocess.run(
                            ['nvidia-smi', '--query-compute-apps=pid,name,used_memory', '--format=csv,noheader,nounits'],
                            capture_output=True,
                            text=True,
                            check=False
                        )
                        if result_processes.returncode == 0:
                            # One CSV line per compute process: pid,name,used_memory
                            processes = result_processes.stdout.strip().split('\n')
                            for process in processes:
                                if process.strip():
                                    process_data = process.split(',')
                                    if len(process_data) >= 3:
                                        pid = process_data[0].strip()
                                        name = process_data[1].strip()
                                        memory = process_data[2].strip()
                                        # If it's Ollama, search which model is active
                                        if "ollama" in name.lower():
                                            try:
                                                # Match known model names against the
                                                # process command line.
                                                process_info = psutil.Process(int(pid))
                                                cmd_line = " ".join(process_info.cmdline())
                                                for model in self.ollama_models:
                                                    if model in cmd_line:
                                                        active_models.append(model)
                                                        self.active_model = model
                                                        self._log(f"Active model detected: {model} (PID {pid}, using {memory} MiB)")
                                                        # Highlight in list (iid == model name)
                                                        self.tree.selection_set(model)
                                                        self.tree.see(model)
                                                        # Add "ACTIVE" to the list
                                                        values = self.tree.item(model, "values")
                                                        self.tree.item(model, values=values, tags=("active",))
                                                        # NOTE(review): these style calls run on
                                                        # every match with the same values each
                                                        # time; they could be hoisted to setup.
                                                        self.style.configure("Treeview", background="#FFFFFF")
                                                        self.style.map("Treeview",
                                                                       foreground=[("selected", "#000000")],
                                                                       background=[("selected", "#e1e1e1")])
                                                        self.tree.tag_configure("active", background="#e0f7fa")
                                            except Exception as e:
                                                self._log(f"Error analyzing process: {str(e)}")
                    except Exception as e:
                        self._log(f"Error checking GPU processes: {str(e)}")
                # Update models section title with count
                active_count = len(active_models)
                models_text = f"Available Models ({model_count})"
                if active_count > 0:
                    models_text += f" - Active: {', '.join(active_models)}"
                # Find the models LabelFrame and update its text
                for child in self.ollama_frame.winfo_children():
                    if isinstance(child, ttk.LabelFrame) and "Available Models" in child["text"]:
                        child["text"] = models_text
                        break
            else:
                self.status_label.config(text="Error", foreground="red")
                self._log(f"Error connecting to Ollama server: {response.status_code}")
        except requests.exceptions.RequestException:
            self.status_label.config(text="Offline", foreground="red")
            self._log("Ollama server not available")
def _update_gpu_info(self):
"""Updates GPU information"""
if not self.gpu_available:
return
try:
# Run nvidia-smi to get GPU information
result = subprocess.run(
['nvidia-smi', '--query-gpu=name,memory.total,memory.used,utilization.gpu,temperature.gpu',
'--format=csv,noheader,nounits'],
capture_output=True,
text=True,
check=True
)
if result.returncode == 0:
# Parse results
gpu_data = result.stdout.strip().split(',')
if len(gpu_data) >= 5:
# Model name
model_name = gpu_data[0].strip()
self.gpu_model_label.config(text=model_name)
# Total and used memory
total_memory = float(gpu_data[1].strip())
used_memory = float(gpu_data[2].strip())
memory_percent = (used_memory / total_memory) * 100 if total_memory > 0 else 0
self.gpu_memory_label.config(text=f"{used_memory:.0f} MiB / {total_memory:.0f} MiB")
self.gpu_mem_progress["value"] = memory_percent
self.gpu_mem_percent.config(text=f"{memory_percent:.1f}%")
# GPU utilization
gpu_util = float(gpu_data[3].strip())
self.gpu_compute_progress["value"] = gpu_util
self.gpu_compute_percent.config(text=f"{gpu_util:.1f}%")
# Temperature
temp = float(gpu_data[4].strip())
# Temperature scale: 0-100°C
self.gpu_temp_progress["value"] = temp
self.gpu_temp_label.config(text=f"{temp:.1f}°C")
# Get GPU processes
result_processes = subprocess.run(
['nvidia-smi', '--query-compute-apps=pid,name,used_memory', '--format=csv,noheader,nounits'],
capture_output=True,
text=True
)
if result_processes.returncode == 0:
# Clear current list
for item in self.gpu_process_tree.get_children():
self.gpu_process_tree.delete(item)
# Add processes
processes = result_processes.stdout.strip().split('\n')
for process in processes:
if process.strip():
process_data = process.split(',')
if len(process_data) >= 3:
pid = process_data[0].strip()
name = process_data[1].strip()
memory = f"{process_data[2].strip()} MiB"
# Add to list
self.gpu_process_tree.insert("", tk.END, text=pid, values=(pid, name, memory))
# If it's Ollama, mark as active model
if "ollama" in name.lower():
self._log(f"Ollama detected on GPU: PID {pid}, using {memory}")
# Find which model is active
try:
process_info = psutil.Process(int(pid))
cmd_line = " ".join(process_info.cmdline())
for model in self.ollama_models:
if model in cmd_line:
self.active_model = model
self._log(f"Active model detected: {model}")
break
except:
pass
except subprocess.SubprocessError as e:
self._log(f"Error executing nvidia-smi: {str(e)}")
except Exception as e:
self._log(f"GPU update error: {str(e)}")
def _refresh_ollama(self):
"""Force refresh of Ollama information"""
self._update_ollama_info()
self._log("Ollama information refreshed")
def _check_gpu_available(self):
"""Check if an NVIDIA GPU is available"""
try:
result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
return result.returncode == 0
except:
return False
def _format_size(self, size_bytes):
"""Format size in readable units"""
if size_bytes < 1024:
return f"{size_bytes} B"
elif size_bytes < 1024 * 1024:
return f"{size_bytes/1024:.1f} KB"
elif size_bytes < 1024 * 1024 * 1024:
return f"{size_bytes/(1024*1024):.1f} MB"
else:
return f"{size_bytes/(1024*1024*1024):.1f} GB"
def _log(self, message):
"""Add a message to logs"""
timestamp = datetime.now().strftime("%H:%M:%S")
log_message = f"[{timestamp}] {message}\n"
# Add to text
self.log_text.insert(tk.END, log_message)
self.log_text.see(tk.END) # Scroll down
def _on_close(self):
"""Handle application closure"""
self.running = False
time.sleep(0.5) # Wait for thread to terminate
self.root.destroy()
def main():
    """Main entry point: build the monitor UI and run the Tk event loop."""
    root = tk.Tk()
    # The instance registers its callbacks on root (window-close protocol),
    # which keeps it referenced for the lifetime of the event loop.
    SystemMonitor(root)
    root.mainloop()


if __name__ == "__main__":
    main()