# llm_lab_perso/utils/system_monitor.py
"""
Moniteur de ressources système pour Ollama et les ressources NVIDIA GPU
"""
import tkinter as tk
from tkinter import ttk
import psutil
import threading
import time
import subprocess
import json
import os
import platform
import requests
from datetime import datetime
class SystemMonitor:
    """Tkinter GUI that monitors system resources, the local Ollama server and NVIDIA GPUs."""
    def __init__(self, root):
        """Build the UI on *root* and start the background refresh thread.

        Args:
            root: the Tk root window this monitor attaches to.
        """
        self.root = root
        self.root.title("Moniteur LLM Lab")
        self.root.geometry("800x600")
        self.root.minsize(700, 500)
        # Style
        self.style = ttk.Style()
        self.style.theme_use('alt')  # other options: 'clam', 'alt', 'default', 'classic'
        # State
        self.update_interval = 2  # seconds between refresh cycles
        self.running = True  # polled by the update thread; cleared on close
        self.ollama_models = []  # model names from the last Ollama query
        self.active_model = None  # model detected on the GPU, if any
        self.gpu_available = self._check_gpu_available()
        # Build the UI
        self._create_widgets()
        # Start the refresh thread (daemon so it cannot outlive the process)
        self.update_thread = threading.Thread(target=self._update_loop)
        self.update_thread.daemon = True
        self.update_thread.start()
        # Intercept the window close button
        self.root.protocol("WM_DELETE_WINDOW", self._on_close)
    def _create_widgets(self):
        """Build the whole widget tree: a four-tab notebook plus a bottom status bar."""
        # Notebook (tab container)
        self.notebook = ttk.Notebook(self.root)
        self.notebook.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)
        # Tab 1: system monitoring
        self.system_frame = ttk.Frame(self.notebook)
        self.notebook.add(self.system_frame, text="Système")
        # Tab 2: Ollama
        self.ollama_frame = ttk.Frame(self.notebook)
        self.notebook.add(self.ollama_frame, text="Ollama")
        # Tab 3: GPU
        self.gpu_frame = ttk.Frame(self.notebook)
        self.notebook.add(self.gpu_frame, text="GPU")
        # Tab 4: logs
        self.logs_frame = ttk.Frame(self.notebook)
        self.notebook.add(self.logs_frame, text="Logs")
        # === "System" tab ===
        system_label = ttk.Label(self.system_frame, text="Ressources Système", font=("Arial", 14, "bold"))
        system_label.pack(pady=10)
        # Static system information
        system_info_frame = ttk.LabelFrame(self.system_frame, text="Informations Système")
        system_info_frame.pack(fill=tk.X, padx=10, pady=5)
        # OS row
        os_frame = ttk.Frame(system_info_frame)
        os_frame.pack(fill=tk.X, padx=5, pady=2)
        ttk.Label(os_frame, text="Système d'exploitation:").pack(side=tk.LEFT, padx=5)
        self.os_label = ttk.Label(os_frame, text="")
        self.os_label.pack(side=tk.LEFT, padx=5)
        # CPU row
        cpu_frame = ttk.Frame(system_info_frame)
        cpu_frame.pack(fill=tk.X, padx=5, pady=2)
        ttk.Label(cpu_frame, text="Processeur:").pack(side=tk.LEFT, padx=5)
        self.cpu_label = ttk.Label(cpu_frame, text="")
        self.cpu_label.pack(side=tk.LEFT, padx=5)
        # RAM row
        ram_frame = ttk.Frame(system_info_frame)
        ram_frame.pack(fill=tk.X, padx=5, pady=2)
        ttk.Label(ram_frame, text="Mémoire RAM:").pack(side=tk.LEFT, padx=5)
        self.ram_label = ttk.Label(ram_frame, text="")
        self.ram_label.pack(side=tk.LEFT, padx=5)
        # Live resource gauges
        progress_frame = ttk.LabelFrame(self.system_frame, text="Utilisation des ressources")
        progress_frame.pack(fill=tk.X, padx=10, pady=5)
        # CPU usage gauge
        cpu_usage_frame = ttk.Frame(progress_frame)
        cpu_usage_frame.pack(fill=tk.X, padx=5, pady=5)
        ttk.Label(cpu_usage_frame, text="CPU:").pack(side=tk.LEFT, padx=5)
        self.cpu_progress = ttk.Progressbar(cpu_usage_frame, orient=tk.HORIZONTAL, length=300, mode='determinate')
        self.cpu_progress.pack(side=tk.LEFT, padx=5, fill=tk.X, expand=True)
        self.cpu_percent = ttk.Label(cpu_usage_frame, text="0%")
        self.cpu_percent.pack(side=tk.LEFT, padx=5)
        # RAM usage gauge
        ram_usage_frame = ttk.Frame(progress_frame)
        ram_usage_frame.pack(fill=tk.X, padx=5, pady=5)
        ttk.Label(ram_usage_frame, text="RAM:").pack(side=tk.LEFT, padx=5)
        self.ram_progress = ttk.Progressbar(ram_usage_frame, orient=tk.HORIZONTAL, length=300, mode='determinate')
        self.ram_progress.pack(side=tk.LEFT, padx=5, fill=tk.X, expand=True)
        self.ram_percent = ttk.Label(ram_usage_frame, text="0%")
        self.ram_percent.pack(side=tk.LEFT, padx=5)
        # Disk usage gauge
        disk_usage_frame = ttk.Frame(progress_frame)
        disk_usage_frame.pack(fill=tk.X, padx=5, pady=5)
        ttk.Label(disk_usage_frame, text="Disque:").pack(side=tk.LEFT, padx=5)
        self.disk_progress = ttk.Progressbar(disk_usage_frame, orient=tk.HORIZONTAL, length=300, mode='determinate')
        self.disk_progress.pack(side=tk.LEFT, padx=5, fill=tk.X, expand=True)
        self.disk_percent = ttk.Label(disk_usage_frame, text="0%")
        self.disk_percent.pack(side=tk.LEFT, padx=5)
        # === "Ollama" tab ===
        ollama_label = ttk.Label(self.ollama_frame, text="Serveur Ollama", font=("Arial", 14, "bold"))
        ollama_label.pack(pady=10)
        # Server state
        server_frame = ttk.LabelFrame(self.ollama_frame, text="État du serveur")
        server_frame.pack(fill=tk.X, padx=10, pady=5)
        # Status row
        status_frame = ttk.Frame(server_frame)
        status_frame.pack(fill=tk.X, padx=5, pady=2)
        ttk.Label(status_frame, text="Statut:").pack(side=tk.LEFT, padx=5)
        self.status_label = ttk.Label(status_frame, text="Vérification...")
        self.status_label.pack(side=tk.LEFT, padx=5)
        # URL row
        url_frame = ttk.Frame(server_frame)
        url_frame.pack(fill=tk.X, padx=5, pady=2)
        ttk.Label(url_frame, text="URL:").pack(side=tk.LEFT, padx=5)
        self.url_label = ttk.Label(url_frame, text="http://localhost:11434")
        self.url_label.pack(side=tk.LEFT, padx=5)
        # Available models
        models_frame = ttk.LabelFrame(self.ollama_frame, text="Modèles disponibles")
        models_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=5)
        # Model list
        self.tree = ttk.Treeview(models_frame, columns=("Taille", "Modifié"), show='headings')
        self.tree.heading("Taille", text="Taille")
        self.tree.heading("Modifié", text="Modifié")
        self.tree.column("Taille", width=100)
        self.tree.column("Modifié", width=150)
        self.tree.pack(fill=tk.BOTH, expand=True, padx=5, pady=5)
        # Buttons
        button_frame = ttk.Frame(self.ollama_frame)
        button_frame.pack(fill=tk.X, padx=10, pady=5)
        refresh_button = ttk.Button(button_frame, text="Rafraîchir", command=self._refresh_ollama)
        refresh_button.pack(side=tk.LEFT, padx=5)
        # === "GPU" tab ===
        gpu_label = ttk.Label(self.gpu_frame, text="Ressources GPU", font=("Arial", 14, "bold"))
        gpu_label.pack(pady=10)
        if self.gpu_available:
            # GPU information
            gpu_info_frame = ttk.LabelFrame(self.gpu_frame, text="Informations GPU")
            gpu_info_frame.pack(fill=tk.X, padx=10, pady=5)
            # GPU model row
            gpu_model_frame = ttk.Frame(gpu_info_frame)
            gpu_model_frame.pack(fill=tk.X, padx=5, pady=2)
            ttk.Label(gpu_model_frame, text="Modèle:").pack(side=tk.LEFT, padx=5)
            self.gpu_model_label = ttk.Label(gpu_model_frame, text="")
            self.gpu_model_label.pack(side=tk.LEFT, padx=5)
            # GPU memory row
            gpu_memory_frame = ttk.Frame(gpu_info_frame)
            gpu_memory_frame.pack(fill=tk.X, padx=5, pady=2)
            ttk.Label(gpu_memory_frame, text="Mémoire:").pack(side=tk.LEFT, padx=5)
            self.gpu_memory_label = ttk.Label(gpu_memory_frame, text="")
            self.gpu_memory_label.pack(side=tk.LEFT, padx=5)
            # GPU utilisation gauges
            gpu_usage_frame = ttk.LabelFrame(self.gpu_frame, text="Utilisation")
            gpu_usage_frame.pack(fill=tk.X, padx=10, pady=5)
            # Compute gauge
            gpu_compute_frame = ttk.Frame(gpu_usage_frame)
            gpu_compute_frame.pack(fill=tk.X, padx=5, pady=5)
            ttk.Label(gpu_compute_frame, text="Calcul:").pack(side=tk.LEFT, padx=5)
            self.gpu_compute_progress = ttk.Progressbar(gpu_compute_frame, orient=tk.HORIZONTAL, length=300, mode='determinate')
            self.gpu_compute_progress.pack(side=tk.LEFT, padx=5, fill=tk.X, expand=True)
            self.gpu_compute_percent = ttk.Label(gpu_compute_frame, text="0%")
            self.gpu_compute_percent.pack(side=tk.LEFT, padx=5)
            # Memory gauge
            gpu_mem_usage_frame = ttk.Frame(gpu_usage_frame)
            gpu_mem_usage_frame.pack(fill=tk.X, padx=5, pady=5)
            ttk.Label(gpu_mem_usage_frame, text="Mémoire:").pack(side=tk.LEFT, padx=5)
            self.gpu_mem_progress = ttk.Progressbar(gpu_mem_usage_frame, orient=tk.HORIZONTAL, length=300, mode='determinate')
            self.gpu_mem_progress.pack(side=tk.LEFT, padx=5, fill=tk.X, expand=True)
            self.gpu_mem_percent = ttk.Label(gpu_mem_usage_frame, text="0%")
            self.gpu_mem_percent.pack(side=tk.LEFT, padx=5)
            # Temperature gauge (progress bar doubles as a 0-100°C scale)
            gpu_temp_frame = ttk.Frame(gpu_usage_frame)
            gpu_temp_frame.pack(fill=tk.X, padx=5, pady=5)
            ttk.Label(gpu_temp_frame, text="Température:").pack(side=tk.LEFT, padx=5)
            self.gpu_temp_progress = ttk.Progressbar(gpu_temp_frame, orient=tk.HORIZONTAL, length=300, mode='determinate')
            self.gpu_temp_progress.pack(side=tk.LEFT, padx=5, fill=tk.X, expand=True)
            self.gpu_temp_label = ttk.Label(gpu_temp_frame, text="0°C")
            self.gpu_temp_label.pack(side=tk.LEFT, padx=5)
            # Active GPU processes
            gpu_processes_frame = ttk.LabelFrame(self.gpu_frame, text="Processus GPU")
            gpu_processes_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=5)
            # Process list
            self.gpu_process_tree = ttk.Treeview(gpu_processes_frame,
                                                 columns=("PID", "Nom", "Mémoire"),
                                                 show='headings')
            self.gpu_process_tree.heading("PID", text="PID")
            self.gpu_process_tree.heading("Nom", text="Processus")
            self.gpu_process_tree.heading("Mémoire", text="Mémoire")
            self.gpu_process_tree.column("PID", width=50)
            self.gpu_process_tree.column("Nom", width=200)
            self.gpu_process_tree.column("Mémoire", width=100)
            self.gpu_process_tree.pack(fill=tk.BOTH, expand=True, padx=5, pady=5)
        else:
            # No NVIDIA GPU detected: show a placeholder instead of gauges
            no_gpu_label = ttk.Label(self.gpu_frame,
                                     text="Aucun GPU NVIDIA détecté.",
                                     font=("Arial", 12))
            no_gpu_label.pack(pady=50)
            install_label = ttk.Label(self.gpu_frame,
                                      text="Pour surveiller un GPU NVIDIA, installez nvidia-smi et nvitop.",
                                      font=("Arial", 10))
            install_label.pack(pady=10)
        # === "Logs" tab ===
        logs_label = ttk.Label(self.logs_frame, text="Journaux d'activité", font=("Arial", 14, "bold"))
        logs_label.pack(pady=10)
        # Log area
        log_area_frame = ttk.Frame(self.logs_frame)
        log_area_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=5)
        # Scrollbar
        scrollbar = ttk.Scrollbar(log_area_frame)
        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
        # Text area
        self.log_text = tk.Text(log_area_frame, yscrollcommand=scrollbar.set)
        self.log_text.pack(fill=tk.BOTH, expand=True)
        scrollbar.config(command=self.log_text.yview)
        # Buttons
        log_button_frame = ttk.Frame(self.logs_frame)
        log_button_frame.pack(fill=tk.X, padx=10, pady=5)
        clear_log_button = ttk.Button(log_button_frame, text="Effacer les logs",
                                      command=lambda: self.log_text.delete(1.0, tk.END))
        clear_log_button.pack(side=tk.LEFT, padx=5)
        # Status bar at the bottom of the window
        self.status_bar = ttk.Label(self.root, text="Moniteur LLM Lab - Dernière mise à jour: Jamais",
                                    relief=tk.SUNKEN, anchor=tk.W)
        self.status_bar.pack(side=tk.BOTTOM, fill=tk.X)
    def _update_loop(self):
        """Background refresh loop; runs until self.running is cleared by _on_close.

        Only the visible tab's expensive refresh is performed each cycle.

        NOTE(review): this worker thread mutates tkinter widgets directly;
        tkinter is not thread-safe, so these updates should ideally be
        marshalled through root.after() — confirm before relying on this.
        """
        while self.running:
            try:
                # System gauges are refreshed every cycle
                self._update_system_info()
                # Ollama info only while its tab is selected
                if self.notebook.index(self.notebook.select()) == 1:  # Ollama tab
                    self._update_ollama_info()
                # GPU info only when a GPU exists and its tab is selected
                if self.gpu_available and self.notebook.index(self.notebook.select()) == 2:  # GPU tab
                    self._update_gpu_info()
                # Timestamp the status bar
                now = datetime.now().strftime("%H:%M:%S")
                self.status_bar.config(text=f"Moniteur LLM Lab - Dernière mise à jour: {now}")
            except Exception as e:
                # Keep the loop alive: report the failure in the log pane
                self._log(f"Erreur de mise à jour: {str(e)}")
            time.sleep(self.update_interval)
def _update_system_info(self):
"""Met à jour les informations système"""
# Informations système
self.os_label.config(text=f"{platform.system()} {platform.release()}")
self.cpu_label.config(text=f"{psutil.cpu_count(logical=False)} cœurs ({psutil.cpu_count()} threads)")
# Détection avancée de la RAM
try:
ram = psutil.virtual_memory()
total_ram = ram.total / (1024 * 1024 * 1024) # GB
# Vérification supplémentaire pour Linux
if platform.system() == "Linux":
try:
# Utiliser /proc/meminfo pour une détection plus précise
with open('/proc/meminfo', 'r') as f:
for line in f:
if 'MemTotal' in line:
# MemTotal est en kB
mem_kb = int(line.split()[1])
linux_ram = mem_kb / (1024 * 1024) # GB
# Utiliser la valeur la plus élevée
total_ram = max(total_ram, linux_ram)
break
except Exception as e:
self._log(f"Erreur lors de la lecture de /proc/meminfo: {str(e)}")
self.ram_label.config(text=f"{total_ram:.1f} GB")
except Exception as e:
self._log(f"Erreur lors de la détection de la RAM: {str(e)}")
self.ram_label.config(text="Détection impossible")
# Utilisation CPU
cpu_percent = psutil.cpu_percent()
self.cpu_progress["value"] = cpu_percent
self.cpu_percent.config(text=f"{cpu_percent:.1f}%")
# Utilisation RAM
ram_percent = ram.percent
self.ram_progress["value"] = ram_percent
self.ram_percent.config(text=f"{ram_percent:.1f}%")
# Utilisation disque
disk = psutil.disk_usage('/')
disk_percent = disk.percent
self.disk_progress["value"] = disk_percent
self.disk_percent.config(text=f"{disk_percent:.1f}%")
def _update_ollama_info(self):
"""Met à jour les informations Ollama"""
try:
# Vérifier si le serveur est en cours d'exécution
response = requests.get("http://localhost:11434/api/tags", timeout=2)
if response.status_code == 200:
self.status_label.config(text="En ligne", foreground="green")
# Mettre à jour la liste des modèles
data = response.json()
models = data.get("models", [])
# Effacer la liste actuelle
for item in self.tree.get_children():
self.tree.delete(item)
# Ajouter les modèles
for model in models:
model_name = model.get("name", "")
model_size = self._format_size(model.get("size", 0))
modified = model.get("modified_at", "")
# Convertir le format de date
if modified:
try:
modified_dt = datetime.fromisoformat(modified.replace('Z', '+00:00'))
modified = modified_dt.strftime("%d/%m/%Y %H:%M")
except:
pass
self.tree.insert("", tk.END, text=model_name, values=(model_size, modified), iid=model_name)
# Mettre à jour la liste globale
self.ollama_models = [model.get("name", "") for model in models]
# Vérifier s'il y a un modèle actif
if self.active_model:
self._log(f"Modèle actif: {self.active_model}")
# Mettre en surbrillance le modèle actif
if self.active_model in self.ollama_models:
self.tree.selection_set(self.active_model)
self.tree.see(self.active_model)
else:
self.status_label.config(text="Erreur", foreground="red")
self._log(f"Erreur de connexion au serveur Ollama: {response.status_code}")
except requests.exceptions.RequestException:
self.status_label.config(text="Hors ligne", foreground="red")
self._log("Serveur Ollama non disponible")
def _update_gpu_info(self):
"""Met à jour les informations GPU"""
if not self.gpu_available:
return
try:
# Exécuter nvidia-smi pour obtenir les informations GPU
result = subprocess.run(
['nvidia-smi', '--query-gpu=name,memory.total,memory.used,utilization.gpu,temperature.gpu',
'--format=csv,noheader,nounits'],
capture_output=True,
text=True,
check=True
)
if result.returncode == 0:
# Analyser les résultats
gpu_data = result.stdout.strip().split(',')
if len(gpu_data) >= 5:
# Nom du modèle
model_name = gpu_data[0].strip()
self.gpu_model_label.config(text=model_name)
# Mémoire totale et utilisée
total_memory = float(gpu_data[1].strip())
used_memory = float(gpu_data[2].strip())
memory_percent = (used_memory / total_memory) * 100 if total_memory > 0 else 0
self.gpu_memory_label.config(text=f"{used_memory:.0f} MiB / {total_memory:.0f} MiB")
self.gpu_mem_progress["value"] = memory_percent
self.gpu_mem_percent.config(text=f"{memory_percent:.1f}%")
# Utilisation GPU
gpu_util = float(gpu_data[3].strip())
self.gpu_compute_progress["value"] = gpu_util
self.gpu_compute_percent.config(text=f"{gpu_util:.1f}%")
# Température
temp = float(gpu_data[4].strip())
# Échelle de température: 0-100°C
self.gpu_temp_progress["value"] = temp
self.gpu_temp_label.config(text=f"{temp:.1f}°C")
# Récupérer les processus GPU
result_processes = subprocess.run(
['nvidia-smi', '--query-compute-apps=pid,name,used_memory', '--format=csv,noheader,nounits'],
capture_output=True,
text=True
)
if result_processes.returncode == 0:
# Effacer la liste actuelle
for item in self.gpu_process_tree.get_children():
self.gpu_process_tree.delete(item)
# Ajouter les processus
processes = result_processes.stdout.strip().split('\n')
for process in processes:
if process.strip():
process_data = process.split(',')
if len(process_data) >= 3:
pid = process_data[0].strip()
name = process_data[1].strip()
memory = f"{process_data[2].strip()} MiB"
# Ajouter à la liste
self.gpu_process_tree.insert("", tk.END, text=pid, values=(pid, name, memory))
# Si c'est Ollama, marquer comme modèle actif
if "ollama" in name.lower():
self._log(f"Ollama détecté sur GPU: PID {pid}, utilisant {memory}")
# Chercher quel modèle est actif
try:
process_info = psutil.Process(int(pid))
cmd_line = " ".join(process_info.cmdline())
for model in self.ollama_models:
if model in cmd_line:
self.active_model = model
self._log(f"Modèle actif détecté: {model}")
break
except:
pass
except subprocess.SubprocessError as e:
self._log(f"Erreur lors de l'exécution de nvidia-smi: {str(e)}")
except Exception as e:
self._log(f"Erreur de mise à jour GPU: {str(e)}")
def _refresh_ollama(self):
"""Force le rafraîchissement des informations Ollama"""
self._update_ollama_info()
self._log("Informations Ollama rafraîchies")
def _check_gpu_available(self):
"""Vérifie si un GPU NVIDIA est disponible"""
try:
result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
return result.returncode == 0
except:
return False
def _format_size(self, size_bytes):
"""Formate la taille en unités lisibles"""
if size_bytes < 1024:
return f"{size_bytes} B"
elif size_bytes < 1024 * 1024:
return f"{size_bytes/1024:.1f} KB"
elif size_bytes < 1024 * 1024 * 1024:
return f"{size_bytes/(1024*1024):.1f} MB"
else:
return f"{size_bytes/(1024*1024*1024):.1f} GB"
def _log(self, message):
"""Ajoute un message aux logs"""
timestamp = datetime.now().strftime("%H:%M:%S")
log_message = f"[{timestamp}] {message}\n"
# Ajouter au texte
self.log_text.insert(tk.END, log_message)
self.log_text.see(tk.END) # Défiler vers le bas
def _on_close(self):
"""Gère la fermeture de l'application"""
self.running = False
time.sleep(0.5) # Attendre que le thread se termine
self.root.destroy()
def main():
    """Program entry point: create the Tk root window and run the monitor."""
    root = tk.Tk()
    monitor = SystemMonitor(root)
    root.mainloop()


if __name__ == "__main__":
    main()