llm_ticket3/tests/compare_models.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script pour comparer plusieurs modèles LLM sur le même ticket.
Permet d'évaluer rapidement les performances de différents modèles.
"""
import os
import sys
import argparse
import json
import time
import logging
from typing import Dict, Any, List, Optional
from tests.common.llm_factory import TEXT_MODELS, VISION_MODELS
from tests.workflows import execute_ticket_analysis, execute_image_analysis
def setup_logging(verbose: bool = False, log_file: Optional[str] = None):
"""
Configure le système de logging.
Args:
verbose: Si True, active le mode verbeux (DEBUG)
log_file: Nom du fichier de log (si None, pas de log dans un fichier)
"""
log_level = logging.DEBUG if verbose else logging.INFO
handlers: List[logging.Handler] = [logging.StreamHandler()]
if log_file:
handlers.append(logging.FileHandler(log_file))
logging.basicConfig(
level=log_level,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
handlers=handlers
)
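# Note: logging.basicConfig() only configures the root logger if it has no handlers
# yet, so calling setup_logging() a second time in the same process is a no-op
# (unless force=True is passed, available from Python 3.8 onwards).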
def compare_text_models(ticket_id: str, models: List[str], output_dir: str = "output") -> Dict[str, Any]:
"""
Compare plusieurs modèles de texte sur le même ticket.
Args:
ticket_id: ID du ticket à analyser
models: Liste des modèles à comparer
output_dir: Répertoire contenant les tickets
Returns:
Dictionnaire avec les résultats pour chaque modèle
"""
results = {
"ticket_id": ticket_id,
"type": "text",
"models": {},
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
}
    print(f"\n=== Comparing text models for ticket {ticket_id} ===\n")
    for model in models:
        print(f"Testing model: {model}")
start_time = time.time()
try:
workflow_results = execute_ticket_analysis(ticket_id, output_dir, model)
execution_time = time.time() - start_time
if "error" in workflow_results:
                print(f"  ❌ Failure for {model}: {workflow_results['error']}")
results["models"][model] = {
"success": False,
"error": workflow_results["error"],
"stage": workflow_results.get("stage", "unknown"),
"execution_time": execution_time
}
else:
analysis = workflow_results["analysis"]
                print(f"  ✅ Success for {model} in {execution_time:.2f} s: {len(analysis) if analysis else 0} characters")
results["models"][model] = {
"success": True,
"execution_time": execution_time,
"analysis_length": len(analysis) if analysis else 0,
"analysis": analysis
}
except Exception as e:
execution_time = time.time() - start_time
            print(f"  ❌ Error for {model}: {e}")
results["models"][model] = {
"success": False,
"error": str(e),
"execution_time": execution_time
}
return results
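# For reference, the dictionary returned by compare_text_models has this shape
# (values are illustrative; "some_model" stands in for any configured model name):
# {
#     "ticket_id": "TICKET-123",
#     "type": "text",
#     "timestamp": "2025-01-01 12:00:00",
#     "models": {
#         "mistral_large": {"success": True, "execution_time": 12.3,
#                           "analysis_length": 2048, "analysis": "..."},
#         "some_model": {"success": False, "error": "...", "stage": "unknown",
#                        "execution_time": 1.2},
#     },
# }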
def compare_vision_models(ticket_id: str, models: List[str], text_model: str = "mistral_large", output_dir: str = "output") -> Dict[str, Any]:
"""
Compare plusieurs modèles de vision sur le même ticket.
Args:
ticket_id: ID du ticket à analyser
models: Liste des modèles de vision à comparer
text_model: Modèle de texte à utiliser pour l'analyse du contexte
output_dir: Répertoire contenant les tickets
Returns:
Dictionnaire avec les résultats pour chaque modèle
"""
results = {
"ticket_id": ticket_id,
"type": "vision",
"text_model": text_model,
"models": {},
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
}
    print(f"\n=== Comparing vision models for ticket {ticket_id} (text model: {text_model}) ===\n")
    for model in models:
        print(f"Testing model: {model}")
start_time = time.time()
try:
workflow_results = execute_image_analysis(ticket_id, output_dir, text_model, model)
execution_time = time.time() - start_time
if "error" in workflow_results:
                print(f"  ❌ Failure for {model}: {workflow_results['error']}")
results["models"][model] = {
"success": False,
"error": workflow_results["error"],
"stage": workflow_results.get("stage", "unknown"),
"execution_time": execution_time
}
else:
images_analysed = len(workflow_results["analysis_results"])
                print(f"  ✅ Success for {model} in {execution_time:.2f} s: {images_analysed} images analysed")
results["models"][model] = {
"success": True,
"execution_time": execution_time,
"images_total": workflow_results["images_count"],
"images_relevant": len(workflow_results["relevant_images"]),
"images_analysed": images_analysed,
"analysis_results": workflow_results["analysis_results"]
}
except Exception as e:
execution_time = time.time() - start_time
            print(f"  ❌ Error for {model}: {e}")
results["models"][model] = {
"success": False,
"error": str(e),
"execution_time": execution_time
}
return results
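# The vision results mirror the text results above, with "text_model" recorded at the
# top level and, for each successful model, "images_total", "images_relevant",
# "images_analysed" and "analysis_results" in place of the analysis text.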
def print_comparison_table(results: Dict[str, Any]):
"""
Affiche un tableau de comparaison des modèles.
Args:
results: Résultats de la comparaison
"""
if results["type"] == "text":
        print("\n=== Text model comparison results ===")
        print(f"Ticket: {results['ticket_id']}")
        print(f"\n{'Model':<15} | {'Status':<10} | {'Time (s)':<10} | {'Analysis (chars)':<15}")
print("-" * 60)
for model, data in results["models"].items():
            status = "✅ Success" if data.get("success", False) else "❌ Failure"
time_str = f"{data.get('execution_time', 0):.2f}"
length = data.get("analysis_length", "N/A")
print(f"{model:<15} | {status:<10} | {time_str:<10} | {length:<15}")
elif results["type"] == "vision":
        print("\n=== Vision model comparison results ===")
        print(f"Ticket: {results['ticket_id']}")
        print(f"Text model: {results['text_model']}")
        print(f"\n{'Model':<15} | {'Status':<10} | {'Time (s)':<10} | {'Images analysed/relevant/total':<20}")
        print("-" * 75)
for model, data in results["models"].items():
            status = "✅ Success" if data.get("success", False) else "❌ Failure"
time_str = f"{data.get('execution_time', 0):.2f}"
if data.get("success", False):
images = f"{data.get('images_analysed', 0)}/{data.get('images_relevant', 0)}/{data.get('images_total', 0)}"
else:
images = "N/A"
print(f"{model:<15} | {status:<10} | {time_str:<10} | {images:<20}")
def main():
    parser = argparse.ArgumentParser(description="Compare several LLM models on the same ticket")
    parser.add_argument("ticket_id", help="ID of the ticket to analyse")
    parser.add_argument("--type", choices=["text", "vision", "both"], default="text",
                        help="Type of models to compare")
    parser.add_argument("--text-models", nargs="+", choices=TEXT_MODELS,
                        help="Text models to compare (default: all)")
    parser.add_argument("--vision-models", nargs="+", choices=VISION_MODELS,
                        help="Vision models to compare (default: all)")
    parser.add_argument("--text-model", choices=TEXT_MODELS, default="mistral_large",
                        help="Text model to use for the vision tests")
    parser.add_argument("--output-dir", default="output", help="Ticket directory")
    parser.add_argument("--save", action="store_true", help="Save the results to JSON")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose mode")
args = parser.parse_args()
    # Configure logging
    setup_logging(args.verbose, f"compare_models_{args.ticket_id}.log")
    # Select the models to compare
text_models = args.text_models if args.text_models else TEXT_MODELS
vision_models = args.vision_models if args.vision_models else VISION_MODELS
    # Run the comparisons
results = {}
if args.type in ["text", "both"]:
text_results = compare_text_models(args.ticket_id, text_models, args.output_dir)
results["text"] = text_results
print_comparison_table(text_results)
if args.save:
save_dir = os.path.join("results", "comparisons", args.ticket_id)
os.makedirs(save_dir, exist_ok=True)
save_file = os.path.join(save_dir, f"text_comparison_{int(time.time())}.json")
with open(save_file, "w", encoding="utf-8") as f:
json.dump(text_results, f, ensure_ascii=False, indent=2)
            print(f"Results saved to: {save_file}")
if args.type in ["vision", "both"]:
vision_results = compare_vision_models(args.ticket_id, vision_models, args.text_model, args.output_dir)
results["vision"] = vision_results
print_comparison_table(vision_results)
if args.save:
save_dir = os.path.join("results", "comparisons", args.ticket_id)
os.makedirs(save_dir, exist_ok=True)
save_file = os.path.join(save_dir, f"vision_comparison_{int(time.time())}.json")
with open(save_file, "w", encoding="utf-8") as f:
json.dump(vision_results, f, ensure_ascii=False, indent=2)
            print(f"Results saved to: {save_file}")
return 0
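# A saved comparison can be reloaded later for inspection, e.g. (illustrative snippet;
# the exact path depends on the ticket ID and the timestamp used when saving):
#   with open("results/comparisons/TICKET-123/text_comparison_1700000000.json",
#             encoding="utf-8") as f:
#       comparison = json.load(f)
#   for model, data in comparison["models"].items():
#       print(model, data["success"], f"{data['execution_time']:.2f}s")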
if __name__ == "__main__":
sys.exit(main())