#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script to compare several LLM models on the same ticket.

Allows a quick evaluation of how different models perform.
"""

import os
import sys
import argparse
import json
import time
import logging
from typing import Dict, Any, List, Optional

from tests.common.llm_factory import TEXT_MODELS, VISION_MODELS
from tests.workflows import execute_ticket_analysis, execute_image_analysis


def setup_logging(verbose: bool = False, log_file: Optional[str] = None):
    """
    Configure the logging system.

    Args:
        verbose: If True, enable verbose mode (DEBUG)
        log_file: Name of the log file (if None, no file logging)
    """
    log_level = logging.DEBUG if verbose else logging.INFO

    handlers: List[logging.Handler] = [logging.StreamHandler()]
    if log_file:
        handlers.append(logging.FileHandler(log_file))

    logging.basicConfig(
        level=log_level,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=handlers
    )


def compare_text_models(ticket_id: str, models: List[str],
                        output_dir: str = "output") -> Dict[str, Any]:
    """
    Compare several text models on the same ticket.

    Args:
        ticket_id: ID of the ticket to analyse
        models: List of models to compare
        output_dir: Directory containing the tickets

    Returns:
        Dictionary with the results for each model
    """
    results = {
        "ticket_id": ticket_id,
        "type": "text",
        "models": {},
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
    }

    print(f"\n=== Comparing text models for ticket {ticket_id} ===\n")

    for model in models:
        print(f"Testing model: {model}")
        start_time = time.time()

        try:
            workflow_results = execute_ticket_analysis(ticket_id, output_dir, model)
            execution_time = time.time() - start_time

            if "error" in workflow_results:
                print(f"  ❌ Failure for {model}: {workflow_results['error']}")
                results["models"][model] = {
                    "success": False,
                    "error": workflow_results["error"],
                    "stage": workflow_results.get("stage", "unknown"),
                    "execution_time": execution_time
                }
            else:
                analysis = workflow_results["analysis"]
                print(f"  ✅ Success for {model} in {execution_time:.2f} s: "
                      f"{len(analysis) if analysis else 0} characters")
                results["models"][model] = {
                    "success": True,
                    "execution_time": execution_time,
                    "analysis_length": len(analysis) if analysis else 0,
                    "analysis": analysis
                }
        except Exception as e:
            execution_time = time.time() - start_time
            print(f"  ❌ Error for {model}: {e}")
            results["models"][model] = {
                "success": False,
                "error": str(e),
                "execution_time": execution_time
            }

    return results
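
# Illustrative programmatic use (a sketch, not executed by this script).
# The ticket ID and the second model name are placeholders; real model
# identifiers come from TEXT_MODELS in tests.common.llm_factory, and the
# module name "compare_models" is assumed from the log-file prefix below.
#
#   from compare_models import compare_text_models
#   results = compare_text_models("TICKET-123", ["mistral_large", "another_model"])
#   for name, data in results["models"].items():
#       print(name, data["success"], f"{data['execution_time']:.2f}s")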


def compare_vision_models(ticket_id: str, models: List[str],
                          text_model: str = "mistral_large",
                          output_dir: str = "output") -> Dict[str, Any]:
    """
    Compare several vision models on the same ticket.

    Args:
        ticket_id: ID of the ticket to analyse
        models: List of vision models to compare
        text_model: Text model used for the context analysis
        output_dir: Directory containing the tickets

    Returns:
        Dictionary with the results for each model
    """
    results = {
        "ticket_id": ticket_id,
        "type": "vision",
        "text_model": text_model,
        "models": {},
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
    }

    print(f"\n=== Comparing vision models for ticket {ticket_id} (text: {text_model}) ===\n")

    for model in models:
        print(f"Testing model: {model}")
        start_time = time.time()

        try:
            workflow_results = execute_image_analysis(ticket_id, output_dir, text_model, model)
            execution_time = time.time() - start_time

            if "error" in workflow_results:
                print(f"  ❌ Failure for {model}: {workflow_results['error']}")
                results["models"][model] = {
                    "success": False,
                    "error": workflow_results["error"],
                    "stage": workflow_results.get("stage", "unknown"),
                    "execution_time": execution_time
                }
            else:
                images_analysed = len(workflow_results["analysis_results"])
                print(f"  ✅ Success for {model} in {execution_time:.2f} s: "
                      f"{images_analysed} images analysed")
                results["models"][model] = {
                    "success": True,
                    "execution_time": execution_time,
                    "images_total": workflow_results["images_count"],
                    "images_relevant": len(workflow_results["relevant_images"]),
                    "images_analysed": images_analysed,
                    "analysis_results": workflow_results["analysis_results"]
                }
        except Exception as e:
            execution_time = time.time() - start_time
            print(f"  ❌ Error for {model}: {e}")
            results["models"][model] = {
                "success": False,
                "error": str(e),
                "execution_time": execution_time
            }

    return results


def print_comparison_table(results: Dict[str, Any]):
    """
    Print a comparison table of the models.

    Args:
        results: Results of the comparison
    """
    if results["type"] == "text":
        print("\n=== Text model comparison results ===")
        print(f"Ticket: {results['ticket_id']}")
        print(f"\n{'Model':<15} | {'Status':<10} | {'Time (s)':<10} | {'Analysis size':<15}")
        print("-" * 60)

        for model, data in results["models"].items():
            status = "✅ Success" if data.get("success", False) else "❌ Failure"
            time_str = f"{data.get('execution_time', 0):.2f}"
            length = data.get("analysis_length", "N/A")
            print(f"{model:<15} | {status:<10} | {time_str:<10} | {length:<15}")

    elif results["type"] == "vision":
        print("\n=== Vision model comparison results ===")
        print(f"Ticket: {results['ticket_id']}")
        print(f"Text model: {results['text_model']}")
        print(f"\n{'Model':<15} | {'Status':<10} | {'Time (s)':<10} | {'Images (ana/rel/tot)':<20}")
        print("-" * 65)

        for model, data in results["models"].items():
            status = "✅ Success" if data.get("success", False) else "❌ Failure"
            time_str = f"{data.get('execution_time', 0):.2f}"
            if data.get("success", False):
                # analysed / relevant / total
                images = (f"{data.get('images_analysed', 0)}/"
                          f"{data.get('images_relevant', 0)}/"
                          f"{data.get('images_total', 0)}")
            else:
                images = "N/A"
            print(f"{model:<15} | {status:<10} | {time_str:<10} | {images:<20}")
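

# Optional sketch (not called from main()): rank successful models by runtime.
# It relies only on keys that compare_text_models / compare_vision_models
# actually produce ("success" and "execution_time").
def fastest_successful_models(results: Dict[str, Any]) -> List[str]:
    """Return the names of the successful models, fastest first."""
    successful = [
        (model, data["execution_time"])
        for model, data in results["models"].items()
        if data.get("success", False)
    ]
    return [model for model, _ in sorted(successful, key=lambda item: item[1])]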
parser.add_argument("--text-model", choices=TEXT_MODELS, default="mistral_large", help="Modèle texte à utiliser pour les tests vision") parser.add_argument("--output-dir", default="output", help="Répertoire des tickets") parser.add_argument("--save", action="store_true", help="Sauvegarder les résultats") parser.add_argument("--verbose", "-v", action="store_true", help="Mode verbeux") args = parser.parse_args() # Configuration du logging setup_logging(args.verbose, f"compare_models_{args.ticket_id}.log") # Sélection des modèles à comparer text_models = args.text_models if args.text_models else TEXT_MODELS vision_models = args.vision_models if args.vision_models else VISION_MODELS # Comparaison des modèles results = {} if args.type in ["text", "both"]: text_results = compare_text_models(args.ticket_id, text_models, args.output_dir) results["text"] = text_results print_comparison_table(text_results) if args.save: save_dir = os.path.join("results", "comparisons", args.ticket_id) os.makedirs(save_dir, exist_ok=True) save_file = os.path.join(save_dir, f"text_comparison_{int(time.time())}.json") with open(save_file, "w", encoding="utf-8") as f: json.dump(text_results, f, ensure_ascii=False, indent=2) print(f"Résultats sauvegardés dans: {save_file}") if args.type in ["vision", "both"]: vision_results = compare_vision_models(args.ticket_id, vision_models, args.text_model, args.output_dir) results["vision"] = vision_results print_comparison_table(vision_results) if args.save: save_dir = os.path.join("results", "comparisons", args.ticket_id) os.makedirs(save_dir, exist_ok=True) save_file = os.path.join(save_dir, f"vision_comparison_{int(time.time())}.json") with open(save_file, "w", encoding="utf-8") as f: json.dump(vision_results, f, ensure_ascii=False, indent=2) print(f"Résultats sauvegardés dans: {save_file}") return 0 if __name__ == "__main__": sys.exit(main())