# llm_ticket3/agents/llama_vision/agent_ticket_analyser.py

from ..base_agent import BaseAgent
from typing import Dict, Any, List, Union
import logging
import json
import os
from datetime import datetime
from loaders.ticket_data_loader import TicketDataLoader
from ..utils.pipeline_logger import sauvegarder_donnees
from utils.translate_utils import fr_to_en, en_to_fr
import re

# Module-level logger used by the methods of AgentTicketAnalyser below
logger = logging.getLogger("AgentTicketAnalyser")

class AgentTicketAnalyser(BaseAgent):
    def __init__(self, llm):
        """
        Build the ticket-analysis agent around the given LLM client.

        Defines the generation parameters, the English analysis instructions
        and the system prompt, creates a TicketDataLoader, then pushes the
        configuration to the LLM through _appliquer_config_locale().

        Args:
            llm: LLM client handed to BaseAgent (presumably exposed afterwards
                as self.llm - confirm against BaseAgent).
        """
        super().__init__("AgentTicketAnalyser", llm)

        # Generation settings forwarded to the LLM by _appliquer_config_locale()
        self.params = dict(temperature=0.1, top_p=0.5, max_tokens=4000)

        # Main analysis instructions (embedded verbatim in the system prompt)
        self.instructions = """
You will analyze a technical support ticket from the CBAO company.

The ticket consists of an initial message from the client and subsequent exchanges between the client and technical support.

Follow these guidelines:

1. Overview:
   - Summarize the main issue reported in the ticket
   - Identify the product or service concerned

2. Detailed Analysis:
   - Analyze each message chronologically
   - Extract key information, error messages, and details about the problem
   - Note any attached images or files mentioned

3. Links and Technical Details:
   - Identify ALL links (URLs) mentioned in the ticket
   - Extract technical terminology, specific error codes, or reference numbers
   - If a URL appears in the ticket, always include it in your summary
   - Note system configurations or version information

4. Conversation Flow:
   - Identify questions asked by support and client responses
   - Highlight information requests that remain unanswered
   - Note any action items or next steps mentioned

5. Resolution:
   - Determine if the issue was resolved
   - Summarize the solution if provided
   - Identify if the ticket was escalated or needed additional input

IMPORTANT: Your analysis should:
- Be factual and objective
- Extract ALL URLs and links
- Avoid speculation or technical diagnosis
- Be structured chronologically
- Focus on the exchange of information
- Distinguish clearly between client and support statements
"""

        # System prompt = role description + instructions + closing reminders
        preambule = """
You are an expert in support ticket analysis at CBAO, tasked with extracting and organizing information from BRG-Lab technical support tickets.

Your goal is to produce a clear, factual summary of the support interaction that will help the technical team understand:
- The issue reported
- Information exchanged
- Current status
- Any URLs or technical details that need attention

Specifically regarding URLs and technical details:
- You MUST preserve ALL links (URLs) that appear in the ticket
- Format each URL on its own line, prefixed with [URL] for visibility
- Flag any technical parameters, configuration details, or version information

"""
        conclusion = """

Your output will be used by the technical team to understand the ticket context, so factual accuracy is essential.

IMPORTANT: All responses should be in English. Translation to French will be handled separately.
"""
        self.system_prompt = preambule + self.instructions + conclusion

        self.ticket_loader = TicketDataLoader()
        self._appliquer_config_locale()
        logger.info("AgentTicketAnalyser initialisé")

    def _appliquer_config_locale(self) -> None:
        """
        Configure le LLM avec les paramètres spécifiques à cet agent
        """
        if hasattr(self.llm, "prompt_system"):
            self.llm.prompt_system = self.system_prompt

        if hasattr(self.llm, "configurer"):
            self.llm.configurer(**self.params)

    def _extraire_urls(self, texte: str) -> List[str]:
        """
        Extrait les URLs d'un texte

        Args:
            texte: Le texte à analyser

        Returns:
            Liste des URLs extraites
        """
        # Motif pour détecter les URLs (plus complet qu'une simple recherche http://)
        url_pattern = r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'

        # Chercher dans le texte avec un motif plus large pour capturer le contexte
        url_mentions = re.finditer(r'(?:URL|link|adresse|href|http)[^\n]*?(https?://[^\s\)\]\"\']+)', texte, re.IGNORECASE)

        # Liste pour stocker les URLs avec leur contexte
        urls = []

        # Ajouter les URLs extraites avec le motif générique
        for url in re.findall(url_pattern, texte):
            if url not in urls:
                urls.append(url)

        # Ajouter les URLs extraites du contexte plus large
        for match in url_mentions:
            url = match.group(1)
            if url not in urls:
                urls.append(url)

        return urls

    def executer(self, ticket_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Analyse a support ticket with the LLM and return the result in EN and FR.

        The ticket content is sent to the model in English (taken from
        `content_en`, used as-is when `is_english` is set, or translated with
        fr_to_en). The English answer is translated back with en_to_fr, the
        result is saved through sauvegarder_donnees and added to the history.

        Args:
            ticket_data: Ticket data; the keys read here are `ticket_id`,
                `file_path`, `content`, `content_en` and `is_english`

        Returns:
            On success, a dict with `prompt`, `prompt_en`, `response` (FR),
            `response_en`, `urls_extracted` and `metadata`.
            When the content is missing/too short or when the analysis fails,
            a dict with `response`, `response_en`, `error` set to True and
            `metadata`. Exceptions raised during the analysis are caught and
            reported through that error dict.
        """
        # Normalise the model name first so every return path reports it the same way
        model_name = getattr(self.llm, "modele", "llama3-vision-90b-instruct")
        if model_name:
            model_name = model_name.replace(".", "-").replace(":", "-").replace("_", "-")

        ticket_id = self._resoudre_ticket_id(ticket_data)
        ticket_content = ticket_data.get("content", "")

        print(f"  AgentTicketAnalyser: analyse du ticket {ticket_id}")

        if not ticket_content or len(ticket_content) < 10:
            logger.warning(f"Contenu du ticket {ticket_id} vide ou trop court")
            return {
                "response": "Contenu du ticket insuffisant pour analyse",
                "response_en": "Ticket content insufficient for analysis",
                "error": True,
                "metadata": self._construire_metadata(ticket_id, model_name),
            }

        try:
            # The model is queried in English, so obtain an English version of the content
            is_translated = False
            contenu_deja_traduit = ticket_data.get("content_en")

            if contenu_deja_traduit:
                # Only a non-empty pre-translated content is usable as a prompt body
                logger.info(f"[LANGUE] Utilisation du contenu déjà traduit en anglais pour le ticket {ticket_id}")
                ticket_content_en = contenu_deja_traduit
                is_translated = True
            elif ticket_data.get("is_english", False):
                logger.info(f"[LANGUE] Le contenu du ticket {ticket_id} est déjà en anglais")
                ticket_content_en = ticket_content
            else:
                logger.info(f"[TRADUCTION] Nécessité de traduire le contenu du ticket {ticket_id} en anglais")
                logger.info(f"[TRADUCTION] Taille du contenu original à traduire: {len(ticket_content)} caractères")
                ticket_content_en = fr_to_en(ticket_content)
                is_translated = True
                logger.info(f"[TRADUCTION] Traduction terminée, résultat: {len(ticket_content_en)} caractères")

            # Build the analysis prompt from the English content
            logger.info("[LANGUE] Génération du prompt d'analyse en anglais")
            prompt = self._generer_prompt({"ticket_id": ticket_id, "content": ticket_content_en})
            logger.info(f"[LANGUE] Taille du prompt en anglais: {len(prompt)} caractères")

            # Query the LLM
            logger.info(f"[LANGUE] Envoi du prompt en anglais au modèle pour analyse du ticket {ticket_id}")
            response_en = self.llm.interroger(prompt)
            logger.info(f"[LANGUE] Réponse reçue du modèle en anglais: {len(response_en)} caractères")

            # Collect the URLs quoted in the model answer
            urls = self._extraire_urls(response_en)
            if urls:
                logger.info(f"[ANALYSE] {len(urls)} URLs extraites de l'analyse: {urls}")

            # Translate the answer to French for the rest of the pipeline
            logger.info("[TRADUCTION] Traduction de la réponse d'analyse du ticket de EN vers FR")
            logger.info(f"[TRADUCTION] Taille de la réponse en anglais: {len(response_en)} caractères")
            response_fr = en_to_fr(response_en)
            logger.info(f"[TRADUCTION] Taille de la réponse traduite en français: {len(response_fr)} caractères")

            result = {
                "prompt": ticket_content,
                "prompt_en": ticket_content_en,
                "response": response_fr,
                "response_en": response_en,
                "urls_extracted": urls,
                # NOTE: `translation_markers` is kept as-is for downstream
                # readers; the responses themselves carry no marker text.
                "metadata": self._construire_metadata(
                    ticket_id,
                    model_name,
                    language="en-fr",
                    translation_markers=True,
                    is_translated=is_translated,
                ),
            }

            logger.info("[LANGUES] Résultat d'analyse disponible en deux langues: EN et FR")
            logger.info(f"[LANGUES] Traduction du contenu original: {is_translated}")

            # Saving is best-effort: a failure is logged but does not fail the analysis
            self._sauvegarder_analyse(ticket_id, result)

            self.ajouter_historique(
                "analyse_ticket",
                {"ticket_id": ticket_id, "prompt": prompt},
                result
            )

            return result

        except Exception as e:
            # Boundary handler: keep the traceback in the log, report the error to the caller
            logger.error(f"Erreur lors de l'analyse du ticket {ticket_id}: {str(e)}", exc_info=True)
            return {
                "response": f"Erreur lors de l'analyse du ticket: {str(e)}",
                "response_en": f"Error analyzing ticket: {str(e)}",
                "error": True,
                "metadata": self._construire_metadata(ticket_id, model_name, error=True),
            }

    @staticmethod
    def _resoudre_ticket_id(ticket_data: Dict[str, Any]) -> str:
        """
        Return the ticket id from the data, falling back to the file path.

        When `ticket_id` is missing or "UNKNOWN", the components of
        `file_path` are scanned for a "T<digits>" or "ticket_T..." segment.
        "UNKNOWN" is returned when nothing usable is found.
        """
        ticket_id = ticket_data.get("ticket_id", "")
        if ticket_id and ticket_id != "UNKNOWN":
            return ticket_id

        file_path = ticket_data.get("file_path")
        if isinstance(file_path, str):
            for part in file_path.split(os.path.sep):
                if part.startswith("T") and len(part) >= 2 and part[1:].isdigit():
                    return part
                if part.startswith("ticket_T"):
                    return part.replace("ticket_", "")

        return ticket_id or "UNKNOWN"

    def _construire_metadata(self, ticket_id: str, model_name: str, **extra: Any) -> Dict[str, Any]:
        """
        Build the metadata dict shared by every result returned by executer().

        Extra keyword arguments are appended after the common keys.
        """
        metadata = {
            "timestamp": self._get_timestamp(),
            "source_agent": self.nom,
            "ticket_id": ticket_id,
            "model_info": {
                "model": model_name,
                **self.params
            },
        }
        metadata.update(extra)
        return metadata

    @staticmethod
    def _trouver_repertoire_extraction(ticket_id: str, base_dir: str = "output") -> Union[str, None]:
        """
        Return the most recently modified extraction directory of a ticket.

        Looks in `<base_dir>/ticket_<ticket_id>` for sub-directories whose name
        starts with the ticket id; returns None when there is none.
        """
        ticket_dir = os.path.join(base_dir, f"ticket_{ticket_id}")
        if not os.path.isdir(ticket_dir):
            return None

        candidats = (
            os.path.join(ticket_dir, nom)
            for nom in os.listdir(ticket_dir)
            if nom.startswith(ticket_id)
        )
        extractions = [chemin for chemin in candidats if os.path.isdir(chemin)]
        return max(extractions, key=os.path.getmtime, default=None)

    def _sauvegarder_analyse(self, ticket_id: str, result: Dict[str, Any]) -> None:
        """
        Save the analysis result, logging (not raising) any failure.

        The result goes to `<extraction>/<ticket_id>_rapports` when an
        extraction directory exists; otherwise sauvegarder_donnees is called
        without base_dir and left to pick the location itself.
        """
        try:
            extraction_dir = self._trouver_repertoire_extraction(ticket_id)

            if extraction_dir:
                rapport_dir = os.path.join(extraction_dir, f"{ticket_id}_rapports")
                os.makedirs(rapport_dir, exist_ok=True)

                sauvegarder_donnees(
                    ticket_id=ticket_id,
                    step_name="analyse_ticket",
                    data=result,
                    base_dir=rapport_dir,
                    is_resultat=True
                )
                print(f"  Analyse terminée: {len(result['response'])} caractères")
            else:
                logger.error(f"Impossible de déterminer le répertoire d'extraction pour {ticket_id}")
                sauvegarder_donnees(ticket_id, "analyse_ticket", result, is_resultat=True)
        except Exception as save_err:
            logger.error(f"Erreur lors de la sauvegarde des données: {save_err}", exc_info=True)
            print(f"  Erreur de sauvegarde: {save_err}")

    def _generer_prompt(self, ticket_data: Dict[str, Any]) -> str:
        """
        Génère un prompt pour l'analyse du ticket

        Args:
            ticket_data: Données du ticket

        Returns:
            Prompt formaté pour le LLM
        """
        ticket_id = ticket_data.get("ticket_id", "UNKNOWN")
        content = ticket_data.get("content", "")

        # Ajout d'instructions spécifiques pour la capture des URLs
        prompt = f"""[ENGLISH RESPONSE REQUESTED]

### TICKET {ticket_id}

{content}

Analyze this support ticket and provide:
1. A chronological summary of the exchanges
2. Extraction of all important technical details
3. Clear identification of ALL URLs mentioned (prefix each with "[URL]")
4. Analysis of whether the issue was resolved

Present your analysis in a clear, concise format that would be helpful for a technical support team.
Focus on FACTS only, avoid interpretation or diagnosis.
"""
        return prompt

    def _formater_date(self, date_str: str) -> str:
        """
        Reformate une date pour l'uniformisation
        (Cette méthode peut être adaptée selon le format des dates dans les tickets)

        Args:
            date_str: Chaîne de date à formater

        Returns:
            Date formatée
        """
        # Formats possibles:
        # - DD/MM/YYYY HH:MM
        # - YYYY-MM-DD HH:MM:SS
        # On uniformise en YYYY-MM-DD HH:MM

        try:
            # Adapter cette partie selon les formats de date rencontrés
            if "/" in date_str:
                # Format DD/MM/YYYY
                parts = date_str.split(" ")
                date_parts = parts[0].split("/")
                time_part = parts[1] if len(parts) > 1 else "00:00"
                return f"{date_parts[2]}-{date_parts[1]}-{date_parts[0]} {time_part}"
            else:
                # Format YYYY-MM-DD
                if " " in date_str:
                    date_part, time_part = date_str.split(" ", 1)
                    time_part = time_part.split(".", 1)[0]  # Enlever les millisecondes
                    return f"{date_part} {time_part}"
                return date_str
        except Exception:
            # En cas d'erreur, retourner la date d'origine
            return date_str

    def _get_timestamp(self) -> str:
        """
        Génère un timestamp au format YYYYMMDD_HHMMSS

        Returns:
            Timestamp formaté
        """
        return datetime.now().strftime("%Y%m%d_%H%M%S")