import os
import logging
from datetime import datetime

from PIL import Image

from ..base_agent import BaseAgent
from ..utils.pipeline_logger import sauvegarder_donnees

logger = logging.getLogger("AgentVisionOCR")


class AgentVisionOCR(BaseAgent):
    """
    LlamaVision agent that performs advanced OCR text extraction on an image.

    Provides a finer-grained reading pass for images kept after the triage step.
    """

    def __init__(self, llm):
        super().__init__("AgentVisionOCR", llm)
        # Low temperature keeps the extraction as deterministic as possible.
        self.params = {
            "temperature": 0.1,
            "top_p": 0.85,
            "max_tokens": 1500,
        }
        self.system_prompt = (
            "You are a multilingual OCR visual assistant. Your task is to extract "
            "all visible text from image, even if it is in French, English, or both. "
            "Guidelines: "
            "1. Include partial, blurry, or stylized characters "
            "2. Group the result by type: labels, titles, buttons, errors, URLs, etc. "
            "3. Do NOT translate any text - just extract what is visible "
            "4. Mention if the image contains unreadable or missing parts "
            "Respond in English."
        )
        self._configurer_llm()
        logger.info("AgentVisionOCR initialisé")

    def _configurer_llm(self):
        """Push the system prompt and sampling parameters onto the LLM wrapper, if it supports them."""
        if hasattr(self.llm, "prompt_system"):
            self.llm.prompt_system = self.system_prompt
        if hasattr(self.llm, "configurer"):
            self.llm.configurer(**self.params)

    def _extraire_ticket_id(self, image_path):
        """Return the ticket id component of *image_path* (e.g. 'T1234'), or 'UNKNOWN'.

        A ticket id is any path component that starts with 'T' followed only
        by digits.
        """
        for part in image_path.split(os.sep):
            # BUG FIX: original called the non-existent str.startswitch(),
            # which raised AttributeError on every invocation.
            if part.startswith("T") and part[1:].isdigit():
                return part
        return "UNKNOWN"

    def executer(self, image_path: str, ocr_baseline: str = "") -> dict:
        """Run LlamaVision visual OCR on the specified image.

        Args:
            image_path: Path to the image to analyse.
            ocr_baseline: Text previously extracted by a script OCR pass
                (kept in the result for comparison).

        Returns:
            Dict with the extracted text and metadata. On failure the dict
            carries an empty ``extracted_text`` and an ``error`` key instead
            of raising.
        """
        image_name = os.path.basename(image_path)
        print(f" AgentVisionOCR: Extraction sur {image_name}")
        try:
            if not os.path.exists(image_path):
                raise FileNotFoundError(f"Image introuvable: {image_path}")
            # BUG FIX: original tested hasattr(self, "interroge_avec_image")
            # (wrong object AND misspelled attribute), so this guard always
            # raised and the method never reached the model call.
            if not hasattr(self.llm, "interroger_avec_image"):
                raise RuntimeError("Le modèle ne supporte pas l'analyse d'images")

            response = self.llm.interroger_avec_image(image_path, self.system_prompt)
            # A refusal ("I cannot ...") is treated the same as an empty answer.
            if not response or "i cannot" in response.lower():
                raise ValueError("Réponse vide invalide du modèle")

            result = {
                "extracted_text": response.strip(),
                "image_name": image_name,
                "image_path": image_path,
                "ocr_script_text": ocr_baseline.strip(),
                "ticket_id": self._extraire_ticket_id(image_path),
                "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "source_agent": self.nom,
            }

            # Persist the result for traceability of the pipeline run.
            sauvegarder_donnees(
                ticket_id=result["ticket_id"],
                step_name="ocr_llm",
                data=result,
                base_dir=None,
                is_resultat=True,
            )

            logger.info(f"OCR LLM réussi pour {image_name}")
            return result

        except Exception as e:
            logger.error(f"Erreur lors de l'extraction OCR pour {image_name}: {e}")
            # Same shape as the success dict (minus the OCR baseline), plus
            # an "error" key so callers can detect the failure.
            return {
                "extracted_text": "",
                "image_name": image_name,
                "image_path": image_path,
                "ticket_id": self._extraire_ticket_id(image_path),
                "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "source_agent": self.nom,
                "error": str(e),
            }