import os
import json
import logging
from datetime import datetime
from typing import Optional

from ..base_agent import BaseAgent
from ..utils.pipeline_logger import sauvegarder_donnees

logger = logging.getLogger("AgentVisionOCR")


class AgentVisionOCR(BaseAgent):
    """LlamaVision agent that extracts text (advanced OCR) from an image.

    The agent sends an image together with a fixed OCR prompt to a
    vision-capable LLM wrapper and returns the raw extracted text along with
    bookkeeping metadata (image name/path, ticket id, timestamp, model
    parameters).

    Attributes set by this class:
        params: Generation parameters forwarded to ``llm.configurer`` when the
            wrapper exposes that method; also copied into every result's
            ``model_info``.
        system_prompt: The OCR instruction prompt sent with every image.
        resultats: Results (successes and errors) accumulated since the last
            successful ``sauvegarder_resultats`` call.
        images_traitees: Absolute paths of the images already submitted, used
            to short-circuit repeated calls on the same image.

    ``self.llm`` and ``self.nom`` are read throughout; they are presumably set
    by ``BaseAgent.__init__`` (not visible here — confirm against the base
    class).
    """

    # Label recorded in ``model_info`` when the LLM wrapper exposes no name.
    _MODELE_PAR_DEFAUT = "llama3-vision-90b-instruct"

    def __init__(self, llm):
        """Store the model parameters and OCR prompt, then configure ``llm``.

        Args:
            llm: LLM wrapper. ``prompt_system``, ``configurer`` and
                ``interroger_avec_image`` are used when present.
        """
        super().__init__("AgentVisionOCR", llm)

        # Model parameter configuration.
        self.params = {
            "stream": False,
            "seed": 0,
            "temperature": 1.3,
            "mirostat": 0,
            "mirostat_eta": 0.1,
            "mirostat_tau": 5.0,
            "top_k": 35,
            "top_p": 0.85,
            "min_p": 0.06,
            "frequency_penalty": 0.15,
            "presence_penalty": 0.1,
            "repeat_penalty": 1.15,
            "repeat_last_n": 128,
            "tfs_z": 1.0,
            "num_keep": 0,
            "num_predict": 2048,
            "num_ctx": 16384,
            "num_batch": 2048,
        }

        # Optimised OCR prompt.
        # NOTE(review): the wording below is unchanged; the line breaks were
        # reconstructed from a whitespace-collapsed copy of this file —
        # confirm against the canonical prompt if exact layout matters.
        self.system_prompt = ("""You are tasked with performing a high-precision OCR extraction on a partially cropped screenshot of a technical web interface.

GOAL: Extract **all visible and partially visible text** from the image, no matter how small, faint, or cropped. Structure the output clearly to maximize usability.

FORMAT THE OUTPUT AS FOLLOWS:

---

1. PAGE STRUCTURE
* Page title or headers (e.g., test names, document references)
* Section labels or grouping titles
* URL or navigation path (if visible)

2. LABORATORY DATA
* Sample references or IDs (e.g., 25-00075)
* Material descriptions (e.g., Sable 0/2 C)
* Dates, operators (e.g., 02/04/2025 – BOLLÉE Victor)

3. UI ELEMENTS
* Sidebar text, tabs, and form labels
* Buttons (e.g., RAZ, IMPRIMER, ENREGISTRER)
* Navigation or functional labels (e.g., ESSAI, SMQ)

4. SYSTEM MESSAGES & ERRORS
* Any status or error message (e.g., "Impossible de trouver l'adresse IP du serveur de zkt1.brg-lab.com")
* Domain names, IPs, server paths

5. METADATA
* Document standards (e.g., NF EN 933-9)
* Version/date (e.g., 02-2022)
* Any hidden identifiers, codes, or repeated numbers

6. PERIPHERAL / UNCERTAIN TEXT
* Logos (e.g., BRG-LAB), watermarks, faded/blurred text
* Partially cropped lines: use “[...]” to indicate incomplete text

RULES:
- Do NOT interpret or translate.
- Preserve all punctuation, accents, and casing.
- Include duplicated text if present multiple times.
- Mark unclear/partial lines explicitly.
- Report even faint or background text.

This is a high-precision extraction. Output clean bullet points grouped by section. Stay fully factual.
""")

        self._configurer_llm()
        self.resultats = []
        self.images_traitees = set()
        logger.info("AgentVisionOCR initialisé avec prompt amélioré.")

    def _configurer_llm(self):
        """Push the system prompt and parameters to the LLM wrapper.

        Each step is applied only when the wrapper exposes the corresponding
        attribute, so wrappers lacking ``prompt_system`` or ``configurer`` are
        left untouched.
        """
        if hasattr(self.llm, "prompt_system"):
            self.llm.prompt_system = self.system_prompt
        if hasattr(self.llm, "configurer"):
            self.llm.configurer(**self.params)

    def _extraire_ticket_id(self, image_path):
        """Derive a ticket id from the directory components of ``image_path``.

        The path is split on ``/`` (backslashes are normalised first). The
        first component shaped like ``T<digits>`` is returned as-is; a
        component shaped like ``ticket_T<digits>`` is returned as
        ``T<digits>``.

        Returns:
            The ticket id, or ``"UNKNOWN"`` when the path is empty or no
            component matches.
        """
        if not image_path:
            return "UNKNOWN"

        for segment in image_path.replace('\\', '/').split('/'):
            if segment.startswith('T') and segment[1:].isdigit():
                return segment
            # 8 == len('ticket_T'): keep only the numeric suffix.
            if segment.startswith('ticket_T') and segment[8:].isdigit():
                return 'T' + segment[8:]
        return "UNKNOWN"

    def _nom_modele(self) -> str:
        """Return the normalised model label stored in ``model_info``.

        Lookup order: ``llm.pipeline_normalized_name``, then ``llm.modele``,
        then the class default. Missing or falsy values are skipped, and the
        value is coerced to ``str`` so that odd metadata can never raise and
        turn a successful extraction into an error. ``.``, ``:`` and ``_`` are
        replaced by ``-``.
        """
        nom = (
            getattr(self.llm, "pipeline_normalized_name", None)
            or getattr(self.llm, "modele", None)
            or self._MODELE_PAR_DEFAUT
        )
        return str(nom).replace(".", "-").replace(":", "-").replace("_", "-")

    def _resultat_de_base(self, extracted_text, image_name, image_path_abs, ticket_id) -> dict:
        """Build the fields shared by success, error and duplicate results."""
        return {
            "extracted_text": extracted_text,
            "image_name": image_name,
            "image_path": image_path_abs,
            "ticket_id": ticket_id,
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "source_agent": self.nom,
        }

    def executer(self, image_path: str, ocr_baseline: str = "", ticket_id: Optional[str] = None) -> dict:
        """Run the OCR prompt on one image and return a result dictionary.

        Args:
            image_path: Path of the image to analyse.
            ocr_baseline: Accepted for interface compatibility; not used by
                this method.
            ticket_id: Ticket identifier. When falsy it is derived from
                ``image_path`` via ``_extraire_ticket_id``.

        Returns:
            A dict with ``extracted_text``, ``image_name``, ``image_path``
            (absolute), ``ticket_id``, ``timestamp`` and ``source_agent``,
            plus one of:

            * ``model_info`` on success;
            * ``error`` and ``model_info`` when extraction failed
              (``extracted_text`` is then ``""``);
            * ``is_duplicate: True`` when the image was already submitted
              (``extracted_text`` is then a fixed marker string, and the
              result is not added to ``self.resultats``).

        Failures inside the extraction step are caught and reported through
        the returned dict rather than raised.
        """
        image_path_abs = os.path.abspath(image_path)
        image_name = os.path.basename(image_path)
        ticket_id = ticket_id or self._extraire_ticket_id(image_path)

        if image_path_abs in self.images_traitees:
            logger.warning("[OCR-LLM] Image déjà traitée, ignorée: %s", image_name)
            print(f" AgentVisionOCR: Image déjà traitée, ignorée: {image_name}")
            doublon = self._resultat_de_base(
                "DUPLICATE - Already processed", image_name, image_path_abs, ticket_id
            )
            doublon["is_duplicate"] = True
            return doublon

        # NOTE(review): the image is registered before extraction succeeds, so
        # a failed image is reported as a duplicate on any later call until
        # sauvegarder_resultats() resets the set. Kept as-is because it also
        # prevents repeated model calls on a failing image — confirm intent.
        self.images_traitees.add(image_path_abs)
        logger.info("[OCR-LLM] Extraction OCR sur %s", image_name)
        print(f" AgentVisionOCR: Extraction OCR sur {image_name}")

        try:
            if not os.path.exists(image_path):
                raise FileNotFoundError(f"Image introuvable: {image_path}")
            if not hasattr(self.llm, "interroger_avec_image"):
                raise RuntimeError("Le modèle ne supporte pas l'analyse d'images.")

            # Query the model.
            response = self.llm.interroger_avec_image(image_path, self.system_prompt)

            # NOTE(review): "i cannot" anywhere in the response is treated as
            # a refusal; legitimate OCR text containing it is rejected too.
            if not response or "i cannot" in response.lower():
                raise ValueError("Réponse vide ou invalide du modèle")

            cleaned_text = response.strip()
            result = self._resultat_de_base(cleaned_text, image_name, image_path_abs, ticket_id)
            result["model_info"] = {"model": self._nom_modele(), **self.params}

            self.resultats.append(result)
            logger.info(
                "[OCR-LLM] OCR réussi (%d caractères) pour %s",
                len(cleaned_text),
                image_name,
            )
            return result

        except Exception as e:
            # Deliberate catch-all: every failure is reported to the caller as
            # an error result rather than interrupting the pipeline.
            error_result = self._resultat_de_base("", image_name, image_path_abs, ticket_id or "UNKNOWN")
            error_result["error"] = str(e)
            # Same normalised label as the success path.
            error_result["model_info"] = {"model": self._nom_modele(), **self.params}

            self.resultats.append(error_result)
            logger.error("[OCR-LLM] Erreur OCR pour %s: %s", image_name, e)
            return error_result

    def sauvegarder_resultats(self, ticket_id: str = "T11143") -> None:
        """Persist the accumulated results, keeping one entry per image.

        Results are de-duplicated on ``image_path``; when several entries
        share a path, the one with the greatest ``timestamp`` string wins
        (the first one on ties). Entries without an ``image_path`` are
        dropped. After a successful save, ``resultats`` and
        ``images_traitees`` are reset; on failure the error is logged and the
        state is left untouched so the save can be retried.

        Args:
            ticket_id: Ticket under which the results are stored.
                NOTE(review): the default is a hard-coded ticket id; callers
                should pass the real one explicitly.
        """
        if not self.resultats:
            logger.warning("[OCR-LLM] Aucun résultat à sauvegarder")
            return

        resultats_dedupliques = {}
        for resultat in self.resultats:
            image_path = resultat.get("image_path")
            if not image_path:
                continue
            existant = resultats_dedupliques.get(image_path)
            if existant is None or resultat.get("timestamp", "") > existant.get("timestamp", ""):
                resultats_dedupliques[image_path] = resultat

        resultats_finaux = list(resultats_dedupliques.values())

        try:
            logger.info("[OCR-LLM] Sauvegarde de %d résultats", len(resultats_finaux))
            sauvegarder_donnees(
                ticket_id=ticket_id,
                step_name="ocr_llm",
                data=resultats_finaux,
                base_dir=None,
                is_resultat=True,
            )
            self.resultats = []
            self.images_traitees = set()
        except Exception as e:
            # Best-effort persistence: log and keep the in-memory results.
            logger.error("[OCR-LLM] Erreur sauvegarde résultats: %s", e)