import os
import logging
from datetime import datetime
from typing import Optional

from PIL import Image

# Pillow >= 9.1 exposes resampling filters via the Resampling enum; older
# versions use module-level constants on Image.
try:
    from PIL.Image import Resampling
    RESAMPLING_METHOD = Resampling.BICUBIC
except (ImportError, AttributeError):
    # Fallback for old PIL/Pillow versions; 3 is the numeric value of BICUBIC.
    RESAMPLING_METHOD = getattr(Image, 'BICUBIC', 3)

from ..base_agent import BaseAgent
from ..utils.pipeline_logger import sauvegarder_donnees

logger = logging.getLogger("AgentVisionOCR")


class AgentVisionOCR(BaseAgent):
    """
    LlamaVision agent that extracts text (advanced OCR) from an image.

    Fixed version with automatic upscaling helper, an improved prompt,
    output cleanup and per-run deduplication of already-processed images.
    """

    def __init__(self, llm):
        """Configure the vision LLM, the OCR prompt and the result buffers.

        Args:
            llm: vision-capable model wrapper; expected (but not required) to
                expose ``prompt_system``, ``configurer(**params)`` and
                ``interroger_avec_image(path, prompt)``.
        """
        super().__init__("AgentVisionOCR", llm)
        # Model sampling/runtime parameters forwarded to the LLM wrapper and
        # echoed into each result's "model_info" for traceability.
        self.params = {
            "stream": False,
            "seed": 0,
            #"stop_sequence": [],
            "temperature": 1.5,
            #"reasoning_effort": 0.5,
            #"logit_bias": {},
            "mirostat": 0,
            "mirostat_eta": 0.1,
            "mirostat_tau": 5.0,
            "top_k": 40,
            "top_p": 0.85,
            "min_p": 0.05,
            "frequency_penalty": 0.0,
            "presence_penalty": 0.0,
            "repeat_penalty": 1.1,
            "repeat_last_n": 128,
            "tfs_z": 1.0,
            "num_keep": 0,
            "num_predict": 4096,
            "num_ctx": 16384,
            "num_batch": 2048,
            #"mmap": True,
            #"mlock": False,
            #"num_thread": 4,
            #"num_gpu": 1
        }
        # Optimized OCR prompt (runtime string — content intentionally verbatim).
        self.system_prompt = ("""
Your task is to extract all visible text from the provided image with absolute completeness.
- Do not interpret, summarize, rephrase or explain the content.
- Do not add anything that is not explicitly written in the image.
- Do not describe the layout, style, or appearance.
- Do not translate. Return text in its original language.
Return every visible string, including those that are:
- Small, low contrast, partially hidden or truncated
- Located in headers, sidebars, buttons, labels, fields, tooltips, or footers
- System warnings or unreachable server messages (e.g. bottom page alerts)
Segment your extraction into the following visual zones:
- Top (header, navigation bar, or browser URL bar)
- Left (sidebar, vertical menu, or vertical labels)
- Center (main content panel or form title)
- Bottom-left (lower-left footers, labels or messages)
- Bottom-right (lower-right warnings or error dialogs)
Under each section, return only the raw text seen in that area, preserving the line order (top to bottom).
Strict instructions:
- Include interface elements like buttons, labels, collapsed or disabled items.
- Include all visible text, even if duplicated elsewhere.
- Do NOT skip faint or cropped strings if even partially visible.
- Do NOT say "No visible text". Just skip zones silently if nothing is found.
Format:
Top:
* text line
Left:
* text line
...
Only include a zone if at least one string is found in that area. Skip empty sections silently.
""")
        self._configurer_llm()
        self.resultats = []           # accumulated OCR results (success + error dicts)
        self.images_traitees = set()  # absolute paths already processed this run
        logger.info("AgentVisionOCR initialisé avec upscale automatique + prompt amélioré.")

    def _configurer_llm(self):
        """Push the system prompt and sampling params onto the LLM, if supported."""
        if hasattr(self.llm, "prompt_system"):
            self.llm.prompt_system = self.system_prompt
        if hasattr(self.llm, "configurer"):
            self.llm.configurer(**self.params)

    def _extraire_ticket_id(self, image_path: Optional[str]) -> str:
        """Derive a ticket id ("T<digits>") from a path segment, else "UNKNOWN".

        Recognizes segments shaped like "T12345" or "ticket_T12345".
        """
        if not image_path:
            return "UNKNOWN"
        segments = image_path.replace('\\', '/').split('/')
        for segment in segments:
            if segment.startswith('T') and segment[1:].isdigit():
                return segment
            # "ticket_T" is 8 characters; the remainder must be the digits.
            if segment.startswith('ticket_T') and segment[8:].isdigit():
                return 'T' + segment[8:]
        return "UNKNOWN"

    def _agrandir_image(self, image_path: str, facteur: float = 1.5) -> str:
        """Upscale the image by ``facteur`` and save a "_upscaled" copy.

        Returns the path of the upscaled copy, or the original path if the
        upscale fails for any reason (best-effort fallback).
        """
        try:
            # Context manager guarantees the source file handle is released
            # even if resize raises (the original leaked it).
            with Image.open(image_path) as image:
                largeur, hauteur = image.size
                nouvelle_taille = (int(largeur * facteur), int(hauteur * facteur))
                image_upscaled = image.resize(nouvelle_taille, RESAMPLING_METHOD)
            # splitext handles any extension; the previous
            # `.replace(".png", "_upscaled.png")` was a no-op for non-PNG
            # paths and made save() overwrite the original file.
            racine, extension = os.path.splitext(image_path)
            temp_path = f"{racine}_upscaled{extension}"
            image_upscaled.save(temp_path)
            logger.info(f"[OCR-LLM] Image agrandie sauvegardée: {temp_path}")
            return temp_path
        except Exception as e:
            logger.error(f"[OCR-LLM] Erreur lors de l'agrandissement: {e}")
            return image_path  # fallback: let OCR run on the original image

    def executer(self, image_path: str, ocr_baseline: str = "",
                 ticket_id: Optional[str] = None) -> dict:
        """Run OCR on one image via the vision LLM.

        Args:
            image_path: path to the image to process.
            ocr_baseline: unused here; kept for interface compatibility.
            ticket_id: optional ticket id; derived from the path when absent.

        Returns:
            A result dict with "extracted_text", image/ticket metadata and
            "model_info"; on failure the dict carries an "error" key and an
            empty "extracted_text". Duplicate images return a marker dict
            with "is_duplicate": True.
        """
        image_path_abs = os.path.abspath(image_path)
        image_name = os.path.basename(image_path)

        # Skip images already processed in this run.
        if image_path_abs in self.images_traitees:
            logger.warning(f"[OCR-LLM] Image déjà traitée, ignorée: {image_name}")
            print(f" AgentVisionOCR: Image déjà traitée, ignorée: {image_name}")
            return {
                "extracted_text": "DUPLICATE - Already processed",
                "image_name": image_name,
                "image_path": image_path_abs,
                "ticket_id": ticket_id or self._extraire_ticket_id(image_path),
                "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "source_agent": self.nom,
                "is_duplicate": True
            }

        self.images_traitees.add(image_path_abs)
        logger.info(f"[OCR-LLM] Extraction OCR sur {image_name}")
        print(f" AgentVisionOCR: Extraction OCR sur {image_name}")
        ticket_id = ticket_id or self._extraire_ticket_id(image_path)

        try:
            if not os.path.exists(image_path):
                raise FileNotFoundError(f"Image introuvable: {image_path}")
            if not hasattr(self.llm, "interroger_avec_image"):
                raise RuntimeError("Le modèle ne supporte pas l'analyse d'images.")

            # NOTE(review): upscaling is currently disabled — _agrandir_image
            # is never called and the original image is sent as-is. Confirm
            # whether `upscale_path = self._agrandir_image(image_path)` should
            # be restored (the class docstring/log advertise auto-upscale).
            upscale_path = image_path

            # Query the vision model.
            response = self.llm.interroger_avec_image(upscale_path, self.system_prompt)
            # Refusal phrases ("I cannot ...") are treated as failures.
            if not response or "i cannot" in response.lower():
                raise ValueError("Réponse vide ou invalide du modèle")

            cleaned_text = response.strip()
            # Normalize the model name for use in file/pipeline identifiers.
            model_name = getattr(self.llm, "pipeline_normalized_name",
                                 getattr(self.llm, "modele", "llama3-vision-90b-instruct"))
            model_name = model_name.replace(".", "-").replace(":", "-").replace("_", "-")

            result = {
                "extracted_text": cleaned_text,
                "image_name": image_name,
                "image_path": image_path_abs,
                "ticket_id": ticket_id,
                "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "source_agent": self.nom,
                "model_info": {
                    "model": model_name,
                    **self.params
                }
            }
            self.resultats.append(result)
            logger.info(f"[OCR-LLM] OCR réussi ({len(cleaned_text)} caractères) pour {image_name}")
            return result

        except Exception as e:
            # Errors are recorded as results too, so downstream steps see
            # exactly one entry per image.
            error_result = {
                "extracted_text": "",
                "image_name": image_name,
                "image_path": image_path_abs,
                "ticket_id": ticket_id or "UNKNOWN",
                "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "source_agent": self.nom,
                "error": str(e),
                "model_info": {
                    "model": getattr(self.llm, "pipeline_normalized_name",
                                     "llama3-vision-90b-instruct"),
                    **self.params
                }
            }
            self.resultats.append(error_result)
            logger.error(f"[OCR-LLM] Erreur OCR pour {image_name}: {e}")
            return error_result

    def sauvegarder_resultats(self, ticket_id: str = "T11143") -> None:
        """Persist buffered results (deduplicated by image path) and reset buffers.

        For each image path only the result with the latest timestamp is kept.
        NOTE(review): the default ticket id "T11143" looks like a leftover
        test value — confirm callers always pass an explicit ticket_id.
        """
        if not self.resultats:
            logger.warning("[OCR-LLM] Aucun résultat à sauvegarder")
            return

        # Keep the most recent result per image path.
        resultats_dedupliques = {}
        for resultat in self.resultats:
            image_path = resultat.get("image_path")
            if not image_path:
                continue
            if image_path not in resultats_dedupliques or \
                    resultat.get("timestamp", "") > resultats_dedupliques[image_path].get("timestamp", ""):
                resultats_dedupliques[image_path] = resultat

        resultats_finaux = list(resultats_dedupliques.values())
        try:
            logger.info(f"[OCR-LLM] Sauvegarde de {len(resultats_finaux)} résultats")
            sauvegarder_donnees(
                ticket_id=ticket_id,
                step_name="ocr_llm",
                data=resultats_finaux,
                base_dir=None,
                is_resultat=True
            )
            # Reset buffers only after a successful save.
            self.resultats = []
            self.images_traitees = set()
        except Exception as e:
            logger.error(f"[OCR-LLM] Erreur sauvegarde résultats: {e}")