import os
import json
import logging
from datetime import datetime
from typing import Optional

from ..base_agent import BaseAgent
from ..utils.pipeline_logger import sauvegarder_donnees

logger = logging.getLogger("AgentVisionOCR")


class AgentVisionOCR(BaseAgent):
    """
    LlamaVision agent that extracts text (advanced OCR) from an image.
    """

    def __init__(self, llm):
        super().__init__("AgentVisionOCR", llm)

        # Model parameter configuration
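        # Note: the parameter keys below match Ollama-style generation options
        # (sampling, context window, batch size). This is an assumption; the
        # actual backend behind `llm` is not visible in this module.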
        self.params = {
            "stream": False,
            "seed": 0,
            #"stop_sequence": [],
            "temperature": 1.3,
            #"reasoning_effort": 0.5,
            #"logit_bias": {},
            "mirostat": 0,
            "mirostat_eta": 0.1,
            "mirostat_tau": 5.0,
            "top_k": 35,
            "top_p": 0.85,
            "min_p": 0.06,
            "frequency_penalty": 0.15,
            "presence_penalty": 0.1,
            "repeat_penalty": 1.15,
            "repeat_last_n": 128,
            "tfs_z": 1.0,
            "num_keep": 0,
            "num_predict": 2048,
            "num_ctx": 16384,
            #"repeat_penalty": 1.1,
            "num_batch": 2048,
            #"mmap": True,
            #"mlock": False,
            #"num_thread": 4,
            #"num_gpu": 1
        }

        # Optimized OCR prompt
        self.system_prompt = ("""You are tasked with performing a high-precision OCR extraction on a partially cropped screenshot of a technical web interface.

GOAL: Extract **all visible and partially visible text** from the image, no matter how small, faint, or cropped. Structure the output clearly to maximize usability.

FORMAT THE OUTPUT AS FOLLOWS:

---

1. PAGE STRUCTURE
* Page title or headers (e.g., test names, document references)
* Section labels or grouping titles
* URL or navigation path (if visible)

2. LABORATORY DATA
* Sample references or IDs (e.g., 25-00075)
* Material descriptions (e.g., Sable 0/2 C)
* Dates, operators (e.g., 02/04/2025 – BOLLÉE Victor)

3. UI ELEMENTS
* Sidebar text, tabs, and form labels
* Buttons (e.g., RAZ, IMPRIMER, ENREGISTRER)
* Navigation or functional labels (e.g., ESSAI, SMQ)

4. SYSTEM MESSAGES & ERRORS
* Any status or error message (e.g., "Impossible de trouver l'adresse IP du serveur de zkt1.brg-lab.com")
* Domain names, IPs, server paths

5. METADATA
* Document standards (e.g., NF EN 933-9)
* Version/date (e.g., 02-2022)
* Any hidden identifiers, codes, or repeated numbers

6. PERIPHERAL / UNCERTAIN TEXT
* Logos (e.g., BRG-LAB), watermarks, faded/blurred text
* Partially cropped lines: use “[...]” to indicate incomplete text

RULES:
- Do NOT interpret or translate.
- Preserve all punctuation, accents, and casing.
- Include duplicated text if present multiple times.
- Mark unclear/partial lines explicitly.
- Report even faint or background text.

This is a high-precision extraction. Output clean bullet points grouped by section. Stay fully factual.
""")

        self._configurer_llm()
        self.resultats = []
        self.images_traitees = set()
        logger.info("AgentVisionOCR initialisé avec prompt amélioré.")

    def _configurer_llm(self):
        # Propagate the system prompt and generation parameters to the LLM
        # wrapper when it exposes the corresponding attribute / method.
        if hasattr(self.llm, "prompt_system"):
            self.llm.prompt_system = self.system_prompt
        if hasattr(self.llm, "configurer"):
            self.llm.configurer(**self.params)

    def _extraire_ticket_id(self, image_path):
        if not image_path:
            return "UNKNOWN"
        segments = image_path.replace('\\', '/').split('/')
        for segment in segments:
            if segment.startswith('T') and segment[1:].isdigit():
                return segment
            if segment.startswith('ticket_T') and segment[8:].isdigit():
                return 'T' + segment[8:]
        return "UNKNOWN"
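
    # Illustrative examples for _extraire_ticket_id (paths below are made up,
    # the mappings follow directly from the logic above):
    #   "captures/ticket_T11143/ecran.png" -> "T11143"
    #   "captures/T11143/ecran.png"        -> "T11143"
    #   "captures/autre/ecran.png"         -> "UNKNOWN"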

    def executer(self, image_path: str, ocr_baseline: str = "", ticket_id: Optional[str] = None) -> dict:
        image_path_abs = os.path.abspath(image_path)
        image_name = os.path.basename(image_path)

        # Skip images already processed during this run to avoid duplicate OCR calls.
        if image_path_abs in self.images_traitees:
            logger.warning(f"[OCR-LLM] Image déjà traitée, ignorée: {image_name}")
            print(f" AgentVisionOCR: Image déjà traitée, ignorée: {image_name}")
            return {
                "extracted_text": "DUPLICATE - Already processed",
                "image_name": image_name,
                "image_path": image_path_abs,
                "ticket_id": ticket_id or self._extraire_ticket_id(image_path),
                "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "source_agent": self.nom,
                "is_duplicate": True
            }

        self.images_traitees.add(image_path_abs)
        logger.info(f"[OCR-LLM] Extraction OCR sur {image_name}")
        print(f" AgentVisionOCR: Extraction OCR sur {image_name}")

        ticket_id = ticket_id or self._extraire_ticket_id(image_path)

        try:
            if not os.path.exists(image_path):
                raise FileNotFoundError(f"Image introuvable: {image_path}")

            if not hasattr(self.llm, "interroger_avec_image"):
                raise RuntimeError("Le modèle ne supporte pas l'analyse d'images.")

            # Query the model
            response = self.llm.interroger_avec_image(image_path, self.system_prompt)

            if not response or "i cannot" in response.lower():
                raise ValueError("Réponse vide ou invalide du modèle")

            cleaned_text = response.strip()

            # Normalize the model name for reporting
            model_name = getattr(self.llm, "pipeline_normalized_name",
                                 getattr(self.llm, "modele", "llama3-vision-90b-instruct"))
            model_name = model_name.replace(".", "-").replace(":", "-").replace("_", "-")
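            # Illustrative: "llama3.2-vision:90b" would become "llama3-2-vision-90b".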

            result = {
                "extracted_text": cleaned_text,
                "image_name": image_name,
                "image_path": image_path_abs,
                "ticket_id": ticket_id,
                "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "source_agent": self.nom,
                "model_info": {
                    "model": model_name,
                    **self.params
                }
            }

            self.resultats.append(result)
            logger.info(f"[OCR-LLM] OCR réussi ({len(cleaned_text)} caractères) pour {image_name}")
            return result

        except Exception as e:
            error_result = {
                "extracted_text": "",
                "image_name": image_name,
                "image_path": image_path_abs,
                "ticket_id": ticket_id or "UNKNOWN",
                "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "source_agent": self.nom,
                "error": str(e),
                "model_info": {
                    "model": getattr(self.llm, "pipeline_normalized_name", "llama3-vision-90b-instruct"),
                    **self.params
                }
            }
            self.resultats.append(error_result)
            logger.error(f"[OCR-LLM] Erreur OCR pour {image_name}: {e}")
            return error_result

    def sauvegarder_resultats(self, ticket_id: str = "T11143") -> None:
        if not self.resultats:
            logger.warning("[OCR-LLM] Aucun résultat à sauvegarder")
            return

        # Deduplicate by image path, keeping the most recent result for each image.
        resultats_dedupliques = {}
        for resultat in self.resultats:
            image_path = resultat.get("image_path")
            if not image_path:
                continue
            if image_path not in resultats_dedupliques or \
               resultat.get("timestamp", "") > resultats_dedupliques[image_path].get("timestamp", ""):
                resultats_dedupliques[image_path] = resultat

        resultats_finaux = list(resultats_dedupliques.values())

        try:
            logger.info(f"[OCR-LLM] Sauvegarde de {len(resultats_finaux)} résultats")
            sauvegarder_donnees(
                ticket_id=ticket_id,
                step_name="ocr_llm",
                data=resultats_finaux,
                base_dir=None,
                is_resultat=True
            )
            self.resultats = []
            self.images_traitees = set()
        except Exception as e:
            logger.error(f"[OCR-LLM] Erreur sauvegarde résultats: {e}")
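
# ---------------------------------------------------------------------------
# Minimal usage sketch (kept commented out: this module uses relative imports
# and is meant to run inside its package). The fake wrapper, the image path
# and the returned text are illustrative assumptions about the real LLM
# interface, which must expose prompt_system, configurer(**params) and
# interroger_avec_image(image_path, prompt).
# ---------------------------------------------------------------------------
# class _FakeVisionLLM:
#     pipeline_normalized_name = "llama3-vision-90b-instruct"
#
#     def __init__(self):
#         self.prompt_system = ""
#
#     def configurer(self, **params):
#         self.params = params
#
#     def interroger_avec_image(self, image_path, prompt):
#         return "1. PAGE STRUCTURE\n* Essai au bleu [...]"
#
# agent = AgentVisionOCR(_FakeVisionLLM())
# resultat = agent.executer("captures/ticket_T11143/ecran.png")
# print(resultat["ticket_id"], resultat.get("error", "ok"))
# agent.sauvegarder_resultats(ticket_id=resultat["ticket_id"])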