#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Agent for optical character recognition (OCR) in images """ import os import time import uuid import re from typing import Dict, Optional, List, Any, Union import pytesseract from PIL import Image import io import platform from .base import LLMBaseAgent class OCRAgent(LLMBaseAgent): """ Agent for optical character recognition (OCR) """ def __init__(self, model_name: str = "ocr", endpoint: str = "", **config): """ Initialize the OCR agent Args: model_name (str): Model name (default "ocr" as OCR doesn't use LLM models) endpoint (str): API endpoint (not used for OCR) **config: Additional configuration like language, etc. """ # Appeler le constructeur parent avec les paramètres requis super().__init__(model_name, endpoint, **config) # Default configuration for OCR default_config = { "language": "fra", # Default language: French "tesseract_config": "--psm 1 --oem 3", # Default Tesseract config } # Merge with provided configuration self.config.update(default_config) for key, value in default_config.items(): if key not in self.config: self.config[key] = value # Windows-specific configuration if platform.system() == "Windows": # Possible paths for Tesseract on Windows possible_paths = [ r"C:\Program Files\Tesseract-OCR\tesseract.exe", r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe", r"C:\Tesseract-OCR\tesseract.exe", r"C:\Users\PCDEV\AppData\Local\Programs\Tesseract-OCR\tesseract.exe", r"C:\Users\PCDEV\Tesseract-OCR\tesseract.exe" ] # Look for Tesseract in possible paths tesseract_path = None for path in possible_paths: if os.path.exists(path): tesseract_path = path break # Configure pytesseract with the found path if tesseract_path: self.config["tesseract_path"] = tesseract_path pytesseract.pytesseract.tesseract_cmd = tesseract_path print(f"Tesseract found at: {tesseract_path}") else: print("WARNING: Tesseract was not found in standard paths.") print("Please install Tesseract OCR from: https://github.com/UB-Mannheim/tesseract/wiki") print("Or manually specify the path with the tesseract_path parameter") # If a path is provided in the configuration, use it anyway if "tesseract_path" in self.config: pytesseract.pytesseract.tesseract_cmd = self.config["tesseract_path"] # Create directory for OCR logs self.log_dir = os.path.join("data", "ocr_logs") os.makedirs(self.log_dir, exist_ok=True) def generate(self, prompt: str = "", images: Optional[List[bytes]] = None) -> str: """ Perform optical character recognition on provided images Args: prompt (str, optional): Not used for OCR images (List[bytes], optional): List of images to process in bytes Returns: str: Text extracted from images """ if not images: return "Error: No images provided for OCR" results = [] image_count = len(images) # Generate unique ID for this OCR session ocr_id = str(uuid.uuid4())[:8] timestamp = time.strftime("%Y%m%d-%H%M%S") for i, img_bytes in enumerate(images): try: # Open image from bytes img = Image.open(io.BytesIO(img_bytes)) # Perform OCR with Tesseract lang = self.config.get("language", "fra") config = self.config.get("tesseract_config", "--psm 1 --oem 3") text = pytesseract.image_to_string(img, lang=lang, config=config) # Basic text cleaning text = self._clean_text(text) if text: results.append(text) # Save image and OCR result image_path = os.path.join(self.log_dir, f"{timestamp}_{ocr_id}_img{i+1}.png") img.save(image_path, "PNG") # Save extracted text text_path = os.path.join(self.log_dir, f"{timestamp}_{ocr_id}_img{i+1}_ocr.txt") with open(text_path, "w", encoding="utf-8") as f: f.write(f"OCR Language: {lang}\n") f.write(f"Tesseract config: {config}\n\n") f.write(text) print(f"OCR performed on image {i+1}/{image_count}, saved to: {text_path}") except Exception as e: error_msg = f"Error processing image {i+1}: {str(e)}" print(error_msg) # Log the error error_path = os.path.join(self.log_dir, f"{timestamp}_{ocr_id}_img{i+1}_error.txt") with open(error_path, "w", encoding="utf-8") as f: f.write(f"Error processing image {i+1}:\n{str(e)}") # Add error message to results results.append(f"[OCR Error on image {i+1}: {str(e)}]") # Combine all extracted texts if not results: return "No text could be extracted from the provided images." combined_result = "\n\n".join(results) # Save combined result combined_path = os.path.join(self.log_dir, f"{timestamp}_{ocr_id}_combined.txt") with open(combined_path, "w", encoding="utf-8") as f: f.write(f"OCR Language: {self.config.get('language', 'fra')}\n") f.write(f"Number of images: {image_count}\n\n") f.write(combined_result) return combined_result def _clean_text(self, text: str) -> str: """ Clean the text extracted by OCR Args: text (str): Raw text to clean Returns: str: Cleaned text """ if not text: return "" # Remove spaces at beginning and end text = text.strip() # Remove multiple empty lines text = re.sub(r'\n{3,}', '\n\n', text) # Remove non-printable characters text = ''.join(c for c in text if c.isprintable() or c == '\n') return text