# Pasted 2025-05-07 16:02:39 +02:00 — 118 lines, 4.0 KiB, Python

import pytesseract
import cv2
import numpy as np
import os
from pathlib import Path
from PIL import Image
from langdetect import detect
import re
from ocr_preprocessor import preprocess_image
# ⬇️ CENTRAL FEATURE TOGGLES ⬇️
USE_PREPROCESSING = True  # Run image preprocessing before OCR
USE_TEXT_CORRECTION = True  # Fix hyphen-truncated words after OCR
USE_IMAGE_RESIZE = False  # Upscale the image if it is too small
SAVE_DEBUG_OUTPUT = False  # Save image + text into debug_ocr/
AUTO_DETECT_LANGUAGE = True  # Auto-detect the document language
# Re-join words hyphenated across line breaks (rudimentary but useful)
def completer_mots_tronques(texte):
    """Repair words that OCR left split with a trailing hyphen.

    The original version only removed the hyphen, which left the word
    still broken across two lines.  A line ending in "-" is now merged
    with the following line so the word becomes whole again; a dangling
    hyphen on the last line is simply dropped.

    Args:
        texte: Raw OCR output, possibly multi-line.

    Returns:
        The corrected text, lines joined with "\n".
    """
    lignes = texte.splitlines()
    lignes_corrigees = []
    i = 0
    while i < len(lignes):
        ligne = lignes[i]
        if ligne.strip().endswith("-"):
            # Drop the hyphen, then glue the next line on to rejoin the word.
            ligne = ligne.strip()[:-1]
            if i + 1 < len(lignes):
                ligne += lignes[i + 1].lstrip()
                i += 1  # the next line was consumed by the merge
        lignes_corrigees.append(ligne)
        i += 1
    return "\n".join(lignes_corrigees)
# Image preprocessing
def pretraiter_image(image_path, optimize_for_text=True):
    """Load an image from disk and return a grayscale version tuned for OCR.

    With ``optimize_for_text`` a light Gaussian blur followed by adaptive
    thresholding binarizes the page; otherwise histogram equalization is
    applied instead.  When USE_IMAGE_RESIZE is enabled, images smaller
    than 1000px on either side are upscaled proportionally.

    Returns None when the file cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        return None
    gris = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    if not optimize_for_text:
        gris = cv2.equalizeHist(gris)
    else:
        gris = cv2.GaussianBlur(gris, (3, 3), 0)
        gris = cv2.adaptiveThreshold(
            gris, 255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 31, 15,
        )
    if USE_IMAGE_RESIZE:
        hauteur, largeur = gris.shape
        if largeur < 1000 or hauteur < 1000:
            facteur = max(1000 / largeur, 1000 / hauteur)
            gris = cv2.resize(gris, (0, 0), fx=facteur, fy=facteur)
    return gris
# Automatic language detection (if enabled)
def detect_language_tesseract(image_cv):
    """Guess the Tesseract language pack from a quick OCR sample.

    Runs a fast ``--psm 6`` Tesseract pass, feeds the sample to
    ``langdetect.detect`` and maps the ISO code to a Tesseract language.
    Falls back to "fra+eng" for any unmapped language or on any
    detection failure (empty text, OCR error, langdetect exception).
    """
    try:
        text_sample = pytesseract.image_to_string(image_cv, config="--psm 6")
        lang = detect(text_sample)
        return {
            "fr": "fra",
            "en": "eng",
        }.get(lang, "fra+eng")
    except Exception:
        # Was a bare `except:`, which also trapped SystemExit and
        # KeyboardInterrupt; narrowed to Exception.
        return "fra+eng"
# Main OCR entry point
def extraire_texte(image_path, lang="auto"):
    """Run OCR on an image file and return its text.

    Args:
        image_path: Path to the image on disk.
        lang: Tesseract language string (e.g. "fra", "eng", "fra+eng"),
            or "auto" to let the module pick one.

    Returns:
        ``(texte, image)`` where ``texte`` is the extracted string and
        ``image`` is the preprocessed image actually fed to Tesseract
        (converted to a BGR numpy array when debug output is saved), or
        ``("", None)`` when the file cannot be read.
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"[OCR] Image non trouvée: {image_path}")
        return "", None
    img_standard = preprocess_image(image_path) if USE_PREPROCESSING else Image.open(image_path)
    img_optimized = img_standard  # same preprocessing used for both in this case
    ocr_lang = lang
    if lang == "auto" and AUTO_DETECT_LANGUAGE:
        ocr_lang = detect_language_tesseract(img_standard)
    if ocr_lang == "auto":
        # "auto" requested but detection disabled: default to French + English.
        ocr_lang = "fra+eng"
    config = f"--psm 6 -l {ocr_lang}"
    texte = pytesseract.image_to_string(img_optimized, config=config)
    if USE_TEXT_CORRECTION:
        texte_corrige = completer_mots_tronques(texte)
        # Keep the correction only if it did not shrink the text by more
        # than 10% — guards against an over-aggressive cleanup.
        if len(texte_corrige) >= len(texte) * 0.9:
            texte = texte_corrige
    if SAVE_DEBUG_OUTPUT and texte:
        try:
            debug_dir = "debug_ocr"
            os.makedirs(debug_dir, exist_ok=True)
            image_name = Path(image_path).stem
            # Convert PIL images to OpenCV BGR arrays before imwrite
            # (note: this rebinds img_optimized, so the converted array
            # is also what gets returned below).
            if isinstance(img_optimized, Image.Image):
                img_optimized = np.array(img_optimized)
                if img_optimized.ndim == 3 and img_optimized.shape[2] == 3:
                    img_optimized = cv2.cvtColor(img_optimized, cv2.COLOR_RGB2BGR)
                elif img_optimized.ndim == 3 and img_optimized.shape[2] == 4:
                    img_optimized = cv2.cvtColor(img_optimized, cv2.COLOR_RGBA2BGR)
            if isinstance(img_optimized, np.ndarray):
                cv2.imwrite(f"{debug_dir}/optimized_{image_name}.png", img_optimized)
            with open(f"{debug_dir}/ocr_{image_name}.png.txt", "w", encoding="utf-8") as f:
                f.write(texte)
        except Exception as e:
            # Debug saving is best-effort; never let it break the OCR result.
            print(f"[OCR DEBUG] Erreur de sauvegarde debug: {e}")
    return texte, img_optimized
# Convenience shortcut: French-only text extraction
def extraire_texte_fr(image_path):
    """Return only the OCR text of *image_path*, forcing French."""
    return extraire_texte(image_path, lang="fra")[0]