# llm_ticket3/utils/ocr_brut_test/test_ocr_utils.py

import os
from pathlib import Path
from shutil import copyfile
from PIL import Image
from ocr_utils import pytesseract, AUTO_DETECT_LANGUAGE, completer_mots_tronques
from ocr_preprocessor import preprocess_image, PREPROCESSING_PROFILES

# === CONFIGURATION ===
# Directory scanned for input images.
DOSSIER_IMAGES = "images_test"
# Root directory under which all result files are written.
DOSSIER_RESULTATS = "ocr_test_results"
# Accepted image file extensions (lowercase, with the leading dot).
EXTENSIONS_AUTORISEES = [".jpg", ".jpeg", ".png", ".tiff", ".bmp"]

# Preprocessing profiles to test (used as keys into PREPROCESSING_PROFILES).
PROFILS_A_TESTER = ["default", "document", "aggressive", "clahe_high", "invert_light"]

# OCR configurations to test. Each entry supplies the Tesseract ``--psm`` and
# ``--oem`` values, plus an optional "extra" string appended to the command
# line. Entries must be unique: the output file name is derived from
# psm/oem/presence of "extra", so a repeated entry would only redo the same
# OCR and overwrite the same file.
TEST_CONFIGS = [
    {"psm": 3, "oem": 3},
    {"psm": 6, "oem": 3},  # neutral
    {"psm": 11, "oem": 3},
    {"psm": 4, "oem": 3, "extra": "-c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"},
    {"psm": 12, "oem": 3},
    {"psm": 6, "oem": 1},  # neutral
]

def est_image_valide(fichier):
    """Return True when ``fichier`` ends with an allowed image extension.

    The comparison is case-insensitive: the name is lowercased before being
    checked against ``EXTENSIONS_AUTORISEES``.

    Args:
        fichier: File name or path, as a string.
    """
    nom_minuscule = fichier.lower()
    # str.endswith accepts a tuple of candidate suffixes.
    return nom_minuscule.endswith(tuple(EXTENSIONS_AUTORISEES))

def detect_lang(img: Image.Image):
    """Return the Tesseract language string to use for OCR.

    The image argument is accepted but not inspected: the result depends only
    on the ``AUTO_DETECT_LANGUAGE`` flag.

    Returns:
        "fra+eng" when ``AUTO_DETECT_LANGUAGE`` is truthy, otherwise "eng".
    """
    if AUTO_DETECT_LANGUAGE:
        return "fra+eng"
    return "eng"

def config_to_filename(cfg):
    """Build the result file name for one OCR configuration.

    Args:
        cfg: Mapping with "psm" and "oem" keys and an optional "extra" key.

    Returns:
        ``psm<psm>_oem<oem>.txt``, with ``_extra`` inserted before the
        extension when the "extra" key is present (whatever its value).
    """
    morceaux = [f"psm{cfg['psm']}", f"oem{cfg['oem']}"]
    if "extra" in cfg:
        morceaux.append("extra")
    return "_".join(morceaux) + ".txt"

def lancer_tests(dossier_images):
    """Run each preprocessing profile and OCR configuration on every image.

    For every valid image file directly inside ``dossier_images`` and every
    profile in ``PROFILS_A_TESTER``, the following files are written under
    ``DOSSIER_RESULTATS/<image stem>/<profile>/``:

    * ``image.png``: a byte-for-byte copy of the source image,
    * ``optimized_<profile>.png``: the preprocessed image,
    * one text file per OCR configuration, named by ``config_to_filename``.

    Progress and errors are printed to stdout. A failure in one profile or in
    one OCR configuration is reported and does not stop the remaining work.

    Args:
        dossier_images: Directory scanned (non-recursively) for images.
    """
    # Sort for a reproducible processing order, and skip directories whose
    # name happens to end with an image extension.
    images = sorted(
        f for f in Path(dossier_images).glob("*")
        if f.is_file() and est_image_valide(str(f))
    )

    if not images:
        print("❌ Aucune image valide trouvée.")
        return

    for image_path in images:
        base_name = image_path.stem
        print(f"\n🖼️ Traitement de l'image : {image_path.name}")

        for profil in PROFILS_A_TESTER:
            print(f"  🔧 Profil de prétraitement : {profil}")

            # Everything needed to set up the profile is guarded, so an unknown
            # profile name or an I/O error only skips this profile.
            try:
                settings = PREPROCESSING_PROFILES[profil].copy()
                settings["profile_name"] = profil

                dossier_profil = Path(DOSSIER_RESULTATS) / base_name / profil
                dossier_profil.mkdir(parents=True, exist_ok=True)

                # Copy of the original image (once per profile).
                # NOTE(review): the copy is always named "image.png" even when
                # the source is not a PNG - confirm consumers rely on that name.
                copyfile(image_path, dossier_profil / "image.png")

                img_prep = preprocess_image(str(image_path), **settings)
                img_prep.save(dossier_profil / f"optimized_{profil}.png")

                ocr_lang = detect_lang(img_prep)
            except Exception as e:
                print(f"    ❌ Erreur avec profil {profil}: {e}")
                continue

            # Tesseract command lines already run for this image/profile.
            configs_vues = set()

            for cfg in TEST_CONFIGS:
                # Each configuration is isolated so that one failure does not
                # cancel the remaining configurations of the profile.
                try:
                    config = f"--psm {cfg['psm']} --oem {cfg['oem']} -l {ocr_lang} -c preserve_interword_spaces=1"
                    if "extra" in cfg:
                        config += " " + cfg["extra"]

                    # An identical command line would redo the same OCR and
                    # overwrite the same output file.
                    if config in configs_vues:
                        continue
                    configs_vues.add(config)

                    print(f"    ⚙️ Test PSM={cfg['psm']} / OEM={cfg['oem']}...", end="")

                    texte = pytesseract.image_to_string(img_prep, config=config)
                    texte_corrige = completer_mots_tronques(texte)

                    # Keep the corrected text only when it retains at least
                    # 90% of the raw length (presumably a guard against a
                    # correction that drops content - TODO confirm).
                    if len(texte_corrige) >= len(texte) * 0.9:
                        texte = texte_corrige
                    texte = texte.strip()

                    nom_fichier = config_to_filename(cfg)
                    with open(dossier_profil / nom_fichier, "w", encoding="utf-8") as f:
                        f.write(texte)

                    print(f" ✅ {len(texte)} caractères.")
                except Exception as e:
                    print(f" ❌ Erreur : {e}")

# Script entry point: make sure the results root exists, then run the whole
# test matrix on the default image directory.
if __name__ == "__main__":
    Path(DOSSIER_RESULTATS).mkdir(parents=True, exist_ok=True)
    lancer_tests(DOSSIER_IMAGES)