# llm_ticket3/utils/test_ocr_utils.py

import os
from pathlib import Path
from shutil import copyfile
from PIL import Image
from ocr_utils import pytesseract, AUTO_DETECT_LANGUAGE, completer_mots_tronques
from ocr_preprocessor import preprocess_image, PREPROCESSING_PROFILES

# === CONFIGURATION ===
# Directory scanned for input images, and root directory receiving the results.
DOSSIER_IMAGES = "images_test"
DOSSIER_RESULTATS = "ocr_test_results"

# Lower-case file suffixes accepted as images.
EXTENSIONS_AUTORISEES = [
    ".jpg",
    ".jpeg",
    ".png",
    ".tiff",
    ".bmp",
]

# Keys of PREPROCESSING_PROFILES exercised for every image.
PROFILS_A_TESTER = [
    "default",
    "document",
    "aggressive",
]

# Values passed to Tesseract through `--psm` (one OCR run per value) and `--oem`.
PSM_MODES = [
    3,
    4,
    6,
    11,
    12,
]
OEM = 3

def est_image_valide(fichier):
    """Tell whether a file name carries one of the accepted image suffixes.

    Args:
        fichier: File name or path, as a string.

    Returns:
        True when the lower-cased name ends with an entry of
        EXTENSIONS_AUTORISEES, False otherwise.
    """
    nom_minuscule = fichier.lower()
    for extension in EXTENSIONS_AUTORISEES:
        if nom_minuscule.endswith(extension):
            return True
    return False

def detect_lang(img: Image.Image):
    """Pick the Tesseract language string used for OCR.

    The choice depends only on the AUTO_DETECT_LANGUAGE flag; the image
    itself is not inspected.

    Args:
        img: Image about to be OCR-ed (currently unused).

    Returns:
        "fra+eng" when AUTO_DETECT_LANGUAGE is truthy, "eng" otherwise.
    """
    if AUTO_DETECT_LANGUAGE:
        return "fra+eng"
    return "eng"

def _lister_images(dossier_images):
    """Return the image files found directly inside *dossier_images*, sorted by path.

    Only regular files are kept, so a directory whose name happens to end
    with an image suffix is ignored instead of crashing the copy step later.
    Sorting makes the processing order reproducible between runs.
    """
    return sorted(
        chemin
        for chemin in Path(dossier_images).glob("*")
        if chemin.is_file() and est_image_valide(str(chemin))
    )


def _ocr_et_sauvegarder(img_prep, ocr_lang, psm, dossier_profil):
    """Run one OCR pass with the given PSM and write its text to ``psm{psm}.txt``.

    Returns the number of characters written (text stripped of surrounding
    whitespace).
    """
    config = f"--psm {psm} --oem {OEM} -l {ocr_lang} -c preserve_interword_spaces=1"
    texte = pytesseract.image_to_string(img_prep, config=config)

    # Keep the word-completed text only if it did not lose more than ~10 %
    # of the raw OCR output's length.
    texte_corrige = completer_mots_tronques(texte)
    if len(texte_corrige) >= len(texte) * 0.9:
        texte = texte_corrige

    contenu = texte.strip()
    (dossier_profil / f"psm{psm}.txt").write_text(contenu, encoding="utf-8")
    return len(contenu)


def lancer_tests(dossier_images):
    """Run every preprocessing profile and PSM mode on each image of a directory.

    For each image and each profile of PROFILS_A_TESTER, a directory
    ``DOSSIER_RESULTATS/<image stem>/<profile>/`` is created containing:

    * ``image.png``     - a byte-for-byte copy of the source file (the name is
      fixed, whatever the source format),
    * ``optimized.png`` - the preprocessed image,
    * ``psm<N>.txt``    - the OCR text for each value N of PSM_MODES.

    Progress and errors are reported on stdout. A failure is confined to the
    profile or PSM mode where it happened: the remaining combinations are
    still processed.

    Args:
        dossier_images: Path of the directory to scan (non-recursive).

    Returns:
        None.
    """
    images = _lister_images(dossier_images)

    if not images:
        print("❌ Aucune image valide trouvée.")
        return

    for image_path in images:
        base_name = image_path.stem
        print(f"\n🖼️ Traitement de l'image : {image_path.name}")

        for profil in PROFILS_A_TESTER:
            print(f"  🔧 Profil de prétraitement : {profil}")

            # Batch boundary: any failure while preparing this profile
            # (unknown profile key, unwritable output directory, unreadable
            # image, preprocessing error) is reported and the run continues
            # with the next profile.
            try:
                settings = PREPROCESSING_PROFILES[profil].copy()

                dossier_profil = Path(DOSSIER_RESULTATS) / base_name / profil
                dossier_profil.mkdir(parents=True, exist_ok=True)

                # Copy of the original image (once per profile).
                copyfile(image_path, dossier_profil / "image.png")

                img_prep = preprocess_image(str(image_path), **settings)
                img_prep.save(dossier_profil / "optimized.png")

                ocr_lang = detect_lang(img_prep)
            except Exception as e:
                print(f"    ❌ Erreur avec profil {profil}: {e}")
                continue

            for psm in PSM_MODES:
                print(f"    ⚙️ PSM={psm}...", end="")
                # One failing PSM mode must not prevent the other modes from
                # being tried for this profile.
                try:
                    nb_caracteres = _ocr_et_sauvegarder(
                        img_prep, ocr_lang, psm, dossier_profil
                    )
                except Exception as e:
                    # Also terminates the progress line opened just above.
                    print(f" ❌ Erreur avec profil {profil}: {e}")
                    continue

                print(f" ✅ {nb_caracteres} caractères.")

# Script entry point: make sure the results root exists, then process the
# default image directory.
if __name__ == "__main__":
    Path(DOSSIER_RESULTATS).mkdir(parents=True, exist_ok=True)
    lancer_tests(DOSSIER_IMAGES)