llm_ticket3/utils/test_ocr_utils.py
2025-05-06 16:48:43 +02:00

69 lines
2.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
from pathlib import Path
from shutil import copyfile
from PIL import Image
from ocr_utils import pytesseract, AUTO_DETECT_LANGUAGE, completer_mots_tronques
from ocr_preprocessor import preprocess_image, PREPROCESSING_PROFILES
# --- Configuration of the OCR comparison run ---------------------------------

# Folder scanned for input images when the script is executed directly.
DOSSIER_IMAGES = "images_test"

# Root folder under which one sub-folder per image / profile is written.
DOSSIER_RESULTATS = "ocr_test_results"

# File-name suffixes (lower-case, leading dot) accepted as images.
EXTENSIONS_AUTORISEES = [
    ".jpg",
    ".jpeg",
    ".png",
    ".tiff",
    ".bmp",
]

# Keys looked up in PREPROCESSING_PROFILES; each one is tried on every image.
PROFILS_A_TESTER = [
    "default",
    "document",
    "aggressive",
]

# Values passed to Tesseract's ``--psm`` flag, one OCR pass per value.
# Per the Tesseract documentation: 3 = fully automatic segmentation,
# 4 = single column, 6 = single uniform block, 11 = sparse text,
# 12 = sparse text with orientation/script detection.
PSM_MODES = [3, 4, 6, 11, 12]

# Value passed to Tesseract's ``--oem`` flag (3 = default engine selection,
# per the Tesseract documentation).
OEM = 3
def est_image_valide(fichier):
    """Tell whether a file name carries one of the allowed image extensions.

    Args:
        fichier: File name or path, as a string.

    Returns:
        True if the lower-cased name ends with any entry of
        EXTENSIONS_AUTORISEES, False otherwise.
    """
    # str.endswith accepts a tuple of candidates, so one call covers them all.
    suffixes = tuple(EXTENSIONS_AUTORISEES)
    nom_minuscule = fichier.lower()
    return nom_minuscule.endswith(suffixes)
def detect_lang(img: Image.Image):
    """Pick the Tesseract language string to use for an image.

    The image itself is not inspected: the choice depends only on the
    AUTO_DETECT_LANGUAGE flag imported from ``ocr_utils``.

    Args:
        img: The image about to be OCR'd (currently unused).

    Returns:
        "fra+eng" when AUTO_DETECT_LANGUAGE is truthy, otherwise "eng".
    """
    if AUTO_DETECT_LANGUAGE:
        return "fra+eng"
    return "eng"
def lancer_tests(dossier_images):
    """Run every preprocessing profile and PSM mode on each image of a folder.

    For each image and each profile in PROFILS_A_TESTER, results are written
    to ``DOSSIER_RESULTATS/<image stem>/<profile>/``: a copy of the source
    image, the preprocessed image, and one ``psm<N>.txt`` per PSM mode.

    Failures are reported on stdout and do not stop the run: an error while
    preparing a profile skips that profile only, and an error in one PSM mode
    skips that mode only.

    Args:
        dossier_images: Path (str or Path) of the folder holding the images.
            Only its direct children are considered.

    Returns:
        None. Progress is printed and results are written to disk.
    """
    # Keep regular files only: a directory named e.g. "scans.png" would pass
    # the extension check and then fail during the copy. Sorting makes the
    # processing order reproducible from one run to the next.
    images = sorted(
        f
        for f in Path(dossier_images).glob("*")
        if f.is_file() and est_image_valide(str(f))
    )
    if not images:
        print("❌ Aucune image valide trouvée.")
        return

    for image_path in images:
        print(f"\n🖼️ Traitement de l'image : {image_path.name}")
        for profil in PROFILS_A_TESTER:
            print(f" 🔧 Profil de prétraitement : {profil}")
            # Best-effort boundary: one broken profile (unknown key, failed
            # copy, preprocessing error) must not abort the other profiles
            # or the other images.
            try:
                _traiter_profil(image_path, profil)
            except Exception as e:
                print(f" ❌ Erreur avec profil {profil}: {e}")


def _traiter_profil(image_path, profil):
    """Preprocess one image with one profile, then OCR it in every PSM mode."""
    settings = PREPROCESSING_PROFILES[profil].copy()
    dossier_profil = Path(DOSSIER_RESULTATS) / image_path.stem / profil
    dossier_profil.mkdir(parents=True, exist_ok=True)

    # Copy of the original image (once per profile).
    # NOTE(review): the bytes are copied verbatim, so for a .jpg/.tiff/.bmp
    # source the file named "image.png" is not actually PNG data. The name is
    # kept because other tooling may rely on it - confirm before changing.
    copyfile(image_path, dossier_profil / "image.png")

    img_prep = preprocess_image(str(image_path), **settings)
    img_prep.save(dossier_profil / "optimized.png")
    ocr_lang = detect_lang(img_prep)

    for psm in PSM_MODES:
        # No newline yet: the result (or the error) completes this line.
        print(f" ⚙️ PSM={psm}...", end="")
        try:
            nb_caracteres = _ocr_vers_fichier(
                img_prep, psm, ocr_lang, dossier_profil / f"psm{psm}.txt"
            )
        except Exception as e:
            # Isolate the failure so the remaining PSM modes still run.
            print(f" ❌ Erreur avec PSM {psm}: {e}")
        else:
            print(f"{nb_caracteres} caractères.")


def _ocr_vers_fichier(img_prep, psm, ocr_lang, fichier_sortie):
    """OCR one image in one PSM mode, save the text, return its length."""
    config = f"--psm {psm} --oem {OEM} -l {ocr_lang} -c preserve_interword_spaces=1"
    texte = pytesseract.image_to_string(img_prep, config=config)

    # Keep the post-processed text only if it is at least 90 % as long as the
    # raw OCR output; otherwise fall back to the raw text. (Presumably a guard
    # against the completion helper dropping content - confirm in ocr_utils.)
    texte_corrige = completer_mots_tronques(texte)
    if len(texte_corrige) >= len(texte) * 0.9:
        texte = texte_corrige

    texte_final = texte.strip()
    Path(fichier_sortie).write_text(texte_final, encoding="utf-8")
    return len(texte_final)
if __name__ == "__main__":
    # Script entry point: ensure the results root exists, then process the
    # default image folder.
    Path(DOSSIER_RESULTATS).mkdir(parents=True, exist_ok=True)
    lancer_tests(DOSSIER_IMAGES)