llm_ticket3/utils/ocr_brut_test/test_ocr_utils.py
2025-05-07 16:02:39 +02:00

90 lines
3.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
from pathlib import Path
from shutil import copyfile
from PIL import Image
from ocr_utils import pytesseract, AUTO_DETECT_LANGUAGE, completer_mots_tronques
from ocr_preprocessor import preprocess_image, PREPROCESSING_PROFILES
# === CONFIGURATION ===
# Directory scanned for input images, and root directory for the OCR outputs.
DOSSIER_IMAGES = "images_test"
DOSSIER_RESULTATS = "ocr_test_results"
# Accepted image extensions. They must be lower-case because file names are
# lower-cased before being compared against this list.
EXTENSIONS_AUTORISEES = [".jpg", ".jpeg", ".png", ".tiff", ".bmp"]
# Preprocessing profiles to test (keys looked up in PREPROCESSING_PROFILES).
PROFILS_A_TESTER = ["default", "document", "aggressive", "clahe_high", "invert_light"]
# OCR configurations to test. "psm" and "oem" are forwarded as the --psm and
# --oem command-line options; the optional "extra" string is appended verbatim.
# Every entry must be unique: the result file name is derived from psm/oem
# (plus an "_extra" marker), so a repeated entry would only redo the same OCR
# run and overwrite its own output.
TEST_CONFIGS = [
    {"psm": 3, "oem": 3},
    {"psm": 6, "oem": 3},  # neutral
    {"psm": 11, "oem": 3},
    {"psm": 4, "oem": 3, "extra": "-c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"},
    {"psm": 12, "oem": 3},
    {"psm": 6, "oem": 1},  # neutral
]
def est_image_valide(fichier):
    """Return True when *fichier* ends with one of EXTENSIONS_AUTORISEES.

    The file name is lower-cased first, so the match is case-insensitive as
    long as the allowed extensions themselves are written in lower case.
    """
    nom_minuscule = fichier.lower()
    # str.endswith accepts a tuple and returns True if any suffix matches.
    return nom_minuscule.endswith(tuple(EXTENSIONS_AUTORISEES))
def detect_lang(img: Image.Image):
    """Return the language string passed to the OCR engine via ``-l``.

    The image itself is not inspected: the result depends only on the
    AUTO_DETECT_LANGUAGE flag imported from ocr_utils.
    """
    if AUTO_DETECT_LANGUAGE:
        return "fra+eng"
    return "eng"
def config_to_filename(cfg):
    """Build the result file name for one OCR configuration.

    The name has the form ``psm<psm>_oem<oem>.txt``; an ``_extra`` marker is
    inserted before the extension whenever the configuration has an "extra"
    key (the content of that key is not part of the name).
    """
    morceaux = [f"psm{cfg['psm']}", f"oem{cfg['oem']}"]
    if "extra" in cfg:
        morceaux.append("extra")
    return "_".join(morceaux) + ".txt"
def _tester_config_ocr(img_prep, cfg, ocr_lang, dossier_profil):
    """Run one OCR configuration on a preprocessed image and save the text.

    Args:
        img_prep: preprocessed image handed to pytesseract.
        cfg: one entry of TEST_CONFIGS ("psm", "oem", optional "extra").
        ocr_lang: language string forwarded through the ``-l`` option.
        dossier_profil: directory (Path) receiving the resulting text file.
    """
    print(f" ⚙️ Test PSM={cfg['psm']} / OEM={cfg['oem']}...", end="")
    config = f"--psm {cfg['psm']} --oem {cfg['oem']} -l {ocr_lang} -c preserve_interword_spaces=1"
    if "extra" in cfg:
        config += " " + cfg["extra"]
    texte = pytesseract.image_to_string(img_prep, config=config)
    texte_corrige = completer_mots_tronques(texte)
    # Only accept the corrected text when it kept at least 90% of the raw
    # length; otherwise fall back to the raw OCR output.
    if len(texte_corrige) >= len(texte) * 0.9:
        texte = texte_corrige
    texte = texte.strip()
    with open(dossier_profil / config_to_filename(cfg), "w", encoding="utf-8") as f:
        f.write(texte)
    print(f"{len(texte)} caractères.")


def lancer_tests(dossier_images):
    """Run every preprocessing profile and OCR configuration on each image.

    Files located directly inside *dossier_images* (no recursion) whose name
    passes est_image_valide are processed in sorted order. For each image and
    each profile of PROFILS_A_TESTER, the directory
    ``DOSSIER_RESULTATS/<image stem>/<profile>/`` receives a copy of the
    original image, the preprocessed image ``optimized_<profile>.png`` and one
    text file per entry of TEST_CONFIGS.

    Failures are reported on stdout and skipped: a failing profile does not
    stop the other profiles, and a failing OCR configuration does not stop the
    remaining configurations of the same profile.

    Args:
        dossier_images: path of the directory containing the images to test.

    Returns:
        None. All results are written to disk or printed.
    """
    # is_file() keeps directories that happen to be named like an image out of
    # the run; sorting makes the processing order reproducible.
    images = sorted(
        f
        for f in Path(dossier_images).glob("*")
        if f.is_file() and est_image_valide(str(f))
    )
    if not images:
        print("❌ Aucune image valide trouvée.")
        return
    for image_path in images:
        # NOTE(review): two images sharing a stem (e.g. a.jpg and a.png) end
        # up in the same result directory.
        base_name = image_path.stem
        print(f"\n🖼️ Traitement de l'image : {image_path.name}")
        for profil in PROFILS_A_TESTER:
            print(f" 🔧 Profil de prétraitement : {profil}")
            # Broad catch is deliberate: this is a best-effort test harness,
            # any failure while preparing a profile is reported and skipped.
            try:
                settings = PREPROCESSING_PROFILES[profil].copy()
                settings["profile_name"] = profil
                dossier_profil = Path(DOSSIER_RESULTATS) / base_name / profil
                dossier_profil.mkdir(parents=True, exist_ok=True)
                # Copy of the original image (once per profile).
                # NOTE(review): the bytes are copied unchanged under the name
                # "image.png" whatever the source format is.
                copyfile(image_path, dossier_profil / "image.png")
                img_prep = preprocess_image(str(image_path), **settings)
                img_prep.save(dossier_profil / f"optimized_{profil}.png")
                ocr_lang = detect_lang(img_prep)
            except Exception as e:
                print(f" ❌ Erreur avec profil {profil}: {e}")
                continue
            for cfg in TEST_CONFIGS:
                # One failing configuration must not hide the results of the
                # remaining ones, so each run has its own error boundary.
                try:
                    _tester_config_ocr(img_prep, cfg, ocr_lang, dossier_profil)
                except Exception as e:
                    print(f" ❌ Erreur avec PSM={cfg['psm']} / OEM={cfg['oem']}: {e}")
if __name__ == "__main__":
    # Script entry point: make sure the results root exists, then run the
    # whole test matrix on the configured image directory.
    Path(DOSSIER_RESULTATS).mkdir(parents=True, exist_ok=True)
    lancer_tests(DOSSIER_IMAGES)