mirror of
https://github.com/Ladebeze66/llm_ticket3.git
synced 2025-12-15 19:46:53 +01:00
90 lines
3.3 KiB
Python
90 lines
3.3 KiB
Python
import os
|
||
from pathlib import Path
|
||
from shutil import copyfile
|
||
from PIL import Image
|
||
from ocr_utils import pytesseract, AUTO_DETECT_LANGUAGE, completer_mots_tronques
|
||
from ocr_preprocessor import preprocess_image, PREPROCESSING_PROFILES
|
||
|
||
# === CONFIGURATION ===
# Directory scanned (non-recursively) for the input images.
DOSSIER_IMAGES = "images_test"
# Root directory that receives one sub-folder per image and per profile.
DOSSIER_RESULTATS = "ocr_test_results"
# File suffixes accepted as images; compared against the lower-cased name.
EXTENSIONS_AUTORISEES = [".jpg", ".jpeg", ".png", ".tiff", ".bmp"]

# Preprocessing profiles to test (used as keys into PREPROCESSING_PROFILES).
PROFILS_A_TESTER = ["default", "document", "aggressive", "clahe_high", "invert_light"]

# OCR configurations to test.
# "psm" and "oem" are forwarded to Tesseract as --psm / --oem; the optional
# "extra" string is appended verbatim to the Tesseract config string.
# Every entry must be unique: the result file name is derived from psm/oem
# (plus an "_extra" marker), so a repeated entry would only redo the same OCR
# pass and overwrite the same output file.
TEST_CONFIGS = [
    {"psm": 3, "oem": 3},
    {"psm": 6, "oem": 3},
    {"psm": 11, "oem": 3},
    {"psm": 4, "oem": 3, "extra": "-c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"},
    {"psm": 12, "oem": 3},
    {"psm": 6, "oem": 1},  # neutral
]
|
||
|
||
def est_image_valide(fichier):
    """Tell whether ``fichier`` ends with one of the allowed image extensions.

    The file name is lower-cased before the comparison, so ``PHOTO.JPG`` is
    accepted just like ``photo.jpg``.

    Args:
        fichier: File name or path, as a string.

    Returns:
        True if the name ends with an entry of ``EXTENSIONS_AUTORISEES``.
    """
    nom_minuscule = fichier.lower()
    # str.endswith accepts a tuple of candidate suffixes.
    return nom_minuscule.endswith(tuple(EXTENSIONS_AUTORISEES))
|
||
|
||
def detect_lang(img: Image.Image):
    """Return the Tesseract language string to use for OCR.

    ``img`` is accepted but not inspected: the result depends only on the
    ``AUTO_DETECT_LANGUAGE`` flag.

    Returns:
        ``"fra+eng"`` when ``AUTO_DETECT_LANGUAGE`` is truthy, else ``"eng"``.
    """
    if AUTO_DETECT_LANGUAGE:
        return "fra+eng"
    return "eng"
|
||
|
||
def config_to_filename(cfg):
    """Build the result file name for one OCR configuration.

    Args:
        cfg: Mapping with ``"psm"`` and ``"oem"`` keys and an optional
            ``"extra"`` key.

    Returns:
        ``"psm<psm>_oem<oem>.txt"``, with ``"_extra"`` inserted before the
        suffix when the ``"extra"`` key is present (its value is not used).
    """
    morceaux = [f"psm{cfg['psm']}", f"oem{cfg['oem']}"]
    if "extra" in cfg:
        morceaux.append("extra")
    return "_".join(morceaux) + ".txt"
|
||
|
||
def _run_ocr_config(img_prep, cfg, ocr_lang, dossier_profil):
    """Run one OCR configuration on ``img_prep`` and save the resulting text.

    Args:
        img_prep: Preprocessed image handed to ``pytesseract.image_to_string``.
        cfg: Entry of ``TEST_CONFIGS`` (``"psm"``, ``"oem"``, optional ``"extra"``).
        ocr_lang: Language string passed to Tesseract with ``-l``.
        dossier_profil: ``Path`` of the directory receiving the text file.

    Returns:
        Number of characters written to the result file.
    """
    config = f"--psm {cfg['psm']} --oem {cfg['oem']} -l {ocr_lang} -c preserve_interword_spaces=1"
    if "extra" in cfg:
        config += " " + cfg["extra"]

    texte = pytesseract.image_to_string(img_prep, config=config)
    texte_corrige = completer_mots_tronques(texte)

    # Only adopt the corrected text when it kept at least 90 % of the raw
    # length; otherwise the raw OCR output is written.
    if len(texte_corrige) >= len(texte) * 0.9:
        texte = texte_corrige

    texte = texte.strip()
    (dossier_profil / config_to_filename(cfg)).write_text(texte, encoding="utf-8")
    return len(texte)


def lancer_tests(dossier_images):
    """Run every preprocessing profile / OCR configuration pair on each image.

    For each valid image found directly inside ``dossier_images`` and each
    profile of ``PROFILS_A_TESTER``, the following files are written to
    ``DOSSIER_RESULTATS/<image stem>/<profile>/``:

    * ``image.png``: a byte-for-byte copy of the source image,
    * ``optimized_<profile>.png``: the preprocessed image,
    * one text file per entry of ``TEST_CONFIGS`` (named by
      ``config_to_filename``) holding the stripped OCR output.

    Progress and errors are printed to stdout. A failure while preparing a
    profile skips that profile only; a failure of one OCR configuration skips
    that configuration only.

    Args:
        dossier_images: Directory scanned (non-recursively) for images.

    Returns:
        None.
    """
    # Keep regular files only (a directory whose name ends in ".png" cannot be
    # copied as a file) and sort them so that successive runs process the
    # images in the same order.
    images = sorted(
        f for f in Path(dossier_images).glob("*")
        if f.is_file() and est_image_valide(str(f))
    )

    if not images:
        print("❌ Aucune image valide trouvée.")
        return

    for image_path in images:
        base_name = image_path.stem
        print(f"\n🖼️ Traitement de l'image : {image_path.name}")

        for profil in PROFILS_A_TESTER:
            print(f" 🔧 Profil de prétraitement : {profil}")

            # Everything needed to prepare the profile is guarded together so
            # that an unknown profile name or an I/O error is reported and the
            # run moves on to the next profile instead of stopping.
            try:
                settings = PREPROCESSING_PROFILES[profil].copy()
                settings["profile_name"] = profil

                dossier_profil = Path(DOSSIER_RESULTATS) / base_name / profil
                dossier_profil.mkdir(parents=True, exist_ok=True)

                # Copy of the original image (once per profile).
                # NOTE(review): the copy is always named "image.png" whatever
                # the source format is; name kept for compatibility.
                copyfile(image_path, dossier_profil / "image.png")

                img_prep = preprocess_image(str(image_path), **settings)
                img_prep.save(dossier_profil / f"optimized_{profil}.png")

                ocr_lang = detect_lang(img_prep)
            except Exception as e:
                print(f" ❌ Erreur avec profil {profil}: {e}")
                continue

            for cfg in TEST_CONFIGS:
                print(f" ⚙️ Test PSM={cfg['psm']} / OEM={cfg['oem']}...", end="")
                # One failing configuration must not prevent the remaining
                # ones from being tried on this profile.
                try:
                    nb_caracteres = _run_ocr_config(img_prep, cfg, ocr_lang, dossier_profil)
                except Exception as e:
                    print(f" ❌ Erreur avec profil {profil}: {e}")
                    continue
                print(f" ✅ {nb_caracteres} caractères.")
|
||
|
||
# Script entry point: make sure the results root exists, then run the whole
# test matrix on the default image directory.
if __name__ == "__main__":
    Path(DOSSIER_RESULTATS).mkdir(parents=True, exist_ok=True)
    lancer_tests(DOSSIER_IMAGES)
|