"""Batch OCR benchmark.

For every image in ``DOSSIER_IMAGES``, apply each preprocessing profile in
``PROFILS_A_TESTER``, run Tesseract with each configuration in
``TEST_CONFIGS``, and write the recognised text under
``DOSSIER_RESULTATS/<image stem>/<profile>/``.
"""

import os
from pathlib import Path
from shutil import copyfile

from PIL import Image

from ocr_utils import pytesseract, AUTO_DETECT_LANGUAGE, completer_mots_tronques
from ocr_preprocessor import preprocess_image, PREPROCESSING_PROFILES

# === CONFIGURATION ===
DOSSIER_IMAGES = "images_test"
DOSSIER_RESULTATS = "ocr_test_results"
EXTENSIONS_AUTORISEES = [".jpg", ".jpeg", ".png", ".tiff", ".bmp"]

# Preprocessing profiles to test
PROFILS_A_TESTER = ["default", "document", "aggressive", "clahe_high", "invert_light"]

# OCR configurations to test.
# NOTE: {"psm": 6, "oem": 3} appears twice; exact duplicates are skipped at
# run time because both entries would write the same output file.
TEST_CONFIGS = [
    {"psm": 3, "oem": 3},
    {"psm": 6, "oem": 3},
    {"psm": 11, "oem": 3},
    {"psm": 4, "oem": 3, "extra": "-c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"},
    {"psm": 12, "oem": 3},
    {"psm": 6, "oem": 3},  # neutral
    {"psm": 6, "oem": 1},  # neutral
]


def est_image_valide(fichier):
    """Return True if *fichier* ends with one of ``EXTENSIONS_AUTORISEES``.

    The comparison is case-insensitive. *fichier* is a path or file name
    given as a string.
    """
    return fichier.lower().endswith(tuple(EXTENSIONS_AUTORISEES))


def detect_lang(img: Image.Image):
    """Return the Tesseract language string to use for *img*.

    The image itself is not inspected: the result depends only on the
    ``AUTO_DETECT_LANGUAGE`` flag ("fra+eng" when set, "eng" otherwise).
    """
    return "fra+eng" if AUTO_DETECT_LANGUAGE else "eng"


def config_to_filename(cfg):
    """Build the output file name for an OCR configuration dict.

    The name encodes the ``psm`` and ``oem`` values, plus an ``_extra``
    marker when the configuration carries an ``"extra"`` entry. Two
    configurations that differ only in the *content* of ``"extra"`` map to
    the same file name.
    """
    suffix = f"psm{cfg['psm']}_oem{cfg['oem']}"
    if "extra" in cfg:
        suffix += "_extra"
    return suffix + ".txt"


def _configs_uniques(configs):
    """Return *configs* in their original order, dropping exact repeats."""
    uniques = []
    for cfg in configs:
        # Dicts are unhashable, so compare by equality; the list is tiny.
        if cfg not in uniques:
            uniques.append(cfg)
    return uniques


def _executer_config(img_prep, cfg, ocr_lang, dossier_profil):
    """Run one OCR configuration and write its text to *dossier_profil*.

    Returns the number of characters written (after stripping).
    """
    config = f"--psm {cfg['psm']} --oem {cfg['oem']} -l {ocr_lang} -c preserve_interword_spaces=1"
    if "extra" in cfg:
        config += " " + cfg["extra"]

    texte = pytesseract.image_to_string(img_prep, config=config)

    # Keep the post-processed text only if it did not shrink by more than
    # 10 % relative to the raw OCR output.
    texte_corrige = completer_mots_tronques(texte)
    if len(texte_corrige) >= len(texte) * 0.9:
        texte = texte_corrige

    texte = texte.strip()
    with open(dossier_profil / config_to_filename(cfg), "w", encoding="utf-8") as f:
        f.write(texte)
    return len(texte)


def lancer_tests(dossier_images):
    """Run every profile/configuration combination on each image found.

    Only regular files directly inside *dossier_images* whose name passes
    ``est_image_valide`` are processed, in sorted order. A failure while
    preparing a profile skips that profile; a failure in one OCR
    configuration skips only that configuration. Progress and errors are
    reported on stdout; nothing is returned.
    """
    images = sorted(
        f
        for f in Path(dossier_images).glob("*")
        if f.is_file() and est_image_valide(str(f))
    )
    if not images:
        print("❌ Aucune image valide trouvée.")
        return

    configs = _configs_uniques(TEST_CONFIGS)

    for image_path in images:
        # NOTE(review): results are keyed by stem only, so "a.jpg" and
        # "a.png" share one result directory — confirm this is acceptable.
        base_name = image_path.stem
        print(f"\n🖼️ Traitement de l'image : {image_path.name}")

        for profil in PROFILS_A_TESTER:
            print(f" 🔧 Profil de prétraitement : {profil}")
            # Broad catch is deliberate: this is the batch boundary, and one
            # bad profile or unreadable image must not abort the whole run.
            try:
                settings = PREPROCESSING_PROFILES[profil].copy()
                settings["profile_name"] = profil

                dossier_profil = Path(DOSSIER_RESULTATS) / base_name / profil
                dossier_profil.mkdir(parents=True, exist_ok=True)

                # Copy of the original image (once per profile).
                # NOTE(review): this is a byte-for-byte copy named
                # "image.png" whatever the source format is.
                copyfile(image_path, dossier_profil / "image.png")

                img_prep = preprocess_image(str(image_path), **settings)
                img_prep.save(dossier_profil / f"optimized_{profil}.png")
                ocr_lang = detect_lang(img_prep)
            except Exception as e:
                print(f" ❌ Erreur avec profil {profil}: {e}")
                continue

            for cfg in configs:
                print(f" ⚙️ Test PSM={cfg['psm']} / OEM={cfg['oem']}...", end="")
                try:
                    nb_caracteres = _executer_config(img_prep, cfg, ocr_lang, dossier_profil)
                except Exception as e:
                    # Finish the progress line and move on to the next config.
                    print(f" ❌ Erreur : {e}")
                    continue
                print(f" ✅ {nb_caracteres} caractères.")


if __name__ == "__main__":
    os.makedirs(DOSSIER_RESULTATS, exist_ok=True)
    lancer_tests(DOSSIER_IMAGES)