"""Batch OCR test harness.

For every image found in ``DOSSIER_IMAGES``, run each preprocessing profile
listed in ``PROFILS_A_TESTER`` and each Tesseract page-segmentation mode in
``PSM_MODES``, and write the recognised text to
``DOSSIER_RESULTATS/<image stem>/<profile>/psm<N>.txt``.
"""
import os
from pathlib import Path
from shutil import copyfile

from PIL import Image

from ocr_utils import pytesseract, AUTO_DETECT_LANGUAGE, completer_mots_tronques
from ocr_preprocessor import preprocess_image, PREPROCESSING_PROFILES

# === CONFIGURATION ===
DOSSIER_IMAGES = "images_test"
DOSSIER_RESULTATS = "ocr_test_results"
EXTENSIONS_AUTORISEES = [".jpg", ".jpeg", ".png", ".tiff", ".bmp"]
PROFILS_A_TESTER = ["default", "document", "aggressive"]
PSM_MODES = [3, 4, 6, 11, 12]
OEM = 3
# The post-processed text is kept only if it is at least this fraction of the
# raw OCR text length (guards against a correction that drops too much text).
RATIO_LONGUEUR_MIN = 0.9


def est_image_valide(fichier):
    """Return True if *fichier* ends with one of ``EXTENSIONS_AUTORISEES``.

    The comparison is case-insensitive. *fichier* may be a string or any
    path-like object.
    """
    return os.fspath(fichier).lower().endswith(tuple(EXTENSIONS_AUTORISEES))


def detect_lang(img: Image.Image):
    """Return the Tesseract language string to use for *img*.

    The image itself is currently not inspected: the result depends only on
    the ``AUTO_DETECT_LANGUAGE`` flag ("fra+eng" when set, "eng" otherwise).
    """
    return "fra+eng" if AUTO_DETECT_LANGUAGE else "eng"


def _ocr_profil(image_path, profil, dossier_profil):
    """Preprocess *image_path* with *profil* and OCR it once per PSM mode.

    Writes ``image.png``, ``optimized.png`` and one ``psm<N>.txt`` per mode
    into *dossier_profil*. Any exception propagates to the caller.
    """
    settings = PREPROCESSING_PROFILES[profil].copy()
    dossier_profil.mkdir(parents=True, exist_ok=True)

    # Copy of the original image (once per profile).
    # NOTE(review): this is a byte-for-byte copy, so a non-PNG source keeps
    # its original encoding under the "image.png" name — confirm this is OK.
    copyfile(image_path, dossier_profil / "image.png")

    img_prep = preprocess_image(str(image_path), **settings)
    img_prep.save(dossier_profil / "optimized.png")
    ocr_lang = detect_lang(img_prep)

    for psm in PSM_MODES:
        print(f" ⚙️ PSM={psm}...", end="")
        config = (
            f"--psm {psm} --oem {OEM} -l {ocr_lang} "
            "-c preserve_interword_spaces=1"
        )
        texte = pytesseract.image_to_string(img_prep, config=config)

        # Keep the corrected text only when it did not shrink noticeably.
        texte_corrige = completer_mots_tronques(texte)
        if len(texte_corrige) >= len(texte) * RATIO_LONGUEUR_MIN:
            texte = texte_corrige

        # Write the text file.
        texte_final = texte.strip()
        with open(dossier_profil / f"psm{psm}.txt", "w", encoding="utf-8") as f:
            f.write(texte_final)
        print(f" ✅ {len(texte_final)} caractères.")


def lancer_tests(dossier_images):
    """Run every profile/PSM combination on each image of *dossier_images*.

    Only regular files whose name matches ``EXTENSIONS_AUTORISEES`` are
    processed, in sorted order so that runs are reproducible. A failure in
    one profile is reported and does not stop the remaining profiles/images.
    """
    images = sorted(
        f
        for f in Path(dossier_images).glob("*")
        if f.is_file() and est_image_valide(f)
    )
    if not images:
        print("❌ Aucune image valide trouvée.")
        return

    for image_path in images:
        base_name = image_path.stem
        print(f"\n🖼️ Traitement de l'image : {image_path.name}")
        for profil in PROFILS_A_TESTER:
            print(f" 🔧 Profil de prétraitement : {profil}")
            dossier_profil = Path(DOSSIER_RESULTATS) / base_name / profil
            try:
                _ocr_profil(image_path, profil, dossier_profil)
            except Exception as e:
                # Deliberate best-effort boundary: report and move on to the
                # next profile rather than aborting the whole batch.
                print(f" ❌ Erreur avec profil {profil}: {e}")


if __name__ == "__main__":
    os.makedirs(DOSSIER_RESULTATS, exist_ok=True)
    lancer_tests(DOSSIER_IMAGES)