"""OCR helper module: preprocessing, language detection and text extraction
via Tesseract, with optional debug output under debug_ocr/."""

import os
import re
from pathlib import Path

import cv2
import numpy as np
import pytesseract
from langdetect import detect
from PIL import Image

from ocr_preprocessor import preprocess_image

# ⬇️ Central feature toggles ⬇️
USE_PREPROCESSING = True      # Enable image preprocessing before OCR
USE_TEXT_CORRECTION = True    # Fix words hyphenated at line ends after OCR
USE_IMAGE_RESIZE = False      # Upscale the image when it is too small
SAVE_DEBUG_OUTPUT = False     # Save image + text under debug_ocr/
AUTO_DETECT_LANGUAGE = True   # Automatically detect the language


def completer_mots_tronques(texte):
    """Strip trailing hyphens left by line-end word breaks (rudimentary but useful).

    Note: this only removes the hyphen; it does not re-join the word with
    the following line.
    """
    lignes_corrigees = []
    for ligne in texte.splitlines():
        if ligne.strip().endswith("-"):
            ligne = ligne.strip()[:-1]
        lignes_corrigees.append(ligne)
    return "\n".join(lignes_corrigees)


def pretraiter_image(image_path, optimize_for_text=True):
    """Preprocess an image for OCR.

    Returns a grayscale numpy array (adaptive-thresholded when
    ``optimize_for_text`` is True, histogram-equalized otherwise),
    or None when the image cannot be read.
    """
    image = cv2.imread(image_path)
    if image is None:
        return None
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    if optimize_for_text:
        # Light blur then adaptive threshold works well on printed text.
        gray = cv2.GaussianBlur(gray, (3, 3), 0)
        gray = cv2.adaptiveThreshold(
            gray, 255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
            31, 15,
        )
    else:
        gray = cv2.equalizeHist(gray)
    if USE_IMAGE_RESIZE:
        height, width = gray.shape
        if width < 1000 or height < 1000:
            # Scale so the smaller dimension reaches at least 1000 px.
            ratio = max(1000 / width, 1000 / height)
            gray = cv2.resize(gray, (0, 0), fx=ratio, fy=ratio)
    return gray


def detect_language_tesseract(image_cv):
    """Run a quick OCR pass, detect its language, and map it to a
    Tesseract language code; fall back to "fra+eng" on any failure."""
    try:
        text_sample = pytesseract.image_to_string(image_cv, config="--psm 6")
        lang = detect(text_sample)
        return {"fr": "fra", "en": "eng"}.get(lang, "fra+eng")
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # still propagate; OCR or langdetect failures fall back safely.
        return "fra+eng"


def extraire_texte(image_path, lang="auto"):
    """Main OCR entry point.

    Parameters:
        image_path: path to the image file.
        lang: Tesseract language code, or "auto" to use detection.

    Returns:
        (text, processed_image) — text is "" and image is None when the
        file cannot be read.
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"[OCR] Image non trouvée: {image_path}")
        return "", None

    img_standard = preprocess_image(image_path) if USE_PREPROCESSING else Image.open(image_path)
    # Same processing is used for both variants in this configuration.
    img_optimized = img_standard

    ocr_lang = lang
    if lang == "auto" and AUTO_DETECT_LANGUAGE:
        ocr_lang = detect_language_tesseract(img_standard)
    if ocr_lang == "auto":
        ocr_lang = "fra+eng"

    config = f"--psm 6 -l {ocr_lang}"
    texte = pytesseract.image_to_string(img_optimized, config=config)

    if USE_TEXT_CORRECTION:
        texte_corrige = completer_mots_tronques(texte)
        # Keep the correction only when it did not shrink the text by
        # more than 10% (a crude safety check against over-stripping).
        if len(texte_corrige) >= len(texte) * 0.9:
            texte = texte_corrige

    if SAVE_DEBUG_OUTPUT and texte:
        try:
            debug_dir = "debug_ocr"
            os.makedirs(debug_dir, exist_ok=True)
            image_name = Path(image_path).stem
            # Convert a PIL image to an OpenCV-compatible BGR array if needed.
            if isinstance(img_optimized, Image.Image):
                img_optimized = np.array(img_optimized)
                if img_optimized.ndim == 3 and img_optimized.shape[2] == 3:
                    img_optimized = cv2.cvtColor(img_optimized, cv2.COLOR_RGB2BGR)
                elif img_optimized.ndim == 3 and img_optimized.shape[2] == 4:
                    img_optimized = cv2.cvtColor(img_optimized, cv2.COLOR_RGBA2BGR)
            if isinstance(img_optimized, np.ndarray):
                cv2.imwrite(f"{debug_dir}/optimized_{image_name}.png", img_optimized)
            # NOTE(review): the ".png.txt" suffix looks unintended (probably
            # meant ".txt") — kept as-is since other tooling may read this path.
            with open(f"{debug_dir}/ocr_{image_name}.png.txt", "w", encoding="utf-8") as f:
                f.write(texte)
        except Exception as e:
            print(f"[OCR DEBUG] Erreur de sauvegarde debug: {e}")

    return texte, img_optimized


def extraire_texte_fr(image_path):
    """Shortcut returning only the French OCR text for *image_path*."""
    texte, _ = extraire_texte(image_path, lang="fra")
    return texte