# Pasted 2025-05-07 16:02:39 +02:00 — 118 lines, 4.0 KiB, Python

import pytesseract
import cv2
import numpy as np
import os
from pathlib import Path
from PIL import Image
from langdetect import detect
import re
from ocr_preprocessor import preprocess_image
# ⬇️ CENTRAL FEATURE TOGGLES ⬇️
USE_PREPROCESSING = True  # Run image preprocessing before OCR
USE_TEXT_CORRECTION = True  # Fix hyphen-truncated words after OCR
USE_IMAGE_RESIZE = False  # Upscale the image if it is too small
SAVE_DEBUG_OUTPUT = False  # Save image + text into debug_ocr/
AUTO_DETECT_LANGUAGE = True  # Auto-detect the document language
# Re-join words hyphenated across line breaks (rudimentary but useful)
def completer_mots_tronques(texte):
    """Repair words that OCR left split with a trailing hyphen.

    The original version only removed the hyphen, which left the word
    still broken across two lines.  A line ending in "-" is now merged
    with the following line so the word becomes whole again; a dangling
    hyphen on the last line is simply dropped.

    Args:
        texte: Raw OCR output, possibly multi-line.

    Returns:
        The corrected text, lines joined with "\n".
    """
    lignes = texte.splitlines()
    lignes_corrigees = []
    i = 0
    while i < len(lignes):
        ligne = lignes[i]
        if ligne.strip().endswith("-"):
            # Drop the hyphen, then glue the next line on to rejoin the word.
            ligne = ligne.strip()[:-1]
            if i + 1 < len(lignes):
                ligne += lignes[i + 1].lstrip()
                i += 1  # the next line was consumed by the merge
        lignes_corrigees.append(ligne)
        i += 1
    return "\n".join(lignes_corrigees)
# Image preprocessing
def pretraiter_image(image_path, optimize_for_text=True):
    """Load an image from disk and return a grayscale version tuned for OCR.

    With ``optimize_for_text`` a light Gaussian blur followed by adaptive
    thresholding binarizes the page; otherwise histogram equalization is
    applied instead.  When USE_IMAGE_RESIZE is enabled, images smaller
    than 1000px on either side are upscaled proportionally.

    Returns None when the file cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        return None
    gris = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    if not optimize_for_text:
        gris = cv2.equalizeHist(gris)
    else:
        gris = cv2.GaussianBlur(gris, (3, 3), 0)
        gris = cv2.adaptiveThreshold(
            gris, 255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 31, 15,
        )
    if USE_IMAGE_RESIZE:
        hauteur, largeur = gris.shape
        if largeur < 1000 or hauteur < 1000:
            facteur = max(1000 / largeur, 1000 / hauteur)
            gris = cv2.resize(gris, (0, 0), fx=facteur, fy=facteur)
    return gris
# Automatic language detection (if enabled)
def detect_language_tesseract(image_cv):
    """Guess the Tesseract language pack from a quick OCR sample.

    Runs a fast ``--psm 6`` Tesseract pass, feeds the sample to
    ``langdetect.detect`` and maps the ISO code to a Tesseract language.
    Falls back to "fra+eng" for any unmapped language or on any
    detection failure (empty text, OCR error, langdetect exception).
    """
    try:
        text_sample = pytesseract.image_to_string(image_cv, config="--psm 6")
        lang = detect(text_sample)
        return {
            "fr": "fra",
            "en": "eng",
        }.get(lang, "fra+eng")
    except Exception:
        # Was a bare `except:`, which also trapped SystemExit and
        # KeyboardInterrupt; narrowed to Exception.
        return "fra+eng"
# Main OCR entry point
def extraire_texte(image_path, lang="auto"):
    """Run OCR on an image file and return its text.

    Args:
        image_path: Path to the image on disk.
        lang: Tesseract language string (e.g. "fra", "eng", "fra+eng"),
            or "auto" to let the module pick one.

    Returns:
        ``(texte, image)`` where ``texte`` is the extracted string and
        ``image`` is the preprocessed image actually fed to Tesseract
        (converted to a BGR numpy array when debug output is saved), or
        ``("", None)`` when the file cannot be read.
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"[OCR] Image non trouvée: {image_path}")
        return "", None
    img_standard = preprocess_image(image_path) if USE_PREPROCESSING else Image.open(image_path)
    img_optimized = img_standard  # same preprocessing used for both in this case
    ocr_lang = lang
    if lang == "auto" and AUTO_DETECT_LANGUAGE:
        ocr_lang = detect_language_tesseract(img_standard)
    if ocr_lang == "auto":
        # "auto" requested but detection disabled: default to French + English.
        ocr_lang = "fra+eng"
    config = f"--psm 6 -l {ocr_lang}"
    texte = pytesseract.image_to_string(img_optimized, config=config)
    if USE_TEXT_CORRECTION:
        texte_corrige = completer_mots_tronques(texte)
        # Keep the correction only if it did not shrink the text by more
        # than 10% — guards against an over-aggressive cleanup.
        if len(texte_corrige) >= len(texte) * 0.9:
            texte = texte_corrige
    if SAVE_DEBUG_OUTPUT and texte:
        try:
            debug_dir = "debug_ocr"
            os.makedirs(debug_dir, exist_ok=True)
            image_name = Path(image_path).stem
            # Convert PIL images to OpenCV BGR arrays before imwrite
            # (note: this rebinds img_optimized, so the converted array
            # is also what gets returned below).
            if isinstance(img_optimized, Image.Image):
                img_optimized = np.array(img_optimized)
                if img_optimized.ndim == 3 and img_optimized.shape[2] == 3:
                    img_optimized = cv2.cvtColor(img_optimized, cv2.COLOR_RGB2BGR)
                elif img_optimized.ndim == 3 and img_optimized.shape[2] == 4:
                    img_optimized = cv2.cvtColor(img_optimized, cv2.COLOR_RGBA2BGR)
            if isinstance(img_optimized, np.ndarray):
                cv2.imwrite(f"{debug_dir}/optimized_{image_name}.png", img_optimized)
            with open(f"{debug_dir}/ocr_{image_name}.png.txt", "w", encoding="utf-8") as f:
                f.write(texte)
        except Exception as e:
            # Debug saving is best-effort; never let it break the OCR result.
            print(f"[OCR DEBUG] Erreur de sauvegarde debug: {e}")
    return texte, img_optimized
# Convenience shortcut: French-only text extraction
def extraire_texte_fr(image_path):
    """Return only the OCR text of *image_path*, forcing French."""
    return extraire_texte(image_path, lang="fra")[0]