mirror of
https://github.com/Ladebeze66/llm_ticket3.git
synced 2025-12-15 19:46:53 +01:00
118 lines
4.0 KiB
Python
118 lines
4.0 KiB
Python
import pytesseract
|
|
import cv2
|
|
import numpy as np
|
|
import os
|
|
from pathlib import Path
|
|
from PIL import Image
|
|
from langdetect import detect
|
|
import re
|
|
from ocr_preprocessor import preprocess_image
|
|
|
|
# ⬇️ CENTRAL FEATURE FLAGS ⬇️

USE_PREPROCESSING = True  # Enable image preprocessing before OCR
USE_TEXT_CORRECTION = True  # Repair hyphen-truncated words after OCR
USE_IMAGE_RESIZE = False  # Upscale the image when it is too small
SAVE_DEBUG_OUTPUT = False  # Save image + text into debug_ocr/
AUTO_DETECT_LANGUAGE = True  # Automatically detect the language
|
|
|
|
# Repair of hyphen-truncated words (rudimentary but useful)
def completer_mots_tronques(texte):
    """Repair words that were hyphenated across line breaks by OCR.

    A line whose stripped form ends with "-" is merged with the following
    line (hyphen removed), so that a word split as "exam-" / "ple" comes
    back as "example". A trailing hyphen on the very last line is simply
    dropped. All other lines are kept as-is.

    Parameters:
        texte (str): raw OCR output.

    Returns:
        str: text with hyphenated line breaks merged.
    """
    lignes = texte.splitlines()
    lignes_corrigees = []
    i = 0
    while i < len(lignes):
        ligne = lignes[i]
        # BUG FIX: the old code only deleted the hyphen, leaving the word
        # split over two lines. Now the continuation line is merged in.
        while ligne.strip().endswith("-"):
            ligne = ligne.strip()[:-1]
            if i + 1 < len(lignes):
                i += 1
                ligne += lignes[i].lstrip()
            else:
                break  # hyphen on the last line: just drop it
        lignes_corrigees.append(ligne)
        i += 1
    return "\n".join(lignes_corrigees)
|
|
|
|
# Image preprocessing
def pretraiter_image(image_path, optimize_for_text=True):
    """Load an image from disk and prepare it for OCR.

    Parameters:
        image_path (str): path of the image file to read.
        optimize_for_text (bool): when True, denoise and binarise the
            image (Gaussian blur + adaptive threshold); when False, only
            equalise the grayscale histogram.

    Returns:
        numpy.ndarray | None: the processed grayscale image, or None
        when the file cannot be read by OpenCV.
    """
    source = cv2.imread(image_path)
    if source is None:
        return None

    processed = cv2.cvtColor(source, cv2.COLOR_BGR2GRAY)

    if not optimize_for_text:
        # Photo-like content: just spread the contrast.
        processed = cv2.equalizeHist(processed)
    else:
        # Text-like content: light denoising, then local binarisation.
        processed = cv2.GaussianBlur(processed, (3, 3), 0)
        processed = cv2.adaptiveThreshold(
            processed,
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            31,
            15,
        )

    if USE_IMAGE_RESIZE:
        hauteur, largeur = processed.shape
        # Upscale when either dimension is below 1000 px.
        if min(hauteur, largeur) < 1000:
            facteur = max(1000 / largeur, 1000 / hauteur)
            processed = cv2.resize(processed, (0, 0), fx=facteur, fy=facteur)

    return processed
|
|
|
|
# Automatic language detection (if enabled)
def detect_language_tesseract(image_cv):
    """Guess the OCR language from a quick Tesseract sample pass.

    Parameters:
        image_cv: image accepted by pytesseract (PIL image or ndarray).

    Returns:
        str: a Tesseract language code — "fra" or "eng" when detection
        succeeds, otherwise the combined fallback "fra+eng".
    """
    try:
        # Cheap first pass; --psm 6 assumes a single uniform text block.
        text_sample = pytesseract.image_to_string(image_cv, config="--psm 6")
        lang = detect(text_sample)
    # BUG FIX: was a bare `except:` which also swallowed
    # KeyboardInterrupt/SystemExit; narrowed to Exception.
    except Exception:
        return "fra+eng"
    return {
        "fr": "fra",
        "en": "eng"
    }.get(lang, "fra+eng")
|
|
|
|
# Main OCR entry point
def extraire_texte(image_path, lang="auto"):
    """Run OCR on an image file and return the extracted text.

    Parameters:
        image_path (str): path of the image to process.
        lang (str): Tesseract language code ("fra", "eng", ...) or
            "auto" to resolve it via AUTO_DETECT_LANGUAGE.

    Returns:
        tuple: (texte, image) where `texte` is the OCR output (str) and
        `image` is the image object actually fed to Tesseract; returns
        ("", None) when the file cannot be read.
    """
    # Sanity check: make sure OpenCV can actually read the file.
    if cv2.imread(image_path) is None:
        print(f"[OCR] Image non trouvée: {image_path}")
        return "", None

    # Optional preprocessing (external helper); otherwise the raw image.
    # NOTE(review): the original aliased this as both "standard" and
    # "optimized" with identical content; a single variable suffices.
    img_optimized = preprocess_image(image_path) if USE_PREPROCESSING else Image.open(image_path)

    ocr_lang = _resolve_language(img_optimized, lang)

    config = f"--psm 6 -l {ocr_lang}"
    texte = pytesseract.image_to_string(img_optimized, config=config)

    if USE_TEXT_CORRECTION:
        texte_corrige = completer_mots_tronques(texte)
        # Accept the correction only if it did not shrink the text by
        # more than 10% (guards against over-aggressive edits).
        if len(texte_corrige) >= len(texte) * 0.9:
            texte = texte_corrige

    if SAVE_DEBUG_OUTPUT and texte:
        _save_debug_output(image_path, img_optimized, texte)

    return texte, img_optimized


def _resolve_language(img, lang):
    """Map the requested language to a concrete Tesseract code."""
    if lang == "auto" and AUTO_DETECT_LANGUAGE:
        lang = detect_language_tesseract(img)
    # Fallback when auto-detection is disabled or inconclusive.
    return "fra+eng" if lang == "auto" else lang


def _save_debug_output(image_path, img, texte):
    """Best-effort dump of the OCR input image and text under debug_ocr/."""
    try:
        debug_dir = "debug_ocr"
        os.makedirs(debug_dir, exist_ok=True)
        image_name = Path(image_path).stem

        # BUG FIX: convert a *copy* for saving — the original converted
        # the caller's image in place, so the value returned by
        # extraire_texte() changed type whenever debug output was on.
        if isinstance(img, Image.Image):
            out = np.array(img)
            if out.ndim == 3 and out.shape[2] == 3:
                out = cv2.cvtColor(out, cv2.COLOR_RGB2BGR)
            elif out.ndim == 3 and out.shape[2] == 4:
                out = cv2.cvtColor(out, cv2.COLOR_RGBA2BGR)
        else:
            out = img

        if isinstance(out, np.ndarray):
            cv2.imwrite(f"{debug_dir}/optimized_{image_name}.png", out)

        # BUG FIX: the text file used to be named "ocr_<name>.png.txt"
        # (stray double extension); now plain "ocr_<name>.txt".
        with open(f"{debug_dir}/ocr_{image_name}.txt", "w", encoding="utf-8") as f:
            f.write(texte)

    except Exception as e:
        print(f"[OCR DEBUG] Erreur de sauvegarde debug: {e}")
|
|
|
|
# Convenience shortcut: French-only OCR, text result only
def extraire_texte_fr(image_path):
    """Run French OCR on *image_path* and return only the text."""
    return extraire_texte(image_path, lang="fra")[0]
|