# llm_ticket3/utils/ocr_avance/ocr_cleaner.py

import json
from pathlib import Path

# Available cleaning dictionaries: each key is a profile name that can be
# activated, each value is the name of the JSON file backing that profile.
CLEAN_DICT_FILES = {
    "ocr": "ocr_clean_dict.json",
    "translation": "translation_clean_dict.json",
    "hallucination": "hallucination_filter.json"
}

# Root directory of all dictionary files: the directory containing this module.
# The file names above are joined onto this path when a profile is loaded.
BASE_PATH = Path(__file__).parent

def load_cleaning_dict(path):
    """Load a JSON cleaning dictionary from *path*.

    Args:
        path: Location of the JSON file. Any value accepted by ``open`` works
            (``str``, ``pathlib.Path`` or other ``os.PathLike``).

    Returns:
        The decoded JSON content, or an empty dict when the file does not
        exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    # Open directly instead of checking existence first: this avoids the
    # check-then-open race and does not require *path* to be a Path object.
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (FileNotFoundError, NotADirectoryError):
        # Missing file (or a missing/invalid parent component) simply means
        # no dictionary has been created yet.
        return {}

def load_multiple_dicts(active_keys):
    """Load and merge the dictionaries of every selected profile.

    Profiles are processed in the order given; when two dictionaries share a
    key, the value from the later profile wins. Profile names that are not
    present in ``CLEAN_DICT_FILES`` are ignored.

    Args:
        active_keys: Iterable of profile names.

    Returns:
        A single dict containing the merged entries.
    """
    combined = {}
    for profile in active_keys:
        file_name = CLEAN_DICT_FILES.get(profile)
        if not file_name:
            # Unknown profile: nothing to load.
            continue
        combined.update(load_cleaning_dict(BASE_PATH / file_name))
    return combined

def clean_ocr_text(text, cleaning_dict=None):
    """Apply the corrections of a cleaning dictionary to a text.

    Replacements are applied one after another, in the dictionary's iteration
    order, so a later entry operates on the output of the earlier ones.

    Args:
        text: The string to clean.
        cleaning_dict: Mapping of erroneous substring to its correction.
            ``None`` or an empty mapping leaves the text untouched.

    Returns:
        The text with every occurrence of each non-empty key replaced by its
        correction.
    """
    if not text or not cleaning_dict:
        return text
    for wrong, correct in cleaning_dict.items():
        # An empty key must be skipped: str.replace("", x) inserts x between
        # every character, which would corrupt the whole text.
        if not wrong:
            continue
        text = text.replace(wrong, correct)
    return text

def clean_text_with_profiles(text, active_profiles=("ocr",)):
    """Clean a text using one or more activated profiles.

    The dictionaries of the requested profiles are loaded and merged, then
    applied to the text.

    Args:
        text: The string to clean.
        active_profiles: Iterable of profile names. Possible profiles are
            "ocr", "translation" and "hallucination".

    Returns:
        The cleaned text.
    """
    return clean_ocr_text(text, load_multiple_dicts(active_profiles))

def add_to_cleaning_dict(wrong, correct, profile="ocr"):
    """Add an (error, correction) pair to the dictionary of one profile.

    The profile's JSON file is loaded, updated with the new pair (replacing
    any existing entry for the same error) and written back.

    Args:
        wrong: The erroneous substring to look for. Must not be empty.
        correct: The replacement text.
        profile: Name of the profile whose dictionary is updated.

    Raises:
        ValueError: If *wrong* is empty or *profile* is not a known profile.
        TypeError: If the updated dictionary cannot be serialized to JSON.
    """
    # An empty key would make every later cleaning insert the correction
    # between all characters of the text, so it is rejected up front.
    if not wrong:
        raise ValueError("La chaîne à corriger ne peut pas être vide")
    filename = CLEAN_DICT_FILES.get(profile)
    if not filename:
        raise ValueError(f"Profil inconnu : {profile}")
    path = BASE_PATH / filename
    data = load_cleaning_dict(path)
    data[wrong] = correct
    # Serialize before opening the file: opening in "w" mode truncates it, so
    # a serialization error must happen first to keep the existing file intact.
    payload = json.dumps(data, indent=2, ensure_ascii=False)
    with open(path, "w", encoding="utf-8") as f:
        f.write(payload)