llm_ticket3/utils/ocr_avance/ocr_cleaner.py
2025-05-07 17:12:50 +02:00

60 lines
1.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
from pathlib import Path
# Available cleaning dictionaries: each key is a profile name that can be
# activated, each value is the JSON file holding that profile's entries.
CLEAN_DICT_FILES = dict(
    ocr="ocr_clean_dict.json",
    translation="translation_clean_dict.json",
    hallucination="hallucination_filter.json",
)

# Root directory for every dictionary file: the directory that contains
# this module.
BASE_PATH = Path(__file__).parent
def load_cleaning_dict(path):
    """Load a JSON cleaning dictionary from disk.

    Args:
        path: Location of the JSON file, given as a ``pathlib.Path`` or as
            any ``str`` / ``os.PathLike`` value accepted by ``open``.

    Returns:
        The decoded JSON content, or an empty dict when the file does not
        exist.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain
            valid JSON.
    """
    try:
        # Open directly instead of calling ``path.exists()`` first: this
        # accepts plain string paths and removes the window in which the
        # file could vanish between the check and the open.
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (FileNotFoundError, NotADirectoryError):
        # A missing file (or missing parent directory) simply means that
        # no entries have been recorded yet.
        return {}
def load_multiple_dicts(active_keys):
    """Load and merge the cleaning dictionaries of the selected profiles.

    Args:
        active_keys: Iterable of profile names (keys of
            ``CLEAN_DICT_FILES``). A single profile name passed as a bare
            string is treated as a one-element selection. Names that are
            not registered in ``CLEAN_DICT_FILES`` are ignored.

    Returns:
        A new dict containing the entries of every selected profile. When
        the same key appears in several profiles, the profile listed later
        in ``active_keys`` wins.
    """
    if isinstance(active_keys, str):
        # Iterating a bare string would yield single characters, none of
        # which is a profile name, and silently produce an empty result.
        active_keys = (active_keys,)

    merged_dict = {}
    for key in active_keys:
        filename = CLEAN_DICT_FILES.get(key)
        if not filename:
            continue
        merged_dict.update(load_cleaning_dict(BASE_PATH / filename))
    return merged_dict
def clean_ocr_text(text, cleaning_dict=None):
    """Apply the corrections of a cleaning dictionary to a text.

    Replacements are applied one after another in the dictionary's
    iteration order, so a later entry can match text produced by an
    earlier one.

    Args:
        text: The string to clean.
        cleaning_dict: Mapping of wrong substring -> corrected substring.
            ``None`` means "no corrections" and returns ``text`` unchanged.

    Returns:
        The text with every occurrence of each wrong substring replaced.
    """
    if cleaning_dict is None:
        return text
    for wrong, correct in cleaning_dict.items():
        if not wrong:
            # ``str.replace("", x)`` inserts ``x`` between every character,
            # which would garble the whole text; skip empty keys.
            continue
        text = text.replace(wrong, correct)
    return text
def clean_text_with_profiles(text, active_profiles=("ocr",)):
    """Clean a text using one or more activated profiles.

    The dictionaries of the requested profiles are loaded and merged, then
    applied to ``text``.

    Available profiles: "ocr", "translation", "hallucination".
    """
    return clean_ocr_text(text, load_multiple_dicts(active_profiles))
def add_to_cleaning_dict(wrong, correct, profile="ocr"):
    """Add an (error, correction) pair to one profile's dictionary file.

    The profile's current dictionary is loaded, the pair is inserted (or
    overwrites an existing entry for ``wrong``), and the file is rewritten.

    Args:
        wrong: The erroneous substring to register.
        correct: The replacement to store for ``wrong``.
        profile: Name of the target profile; must be a key of
            ``CLEAN_DICT_FILES``.

    Raises:
        ValueError: If ``profile`` is not a known profile.
        TypeError: If the updated dictionary cannot be serialized to JSON;
            in that case the existing file is left untouched.
    """
    filename = CLEAN_DICT_FILES.get(profile)
    if not filename:
        raise ValueError(f"Profil inconnu : {profile}")
    path = BASE_PATH / filename
    data = load_cleaning_dict(path)
    data[wrong] = correct
    # Serialize before opening the file: mode "w" truncates immediately, so
    # a serialization error raised after the open would wipe the existing
    # dictionary.
    payload = json.dumps(data, indent=2, ensure_ascii=False)
    with open(path, "w", encoding="utf-8") as f:
        f.write(payload)