mirror of
https://github.com/Ladebeze66/llm_ticket3.git
synced 2025-12-13 12:36:50 +01:00
60 lines
1.9 KiB
Python
60 lines
1.9 KiB
Python
import json
|
||
from pathlib import Path
|
||
|
||
# Cleaning dictionaries available on disk, keyed by the profile name that
# activates each one.
CLEAN_DICT_FILES = dict(
    ocr="ocr_clean_dict.json",
    translation="translation_clean_dict.json",
    hallucination="hallucination_filter.json",
)

# Root directory of all dictionary files: the directory holding this module.
BASE_PATH = Path(__file__).parent
|
||
|
||
def load_cleaning_dict(path):
    """Load a JSON cleaning dictionary from disk.

    Args:
        path: Location of the JSON file, given either as a ``pathlib.Path``
            or as a plain string path.

    Returns:
        The decoded JSON content, or an empty dict when the file does not
        exist.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain valid
            JSON.
    """
    # Open directly instead of calling ``path.exists()`` first: this accepts
    # string paths as well as Path objects and leaves no window between the
    # existence check and the open.
    try:
        f = open(path, "r", encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError):
        # A missing file simply means no corrections have been recorded yet.
        return {}
    with f:
        return json.load(f)
|
||
|
||
def load_multiple_dicts(active_keys):
    """Load the dictionaries of the selected profiles and merge them into one.

    Keys without an entry in ``CLEAN_DICT_FILES`` are ignored. Dictionaries
    are merged in the iteration order of ``active_keys``, so when two of them
    share an entry the one loaded later wins.
    """
    combined = {}
    for profile in active_keys:
        file_name = CLEAN_DICT_FILES.get(profile)
        if not file_name:
            # Unknown profile: nothing to load for it.
            continue
        combined.update(load_cleaning_dict(BASE_PATH / file_name))
    return combined
|
||
|
||
def clean_ocr_text(text, cleaning_dict=None):
    """Apply the corrections of a cleaning dictionary to a text.

    Args:
        text: The string to clean.
        cleaning_dict: Mapping of erroneous substring to its replacement.
            When ``None`` (the default) or empty, the text is returned
            unchanged.

    Returns:
        The text with every occurrence of each key replaced by its value.
        Replacements run one after another in the mapping's iteration order,
        so a later entry also applies to the output of earlier ones.
    """
    if not cleaning_dict:
        return text
    for wrong, correct in cleaning_dict.items():
        # An empty search string matches between every character, so
        # ``str.replace`` would scatter ``correct`` throughout the text.
        if not wrong:
            continue
        text = text.replace(wrong, correct)
    return text
|
||
|
||
def clean_text_with_profiles(text, active_profiles=("ocr",)):
    """Clean a text using one or more activated profiles.

    Possible profiles: "ocr", "translation", "hallucination".
    The dictionaries of the given profiles are loaded and merged, then
    applied to ``text``.
    """
    return clean_ocr_text(text, load_multiple_dicts(active_profiles))
|
||
|
||
def add_to_cleaning_dict(wrong, correct, profile="ocr"):
    """Add an (error, correction) pair to the dictionary of one profile.

    The profile's JSON file is loaded (an absent file counts as empty), the
    pair is inserted or overwritten, and the file is rewritten.

    Args:
        wrong: The erroneous text to record as a key.
        correct: The replacement stored for that key.
        profile: Name of the profile whose dictionary is updated; must be a
            key of ``CLEAN_DICT_FILES``.

    Raises:
        ValueError: If ``profile`` has no entry in ``CLEAN_DICT_FILES``.
    """
    filename = CLEAN_DICT_FILES.get(profile)
    if not filename:
        raise ValueError(f"Profil inconnu : {profile}")
    path = BASE_PATH / filename
    data = load_cleaning_dict(path)
    data[wrong] = correct
    # Serialize before opening the file: opening in "w" mode truncates it, so
    # a value that json cannot encode would otherwise leave a damaged
    # dictionary on disk.
    payload = json.dumps(data, indent=2, ensure_ascii=False)
    with open(path, "w", encoding="utf-8") as f:
        f.write(payload)
|