llm_ticket3/utils/ocr_cleaner.py
2025-05-05 17:37:32 +02:00

32 lines
993 B
Python

import json
from pathlib import Path
# Location of the JSON cleaning dictionary (stored next to this module).
DICT_PATH = Path(__file__).parent / "ocr_clean_dict.json"


def load_cleaning_dict(path=DICT_PATH):
    """Load the cleaning dictionary from a JSON file.

    Args:
        path: Location of the JSON file, as a ``pathlib.Path`` or a plain
            string. Defaults to ``DICT_PATH``.

    Returns:
        The decoded JSON content, or an empty dict when the file does not
        exist.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain valid
            JSON.
    """
    try:
        # Open directly instead of testing path.exists() first: open()
        # accepts both str and Path, and there is no window in which the
        # file can vanish between the check and the read.
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (FileNotFoundError, NotADirectoryError):
        # A missing dictionary simply means "no corrections yet".
        return {}
def clean_ocr_text(text, cleaning_dict=None):
    """Apply dictionary-based corrections to an OCR or translated text.

    Each ``wrong -> correct`` pair is applied with ``str.replace`` in the
    dictionary's iteration order, so a later entry may act on the output of
    an earlier one.

    Args:
        text: The string to clean.
        cleaning_dict: Mapping of erroneous substrings to their
            replacements. When ``None``, the mapping is obtained from
            ``load_cleaning_dict()``.

    Returns:
        The corrected text. Falsy input (empty string or ``None``) is
        returned unchanged.
    """
    if not text:
        # Nothing to correct; also avoids loading the dictionary for no reason.
        return text
    if cleaning_dict is None:
        cleaning_dict = load_cleaning_dict()
    for wrong, correct in cleaning_dict.items():
        if not wrong:
            # str.replace("", x) inserts x between every character, which
            # would corrupt the whole text; ignore such entries.
            continue
        text = text.replace(wrong, correct)
    return text
def add_to_cleaning_dict(wrong, correct, path=DICT_PATH):
    """Add (or overwrite) one error/correction pair in the JSON dictionary.

    The current content is read with ``load_cleaning_dict(path)``, the pair
    is set, and the whole mapping is written back to ``path``.

    Args:
        wrong: Erroneous text, used as the key.
        correct: Replacement stored for that key.
        path: JSON file to update (``pathlib.Path`` or string). Defaults to
            ``DICT_PATH``.

    Raises:
        TypeError: If the updated mapping cannot be serialized to JSON; the
            file on disk is left untouched in that case.
    """
    data = load_cleaning_dict(path)
    data[wrong] = correct
    # Serialize before opening the file: mode "w" truncates immediately, so a
    # serialization error raised while the file is open would wipe the
    # existing dictionary.
    payload = json.dumps(data, indent=2, ensure_ascii=False)
    with open(path, "w", encoding="utf-8") as f:
        f.write(payload)