llm_ticket3/utils/utils.py
2025-04-03 14:19:02 +02:00

23 lines
569 B
Python

import json
from typing import Any
def save_json(data: Any, path: str):
    """Serialize *data* as JSON and write it to the file at *path*.

    The target is opened in write mode with UTF-8 encoding, so any existing
    content is replaced. The output is indented by two spaces and non-ASCII
    characters are written verbatim instead of being escaped.
    """
    with open(path, mode="w", encoding="utf-8") as out_file:
        json.dump(data, out_file, ensure_ascii=False, indent=2)
def clean_html(html_content: str) -> str:
    """Strip HTML markup from *html_content* and return normalized plain text.

    BeautifulSoup (with the ``html.parser`` backend) is used to extract the
    text when the ``bs4`` package can be imported. Otherwise a regex-based
    fallback removes anything that looks like a tag. In both cases HTML
    entities are decoded and every run of whitespace is collapsed to a single
    space, with leading and trailing whitespace removed.

    Args:
        html_content: The HTML fragment or document to clean.

    Returns:
        The plain-text content, or an empty string for empty input.
    """
    import re
    from html import unescape

    # Nothing to parse; also avoids handing None/"" to the parser.
    if not html_content:
        return ""

    # bs4 is an optional dependency: an unconditional import would raise
    # ImportError and make the regex fallback below unreachable.
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        BeautifulSoup = None

    if BeautifulSoup is not None:
        text = BeautifulSoup(html_content, "html.parser").get_text()
    else:
        # "[^>]*" (rather than ".*?") also matches tags that span several
        # lines, since "." does not match a newline by default.
        text = re.sub(r"<[^>]*>", "", html_content)

    # Decode entities only after the tags are gone, so escaped markup such as
    # "&lt;b&gt;" survives as literal text instead of being stripped.
    # NOTE(review): BeautifulSoup presumably decodes entities already, so
    # double-encoded input (e.g. "&amp;lt;") is decoded twice on that path —
    # confirm this is intended.
    text = unescape(text)
    return re.sub(r"\s+", " ", text).strip()