# Mirror of https://github.com/Ladebeze66/llm_ticket3.git
# Synced 2025-12-16 04:17:48 +01:00 (23 lines, 569 B, Python)
import json
|
|
from typing import Any
|
|
|
|
def save_json(data: Any, path: str):
    """Write *data* to *path* as pretty-printed UTF-8 JSON.

    The document is serialized with a two-space indent, and non-ASCII
    characters are written literally instead of being escaped.

    Serialization happens before the target file is opened, so a value
    that cannot be encoded raises without truncating or partially
    overwriting a file that already exists at *path*.

    Args:
        data: Any JSON-serializable value.
        path: Destination file path; an existing file is overwritten.

    Raises:
        TypeError: If *data* contains a value ``json`` cannot serialize.
        OSError: If *path* cannot be opened for writing.
    """
    # Encode first: opening with "w" truncates immediately, so doing it
    # before a failing encode would destroy the previous contents.
    payload = json.dumps(data, indent=2, ensure_ascii=False)
    with open(path, "w", encoding="utf-8") as f:
        f.write(payload)
|
|
|
|
def clean_html(html_content: str) -> str:
    """Return the text of *html_content* with tags removed and whitespace collapsed.

    BeautifulSoup (``html.parser`` backend) is used when the ``bs4``
    package can be imported; otherwise tags are stripped with a regular
    expression. In both cases HTML entities are decoded, every run of
    whitespace is replaced by a single space, and the result is stripped.

    Args:
        html_content: HTML markup to clean. Empty or ``None`` input
            yields an empty string.

    Returns:
        The cleaned plain text on a single line.
    """
    import re
    from html import unescape

    if not html_content:
        return ""

    # bs4 is optional: a missing package must select the regex fallback
    # rather than raise ImportError out of this function.
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        BeautifulSoup = None

    if BeautifulSoup is not None:
        text = BeautifulSoup(html_content, "html.parser").get_text()
    else:
        # DOTALL lets the pattern remove tags whose attributes span lines.
        text = re.sub(r"<.*?>", "", html_content, flags=re.DOTALL)

    # Applied after both branches, as in the original implementation.
    text = unescape(text)
    return re.sub(r"\s+", " ", text).strip()
|