# llm_ticket3/formatters/markdown_to_json.py

import os
import re
import json
import sys

def parse_markdown(md_content):
    """Parse a Markdown ticket report into a dictionary.

    The document is cut into sections at every level-2 heading (a line that
    starts with ``## ``). Sections are recognized by the prefix of their
    heading text:

    * ``Informations du ticket`` -> fields merged into the result,
    * ``Messages`` -> list stored under the ``"messages"`` key,
    * ``Informations sur l'extraction`` -> fields merged into the result.

    Any other section (including text before the first heading) is ignored.

    Args:
        md_content: Full Markdown text of the report.

    Returns:
        dict: Parsed data with ``id``, ``code``, ``name`` and ``description``
        first (when present, in that order), followed by every other key in
        the order it was parsed.
    """
    data = {}

    # Split at "## " on the start of ANY line, including the very first one,
    # so a document that opens directly with a level-2 heading is not lost.
    # "### " sub-headings do not match because the third character must be a
    # space.
    sections = re.split(r"^## ", md_content, flags=re.MULTILINE)

    for section in sections:
        if section.startswith("Informations du ticket"):
            data.update(parse_ticket_info(section))
        elif section.startswith("Messages"):
            data["messages"] = parse_messages(section)
        elif section.startswith("Informations sur l'extraction"):
            data.update(parse_extraction_info(section))

    # Put the identifying fields first so the description directly follows
    # the name in the output.
    leading_fields = ("id", "code", "name", "description")
    ordered_data = {
        field: data[field] for field in leading_fields if field in data
    }

    # Then append every remaining field in its original order.
    for key, value in data.items():
        if key not in ordered_data:
            ordered_data[key] = value

    return ordered_data

def parse_ticket_info(section):
    """Extract ticket fields from the "Informations du ticket" section.

    Every line shaped like ``- **Key**: value`` becomes an entry whose key is
    lower-cased, with ``/`` and spaces replaced by ``_``, and whose value is
    the stripped text after the colon.

    The ``description`` field is handled separately because it may span
    several lines: it starts at the ``- **description**:`` marker (any text
    on the marker line itself is kept) and runs until the next line that
    starts with ``- **`` or ``## ``, or until the end of the section.

    Args:
        section: Text of the section.

    Returns:
        dict: Parsed fields as strings. The ``description`` key is always
        present; it is an empty string when no description was found.
    """
    info = {}
    description_lines = []
    capturing_description = False

    for line in section.strip().split("\n"):
        if capturing_description:
            # Decide from the CURRENT line: looking one line ahead would drop
            # the last description line and could swallow a field line that
            # directly follows the marker.
            if line.startswith(("## ", "- **")):
                capturing_description = False
                info["description"] = "\n".join(description_lines).strip()
                # No `continue`: this terminating line may itself be a field
                # and must go through the normal parsing below.
            else:
                description_lines.append(line)
                continue

        # Start of the (possibly multi-line) description.
        desc_match = re.match(r"- \*\*description\*\*:(.*)", line)
        if desc_match:
            capturing_description = True
            inline_text = desc_match.group(1).strip()
            if inline_text:
                description_lines.append(inline_text)
            continue

        # Regular single-line field.
        match = re.match(r"- \*\*(.*?)\*\*: (.*)", line)
        if match:
            key, value = match.groups()
            key = key.lower().replace("/", "_").replace(" ", "_")
            info[key] = value.strip()

    # The section may end while the description is still being captured.
    if capturing_description and description_lines:
        info["description"] = "\n".join(description_lines).strip()
    else:
        info.setdefault("description", "")

    return info

def parse_messages(section):
    """Parse the "Messages" section into a list of message dictionaries.

    A new message starts at every line beginning with ``### Message``. Inside
    a message, a line shaped like ``**Key**: value`` is stored as a field
    (key lower-cased, ``/`` and spaces replaced by ``_``, value stripped).
    Every other line, including a line that merely starts with bold text, is
    appended to the message's ``content`` followed by a newline. Lines that
    appear before the first ``### Message`` heading are ignored.

    Args:
        section: Text of the section.

    Returns:
        list[dict]: One dictionary per message, in document order. A message
        that ends up with neither fields nor content is not included.
    """
    messages = []
    current_message = {}
    in_message = False

    for line in section.strip().split("\n"):
        if line.startswith("### Message"):
            # Close the previous message (if it collected anything).
            if current_message:
                messages.append(current_message)
                current_message = {}
            in_message = True
            continue

        if not in_message:
            continue

        # re.match anchors at the start, so only lines beginning with "**"
        # can be metadata.
        match = re.match(r"\*\*(.*?)\*\*: (.*)", line)
        if match:
            key, value = match.groups()
            key = key.lower().replace("/", "_").replace(" ", "_")
            current_message[key] = value.strip()
        else:
            # Anything that is not a metadata pair is body text. This keeps
            # lines such as "**Important** ..." instead of silently
            # discarding them.
            current_message["content"] = (
                current_message.get("content", "") + line + "\n"
            )

    if current_message:
        messages.append(current_message)

    return messages


def parse_extraction_info(section):
    """Collect the ``- **Key**: value`` pairs of the extraction section.

    Keys are lower-cased and have ``/`` and spaces replaced by ``_``; values
    are stripped of surrounding whitespace. Lines that do not follow the
    pattern are skipped.

    Args:
        section: Text of the section.

    Returns:
        dict: Normalized key -> value string for every matching line.
    """
    field_pattern = re.compile(r"- \*\*(.*?)\*\*: (.*)")
    parsed = {}

    for found in map(field_pattern.match, section.strip().split("\n")):
        if found is None:
            continue
        raw_key, raw_value = found.groups()
        normalized_key = raw_key.lower().replace("/", "_").replace(" ", "_")
        parsed[normalized_key] = raw_value.strip()

    return parsed


def convert_markdown_to_json(md_file_path, output_file_path):
    """Read a Markdown ticket report and write its parsed form as JSON.

    The input is read as UTF-8 and parsed with ``parse_markdown``. If the
    parsed data has no ``description`` key, an empty one is added, placed
    right after ``name`` when that key exists and at the end otherwise. The
    result is written as UTF-8 JSON with a 4-space indent and non-ASCII
    characters kept as-is, then a confirmation message is printed.

    Args:
        md_file_path: Path of the Markdown file to read.
        output_file_path: Path of the JSON file to create or overwrite.

    Errors raised while reading, parsing or writing are not caught here.
    """
    with open(md_file_path, 'r', encoding='utf-8') as source:
        parsed = parse_markdown(source.read())

    # Guarantee a "description" entry in the output.
    if "description" not in parsed:
        completed = {}
        for field, content in parsed.items():
            completed[field] = content
            if field == "name":
                # Keep the description directly after the name.
                completed["description"] = ""
        # No "name" key was seen: the description goes last.
        completed.setdefault("description", "")
        parsed = completed

    with open(output_file_path, 'w', encoding='utf-8') as target:
        json.dump(parsed, target, indent=4, ensure_ascii=False)

    print(f"Conversion terminée. Fichier JSON créé : {output_file_path}")

# Simplified wrapper intended to be used when this module is imported
def markdown_to_json(md_file_path, output_file_path):
    """
    Convert a Markdown file into a JSON file.

    Any exception raised by the conversion is caught, reported on standard
    output, and turned into a ``False`` return value.

    Args:
        md_file_path: Path of the Markdown file to convert
        output_file_path: Path of the JSON file to create

    Returns:
        bool: True if the conversion succeeded, False otherwise
    """
    try:
        convert_markdown_to_json(md_file_path, output_file_path)
    except Exception as error:
        print(f"Erreur lors de la conversion Markdown → JSON: {error}")
        return False
    else:
        return True

if __name__ == "__main__":
    # Command-line entry point: expects the input Markdown path followed by
    # the output JSON path; prints the usage line and exits with status 1
    # when either is missing.
    if len(sys.argv) >= 3:
        md_file, output_file = sys.argv[1], sys.argv[2]
        convert_markdown_to_json(md_file, output_file)
    else:
        print("Utilisation : python markdown_to_json.py <fichier_markdown.md> <fichier_sortie.json>")
        sys.exit(1)