llm_ticket3/llm_classes/llama_vision.py

"""
Module for Llama Vision support.
Optimized for English-only mode, eliminating intermediate translations.
"""

import os
import json
import logging
import time
import base64
import requests
from typing import Dict, Any, Optional, List, Union
from PIL import Image
import io
from datetime import datetime, timedelta

from .base_llm import BaseLLM

logger = logging.getLogger("LlamaVision")

class LlamaVision(BaseLLM):
    """
    Interface class with Llama Vision model via its API.
    Optimized to work exclusively in English.
    """

    def __init__(self, modele: str = "llama3.2-vision:90b-instruct-q8_0"):
        # Normaliser le nom du modèle avant de le passer au parent
        normalized_model = self._normalize_model_name(modele)
        super().__init__(normalized_model)

        # Default configuration optimized for English
        self.configurer(
            temperature=0.2,
            top_p=0.8,
            max_tokens=4000
        )

        # Request timeout in seconds
        self.request_timeout = 600

        # Default system prompt (in English)
        self.prompt_system = """You are a helpful AI assistant with vision capabilities.
You are interacting with images provided by the user.
Respond in English unless specifically instructed otherwise."""

        logger.info(f"Initializing LlamaVision with model {normalized_model} (English-only mode)")

    def _normalize_model_name(self, model_name: str) -> str:
        """
        Normalizes model name to ensure compatibility across systems.

        Args:
            model_name: Original model name

        Returns:
            Normalized model name
        """
        # Check if model name already contains the backend-specific format
        if ":" not in model_name and model_name != "llama3.2-vision:90b-instruct-q8_0":
            # Convert standardized name to backend format
            if model_name == "llama3-vision-90b-instruct":
                return "llama3.2-vision:90b-instruct-q8_0"

        # For pipeline_logger compatibility, ensure it's a normalized format
        # that other parts of the code can recognize for normalization
        if model_name.startswith("image_"):
            model_name = model_name[6:]  # Remove redundant prefix

        # Make sure a standard model name is returned for pipeline_logger
        # This is a backup in case pipeline_logger fails to normalize
        if ":" in model_name and "llama" in model_name.lower():
            self.pipeline_normalized_name = "llama3-vision-90b-instruct"
        else:
            # Store normalized version for pipeline_logger
            self.pipeline_normalized_name = model_name

        # Return original name if it already has the correct format for Ollama
        return model_name

    def urlBase(self) -> str:
        """
        Returns the base URL of the Ollama API.
        """
        return "http://217.182.105.173:11434/"

    def cleAPI(self) -> str:
        """
        Ollama doesn't require an API key by default.
        """
        return ""

    def urlFonction(self) -> str:
        """
        Returns the specific Ollama URL for generating a response.
        """
        return "api/generate"

    def _encoder_image_base64(self, chemin_image: str) -> str:
        """
        Encodes an image in base64 for the API.

        Args:
            chemin_image: Path to the image to encode

        Returns:
            Base64 encoded image or empty string in case of error
        """
        try:
            # Check image size and reduce if too large
            with Image.open(chemin_image) as img:
                # If the image is too large, resize it
                max_dim = 800  # Maximum dimension
                width, height = img.size

                if width > max_dim or height > max_dim:
                    # Calculate ratio to maintain proportions
                    ratio = min(max_dim / width, max_dim / height)
                    new_width = int(width * ratio)
                    new_height = int(height * ratio)

                    # Resize image
                    img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)

                # Convert to RGB if necessary (for formats like PNG with alpha channel)
                if img.mode in ("RGBA", "LA", "P"):
                    # Create a white background and compose the image on top to handle transparency
                    background = Image.new("RGB", img.size, (255, 255, 255))
                    if img.mode == "P":
                        img = img.convert("RGBA")
                    background.paste(img, mask=img.split()[3] if img.mode == "RGBA" else None)
                    img = background
                elif img.mode != "RGB":
                    img = img.convert("RGB")

                # Temporarily save the resized image
                buffer = io.BytesIO()
                img.save(buffer, format="JPEG", quality=85)
                buffer.seek(0)

                # Encode in base64
                encoded = base64.b64encode(buffer.read()).decode("utf-8")
                return encoded
        except Exception as e:
            logger.error(f"Base64 encoding error for {chemin_image}: {e}")
            try:
                # Second attempt with a simpler approach
                with Image.open(chemin_image) as img:
                    # Convert directly to RGB regardless of the image
                    img = img.convert("RGB")
                    buffer = io.BytesIO()
                    img.save(buffer, format="JPEG", quality=75)
                    buffer.seek(0)
                    encoded = base64.b64encode(buffer.read()).decode("utf-8")
                    return encoded
            except Exception as e2:
                logger.error(f"Second error during image optimization: {str(e2)}")
                # Last resort: encode the original image without optimization
                with open(chemin_image, "rb") as image_file:
                    encoded = base64.b64encode(image_file.read()).decode("utf-8")
                return encoded

    def _preparer_contenu(self, question: str) -> Dict[str, Any]:
        """
        Prepares the request content specific to Ollama.
        """
        return {
            "model": self.modele,
            "prompt": question,
            "options": {
                "temperature": self.params.get("temperature", 0.2),
                "top_p": self.params.get("top_p", 0.8),
                "num_predict": self.params.get("max_tokens", 1024),
                "stop": self.params.get("stop", []),
                "top_k": 30,
                "num_ctx": 1024,
                "repeat_penalty": 1.1,
                "repeat_last_n": 64,
                "mirostat": 0,
                "mirostat_eta": 0.1,
                "mirostat_tau": 5,
                "keep_alive": int(timedelta(minutes=2).total_seconds()),
                "min_p": 0,
                "seed": 0,
            },
            "stream": False
        }

    def _traiter_reponse(self, reponse: requests.Response) -> str:
        """
        Processes and returns the response provided by Ollama.
        """
        data = reponse.json()
        return data.get("response", "")

    def interroger_avec_image(self, image_path: str, question: str, english_only: bool = True) -> str:
        """
        Sends a multimodal request (image + text) to the API.

        Args:
            image_path: Path to the image
            question: The prompt to send
            english_only: If True, forces the response in English

        Returns:
            Model response
        """
        url = self.urlBase() + self.urlFonction()
        headers = {"Content-Type": "application/json"}

        # Check that the image exists
        if not os.path.exists(image_path):
            logger.error(f"Image does not exist: {image_path}")
            return f"Error: image {image_path} does not exist."

        try:
            # Encode image in base64
            image_b64 = self._encoder_image_base64(image_path)

            # Ensure the question starts with an instruction to respond in English
            if english_only and not question.lower().startswith("[english"):
                question = "[ENGLISH RESPONSE REQUESTED]\n\n" + question

            # Special format for Ollama multimodal
            prompt = f"""<image>
{image_b64}
</image>

{question}"""

            # Prepare the request using the base method
            contenu = self._preparer_contenu(prompt)

            self.heureDepart = datetime.now()

            # Send request
            response = requests.post(
                url=url,
                headers=headers,
                json=contenu,
                timeout=self.request_timeout
            )

            self.heureFin = datetime.now()
            if self.heureDepart is not None:
                self.dureeTraitement = self.heureFin - self.heureDepart
            else:
                self.dureeTraitement = timedelta(0)

            # Response verification
            if response.status_code in [200, 201]:
                self.reponseErreur = False
                return self._traiter_reponse(response)
            else:
                self.reponseErreur = True
                return f"LlamaVision API Error ({response.status_code}): {response.text}"

        except requests.exceptions.Timeout:
            self.heureFin = datetime.now()
            if self.heureDepart is not None:
                self.dureeTraitement = self.heureFin - self.heureDepart
            else:
                self.dureeTraitement = timedelta(0)
            self.reponseErreur = True
            return "Timeout during API call. Image analysis took too long."

        except Exception as e:
            self.heureFin = datetime.now()
            if self.heureDepart is not None:
                self.dureeTraitement = self.heureFin - self.heureDepart
            else:
                self.dureeTraitement = timedelta(0)
            self.reponseErreur = True
            return f"Multimodal communication error: {str(e)}"