# llm_ticket3/llm_classes/llama_vision.py

"""
Module for Llama Vision support.
Optimized for English-only mode, eliminating intermediate translations.
"""
import os
import io
import base64
import logging
from datetime import datetime, timedelta
from typing import Any, Dict

import requests
from PIL import Image

from .base_llm import BaseLLM

logger = logging.getLogger("LlamaVision")


class LlamaVision(BaseLLM):
"""
Interface class with Llama Vision model via its API.
Optimized to work exclusively in English.
"""
def __init__(self, modele: str = "llama3.2-vision:90b-instruct-q8_0"):
super().__init__(modele)
# Default configuration optimized for English
self.configurer(
temperature=0.2,
top_p=0.8,
max_tokens=4000
)
# Request timeout in seconds
self.request_timeout = 600
logger.info(f"Initializing LlamaVision with model {modele} (multilingual mode)")
    def urlBase(self) -> str:
        """
        Returns the base URL of the Ollama API.
        """
        return "http://217.182.105.173:11434/"

    def cleAPI(self) -> str:
        """
        Ollama does not require an API key by default.
        """
        return ""

    def urlFonction(self) -> str:
        """
        Returns the Ollama endpoint path for generating a response.
        """
        return "api/generate"

def _encoder_image_base64(self, chemin_image: str) -> str:
"""
Encodes an image in base64 for the API.
Args:
chemin_image: Path to the image to encode
Returns:
Base64 encoded image or empty string in case of error
"""
try:
# Check image size and reduce if too large
with Image.open(chemin_image) as img:
# If the image is too large, resize it
max_dim = 800 # Maximum dimension
width, height = img.size
if width > max_dim or height > max_dim:
# Calculate ratio to maintain proportions
ratio = min(max_dim / width, max_dim / height)
new_width = int(width * ratio)
new_height = int(height * ratio)
# Resize image
img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
# Convert to RGB if necessary (for formats like PNG with alpha channel)
                if img.mode in ("RGBA", "LA", "P"):
                    # Create a white background and compose the image on top
                    # to handle transparency; P and LA images are first
                    # converted to RGBA so the alpha channel can serve as mask
                    background = Image.new("RGB", img.size, (255, 255, 255))
                    if img.mode in ("P", "LA"):
                        img = img.convert("RGBA")
                    background.paste(img, mask=img.split()[3])
                    img = background
elif img.mode != "RGB":
img = img.convert("RGB")
                # Serialize the processed image to an in-memory JPEG buffer
buffer = io.BytesIO()
img.save(buffer, format="JPEG", quality=85)
buffer.seek(0)
# Encode in base64
encoded = base64.b64encode(buffer.read()).decode("utf-8")
return encoded
except Exception as e:
logger.error(f"Base64 encoding error for {chemin_image}: {e}")
try:
# Second attempt with a simpler approach
with Image.open(chemin_image) as img:
# Convert directly to RGB regardless of the image
img = img.convert("RGB")
buffer = io.BytesIO()
img.save(buffer, format="JPEG", quality=75)
buffer.seek(0)
encoded = base64.b64encode(buffer.read()).decode("utf-8")
return encoded
except Exception as e2:
logger.error(f"Second error during image optimization: {str(e2)}")
# Last resort: encode the original image without optimization
with open(chemin_image, "rb") as image_file:
encoded = base64.b64encode(image_file.read()).decode("utf-8")
return encoded

    def _preparer_contenu(self, question: str) -> Dict[str, Any]:
        """
        Builds the JSON payload expected by the Ollama /api/generate endpoint.
        """
        return {
            "model": self.modele,
            "prompt": question,
            "options": self.params,
            "stream": False
        }

def _traiter_reponse(self, reponse: requests.Response) -> str:
"""
Processes and returns the response provided by Ollama.
"""
data = reponse.json()
return data.get("response", "")
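
    def _finaliser_chrono(self) -> None:
        """
        Records the request end time and computes the processing duration.
        Centralizes the timing bookkeeping shared by the success, timeout,
        and error paths of interroger_avec_image.
        """
        self.heureFin = datetime.now()
        if self.heureDepart is not None:
            self.dureeTraitement = self.heureFin - self.heureDepart
        else:
            self.dureeTraitement = timedelta(0)
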
def interroger_avec_image(self, image_path: str, question: str, english_only: bool = True) -> str:
"""
Sends a multimodal request (image + text) to the API.
Args:
image_path: Path to the image
question: The prompt to send
english_only: If True, forces the response in English
Returns:
Model response
"""
url = self.urlBase() + self.urlFonction()
headers = {"Content-Type": "application/json"}
# Check that the image exists
if not os.path.exists(image_path):
logger.error(f"Image does not exist: {image_path}")
return f"Error: image {image_path} does not exist."
try:
# Encode image in base64
image_b64 = self._encoder_image_base64(image_path)
# Ensure the question starts with an instruction to respond in English
if english_only and not question.lower().startswith("[english"):
question = "[ENGLISH RESPONSE REQUESTED]\n\n" + question
            # Ollama's multimodal format: base64-encoded images are passed
            # in the "images" field of the payload, not inline in the prompt
            contenu = self._preparer_contenu(question)
            contenu["images"] = [image_b64]
self.heureDepart = datetime.now()
# Send request
response = requests.post(
url=url,
headers=headers,
json=contenu,
timeout=self.request_timeout
)
            self._finaliser_chrono()
# Response verification
if response.status_code in [200, 201]:
self.reponseErreur = False
return self._traiter_reponse(response)
else:
self.reponseErreur = True
return f"LlamaVision API Error ({response.status_code}): {response.text}"
except requests.exceptions.Timeout:
            self._finaliser_chrono()
self.reponseErreur = True
return "Timeout during API call. Image analysis took too long."
except Exception as e:
            self._finaliser_chrono()
self.reponseErreur = True
return f"Multimodal communication error: {str(e)}"