mirror of
https://github.com/Ladebeze66/llm_ticket3.git
synced 2025-12-15 20:36:50 +01:00
274 lines
10 KiB
Python
274 lines
10 KiB
Python
"""
Module for Llama Vision support.

Optimized for English-only mode, eliminating intermediate translations.
"""
|
|
|
|
import os
|
|
import json
|
|
import logging
|
|
import time
|
|
import base64
|
|
import requests
|
|
from typing import Dict, Any, Optional, List, Union
|
|
from PIL import Image
|
|
import io
|
|
from datetime import datetime, timedelta
|
|
|
|
from .base_llm import BaseLLM
|
|
|
|
# Module-level logger for this vision backend (named after the class, not __name__).
logger = logging.getLogger("LlamaVision")
|
|
|
|
class LlamaVision(BaseLLM):
    """
    Interface class with Llama Vision model via its API.

    Talks to an Ollama server (``/api/generate`` endpoint) hosting a
    Llama 3.2 Vision model. Optimized to work exclusively in English.
    """

    def __init__(self, modele: str = "llama3.2-vision:90b-instruct-q8_0"):
        """
        Args:
            modele: Model identifier; normalized to the Ollama tag format
                before being handed to the parent class.
        """
        # Normalize the model name before passing it to the parent.
        normalized_model = self._normalize_model_name(modele)
        super().__init__(normalized_model)

        # Default sampling configuration optimized for English output.
        self.configurer(
            temperature=0.2,
            top_p=0.8,
            max_tokens=4000
        )

        # Request timeout in seconds (vision inference can be slow).
        self.request_timeout = 600

        # Default system prompt (in English); sent to Ollama as the
        # top-level "system" field (see _preparer_contenu).
        self.prompt_system = """You are a helpful AI assistant with vision capabilities.
You are interacting with images provided by the user.
Respond in English unless specifically instructed otherwise."""

        logger.info(f"Initializing LlamaVision with model {normalized_model} (English-only mode)")

    def _normalize_model_name(self, model_name: str) -> str:
        """
        Normalizes model name to ensure compatibility across systems.

        Args:
            model_name: Original model name

        Returns:
            Normalized model name (Ollama tag format when recognized)
        """
        # Convert the standardized pipeline name to the backend (Ollama) tag.
        if ":" not in model_name and model_name == "llama3-vision-90b-instruct":
            # Fix: also record the pipeline alias on this path, so the
            # attribute is always set regardless of which branch returns.
            self.pipeline_normalized_name = "llama3-vision-90b-instruct"
            return "llama3.2-vision:90b-instruct-q8_0"

        # For pipeline_logger compatibility, strip the redundant prefix.
        if model_name.startswith("image_"):
            model_name = model_name[6:]

        # Store a normalized alias for pipeline_logger as a backup in case
        # pipeline_logger fails to normalize the backend-specific tag itself.
        if ":" in model_name and "llama" in model_name.lower():
            self.pipeline_normalized_name = "llama3-vision-90b-instruct"
        else:
            self.pipeline_normalized_name = model_name

        # Return the name unchanged if it already has the Ollama format.
        return model_name

    def urlBase(self) -> str:
        """
        Returns the base URL of the Ollama API.

        Can be overridden with the ``OLLAMA_BASE_URL`` environment
        variable; defaults to the historical hard-coded host.
        """
        base = os.environ.get("OLLAMA_BASE_URL", "http://217.182.105.173:11434/")
        # urlFonction() is a relative path, so guarantee a trailing slash.
        return base if base.endswith("/") else base + "/"

    def cleAPI(self) -> str:
        """
        Ollama doesn't require an API key by default.
        """
        return ""

    def urlFonction(self) -> str:
        """
        Returns the specific Ollama URL for generating a response.
        """
        return "api/generate"

    def _encoder_image_base64(self, chemin_image: str) -> str:
        """
        Encodes an image in base64 for the API.

        The image is downscaled to at most 800 px on its longest side and
        re-encoded as JPEG to keep the payload small; transparency is
        flattened onto white (JPEG has no alpha channel).

        Args:
            chemin_image: Path to the image to encode

        Returns:
            Base64 encoded image; falls back to progressively simpler
            encodings on error (last resort: raw file bytes).
        """
        try:
            with Image.open(chemin_image) as img:
                # If the image is too large, resize it keeping proportions.
                max_dim = 800  # Maximum dimension in pixels
                width, height = img.size
                if width > max_dim or height > max_dim:
                    ratio = min(max_dim / width, max_dim / height)
                    img = img.resize(
                        (int(width * ratio), int(height * ratio)),
                        Image.Resampling.LANCZOS,
                    )

                # Flatten transparency onto a white background so the image
                # can be saved as JPEG (formats like PNG carry alpha).
                if img.mode in ("RGBA", "LA", "P"):
                    if img.mode == "P":
                        img = img.convert("RGBA")
                    background = Image.new("RGB", img.size, (255, 255, 255))
                    background.paste(img, mask=img.split()[3] if img.mode == "RGBA" else None)
                    img = background
                elif img.mode != "RGB":
                    img = img.convert("RGB")

                # Re-encode in-memory and return the base64 payload.
                buffer = io.BytesIO()
                img.save(buffer, format="JPEG", quality=85)
                return base64.b64encode(buffer.getvalue()).decode("utf-8")
        except Exception as e:
            logger.error(f"Base64 encoding error for {chemin_image}: {e}")
            try:
                # Second attempt with a simpler approach: force RGB, no resize.
                with Image.open(chemin_image) as img:
                    img = img.convert("RGB")
                    buffer = io.BytesIO()
                    img.save(buffer, format="JPEG", quality=75)
                    return base64.b64encode(buffer.getvalue()).decode("utf-8")
            except Exception as e2:
                logger.error(f"Second error during image optimization: {str(e2)}")
                # Last resort: encode the original bytes without optimization.
                with open(chemin_image, "rb") as image_file:
                    return base64.b64encode(image_file.read()).decode("utf-8")

    def _preparer_contenu(self, question: str) -> Dict[str, Any]:
        """
        Prepares the request payload for the Ollama ``/api/generate`` endpoint.

        Args:
            question: Prompt text to send.

        Returns:
            JSON-serializable payload dictionary.
        """
        payload: Dict[str, Any] = {
            "model": self.modele,
            "prompt": question,
            "options": {
                "temperature": self.params.get("temperature", 0.2),
                "top_p": self.params.get("top_p", 0.8),
                "num_predict": self.params.get("max_tokens", 1024),
                "stop": self.params.get("stop", []),
                "top_k": 30,
                "num_ctx": 1024,
                "repeat_penalty": 1.1,
                "repeat_last_n": 64,
                "mirostat": 0,
                "mirostat_eta": 0.1,
                "mirostat_tau": 5,
                "min_p": 0,
                "seed": 0,
            },
            # Fix: keep_alive is a top-level /api/generate parameter, not a
            # model option — keep the model loaded for 2 minutes between calls.
            "keep_alive": int(timedelta(minutes=2).total_seconds()),
            "stream": False,
        }
        # Fix: actually send the configured system prompt (it was previously
        # defined in __init__ but never included in the request).
        system = getattr(self, "prompt_system", "")
        if system:
            payload["system"] = system
        return payload

    def _traiter_reponse(self, reponse: requests.Response) -> str:
        """
        Processes and returns the response provided by Ollama.

        Returns an empty string if the "response" field is absent.
        """
        data = reponse.json()
        return data.get("response", "")

    def _arreter_chrono(self) -> None:
        """
        Records the request end time and computes the elapsed duration.

        Shared by every exit path of interroger_avec_image. Guards against
        heureDepart being unset (e.g. when encoding fails before the HTTP
        call starts) instead of raising AttributeError.
        """
        self.heureFin = datetime.now()
        depart = getattr(self, "heureDepart", None)
        if depart is not None:
            self.dureeTraitement = self.heureFin - depart
        else:
            self.dureeTraitement = timedelta(0)

    def interroger_avec_image(self, image_path: str, question: str, english_only: bool = True) -> str:
        """
        Sends a multimodal request (image + text) to the API.

        Args:
            image_path: Path to the image
            question: The prompt to send
            english_only: If True, forces the response in English

        Returns:
            Model response, or a human-readable error string on failure.
        """
        url = self.urlBase() + self.urlFonction()
        headers = {"Content-Type": "application/json"}

        # Check that the image exists before doing any work.
        if not os.path.exists(image_path):
            logger.error(f"Image does not exist: {image_path}")
            return f"Error: image {image_path} does not exist."

        try:
            # Encode image in base64
            image_b64 = self._encoder_image_base64(image_path)

            # Ensure the question starts with an instruction to respond in English.
            if english_only and not question.lower().startswith("[english"):
                question = "[ENGLISH RESPONSE REQUESTED]\n\n" + question

            # Fix: Ollama expects images as a top-level "images" list of
            # base64 strings; embedding the base64 blob inside the prompt
            # text (the previous approach) is not understood by the model
            # and overflows the 1024-token context window.
            contenu = self._preparer_contenu(question)
            contenu["images"] = [image_b64]

            self.heureDepart = datetime.now()

            # Send request
            response = requests.post(
                url=url,
                headers=headers,
                json=contenu,
                timeout=self.request_timeout,
            )

            self._arreter_chrono()

            # Response verification
            if response.status_code in [200, 201]:
                self.reponseErreur = False
                return self._traiter_reponse(response)
            self.reponseErreur = True
            return f"LlamaVision API Error ({response.status_code}): {response.text}"

        except requests.exceptions.Timeout:
            self._arreter_chrono()
            self.reponseErreur = True
            return "Timeout during API call. Image analysis took too long."

        except Exception as e:
            self._arreter_chrono()
            self.reponseErreur = True
            return f"Multimodal communication error: {str(e)}"