ragflow_preprocess/test_components.py
2025-03-27 17:59:10 +01:00

234 lines
9.9 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Test script to verify critical components
"""
import os
import sys
import platform
import requests
import time
import subprocess
import json
from typing import List, Dict, Any
def _tesseract_windows_candidates():
    """Return install locations to probe on Windows when PATH lookup fails."""
    home = os.path.expanduser("~")
    return [
        r"C:\Program Files\Tesseract-OCR\tesseract.exe",
        r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
        r"C:\Tesseract-OCR\tesseract.exe",
        # Per-user installs for whoever is running the script, so the check
        # does not depend on one specific account name.
        os.path.join(home, "AppData", "Local", "Programs", "Tesseract-OCR", "tesseract.exe"),
        os.path.join(home, "Tesseract-OCR", "tesseract.exe"),
        # Historical hard-coded locations, kept for backward compatibility.
        r"C:\Users\PCDEV\AppData\Local\Programs\Tesseract-OCR\tesseract.exe",
        r"C:\Users\PCDEV\Tesseract-OCR\tesseract.exe",
    ]


def _tesseract_output(executable, flag):
    """Run ``executable flag`` and return its stripped text output.

    Returns None when the process exits with a non-zero status.
    Raises OSError / subprocess.SubprocessError if it cannot be run or
    does not finish within the timeout.
    """
    result = subprocess.run(
        [executable, flag], capture_output=True, text=True, timeout=10
    )
    if result.returncode != 0:
        return None
    # NOTE(review): some Tesseract builds reportedly write this information to
    # stderr instead of stdout; falling back to stderr covers both cases.
    return (result.stdout or result.stderr).strip()


# Check Tesseract OCR installation
def check_tesseract():
    """Print a diagnostic report about the local Tesseract OCR installation.

    The report covers, in order:
      * whether ``pytesseract`` and Pillow can be imported;
      * where the ``tesseract`` executable is (PATH first, then, on Windows,
        a list of usual install folders);
      * the Tesseract version and the installed language packs;
      * installation instructions.

    When the executable is only found outside PATH,
    ``pytesseract.pytesseract.tesseract_cmd`` is pointed at it.

    Returns:
        None. All results are written to standard output.
    """
    # Local import: only this function needs it.
    import shutil

    print("\n=== Checking Tesseract OCR ===")

    try:
        import pytesseract
        # Imported only to confirm that Pillow is installed.
        from PIL import Image  # noqa: F401
    except ImportError as e:
        print(f"Error: {e}")
        print("Tesseract OCR or its Python dependencies are not properly installed")
        print("Install them with: pip install pytesseract Pillow")
        return

    # Resolve the executable once; every later step reuses this path.
    tesseract_cmd = shutil.which("tesseract")
    if tesseract_cmd is not None:
        print(f"Tesseract found in PATH: {tesseract_cmd}")
    elif platform.system() == "Windows":
        print("Tesseract is not in PATH. Searching in standard locations...")
        tesseract_cmd = next(
            (p for p in _tesseract_windows_candidates() if os.path.exists(p)),
            None,
        )
        if tesseract_cmd is not None:
            pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
            print(f"Tesseract found at: {tesseract_cmd}")

    if tesseract_cmd is None:
        print("✗ Tesseract executable not found")
    else:
        # Test Tesseract with a version command
        try:
            version_text = _tesseract_output(tesseract_cmd, "--version")
        except (OSError, subprocess.SubprocessError) as e:
            print(f"Error checking Tesseract version: {e}")
        else:
            if version_text:
                # First line looks like "tesseract 5.3.0": the version is the
                # second token, the first one is just the program name.
                tokens = version_text.splitlines()[0].split()
                version = tokens[1] if len(tokens) > 1 else tokens[0]
                print(f"Tesseract version: {version}")
            else:
                print("Could not determine Tesseract version")

        # Check installed languages
        try:
            langs_text = _tesseract_output(tesseract_cmd, "--list-langs")
        except (OSError, subprocess.SubprocessError) as e:
            print(f"Error checking Tesseract languages: {e}")
        else:
            if langs_text:
                # Skip the first line (header), keep one language per line.
                langs = [
                    line.strip()
                    for line in langs_text.splitlines()[1:]
                    if line.strip()
                ]
                print(f"Installed languages: {', '.join(langs)}")
            else:
                print("Could not list Tesseract languages")

    print("\nINSTRUCTIONS FOR TESSERACT OCR:")
    print("1. If Tesseract is not installed, download it from:")
    print(" https://github.com/UB-Mannheim/tesseract/wiki")
    print("2. Make sure to install French (fra) and English (eng) languages")
    print("3. Check the 'Add to PATH' option during installation")
def _list_ollama_models(base_url):
    """Print the models reported by ``GET {base_url}/api/tags``."""
    try:
        response = requests.get(f"{base_url}/api/tags", timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"✗ Error retrieving models: {str(e)}")
        return

    if response.status_code != 200:
        print(f"✗ Error retrieving models: status {response.status_code}")
        return

    try:
        models = response.json().get("models", [])
    except ValueError as e:
        # Body was not valid JSON.
        print(f"✗ Error retrieving models: {str(e)}")
        return

    if not models:
        print("No models found on Ollama server")
        return

    print(f"✓ Available models ({len(models)}):")
    for model in models:
        print(f" - {model.get('name', 'Unknown')} ({model.get('size', 'Unknown size')})")


def _test_ollama_model(base_url, model="mistral"):
    """Send one short prompt to ``POST {base_url}/api/generate`` and print the result."""
    print(f"\nTesting a simple model ({model})...")
    payload = {
        "model": model,
        "prompt": "Say hello in English",
        # Ask for a single JSON document. Without this the server streams one
        # JSON object per line, which response.json() cannot parse and which
        # would leave only the first token available.
        "stream": False,
        "options": {
            "temperature": 0.1
        }
    }

    try:
        start_time = time.perf_counter()
        response = requests.post(f"{base_url}/api/generate", json=payload, timeout=30)
        elapsed_time = time.perf_counter() - start_time
    except requests.exceptions.RequestException as e:
        print(f"✗ Error testing model: {str(e)}")
        return

    if response.status_code != 200:
        print(f"✗ Error testing model: status {response.status_code}")
        print(f" Body: {response.text[:200]}")
        return

    try:
        result = response.json()
    except ValueError as e:
        # Fallback for a server that streamed line-delimited JSON anyway:
        # the first line is still a complete JSON object.
        print(f"✗ JSON parsing error: {str(e)}")
        print(" Trying to process first line only...")
        first_line = response.text.strip().split("\n", 1)[0]
        try:
            result = json.loads(first_line)
        except ValueError:
            print("✗ Failed to parse first line as JSON")
            print(f" Raw response (first 200 chars): {response.text[:200]}")
            return
        print(f"✓ Test successful with first line parsing in {elapsed_time:.2f} seconds")
        print(f" Response: {result.get('response', 'No response')[:100]}...")
        return

    print(f"✓ Test successful in {elapsed_time:.2f} seconds")
    print(f" Response: {result.get('response', 'No response')[:100]}...")


# Check connection to Ollama
def check_ollama(endpoint="http://217.182.105.173:11434"):
    """Print a diagnostic report about an Ollama server.

    Steps, each reported on standard output:
      1. ``GET /api/version`` to confirm the server answers.
      2. ``GET /api/tags`` to list the available models.
      3. ``POST /api/generate`` with a short prompt for the ``mistral`` model.
    Steps 2 and 3 run only when step 1 returns HTTP 200. Troubleshooting
    instructions are always printed at the end.

    Args:
        endpoint: Base URL of the Ollama HTTP API. A trailing slash is
            tolerated.

    Returns:
        None.
    """
    print("\n=== Checking Ollama connection ===")
    print(f"Endpoint: {endpoint}")

    # Avoid "//api/..." URLs when the caller passes a trailing slash.
    base_url = endpoint.rstrip("/")

    # Test basic connection
    try:
        response = requests.get(f"{base_url}/api/version", timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"✗ Unable to connect to Ollama: {str(e)}")
    else:
        if response.status_code == 200:
            try:
                version = response.json().get("version", "unknown")
            except ValueError:
                # The server answered, but not with JSON.
                version = "unknown"
            print(f"✓ Connection to Ollama successful - Version: {version}")
            _list_ollama_models(base_url)
            _test_ollama_model(base_url)
        else:
            print(f"✗ Error connecting to Ollama: status {response.status_code}")
            print(f" Body: {response.text[:200]}")

    print("\nINSTRUCTIONS FOR OLLAMA:")
    print("1. Verify that the Ollama server is running at the specified address")
    print("2. Verify that port 11434 is open and accessible")
    print("3. Check Ollama server logs for potential issues")
# Check Python environment
def check_python_env():
    """Print interpreter details and whether each required package imports.

    Every package is probed by actually importing it, so a package counts as
    missing whenever its import raises ImportError.

    Returns:
        None. The report is written to standard output.
    """
    print("\n=== Checking Python environment ===")
    print(f"Python {sys.version}")
    print(f"Platform: {platform.platform()}")

    # These are import names, which is what __import__ needs.
    module_names = ("PyQt6", "PyPDF2", "pytesseract", "requests", "fitz")
    print("\nChecking required packages:")
    for module_name in module_names:
        try:
            __import__(module_name)
        except ImportError:
            print(f"{module_name} is NOT installed")
        else:
            print(f"{module_name} is installed")

    # Pillow is reported separately because it is installed as "Pillow" but
    # imported as "PIL".
    try:
        import PIL  # noqa: F401
    except ImportError:
        print("✗ PIL (Pillow) is NOT installed")
    else:
        print("✓ PIL (Pillow) is installed")

    print("\nINSTRUCTIONS FOR PYTHON ENVIRONMENT:")
    print("1. Make sure you're using the virtual environment if configured")
    print("2. Install missing packages with: pip install -r requirements.txt")
# Main function
def main():
    """Run every component check in order and print closing guidance.

    The order is: Python environment, Tesseract OCR, then the Ollama
    connection (with its default endpoint).
    """
    print("=== Testing critical components ===")

    # Each check prints its own report and takes no required arguments.
    for run_check in (check_python_env, check_tesseract, check_ollama):
        run_check()

    print("\n=== Checks completed ===")
    print("If issues were detected, follow the displayed instructions")
    print("After fixing issues, run this script again to verify")
# Run the checks only when executed as a script, not when imported.
if __name__ == "__main__":
    main()