# ragflow_preprocess/test_components.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Test script to verify critical components
"""

import os
import sys
import platform
import requests
import time
import subprocess
import json
from typing import List, Dict, Any

def _tesseract_windows_candidates():
    """Return the Windows install locations probed when Tesseract is not in PATH."""
    candidates = [
        r"C:\Program Files\Tesseract-OCR\tesseract.exe",
        r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
        r"C:\Tesseract-OCR\tesseract.exe",
    ]

    # Per-user locations are derived from the environment so the check works
    # for whichever account runs the script.
    local_app_data = os.environ.get("LOCALAPPDATA")
    if local_app_data:
        candidates.append(
            os.path.join(local_app_data, "Programs", "Tesseract-OCR", "tesseract.exe")
        )
    user_profile = os.environ.get("USERPROFILE")
    if user_profile:
        candidates.append(os.path.join(user_profile, "Tesseract-OCR", "tesseract.exe"))

    # Historical hard-coded locations, kept as a last resort for compatibility.
    candidates.extend([
        r"C:\Users\PCDEV\AppData\Local\Programs\Tesseract-OCR\tesseract.exe",
        r"C:\Users\PCDEV\Tesseract-OCR\tesseract.exe",
    ])
    return candidates


def _tesseract_output(executable, flag):
    """Run ``executable flag`` and return its stripped output, or None on failure.

    Returns None when the command exits with a non-zero status or prints
    nothing. stderr is used when stdout is empty, because some Tesseract
    releases print informational output there.
    """
    result = subprocess.run(
        [executable, flag], capture_output=True, text=True, timeout=15
    )
    if result.returncode != 0:
        return None
    return result.stdout.strip() or result.stderr.strip() or None


def _tesseract_version(executable):
    """Return the version string reported by ``--version``, or None if unavailable."""
    output = _tesseract_output(executable, "--version")
    if not output:
        return None
    first_line = output.splitlines()[0].strip()
    tokens = first_line.split()
    # The banner looks like "tesseract 5.3.0": the version is the second
    # token, not the first. Unexpected formats are shown unmodified.
    if len(tokens) >= 2 and tokens[0].lower() == "tesseract":
        return tokens[1]
    return first_line


def _tesseract_languages(executable):
    """Return the installed language codes, or None when they cannot be determined."""
    output = _tesseract_output(executable, "--list-langs")
    if output:
        # The first line is a header ("List of available languages ...").
        return [line.strip() for line in output.splitlines()[1:] if line.strip()]

    # Fall back to the tessdata folder that sits next to the executable.
    tessdata_folder = os.path.join(os.path.dirname(executable), "tessdata")
    if os.path.isdir(tessdata_folder):
        suffix = ".traineddata"
        return sorted(
            name[:-len(suffix)]
            for name in os.listdir(tessdata_folder)
            if name.endswith(suffix)
        )
    return None


# Check Tesseract OCR installation
def check_tesseract():
    """Print a diagnostic report about the Tesseract OCR installation.

    Steps:
      1. Verify that ``pytesseract`` and Pillow can be imported; if not, print
         installation hints and stop.
      2. Locate the ``tesseract`` executable: first in PATH, then (Windows
         only) in a list of usual install folders. When it is found outside
         PATH, ``pytesseract`` is pointed at it.
      3. Print the version and the installed languages of that executable.
      4. Print installation instructions.

    The function only prints; it returns None and handles failures of the
    external command by printing an error line.
    """
    print("\n=== Checking Tesseract OCR ===")
    try:
        import pytesseract
        from PIL import Image  # noqa: F401 - imported only to verify Pillow is present
    except ImportError as e:
        print(f"Error: {e}")
        print("Tesseract OCR or its Python dependencies are not properly installed")
        print("Install them with: pip install pytesseract Pillow")
        return

    import shutil  # local import: only this check needs it

    # Resolve the executable once and reuse it for every later step.
    tesseract_exe = shutil.which("tesseract")
    if tesseract_exe:
        print(f"Tesseract found in PATH: {tesseract_exe}")
    elif platform.system() == "Windows":
        print("Tesseract is not in PATH. Searching in standard locations...")
        tesseract_exe = next(
            (path for path in _tesseract_windows_candidates() if os.path.isfile(path)),
            None,
        )
        if tesseract_exe:
            pytesseract.pytesseract.tesseract_cmd = tesseract_exe
            print(f"Tesseract found at: {tesseract_exe}")

    if tesseract_exe is None:
        print("Tesseract executable not found")
    else:
        # ValueError covers undecodable output (UnicodeDecodeError).
        expected_failures = (OSError, subprocess.SubprocessError, ValueError)

        try:
            version = _tesseract_version(tesseract_exe)
            if version:
                print(f"Tesseract version: {version}")
        except expected_failures as e:
            print(f"Error checking Tesseract version: {e}")

        try:
            langs = _tesseract_languages(tesseract_exe)
            if langs is not None:
                print(f"Installed languages: {', '.join(langs)}")
        except expected_failures as e:
            print(f"Error checking Tesseract languages: {e}")

    print("\nINSTRUCTIONS FOR TESSERACT OCR:")
    print("1. If Tesseract is not installed, download it from:")
    print("   https://github.com/UB-Mannheim/tesseract/wiki")
    print("2. Make sure to install French (fra) and English (eng) languages")
    print("3. Check the 'Add to PATH' option during installation")

def _ollama_check_version(base_url):
    """Query ``/api/version``, print the outcome, and return True on success."""
    try:
        response = requests.get(f"{base_url}/api/version", timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"✗ Unable to connect to Ollama: {str(e)}")
        return False

    if response.status_code != 200:
        print(f"✗ Error connecting to Ollama: status {response.status_code}")
        print(f"  Body: {response.text[:200]}")
        return False

    try:
        version_info = response.json()
    except ValueError:
        # The server answered, but not with JSON: report that instead of
        # mislabelling it as a connection failure.
        print("✗ Unexpected non-JSON response from the version endpoint")
        print(f"  Body: {response.text[:200]}")
        return False

    version = version_info.get("version", "unknown") if isinstance(version_info, dict) else "unknown"
    print(f"✓ Connection to Ollama successful - Version: {version}")
    return True


def _ollama_list_models(base_url):
    """Query ``/api/tags`` and print the models reported by the server."""
    try:
        response = requests.get(f"{base_url}/api/tags", timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"✗ Error retrieving models: {str(e)}")
        return

    if response.status_code != 200:
        print(f"✗ Error retrieving models: status {response.status_code}")
        return

    try:
        data = response.json()
    except ValueError as e:
        print(f"✗ Error retrieving models: {str(e)}")
        return

    models = data.get("models", []) if isinstance(data, dict) else []
    if not models:
        print("No models found on Ollama server")
        return

    print(f"✓ Available models ({len(models)}):")
    for model in models:
        print(f"  - {model.get('name', 'Unknown')} ({model.get('size', 'Unknown size')})")


def _ollama_join_stream(raw_text):
    """Concatenate the ``response`` fragments of a line-delimited JSON body.

    Parsing stops at the first line that is not valid JSON. Returns None when
    no line could be parsed at all.
    """
    fragments = []
    for line in raw_text.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            chunk = json.loads(line)
        except ValueError:
            break
        if isinstance(chunk, dict):
            fragments.append(str(chunk.get("response", "")))
    return "".join(fragments) if fragments else None


def _ollama_test_generation(base_url):
    """Send one short prompt to the ``mistral`` model and print the result."""
    print("\nTesting a simple model (mistral)...")
    payload = {
        "model": "mistral",
        "prompt": "Say hello in English",
        # Without this the server streams one JSON object per line and the
        # body cannot be parsed as a single JSON document.
        "stream": False,
        "options": {
            "temperature": 0.1
        }
    }

    start_time = time.time()
    try:
        response = requests.post(f"{base_url}/api/generate", json=payload, timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"✗ Error testing model: {str(e)}")
        return
    elapsed_time = time.time() - start_time

    if response.status_code != 200:
        print(f"✗ Error testing model: status {response.status_code}")
        print(f"  Body: {response.text[:200]}")
        return

    try:
        result = response.json()
    except ValueError as e:
        # Fallback for a server that streamed anyway: rebuild the full answer
        # from every line instead of showing only the first fragment.
        print(f"✗ JSON parsing error: {str(e)}")
        print("  Trying to process the response as streamed JSON lines...")
        answer = _ollama_join_stream(response.text)
        if answer is None:
            print("✗ Failed to parse the response as JSON lines")
            print(f"  Raw response (first 200 chars): {response.text[:200]}")
            return
        print(f"✓ Test successful with line-by-line parsing in {elapsed_time:.2f} seconds")
    else:
        answer = result.get("response", "No response") if isinstance(result, dict) else "No response"
        print(f"✓ Test successful in {elapsed_time:.2f} seconds")

    print(f"  Response: {str(answer)[:100]}...")


# Check connection to Ollama
def check_ollama(endpoint="http://217.182.105.173:11434"):
    """Print a diagnostic report about an Ollama server.

    Steps:
      1. ``GET {endpoint}/api/version`` to confirm the server is reachable.
      2. If that succeeds, ``GET /api/tags`` to list the available models.
      3. Then ``POST /api/generate`` with a short prompt for the ``mistral``
         model, requesting a single (non-streamed) JSON answer.
      4. Print troubleshooting instructions.

    Args:
        endpoint: Base URL of the Ollama server. A trailing slash is ignored.
            NOTE(review): the default is a fixed IP address; consider making
            it configurable.

    The function only prints; it returns None and reports network, HTTP and
    JSON errors as output lines rather than raising.
    """
    print("\n=== Checking Ollama connection ===")
    print(f"Endpoint: {endpoint}")

    # Avoid "//api/..." URLs when the caller passes a trailing slash.
    base_url = endpoint.rstrip("/")

    if _ollama_check_version(base_url):
        _ollama_list_models(base_url)
        _ollama_test_generation(base_url)

    print("\nINSTRUCTIONS FOR OLLAMA:")
    print("1. Verify that the Ollama server is running at the specified address")
    print("2. Verify that port 11434 is open and accessible")
    print("3. Check Ollama server logs for potential issues")

# Check Python environment
def check_python_env():
    """Print the interpreter version, the platform and package availability.

    Each required package is imported by its import name and reported as
    installed or not installed. Pillow is reported under the label
    "PIL (Pillow)" because its import name differs from its package name.
    The function only prints and returns None.
    """
    def report(import_name, label):
        # Try the import and print exactly one status line for it.
        try:
            __import__(import_name)
        except ImportError:
            print(f"✗ {label} is NOT installed")
        else:
            print(f"✓ {label} is installed")

    print("\n=== Checking Python environment ===")
    print(f"Python {sys.version}")
    print(f"Platform: {platform.platform()}")

    print("\nChecking required packages:")
    for module_name in ("PyQt6", "PyPDF2", "pytesseract", "requests", "fitz"):
        report(module_name, module_name)

    # Pillow: the package name is Pillow but the import name is PIL.
    report("PIL", "PIL (Pillow)")

    footer = (
        "\nINSTRUCTIONS FOR PYTHON ENVIRONMENT:",
        "1. Make sure you're using the virtual environment if configured",
        "2. Install missing packages with: pip install -r requirements.txt",
    )
    for line in footer:
        print(line)

# Main function
def main():
    """Run every component check in order, then print the closing guidance.

    The checks run in this order: Python environment, Tesseract OCR, and the
    Ollama connection (with its default endpoint).
    """
    print("=== Testing critical components ===")

    for run_check in (check_python_env, check_tesseract, check_ollama):
        run_check()

    closing_lines = (
        "\n=== Checks completed ===",
        "If issues were detected, follow the displayed instructions",
        "After fixing issues, run this script again to verify",
    )
    for line in closing_lines:
        print(line)

# Run the checks only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()