# Mirror of https://github.com/Ladebeze66/ragflow_preprocess.git
# Synced 2026-02-04 05:50:26 +01:00
# 234 lines, 9.9 KiB, Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
|
|
"""
|
|
Test script to verify critical components
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import platform
|
|
import requests
|
|
import time
|
|
import subprocess
|
|
import json
|
|
from typing import List, Dict, Any
|
|
|
|
# Check Tesseract OCR installation
|
|
def _run_tesseract(tesseract_cmd, flag):
    """Run ``<tesseract_cmd> <flag>`` and return its text output, or None on a non-zero exit."""
    result = subprocess.run([tesseract_cmd, flag], capture_output=True, text=True)
    if result.returncode != 0:
        return None
    # NOTE(review): some older Tesseract releases appear to write this
    # informational output to stderr instead of stdout -- fall back to it
    # rather than reporting nothing.
    return result.stdout.strip() or result.stderr.strip()


def _print_tesseract_version(tesseract_cmd):
    """Print the version reported by ``<tesseract_cmd> --version``."""
    try:
        output = _run_tesseract(tesseract_cmd, "--version")
    except (OSError, subprocess.SubprocessError) as e:
        print(f"Error checking Tesseract version: {e}")
        return
    if output is None:
        return

    tokens = output.split()
    # The banner starts with "tesseract <version>"; report the version
    # token rather than the program name.
    if len(tokens) >= 2 and tokens[0].lower().startswith("tesseract"):
        version = tokens[1]
    elif tokens:
        version = tokens[0]
    else:
        version = "unknown"
    print(f"Tesseract version: {version}")


def _print_tesseract_languages(tesseract_cmd):
    """Print the language packs known to the given Tesseract executable.

    Uses ``--list-langs`` first; if that yields nothing, falls back to
    listing ``*.traineddata`` files in the ``tessdata`` folder located next
    to the executable.
    """
    try:
        output = _run_tesseract(tesseract_cmd, "--list-langs")
        if output:
            # Skip the first line, which is a header rather than a language
            langs = output.split("\n")[1:]
        else:
            tessdata_folder = os.path.join(os.path.dirname(tesseract_cmd), "tessdata")
            if not os.path.isdir(tessdata_folder):
                return
            langs = [
                name.split(".")[0]
                for name in os.listdir(tessdata_folder)
                if name.endswith(".traineddata")
            ]
    except (OSError, subprocess.SubprocessError) as e:
        print(f"Error checking Tesseract languages: {e}")
        return
    print(f"Installed languages: {', '.join(langs)}")


def check_tesseract():
    """Report whether Tesseract OCR and its Python bindings are usable.

    Prints, in order: where the ``tesseract`` executable was found (PATH
    first, then, on Windows only, a list of common install locations), its
    version, its installed languages, and installation instructions.

    When the executable is found only in one of the Windows install
    locations, ``pytesseract.pytesseract.tesseract_cmd`` is set to that path.

    If ``pytesseract`` or Pillow cannot be imported, only an installation
    hint is printed and the remaining checks are skipped.

    Returns:
        None. All results are written to standard output.
    """
    print("\n=== Checking Tesseract OCR ===")

    # Keep the try body minimal: only the imports can legitimately fail here.
    try:
        import pytesseract
        from PIL import Image  # noqa: F401 -- imported only to confirm Pillow is available
    except ImportError as e:
        print(f"Error: {e}")
        print("Tesseract OCR or its Python dependencies are not properly installed")
        print("Install them with: pip install pytesseract Pillow")
        return

    import shutil  # local import: only this check needs it

    # Possible paths for Tesseract on Windows
    possible_paths = [
        r"C:\Program Files\Tesseract-OCR\tesseract.exe",
        r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
        r"C:\Tesseract-OCR\tesseract.exe",
        r"C:\Users\PCDEV\AppData\Local\Programs\Tesseract-OCR\tesseract.exe",
        r"C:\Users\PCDEV\Tesseract-OCR\tesseract.exe",
        # Same per-user locations, resolved for whoever runs the script
        os.path.expanduser(r"~\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"),
        os.path.expanduser(r"~\Tesseract-OCR\tesseract.exe"),
    ]

    # Check if Tesseract is in PATH
    tesseract_cmd = shutil.which("tesseract")
    if tesseract_cmd:
        print(f"Tesseract found in PATH: {tesseract_cmd}")
    elif platform.system() == "Windows":
        print("Tesseract is not in PATH. Searching in standard locations...")
        tesseract_cmd = next(
            (path for path in possible_paths if os.path.exists(path)), None
        )
        if tesseract_cmd:
            pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
            print(f"Tesseract found at: {tesseract_cmd}")

    if tesseract_cmd is None:
        # Say so explicitly instead of letting the sub-checks fail or stay silent
        print("Tesseract executable was not found")
    else:
        _print_tesseract_version(tesseract_cmd)
        _print_tesseract_languages(tesseract_cmd)

    print("\nINSTRUCTIONS FOR TESSERACT OCR:")
    print("1. If Tesseract is not installed, download it from:")
    print(" https://github.com/UB-Mannheim/tesseract/wiki")
    print("2. Make sure to install French (fra) and English (eng) languages")
    print("3. Check the 'Add to PATH' option during installation")
|
|
|
|
# Check connection to Ollama
|
|
def _list_ollama_models(endpoint):
    """Print the models returned by ``GET <endpoint>/api/tags``."""
    try:
        response = requests.get(f"{endpoint}/api/tags", timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"✗ Error retrieving models: {str(e)}")
        return

    if response.status_code != 200:
        print(f"✗ Error retrieving models: status {response.status_code}")
        return

    try:
        models = response.json().get("models", [])
    except ValueError as e:
        # ValueError is the common base of every JSON decode error that
        # requests can raise, whichever JSON backend it uses.
        print(f"✗ Error retrieving models: {str(e)}")
        return

    if not models:
        print("No models found on Ollama server")
        return

    print(f"✓ Available models ({len(models)}):")
    for model in models:
        print(f" - {model.get('name', 'Unknown')} ({model.get('size', 'Unknown size')})")


def _test_ollama_model(endpoint, model):
    """Send one short prompt to ``POST <endpoint>/api/generate`` and print the outcome."""
    print(f"\nTesting a simple model ({model})...")
    payload = {
        "model": model,
        "prompt": "Say hello in English",
        # Ask for a single JSON object; without this the endpoint streams
        # newline-delimited JSON that response.json() cannot decode.
        "stream": False,
        "options": {
            "temperature": 0.1,
        },
    }

    start_time = time.perf_counter()
    try:
        response = requests.post(f"{endpoint}/api/generate", json=payload, timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"✗ Error testing model: {str(e)}")
        return
    elapsed_time = time.perf_counter() - start_time

    if response.status_code != 200:
        print(f"✗ Error testing model: status {response.status_code}")
        print(f" Body: {response.text[:200]}")
        return

    try:
        result = response.json()
    except ValueError as e:
        # Kept as a safety net for servers that stream anyway: the first
        # line of a newline-delimited body is itself a JSON document.
        print(f"✗ JSON parsing error: {str(e)}")
        print(" Trying to process first line only...")
        first_line = response.text.strip().split("\n")[0]
        try:
            result = json.loads(first_line)
        except ValueError:
            print("✗ Failed to parse first line as JSON")
            print(f" Raw response (first 200 chars): {response.text[:200]}")
            return
        print(f"✓ Test successful with first line parsing in {elapsed_time:.2f} seconds")
    else:
        print(f"✓ Test successful in {elapsed_time:.2f} seconds")

    print(f" Response: {result.get('response', 'No response')[:100]}...")


def check_ollama(endpoint="http://217.182.105.173:11434", model="mistral"):
    """Check that an Ollama server is reachable and can answer a prompt.

    Queries ``/api/version``; when that returns HTTP 200, lists the models
    from ``/api/tags`` and sends a short test prompt to ``/api/generate``.
    Every outcome is printed, followed by troubleshooting instructions.

    Args:
        endpoint: Base URL of the Ollama server, without a trailing slash.
        model: Name of the model used for the generation test.

    Returns:
        None. All results are written to standard output.
    """
    print("\n=== Checking Ollama connection ===")
    print(f"Endpoint: {endpoint}")

    # Test basic connection
    try:
        response = requests.get(f"{endpoint}/api/version", timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"✗ Unable to connect to Ollama: {str(e)}")
    else:
        if response.status_code == 200:
            try:
                version = response.json().get("version", "unknown")
            except ValueError:
                # The server answered, so the connection itself is fine;
                # only the version string is unavailable.
                version = "unknown"
            print(f"✓ Connection to Ollama successful - Version: {version}")

            _list_ollama_models(endpoint)
            _test_ollama_model(endpoint, model)
        else:
            print(f"✗ Error connecting to Ollama: status {response.status_code}")
            print(f" Body: {response.text[:200]}")

    print("\nINSTRUCTIONS FOR OLLAMA:")
    print("1. Verify that the Ollama server is running at the specified address")
    print("2. Verify that port 11434 is open and accessible")
    print("3. Check Ollama server logs for potential issues")
|
|
|
|
# Check Python environment
|
|
def check_python_env():
    """Print the interpreter version, the platform, and which required packages import.

    Each package is probed by importing it; an ``ImportError`` marks it as
    not installed. Ends with instructions for fixing the environment.

    Returns:
        None. All results are written to standard output.
    """
    print("\n=== Checking Python environment ===")
    print(f"Python {sys.version}")
    print(f"Platform: {platform.platform()}")

    # (module to import, label to display) pairs, in reporting order.
    probes = [
        (name, name)
        for name in ("PyQt6", "PyPDF2", "pytesseract", "requests", "fitz")
    ]
    # Pillow is distributed as "Pillow" but imported as "PIL"
    probes.append(("PIL", "PIL (Pillow)"))

    print("\nChecking required packages:")
    for module_name, label in probes:
        try:
            __import__(module_name)
        except ImportError:
            status_line = f"✗ {label} is NOT installed"
        else:
            status_line = f"✓ {label} is installed"
        print(status_line)

    print("\nINSTRUCTIONS FOR PYTHON ENVIRONMENT:")
    print("1. Make sure you're using the virtual environment if configured")
    print("2. Install missing packages with: pip install -r requirements.txt")
|
|
|
|
# Main function
|
|
def main():
    """Run every component check in order, then print a closing summary."""
    print("=== Testing critical components ===")

    # Python environment first, then Tesseract OCR, then the Ollama connection
    for run_check in (check_python_env, check_tesseract, check_ollama):
        run_check()

    closing_lines = (
        "\n=== Checks completed ===",
        "If issues were detected, follow the displayed instructions",
        "After fixing issues, run this script again to verify",
    )
    for line in closing_lines:
        print(line)
|
|
|
|
# Run the checks only when executed as a script, not when imported.
if __name__ == "__main__":
    main()