mirror of
https://github.com/Ladebeze66/ragflow_preprocess.git
synced 2026-02-04 06:00:27 +01:00
86 lines
3.1 KiB
PowerShell
86 lines
3.1 KiB
PowerShell
#!/usr/bin/env pwsh

# Script to install additional languages for Tesseract OCR

# Check if the script is running as administrator.
# The current Windows identity is wrapped in a WindowsPrincipal so that its
# membership in the built-in Administrator role can be queried.
$currentPrincipal = [Security.Principal.WindowsPrincipal][Security.Principal.WindowsIdentity]::GetCurrent()
$isAdministrator = $currentPrincipal.IsInRole([Security.Principal.WindowsBuiltInRole]::Administrator)

if (-not $isAdministrator) {
    Write-Warning "Please run this script as administrator!"
    Write-Host "The script will close in 5 seconds..."
    # Leave the message on screen briefly before the window closes.
    Start-Sleep -Seconds 5
    # Non-zero exit code so callers can tell the script refused to run.
    exit 1
}

# Function to find Tesseract installation path
# Probes a fixed list of installation folders, in order, and returns the first
# one that contains tesseract.exe. Returns $null when none of them does.
function Find-TesseractPath {
    $candidateDirs = "C:\Program Files\Tesseract-OCR", "C:\Program Files (x86)\Tesseract-OCR", "C:\Tesseract-OCR"

    # Select-Object -First 1 stops the pipeline at the first matching folder.
    $firstHit = $candidateDirs |
        Where-Object { Test-Path "$_\tesseract.exe" } |
        Select-Object -First 1

    if ($null -ne $firstHit) {
        return $firstHit
    }

    return $null
}

# Find Tesseract path
$tesseractPath = Find-TesseractPath

# Abort when no installation was found: nothing below can work without it.
if ($null -eq $tesseractPath) {
    Write-Host "Tesseract OCR is not installed or was not found in standard locations." -ForegroundColor Red
    Write-Host "Please install Tesseract OCR first from: https://github.com/UB-Mannheim/tesseract/wiki" -ForegroundColor Yellow
    Write-Host "The script will close in 5 seconds..."
    # Leave the message on screen briefly before the window closes.
    Start-Sleep -Seconds 5
    # Non-zero exit code so callers can tell the installation did not happen.
    exit 1
}

Write-Host "Tesseract OCR found at: $tesseractPath" -ForegroundColor Green

# Check tessdata folder
# The language files are stored in the "tessdata" subfolder of the Tesseract
# installation; create that subfolder when it is missing.
$tessDataPath = Join-Path -Path $tesseractPath -ChildPath "tessdata"
$tessDataExists = Test-Path $tessDataPath

if (-not $tessDataExists) {
    Write-Host "The tessdata folder doesn't exist. Creating..." -ForegroundColor Yellow
    # Discard the DirectoryInfo object New-Item emits so it is not printed.
    $null = New-Item -Path $tessDataPath -ItemType Directory
}

# URLs of languages to download
# Keys are Tesseract language codes; values are the download URLs of the
# matching .traineddata files.
$languageUrls = @{}
$languageUrls.Add("fra", "https://github.com/tesseract-ocr/tessdata/raw/4.00/fra.traineddata")
$languageUrls.Add("fra_vert", "https://github.com/tesseract-ocr/tessdata/raw/4.00/fra_vert.traineddata")
$languageUrls.Add("frk", "https://github.com/tesseract-ocr/tessdata/raw/4.00/frk.traineddata")
$languageUrls.Add("frm", "https://github.com/tesseract-ocr/tessdata/raw/4.00/frm.traineddata")

# Download languages
# Each language is downloaded to a temporary file next to its final location
# and only moved into place after the download succeeded, so a failed download
# never destroys a language file that was already installed.

# Make sure TLS 1.2 is enabled for this session; older Windows PowerShell
# defaults may not include it, and the HTTPS downloads would then fail.
[Net.ServicePointManager]::SecurityProtocol = [Net.ServicePointManager]::SecurityProtocol -bor [Net.SecurityProtocolType]::Tls12

foreach ($lang in $languageUrls.Keys) {
    $url = $languageUrls[$lang]
    $outputFile = Join-Path -Path $tessDataPath -ChildPath "$lang.traineddata"
    $tempFile = "$outputFile.download"

    Write-Host "Downloading $lang.traineddata..." -ForegroundColor Cyan

    try {
        # -ErrorAction Stop turns every failure into a terminating error so the
        # catch block below runs and the success message is not printed.
        Invoke-WebRequest -Uri $url -OutFile $tempFile -ErrorAction Stop

        if (Test-Path $outputFile) {
            Write-Host "Language file $lang already exists. Replacing..." -ForegroundColor Yellow
        }

        # -Force overwrites a previously installed file with the fresh download.
        Move-Item -Path $tempFile -Destination $outputFile -Force -ErrorAction Stop
        Write-Host "Language $lang successfully installed." -ForegroundColor Green
    }
    catch {
        Write-Host "Error downloading $lang.traineddata: $_" -ForegroundColor Red
    }
    finally {
        # Remove a partial download left behind by a failure.
        if (Test-Path $tempFile) {
            Remove-Item -Path $tempFile -Force -ErrorAction SilentlyContinue
        }
    }
}

# Check installed languages
# Lists every *.traineddata file found in the tessdata folder, then checks that
# each requested language is among them before reporting the outcome.
Write-Host "`nVerifying installed languages:" -ForegroundColor Cyan
$installedLanguages = Get-ChildItem -Path $tessDataPath -Filter "*.traineddata" | ForEach-Object { $_.Name.Replace(".traineddata", "") }
Write-Host "Installed languages: $($installedLanguages -join ', ')" -ForegroundColor Green

# Requested languages whose file is not present in the tessdata folder.
# @(...) guarantees an array so .Count is valid for zero or one result.
$missingLanguages = @($languageUrls.Keys | Where-Object { $installedLanguages -notcontains $_ })

if ($missingLanguages.Count -gt 0) {
    Write-Host "`nLanguage installation completed with errors. Missing languages: $($missingLanguages -join ', ')" -ForegroundColor Red
}
else {
    Write-Host "`nLanguage installation completed." -ForegroundColor Green
}

Write-Host "Press any key to close..."
try {
    $null = $Host.UI.RawUI.ReadKey("NoEcho,IncludeKeyDown")
}
catch {
    # Some hosts do not implement ReadKey; skip the pause instead of failing
    # at the very end of the script.
}

# Report failure to the caller when a requested language is still missing.
if ($missingLanguages.Count -gt 0) {
    exit 1
}