210 lines
6.9 KiB
Python
210 lines
6.9 KiB
Python
"""
EU-Utility - OCR Backend Manager

Manages OCR backend installation and configuration.
Checks the Windows registry for installed software and provides
auto-configuration.
"""
|
|
|
|
import sys
|
|
import subprocess
|
|
from pathlib import Path
|
|
from typing import Optional, Dict, List, Tuple
|
|
|
|
# Platform flag: True only on native Windows. The registry module
# (winreg) exists only there, so it is imported behind the flag.
WINDOWS = sys.platform == 'win32'
if WINDOWS:
    import winreg
|
|
|
|
|
|
class OCRBackendManager:
    """Detect, configure and install the OCR backends used by EU-Utility.

    Three backends are tracked: ``easyocr``, ``tesseract`` and ``paddleocr``.
    ``self.backends`` maps each name to a status dict with the keys:

    * ``installed`` - the backend's Python package could be imported
      (``pytesseract`` in the case of Tesseract),
    * ``available`` - for EasyOCR/PaddleOCR the package imported; for
      Tesseract the ``tesseract`` binary was located,
    * ``path`` - location of the Tesseract binary as a string, otherwise
      ``None``.

    All backends are scanned once when the manager is constructed.
    """

    # Same test as the module-level WINDOWS flag, evaluated here so the
    # class body itself decides whether ``winreg`` may be touched.
    _IS_WINDOWS = sys.platform == 'win32'

    # (hive, sub-key) pairs that may carry Tesseract's ``InstallDir`` value.
    # ``winreg`` only exists on Windows: referencing it unconditionally made
    # the class definition (and therefore the module import) fail with
    # NameError on every other platform.
    if _IS_WINDOWS:
        TESSERACT_REGISTRY_PATHS = [
            (winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Tesseract-OCR"),
            (winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\WOW6432Node\Tesseract-OCR"),
            (winreg.HKEY_CURRENT_USER, r"SOFTWARE\Tesseract-OCR"),
        ]
    else:
        TESSERACT_REGISTRY_PATHS = []

    # Directories probed for ``tesseract.exe`` when PATH and registry fail.
    TESSERACT_DEFAULT_PATHS = [
        Path(r"C:\Program Files\Tesseract-OCR"),
        Path(r"C:\Program Files (x86)\Tesseract-OCR"),
        Path.home() / "Tesseract-OCR",
    ]

    def __init__(self):
        """Create the status table and run an initial backend scan."""
        self.backends = {
            'easyocr': {'installed': False, 'available': False, 'path': None},
            'tesseract': {'installed': False, 'available': False, 'path': None},
            'paddleocr': {'installed': False, 'available': False, 'path': None},
        }
        self._scan_backends()

    def _scan_backends(self):
        """Scan for installed OCR backends and update ``self.backends``."""
        self._check_easyocr()
        self._check_tesseract()
        self._check_paddleocr()

    def _check_easyocr(self):
        """Mark EasyOCR installed and available if its package imports."""
        try:
            import easyocr  # noqa: F401 - imported only to probe presence
        except ImportError:
            return
        status = self.backends['easyocr']
        status['installed'] = True
        status['available'] = True

    def _check_tesseract(self):
        """Check for the ``pytesseract`` package and the Tesseract binary.

        ``installed`` reflects the Python package, ``available``/``path``
        reflect the binary. When both are present, ``pytesseract`` is
        pointed at the binary that was found.
        """
        status = self.backends['tesseract']

        # Import once and keep the module so it can be configured below.
        try:
            import pytesseract
        except ImportError:
            pytesseract = None
        else:
            status['installed'] = True

        tesseract_exe = self._find_tesseract_binary()
        if tesseract_exe is None:
            return

        status['path'] = str(tesseract_exe)
        status['available'] = True
        if pytesseract is not None:
            pytesseract.pytesseract.tesseract_cmd = str(tesseract_exe)

    def _find_tesseract_binary(self) -> Optional[Path]:
        """Locate the Tesseract executable.

        Search order: the system PATH, then (on Windows) the registry keys
        in ``TESSERACT_REGISTRY_PATHS``, then ``TESSERACT_DEFAULT_PATHS``.

        Returns:
            ``Path('tesseract')`` when the command runs from PATH, the full
            path to ``tesseract.exe`` when found elsewhere, or ``None``.
        """
        # Only the exit status matters, so the output is discarded instead
        # of being captured and decoded.
        try:
            result = subprocess.run(
                ['tesseract', '--version'],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                timeout=5,
            )
        except (OSError, subprocess.SubprocessError):
            # Not on PATH, not executable, or timed out: fall through.
            pass
        else:
            if result.returncode == 0:
                return Path('tesseract')

        if self._IS_WINDOWS:
            for hkey, reg_path in self.TESSERACT_REGISTRY_PATHS:
                try:
                    with winreg.OpenKey(hkey, reg_path) as key:
                        install_path, _ = winreg.QueryValueEx(key, 'InstallDir')
                except OSError:
                    # Key or value is missing / unreadable: try the next one.
                    continue
                # Ignore empty or non-string registry values.
                if not install_path or not isinstance(install_path, str):
                    continue
                exe_path = Path(install_path) / 'tesseract.exe'
                if exe_path.exists():
                    return exe_path

        for path in self.TESSERACT_DEFAULT_PATHS:
            exe_path = path / 'tesseract.exe'
            if exe_path.exists():
                return exe_path

        return None

    def _check_paddleocr(self):
        """Mark PaddleOCR installed and available if its package imports."""
        try:
            from paddleocr import PaddleOCR  # noqa: F401 - presence probe
        except ImportError:
            return
        status = self.backends['paddleocr']
        status['installed'] = True
        status['available'] = True

    def get_backend_status(self, backend: str) -> Dict:
        """Return a copy of one backend's status dict.

        Args:
            backend: Backend name, e.g. ``'tesseract'``.

        Returns:
            A new dict with the backend's status, or ``{}`` if the name is
            unknown.
        """
        return dict(self.backends.get(backend, {}))

    def get_all_status(self) -> Dict:
        """Return a copy of the status of all backends.

        The per-backend dicts are copied too, so changing the result never
        alters the manager's own state.
        """
        return {name: dict(info) for name, info in self.backends.items()}

    def get_best_available(self) -> Optional[str]:
        """Return the preferred available backend name, or ``None``.

        Preference order is ``easyocr``, ``tesseract``, ``paddleocr``.
        """
        priority = ('easyocr', 'tesseract', 'paddleocr')
        return next(
            (name for name in priority if self.backends[name]['available']),
            None,
        )

    def install_backend(self, backend: str) -> Tuple[bool, str]:
        """Install a backend's Python package via pip.

        Args:
            backend: One of ``'easyocr'``, ``'tesseract'`` (installs
                ``pytesseract``) or ``'paddleocr'``.

        Returns:
            ``(success, message)``. Never raises: any failure is reported
            through the message.
        """
        packages = {
            'easyocr': 'easyocr',
            'tesseract': 'pytesseract',
            'paddleocr': 'paddleocr',
        }

        if backend not in packages:
            return False, f"Unknown backend: {backend}"

        try:
            result = subprocess.run(
                [sys.executable, '-m', 'pip', 'install', packages[backend]],
                capture_output=True,
                text=True,
                timeout=300,
            )
            if result.returncode != 0:
                return False, f"Failed to install {backend}: {result.stderr}"

            # Packages installed while the interpreter is running may be
            # missed by the import system's cached directory listings.
            import importlib
            importlib.invalidate_caches()

            self._scan_backends()  # Re-scan after install
            return True, f"Successfully installed {backend}"
        except Exception as e:
            # Boundary for UI callers: report the problem instead of raising.
            return False, f"Error installing {backend}: {e}"

    def get_tesseract_install_info(self) -> str:
        """Return user-facing instructions for installing Tesseract."""
        return (
            "Tesseract is not installed or not in PATH.\n"
            "\n"
            "To install Tesseract:\n"
            "\n"
            "1. Download installer from:\n"
            "https://github.com/UB-Mannheim/tesseract/wiki\n"
            "\n"
            "2. Run the installer and note the installation path\n"
            "(usually C:\\Program Files\\Tesseract-OCR)\n"
            "\n"
            "3. Add to PATH or use the auto-detect feature\n"
            "\n"
            "4. Restart EU-Utility\n"
            "\n"
            "Alternative: Use EasyOCR instead (pip install easyocr)\n"
        )

    def auto_configure_tesseract(self) -> bool:
        """Point ``pytesseract`` at a Tesseract binary found off PATH.

        Only acts on Windows, and only when the binary was located through
        the registry or a default install directory (a binary already on
        PATH needs no configuration).

        Returns:
            ``True`` if ``pytesseract`` was configured and the backend
            status updated, ``False`` otherwise.
        """
        if not self._IS_WINDOWS:
            return False

        exe_path = self._find_tesseract_binary()
        if exe_path is None or exe_path == Path('tesseract'):
            return False

        try:
            import pytesseract
        except ImportError:
            # Without the Python wrapper there is nothing to configure.
            return False

        pytesseract.pytesseract.tesseract_cmd = str(exe_path)
        status = self.backends['tesseract']
        status['available'] = True
        status['path'] = str(exe_path)
        return True
|
|
|
|
|
|
# Module-wide manager instance, created lazily on first request
_ocr_manager = None


def get_ocr_backend_manager() -> OCRBackendManager:
    """Return the shared OCRBackendManager, creating it on first use."""
    global _ocr_manager
    if _ocr_manager is not None:
        return _ocr_manager
    _ocr_manager = OCRBackendManager()
    return _ocr_manager
|