EU-Utility/core/ocr_backend_manager.py

210 lines
6.9 KiB
Python

"""
EU-Utility - OCR Backend Manager
Manages OCR backend installation and configuration.
Checks registry for installed software and provides auto-configuration.
"""
import sys
import subprocess
from pathlib import Path
from typing import Optional, Dict, List, Tuple
# Windows registry checking: ``winreg`` only exists on Windows, so it is
# imported conditionally and the outcome is recorded in WINDOWS.
if sys.platform == 'win32':
    import winreg
    WINDOWS = True
else:
    WINDOWS = False


class OCRBackendManager:
    """Manages OCR backend detection and installation.

    The state of each known backend is kept in ``self.backends``, a dict
    keyed by backend name (``'easyocr'``, ``'tesseract'``, ``'paddleocr'``)
    whose values are dicts with three entries:

    * ``'installed'`` - the backend's Python package could be imported.
    * ``'available'`` - set by the individual ``_check_*`` methods; for
      Tesseract it is set when the ``tesseract`` binary was located.
    * ``'path'`` - string path of the Tesseract binary, otherwise ``None``.
    """

    # Registry keys probed for an ``InstallDir`` value. The list is only
    # populated on Windows because ``winreg`` is not imported elsewhere;
    # referencing it unconditionally would raise NameError at import time.
    if WINDOWS:
        TESSERACT_REGISTRY_PATHS = [
            (winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Tesseract-OCR"),
            (winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\WOW6432Node\Tesseract-OCR"),
            (winreg.HKEY_CURRENT_USER, r"SOFTWARE\Tesseract-OCR"),
        ]
    else:
        TESSERACT_REGISTRY_PATHS = []

    # Directories checked for ``tesseract.exe`` when the registry has no hit.
    TESSERACT_DEFAULT_PATHS = [
        Path(r"C:\Program Files\Tesseract-OCR"),
        Path(r"C:\Program Files (x86)\Tesseract-OCR"),
        Path.home() / "Tesseract-OCR",
    ]

    def __init__(self):
        """Create the status table and immediately scan for backends."""
        self.backends = {
            'easyocr': {'installed': False, 'available': False, 'path': None},
            'tesseract': {'installed': False, 'available': False, 'path': None},
            'paddleocr': {'installed': False, 'available': False, 'path': None},
        }
        self._scan_backends()

    def _scan_backends(self) -> None:
        """Scan for installed OCR backends and update ``self.backends``."""
        self._check_easyocr()
        self._check_tesseract()
        self._check_paddleocr()

    def _check_easyocr(self) -> None:
        """Check if the EasyOCR Python package is installed."""
        try:
            import easyocr  # noqa: F401 - imported only to probe availability
        except ImportError:
            return
        status = self.backends['easyocr']
        status['installed'] = True
        status['available'] = True

    def _check_tesseract(self) -> None:
        """Check if Tesseract is installed (Python package and binary).

        ``'installed'`` reflects whether ``pytesseract`` imports;
        ``'available'`` and ``'path'`` reflect whether the binary was found.
        When both are present, ``pytesseract`` is pointed at the binary.
        """
        status = self.backends['tesseract']

        # Import once and keep the module so it can be configured below.
        try:
            import pytesseract
        except ImportError:
            pytesseract = None
        else:
            status['installed'] = True

        tesseract_exe = self._find_tesseract_binary()
        if tesseract_exe is None:
            return

        status['path'] = str(tesseract_exe)
        status['available'] = True
        if pytesseract is not None:
            pytesseract.pytesseract.tesseract_cmd = str(tesseract_exe)

    def _find_tesseract_binary(self) -> Optional[Path]:
        """Find the Tesseract binary via PATH, the registry and common paths.

        Returns:
            ``Path('tesseract')`` when ``tesseract --version`` succeeds (the
            binary is on PATH), the full path to ``tesseract.exe`` when it is
            found through the Windows registry or a default install
            directory, or ``None`` when nothing was found.
        """
        # Check if already in PATH. The output is not needed, so it is
        # captured as bytes to avoid any decoding failure.
        try:
            result = subprocess.run(
                ['tesseract', '--version'],
                capture_output=True,
                timeout=5,
            )
        except (OSError, subprocess.SubprocessError):
            # Not on PATH, not executable, or the probe timed out.
            pass
        else:
            if result.returncode == 0:
                return Path('tesseract')

        if not WINDOWS:
            return None

        # Check Windows Registry
        for hkey, reg_path in self.TESSERACT_REGISTRY_PATHS:
            try:
                with winreg.OpenKey(hkey, reg_path) as key:
                    install_path, _ = winreg.QueryValueEx(key, 'InstallDir')
                if install_path:
                    exe_path = Path(install_path) / 'tesseract.exe'
                    if exe_path.exists():
                        return exe_path
            except (OSError, TypeError, ValueError):
                # Missing key/value, or a value that is not a usable path.
                continue

        # Check default installation paths
        for path in self.TESSERACT_DEFAULT_PATHS:
            exe_path = path / 'tesseract.exe'
            if exe_path.exists():
                return exe_path

        return None

    def _check_paddleocr(self) -> None:
        """Check if the PaddleOCR Python package is installed."""
        try:
            from paddleocr import PaddleOCR  # noqa: F401 - availability probe
        except ImportError:
            return
        status = self.backends['paddleocr']
        status['installed'] = True
        status['available'] = True

    def get_backend_status(self, backend: str) -> Dict:
        """Get a copy of the status dict of a specific backend.

        Returns an empty dict when ``backend`` is not a known backend name.
        """
        return self.backends.get(backend, {}).copy()

    def get_all_status(self) -> Dict:
        """Get the status of all backends.

        Every per-backend dict is copied as well, so mutating the returned
        structure cannot alter the manager's internal state.
        """
        return {name: dict(status) for name, status in self.backends.items()}

    def get_best_available(self) -> Optional[str]:
        """Get the best available backend.

        Backends are tried in the order easyocr, tesseract, paddleocr; the
        first one flagged ``'available'`` is returned, or ``None`` if none is.
        """
        priority = ['easyocr', 'tesseract', 'paddleocr']
        for backend in priority:
            if self.backends[backend]['available']:
                return backend
        return None

    def install_backend(self, backend: str) -> Tuple[bool, str]:
        """Install a backend's Python package via pip.

        Args:
            backend: One of ``'easyocr'``, ``'tesseract'`` (installs
                ``pytesseract``) or ``'paddleocr'``.

        Returns:
            ``(success, message)``. Failures, including an unknown backend
            name, a non-zero pip exit code or any raised exception, are
            reported through the tuple rather than raised.
        """
        packages = {
            'easyocr': 'easyocr',
            'tesseract': 'pytesseract',
            'paddleocr': 'paddleocr',
        }
        if backend not in packages:
            return False, f"Unknown backend: {backend}"

        try:
            result = subprocess.run(
                [sys.executable, '-m', 'pip', 'install', packages[backend]],
                capture_output=True,
                text=True,
                timeout=300,
            )
            if result.returncode != 0:
                return False, f"Failed to install {backend}: {result.stderr}"

            # Packages installed while the interpreter is running may be
            # invisible to the import system until its caches are dropped.
            import importlib
            importlib.invalidate_caches()

            self._scan_backends()  # Re-scan after install
            return True, f"Successfully installed {backend}"
        except Exception as e:
            return False, f"Error installing {backend}: {e}"

    def get_tesseract_install_info(self) -> str:
        """Get instructions for installing Tesseract."""
        return """Tesseract is not installed or not in PATH.
To install Tesseract:
1. Download installer from:
https://github.com/UB-Mannheim/tesseract/wiki
2. Run the installer and note the installation path
(usually C:\\Program Files\\Tesseract-OCR)
3. Add to PATH or use the auto-detect feature
4. Restart EU-Utility
Alternative: Use EasyOCR instead (pip install easyocr)
"""

    def auto_configure_tesseract(self) -> bool:
        """Auto-configure Tesseract from registry/paths.

        Returns:
            ``True`` when a concrete ``tesseract.exe`` path was found and
            ``pytesseract`` was pointed at it. ``False`` on non-Windows
            platforms, when no binary was found, when the binary is only
            reachable through PATH, or when ``pytesseract`` is not installed.
        """
        if not WINDOWS:
            return False

        exe_path = self._find_tesseract_binary()
        if exe_path is None or exe_path == Path('tesseract'):
            return False

        try:
            import pytesseract
        except ImportError:
            return False

        pytesseract.pytesseract.tesseract_cmd = str(exe_path)
        status = self.backends['tesseract']
        status['available'] = True
        status['path'] = str(exe_path)
        return True
# Singleton instance; stays None until get_ocr_backend_manager() is first called.
_ocr_manager = None


def get_ocr_backend_manager() -> OCRBackendManager:
    """Get the OCR backend manager singleton.

    The manager is constructed on the first call and stored in the
    module-level ``_ocr_manager``; later calls return that same instance.
    """
    global _ocr_manager
    if _ocr_manager is None:
        _ocr_manager = OCRBackendManager()
    return _ocr_manager