EU-Utility/core/ocr_service.py

393 lines
13 KiB
Python

"""
EU-Utility - OCR Service Core Module
Screen capture and OCR functionality for all plugins.
Part of core - not a plugin. Plugins access via PluginAPI.
"""
import io
import base64
from typing import Dict, List, Tuple, Optional, Any
from dataclasses import dataclass
from pathlib import Path
try:
import numpy as np
NUMPY_AVAILABLE = True
except ImportError:
NUMPY_AVAILABLE = False
np = None
@dataclass
class OCRResult:
    """One recognised text fragment together with where it was found."""

    # The recognised text itself.
    text: str
    # Confidence score attached by the backend (Tesseract percentages are
    # divided by 100 before being stored here).
    confidence: float
    # Axis-aligned box as (x, y, width, height).
    bounding_box: Tuple[int, int, int, int]
    # Optional backend-specific payload (e.g. the original polygon).
    raw_data: Any = None
class OCRService:
    """
    Core OCR service with multiple backend support.

    Backends are probed in the order EasyOCR -> Tesseract -> PaddleOCR and
    the first one that loads is used for all subsequent calls.

    LAZY INITIALIZATION - no backend is imported until the service is
    first used (``is_available`` or ``recognize``).
    """

    def __init__(self):
        # Reader object for EasyOCR / PaddleOCR; stays None for Tesseract,
        # which is driven through module-level pytesseract functions.
        self._ocr_reader = None
        # One of 'easyocr', 'tesseract', 'paddle', or None when nothing loaded.
        self._backend = None
        self._initialized = False
        # Guard so that initialisation is not entered again while running.
        self._initializing = False

    # ------------------------------------------------------------------
    # Backend initialisation
    # ------------------------------------------------------------------
    def _init_backends(self):
        """Initialize available OCR backends (lazy - called on first use).

        Does nothing when a backend is already loaded or initialisation is
        in progress. When no backend loads, ``_initialized`` stays False,
        so the next call will probe again.
        """
        if self._initialized or self._initializing:
            return

        self._initializing = True
        print("[OCR] Initializing backends...")
        try:
            for probe in (self._try_easyocr, self._try_tesseract, self._try_paddle):
                if probe():
                    self._initialized = True
                    break
        finally:
            # Always clear the guard, even if a probe raised something that
            # is not an ``Exception`` (e.g. KeyboardInterrupt); otherwise
            # the service would refuse to initialise ever again.
            self._initializing = False

        if not self._initialized:
            print("[OCR] WARNING: No OCR backend available!")
            print("[OCR] Install one of: easyocr, pytesseract, paddleocr")

    def _try_easyocr(self) -> bool:
        """Try to load EasyOCR (preferred backend); return True on success."""
        try:
            import easyocr
            self._ocr_reader = easyocr.Reader(['en'], gpu=False, verbose=False)
        except ImportError:
            # Not installed: silently move on to the next backend.
            return False
        except Exception as e:
            print(f"[OCR] EasyOCR failed: {e}")
            return False
        self._backend = 'easyocr'
        print("[OCR] Using EasyOCR backend")
        return True

    def _try_tesseract(self) -> bool:
        """Try to load Tesseract via pytesseract; return True on success."""
        try:
            import pytesseract
            from PIL import Image  # noqa: F401 - only checks Pillow is importable
            # Raises when the tesseract binary itself cannot be found.
            pytesseract.get_tesseract_version()
        except Exception as e:
            print(f"[OCR] Tesseract failed: {e}")
            return False
        self._backend = 'tesseract'
        print("[OCR] Using Tesseract backend")
        return True

    def _try_paddle(self) -> bool:
        """Try to load PaddleOCR (last fallback); return True on success."""
        try:
            from paddleocr import PaddleOCR
            import os
            # Use minimal config to avoid model downloads on init.
            os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
        except Exception as e:
            print(f"[OCR] PaddleOCR failed: {e}")
            return False

        try:
            # ``use_gpu`` may not be accepted by every PaddleOCR version.
            reader = PaddleOCR(lang='en', show_log=False, use_gpu=False)
            banner = "[OCR] Using PaddleOCR backend"
        except TypeError:
            # Retry without ``use_gpu`` if the constructor rejected it.
            try:
                reader = PaddleOCR(lang='en', show_log=False)
            except Exception as e2:
                print(f"[OCR] PaddleOCR failed: {e2}")
                return False
            banner = "[OCR] Using PaddleOCR backend (no GPU)"
        except Exception as e:
            print(f"[OCR] PaddleOCR failed: {e}")
            return False

        self._ocr_reader = reader
        self._backend = 'paddle'
        print(banner)
        return True

    def is_available(self) -> bool:
        """Check if OCR is available (triggers lazy init when needed)."""
        if not self._initialized and not self._initializing:
            self._init_backends()
        return self._initialized

    # ------------------------------------------------------------------
    # Screen capture
    # ------------------------------------------------------------------
    def capture_screen(self, region: Optional[Tuple[int, int, int, int]] = None) -> 'Image.Image':
        """
        Capture screen or region using the ScreenshotService.

        Falls back to a direct ``pyautogui`` capture when the screenshot
        service cannot be imported or raises.

        Args:
            region: (x, y, width, height) or None for full screen

        Returns:
            The captured image, as returned by the screenshot service or
            ``pyautogui.screenshot``.

        Raises:
            RuntimeError: if the fallback is needed and pyautogui is not
                installed.
        """
        try:
            from core.screenshot import get_screenshot_service
            screenshot_service = get_screenshot_service()
            if region:
                x, y, width, height = region
                return screenshot_service.capture_region(x, y, width, height)
            return screenshot_service.capture(full_screen=True)
        except Exception as e:
            print(f"[OCR] Screenshot service failed, falling back: {e}")

        # Fallback to direct pyautogui capture
        try:
            import pyautogui
            if region:
                x, y, width, height = region
                return pyautogui.screenshot(region=(x, y, width, height))
            return pyautogui.screenshot()
        except ImportError as err:
            raise RuntimeError("pyautogui not installed. Run: pip install pyautogui") from err

    # ------------------------------------------------------------------
    # Public OCR entry points
    # ------------------------------------------------------------------
    def recognize(self, image=None, region: Optional[Tuple[int, int, int, int]] = None) -> Dict[str, Any]:
        """
        Perform OCR on image or screen region.

        Never raises for OCR/capture failures: any exception is reported
        through the ``'error'`` key of the returned dict instead.

        Args:
            image: PIL Image, numpy array, or None to capture screen
            region: Screen region to capture (only used if image is None)

        Returns:
            Dict with 'text', 'confidence', 'results', 'image_size'.
            On failure the dict additionally has an 'error' message, with
            empty text/results, confidence 0 and image_size None.
        """
        # Lazy initialization
        if not self._initialized and not self._initializing:
            self._init_backends()
        if not self._initialized:
            return self._error_result('OCR not initialized - no backend available')

        handlers = {
            'easyocr': self._ocr_easyocr,
            'tesseract': self._ocr_tesseract,
            'paddle': self._ocr_paddle,
        }
        try:
            # Capture if needed
            if image is None:
                image = self.capture_screen(region)
            handler = handlers.get(self._backend)
            if handler is None:
                return self._error_result('Unknown backend')
            return handler(image)
        except Exception as e:
            return self._error_result(str(e))

    def recognize_image(self, image) -> Dict[str, Any]:
        """
        Perform OCR on a PIL Image.

        Convenience alias for recognize(image=image).

        Args:
            image: PIL Image to OCR

        Returns:
            Dict with 'text', 'confidence', 'results', 'image_size'
        """
        return self.recognize(image=image)

    def recognize_region(self, x: int, y: int, width: int, height: int) -> Dict[str, Any]:
        """Convenience method for region OCR (captures the screen region)."""
        return self.recognize(region=(x, y, width, height))

    def find_text(self, target_text: str, image=None,
                  region: Optional[Tuple[int, int, int, int]] = None) -> List["OCRResult"]:
        """
        Find specific text in image (case-insensitive substring match).

        Args:
            target_text: Text to look for.
            image: Image to search, or None to capture the screen.
            region: Screen region to capture (only used if image is None).

        Returns:
            List of OCRResult whose text contains target_text; empty when
            nothing matches or OCR failed.
        """
        needle = target_text.lower()
        outcome = self.recognize(image, region)
        return [r for r in outcome.get('results', []) if needle in r.text.lower()]

    def get_text_at_position(self, x: int, y: int, image=None) -> Optional[str]:
        """
        Get text in a small 100x20 box centred on (x, y).

        Args:
            x, y: Point of interest. Screen coordinates when ``image`` is
                None, otherwise coordinates inside ``image``.
            image: Optional image to read from instead of the screen.

        Returns:
            The recognised text, or None when nothing was recognised.
        """
        # Small region around point
        region = (x - 50, y - 10, 100, 20)
        if image is not None:
            # ``recognize`` only applies ``region`` when it captures the
            # screen itself, so a supplied image must be cropped here or
            # the position would be ignored.
            image = self._crop_image(image, region)
        result = self.recognize(image, region)
        return result.get('text') or None

    # ------------------------------------------------------------------
    # Backend implementations
    # ------------------------------------------------------------------
    def _ocr_easyocr(self, image) -> Dict[str, Any]:
        """OCR using EasyOCR."""
        detections = self._ocr_reader.readtext(self._as_array(image))
        parsed = [
            OCRResult(
                text=text,
                confidence=conf,
                bounding_box=self._polygon_to_box(bbox),
                raw_data={'bbox': bbox},
            )
            for bbox, text, conf in detections
        ]
        return self._build_result(' '.join(r.text for r in parsed), parsed, image)

    def _ocr_tesseract(self, image) -> Dict[str, Any]:
        """OCR using Tesseract."""
        import pytesseract

        # Get full text
        text = pytesseract.image_to_string(image).strip()
        # Get detailed per-word data
        data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

        parsed = []
        for i, word in enumerate(data['text']):
            if not word.strip():
                continue
            confidence = self._tesseract_confidence(data['conf'][i])
            if confidence is None:
                # Non-positive or unparsable confidence: not a usable word.
                continue
            parsed.append(OCRResult(
                text=word,
                confidence=confidence,
                bounding_box=(
                    data['left'][i],
                    data['top'][i],
                    data['width'][i],
                    data['height'][i],
                ),
                raw_data={'block_num': data['block_num'][i]},
            ))
        return self._build_result(text, parsed, image)

    def _ocr_paddle(self, image) -> Dict[str, Any]:
        """OCR using PaddleOCR."""
        raw = self._ocr_reader.ocr(self._as_array(image), cls=True)

        parsed = []
        # The detections for the image are read from the first element.
        if raw and raw[0]:
            for bbox, (text, conf) in raw[0]:
                parsed.append(OCRResult(
                    text=text,
                    confidence=conf,
                    bounding_box=self._polygon_to_box(bbox),
                    raw_data={'bbox': bbox},
                ))
        return self._build_result(' '.join(r.text for r in parsed), parsed, image)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _error_result(message: str) -> Dict[str, Any]:
        """Build the failure dict, with the same keys as a success dict."""
        return {
            'text': '',
            'confidence': 0,
            'error': message,
            'results': [],
            'image_size': None,
        }

    @staticmethod
    def _build_result(text: str, parsed: list, image) -> Dict[str, Any]:
        """Assemble the success dict; confidence is the mean over ``parsed``."""
        avg_confidence = sum(r.confidence for r in parsed) / len(parsed) if parsed else 0
        return {
            'text': text,
            'confidence': avg_confidence,
            'results': parsed,
            'image_size': image.size if hasattr(image, 'size') else None,
        }

    @staticmethod
    def _as_array(image):
        """Convert a PIL image to a numpy array; pass anything else through."""
        if hasattr(image, 'convert'):
            import numpy as np
            return np.array(image)
        return image

    @staticmethod
    def _polygon_to_box(polygon) -> Tuple[int, int, int, int]:
        """Return the (x, y, width, height) box enclosing polygon points."""
        xs = [point[0] for point in polygon]
        ys = [point[1] for point in polygon]
        x_min, y_min = min(xs), min(ys)
        return (int(x_min), int(y_min), int(max(xs) - x_min), int(max(ys) - y_min))

    @staticmethod
    def _tesseract_confidence(raw) -> Optional[float]:
        """Convert a Tesseract confidence cell to a 0-1 fraction.

        The cell may be an int, a float, or a string such as ``'96.000000'``
        or ``'-1'``. Returns None when the value is missing, unparsable or
        not positive.
        """
        try:
            value = float(raw)
        except (TypeError, ValueError):
            return None
        if value <= 0:
            return None
        return value / 100.0

    @staticmethod
    def _crop_image(image, region: Tuple[int, int, int, int]):
        """Crop ``image`` to ``region`` = (x, y, width, height).

        Uses ``crop`` when the object has it (PIL images) and 2-D slicing
        when it exposes ``shape`` (numpy arrays). Anything else is returned
        unchanged. Coordinates are clamped so the box never starts before
        the image origin and never has negative extent.
        """
        x, y, width, height = region
        left, top = max(x, 0), max(y, 0)
        right, bottom = max(x + width, left), max(y + height, top)
        if hasattr(image, 'crop'):
            return image.crop((left, top, right, bottom))
        if hasattr(image, 'shape'):
            return image[top:bottom, left:right]
        return image
# Process-wide OCRService, created on first request
_ocr_service = None


def get_ocr_service() -> OCRService:
    """Return the shared OCRService, creating it on the first call."""
    global _ocr_service
    if _ocr_service is not None:
        return _ocr_service
    _ocr_service = OCRService()
    return _ocr_service
# One-call helper: capture the screen and return just the recognised text
def quick_ocr(region: Tuple[int, int, int, int] = None) -> str:
    """
    Capture the screen (or a region of it) and return the recognised text.

    Args:
        region: (x, y, width, height) to capture, or None for full screen.

    Returns:
        The 'text' entry of the shared service's result ('' if absent).

    Usage:
        text = quick_ocr()  # Full screen
        text = quick_ocr((100, 100, 200, 50))  # Region
    """
    outcome = get_ocr_service().recognize(region=region)
    return outcome.get('text', '')