# Lemontropia-Suite/modules/game_vision_ai.py
"""
Lemontropia Suite - Game Vision AI Module
Advanced computer vision with multiple OCR backends and GPU acceleration.
OCR Backends (in priority order):
1. OpenCV EAST - Fastest, no dependencies (primary fallback)
2. EasyOCR - Good accuracy, lighter than PaddleOCR
3. Tesseract OCR - Traditional, stable
4. PaddleOCR - Best accuracy (requires working PyTorch)
Handles PyTorch DLL errors on Windows Store Python gracefully.
"""
import cv2
import numpy as np
import logging
import time
from pathlib import Path
from dataclasses import dataclass, field
from typing import Optional, Tuple, List, Dict, Any, Union
from enum import Enum
import json
import hashlib
logger = logging.getLogger(__name__)
# Import hardware detection
from .hardware_detection import (
HardwareDetector, HardwareInfo, GPUBackend,
recommend_ocr_backend, get_hardware_info
)
# Import OCR backends
from .ocr_backends import (
BaseOCRBackend, OCRTextRegion, OCRBackendInfo,
OCRBackendFactory
)
@dataclass
class TextRegion:
    """One piece of recognized text, with its location and provenance."""
    text: str
    confidence: float
    bbox: Tuple[int, int, int, int]  # x, y, w, h
    language: str = "en"
    backend: str = "unknown"  # name of the OCR backend that produced this region

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict (for JSON output / logging)."""
        return dict(
            text=self.text,
            confidence=self.confidence,
            bbox=self.bbox,
            language=self.language,
            backend=self.backend,
        )

    @classmethod
    def from_ocr_region(cls, region: OCRTextRegion, backend: str = "unknown"):
        """Adapt a backend-level OCRTextRegion into a TextRegion."""
        return cls(
            region.text,
            region.confidence,
            region.bbox,
            region.language,
            backend,
        )
@dataclass
class IconRegion:
    """A candidate item-icon crop plus its location and a perceptual hash."""
    image: np.ndarray
    bbox: Tuple[int, int, int, int]  # x, y, w, h
    confidence: float
    icon_hash: str = ""  # filled lazily from the image if not supplied

    def __post_init__(self):
        # Only compute the hash when the caller did not provide one.
        if not self.icon_hash:
            self.icon_hash = self._compute_hash()

    def _compute_hash(self) -> str:
        """Return a 16x16 average-hash of the icon as a '0'/'1' bitstring."""
        if self.image is None or self.image.size == 0:
            return ""
        # Normalize to a small grayscale patch, then threshold on the mean.
        small = cv2.resize(self.image, (16, 16), interpolation=cv2.INTER_AREA)
        if len(small.shape) == 3:
            small = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
        bits = (small > small.mean()).flatten()
        return ''.join('1' if bit else '0' for bit in bits)
@dataclass
class ItemMatch:
    """Outcome of looking up a detected icon against the item database."""
    name: str                        # matched item name
    confidence: float                # match score from the lookup
    item_id: Optional[str] = None    # database id, when known
    category: Optional[str] = None   # item category, when known
    matched_hash: str = ""           # perceptual hash that produced the match
@dataclass
class VisionResult:
    """Aggregate output of one full screenshot-processing pass."""
    text_regions: List[TextRegion] = field(default_factory=list)
    icon_regions: List[IconRegion] = field(default_factory=list)
    processing_time_ms: float = 0.0
    gpu_backend: str = "cpu"
    ocr_backend: str = "unknown"
    timestamp: float = field(default_factory=time.time)

    def to_dict(self) -> Dict[str, Any]:
        """Summarize as a plain dict; icons are reported by count only."""
        return dict(
            text_regions=[region.to_dict() for region in self.text_regions],
            icon_count=len(self.icon_regions),
            processing_time_ms=self.processing_time_ms,
            gpu_backend=self.gpu_backend,
            ocr_backend=self.ocr_backend,
            timestamp=self.timestamp,
        )
class GPUDetector:
    """Thin facade over HardwareDetector for GPU-related queries."""

    @staticmethod
    def detect_backend() -> GPUBackend:
        """Return the best available GPU backend per hardware detection."""
        return HardwareDetector.detect_all().gpu_backend

    @staticmethod
    def get_gpu_info() -> Dict[str, Any]:
        """Return the full detected-hardware report as a dict."""
        return HardwareDetector.detect_all().to_dict()
class UnifiedOCRProcessor:
    """
    Unified OCR processor with multiple backend support.

    Automatically selects the best available backend based on:
    1. Hardware capabilities
    2. PyTorch DLL compatibility
    3. User preferences

    Gracefully falls through backends if one fails.
    """

    SUPPORTED_LANGUAGES = ['en', 'sv', 'latin', 'de', 'fr', 'es']

    # Default priority (can be overridden via ``backend_priority``)
    DEFAULT_PRIORITY = [
        'paddleocr',    # Best accuracy if available
        'easyocr',      # Good balance
        'tesseract',    # Stable fallback
        'opencv_east',  # Fastest, always works
    ]

    def __init__(self, use_gpu: bool = True, lang: str = 'en',
                 backend_priority: Optional[List[str]] = None,
                 auto_select: bool = True):
        """
        Initialize Unified OCR Processor.

        Args:
            use_gpu: Enable GPU acceleration if available
            lang: Language for OCR ('en', 'sv', 'latin', etc.);
                unsupported values silently fall back to 'en'
            backend_priority: Custom backend priority order
            auto_select: Automatically select best backend
        """
        self.use_gpu = use_gpu
        self.lang = lang if lang in self.SUPPORTED_LANGUAGES else 'en'
        self.backend_priority = backend_priority or self.DEFAULT_PRIORITY
        self._backend: Optional["BaseOCRBackend"] = None
        self._backend_name: str = "unknown"
        self._hardware_info: "HardwareInfo" = HardwareDetector.detect_all()
        if auto_select:
            self._auto_select_backend()
        logger.info(f"UnifiedOCR initialized with backend: {self._backend_name}")

    def _activate_backend(self, name: str) -> bool:
        """Create backend *name* and make it current if it is usable.

        Shared by auto-selection, the ultimate fallback and set_backend().
        Returns True on success; on failure the current backend is untouched.
        """
        backend = OCRBackendFactory.create_backend(
            name,
            use_gpu=self.use_gpu,
            lang=self.lang
        )
        if backend is not None and backend.is_available():
            self._backend = backend
            self._backend_name = name
            return True
        return False

    def _auto_select_backend(self):
        """Automatically select the best available backend."""
        # PyTorch DLL errors (seen on Windows Store Python) make the
        # torch-based backends unusable - skip them entirely in that case.
        if self._hardware_info.pytorch_dll_error:
            logger.warning(
                "PyTorch DLL error detected - avoiding PyTorch-based backends"
            )
            safe_backends = [
                b for b in self.backend_priority
                if b not in ['paddleocr', 'easyocr']
            ]
        else:
            safe_backends = self.backend_priority

        for name in safe_backends:
            if self._activate_backend(name):
                logger.info(f"Selected OCR backend: {name}")
                return

        # Ultimate fallback - OpenCV EAST has no heavy dependencies.
        logger.warning("All preferred backends failed, trying OpenCV EAST...")
        if self._activate_backend('opencv_east'):
            logger.info("Using OpenCV EAST as ultimate fallback")
        else:
            logger.error("CRITICAL: No OCR backend available!")

    def set_backend(self, name: str) -> bool:
        """
        Manually set OCR backend.

        Args:
            name: Backend name ('paddleocr', 'easyocr', 'tesseract', 'opencv_east')
        Returns:
            True if successful
        """
        if self._activate_backend(name):
            logger.info(f"Switched to OCR backend: {name}")
            return True
        logger.error(f"Failed to switch to OCR backend: {name}")
        return False

    def extract_text(self, image: Union[str, np.ndarray, Path]) -> List["TextRegion"]:
        """
        Extract text from image using selected backend.

        Args:
            image: Image path or numpy array (BGR, as loaded by cv2.imread)
        Returns:
            List of detected text regions; empty list on any failure
        """
        if isinstance(image, (str, Path)):
            img = cv2.imread(str(image))
            if img is None:
                logger.error(f"Failed to load image: {image}")
                return []
        else:
            # Copy so the backend cannot mutate the caller's array.
            img = image.copy()
        if self._backend is None:
            logger.error("No OCR backend available")
            return []
        try:
            ocr_regions = self._backend.extract_text(img)
            # Tag each region with the backend that produced it.
            regions = [
                TextRegion.from_ocr_region(r, self._backend_name)
                for r in ocr_regions
            ]
            logger.debug(f"Extracted {len(regions)} text regions using {self._backend_name}")
            return regions
        except Exception as e:
            logger.error(f"OCR extraction failed: {e}")
            return []

    def extract_text_from_region(self, image: np.ndarray,
                                 region: Tuple[int, int, int, int]) -> List["TextRegion"]:
        """Extract text from a specific (x, y, w, h) region of the image."""
        x, y, w, h = region
        roi = image[y:y+h, x:x+w]
        if roi.size == 0:
            return []
        regions = self.extract_text(roi)
        # Shift bboxes from ROI-local back to full-image coordinates.
        for r in regions:
            rx, ry, rw, rh = r.bbox
            r.bbox = (x + rx, y + ry, rw, rh)
        return regions

    def get_available_backends(self) -> List["OCRBackendInfo"]:
        """Get information about all available backends."""
        return OCRBackendFactory.check_all_backends(self.use_gpu, self.lang)

    def get_current_backend(self) -> str:
        """Get name of current backend."""
        return self._backend_name

    def get_backend_info(self) -> Dict[str, Any]:
        """Get information about current backend."""
        if self._backend:
            return self._backend.get_info().to_dict()
        return {"error": "No backend initialized"}

    def is_recognition_supported(self) -> bool:
        """
        Check if current backend supports text recognition.
        Note: OpenCV EAST only detects text regions, doesn't recognize text.
        """
        return self._backend_name not in ['opencv_east']
# Legacy class for backward compatibility
class OCRProcessor(UnifiedOCRProcessor):
    """Legacy OCR processor - now wraps UnifiedOCRProcessor."""
class IconDetector:
    """Detect and extract item icons from game UI screenshots."""

    # Typical Entropia Universe loot window icon sizes (width, height) in px
    ICON_SIZES = {
        'small': (32, 32),
        'medium': (48, 48),
        'large': (64, 64),
        'hud': (40, 40)
    }

    def __init__(self, template_dir: Optional[Path] = None):
        """
        Args:
            template_dir: Directory of PNG icon templates; defaults to
                ``<module dir>/templates/icons``.
        """
        self.template_dir = template_dir or Path(__file__).parent / "templates" / "icons"
        self.templates: Dict[str, np.ndarray] = {}
        self._load_templates()

    def _load_templates(self):
        """Load icon templates for matching (one per PNG, keyed by file stem)."""
        if not self.template_dir.exists():
            logger.warning(f"Template directory not found: {self.template_dir}")
            return
        for template_file in self.template_dir.glob("*.png"):
            try:
                name = template_file.stem
                template = cv2.imread(str(template_file), cv2.IMREAD_COLOR)
                if template is not None:
                    self.templates[name] = template
                    logger.debug(f"Loaded icon template: {name}")
            except Exception as e:
                logger.error(f"Failed to load template {template_file}: {e}")

    def detect_loot_window(self, image: np.ndarray) -> Optional[Tuple[int, int, int, int]]:
        """
        Detect the loot window in a screenshot.

        Tries template matching first, then falls back to a heuristic that
        looks for a cluster of icon-sized bright squares.

        Returns:
            (x, y, w, h) of the window, or None if nothing was found.
        """
        # Method 1: direct template match, if a template is available
        if 'loot_window' in self.templates:
            result = cv2.matchTemplate(
                image, self.templates['loot_window'], cv2.TM_CCOEFF_NORMED
            )
            _, max_val, _, max_loc = cv2.minMaxLoc(result)
            if max_val > 0.7:  # empirical match-confidence cutoff
                h, w = self.templates['loot_window'].shape[:2]
                return (*max_loc, w, h)
        # Method 2: detect based on typical loot window characteristics
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        _, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
        contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        # Keep near-square contours within 5 px of any known icon size.
        potential_icons = []
        for cnt in contours:
            x, y, w, h = cv2.boundingRect(cnt)
            aspect = w / h if h > 0 else 0
            for sw, sh in self.ICON_SIZES.values():
                if abs(w - sw) < 5 and abs(h - sh) < 5 and 0.8 < aspect < 1.2:
                    potential_icons.append((x, y, w, h))
                    break
        # Two or more icon-like squares => assume they form the loot grid and
        # return their padded bounding box.
        # NOTE(review): max(ws)/max(hs) approximates the extent of the
        # right/bottom-most icon; exact only when icons share one size.
        if len(potential_icons) >= 2:
            xs = [p[0] for p in potential_icons]
            ys = [p[1] for p in potential_icons]
            ws = [p[2] for p in potential_icons]
            hs = [p[3] for p in potential_icons]
            min_x, max_x = min(xs), max(xs) + max(ws)
            min_y, max_y = min(ys), max(ys) + max(hs)
            padding = 20
            return (
                max(0, min_x - padding),
                max(0, min_y - padding),
                max_x - min_x + padding * 2,
                max_y - min_y + padding * 2
            )
        return None

    def extract_icons_from_region(self, image: np.ndarray,
                                  region: Tuple[int, int, int, int],
                                  icon_size: str = 'medium') -> List["IconRegion"]:
        """
        Extract icon candidates from a region of the image.

        Args:
            image: Full BGR screenshot.
            region: (x, y, w, h) region to scan.
            icon_size: Key into ICON_SIZES; unknown keys default to (48, 48).

        Returns:
            De-duplicated IconRegions, bboxes in full-image coordinates.
        """
        x, y, w, h = region
        roi = image[y:y+h, x:x+w]
        if roi.size == 0:
            return []
        target_size = self.ICON_SIZES.get(icon_size, (48, 48))
        gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
        icons = []
        # Try progressively lower binarization cutoffs. The second tuple
        # element is cv2.threshold's maxval (value assigned above the cutoff),
        # not an upper bound.
        thresholds = [(200, 255), (180, 255), (150, 255)]
        for cutoff, maxval in thresholds:
            _, thresh = cv2.threshold(gray, cutoff, maxval, cv2.THRESH_BINARY)
            contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            for cnt in contours:
                cx, cy, cw, ch = cv2.boundingRect(cnt)
                aspect = cw / ch if ch > 0 else 0
                # Accept near-square contours within 8 px of the target size.
                if (abs(cw - target_size[0]) < 8 and
                        abs(ch - target_size[1]) < 8 and
                        0.7 < aspect < 1.3):
                    icon_img = roi[cy:cy+ch, cx:cx+cw]
                    icon_img = cv2.resize(icon_img, target_size, interpolation=cv2.INTER_AREA)
                    icons.append(IconRegion(
                        image=icon_img,
                        bbox=(x + cx, y + cy, cw, ch),
                        confidence=0.8
                    ))
        # Multiple thresholds find the same icon repeatedly - collapse by IoU.
        return self._remove_duplicate_icons(icons)

    def _remove_duplicate_icons(self, icons: List["IconRegion"],
                                iou_threshold: float = 0.5) -> List["IconRegion"]:
        """Greedy non-max suppression: keep highest-confidence icon per spot."""
        if not icons:
            return []
        sorted_icons = sorted(icons, key=lambda x: x.confidence, reverse=True)
        kept = []
        for icon in sorted_icons:
            is_duplicate = False
            for kept_icon in kept:
                if self._calculate_iou(icon.bbox, kept_icon.bbox) > iou_threshold:
                    is_duplicate = True
                    break
            if not is_duplicate:
                kept.append(icon)
        return kept

    def _calculate_iou(self, box1: Tuple[int, int, int, int],
                       box2: Tuple[int, int, int, int]) -> float:
        """Intersection over Union of two (x, y, w, h) boxes; 0 when disjoint."""
        x1, y1, w1, h1 = box1
        x2, y2, w2, h2 = box2
        xi1 = max(x1, x2)
        yi1 = max(y1, y2)
        xi2 = min(x1 + w1, x2 + w2)
        yi2 = min(y1 + h1, y2 + h2)
        inter_area = max(0, xi2 - xi1) * max(0, yi2 - yi1)
        box1_area = w1 * h1
        box2_area = w2 * h2
        union_area = box1_area + box2_area - inter_area
        return inter_area / union_area if union_area > 0 else 0
class GameVisionAI:
    """
    Main AI vision interface for game screenshot analysis.
    Combines OCR and icon detection with multiple backend support.
    """

    def __init__(self, use_gpu: bool = True, ocr_lang: str = 'en',
                 ocr_backend: Optional[str] = None,
                 data_dir: Optional[Path] = None):
        """
        Initialize Game Vision AI.

        Args:
            use_gpu: Enable GPU acceleration if available
            ocr_lang: Language for OCR
            ocr_backend: Specific OCR backend to use (None for auto)
            data_dir: Directory for storing extracted data
        """
        self.use_gpu = use_gpu
        self.data_dir = data_dir or Path.home() / ".lemontropia"
        self.extracted_icons_dir = self.data_dir / "extracted_icons"
        self.extracted_icons_dir.mkdir(parents=True, exist_ok=True)
        # Detect hardware once and cache the result.
        self.hardware_info = HardwareDetector.detect_all()
        self.backend = self.hardware_info.gpu_backend
        # Initialize OCR processor; skip auto-select when a backend is forced.
        self.ocr = UnifiedOCRProcessor(
            use_gpu=use_gpu,
            lang=ocr_lang,
            auto_select=(ocr_backend is None)
        )
        if ocr_backend:
            self.ocr.set_backend(ocr_backend)
        self.icon_detector = IconDetector()
        logger.info(f"GameVisionAI initialized (GPU: {self.backend.value}, "
                    f"OCR: {self.ocr.get_current_backend()})")

    def extract_text_from_image(self, image_path: Union[str, Path]) -> List["TextRegion"]:
        """Extract all text from an image."""
        return self.ocr.extract_text(image_path)

    def _extract_icons(self, image: np.ndarray,
                       auto_detect_window: bool = True) -> List["IconRegion"]:
        """Icon extraction on an already-loaded BGR image (shared helper).

        Falls back to a full-image scan when no loot window is detected or
        when window detection is disabled.
        """
        if auto_detect_window:
            window_region = self.icon_detector.detect_loot_window(image)
            if window_region:
                logger.debug(f"Detected loot window: {window_region}")
                return self.icon_detector.extract_icons_from_region(
                    image, window_region
                )
            logger.debug("No loot window detected, scanning full image")
        h, w = image.shape[:2]
        return self.icon_detector.extract_icons_from_region(
            image, (0, 0, w, h)
        )

    def extract_icons_from_image(self, image_path: Union[str, Path],
                                 auto_detect_window: bool = True) -> List["IconRegion"]:
        """Extract item icons from an image file."""
        image = cv2.imread(str(image_path))
        if image is None:
            logger.error(f"Failed to load image: {image_path}")
            return []
        return self._extract_icons(image, auto_detect_window)

    def process_screenshot(self, image_path: Union[str, Path],
                           extract_text: bool = True,
                           extract_icons: bool = True) -> "VisionResult":
        """
        Process screenshot with all vision capabilities.

        Args:
            image_path: Path to screenshot
            extract_text: Enable text extraction
            extract_icons: Enable icon extraction
        Returns:
            VisionResult with all detections (empty result if load fails)
        """
        start_time = time.time()
        result = VisionResult(
            gpu_backend=self.backend.value,
            ocr_backend=self.ocr.get_current_backend()
        )
        # Load the image once and share it between the text and icon passes
        # (previously the icon pass re-read the file from disk).
        image = cv2.imread(str(image_path))
        if image is None:
            logger.error(f"Failed to load image: {image_path}")
            return result
        if extract_text:
            result.text_regions = self.ocr.extract_text(image)
            logger.debug(f"Extracted {len(result.text_regions)} text regions")
        if extract_icons:
            result.icon_regions = self._extract_icons(image)
            logger.debug(f"Extracted {len(result.icon_regions)} icons")
            # Persist icon crops for later database matching.
            self._save_extracted_icons(result.icon_regions)
        result.processing_time_ms = (time.time() - start_time) * 1000
        return result

    def _save_extracted_icons(self, icons: List["IconRegion"]):
        """Save extracted icon crops to disk, named by hash + timestamp."""
        for i, icon in enumerate(icons):
            filename = f"icon_{icon.icon_hash[:16]}_{int(time.time())}_{i}.png"
            filepath = self.extracted_icons_dir / filename
            cv2.imwrite(str(filepath), icon.image)
            logger.debug(f"Saved icon: {filepath}")

    def get_gpu_info(self) -> Dict[str, Any]:
        """Get GPU information."""
        return self.hardware_info.to_dict()

    def is_gpu_available(self) -> bool:
        """Check if GPU acceleration is available."""
        return self.backend != GPUBackend.CPU

    def get_ocr_backends(self) -> List[Dict[str, Any]]:
        """Get information about all available OCR backends."""
        backends = self.ocr.get_available_backends()
        return [b.to_dict() for b in backends]

    def switch_ocr_backend(self, name: str) -> bool:
        """Switch to a different OCR backend."""
        return self.ocr.set_backend(name)

    def calibrate_for_game(self, sample_screenshots: List[Path]) -> Dict[str, Any]:
        """Calibrate vision system using sample screenshots.

        Returns aggregate counts and the average per-screenshot time in ms;
        screenshots that raise are logged and skipped.
        """
        calibration = {
            'screenshots_processed': 0,
            'text_regions_detected': 0,
            'icons_detected': 0,
            'average_processing_time_ms': 0,
            'detected_regions': {},
            'ocr_backend': self.ocr.get_current_backend(),
            'gpu_backend': self.backend.value,
        }
        total_time = 0
        for screenshot_path in sample_screenshots:
            try:
                start = time.time()
                result = self.process_screenshot(screenshot_path)
                elapsed = (time.time() - start) * 1000
                calibration['screenshots_processed'] += 1
                calibration['text_regions_detected'] += len(result.text_regions)
                calibration['icons_detected'] += len(result.icon_regions)
                total_time += elapsed
            except Exception as e:
                logger.error(f"Failed to process {screenshot_path}: {e}")
        if calibration['screenshots_processed'] > 0:
            calibration['average_processing_time_ms'] = (
                total_time / calibration['screenshots_processed']
            )
        return calibration

    @staticmethod
    def diagnose() -> Dict[str, Any]:
        """Run full diagnostic on vision system."""
        return {
            'hardware': HardwareDetector.detect_all().to_dict(),
            'ocr_backends': [
                b.to_dict() for b in
                OCRBackendFactory.check_all_backends()
            ],
            'recommendations': {
                'ocr_backend': HardwareDetector.recommend_ocr_backend(),
                'gpu': GPUDetector.detect_backend().value,
            }
        }
# Export main classes
# Names re-exported via `from ... import *`; keep in sync with definitions above.
__all__ = [
    'GameVisionAI',
    'UnifiedOCRProcessor',
    'OCRProcessor',  # Legacy
    'TextRegion',
    'IconRegion',
    'ItemMatch',
    'VisionResult',
    'GPUBackend',
    'GPUDetector',
    'IconDetector',
    'HardwareDetector',
    'OCRBackendFactory',
    'BaseOCRBackend',
]