Lemontropia-Suite/modules/ocr_backends/tesseract_backend.py

290 lines
9.5 KiB
Python

"""
Lemontropia Suite - Tesseract OCR Backend
Traditional OCR using Tesseract - stable, no ML dependencies.
"""
import numpy as np
import logging
from typing import List, Optional, Tuple
from pathlib import Path
import shutil
from . import BaseOCRBackend, OCRTextRegion
logger = logging.getLogger(__name__)
class TesseractBackend(BaseOCRBackend):
    """
    OCR backend using Tesseract OCR.

    Pros:
    - Very stable and mature
    - No PyTorch/TensorFlow dependencies
    - Fast on CPU
    - Works with Windows Store Python

    Cons:
    - Lower accuracy on game UI text than neural OCR
    - Requires Tesseract binary installation

    Installation:
    - Windows: choco install tesseract or download from UB Mannheim
    - Linux: sudo apt-get install tesseract-ocr
    - macOS: brew install tesseract
    - Python: pip install pytesseract
    """

    NAME = "tesseract"
    SUPPORTS_GPU = False  # Tesseract is CPU-only

    def __init__(self, use_gpu: bool = True, lang: str = 'en', **kwargs):
        """
        Store the Tesseract-specific settings.

        Args:
            use_gpu: Forwarded to the base class unchanged; this backend
                declares SUPPORTS_GPU = False.
            lang: Short language key, translated to a Tesseract language
                code through ``self.lang_map`` (unknown keys fall back to
                'eng' at recognition time).
            **kwargs: Forwarded to the base class. The keys read here are:
                tesseract_cmd: explicit path to the Tesseract executable.
                config: extra Tesseract CLI options (default '--psm 6').
                min_confidence: regions whose Tesseract confidence
                    (0-100 scale) is not strictly greater than this value
                    are dropped by ``extract_text`` (default 30).
        """
        super().__init__(use_gpu=use_gpu, lang=lang, **kwargs)
        self.tesseract_cmd = kwargs.get('tesseract_cmd', None)
        self._version = None

        # Language mapping for Tesseract
        self.lang_map = {
            'en': 'eng',
            'sv': 'swe',  # Swedish
            'de': 'deu',
            'fr': 'fra',
            'es': 'spa',
            'latin': 'eng+deu+fra+spa',  # Multi-language
        }

        # Tesseract configuration
        # PSM 6: assume a single uniform block of text
        self.config = kwargs.get('config', '--psm 6')
        self.min_confidence = kwargs.get('min_confidence', 30)

    def _tesseract_lang(self) -> str:
        """Return the Tesseract language code for ``self.lang`` ('eng' if unmapped)."""
        return self.lang_map.get(self.lang, 'eng')

    def _initialize(self) -> bool:
        """
        Initialize Tesseract OCR.

        Imports pytesseract, applies ``self.tesseract_cmd`` when given and
        verifies the installation by querying the Tesseract version. If the
        version query fails, the executable is searched for with
        ``find_tesseract()`` (PATH first, then common Windows locations)
        and the query is retried once.

        Returns:
            True when Tesseract is usable (``self._available`` is set),
            False otherwise (``self._error_msg`` holds the reason).
        """
        try:
            import pytesseract

            # Set custom path if provided
            if self.tesseract_cmd:
                pytesseract.pytesseract.tesseract_cmd = self.tesseract_cmd

            # Try to get version to verify installation
            try:
                version = pytesseract.get_tesseract_version()
                logger.info("Tesseract version: %s", version)
            except Exception:
                # The configured/default command did not work: look for the
                # binary ourselves before giving up.
                tesseract_path = self.find_tesseract()
                if not tesseract_path:
                    # Nothing found: surface the original failure unchanged.
                    raise
                pytesseract.pytesseract.tesseract_cmd = tesseract_path
                version = pytesseract.get_tesseract_version()
                logger.info(
                    "Tesseract found at: %s, version: %s",
                    tesseract_path, version,
                )

            self._version = str(version)
            self._available = True
            logger.info("Tesseract OCR initialized successfully")
            return True
        except ImportError:
            self._error_msg = "pytesseract not installed. Run: pip install pytesseract"
            logger.warning(self._error_msg)
            return False
        except Exception as e:
            self._error_msg = f"Tesseract not found: {e}. Please install Tesseract OCR."
            logger.warning(self._error_msg)
            logger.info("Download from: https://github.com/UB-Mannheim/tesseract/wiki")
            return False

    def extract_text(self, image: np.ndarray) -> List[OCRTextRegion]:
        """
        Extract text from image using Tesseract.

        The image is converted to grayscale, preprocessed (upscaled when
        small, thresholded, denoised) and passed to
        ``pytesseract.image_to_data``. Entries with empty text or a
        confidence not above ``self.min_confidence`` are dropped, and the
        remaining regions are passed through ``_merge_nearby_regions``.

        Args:
            image: Input image (BGR format from OpenCV)

        Returns:
            List of detected text regions with recognized text. Bounding
            boxes are (x, y, w, h) in the coordinate system of ``image``
            and confidences are normalized to 0-1. An empty list is
            returned when the backend is unavailable, the image is empty,
            or recognition fails.
        """
        if not self._available:
            logger.error("Tesseract backend not initialized")
            return []

        if image is None or getattr(image, 'size', 0) == 0:
            logger.warning("Tesseract received an empty image")
            return []

        try:
            import pytesseract

            # Preprocess image
            gray = self._to_grayscale(image)
            processed = self._preprocess_for_tesseract(gray)

            # Preprocessing may upscale small images; remember the factors
            # so boxes can be reported in input-image coordinates.
            src_h, src_w = image.shape[:2]
            proc_h, proc_w = processed.shape[:2]
            scale_x = proc_w / src_w
            scale_y = proc_h / src_h

            # Get data including bounding boxes
            data = pytesseract.image_to_data(
                processed,
                lang=self._tesseract_lang(),
                config=self.config,
                output_type=pytesseract.Output.DICT,
            )

            regions = []
            rows = zip(
                data['text'], data['conf'],
                data['left'], data['top'], data['width'], data['height'],
            )
            for raw_text, raw_conf, left, top, width, height in rows:
                text = str(raw_text).strip()
                if not text:
                    continue

                # The confidence may arrive as int, float or a numeric
                # string (possibly fractional); one unparsable row must not
                # discard the whole result.
                try:
                    conf = float(raw_conf)
                    box = (int(left), int(top), int(width), int(height))
                except (TypeError, ValueError):
                    continue

                # Filter low confidence
                if conf <= self.min_confidence:
                    continue

                regions.append(OCRTextRegion(
                    text=text,
                    confidence=conf / 100.0,  # Normalize to 0-1
                    bbox=self._rescale_bbox(box, scale_x, scale_y),
                    language=self.lang,
                ))

            # Merge overlapping regions that are likely the same text
            regions = self._merge_nearby_regions(regions)
            logger.debug("Tesseract detected %d text regions", len(regions))
            return regions
        except Exception as e:
            logger.error("Tesseract extraction failed: %s", e)
            return []

    @staticmethod
    def _rescale_bbox(bbox: Tuple[int, int, int, int],
                      scale_x: float,
                      scale_y: float) -> Tuple[int, int, int, int]:
        """Map an (x, y, w, h) box from a scaled image back to the unscaled one."""
        if scale_x == 1 and scale_y == 1:
            return bbox
        x, y, w, h = bbox
        return (
            int(round(x / scale_x)),
            int(round(y / scale_y)),
            max(1, int(round(w / scale_x))),  # never collapse a box to zero size
            max(1, int(round(h / scale_y))),
        )

    def _preprocess_for_tesseract(self, gray: np.ndarray,
                                  min_height: int = 100) -> np.ndarray:
        """
        Preprocess image specifically for Tesseract.

        Args:
            gray: Grayscale image to prepare.
            min_height: Images shorter than this many pixels are upscaled
                (aspect ratio preserved) so that they reach this height.

        Returns:
            The binarized (adaptive Gaussian threshold) and denoised image.
            Its size differs from ``gray`` when upscaling was applied.

        Raises:
            ValueError: If ``gray`` has a zero-sized dimension.
        """
        import cv2

        h, w = gray.shape[:2]
        if h == 0 or w == 0:
            raise ValueError("Cannot preprocess an empty image")

        # Resize small images (Tesseract works better with larger text)
        if h < min_height:
            scale = min_height / h
            gray = cv2.resize(gray, None, fx=scale, fy=scale,
                              interpolation=cv2.INTER_CUBIC)

        # Apply adaptive thresholding (block size 11, constant 2)
        processed = cv2.adaptiveThreshold(
            gray, 255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            11, 2,
        )

        # Denoise
        processed = cv2.fastNlMeansDenoising(processed, None, 10, 7, 21)
        return processed

    def _merge_nearby_regions(self, regions: List[OCRTextRegion],
                              max_distance: int = 10) -> List[OCRTextRegion]:
        """
        Merge text regions that are close to each other.

        Regions are sorted by (top, left) and folded left to right: the
        running region absorbs the next one when their top edges differ by
        less than ``max_distance`` pixels and their horizontal extents
        overlap. The merged region spans both boxes, joins the texts with
        a space and carries the mean of the two confidences.

        NOTE(review): regions separated by a horizontal gap do not satisfy
        the x-overlap test, so side-by-side words are not merged - confirm
        that this is the intended behavior.

        Args:
            regions: Regions to merge.
            max_distance: Maximum difference between top edges, in pixels.

        Returns:
            The merged regions, in (top, left) order.
        """
        if not regions:
            return []

        # Sort by y position
        sorted_regions = sorted(regions, key=lambda r: (r.bbox[1], r.bbox[0]))
        merged = []
        current = sorted_regions[0]

        for next_region in sorted_regions[1:]:
            cx, cy, cw, ch = current.bbox
            nx, ny, nw, nh = next_region.bbox

            # Vertical distance between the top edges
            distance = abs(ny - cy)
            x_overlap = not (cx + cw < nx or nx + nw < cx)

            if distance < max_distance and x_overlap:
                # Merge regions into the union of both boxes
                min_x = min(cx, nx)
                min_y = min(cy, ny)
                max_x = max(cx + cw, nx + nw)
                max_y = max(cy + ch, ny + nh)

                combined_text = current.text + " " + next_region.text
                avg_conf = (current.confidence + next_region.confidence) / 2
                current = OCRTextRegion(
                    text=combined_text.strip(),
                    confidence=avg_conf,
                    bbox=(min_x, min_y, max_x - min_x, max_y - min_y),
                    language=self.lang,
                )
            else:
                merged.append(current)
                current = next_region

        merged.append(current)
        return merged

    def extract_text_simple(self, image: np.ndarray) -> str:
        """
        Simple text extraction without region detection.

        Color images are converted to RGB before being handed to
        ``pytesseract.image_to_string``; 2-D (grayscale) arrays are passed
        through unchanged.

        Args:
            image: Input image (BGR/BGRA from OpenCV, or grayscale).

        Returns:
            All text found in image as single string. An empty string is
            returned when the backend is unavailable, the image is empty,
            or recognition fails.
        """
        if not self._available:
            return ""

        if image is None or getattr(image, 'size', 0) == 0:
            return ""

        try:
            import pytesseract
            import cv2

            # Convert to RGB if needed, choosing the conversion by the
            # number of channels actually present.
            if image.ndim == 3:
                channels = image.shape[2]
                if channels == 4:
                    image = cv2.cvtColor(image, cv2.COLOR_BGRA2RGB)
                elif channels == 1:
                    image = image[:, :, 0]
                else:
                    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            text = pytesseract.image_to_string(
                image,
                lang=self._tesseract_lang(),
                config=self.config,
            )
            return text.strip()
        except Exception as e:
            logger.error("Tesseract simple extraction failed: %s", e)
            return ""

    @staticmethod
    def find_tesseract() -> Optional[str]:
        """
        Find Tesseract installation path.

        Looks for ``tesseract`` on PATH first, then probes a list of
        common Windows install locations.

        Returns:
            Path to the executable, or None when it cannot be found.
        """
        import os

        path = shutil.which('tesseract')
        if path:
            return path

        # Common Windows paths
        common_paths = [
            r"C:\Program Files\Tesseract-OCR\tesseract.exe",
            r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
            r"C:\Users\%USERNAME%\AppData\Local\Tesseract-OCR\tesseract.exe",
            r"C:\Tesseract-OCR\tesseract.exe",
        ]
        for candidate in common_paths:
            # Resolve environment-variable references such as %USERNAME%
            expanded = os.path.expandvars(candidate)
            if Path(expanded).is_file():
                return expanded
        return None