"""
Lemontropia Suite - Tesseract OCR Backend

Traditional OCR using Tesseract - stable, no ML dependencies.
"""
|
|
|
|
import numpy as np
|
|
import logging
|
|
from typing import List, Optional, Tuple
|
|
from pathlib import Path
|
|
import shutil
|
|
|
|
from . import BaseOCRBackend, OCRTextRegion
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class TesseractBackend(BaseOCRBackend):
|
|
"""
|
|
OCR backend using Tesseract OCR.
|
|
|
|
Pros:
|
|
- Very stable and mature
|
|
- No PyTorch/TensorFlow dependencies
|
|
- Fast on CPU
|
|
- Works with Windows Store Python
|
|
|
|
Cons:
|
|
- Lower accuracy on game UI text than neural OCR
|
|
- Requires Tesseract binary installation
|
|
|
|
Installation:
|
|
- Windows: choco install tesseract or download from UB Mannheim
|
|
- Linux: sudo apt-get install tesseract-ocr
|
|
- macOS: brew install tesseract
|
|
- Python: pip install pytesseract
|
|
"""
|
|
|
|
NAME = "tesseract"
|
|
SUPPORTS_GPU = False # Tesseract is CPU-only
|
|
|
|
def __init__(self, use_gpu: bool = True, lang: str = 'en', **kwargs):
|
|
super().__init__(use_gpu=use_gpu, lang=lang, **kwargs)
|
|
|
|
self.tesseract_cmd = kwargs.get('tesseract_cmd', None)
|
|
self._version = None
|
|
|
|
# Language mapping for Tesseract
|
|
self.lang_map = {
|
|
'en': 'eng',
|
|
'sv': 'swe', # Swedish
|
|
'de': 'deu',
|
|
'fr': 'fra',
|
|
'es': 'spa',
|
|
'latin': 'eng+deu+fra+spa', # Multi-language
|
|
}
|
|
|
|
# Tesseract configuration
|
|
self.config = kwargs.get('config', '--psm 6') # Assume single uniform block of text
|
|
|
|
def _initialize(self) -> bool:
|
|
"""Initialize Tesseract OCR."""
|
|
try:
|
|
import pytesseract
|
|
|
|
# Set custom path if provided
|
|
if self.tesseract_cmd:
|
|
pytesseract.pytesseract.tesseract_cmd = self.tesseract_cmd
|
|
|
|
# Try to get version to verify installation
|
|
try:
|
|
version = pytesseract.get_tesseract_version()
|
|
self._version = str(version)
|
|
logger.info(f"Tesseract version: {version}")
|
|
except Exception as e:
|
|
# Try to find tesseract in PATH
|
|
tesseract_path = shutil.which('tesseract')
|
|
if tesseract_path:
|
|
pytesseract.pytesseract.tesseract_cmd = tesseract_path
|
|
version = pytesseract.get_tesseract_version()
|
|
self._version = str(version)
|
|
logger.info(f"Tesseract found at: {tesseract_path}, version: {version}")
|
|
else:
|
|
raise e
|
|
|
|
self._available = True
|
|
logger.info("Tesseract OCR initialized successfully")
|
|
return True
|
|
|
|
except ImportError:
|
|
self._error_msg = "pytesseract not installed. Run: pip install pytesseract"
|
|
logger.warning(self._error_msg)
|
|
return False
|
|
|
|
except Exception as e:
|
|
self._error_msg = f"Tesseract not found: {e}. Please install Tesseract OCR."
|
|
logger.warning(self._error_msg)
|
|
logger.info("Download from: https://github.com/UB-Mannheim/tesseract/wiki")
|
|
return False
|
|
|
|
def extract_text(self, image: np.ndarray) -> List[OCRTextRegion]:
|
|
"""
|
|
Extract text from image using Tesseract.
|
|
|
|
Uses a two-step approach:
|
|
1. Detect text regions using OpenCV contours
|
|
2. Run Tesseract on each region
|
|
|
|
Args:
|
|
image: Input image (BGR format from OpenCV)
|
|
|
|
Returns:
|
|
List of detected text regions with recognized text
|
|
"""
|
|
if not self._available:
|
|
logger.error("Tesseract backend not initialized")
|
|
return []
|
|
|
|
try:
|
|
import pytesseract
|
|
import cv2
|
|
|
|
# Preprocess image
|
|
gray = self._to_grayscale(image)
|
|
processed = self._preprocess_for_tesseract(gray)
|
|
|
|
# Get data including bounding boxes
|
|
tesseract_lang = self.lang_map.get(self.lang, 'eng')
|
|
|
|
data = pytesseract.image_to_data(
|
|
processed,
|
|
lang=tesseract_lang,
|
|
config=self.config,
|
|
output_type=pytesseract.Output.DICT
|
|
)
|
|
|
|
regions = []
|
|
n_boxes = len(data['text'])
|
|
|
|
for i in range(n_boxes):
|
|
text = data['text'][i].strip()
|
|
conf = int(data['conf'][i])
|
|
|
|
# Filter low confidence and empty text
|
|
if conf > 30 and text:
|
|
x = data['left'][i]
|
|
y = data['top'][i]
|
|
w = data['width'][i]
|
|
h = data['height'][i]
|
|
|
|
regions.append(OCRTextRegion(
|
|
text=text,
|
|
confidence=conf / 100.0, # Normalize to 0-1
|
|
bbox=(x, y, w, h),
|
|
language=self.lang
|
|
))
|
|
|
|
# Merge overlapping regions that are likely the same text
|
|
regions = self._merge_nearby_regions(regions)
|
|
|
|
logger.debug(f"Tesseract detected {len(regions)} text regions")
|
|
return regions
|
|
|
|
except Exception as e:
|
|
logger.error(f"Tesseract extraction failed: {e}")
|
|
return []
|
|
|
|
def _preprocess_for_tesseract(self, gray: np.ndarray) -> np.ndarray:
|
|
"""Preprocess image specifically for Tesseract."""
|
|
import cv2
|
|
|
|
# Resize small images (Tesseract works better with larger text)
|
|
h, w = gray.shape[:2]
|
|
min_height = 100
|
|
if h < min_height:
|
|
scale = min_height / h
|
|
gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
|
|
|
|
# Apply adaptive thresholding
|
|
processed = cv2.adaptiveThreshold(
|
|
gray, 255,
|
|
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
|
cv2.THRESH_BINARY,
|
|
11, 2
|
|
)
|
|
|
|
# Denoise
|
|
processed = cv2.fastNlMeansDenoising(processed, None, 10, 7, 21)
|
|
|
|
return processed
|
|
|
|
def _merge_nearby_regions(self, regions: List[OCRTextRegion],
|
|
max_distance: int = 10) -> List[OCRTextRegion]:
|
|
"""Merge text regions that are close to each other."""
|
|
if not regions:
|
|
return []
|
|
|
|
# Sort by y position
|
|
sorted_regions = sorted(regions, key=lambda r: (r.bbox[1], r.bbox[0]))
|
|
|
|
merged = []
|
|
current = sorted_regions[0]
|
|
|
|
for next_region in sorted_regions[1:]:
|
|
# Check if regions are close enough to merge
|
|
cx, cy, cw, ch = current.bbox
|
|
nx, ny, nw, nh = next_region.bbox
|
|
|
|
# Calculate distance
|
|
distance = abs(ny - cy)
|
|
x_overlap = not (cx + cw < nx or nx + nw < cx)
|
|
|
|
if distance < max_distance and x_overlap:
|
|
# Merge regions
|
|
min_x = min(cx, nx)
|
|
min_y = min(cy, ny)
|
|
max_x = max(cx + cw, nx + nw)
|
|
max_y = max(cy + ch, ny + nh)
|
|
|
|
# Combine text
|
|
combined_text = current.text + " " + next_region.text
|
|
avg_conf = (current.confidence + next_region.confidence) / 2
|
|
|
|
current = OCRTextRegion(
|
|
text=combined_text.strip(),
|
|
confidence=avg_conf,
|
|
bbox=(min_x, min_y, max_x - min_x, max_y - min_y),
|
|
language=self.lang
|
|
)
|
|
else:
|
|
merged.append(current)
|
|
current = next_region
|
|
|
|
merged.append(current)
|
|
return merged
|
|
|
|
def extract_text_simple(self, image: np.ndarray) -> str:
|
|
"""
|
|
Simple text extraction without region detection.
|
|
|
|
Returns:
|
|
All text found in image as single string
|
|
"""
|
|
if not self._available:
|
|
return ""
|
|
|
|
try:
|
|
import pytesseract
|
|
import cv2
|
|
|
|
# Convert to RGB if needed
|
|
if len(image.shape) == 3:
|
|
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
|
|
|
tesseract_lang = self.lang_map.get(self.lang, 'eng')
|
|
|
|
text = pytesseract.image_to_string(
|
|
image,
|
|
lang=tesseract_lang,
|
|
config=self.config
|
|
)
|
|
|
|
return text.strip()
|
|
|
|
except Exception as e:
|
|
logger.error(f"Tesseract simple extraction failed: {e}")
|
|
return ""
|
|
|
|
@staticmethod
|
|
def find_tesseract() -> Optional[str]:
|
|
"""Find Tesseract installation path."""
|
|
path = shutil.which('tesseract')
|
|
if path:
|
|
return path
|
|
|
|
# Common Windows paths
|
|
common_paths = [
|
|
r"C:\Program Files\Tesseract-OCR\tesseract.exe",
|
|
r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
|
|
r"C:\Users\%USERNAME%\AppData\Local\Tesseract-OCR\tesseract.exe",
|
|
r"C:\Tesseract-OCR\tesseract.exe",
|
|
]
|
|
|
|
import os
|
|
for p in common_paths:
|
|
expanded = os.path.expandvars(p)
|
|
if Path(expanded).exists():
|
|
return expanded
|
|
|
|
return None
|