"""
Lemontropia Suite - Tesseract OCR Backend

Traditional OCR using Tesseract - stable, no ML dependencies.
"""
|
|
|
|
import numpy as np
|
|
import logging
|
|
from typing import List, Optional, Tuple
|
|
from pathlib import Path
|
|
import shutil
|
|
|
|
from . import BaseOCRBackend, OCRTextRegion
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class TesseractBackend(BaseOCRBackend):
|
|
"""
|
|
OCR backend using Tesseract OCR.
|
|
|
|
Pros:
|
|
- Very stable and mature
|
|
- No PyTorch/TensorFlow dependencies
|
|
- Fast on CPU
|
|
- Works with Windows Store Python
|
|
|
|
Cons:
|
|
- Lower accuracy on game UI text than neural OCR
|
|
- Requires Tesseract binary installation
|
|
|
|
Installation:
|
|
- Windows: choco install tesseract or download from UB Mannheim
|
|
- Linux: sudo apt-get install tesseract-ocr
|
|
- macOS: brew install tesseract
|
|
- Python: pip install pytesseract
|
|
"""
|
|
|
|
NAME = "tesseract"
|
|
SUPPORTS_GPU = False # Tesseract is CPU-only
|
|
|
|
def __init__(self, use_gpu: bool = True, lang: str = 'en', **kwargs):
|
|
super().__init__(use_gpu=use_gpu, lang=lang, **kwargs)
|
|
|
|
self.tesseract_cmd = kwargs.get('tesseract_cmd', None)
|
|
self._version = None
|
|
|
|
# Language mapping for Tesseract
|
|
self.lang_map = {
|
|
'en': 'eng',
|
|
'sv': 'swe', # Swedish
|
|
'de': 'deu',
|
|
'fr': 'fra',
|
|
'es': 'spa',
|
|
'latin': 'eng+deu+fra+spa', # Multi-language
|
|
}
|
|
|
|
# Tesseract configuration
|
|
self.config = kwargs.get('config', '--psm 6') # Assume single uniform block of text
|
|
|
|
def _initialize(self) -> bool:
|
|
"""Initialize Tesseract OCR."""
|
|
try:
|
|
import pytesseract
|
|
|
|
# Set custom path if provided
|
|
if self.tesseract_cmd:
|
|
pytesseract.pytesseract.tesseract_cmd = self.tesseract_cmd
|
|
|
|
# Try to get version to verify installation
|
|
try:
|
|
version = pytesseract.get_tesseract_version()
|
|
self._version = str(version)
|
|
logger.info(f"Tesseract version: {version}")
|
|
except Exception as e:
|
|
# Try to find tesseract in PATH
|
|
tesseract_path = shutil.which('tesseract')
|
|
if tesseract_path:
|
|
pytesseract.pytesseract.tesseract_cmd = tesseract_path
|
|
version = pytesseract.get_tesseract_version()
|
|
self._version = str(version)
|
|
logger.info(f"Tesseract found at: {tesseract_path}, version: {version}")
|
|
else:
|
|
raise e
|
|
|
|
self._available = True
|
|
logger.info("Tesseract OCR initialized successfully")
|
|
return True
|
|
|
|
except ImportError:
|
|
self._error_msg = "pytesseract not installed. Run: pip install pytesseract"
|
|
logger.warning(self._error_msg)
|
|
return False
|
|
|
|
except Exception as e:
|
|
self._error_msg = f"Tesseract not found: {e}. Please install Tesseract OCR."
|
|
logger.warning(self._error_msg)
|
|
logger.info("Download from: https://github.com/UB-Mannheim/tesseract/wiki")
|
|
return False
|
|
|
|
def extract_text(self, image: np.ndarray) -> List[OCRTextRegion]:
|
|
"""
|
|
Extract text from image using Tesseract.
|
|
|
|
Uses a two-step approach:
|
|
1. Detect text regions using OpenCV contours
|
|
2. Run Tesseract on each region
|
|
|
|
Args:
|
|
image: Input image (BGR format from OpenCV)
|
|
|
|
Returns:
|
|
List of detected text regions with recognized text
|
|
"""
|
|
if not self._available:
|
|
logger.error("Tesseract backend not initialized")
|
|
return []
|
|
|
|
try:
|
|
import pytesseract
|
|
import cv2
|
|
|
|
# Preprocess image
|
|
gray = self._to_grayscale(image)
|
|
processed = self._preprocess_for_tesseract(gray)
|
|
|
|
# Get data including bounding boxes
|
|
tesseract_lang = self.lang_map.get(self.lang, 'eng')
|
|
|
|
data = pytesseract.image_to_data(
|
|
processed,
|
|
lang=tesseract_lang,
|
|
config=self.config,
|
|
output_type=pytesseract.Output.DICT
|
|
)
|
|
|
|
regions = []
|
|
n_boxes = len(data['text'])
|
|
|
|
for i in range(n_boxes):
|
|
text = data['text'][i].strip()
|
|
conf = int(data['conf'][i])
|
|
|
|
# Filter low confidence and empty text
|
|
if conf > 30 and text:
|
|
x = data['left'][i]
|
|
y = data['top'][i]
|
|
w = data['width'][i]
|
|
h = data['height'][i]
|
|
|
|
regions.append(OCRTextRegion(
|
|
text=text,
|
|
confidence=conf / 100.0, # Normalize to 0-1
|
|
bbox=(x, y, w, h),
|
|
language=self.lang
|
|
))
|
|
|
|
# Merge overlapping regions that are likely the same text
|
|
regions = self._merge_nearby_regions(regions)
|
|
|
|
logger.debug(f"Tesseract detected {len(regions)} text regions")
|
|
return regions
|
|
|
|
except Exception as e:
|
|
logger.error(f"Tesseract extraction failed: {e}")
|
|
return []
|
|
|
|
def _preprocess_for_tesseract(self, gray: np.ndarray) -> np.ndarray:
|
|
"""Preprocess image specifically for Tesseract."""
|
|
import cv2
|
|
|
|
# Resize small images (Tesseract works better with larger text)
|
|
h, w = gray.shape[:2]
|
|
min_height = 100
|
|
if h < min_height:
|
|
scale = min_height / h
|
|
gray = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
|
|
|
|
# Apply adaptive thresholding
|
|
processed = cv2.adaptiveThreshold(
|
|
gray, 255,
|
|
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
|
cv2.THRESH_BINARY,
|
|
11, 2
|
|
)
|
|
|
|
# Denoise
|
|
processed = cv2.fastNlMeansDenoising(processed, None, 10, 7, 21)
|
|
|
|
return processed
|
|
|
|
def _merge_nearby_regions(self, regions: List[OCRTextRegion],
|
|
max_distance: int = 10) -> List[OCRTextRegion]:
|
|
"""Merge text regions that are close to each other."""
|
|
if not regions:
|
|
return []
|
|
|
|
# Sort by y position
|
|
sorted_regions = sorted(regions, key=lambda r: (r.bbox[1], r.bbox[0]))
|
|
|
|
merged = []
|
|
current = sorted_regions[0]
|
|
|
|
for next_region in sorted_regions[1:]:
|
|
# Check if regions are close enough to merge
|
|
cx, cy, cw, ch = current.bbox
|
|
nx, ny, nw, nh = next_region.bbox
|
|
|
|
# Calculate distance
|
|
distance = abs(ny - cy)
|
|
x_overlap = not (cx + cw < nx or nx + nw < cx)
|
|
|
|
if distance < max_distance and x_overlap:
|
|
# Merge regions
|
|
min_x = min(cx, nx)
|
|
min_y = min(cy, ny)
|
|
max_x = max(cx + cw, nx + nw)
|
|
max_y = max(cy + ch, ny + nh)
|
|
|
|
# Combine text
|
|
combined_text = current.text + " " + next_region.text
|
|
avg_conf = (current.confidence + next_region.confidence) / 2
|
|
|
|
current = OCRTextRegion(
|
|
text=combined_text.strip(),
|
|
confidence=avg_conf,
|
|
bbox=(min_x, min_y, max_x - min_x, max_y - min_y),
|
|
language=self.lang
|
|
)
|
|
else:
|
|
merged.append(current)
|
|
current = next_region
|
|
|
|
merged.append(current)
|
|
return merged
|
|
|
|
def extract_text_simple(self, image: np.ndarray) -> str:
|
|
"""
|
|
Simple text extraction without region detection.
|
|
|
|
Returns:
|
|
All text found in image as single string
|
|
"""
|
|
if not self._available:
|
|
return ""
|
|
|
|
try:
|
|
import pytesseract
|
|
import cv2
|
|
|
|
# Convert to RGB if needed
|
|
if len(image.shape) == 3:
|
|
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
|
|
|
tesseract_lang = self.lang_map.get(self.lang, 'eng')
|
|
|
|
text = pytesseract.image_to_string(
|
|
image,
|
|
lang=tesseract_lang,
|
|
config=self.config
|
|
)
|
|
|
|
return text.strip()
|
|
|
|
except Exception as e:
|
|
logger.error(f"Tesseract simple extraction failed: {e}")
|
|
return ""
|
|
|
|
@staticmethod
|
|
def find_tesseract() -> Optional[str]:
|
|
"""Find Tesseract installation path."""
|
|
path = shutil.which('tesseract')
|
|
if path:
|
|
return path
|
|
|
|
# Common Windows paths
|
|
common_paths = [
|
|
r"C:\Program Files\Tesseract-OCR\tesseract.exe",
|
|
r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
|
|
r"C:\Users\%USERNAME%\AppData\Local\Tesseract-OCR\tesseract.exe",
|
|
r"C:\Tesseract-OCR\tesseract.exe",
|
|
]
|
|
|
|
import os
|
|
for p in common_paths:
|
|
expanded = os.path.expandvars(p)
|
|
if Path(expanded).exists():
|
|
return expanded
|
|
|
|
return None
|