# Lemontropia-Suite/modules/opencv_text_detector.py

"""
Lemontropia Suite - OpenCV GPU Text Detector
Alternative to PaddleOCR using OpenCV DNN with CUDA support.
Faster, simpler, no heavy dependencies.

Based on: https://pyimagesearch.com/2022/03/14/improving-text-detection-speed-with-opencv-and-gpus/
"""

import cv2
import numpy as np
import logging
from pathlib import Path
from typing import List, Tuple, Optional, Dict, Any
from dataclasses import dataclass

logger = logging.getLogger(__name__)

@dataclass
class TextDetection:
    """One text region reported by a detector.

    Instances are plain value objects; ``to_dict`` exposes the same three
    fields as a dictionary.
    """
    # Recognized string; may be empty when only detection (no recognition) ran.
    text: str
    # Confidence score reported for this region.
    confidence: float
    # Bounding box as (x, y, w, h).
    bbox: Tuple[int, int, int, int]

    def to_dict(self) -> Dict[str, Any]:
        """Return the detection as a dict with 'text', 'confidence' and 'bbox' keys."""
        return dict(
            text=self.text,
            confidence=self.confidence,
            bbox=self.bbox,
        )


class OpenCVTextDetector:
    """
    Text detector using OpenCV DNN with optional GPU acceleration.

    Uses EAST (Efficient and Accurate Scene Text) detection model.
    Much faster than PaddleOCR and has fewer dependencies.

    EAST only localizes text; it does not recognize it, so every
    TextDetection produced by this class has an empty ``text`` field.

    Performance (from PyImageSearch):
    - CPU: ~23 FPS
    - GPU: ~97 FPS (4x faster!)
    """

    # EAST model download URL
    EAST_MODEL_URL = "https://github.com/oyyd/frozen_east_text_detection.pb/raw/master/frozen_east_text_detection.pb"

    # Socket timeout (seconds) used while downloading the model, so a stalled
    # connection cannot block the constructor indefinitely.
    DOWNLOAD_TIMEOUT = 60

    def __init__(self, model_path: Optional[str] = None, use_gpu: bool = True):
        """
        Initialize OpenCV text detector.

        The model is loaded (and downloaded if missing) immediately; failures
        are logged rather than raised, so callers should check
        ``is_available()`` afterwards.

        Args:
            model_path: Path to frozen_east_text_detection.pb model.
                When omitted, ``<package root>/data/models/`` is used.
            use_gpu: Whether to use CUDA GPU acceleration
        """
        self.use_gpu = use_gpu
        self.net = None
        self.model_path = model_path
        self._model_loaded = False

        # Default input size (must be multiple of 32)
        self.input_width = 320
        self.input_height = 320

        # Detection thresholds
        self.confidence_threshold = 0.5
        self.nms_threshold = 0.4

        self._load_model()

    def _load_model(self) -> bool:
        """
        Load EAST text detection model, downloading it first if necessary.

        Returns:
            True when the network was loaded, False on any failure
            (the failure is logged, never raised).
        """
        try:
            # Default model location
            if not self.model_path:
                model_dir = Path(__file__).parent.parent / "data" / "models"
                model_dir.mkdir(parents=True, exist_ok=True)
                self.model_path = str(model_dir / "frozen_east_text_detection.pb")

            model_file = Path(self.model_path)

            # Download if not exists; stop here when the download fails
            # instead of handing a missing file to readNet.
            if not model_file.exists():
                logger.info("EAST model not found, downloading...")
                if not self._download_model():
                    return False

            # Load model
            logger.info(f"Loading EAST text detector from {self.model_path}")
            self.net = cv2.dnn.readNet(self.model_path)

            # Enable GPU if requested and available
            if self.use_gpu:
                self._enable_gpu()

            self._model_loaded = True
            logger.info("EAST text detector loaded successfully")
            return True

        except Exception as e:
            logger.error(f"Failed to load EAST model: {e}")
            return False

    def _download_model(self) -> bool:
        """
        Download EAST model to ``self.model_path``.

        The data is written to a sibling ``.part`` file and renamed into
        place only after the transfer finished, so an interrupted download
        never leaves a truncated model that later runs would mistake for a
        valid one.

        Returns:
            True on success, False on any failure (logged, never raised).
        """
        # Imported locally: only needed on the rare download path.
        import shutil
        import urllib.request

        target = Path(self.model_path)
        partial = target.with_name(target.name + ".part")

        try:
            logger.info(f"Downloading EAST model to {self.model_path}")
            target.parent.mkdir(parents=True, exist_ok=True)

            with urllib.request.urlopen(
                self.EAST_MODEL_URL, timeout=self.DOWNLOAD_TIMEOUT
            ) as response, open(partial, "wb") as out_file:
                shutil.copyfileobj(response, out_file)

            if partial.stat().st_size == 0:
                raise IOError("downloaded file is empty")

            # Atomic on the same filesystem; replaces any stale file.
            partial.replace(target)
            logger.info("EAST model downloaded successfully")
            return True

        except Exception as e:
            logger.error(f"Failed to download EAST model: {e}")
            # Best-effort cleanup of the incomplete file.
            try:
                partial.unlink()
            except OSError:
                pass
            return False

    def _enable_gpu(self) -> bool:
        """
        Enable CUDA GPU acceleration on the loaded network.

        Returns:
            True when the CUDA backend/target were selected, False when no
            CUDA device is reported or selecting the backend failed
            (the network then stays on its default CPU backend).
        """
        try:
            # Check if CUDA is available in OpenCV
            if cv2.cuda.getCudaEnabledDeviceCount() > 0:
                logger.info("Enabling CUDA GPU acceleration for text detection")
                self.net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
                self.net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)
                return True
            else:
                logger.warning("CUDA not available, using CPU")
                return False

        except Exception as e:
            logger.warning(f"Failed to enable GPU: {e}, using CPU")
            return False

    def detect_text(self, image: np.ndarray) -> List[TextDetection]:
        """
        Detect text regions in image.

        Args:
            image: Input image (BGR format). Single-channel (grayscale) and
                4-channel images are converted to 3-channel BGR first;
                4-channel input is assumed to be BGRA.

        Returns:
            List of detected text regions in original-image coordinates.
            Boxes are not clamped to the image bounds. Returns an empty list
            when the model is not loaded, the image is empty, or detection
            fails (failures are logged, never raised).
        """
        if not self._model_loaded:
            logger.error("Model not loaded, cannot detect text")
            return []

        if image is None or getattr(image, "size", 0) == 0:
            logger.warning("Empty image passed to text detector")
            return []

        try:
            # EAST expects a 3-channel input
            if image.ndim == 2:
                image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
            elif image.ndim == 3 and image.shape[2] == 4:
                image = cv2.cvtColor(image, cv2.COLOR_BGRA2BGR)

            # Get image dimensions
            (H, W) = image.shape[:2]

            # Network input size (multiple of 32) and the ratios needed to
            # map detections back to the original image
            (newW, newH) = (self.input_width, self.input_height)
            rW = W / float(newW)
            rH = H / float(newH)

            # Prepare blob; with crop=False blobFromImage itself resizes the
            # image to (newW, newH), so no separate cv2.resize is needed
            blob = cv2.dnn.blobFromImage(image, 1.0, (newW, newH),
                                         (123.68, 116.78, 103.94),
                                         swapRB=True, crop=False)

            # Forward pass
            self.net.setInput(blob)
            (scores, geometry) = self.net.forward(self._get_output_layers())

            # Decode predictions
            (rects, confidences) = self._decode_predictions(scores, geometry)

            # Apply non-maxima suppression
            boxes = self._apply_nms(rects, confidences, rW, rH)

            # Create detection objects; EAST only detects, doesn't recognize,
            # hence the empty text
            return [
                TextDetection(
                    text="",
                    confidence=conf,
                    bbox=(int(startX), int(startY),
                          int(endX - startX), int(endY - startY))
                )
                for (startX, startY, endX, endY, conf) in boxes
            ]

        except Exception as e:
            logger.error(f"Text detection failed: {e}")
            return []

    def _get_output_layers(self) -> List[str]:
        """Get EAST model output layer names (scores first, then geometry)."""
        return [
            "feature_fusion/Conv_7/Sigmoid",  # scores
            "feature_fusion/concat_3"         # geometry
        ]

    def _decode_predictions(self, scores: np.ndarray, geometry: np.ndarray) -> Tuple[List, List]:
        """
        Decode EAST model output to bounding boxes.

        Args:
            scores: Score map, indexed as ``scores[0, 0, y, x]``.
            geometry: Geometry map, indexed as ``geometry[0, c, y, x]`` with
                four distance channels (c = 0..3) and a rotation angle
                channel (c = 4).

        Returns:
            ``(rects, confidences)`` where each rect is
            ``(startX, startY, endX, endY)`` in network-input coordinates and
            ``confidences`` holds the matching scores as Python floats.
            Cells scoring below ``self.confidence_threshold`` are skipped.
        """
        (num_rows, num_cols) = scores.shape[2:4]
        rects = []
        confidences = []

        for y in range(num_rows):
            row_scores = scores[0, 0, y]
            # Distance channels: 0 and 2 add up to the box height,
            # 1 and 3 to the box width
            d0 = geometry[0, 0, y]
            d1 = geometry[0, 1, y]
            d2 = geometry[0, 2, y]
            d3 = geometry[0, 3, y]
            row_angles = geometry[0, 4, y]

            for x in range(num_cols):
                if row_scores[x] < self.confidence_threshold:
                    continue

                # Each output cell corresponds to a 4-pixel step in the
                # network input
                (offset_x, offset_y) = (x * 4.0, y * 4.0)

                # Extract rotation angle
                angle = row_angles[x]
                cos = np.cos(angle)
                sin = np.sin(angle)

                # Compute bounding box dimensions
                h = d0[x] + d2[x]
                w = d1[x] + d3[x]

                # Compute bounding box coordinates (axis-aligned box anchored
                # at the rotated bottom-right corner)
                end_x = int(offset_x + (cos * d1[x]) + (sin * d2[x]))
                end_y = int(offset_y - (sin * d1[x]) + (cos * d2[x]))
                start_x = int(end_x - w)
                start_y = int(end_y - h)

                rects.append((start_x, start_y, end_x, end_y))
                confidences.append(float(row_scores[x]))

        return (rects, confidences)

    def _apply_nms(self, rects: List, confidences: List,
                   rW: float, rH: float) -> List[Tuple]:
        """
        Apply non-maximum suppression and scale boxes.

        Args:
            rects: Boxes as ``(startX, startY, endX, endY)`` in network-input
                coordinates.
            confidences: Score for each box, same order as ``rects``.
            rW: Horizontal scale factor (original width / network width).
            rH: Vertical scale factor (original height / network height).

        Returns:
            List of ``(startX, startY, endX, endY, confidence)`` tuples for
            the boxes that survive suppression, scaled to the original image.
        """
        if len(rects) == 0:
            return []

        # cv2.dnn.NMSBoxes expects boxes as [x, y, width, height]; suppression
        # must run on the real decoded boxes to have any effect.
        nms_boxes = [
            [int(start_x), int(start_y),
             int(end_x - start_x), int(end_y - start_y)]
            for (start_x, start_y, end_x, end_y) in rects
        ]
        nms_scores = [float(c) for c in confidences]

        indices = cv2.dnn.NMSBoxes(
            nms_boxes,
            nms_scores,
            self.confidence_threshold,
            self.nms_threshold
        )

        # Depending on the OpenCV version the result is an empty tuple, an
        # (N, 1) array or a flat array; normalise to a flat list of ints.
        kept = np.array(indices, dtype=int).reshape(-1)

        # Scale boxes back to original image size
        results = []
        for i in kept:
            i = int(i)
            (start_x, start_y, end_x, end_y) = rects[i]
            results.append((
                int(start_x * rW),
                int(start_y * rH),
                int(end_x * rW),
                int(end_y * rH),
                confidences[i],
            ))

        return results

    def is_available(self) -> bool:
        """Check if detector is available (i.e. the model was loaded)."""
        return self._model_loaded

    @staticmethod
    def check_gpu_available() -> bool:
        """
        Check if CUDA GPU is available in OpenCV.

        Returns False when the query itself fails (e.g. an OpenCV build
        without the cuda module).
        """
        try:
            return cv2.cuda.getCudaEnabledDeviceCount() > 0
        except Exception:
            return False


# Convenience function for quick text detection
def detect_text_opencv(image: np.ndarray, use_gpu: bool = True) -> List[TextDetection]:
    """
    One-shot text detection using OpenCV DNN.

    A new OpenCVTextDetector is constructed on every call and used for a
    single detection; keep your own detector instance when processing many
    images.

    Args:
        image: Input image
        use_gpu: Use GPU acceleration if available

    Returns:
        List of detected text regions
    """
    return OpenCVTextDetector(use_gpu=use_gpu).detect_text(image)