""" Lemontropia Suite - OpenCV GPU Text Detector Alternative to PaddleOCR using OpenCV DNN with CUDA support. Faster, simpler, no heavy dependencies. Based on: https://pyimagesearch.com/2022/03/14/improving-text-detection-speed-with-opencv-and-gpus/ """ import cv2 import numpy as np import logging from pathlib import Path from typing import List, Tuple, Optional, Dict, Any from dataclasses import dataclass logger = logging.getLogger(__name__) @dataclass class TextDetection: """Detected text region.""" text: str # Recognized text (may be empty if detection only) confidence: float bbox: Tuple[int, int, int, int] # x, y, w, h def to_dict(self) -> Dict[str, Any]: return { 'text': self.text, 'confidence': self.confidence, 'bbox': self.bbox } class OpenCVTextDetector: """ Text detector using OpenCV DNN with optional GPU acceleration. Uses EAST (Efficient and Accurate Scene Text) detection model. Much faster than PaddleOCR and has fewer dependencies. Performance (from PyImageSearch): - CPU: ~23 FPS - GPU: ~97 FPS (4x faster!) """ # EAST model download URL EAST_MODEL_URL = "https://github.com/oyyd/frozen_east_text_detection.pb/raw/master/frozen_east_text_detection.pb" def __init__(self, model_path: Optional[str] = None, use_gpu: bool = True): """ Initialize OpenCV text detector. Args: model_path: Path to frozen_east_text_detection.pb model use_gpu: Whether to use CUDA GPU acceleration """ self.use_gpu = use_gpu self.net = None self.model_path = model_path self._model_loaded = False # Default input size (must be multiple of 32) self.input_width = 320 self.input_height = 320 # Detection thresholds self.confidence_threshold = 0.5 self.nms_threshold = 0.4 self._load_model() def _load_model(self) -> bool: """Load EAST text detection model.""" try: # Default model location if not self.model_path: model_dir = Path(__file__).parent.parent / "data" / "models" model_dir.mkdir(parents=True, exist_ok=True) self.model_path = str(model_dir / "frozen_east_text_detection.pb") model_file = Path(self.model_path) # Download if not exists if not model_file.exists(): logger.info(f"EAST model not found, downloading...") self._download_model() # Load model logger.info(f"Loading EAST text detector from {self.model_path}") self.net = cv2.dnn.readNet(self.model_path) # Enable GPU if requested and available if self.use_gpu: self._enable_gpu() self._model_loaded = True logger.info("EAST text detector loaded successfully") return True except Exception as e: logger.error(f"Failed to load EAST model: {e}") return False def _download_model(self) -> bool: """Download EAST model if not present.""" try: import urllib.request logger.info(f"Downloading EAST model to {self.model_path}") urllib.request.urlretrieve(self.EAST_MODEL_URL, self.model_path) logger.info("EAST model downloaded successfully") return True except Exception as e: logger.error(f"Failed to download EAST model: {e}") return False def _enable_gpu(self) -> bool: """Enable CUDA GPU acceleration.""" try: # Check if CUDA is available in OpenCV if cv2.cuda.getCudaEnabledDeviceCount() > 0: logger.info("Enabling CUDA GPU acceleration for text detection") self.net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA) self.net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA) return True else: logger.warning("CUDA not available, using CPU") return False except Exception as e: logger.warning(f"Failed to enable GPU: {e}, using CPU") return False def detect_text(self, image: np.ndarray) -> List[TextDetection]: """ Detect text regions in image. Args: image: Input image (BGR format) Returns: List of detected text regions """ if not self._model_loaded: logger.error("Model not loaded, cannot detect text") return [] try: # Get image dimensions (H, W) = image.shape[:2] # Resize image to multiple of 32 (newW, newH) = (self.input_width, self.input_height) rW = W / float(newW) rH = H / float(newH) # Resize and prepare blob resized = cv2.resize(image, (newW, newH)) blob = cv2.dnn.blobFromImage(resized, 1.0, (newW, newH), (123.68, 116.78, 103.94), swapRB=True, crop=False) # Forward pass self.net.setInput(blob) (scores, geometry) = self.net.forward(self._get_output_layers()) # Decode predictions (rects, confidences) = self._decode_predictions(scores, geometry) # Apply non-maxima suppression boxes = self._apply_nms(rects, confidences, rW, rH) # Create detection objects detections = [] for (startX, startY, endX, endY, conf) in boxes: detections.append(TextDetection( text="", # EAST only detects, doesn't recognize confidence=conf, bbox=(int(startX), int(startY), int(endX - startX), int(endY - startY)) )) return detections except Exception as e: logger.error(f"Text detection failed: {e}") return [] def _get_output_layers(self) -> List[str]: """Get EAST model output layer names.""" layerNames = [ "feature_fusion/Conv_7/Sigmoid", # scores "feature_fusion/concat_3" # geometry ] return layerNames def _decode_predictions(self, scores: np.ndarray, geometry: np.ndarray) -> Tuple[List, List]: """Decode EAST model output to bounding boxes.""" (numRows, numCols) = scores.shape[2:4] rects = [] confidences = [] for y in range(0, numRows): scoresData = scores[0, 0, y] xData0 = geometry[0, 0, y] xData1 = geometry[0, 1, y] xData2 = geometry[0, 2, y] xData3 = geometry[0, 3, y] anglesData = geometry[0, 4, y] for x in range(0, numCols): if scoresData[x] < self.confidence_threshold: continue # Compute offset factor (offsetX, offsetY) = (x * 4.0, y * 4.0) # Extract rotation angle angle = anglesData[x] cos = np.cos(angle) sin = np.sin(angle) # Compute bounding box dimensions h = xData0[x] + xData2[x] w = xData1[x] + xData3[x] # Compute bounding box coordinates endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x])) endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x])) startX = int(endX - w) startY = int(endY - h) rects.append((startX, startY, endX, endY)) confidences.append(float(scoresData[x])) return (rects, confidences) def _apply_nms(self, rects: List, confidences: List, rW: float, rH: float) -> List[Tuple]: """Apply non-maximum suppression and scale boxes.""" # Convert to numpy arrays boxes = np.array(rects) # Apply NMS indices = cv2.dnn.NMSBoxesRotated( [((0, 0), 0, 0)] * len(boxes), # dummy rotated boxes confidences, self.confidence_threshold, self.nms_threshold ) # Scale boxes back to original image size results = [] if len(indices) > 0: for i in indices.flatten(): (startX, startY, endX, endY) = boxes[i] # Scale coordinates startX = int(startX * rW) startY = int(startY * rH) endX = int(endX * rW) endY = int(endY * rH) results.append((startX, startY, endX, endY, confidences[i])) return results def is_available(self) -> bool: """Check if detector is available.""" return self._model_loaded @staticmethod def check_gpu_available() -> bool: """Check if CUDA GPU is available in OpenCV.""" try: return cv2.cuda.getCudaEnabledDeviceCount() > 0 except: return False # Convenience function for quick text detection def detect_text_opencv(image: np.ndarray, use_gpu: bool = True) -> List[TextDetection]: """ Quick text detection using OpenCV DNN. Args: image: Input image use_gpu: Use GPU acceleration if available Returns: List of detected text regions """ detector = OpenCVTextDetector(use_gpu=use_gpu) return detector.detect_text(image)