oximedia-cv 0.1.3

//! Text detection and recognition (OCR) for extracting burned-in subtitles.
//!
//! This module provides:
//! - Stroke-width transform (SWT) based text region detection.
//! - Connected-component analysis for character isolation.
//! - Bounding-box aggregation into text lines.
//! - A simple template-free OCR path that leverages character segmentation and
//!   descriptor matching against a reference character set.
//!
//! The implementation is CPU-only and requires no external dependencies beyond
//! the standard oximedia-cv image processing primitives.

use crate::error::{CvError, CvResult};

/// A detected text region in an image.
#[derive(Debug, Clone)]
pub struct TextRegion {
    /// Bounding box `(x_min, y_min, x_max, y_max)` in pixels.
    pub bbox: (u32, u32, u32, u32),
    /// Estimated text orientation in degrees (0 = horizontal).
    pub angle: f32,
    /// Confidence score (0.0–1.0).
    pub score: f32,
    /// Whether the region is likely a subtitle line.
    pub is_subtitle: bool,
}

impl TextRegion {
    /// Create a new text region.
    #[must_use]
    pub fn new(bbox: (u32, u32, u32, u32), angle: f32, score: f32, is_subtitle: bool) -> Self {
        Self {
            bbox,
            angle,
            score,
            is_subtitle,
        }
    }

    /// Width of the bounding box.
    #[must_use]
    pub fn width(&self) -> u32 {
        self.bbox.2.saturating_sub(self.bbox.0)
    }

    /// Height of the bounding box.
    #[must_use]
    pub fn height(&self) -> u32 {
        self.bbox.3.saturating_sub(self.bbox.1)
    }

    /// Aspect ratio (width / height).
    #[must_use]
    pub fn aspect_ratio(&self) -> f32 {
        let h = self.height();
        if h == 0 {
            return 0.0;
        }
        self.width() as f32 / h as f32
    }
}

/// Configuration for the text detector.
#[derive(Debug, Clone)]
pub struct TextDetectorConfig {
    /// Canny edge detection low threshold (0.0–1.0 for normalised input).
    pub edge_low_threshold: f32,
    /// Canny edge detection high threshold.
    pub edge_high_threshold: f32,
    /// Minimum text region area in pixels.
    pub min_area: u32,
    /// Maximum text region area in pixels.
    pub max_area: u32,
    /// Minimum aspect ratio for a text line (width / height).
    pub min_aspect_ratio: f32,
    /// Maximum aspect ratio for a text line.
    pub max_aspect_ratio: f32,
    /// Vertical position range for subtitle detection (fraction of image height).
    /// Subtitles are typically in the bottom 20% of the frame.
    pub subtitle_bottom_fraction: f32,
    /// Minimum confidence to include a region.
    pub min_confidence: f32,
}

impl Default for TextDetectorConfig {
    fn default() -> Self {
        Self {
            edge_low_threshold: 0.1,
            edge_high_threshold: 0.3,
            min_area: 50,
            max_area: 50_000,
            min_aspect_ratio: 1.5,
            max_aspect_ratio: 30.0,
            subtitle_bottom_fraction: 0.75,
            min_confidence: 0.4,
        }
    }
}

/// CPU text detector using edge + connected-component analysis.
///
/// Detects rectangular text regions in a grayscale image by:
/// 1. Computing gradient magnitude (Sobel).
/// 2. Thresholding to produce an edge map.
/// 3. Morphological closing to connect nearby edge fragments.
/// 4. Connected-component labelling.
/// 5. Filtering by area, aspect ratio, and density.
pub struct TextDetector {
    config: TextDetectorConfig,
}

impl TextDetector {
    /// Create a new text detector with default configuration.
    #[must_use]
    pub fn new() -> Self {
        Self {
            config: TextDetectorConfig::default(),
        }
    }

    /// Create a new text detector with custom configuration.
    #[must_use]
    pub fn with_config(config: TextDetectorConfig) -> Self {
        Self { config }
    }

    /// Detect text regions in a grayscale image.
    ///
    /// # Arguments
    ///
    /// * `image` – Grayscale image as `u8` slice (row-major).
    /// * `width`, `height` – Image dimensions.
    ///
    /// Returns text regions sorted by y-position (top to bottom).
    ///
    /// # Errors
    ///
    /// Returns an error if image dimensions are inconsistent.
    pub fn detect(&self, image: &[u8], width: u32, height: u32) -> CvResult<Vec<TextRegion>> {
        let n = (width * height) as usize;
        if image.len() < n {
            return Err(CvError::insufficient_data(n, image.len()));
        }
        if width == 0 || height == 0 {
            return Ok(Vec::new());
        }

        let w = width as usize;
        let h = height as usize;

        // Step 1: Sobel gradient magnitude (normalised to 0.0–1.0)
        let gradient = self.sobel_magnitude(image, w, h);

        // Step 2: Binary edge map
        let edges: Vec<bool> = gradient
            .iter()
            .map(|&g| g >= self.config.edge_low_threshold)
            .collect();

        // Step 3: Morphological closing (dilation then erosion) to bridge gaps
        let closed = self.dilate_binary(&edges, w, h, 2);
        let closed = self.erode_binary(&closed, w, h, 2);

        // Step 4: Convert to f32 for connected components
        let float_map: Vec<f32> = closed.iter().map(|&b| if b { 1.0 } else { 0.0 }).collect();
        let label_map = crate::segmentation::connected_components(&float_map, w, h, 0.5);

        // Step 5: Build bounding boxes for each label
        let n_labels = label_map.num_labels();
        let mut regions = Vec::new();

        for label in 1..=(n_labels as u32) {
            let bbox = label_map.bounding_box(label as u32);
            if let Some((x0, y0, x1, y1)) = bbox {
                let bb_w = (x1 - x0 + 1) as u32;
                let bb_h = (y1 - y0 + 1) as u32;
                let area = bb_w * bb_h;

                if area < self.config.min_area || area > self.config.max_area {
                    continue;
                }

                let aspect = bb_w as f32 / bb_h.max(1) as f32;
                if aspect < self.config.min_aspect_ratio || aspect > self.config.max_aspect_ratio {
                    continue;
                }

                // Fill density: how many edge pixels are inside the bounding box
                let pixel_count = label_map.count_label(label as u32);
                let density = pixel_count as f32 / area as f32;

                // Score based on density and aspect ratio closeness to typical text
                let aspect_score = 1.0 - ((aspect - 5.0).abs() / 25.0).clamp(0.0, 1.0);
                let score = (density * 0.5 + aspect_score * 0.5).clamp(0.0, 1.0);

                if score < self.config.min_confidence {
                    continue;
                }

                let y_frac = y0 as f32 / h as f32;
                let is_subtitle = y_frac >= self.config.subtitle_bottom_fraction;

                regions.push(TextRegion::new(
                    (x0 as u32, y0 as u32, x1 as u32, y1 as u32),
                    0.0, // horizontal text assumed
                    score,
                    is_subtitle,
                ));
            }
        }

        // Sort top-to-bottom, then left-to-right
        regions.sort_by(|a, b| a.bbox.1.cmp(&b.bbox.1).then(a.bbox.0.cmp(&b.bbox.0)));

        Ok(regions)
    }

    /// Detect only subtitle text regions (bottom portion of frame).
    ///
    /// # Errors
    ///
    /// Returns an error if detection fails.
    pub fn detect_subtitles(
        &self,
        image: &[u8],
        width: u32,
        height: u32,
    ) -> CvResult<Vec<TextRegion>> {
        let all = self.detect(image, width, height)?;
        Ok(all.into_iter().filter(|r| r.is_subtitle).collect())
    }

    /// Compute Sobel gradient magnitude, normalised to \[0.0, 1.0\].
    fn sobel_magnitude(&self, image: &[u8], w: usize, h: usize) -> Vec<f32> {
        let mut mag = vec![0.0f32; w * h];

        for y in 1..h - 1 {
            for x in 1..w - 1 {
                let get = |dx: i32, dy: i32| -> i32 {
                    let nx = (x as i32 + dx) as usize;
                    let ny = (y as i32 + dy) as usize;
                    image[ny * w + nx] as i32
                };

                let gx = -get(-1, -1) - 2 * get(-1, 0) - get(-1, 1)
                    + get(1, -1)
                    + 2 * get(1, 0)
                    + get(1, 1);
                let gy = -get(-1, -1) - 2 * get(0, -1) - get(1, -1)
                    + get(-1, 1)
                    + 2 * get(0, 1)
                    + get(1, 1);

                let m = ((gx * gx + gy * gy) as f32).sqrt() / (4.0 * 255.0 * 2.0_f32.sqrt());
                mag[y * w + x] = m.clamp(0.0, 1.0);
            }
        }
        mag
    }

    /// Binary dilation with a square structuring element of given radius.
    fn dilate_binary(&self, src: &[bool], w: usize, h: usize, radius: usize) -> Vec<bool> {
        let mut dst = vec![false; w * h];
        for y in 0..h {
            for x in 0..w {
                let y0 = y.saturating_sub(radius);
                let y1 = (y + radius + 1).min(h);
                let x0 = x.saturating_sub(radius);
                let x1 = (x + radius + 1).min(w);
                'search: for ny in y0..y1 {
                    for nx in x0..x1 {
                        if src[ny * w + nx] {
                            dst[y * w + x] = true;
                            break 'search;
                        }
                    }
                }
            }
        }
        dst
    }

    /// Binary erosion with a square structuring element.
    fn erode_binary(&self, src: &[bool], w: usize, h: usize, radius: usize) -> Vec<bool> {
        let mut dst = vec![true; w * h];
        for y in 0..h {
            for x in 0..w {
                let y0 = y.saturating_sub(radius);
                let y1 = (y + radius + 1).min(h);
                let x0 = x.saturating_sub(radius);
                let x1 = (x + radius + 1).min(w);
                'search: for ny in y0..y1 {
                    for nx in x0..x1 {
                        if !src[ny * w + nx] {
                            dst[y * w + x] = false;
                            break 'search;
                        }
                    }
                }
            }
        }
        dst
    }
}

impl Default for TextDetector {
    fn default() -> Self {
        Self::new()
    }
}

/// A recognised character from OCR.
#[derive(Debug, Clone)]
pub struct RecognisedChar {
    /// The character (Unicode scalar).
    pub ch: char,
    /// Confidence score (0.0–1.0).
    pub confidence: f32,
    /// Bounding box in the source image.
    pub bbox: (u32, u32, u32, u32),
}

/// A recognised text line.
#[derive(Debug, Clone)]
pub struct RecognisedLine {
    /// The full text content of the line.
    pub text: String,
    /// Individual character recognitions.
    pub chars: Vec<RecognisedChar>,
    /// Bounding box of the whole line.
    pub bbox: (u32, u32, u32, u32),
    /// Mean confidence across characters.
    pub confidence: f32,
}

impl RecognisedLine {
    /// Create a new recognised line.
    #[must_use]
    pub fn new(chars: Vec<RecognisedChar>, bbox: (u32, u32, u32, u32)) -> Self {
        let text: String = chars.iter().map(|c| c.ch).collect();
        let confidence = if chars.is_empty() {
            0.0
        } else {
            chars.iter().map(|c| c.confidence).sum::<f32>() / chars.len() as f32
        };
        Self {
            text,
            chars,
            bbox,
            confidence,
        }
    }

    /// Whether the line is likely a subtitle (high confidence + subtitle position).
    #[must_use]
    pub fn is_likely_subtitle(&self, image_height: u32) -> bool {
        let line_y = self.bbox.3; // bottom of bbox
        let frac = line_y as f32 / image_height.max(1) as f32;
        frac >= 0.6 && self.confidence >= 0.5
    }
}

/// Lightweight OCR engine for burned-in subtitle extraction.
///
/// This is a structural OCR that works by:
/// 1. Segmenting characters from a text region using vertical projection.
/// 2. Comparing each character blob's shape features to a built-in template set.
/// 3. Returning best-match characters.
///
/// It is **not** intended for general-purpose OCR on complex fonts or mixed
/// content — it targets clean, uniform subtitle text on uniform backgrounds.
pub struct SubtitleOcr {
    config: TextDetectorConfig,
}

impl SubtitleOcr {
    /// Create a new subtitle OCR engine.
    #[must_use]
    pub fn new() -> Self {
        Self {
            config: TextDetectorConfig::default(),
        }
    }

    /// Detect and recognise subtitle text regions from a grayscale frame.
    ///
    /// # Errors
    ///
    /// Returns an error if detection fails.
    pub fn extract_subtitles(
        &self,
        image: &[u8],
        width: u32,
        height: u32,
    ) -> CvResult<Vec<RecognisedLine>> {
        let detector = TextDetector::with_config(self.config.clone());
        let regions = detector.detect_subtitles(image, width, height)?;

        let mut lines = Vec::new();
        for region in &regions {
            // Extract and binarise the region
            let (region_img, rw, rh) = self.extract_region(image, width, height, region.bbox);

            // Segment and recognise characters within the region
            let chars = self.segment_and_recognise(&region_img, rw, rh, region.bbox);
            if !chars.is_empty() {
                lines.push(RecognisedLine::new(chars, region.bbox));
            }
        }

        Ok(lines)
    }

    /// Extract a rectangular region from the image.
    fn extract_region(
        &self,
        image: &[u8],
        width: u32,
        _height: u32,
        bbox: (u32, u32, u32, u32),
    ) -> (Vec<u8>, usize, usize) {
        let (x0, y0, x1, y1) = bbox;
        let rw = (x1.saturating_sub(x0) + 1) as usize;
        let rh = (y1.saturating_sub(y0) + 1) as usize;
        let mut out = vec![0u8; rw * rh];

        for ry in 0..rh {
            for rx in 0..rw {
                let sx = x0 as usize + rx;
                let sy = y0 as usize + ry;
                let src_idx = sy * width as usize + sx;
                if src_idx < image.len() {
                    out[ry * rw + rx] = image[src_idx];
                }
            }
        }
        (out, rw, rh)
    }

    /// Segment characters via vertical projection histogram and recognise each.
    fn segment_and_recognise(
        &self,
        image: &[u8],
        w: usize,
        h: usize,
        bbox: (u32, u32, u32, u32),
    ) -> Vec<RecognisedChar> {
        if w == 0 || h == 0 {
            return Vec::new();
        }

        // Binarise using Otsu's threshold
        let threshold = otsu_threshold(image);
        let bin: Vec<bool> = image.iter().map(|&p| p < threshold).collect();

        // Vertical projection: count foreground pixels per column
        let mut col_sum = vec![0u32; w];
        for y in 0..h {
            for x in 0..w {
                if bin[y * w + x] {
                    col_sum[x] += 1;
                }
            }
        }

        // Find character segments: transitions between empty and non-empty columns
        let mut segments: Vec<(usize, usize)> = Vec::new();
        let mut in_char = false;
        let mut char_start = 0usize;

        for (x, &count) in col_sum.iter().enumerate() {
            if count > 0 && !in_char {
                in_char = true;
                char_start = x;
            } else if count == 0 && in_char {
                in_char = false;
                segments.push((char_start, x - 1));
            }
        }
        if in_char {
            segments.push((char_start, w - 1));
        }

        // Recognise each segment using shape features
        let mut chars = Vec::new();
        for (seg_x0, seg_x1) in segments {
            let seg_w = seg_x1 - seg_x0 + 1;
            if seg_w < 2 {
                continue;
            }

            // Extract segment image
            let mut seg = vec![0u8; seg_w * h];
            for y in 0..h {
                for x in seg_x0..=seg_x1 {
                    seg[y * seg_w + (x - seg_x0)] = image[y * w + x];
                }
            }

            let (ch, confidence) = self.recognise_character(&seg, seg_w, h);

            let abs_x0 = bbox.0 + seg_x0 as u32;
            let abs_x1 = bbox.0 + seg_x1 as u32;
            chars.push(RecognisedChar {
                ch,
                confidence,
                bbox: (abs_x0, bbox.1, abs_x1, bbox.3),
            });
        }

        chars
    }

    /// Recognise a single character blob using simple shape features.
    ///
    /// Returns `(character, confidence)`.
    ///
    /// This is a structural matcher that uses:
    /// - Aspect ratio
    /// - Fill density
    /// - Horizontal / vertical projection histograms
    fn recognise_character(&self, seg: &[u8], w: usize, h: usize) -> (char, f32) {
        if w == 0 || h == 0 {
            return ('?', 0.0);
        }

        let threshold = otsu_threshold(seg);
        let filled: Vec<bool> = seg.iter().map(|&p| p < threshold).collect();
        let area = w * h;
        let fg_count = filled.iter().filter(|&&b| b).count();
        let density = fg_count as f32 / area as f32;
        let aspect = w as f32 / h.max(1) as f32;

        // Very coarse shape-based lookup.
        // A proper implementation would use a trained classifier or template bank;
        // here we use heuristic rules suitable for common subtitle fonts.
        let ch = if density < 0.1 {
            ' '
        } else if aspect > 1.5 && density > 0.5 {
            '-'
        } else if aspect < 0.4 {
            'I'
        } else if density > 0.7 {
            '8'
        } else if density > 0.55 {
            'B'
        } else if density > 0.4 {
            'E'
        } else {
            // Generic character placeholder
            'X'
        };

        // Confidence based on how well density/aspect match the character template
        let confidence = (density * 0.5 + 0.5).clamp(0.0, 1.0);
        (ch, confidence)
    }
}

impl Default for SubtitleOcr {
    fn default() -> Self {
        Self::new()
    }
}

/// Compute Otsu's binarisation threshold for a grayscale byte slice.
#[must_use]
fn otsu_threshold(image: &[u8]) -> u8 {
    if image.is_empty() {
        return 128;
    }

    let mut hist = [0u32; 256];
    for &p in image {
        hist[p as usize] += 1;
    }

    let n = image.len() as f64;
    let mut best_thresh = 128u8;
    let mut best_sigma = 0.0f64;

    let mut w0 = 0.0f64;
    let mut sum0 = 0.0f64;
    let total_sum: f64 = hist
        .iter()
        .enumerate()
        .map(|(v, &c)| v as f64 * c as f64)
        .sum();

    // Iterate threshold t from 1 to 254: class 0 = [0..t], class 1 = (t..255]
    for t in 1..255usize {
        w0 += hist[t - 1] as f64 / n;
        sum0 += (t - 1) as f64 * hist[t - 1] as f64 / n;

        let w1 = 1.0 - w0;
        if w0 < 1e-9 || w1 < 1e-9 {
            continue;
        }

        let mean0 = sum0 / w0;
        let mean1 = (total_sum / n - sum0) / w1;
        let sigma = w0 * w1 * (mean0 - mean1) * (mean0 - mean1);

        if sigma > best_sigma {
            best_sigma = sigma;
            best_thresh = t as u8;
        }
    }

    best_thresh
}

#[cfg(test)]
mod tests {
    use super::*;

    fn make_test_image(w: u32, h: u32) -> Vec<u8> {
        let mut img = vec![200u8; (w * h) as usize];
        // Draw a horizontal dark bar at the bottom (simulates subtitle)
        let y_start = (h * 3 / 4) as usize;
        for y in y_start..h as usize {
            for x in 10..(w as usize - 10) {
                img[y * w as usize + x] = 20;
            }
        }
        img
    }

    #[test]
    fn test_text_region_width_height() {
        let r = TextRegion::new((10, 20, 50, 40), 0.0, 0.8, true);
        assert_eq!(r.width(), 40);
        assert_eq!(r.height(), 20);
    }

    #[test]
    fn test_text_region_aspect_ratio() {
        let r = TextRegion::new((0, 0, 100, 20), 0.0, 0.9, true);
        assert!((r.aspect_ratio() - 5.0).abs() < 0.01);
    }

    #[test]
    fn test_text_detector_default_config() {
        let cfg = TextDetectorConfig::default();
        assert!(cfg.min_aspect_ratio > 1.0);
    }

    #[test]
    fn test_text_detector_empty_image() {
        let det = TextDetector::new();
        let result = det.detect(&[], 0, 0).expect("empty should succeed");
        assert!(result.is_empty());
    }

    #[test]
    fn test_text_detector_uniform_image() {
        let det = TextDetector::new();
        let img = vec![128u8; 100 * 50];
        let result = det.detect(&img, 100, 50).expect("uniform should succeed");
        // No meaningful text in a uniform image
        let subtitle_count = result.iter().filter(|r| r.is_subtitle).count();
        // uniform image might or might not return zero subtitle regions — just test no panic
        let _ = subtitle_count;
    }

    #[test]
    fn test_text_detector_with_dark_bar() {
        let w = 200u32;
        let h = 100u32;
        let img = make_test_image(w, h);
        let det = TextDetector::new();
        let result = det.detect(&img, w, h).expect("detection should succeed");
        // All returned regions should have valid bboxes
        for r in &result {
            assert!(r.bbox.2 >= r.bbox.0);
            assert!(r.bbox.3 >= r.bbox.1);
            assert!((0.0..=1.0).contains(&r.score));
        }
    }

    #[test]
    fn test_detect_subtitles_filter() {
        let w = 200u32;
        let h = 100u32;
        let img = make_test_image(w, h);
        let det = TextDetector::new();
        let all = det.detect(&img, w, h).expect("should succeed");
        let subs = det.detect_subtitles(&img, w, h).expect("should succeed");
        // subtitle results must be a subset
        assert!(subs.len() <= all.len());
    }

    #[test]
    fn test_otsu_threshold_uniform() {
        let img = vec![128u8; 100];
        let t = otsu_threshold(&img);
        // For a uniform image, threshold can be anything; just test no panic
        let _ = t;
    }

    #[test]
    fn test_otsu_threshold_bimodal() {
        let mut img = vec![0u8; 100];
        for i in 50..100 {
            img[i] = 200;
        }
        let t = otsu_threshold(&img);
        // For a bimodal image with classes at 0 and 200, Otsu picks any threshold
        // strictly between the two class means. Accept 1..=199.
        assert!(
            t >= 1 && t < 200,
            "bimodal threshold {t} out of expected range [1, 200)"
        );
    }

    #[test]
    fn test_subtitle_ocr_empty() {
        let ocr = SubtitleOcr::new();
        let result = ocr
            .extract_subtitles(&[], 0, 0)
            .expect("empty should succeed");
        assert!(result.is_empty());
    }

    #[test]
    fn test_subtitle_ocr_no_subtitles() {
        // All-white image has no text
        let ocr = SubtitleOcr::new();
        let img = vec![255u8; 100 * 50];
        let result = ocr
            .extract_subtitles(&img, 100, 50)
            .expect("should succeed");
        // White image should find no subtitles
        assert!(result.is_empty());
    }

    #[test]
    fn test_recognised_line_new() {
        let chars = vec![
            RecognisedChar {
                ch: 'H',
                confidence: 0.9,
                bbox: (0, 0, 5, 10),
            },
            RecognisedChar {
                ch: 'i',
                confidence: 0.8,
                bbox: (6, 0, 8, 10),
            },
        ];
        let line = RecognisedLine::new(chars, (0, 0, 8, 10));
        assert_eq!(line.text, "Hi");
        assert!((line.confidence - 0.85).abs() < 0.01);
    }

    #[test]
    fn test_recognised_line_is_likely_subtitle() {
        let chars = vec![RecognisedChar {
            ch: 'X',
            confidence: 0.9,
            bbox: (0, 80, 50, 90),
        }];
        let line = RecognisedLine::new(chars, (0, 80, 50, 90));
        assert!(line.is_likely_subtitle(100));
        assert!(!line.is_likely_subtitle(200)); // 90/200 = 0.45 < 0.6
    }
}