libgrammstein 0.1.0

//! Extractive summarization using ModernBERT embeddings.
//!
//! This module provides extractive summarization by selecting the most
//! representative sentences from a document based on embedding similarity.

use std::collections::HashSet;
use std::sync::Arc;

use super::embedder::{EmbeddingConfig, ModernBertEmbedder};
use super::modernbert::ModernBertModel;
use super::Result;

/// Configuration for summarization.
///
/// Controls how many sentences the summarizer returns, which sentences are
/// eligible for selection, and how relevance is traded against redundancy.
#[derive(Clone, Debug)]
pub struct SummarizerConfig {
    /// Number of sentences to extract when the caller does not pass an
    /// explicit count.
    pub num_sentences: usize,
    /// Minimum sentence length (in characters) to consider.
    /// Shorter sentences are excluded before embedding.
    pub min_sentence_length: usize,
    /// Maximum sentence length (in characters) to consider.
    /// Longer sentences are excluded before embedding.
    pub max_sentence_length: usize,
    /// Whether to maintain original sentence order in output.
    /// When `false`, sentences appear in the order they were selected.
    pub preserve_order: bool,
    /// Weight given to diversity during selection (0.0 to 1.0).
    ///
    /// Despite the name this is not a hard cutoff: each candidate is scored as
    /// `(1 - d) * relevance - d * similarity_to_already_selected`, where `d`
    /// is this value. Higher values prefer more diverse sentences.
    pub diversity_threshold: f32,
}

impl Default for SummarizerConfig {
    fn default() -> Self {
        Self {
            num_sentences: 3,
            min_sentence_length: 20,
            max_sentence_length: 500,
            preserve_order: true,
            diversity_threshold: 0.3,
        }
    }
}

/// Source of a synopsis (explicit from document or generated).
///
/// The derived `Default` is [`SynopsisSource::Generated`].
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum SynopsisSource {
    /// Synopsis was explicitly provided (e.g., document abstract).
    Explicit,
    /// Synopsis was generated by the summarizer.
    #[default]
    Generated,
}

/// A synopsis with its source.
///
/// Pairs the synopsis text with a [`SynopsisSource`] tag recording whether the
/// text was supplied with the document or produced by the summarizer.
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
pub struct Synopsis {
    /// The synopsis text.
    pub text: String,
    /// Source of the synopsis.
    pub source: SynopsisSource,
}

impl Synopsis {
    /// Create an explicit synopsis.
    pub fn explicit(text: impl Into<String>) -> Self {
        Self {
            text: text.into(),
            source: SynopsisSource::Explicit,
        }
    }

    /// Create a generated synopsis.
    pub fn generated(text: impl Into<String>) -> Self {
        Self {
            text: text.into(),
            source: SynopsisSource::Generated,
        }
    }

    /// Check if synopsis is explicit.
    pub fn is_explicit(&self) -> bool {
        matches!(self.source, SynopsisSource::Explicit)
    }
}

/// Extractive summarizer using ModernBERT embeddings.
///
/// Pairs an embedder with a [`SummarizerConfig`]. Summaries are built from
/// sentences taken verbatim from the input; no new text is generated.
pub struct Summarizer {
    /// Embedder used to turn sentences into vectors.
    embedder: ModernBertEmbedder,
    /// Selection settings (sentence count, length bounds, ordering, diversity).
    config: SummarizerConfig,
}

impl Summarizer {
    /// Create a new summarizer.
    pub fn new(embedder: ModernBertEmbedder, config: SummarizerConfig) -> Self {
        Self { embedder, config }
    }

    /// Create a summarizer from a model.
    pub fn from_model(model: Arc<ModernBertModel>, config: SummarizerConfig) -> Result<Self> {
        let embedder = ModernBertEmbedder::from_model(model, EmbeddingConfig::default());
        Ok(Self { embedder, config })
    }

    /// Generate an extractive summary by selecting representative sentences.
    ///
    /// Algorithm:
    /// 1. Split text into sentences
    /// 2. Drop sentences outside the configured length bounds
    /// 3. Embed the remaining sentences
    /// 4. Compute document centroid (mean of those embeddings)
    /// 5. Select sentences most similar to centroid, with diversity constraint
    ///
    /// `num_sentences` overrides `config.num_sentences` when it is `Some`.
    ///
    /// Returns the chosen sentences joined by single spaces. Special cases:
    /// * a requested count of `0`, or text with no sentences, yields `""`;
    /// * if the text has no more sentences than requested, all of them are
    ///   returned unfiltered;
    /// * if no sentence passes the length filter, the first `num` sentences
    ///   are returned as a fallback.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the embedder's `embed_batch`.
    ///
    /// This method takes `&self` rather than `&mut self`, so it needs no
    /// exclusive borrow. Whether it can be called from several threads at once
    /// depends on `ModernBertEmbedder`, which is defined elsewhere.
    pub fn extractive(&self, text: &str, num_sentences: Option<usize>) -> Result<String> {
        let num = num_sentences.unwrap_or(self.config.num_sentences);

        // Asking for zero sentences must yield an empty summary; without this
        // guard the selection step would still return its top-ranked sentence.
        if num == 0 {
            return Ok(String::new());
        }

        let sentences = self.split_sentences(text);

        if sentences.is_empty() {
            return Ok(String::new());
        }

        if sentences.len() <= num {
            return Ok(sentences.join(" "));
        }

        // Filter sentences by length. The bounds are documented in characters,
        // so count `char`s instead of using the byte length `str::len()`.
        let length_bounds = self.config.min_sentence_length..=self.config.max_sentence_length;
        let valid_sentences: Vec<(usize, &str)> = sentences
            .iter()
            .enumerate()
            .filter(|(_, s)| length_bounds.contains(&s.chars().count()))
            .map(|(i, s)| (i, s.as_str()))
            .collect();

        if valid_sentences.is_empty() {
            // No valid sentences, return first few
            return Ok(sentences
                .iter()
                .take(num)
                .map(String::as_str)
                .collect::<Vec<_>>()
                .join(" "));
        }

        // Embed valid sentences
        let sentence_texts: Vec<&str> = valid_sentences.iter().map(|&(_, s)| s).collect();
        let embeddings = self.embedder.embed_batch(&sentence_texts)?;

        // Compute centroid
        let centroid = Self::compute_centroid(&embeddings);

        // Score sentences by similarity to centroid. `scored[k]` is built from
        // `embeddings[k]`, so the two are position-aligned at this point.
        let mut scored: Vec<(usize, &str, f32)> = valid_sentences
            .iter()
            .zip(&embeddings)
            .map(|(&(idx, sent), emb)| {
                let sim = ModernBertEmbedder::cosine_similarity(emb, &centroid);
                (idx, sent, sim)
            })
            .collect();

        // Select top sentences with diversity constraint
        let selected = self.select_diverse(&mut scored, &embeddings, num);

        // Order by original position if configured
        let mut result_indices: Vec<usize> = selected.iter().map(|&(idx, _, _)| idx).collect();
        if self.config.preserve_order {
            // Indices are plain integers, so an unstable sort gives the same result.
            result_indices.sort_unstable();
        }

        // Build result from the original (untrimmed-by-filter) sentence list
        let summary: Vec<&str> = result_indices
            .iter()
            .filter_map(|&idx| sentences.get(idx).map(String::as_str))
            .collect();

        Ok(summary.join(" "))
    }

    /// Select up to `num` sentences using maximal marginal relevance (MMR).
    ///
    /// `scored` holds `(sentence index, text, relevance)` triples. On entry it
    /// must be position-aligned with `embeddings`: `scored[k]` is the sentence
    /// whose embedding is `embeddings[k]`, and `embeddings` must be at least as
    /// long as `scored`.
    ///
    /// The most relevant sentence is taken first. Each further pick maximizes
    /// `lambda * relevance - (1 - lambda) * max_similarity_to_selected`, where
    /// `lambda = 1 - config.diversity_threshold`.
    ///
    /// Returns the picks in selection order; the result is empty when `num` is
    /// `0` or `scored` is empty. As a side effect `scored` is left sorted by
    /// descending relevance.
    fn select_diverse<'a>(
        &self,
        scored: &mut [(usize, &'a str, f32)],
        embeddings: &[Vec<f32>],
        num: usize,
    ) -> Vec<(usize, &'a str, f32)> {
        let limit = num.min(scored.len());
        if limit == 0 {
            return Vec::new();
        }

        // Remember each entry's embedding position *before* sorting. Sorting
        // reorders the sentences but not `embeddings`, so positions in the
        // sorted list must never be used to index `embeddings`.
        let mut ranked: Vec<(usize, (usize, &'a str, f32))> =
            scored.iter().copied().enumerate().collect();

        // Sort by similarity score, highest first (NaN compares as equal)
        ranked.sort_by(|&(_, a), &(_, b)| {
            b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal)
        });

        // Keep the in-place ordering of `scored` that callers could observe.
        for (slot, &(_, entry)) in scored.iter_mut().zip(&ranked) {
            *slot = entry;
        }

        // Loop-invariant relevance weight.
        let lambda = 1.0 - self.config.diversity_threshold;

        let mut selected: Vec<(usize, &'a str, f32)> = Vec::with_capacity(limit);
        let mut selected_indices: HashSet<usize> = HashSet::with_capacity(limit);
        // Embedding positions of the sentences chosen so far.
        let mut selected_embeddings: Vec<usize> = Vec::with_capacity(limit);

        // Always select the best sentence (`ranked` is non-empty: limit > 0)
        let (first_pos, first) = ranked[0];
        selected.push(first);
        selected_indices.insert(first.0);
        selected_embeddings.push(first_pos);

        // Select remaining sentences with diversity constraint
        while selected.len() < limit {
            let mut best_candidate: Option<(usize, (usize, &'a str, f32))> = None;
            let mut best_mmr = f32::NEG_INFINITY;

            for &(emb_pos, candidate) in &ranked {
                if selected_indices.contains(&candidate.0) {
                    continue;
                }

                // Highest similarity between this candidate and anything
                // already selected, each looked up by its own embedding position.
                let max_sim_to_selected: f32 = selected_embeddings
                    .iter()
                    .map(|&sel_pos| {
                        ModernBertEmbedder::cosine_similarity(
                            &embeddings[emb_pos],
                            &embeddings[sel_pos],
                        )
                    })
                    .fold(f32::NEG_INFINITY, f32::max);

                // MMR = lambda * relevance - (1 - lambda) * max_sim
                let mmr = lambda * candidate.2 - (1.0 - lambda) * max_sim_to_selected;

                if mmr > best_mmr {
                    best_mmr = mmr;
                    best_candidate = Some((emb_pos, candidate));
                }
            }

            match best_candidate {
                Some((emb_pos, candidate)) => {
                    selected.push(candidate);
                    selected_indices.insert(candidate.0);
                    selected_embeddings.push(emb_pos);
                }
                // No candidate scored above negative infinity (e.g. NaN
                // similarities); stop rather than loop forever.
                None => break,
            }
        }

        selected
    }

    /// Average a set of embeddings component-wise and normalize the result.
    ///
    /// Returns an empty vector for empty input. The dimensionality is taken
    /// from the first embedding; a later embedding with more components than
    /// the first causes an out-of-bounds panic, and one with fewer contributes
    /// only to the leading components.
    ///
    /// The mean is passed through `ModernBertEmbedder::normalize` before being
    /// returned (presumably scaling to unit length; that helper is defined
    /// elsewhere).
    fn compute_centroid(embeddings: &[Vec<f32>]) -> Vec<f32> {
        let dim = match embeddings.first() {
            Some(first) => first.len(),
            None => return vec![],
        };

        // Running per-component totals.
        let mut sums = vec![0.0f32; dim];
        for vector in embeddings {
            vector
                .iter()
                .enumerate()
                .for_each(|(slot, component)| sums[slot] += component);
        }

        // Turn totals into means.
        let count = embeddings.len() as f32;
        sums.iter_mut().for_each(|total| *total /= count);

        ModernBertEmbedder::normalize(&sums)
    }

    /// Split text into sentences.
    ///
    /// A sentence ends at `.`, `!` or `?` unless:
    /// * the next character is alphanumeric, so decimals (`3.14`), dotted
    ///   names (`example.com`) and the inner periods of `e.g.` / `i.e.` do not
    ///   split;
    /// * the next character is another terminator, so `...` and `?!` stay in
    ///   one piece and end the sentence only at their last character;
    /// * the text accumulated so far ends with a known abbreviation
    ///   (see `is_abbreviation`).
    ///
    /// Each returned sentence is trimmed of surrounding whitespace and is never
    /// empty. Any trailing text without a terminator becomes the final sentence.
    fn split_sentences(&self, text: &str) -> Vec<String> {
        let mut sentences = Vec::new();
        let mut current = String::new();
        // Peekable so the character *after* a terminator can be inspected.
        let mut chars = text.chars().peekable();

        while let Some(c) = chars.next() {
            current.push(c);

            if !matches!(c, '.' | '!' | '?') {
                continue;
            }

            // A terminator glued to a letter/digit or to another terminator is
            // inside a token or a punctuation run, not at a sentence boundary.
            let continues = chars.peek().map_or(false, |&next| {
                next.is_alphanumeric() || matches!(next, '.' | '!' | '?')
            });
            if continues {
                continue;
            }

            let candidate = current.trim();
            if !candidate.is_empty() && !self.is_abbreviation(candidate) {
                sentences.push(candidate.to_string());
                current.clear();
            }
        }

        // Add remaining text
        let rest = current.trim();
        if !rest.is_empty() {
            sentences.push(rest.to_string());
        }

        sentences
    }

    /// Check if text ends with a common abbreviation.
    ///
    /// The match is case-insensitive and must cover a whole token: the
    /// abbreviation has to sit at the start of the text or directly after a
    /// non-alphanumeric character. Without that boundary check, ordinary words
    /// such as `"help."` (`p.`), `"finished."` (`ed.`) or `"best."` (`st.`)
    /// would be mistaken for abbreviations and their sentence ends suppressed.
    fn is_abbreviation(&self, text: &str) -> bool {
        const ABBREVIATIONS: &[&str] = &[
            "mr.", "mrs.", "ms.", "dr.", "prof.", "sr.", "jr.", "vs.", "etc.", "e.g.", "i.e.",
            "no.", "vol.", "pg.", "p.", "pp.", "fig.", "ed.", "eds.", "rev.", "st.", "inc.",
            "ltd.", "corp.", "co.", "approx.", "dept.", "est.", "ave.", "blvd.",
        ];

        // No abbreviation contains whitespace, so only the last
        // whitespace-delimited token can match; lowercase just that token
        // instead of the whole accumulated sentence.
        let last_token = text.rsplit(char::is_whitespace).next().unwrap_or(text);
        let lower = last_token.to_lowercase();

        ABBREVIATIONS.iter().any(|abbr| {
            lower.strip_suffix(abbr).map_or(false, |head| {
                // Whatever precedes the abbreviation inside the token must not
                // be a letter or digit (e.g. "(" in "(etc." is fine).
                head.chars()
                    .next_back()
                    .map_or(true, |prev| !prev.is_alphanumeric())
            })
        })
    }

    /// Produce a synopsis for a document.
    ///
    /// When `explicit` is `Some`, that text is wrapped unchanged and tagged as
    /// explicit; `content` is not examined. Otherwise an extractive summary of
    /// `content` is generated with the configured sentence count and tagged as
    /// generated.
    ///
    /// # Errors
    ///
    /// Propagates any error from `extractive` on the generated path.
    pub fn create_synopsis(&self, explicit: Option<&str>, content: &str) -> Result<Synopsis> {
        if let Some(provided) = explicit {
            return Ok(Synopsis::explicit(provided));
        }

        let summary = self.extractive(content, None)?;
        Ok(Synopsis::generated(summary))
    }

    /// Get configuration.
    ///
    /// Returns a shared borrow of the settings currently in use.
    pub fn config(&self) -> &SummarizerConfig {
        &self.config
    }

    /// Update configuration.
    ///
    /// Replaces the stored settings wholesale with `config`.
    pub fn set_config(&mut self, config: SummarizerConfig) {
        self.config = config;
    }

    /// Get the embedder.
    ///
    /// Returns a shared borrow of the embedder used to embed sentences.
    pub fn embedder(&self) -> &ModernBertEmbedder {
        &self.embedder
    }

    /// Get mutable embedder.
    ///
    /// Returns an exclusive borrow of the embedder, allowing callers to modify
    /// it in place.
    pub fn embedder_mut(&mut self) -> &mut ModernBertEmbedder {
        &mut self.embedder
    }
}

impl std::fmt::Debug for Summarizer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("Summarizer")
            .field("num_sentences", &self.config.num_sentences)
            .field("preserve_order", &self.config.preserve_order)
            .field("diversity_threshold", &self.config.diversity_threshold)
            .finish()
    }
}

/// Scored sentence for summarization.
///
/// An owned record of a sentence together with its position and score.
/// NOTE(review): nothing in the visible part of this file constructs this
/// type; confirm whether it is used elsewhere.
#[derive(Clone, Debug)]
pub struct ScoredSentence {
    /// The sentence text.
    pub text: String,
    /// Original index in document.
    pub index: usize,
    /// Similarity to document centroid.
    pub score: f32,
}

#[cfg(test)]
mod tests {
    use super::*;

    /// An explicitly supplied synopsis keeps its text and reports as explicit.
    #[test]
    fn test_synopsis_explicit() {
        let made = Synopsis::explicit("This is a test.");

        assert!(made.is_explicit());
        assert_eq!(made.text, "This is a test.");
    }

    /// A generated synopsis is tagged `Generated` and is not explicit.
    #[test]
    fn test_synopsis_generated() {
        let made = Synopsis::generated("Generated text.");

        assert!(!made.is_explicit());
        assert_eq!(made.source, SynopsisSource::Generated);
    }

    /// The centroid of the three unit axes has three equal components.
    #[test]
    fn test_compute_centroid() {
        // Rows of the 3x3 identity matrix.
        let axes: Vec<Vec<f32>> = (0..3)
            .map(|hot| (0..3).map(|i| if i == hot { 1.0 } else { 0.0 }).collect())
            .collect();

        let centroid = Summarizer::compute_centroid(&axes);

        // Mean is (1/3, 1/3, 1/3), normalized
        let expected_norm = (3.0f32 * (1.0 / 9.0)).sqrt();
        let expected = 1.0 / (3.0 * expected_norm);

        assert!(centroid.iter().all(|val| (val - expected).abs() < 1e-5));
    }

    /// Sanity check on sample abbreviation literals.
    #[test]
    fn test_abbreviations() {
        // We can't test is_abbreviation directly without an embedder,
        // so this only checks that the sample literals end with a period.
        let samples = ["Dr.", "Mr.", "etc.", "e.g."];
        assert!(samples.iter().all(|abbr| abbr.ends_with('.')));
    }
}