codex-memory 3.0.15

use crate::error::Result;
use regex::Regex;

/// A content chunk with semantic boundaries
///
/// Value type produced by the chunker: the chunk text plus the position
/// metadata reported by the strategy that created it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ContentChunk {
    /// Text of the chunk.
    pub content: String,
    /// Byte offset at which the chunk starts, as reported by the chunking strategy.
    pub start_byte: usize,
    /// Byte offset at which the chunk ends (exclusive), as reported by the chunking strategy.
    pub end_byte: usize,
    /// Zero-based position of this chunk in the sequence it was produced in.
    pub chunk_index: usize,
}

/// Chunking strategy for semantic boundary preservation
///
/// The default strategy is [`ChunkingStrategy::Hybrid`].
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub enum ChunkingStrategy {
    /// Split by sentences (preserves sentence boundaries)
    Sentence,
    /// Split by paragraphs (preserves paragraph boundaries)
    Paragraph,
    /// Semantic boundaries using NLP heuristics
    Semantic,
    /// Hybrid approach: size-based with semantic boundary adjustment
    #[default]
    Hybrid,
}

impl std::str::FromStr for ChunkingStrategy {
    type Err = ();

    /// Parses a strategy name, ignoring case.
    ///
    /// Parsing is deliberately lenient and never returns `Err`: any name that
    /// is not `sentence`, `paragraph` or `semantic` selects
    /// [`ChunkingStrategy::Hybrid`].
    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
        let strategy = match s.to_lowercase().as_str() {
            "sentence" => Self::Sentence,
            "paragraph" => Self::Paragraph,
            "semantic" => Self::Semantic,
            // "hybrid" and every unrecognized value both map to the hybrid strategy.
            _ => Self::Hybrid,
        };
        Ok(strategy)
    }
}

/// Advanced file chunker with semantic boundary preservation
///
/// Holds the sizing parameters and the strategy that `chunk_content`
/// dispatches on.
pub struct FileChunker {
    // Target chunk size; compared against byte lengths (`str::len`) by the strategies.
    chunk_size: usize,
    // Requested overlap between consecutive chunks, in bytes.
    overlap_size: usize,
    // Strategy selected at construction time.
    strategy: ChunkingStrategy,
}

impl FileChunker {
    /// Create a new file chunker with specified chunk and overlap sizes
    pub fn new(chunk_size: usize, overlap_size: usize) -> Self {
        Self {
            chunk_size,
            overlap_size,
            strategy: ChunkingStrategy::default(),
        }
    }

    /// Create a new file chunker with specified strategy
    pub fn with_strategy(chunk_size: usize, overlap_size: usize, strategy: ChunkingStrategy) -> Self {
        Self {
            chunk_size,
            overlap_size,
            strategy,
        }
    }

    /// Builds a chunker with the stock configuration: 8 KiB chunks, a
    /// 200-byte overlap and the default strategy.
    pub fn with_defaults() -> Self {
        const DEFAULT_CHUNK_BYTES: usize = 8192;
        const DEFAULT_OVERLAP_BYTES: usize = 200;

        Self::new(DEFAULT_CHUNK_BYTES, DEFAULT_OVERLAP_BYTES)
    }

    /// Chunk content into overlapping pieces using the configured strategy
    pub fn chunk_content(&self, content: &str) -> Result<Vec<ContentChunk>> {
        match self.strategy {
            ChunkingStrategy::Sentence => self.chunk_by_sentences(content),
            ChunkingStrategy::Paragraph => self.chunk_by_paragraphs(content),
            ChunkingStrategy::Semantic => self.chunk_semantic(content),
            ChunkingStrategy::Hybrid => self.chunk_hybrid(content),
        }
    }

    /// Chunk content by sentence boundaries
    ///
    /// A sentence ends at a run of `.`, `!` or `?` followed by whitespace.
    /// Each sentence is a trimmed sub-slice of `content` that keeps its
    /// terminating punctuation, so sentence boundaries remain visible after
    /// sentences are grouped into chunks. Text after the last terminator is
    /// treated as a final sentence.
    ///
    /// # Errors
    ///
    /// Returns an internal error if the sentence regex fails to compile.
    fn chunk_by_sentences(&self, content: &str) -> Result<Vec<ContentChunk>> {
        let sentence_regex = Regex::new(r"[.!?]+\s+").map_err(|e| {
            crate::error::Error::InternalError(format!("Failed to create sentence regex: {}", e))
        })?;

        let mut sentences: Vec<&str> = Vec::new();
        let mut cursor = 0;

        for terminator in sentence_regex.find_iter(content) {
            // Slice up to the end of the match so the punctuation stays with
            // its sentence; `trim` drops the whitespace part of the match.
            let sentence = content[cursor..terminator.end()].trim();
            if !sentence.is_empty() {
                sentences.push(sentence);
            }
            cursor = terminator.end();
        }

        // Whatever follows the last terminator (possibly the whole input).
        let tail = content[cursor..].trim();
        if !tail.is_empty() {
            sentences.push(tail);
        }

        self.group_sentences_into_chunks(&sentences, content)
    }

    /// Splits `content` on blank-line separators (`"\n\n"`), discards pieces
    /// that are empty or whitespace-only, and groups the remaining paragraphs
    /// into chunks.
    fn chunk_by_paragraphs(&self, content: &str) -> Result<Vec<ContentChunk>> {
        let mut paragraphs: Vec<&str> = Vec::new();

        for piece in content.split("\n\n") {
            if piece.trim().is_empty() {
                continue;
            }
            paragraphs.push(piece);
        }

        self.group_paragraphs_into_chunks(&paragraphs, content)
    }

    /// Semantic chunking: locate structural boundaries in `content`, then cut
    /// one chunk per span between consecutive boundaries.
    ///
    /// Boundary detection is delegated to `find_semantic_boundaries` and chunk
    /// construction to `create_chunks_from_boundaries`.
    ///
    /// # Errors
    ///
    /// Propagates any error from either step.
    fn chunk_semantic(&self, content: &str) -> Result<Vec<ContentChunk>> {
        self.find_semantic_boundaries(content)
            .and_then(|boundaries| self.create_chunks_from_boundaries(content, &boundaries))
    }

    /// Hybrid approach: size-based with semantic boundary adjustment
    ///
    /// Walks through `content` in steps of roughly `chunk_size` bytes. Each
    /// chunk end is moved to a nearby semantic boundary and the next chunk
    /// starts `overlap_size` bytes (adjusted to a word boundary) before the
    /// previous end.
    ///
    /// Guarantees:
    /// * every chunk is sliced on UTF-8 character boundaries, so
    ///   `content[start_byte..end_byte] == chunk.content`;
    /// * each chunk starts strictly after the previous chunk's start, so the
    ///   loop terminates for any `chunk_size` / `overlap_size`, including
    ///   `chunk_size == 0` and overlaps at least as large as the chunk.
    fn chunk_hybrid(&self, content: &str) -> Result<Vec<ContentChunk>> {
        let total_len = content.len();

        // Smallest char boundary >= `index`, clamped to the content length.
        let ceil_boundary = |index: usize| -> usize {
            let mut aligned = index.min(total_len);
            while !content.is_char_boundary(aligned) {
                aligned += 1;
            }
            aligned
        };

        let mut chunks = Vec::new();
        let mut start = 0;
        let mut chunk_index = 0;

        while start < total_len {
            // Advance by at least one byte even when `chunk_size` is 0.
            let target_end = ceil_boundary(start.saturating_add(self.chunk_size.max(1)));

            // Find the best semantic boundary near the target size, then
            // re-align it defensively: it must be a char boundary past `start`.
            let proposed_end =
                self.find_best_semantic_boundary(content, start, target_end, total_len);
            let mut end = ceil_boundary(proposed_end);
            if end <= start {
                end = target_end;
            }

            chunks.push(ContentChunk {
                content: content[start..end].to_string(),
                start_byte: start,
                end_byte: end,
                chunk_index,
            });

            if end >= total_len {
                break;
            }

            // Step back by the overlap, but never to (or before) the current
            // start: that would emit the same chunk forever. When the overlap
            // cannot be honored, continue right after this chunk.
            let overlap_start =
                ceil_boundary(self.calculate_semantic_overlap_start(content, end)).min(end);
            start = if overlap_start > start {
                overlap_start
            } else {
                end
            };
            chunk_index += 1;
        }

        Ok(chunks)
    }

    /// Group sentences into chunks respecting size limits
    fn group_sentences_into_chunks(&self, sentences: &[&str], _original: &str) -> Result<Vec<ContentChunk>> {
        let mut chunks = Vec::new();
        let mut current_chunk = String::new();
        let mut chunk_start = 0;
        let mut chunk_index = 0;

        for sentence in sentences {
            let potential_chunk = if current_chunk.is_empty() {
                sentence.to_string()
            } else {
                format!("{} {}", current_chunk, sentence)
            };

            if potential_chunk.len() <= self.chunk_size || current_chunk.is_empty() {
                current_chunk = potential_chunk;
            } else {
                // Finalize current chunk
                let chunk_end = chunk_start + current_chunk.len();
                chunks.push(ContentChunk {
                    content: current_chunk.trim().to_string(),
                    start_byte: chunk_start,
                    end_byte: chunk_end,
                    chunk_index,
                });

                // Start new chunk with overlap
                let overlap_content = self.calculate_sentence_overlap(&current_chunk);
                current_chunk = if overlap_content.is_empty() {
                    sentence.to_string()
                } else {
                    format!("{} {}", overlap_content, sentence)
                };
                
                chunk_start = chunk_end - overlap_content.len();
                chunk_index += 1;
            }
        }

        // Add the last chunk if not empty
        if !current_chunk.trim().is_empty() {
            let chunk_end = chunk_start + current_chunk.len();
            chunks.push(ContentChunk {
                content: current_chunk.trim().to_string(),
                start_byte: chunk_start,
                end_byte: chunk_end,
                chunk_index,
            });
        }

        Ok(chunks)
    }

    /// Group paragraphs into chunks respecting size limits
    ///
    /// Paragraphs are joined with `"\n\n"` until adding the next one would
    /// exceed `chunk_size`; a single over-long paragraph still forms its own
    /// chunk. Paragraph chunks do not overlap.
    ///
    /// `start_byte` / `end_byte` are offsets into `original`: the start of
    /// the first and the end of the last paragraph in the chunk. This relies
    /// on every paragraph being a sub-slice of `original`; for detached
    /// strings a running estimate (previous end plus separator) is used.
    fn group_paragraphs_into_chunks(
        &self,
        paragraphs: &[&str],
        original: &str,
    ) -> Result<Vec<ContentChunk>> {
        const SEPARATOR: &str = "\n\n";

        let base_addr = original.as_ptr() as usize;

        // Byte offset of `part` inside `original`, if it is a sub-slice of it.
        let offset_in_original = |part: &str| -> Option<usize> {
            (part.as_ptr() as usize)
                .checked_sub(base_addr)
                .filter(|offset| offset + part.len() <= original.len())
        };

        let mut chunks = Vec::new();
        let mut current_chunk = String::new();
        let mut chunk_start = 0;
        let mut chunk_end = 0;
        let mut chunk_index = 0;
        // Estimated position used only for paragraphs not located in `original`.
        let mut fallback_start = 0;

        for &paragraph in paragraphs {
            let paragraph_start = offset_in_original(paragraph).unwrap_or(fallback_start);
            let paragraph_end = paragraph_start + paragraph.len();
            fallback_start = paragraph_end + SEPARATOR.len();

            if current_chunk.is_empty() {
                chunk_start = paragraph_start;
                current_chunk.push_str(paragraph);
            } else if current_chunk.len() + SEPARATOR.len() + paragraph.len() <= self.chunk_size {
                // Append in place instead of re-allocating the whole chunk.
                current_chunk.push_str(SEPARATOR);
                current_chunk.push_str(paragraph);
            } else {
                // Finalize the current chunk; `chunk_end` still refers to the
                // last paragraph that was added to it.
                chunks.push(ContentChunk {
                    content: current_chunk.trim().to_string(),
                    start_byte: chunk_start,
                    end_byte: chunk_end,
                    chunk_index,
                });
                chunk_index += 1;

                // Start new chunk (no overlap for paragraph chunking)
                current_chunk.clear();
                current_chunk.push_str(paragraph);
                chunk_start = paragraph_start;
            }

            chunk_end = paragraph_end;
        }

        // Add the last chunk if not empty
        if !current_chunk.trim().is_empty() {
            chunks.push(ContentChunk {
                content: current_chunk.trim().to_string(),
                start_byte: chunk_start,
                end_byte: chunk_end,
                chunk_index,
            });
        }

        Ok(chunks)
    }

    /// Find semantic boundaries in the text
    fn find_semantic_boundaries(&self, content: &str) -> Result<Vec<usize>> {
        let mut boundaries = vec![0]; // Start with the beginning
        
        // Find code blocks
        let code_block_regex = Regex::new(r"```[\s\S]*?```").map_err(|e| {
            crate::error::Error::InternalError(format!("Failed to create code block regex: {}", e))
        })?;
        
        for mat in code_block_regex.find_iter(content) {
            boundaries.push(mat.start());
            boundaries.push(mat.end());
        }
        
        // Find headers
        let header_regex = Regex::new(r"(?m)^#{1,6}\s").map_err(|e| {
            crate::error::Error::InternalError(format!("Failed to create header regex: {}", e))
        })?;
        
        for mat in header_regex.find_iter(content) {
            boundaries.push(mat.start());
        }
        
        // Find paragraph breaks
        let paragraph_regex = Regex::new(r"\n\s*\n").map_err(|e| {
            crate::error::Error::InternalError(format!("Failed to create paragraph regex: {}", e))
        })?;
        
        for mat in paragraph_regex.find_iter(content) {
            boundaries.push(mat.end());
        }
        
        boundaries.push(content.len()); // End with the content length
        boundaries.sort_unstable();
        boundaries.dedup();
        
        Ok(boundaries)
    }

    /// Create chunks from semantic boundaries
    ///
    /// `boundaries` must be sorted offsets into `content` that fall on
    /// character boundaries. One chunk is produced per span between
    /// consecutive boundaries, with surrounding whitespace trimmed from the
    /// chunk text.
    ///
    /// Spans whose trimmed text is shorter than the minimum length are not
    /// discarded: they are merged into the following span (so a short heading
    /// stays with the text it introduces), and a short trailing remainder is
    /// merged into the last chunk. Whitespace-only spans are skipped.
    fn create_chunks_from_boundaries(
        &self,
        content: &str,
        boundaries: &[usize],
    ) -> Result<Vec<ContentChunk>> {
        // Trimmed spans shorter than this many bytes do not form their own chunk.
        const MIN_CHUNK_LEN: usize = 10;

        let mut chunks: Vec<ContentChunk> = Vec::new();
        // Start of a short span that is waiting to be merged forward.
        let mut carried_start: Option<usize> = None;

        for window in boundaries.windows(2) {
            let start = carried_start.unwrap_or(window[0]);
            let end = window[1];
            let text = content[start..end].trim();

            if text.len() >= MIN_CHUNK_LEN {
                let chunk_index = chunks.len();
                chunks.push(ContentChunk {
                    content: text.to_string(),
                    start_byte: start,
                    end_byte: end,
                    chunk_index,
                });
                carried_start = None;
            } else if text.is_empty() {
                // Nothing but whitespace: nothing to carry.
                carried_start = None;
            } else {
                carried_start = Some(start);
            }
        }

        // A short remainder at the very end has no following span to join.
        if let (Some(start), Some(&end)) = (carried_start, boundaries.last()) {
            match chunks.last_mut() {
                Some(last) => {
                    last.content = content[last.start_byte..end].trim().to_string();
                    last.end_byte = end;
                }
                None => chunks.push(ContentChunk {
                    content: content[start..end].trim().to_string(),
                    start_byte: start,
                    end_byte: end,
                    chunk_index: 0,
                }),
            }
        }

        Ok(chunks)
    }

    /// Find the best semantic boundary near the target position
    ///
    /// Searches a window of up to 200 bytes on either side of `target_end`
    /// (never before `start`, never past `content_len`) and returns the
    /// absolute offset just after the nearest match of the first pattern that
    /// matches at all, in order of preference:
    ///
    /// 1. paragraph break (blank line),
    /// 2. single newline,
    /// 3. sentence ending followed by whitespace,
    /// 4. any whitespace run.
    ///
    /// If `target_end` is at or beyond `content_len`, `content_len` is
    /// returned. If nothing matches, `target_end` is returned, moved forward
    /// to the next UTF-8 character boundary when it falls inside a
    /// character. The search window is aligned the same way, so multi-byte
    /// text never causes a slicing panic.
    fn find_best_semantic_boundary(
        &self,
        content: &str,
        start: usize,
        target_end: usize,
        content_len: usize,
    ) -> usize {
        // Half-width, in bytes, of the window searched around `target_end`.
        const SEARCH_RADIUS: usize = 200;
        // Boundary patterns, most preferred first.
        const BOUNDARY_PATTERNS: [&str; 4] = [r"\n\s*\n", r"\n", r"[.!?]\s+", r"\s+"];

        if target_end >= content_len {
            return content_len;
        }

        // Smallest char boundary >= `index`, clamped to the real content length.
        let ceil_boundary = |index: usize| -> usize {
            let mut aligned = index.min(content.len());
            while !content.is_char_boundary(aligned) {
                aligned += 1;
            }
            aligned
        };

        let fallback = ceil_boundary(target_end);
        let search_start = ceil_boundary(target_end.saturating_sub(SEARCH_RADIUS).max(start));
        let search_end = ceil_boundary(target_end.saturating_add(SEARCH_RADIUS).min(content_len));

        if search_start >= search_end {
            return fallback;
        }

        let search_text = &content[search_start..search_end];
        let relative_target = target_end.saturating_sub(search_start);

        BOUNDARY_PATTERNS
            .iter()
            .find_map(|pattern| self.find_nearest_match(search_text, pattern, relative_target))
            .map_or(fallback, |pos| search_start + pos)
    }

    /// Returns the end offset (relative to `text`) of the match of `pattern`
    /// whose end lies closest to `target`.
    ///
    /// When several matches are equally close, the earliest one wins. Returns
    /// `None` if `pattern` is not a valid regex or does not match `text`.
    /// The pattern is compiled on every call.
    fn find_nearest_match(&self, text: &str, pattern: &str, target: usize) -> Option<usize> {
        let matcher = Regex::new(pattern).ok()?;

        matcher
            .find_iter(text)
            .map(|found| found.end())
            // `min_by_key` keeps the first of several equally-near candidates.
            .min_by_key(|&match_end| match_end.abs_diff(target))
    }

    /// Calculate semantic overlap start position
    ///
    /// The ideal start of the next chunk is `overlap_size` bytes before
    /// `end`. A window beginning 50 bytes before that position, at most 100
    /// bytes long and never extending past `end`, is searched for whitespace;
    /// the offset just after the nearest whitespace run is returned so the
    /// next chunk starts on a word boundary. Without a match the ideal
    /// position itself is returned.
    ///
    /// The window and the fallback are aligned down to UTF-8 character
    /// boundaries, so the result is always safe to slice `content` with.
    fn calculate_semantic_overlap_start(&self, content: &str, end: usize) -> usize {
        // Bytes searched before the ideal overlap start.
        const LOOKBEHIND: usize = 50;
        // Maximum length of the search window in bytes.
        const WINDOW: usize = 100;

        // Largest char boundary <= `index`, clamped to the content length.
        let floor_boundary = |index: usize| -> usize {
            let mut aligned = index.min(content.len());
            while !content.is_char_boundary(aligned) {
                aligned -= 1;
            }
            aligned
        };

        let overlap_target = end.saturating_sub(self.overlap_size);
        let fallback = floor_boundary(overlap_target);

        // Find word boundary for overlap
        let raw_search_start = overlap_target.saturating_sub(LOOKBEHIND);
        let search_start = floor_boundary(raw_search_start);
        let search_end = floor_boundary(end.min(raw_search_start.saturating_add(WINDOW)));

        if search_start >= search_end {
            return fallback;
        }

        let search_text = &content[search_start..search_end];
        let relative_target = overlap_target.saturating_sub(search_start);

        self.find_nearest_match(search_text, r"\s+", relative_target)
            .map_or(fallback, |pos| search_start + pos)
    }

    /// Calculate overlap for sentence-based chunking
    ///
    /// Returns the trailing words of `chunk`, joined by single spaces, to be
    /// repeated at the start of the next chunk. The number of words is
    /// proportional to `overlap_size / chunk.len()` and capped at a quarter
    /// of the chunk's words. Returns an empty string when no words qualify,
    /// including for an empty or whitespace-only `chunk`.
    fn calculate_sentence_overlap(&self, chunk: &str) -> String {
        let words: Vec<&str> = chunk.split_whitespace().collect();

        // No words also covers the empty chunk, which would otherwise divide by zero.
        if words.is_empty() {
            return String::new();
        }

        let proportional = words.len().saturating_mul(self.overlap_size) / chunk.len();
        let overlap_words = proportional.min(words.len() / 4);

        // An empty tail slice joins to an empty string.
        words[words.len() - overlap_words..].join(" ")
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// 250 bytes with 100-byte chunks and a 20-byte overlap need at least
    /// three chunks, indexed from zero.
    #[test]
    fn test_basic_chunking() {
        let chunker = FileChunker::new(100, 20);
        let content = "a".repeat(250);
        let chunks = chunker.chunk_content(&content).unwrap();

        assert!(chunks.len() >= 3);
        assert_eq!(chunks[0].chunk_index, 0);
        assert_eq!(chunks[1].chunk_index, 1);
    }

    /// Chunks of text containing multi-byte characters must start and end on
    /// character boundaries and match the slice their offsets describe.
    #[test]
    fn test_utf8_boundary_safety() {
        let chunker = FileChunker::new(10, 2);
        let content = "Hello 世界 World";
        let chunks = chunker.chunk_content(content).unwrap();

        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(content.is_char_boundary(chunk.start_byte));
            assert!(content.is_char_boundary(chunk.end_byte));
            assert_eq!(chunk.content, &content[chunk.start_byte..chunk.end_byte]);
        }
    }

    /// Consecutive chunks must share some bytes when an overlap is configured.
    #[test]
    fn test_overlap() {
        let chunker = FileChunker::new(50, 10);
        let content = "a".repeat(100);
        let chunks = chunker.chunk_content(&content).unwrap();

        // Check that chunks have overlap
        if chunks.len() > 1 {
            let overlap_start = chunks[1].start_byte;
            let first_end = chunks[0].end_byte;
            assert!(overlap_start < first_end);
        }
    }
}