// vyctor 0.1.0 - Docs.rs

//! Text chunking for embedding
//!
//! This module provides multiple chunking strategies:
//! 1. AST-aware chunking using tree-sitter (when available)
//! 2. Regex-based chunking as fallback
//! 3. Character-based chunking as final fallback

#[cfg(feature = "semantic-chunking")]
use crate::indexer::ast_chunker::AstChunker;
use crate::indexer::language::{detect_language_from_str, Language};
use crate::indexer::regex_chunker::RegexChunker;

/// Classification of the semantic unit a chunk was cut from.
///
/// `Unknown` is the default variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum ChunkType {
    /// Function or method definition
    Function,
    /// Class definition
    Class,
    /// Method that lives inside a class
    Method,
    /// Struct definition
    Struct,
    /// Enum definition
    Enum,
    /// Interface or trait definition
    Interface,
    /// Module or namespace
    Module,
    /// `impl` block (Rust)
    Impl,
    /// Trait definition (Rust)
    Trait,
    /// Import statements
    Import,
    /// Constants or static values
    Constant,
    /// Variable declarations
    Variable,
    /// Type definitions or aliases
    Type,
    /// Several semantic units combined into one chunk
    Mixed,
    /// Unknown, or produced by character-based chunking
    #[default]
    Unknown,
}

impl ChunkType {
    /// Returns the lowercase name of the variant, e.g. `"function"`.
    #[allow(dead_code)]
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::Function => "function",
            Self::Class => "class",
            Self::Method => "method",
            Self::Struct => "struct",
            Self::Enum => "enum",
            Self::Interface => "interface",
            Self::Module => "module",
            Self::Impl => "impl",
            Self::Trait => "trait",
            Self::Import => "import",
            Self::Constant => "constant",
            Self::Variable => "variable",
            Self::Type => "type",
            Self::Mixed => "mixed",
            Self::Unknown => "unknown",
        }
    }
}

/// A piece of a file's text together with its line range and metadata
#[derive(Debug, Clone)]
pub struct Chunk {
    /// Text of the chunk
    pub content: String,
    /// Line number where the chunk starts (1-indexed)
    pub start_line: usize,
    /// Line number where the chunk ends (1-indexed)
    pub end_line: usize,
    /// Kind of semantic unit the chunk represents
    #[allow(dead_code)]
    pub chunk_type: ChunkType,
    /// Name of the chunk's primary symbol (function name, class name, etc.), if any
    #[allow(dead_code)]
    pub symbol_name: Option<String>,
    /// Detected language of the chunk, if any
    #[allow(dead_code)]
    pub language: Option<String>,
}

impl Chunk {
    /// Build a chunk from text and a line range only (kept for backward compatibility)
    ///
    /// The chunk type is `ChunkType::Unknown`; symbol name and language are `None`.
    #[allow(dead_code)]
    pub fn new(content: String, start_line: usize, end_line: usize) -> Self {
        Self::with_metadata(
            content,
            start_line,
            end_line,
            ChunkType::Unknown,
            None,
            None,
        )
    }

    /// Build a chunk with every field supplied by the caller
    pub fn with_metadata(
        content: String,
        start_line: usize,
        end_line: usize,
        chunk_type: ChunkType,
        symbol_name: Option<String>,
        language: Option<String>,
    ) -> Self {
        Chunk {
            content,
            start_line,
            end_line,
            chunk_type,
            symbol_name,
            language,
        }
    }
}

/// Text chunker that splits content into overlapping chunks
///
/// Sizes are compared against `String::len()`, i.e. they are measured in
/// bytes of UTF-8 text rather than in Unicode characters.
#[derive(Debug, Clone)]
pub struct Chunker {
    /// Target size for each chunk (in bytes)
    chunk_size: usize,
    /// Overlap between adjacent chunks (in bytes)
    overlap: usize,
    /// Maximum chunk size before forced splitting; handed to the AST and
    /// regex chunkers
    max_chunk_size: usize,
    /// Whether to use semantic (AST/regex) chunking
    semantic_chunking: bool,
}

/// Snap `index` back to the closest UTF-8 character boundary that is not past it
///
/// Indices at or beyond the end of `s` yield `s.len()`.
fn floor_char_boundary(s: &str, index: usize) -> usize {
    let len = s.len();
    if index >= len {
        return len;
    }
    // Offset 0 is always a boundary, so the backward scan cannot come up empty.
    (0..=index)
        .rev()
        .find(|&pos| s.is_char_boundary(pos))
        .unwrap_or(0)
}

/// Snap `index` forward to the closest UTF-8 character boundary that is not before it
///
/// Indices at or beyond the end of `s` yield `s.len()`.
#[allow(dead_code)]
fn ceil_char_boundary(s: &str, index: usize) -> usize {
    let len = s.len();
    if index >= len {
        return len;
    }
    // If no boundary lies in `index..len`, the end of the string is the answer.
    (index..len)
        .find(|&pos| s.is_char_boundary(pos))
        .unwrap_or(len)
}

impl Chunker {
    /// Create a new chunker with the specified parameters
    ///
    /// `max_chunk_size` defaults to three times `chunk_size` and semantic
    /// chunking is enabled. The multiplication saturates at `usize::MAX`
    /// instead of overflowing for very large targets.
    #[allow(dead_code)]
    pub fn new(chunk_size: usize, overlap: usize) -> Self {
        Self {
            chunk_size,
            overlap,
            // Default max is 3x target; saturate so a huge target cannot
            // panic (debug) or wrap to a tiny limit (release).
            max_chunk_size: chunk_size.saturating_mul(3),
            semantic_chunking: true,
        }
    }

    /// Create a chunker with all options
    ///
    /// Every argument is stored exactly as given; nothing is derived or
    /// validated here.
    ///
    /// * `chunk_size` - target size of each chunk
    /// * `overlap` - amount of trailing text carried into the next chunk
    /// * `max_chunk_size` - upper bound passed to the AST and regex chunkers
    /// * `semantic_chunking` - whether AST/regex chunking is attempted at all
    pub fn with_options(
        chunk_size: usize,
        overlap: usize,
        max_chunk_size: usize,
        semantic_chunking: bool,
    ) -> Self {
        Self {
            chunk_size,
            overlap,
            max_chunk_size,
            semantic_chunking,
        }
    }

    /// Set whether to use semantic chunking
    ///
    /// When disabled, `chunk_with_path` skips the AST and regex strategies
    /// and goes straight to character-based chunking.
    #[allow(dead_code)]
    pub fn set_semantic_chunking(&mut self, enabled: bool) {
        self.semantic_chunking = enabled;
    }

    /// Set the maximum chunk size
    ///
    /// The value is handed to the AST and regex chunkers when they are
    /// constructed; character-based chunking does not consult it.
    #[allow(dead_code)]
    pub fn set_max_chunk_size(&mut self, size: usize) {
        self.max_chunk_size = size;
    }

    /// Split content into chunks without a file path
    ///
    /// Equivalent to `chunk_with_path(content, None)`. Because no path is
    /// supplied, no language can be detected, so the content is always split
    /// with character-based chunking regardless of the semantic-chunking flag.
    #[allow(dead_code)]
    pub fn chunk(&self, content: &str) -> Vec<Chunk> {
        self.chunk_with_path(content, None)
    }

    /// Split content into chunks, using `file_path` to detect the language
    ///
    /// Strategies are tried in order: AST chunking (only when the
    /// `semantic-chunking` feature is compiled in and the language has a
    /// tree-sitter grammar), then regex chunking, then character-based
    /// chunking. Empty content yields no chunks.
    pub fn chunk_with_path(&self, content: &str, file_path: Option<&str>) -> Vec<Chunk> {
        if content.is_empty() {
            return Vec::new();
        }

        // The language is only looked up when semantic chunking is switched on
        // and there is a path to inspect; otherwise it is treated as unknown.
        let detected = match file_path {
            Some(path) if self.semantic_chunking => detect_language_from_str(path),
            _ => Language::Unknown,
        };

        if detected != Language::Unknown {
            // First choice: AST chunking
            #[cfg(feature = "semantic-chunking")]
            if detected.has_tree_sitter_grammar() {
                if let Ok(chunks) = self.chunk_with_ast(content, detected) {
                    return chunks;
                }
            }

            // Second choice: regex chunking
            if let Ok(chunks) = self.chunk_with_regex(content, detected) {
                return chunks;
            }
        }

        // Last resort: character-based chunking
        self.chunk_by_characters(content, file_path)
    }

    /// Chunk using AST parsing (tree-sitter)
    ///
    /// Returns `Err(())` when unit extraction fails or finds nothing, which
    /// lets the caller move on to the next strategy.
    #[cfg(feature = "semantic-chunking")]
    fn chunk_with_ast(&self, content: &str, language: Language) -> Result<Vec<Chunk>, ()> {
        let ast_chunker = AstChunker::new(self.max_chunk_size, self.overlap);

        let units = ast_chunker
            .extract_semantic_units(content, language)
            .map_err(|_| ())?;
        if units.is_empty() {
            return Err(());
        }

        let lang_str = language.as_str().to_string();
        let mut pieces: Vec<Chunk> = Vec::new();

        for unit in units {
            // The AST chunker breaks up large units; every resulting piece
            // becomes its own chunk tagged with the detected language.
            let converted = ast_chunker
                .split_large_unit(&unit)
                .into_iter()
                .map(|part| {
                    Chunk::with_metadata(
                        part.content,
                        part.start_line,
                        part.end_line,
                        convert_semantic_unit_type(part.unit_type),
                        part.symbol_name,
                        Some(lang_str.clone()),
                    )
                });
            pieces.extend(converted);
        }

        // Fold small neighbouring chunks together to reduce fragmentation
        Ok(self.merge_small_chunks(pieces))
    }

    /// Chunk using regex patterns
    ///
    /// Returns `Err(())` when boundary extraction fails or finds nothing,
    /// which lets the caller fall back to character-based chunking.
    fn chunk_with_regex(&self, content: &str, language: Language) -> Result<Vec<Chunk>, ()> {
        let regex_chunker = RegexChunker::new(self.max_chunk_size, self.overlap);

        let units = regex_chunker
            .extract_boundaries(content, language)
            .map_err(|_| ())?;
        if units.is_empty() {
            return Err(());
        }

        let lang_str = language.as_str().to_string();
        let mut pieces: Vec<Chunk> = Vec::new();

        for unit in units {
            // The regex chunker breaks up large units; every resulting piece
            // becomes its own chunk tagged with the detected language.
            let converted = regex_chunker
                .split_large_unit(&unit)
                .into_iter()
                .map(|part| {
                    Chunk::with_metadata(
                        part.content,
                        part.start_line,
                        part.end_line,
                        convert_semantic_unit_type(part.unit_type),
                        part.symbol_name,
                        Some(lang_str.clone()),
                    )
                });
            pieces.extend(converted);
        }

        Ok(pieces)
    }

    /// Character-based chunking (original implementation)
    ///
    /// Accumulates whole lines until appending the next one would push the
    /// buffer past `chunk_size` bytes, emits the buffer as a chunk, and seeds
    /// the next buffer with the tail selected by `find_overlap_start`. A
    /// single line longer than `chunk_size` is kept intact rather than split.
    ///
    /// Buffers that contain only whitespace are never emitted, so no chunk
    /// has empty content. Every chunk is tagged `ChunkType::Unknown`, and its
    /// `language` is derived from `file_path` when one is given.
    fn chunk_by_characters(&self, content: &str, file_path: Option<&str>) -> Vec<Chunk> {
        let total_lines = content.lines().count();
        if total_lines == 0 {
            return Vec::new();
        }

        let language = file_path.map(|p| detect_language_from_str(p).as_str().to_string());

        let mut chunks = Vec::new();
        let mut current_chunk = String::new();
        let mut chunk_start_line = 1;

        for (index, line) in content.lines().enumerate() {
            // Line numbers are 1-indexed.
            let current_line = index + 1;

            // The `+ 1` accounts for the newline re-appended after the line.
            if !current_chunk.is_empty()
                && current_chunk.len() + line.len() + 1 > self.chunk_size
            {
                // Apply the same whitespace guard as the final chunk below so
                // that runs of blank lines do not produce empty chunks.
                if !current_chunk.trim().is_empty() {
                    chunks.push(Chunk::with_metadata(
                        current_chunk.trim_end().to_string(),
                        chunk_start_line,
                        current_line - 1,
                        ChunkType::Unknown,
                        None,
                        language.clone(),
                    ));
                }

                // Start the next chunk with the overlapping tail
                let overlap_start = self.find_overlap_start(&current_chunk);
                current_chunk = current_chunk[overlap_start..].to_string();

                // The tail ends just before `current_line`, so count back
                // from there to find the line it starts on.
                let overlap_lines = current_chunk.lines().count();
                chunk_start_line = current_line.saturating_sub(overlap_lines);
            }

            current_chunk.push_str(line);
            current_chunk.push('\n');
        }

        // Don't forget the last chunk
        if !current_chunk.trim().is_empty() {
            chunks.push(Chunk::with_metadata(
                current_chunk.trim_end().to_string(),
                chunk_start_line,
                total_lines,
                ChunkType::Unknown,
                None,
                language,
            ));
        }

        chunks
    }

    /// Merge adjacent chunks to reduce fragmentation
    ///
    /// A chunk is folded into its predecessor when their combined byte length
    /// is below `chunk_size` and either both are shorter than
    /// `chunk_size / 4` or both are `ChunkType::Unknown`. The merged chunk
    /// keeps the predecessor's `start_line`, `symbol_name` and `language`,
    /// takes the successor's `end_line`, joins the two texts with a newline,
    /// and is re-tagged `ChunkType::Mixed`.
    // Only the AST path calls this, so it is unused when that feature is off.
    #[cfg_attr(not(feature = "semantic-chunking"), allow(dead_code))]
    fn merge_small_chunks(&self, chunks: Vec<Chunk>) -> Vec<Chunk> {
        if chunks.len() <= 1 {
            return chunks;
        }

        let min_chunk_size = self.chunk_size / 4; // Minimum size before considering merge
        let mut result: Vec<Chunk> = Vec::with_capacity(chunks.len());

        for chunk in chunks {
            // The last element of `result` is the merge candidate.
            if let Some(prev) = result.last_mut() {
                let both_small =
                    prev.content.len() < min_chunk_size && chunk.content.len() < min_chunk_size;
                let both_unknown =
                    prev.chunk_type == ChunkType::Unknown && chunk.chunk_type == ChunkType::Unknown;
                let combined_fits = prev.content.len() + chunk.content.len() < self.chunk_size;

                if (both_small || both_unknown) && combined_fits {
                    // Append in place instead of building a new string
                    prev.content.push('\n');
                    prev.content.push_str(&chunk.content);
                    prev.end_line = chunk.end_line;
                    prev.chunk_type = ChunkType::Mixed;
                    continue;
                }
            }
            result.push(chunk);
        }

        result
    }

    /// Pick the byte offset in `content` where the overlap for the next chunk begins
    ///
    /// Returns 0 (keep everything) when `content` is no longer than the
    /// configured overlap. Otherwise the offset aims for `overlap` bytes from
    /// the end, snapped back to a character boundary, and moves to just after
    /// the last newline found within the 100 bytes before that point, if any.
    fn find_overlap_start(&self, content: &str) -> usize {
        let len = content.len();
        if len <= self.overlap {
            return 0;
        }

        // Desired cut point, moved back so it never lands inside a character
        let cut = floor_char_boundary(content, len - self.overlap);

        // Look back a little so the overlap can begin on a fresh line
        let window_start = floor_char_boundary(content, cut.saturating_sub(100));
        match content[window_start..cut].rfind('\n') {
            Some(newline) => window_start + newline + 1,
            None => cut,
        }
    }

    /// Chunk content with code-aware splitting (legacy API)
    /// Deprecated: Use chunk_with_path instead
    ///
    /// The language name is translated into a placeholder file name so that
    /// `chunk_with_path` can detect the language from it. Names that are not
    /// listed map to `file.txt`; `None` is forwarded as "no path".
    #[allow(dead_code)]
    pub fn chunk_code(&self, content: &str, language: Option<&str>) -> Vec<Chunk> {
        let path_hint = match language {
            Some("rust") => Some("file.rs"),
            Some("python") => Some("file.py"),
            Some("javascript") => Some("file.js"),
            Some("typescript") => Some("file.ts"),
            Some("go") => Some("file.go"),
            Some("java") => Some("file.java"),
            Some("c") => Some("file.c"),
            Some("cpp") => Some("file.cpp"),
            Some(_) => Some("file.txt"),
            None => None,
        };
        self.chunk_with_path(content, path_hint)
    }
}

/// Map `ast_chunker`'s `SemanticUnitType` onto this module's `ChunkType`
///
/// Variants map one-to-one by name, except `Other`, which has no namesake
/// and becomes `ChunkType::Unknown`.
fn convert_semantic_unit_type(
    unit_type: crate::indexer::ast_chunker::SemanticUnitType,
) -> ChunkType {
    use crate::indexer::ast_chunker::SemanticUnitType as Unit;

    match unit_type {
        Unit::Function => ChunkType::Function,
        Unit::Class => ChunkType::Class,
        Unit::Method => ChunkType::Method,
        Unit::Struct => ChunkType::Struct,
        Unit::Enum => ChunkType::Enum,
        Unit::Interface => ChunkType::Interface,
        Unit::Module => ChunkType::Module,
        Unit::Impl => ChunkType::Impl,
        Unit::Trait => ChunkType::Trait,
        Unit::Import => ChunkType::Import,
        Unit::Constant => ChunkType::Constant,
        Unit::Variable => ChunkType::Variable,
        Unit::Type => ChunkType::Type,
        Unit::Other => ChunkType::Unknown,
    }
}

/// Unit tests for the chunker.
///
/// Tests that call `Chunker::chunk` pass no file path and therefore exercise
/// character-based chunking only.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_basic_chunking() {
        let chunker = Chunker::new(100, 20);
        let content = "line1\nline2\nline3\nline4\nline5\n";
        let chunks = chunker.chunk(content);

        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].start_line, 1);
    }

    #[test]
    fn test_empty_content() {
        let chunker = Chunker::new(100, 20);
        let chunks = chunker.chunk("");
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_small_content() {
        let chunker = Chunker::new(1000, 200);
        let content = "small file";
        let chunks = chunker.chunk(content);

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, "small file");
    }

    #[test]
    fn test_overlap() {
        let chunker = Chunker::new(50, 10);
        let content = (0..20)
            .map(|i| format!("line{}", i))
            .collect::<Vec<_>>()
            .join("\n");
        let chunks = chunker.chunk(&content);

        // 20 lines of 6-7 bytes cannot fit in one 50-byte chunk, so the
        // overlap assertion below is never skipped.
        assert!(chunks.len() >= 2, "content should span several chunks");

        // With overlap, each chunk must start on or before the line where
        // the previous chunk ended.
        for pair in chunks.windows(2) {
            assert!(pair[1].start_line <= pair[0].end_line);
        }
    }

    #[test]
    fn test_whitespace_only_content() {
        let chunker = Chunker::new(100, 20);
        let chunks = chunker.chunk("   \n\n  \n");
        // Whitespace-only content should produce no chunks at all
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_single_line() {
        let chunker = Chunker::new(100, 20);
        let content = "single line content";
        let chunks = chunker.chunk(content);

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].start_line, 1);
        assert_eq!(chunks[0].end_line, 1);
    }

    #[test]
    fn test_exact_chunk_size() {
        let chunker = Chunker::new(20, 5);
        let content = "12345678901234567890"; // Exactly 20 chars
        let chunks = chunker.chunk(content);

        assert_eq!(chunks.len(), 1);
    }

    #[test]
    fn test_line_numbers_are_correct() {
        let chunker = Chunker::new(1000, 100);
        let content = "line 1\nline 2\nline 3\nline 4\nline 5";
        let chunks = chunker.chunk(content);

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].start_line, 1);
        assert_eq!(chunks[0].end_line, 5);
    }

    #[test]
    fn test_large_file_produces_multiple_chunks() {
        let chunker = Chunker::new(100, 20);
        let content = (0..100)
            .map(|i| format!("line number {}", i))
            .collect::<Vec<_>>()
            .join("\n");
        let chunks = chunker.chunk(&content);

        assert!(
            chunks.len() > 1,
            "Large content should produce multiple chunks"
        );

        // Verify all content is covered: the last chunk ends on the last line
        let total_lines = content.lines().count();
        assert_eq!(chunks.last().unwrap().end_line, total_lines);
    }

    #[test]
    fn test_chunk_content_preserved() {
        let chunker = Chunker::new(1000, 100);
        let content = "fn main() {\n    println!(\"Hello\");\n}";
        let chunks = chunker.chunk(content);

        assert_eq!(chunks.len(), 1);
        assert!(chunks[0].content.contains("fn main()"));
        assert!(chunks[0].content.contains("println!"));
    }

    #[test]
    fn test_unicode_content() {
        let chunker = Chunker::new(100, 20);
        let content = "日本語テスト\n中文测试\n한국어 테스트\nemoji: 🎉🚀";
        let chunks = chunker.chunk(content);

        assert!(!chunks.is_empty());
        assert!(chunks[0].content.contains("日本語"));
    }

    #[test]
    fn test_very_long_line() {
        let chunker = Chunker::new(50, 10);
        let long_line = "a".repeat(200);
        let chunks = chunker.chunk(&long_line);

        // Should still produce at least one chunk even with very long line
        assert!(!chunks.is_empty());
    }

    #[test]
    fn test_chunks_dont_lose_content() {
        let chunker = Chunker::new(100, 20);
        let original_lines: Vec<_> = (0..50).map(|i| format!("line{}", i)).collect();
        let content = original_lines.join("\n");
        let chunks = chunker.chunk(&content);

        // Every original line must appear in at least one chunk. Whole lines
        // are compared because a substring search would let "line1" be
        // satisfied by "line10".
        for expected in &original_lines {
            assert!(
                chunks
                    .iter()
                    .any(|c| c.content.lines().any(|l| l == expected.as_str())),
                "missing line: {}",
                expected
            );
        }
    }

    #[test]
    fn test_chunk_with_path_rust() {
        let chunker = Chunker::new(1000, 100);
        let content = r#"
fn hello() {
    println!("hello");
}

fn world() {
    println!("world");
}
"#;
        let chunks = chunker.chunk_with_path(content, Some("main.rs"));

        // Should detect language
        assert!(chunks.iter().all(|c| c.language.as_deref() == Some("rust")));
    }

    #[test]
    fn test_chunk_with_path_python() {
        let chunker = Chunker::new(1000, 100);
        let content = r#"
def hello():
    print("hello")

class MyClass:
    pass
"#;
        let chunks = chunker.chunk_with_path(content, Some("script.py"));

        // Should detect language
        assert!(chunks
            .iter()
            .all(|c| c.language.as_deref() == Some("python")));
    }

    #[test]
    fn test_chunk_metadata() {
        let chunk = Chunk::with_metadata(
            "fn test() {}".to_string(),
            1,
            1,
            ChunkType::Function,
            Some("test".to_string()),
            Some("rust".to_string()),
        );

        assert_eq!(chunk.chunk_type, ChunkType::Function);
        assert_eq!(chunk.symbol_name.as_deref(), Some("test"));
        assert_eq!(chunk.language.as_deref(), Some("rust"));
    }

    #[test]
    fn test_no_overlap_at_content_start() {
        let chunker = Chunker::new(100, 20);
        let content = (0..50)
            .map(|i| format!("line{}", i))
            .collect::<Vec<_>>()
            .join("\n");
        let chunks = chunker.chunk(&content);

        // First chunk should always start at line 1
        assert_eq!(chunks[0].start_line, 1);
    }

    #[test]
    fn test_semantic_chunking_disabled() {
        let mut chunker = Chunker::new(100, 20);
        chunker.set_semantic_chunking(false);

        let content = "fn test() { }\nfn other() { }";
        let chunks = chunker.chunk_with_path(content, Some("test.rs"));

        // Should use character-based chunking, all chunks should be Unknown type
        for chunk in &chunks {
            assert_eq!(chunk.chunk_type, ChunkType::Unknown);
        }
    }

    #[test]
    fn test_chunk_type_as_str() {
        assert_eq!(ChunkType::Function.as_str(), "function");
        assert_eq!(ChunkType::Class.as_str(), "class");
        assert_eq!(ChunkType::Unknown.as_str(), "unknown");
    }
}