// vyctor 0.1.0 - Docs.rs

//! Regex-based code chunking fallback
//!
//! This module provides regex patterns to detect function/class boundaries
//! when tree-sitter parsing is unavailable or fails.

use crate::indexer::ast_chunker::{SemanticUnit, SemanticUnitType};
use crate::indexer::language::Language;
use regex::Regex;
use std::sync::LazyLock;

/// Regex patterns for detecting semantic boundaries in one language.
///
/// Every field is optional: a language leaves a slot as `None` when it has no
/// pattern for that kind of construct. Where a pattern is present, the start
/// of each match is used as a boundary offset and capture group 1 (when it
/// participates) is used as the symbol name.
struct LanguagePatterns {
    /// Pattern to match function/method definitions
    function_pattern: Option<Regex>,
    /// Pattern to match class definitions
    class_pattern: Option<Regex>,
    /// Pattern to match struct/enum definitions (also used for other
    /// type-like declarations such as TypeScript interfaces)
    struct_pattern: Option<Regex>,
    /// Pattern to match module/namespace definitions
    module_pattern: Option<Regex>,
}

// Regex pattern tables, one per language. Each table is built lazily on first
// access through `LazyLock` and then reused, so a regex is compiled at most once.

/// Boundary patterns for Rust sources.
///
/// All patterns are multi-line (`(?m)`), tolerate leading spaces/tabs and an
/// optional `pub` / `pub(...)` visibility prefix, and capture the item name in
/// group 1.
static RUST_PATTERNS: LazyLock<LanguagePatterns> = LazyLock::new(|| {
    LanguagePatterns {
    // `fn` items, optionally preceded by `async`, `unsafe` and `extern ["abi"]`
    // in that order.
    // NOTE(review): a `const` qualifier is not part of the pattern, so
    // `const fn` items are not matched — confirm whether that is intended.
    function_pattern: Some(
        Regex::new(r#"(?m)^[ \t]*(?:pub(?:\([^)]*\))?\s+)?(?:async\s+)?(?:unsafe\s+)?(?:extern\s+(?:"[^"]*"\s+)?)?fn\s+(\w+)"#)
            .unwrap(),
    ),
    class_pattern: None,
    // `struct` and `enum` declarations.
    struct_pattern: Some(
        Regex::new(r"(?m)^[ \t]*(?:pub(?:\([^)]*\))?\s+)?(?:struct|enum)\s+(\w+)").unwrap(),
    ),
    // `mod name` declarations (inline or file modules alike).
    module_pattern: Some(
        Regex::new(r"(?m)^[ \t]*(?:pub(?:\([^)]*\))?\s+)?mod\s+(\w+)").unwrap(),
    ),
}
});

/// Boundary patterns for Python sources.
///
/// Both patterns allow leading indentation, so nested functions and methods
/// inside a class body are matched as well as top-level definitions.
static PYTHON_PATTERNS: LazyLock<LanguagePatterns> = LazyLock::new(|| LanguagePatterns {
    // `def name` with an optional `async` prefix; group 1 is the function name.
    function_pattern: Some(Regex::new(r"(?m)^[ \t]*(?:async\s+)?def\s+(\w+)").unwrap()),
    // `class Name`; group 1 is the class name.
    class_pattern: Some(Regex::new(r"(?m)^[ \t]*class\s+(\w+)").unwrap()),
    struct_pattern: None,
    module_pattern: None,
});

/// Boundary patterns for JavaScript / JSX sources.
///
/// Both patterns are multi-line (`(?m)`), allow leading spaces/tabs and the
/// optional `export` / `default` prefixes, and capture the declared name in
/// group 1.
static JAVASCRIPT_PATTERNS: LazyLock<LanguagePatterns> = LazyLock::new(|| LanguagePatterns {
    // `function` declarations, optionally `async` and/or generator (`*`).
    // The name group is optional so anonymous `function (...)` forms at the
    // start of a line still produce a boundary.
    // `\b` after the keyword prevents identifiers that merely begin with
    // "function" (e.g. `functionName()` or `functions.forEach(...)`) from
    // being reported as function declarations.
    function_pattern: Some(
        Regex::new(
            r"(?m)^[ \t]*(?:export\s+)?(?:default\s+)?(?:async\s+)?function\b\s*\*?\s*(\w+)?",
        )
        .unwrap(),
    ),
    // `class Name` declarations.
    class_pattern: Some(
        Regex::new(r"(?m)^[ \t]*(?:export\s+)?(?:default\s+)?class\s+(\w+)").unwrap(),
    ),
    struct_pattern: None,
    module_pattern: None,
});

/// Boundary patterns for TypeScript / TSX sources.
///
/// All patterns are multi-line (`(?m)`), allow leading spaces/tabs and an
/// optional `export` prefix, and capture the declared name in group 1.
static TYPESCRIPT_PATTERNS: LazyLock<LanguagePatterns> = LazyLock::new(|| LanguagePatterns {
    // `function` declarations, optionally `default`, `async` and/or generator
    // (`*`). The name group is optional so anonymous forms still match.
    // `\b` after the keyword prevents identifiers that merely begin with
    // "function" (e.g. `functionName()`) from being reported as declarations.
    function_pattern: Some(
        Regex::new(
            r"(?m)^[ \t]*(?:export\s+)?(?:default\s+)?(?:async\s+)?function\b\s*\*?\s*(\w+)?",
        )
        .unwrap(),
    ),
    // `class Name`, optionally `default` and/or `abstract`.
    class_pattern: Some(
        Regex::new(r"(?m)^[ \t]*(?:export\s+)?(?:default\s+)?(?:abstract\s+)?class\s+(\w+)")
            .unwrap(),
    ),
    // `interface Name` and `type Name` declarations.
    struct_pattern: Some(
        Regex::new(r"(?m)^[ \t]*(?:export\s+)?(?:interface|type)\s+(\w+)").unwrap(),
    ),
    // `namespace Name` / `module Name`, optionally preceded by `declare`.
    module_pattern: Some(
        Regex::new(r"(?m)^[ \t]*(?:export\s+)?(?:declare\s+)?(?:namespace|module)\s+(\w+)")
            .unwrap(),
    ),
});

/// Boundary patterns for Go sources.
///
/// Unlike most other tables, these patterns do not allow leading whitespace:
/// only declarations that start in the first column are matched.
static GO_PATTERNS: LazyLock<LanguagePatterns> = LazyLock::new(|| LanguagePatterns {
    // `func Name` and methods `func (recv) Name`; group 1 is the function name,
    // the receiver (if any) is skipped.
    function_pattern: Some(Regex::new(r"(?m)^func\s+(?:\([^)]+\)\s+)?(\w+)").unwrap()),
    class_pattern: None,
    // `type Name struct` / `type Name interface`; group 1 is the type name.
    struct_pattern: Some(Regex::new(r"(?m)^type\s+(\w+)\s+(?:struct|interface)").unwrap()),
    module_pattern: None,
});

/// Boundary patterns for Java sources.
///
/// Both patterns are multi-line (`(?m)`), allow leading spaces/tabs and any
/// number of leading annotations (`@Name` or `@Name(...)`), and capture the
/// declared name in group 1.
static JAVA_PATTERNS: LazyLock<LanguagePatterns> = LazyLock::new(|| {
    LanguagePatterns {
    // Method-like lines: optional access/`static`/`final`/`synchronized`/`native`
    // modifiers, one or more whitespace-separated words (the return type, with
    // an optional single-level generic argument list), then the name and `(`.
    // NOTE(review): because the "type" part is just `\w+\s+`, statements such
    // as `return compute(` also satisfy this pattern — confirm whether such
    // false positives are acceptable for the fallback chunker.
    function_pattern: Some(
        Regex::new(
            r"(?m)^[ \t]*(?:@\w+(?:\([^)]*\))?\s+)*(?:public|private|protected)?\s*(?:static\s+)?(?:final\s+)?(?:synchronized\s+)?(?:native\s+)?(?:\w+(?:<[^>]+>)?\s+)+(\w+)\s*\(",
        )
        .unwrap(),
    ),
    // `class`, `interface` and `enum` declarations with optional `public`,
    // `abstract` and `final` modifiers.
    class_pattern: Some(
        Regex::new(
            r"(?m)^[ \t]*(?:@\w+(?:\([^)]*\))?\s+)*(?:public\s+)?(?:abstract\s+)?(?:final\s+)?(?:class|interface|enum)\s+(\w+)",
        )
        .unwrap(),
    ),
    struct_pattern: None,
    module_pattern: None,
}
});

/// Boundary patterns for C sources.
///
/// These patterns do not allow leading whitespace: only constructs that start
/// in the first column are matched.
static C_PATTERNS: LazyLock<LanguagePatterns> = LazyLock::new(|| LanguagePatterns {
    // Function definitions: optional `static` / `inline`, one or more type
    // words, an optional `*`, the name (group 1), a parameter list without
    // nested parentheses, and an opening `{`. Requiring the brace keeps
    // prototypes (which end in `;`) from matching.
    function_pattern: Some(
        Regex::new(r"(?m)^(?:static\s+)?(?:inline\s+)?(?:\w+\s+)+\*?\s*(\w+)\s*\([^)]*\)\s*\{")
            .unwrap(),
    ),
    class_pattern: None,
    // `struct` / `union` / `enum`, optionally behind `typedef`. The tag name
    // (group 1) is optional, so anonymous aggregates match without a name.
    struct_pattern: Some(
        Regex::new(r"(?m)^(?:typedef\s+)?(?:struct|union|enum)\s+(\w+)?").unwrap(),
    ),
    module_pattern: None,
});

/// Boundary patterns for C++ sources.
///
/// These patterns do not allow leading whitespace: only constructs that start
/// in the first column are matched.
static CPP_PATTERNS: LazyLock<LanguagePatterns> = LazyLock::new(|| {
    LanguagePatterns {
    // Function definitions: optional `template<...>` header and
    // `static`/`inline`/`virtual`/`explicit` specifiers, one or more type
    // words, an optional `*`, the name (group 1), a parameter list without
    // nested parentheses, optional `const`/`override`/`final`, and an opening `{`.
    function_pattern: Some(
        Regex::new(
            r"(?m)^(?:template\s*<[^>]*>\s*)?(?:static\s+)?(?:inline\s+)?(?:virtual\s+)?(?:explicit\s+)?(?:\w+\s+)+\*?\s*(\w+)\s*\([^)]*\)\s*(?:const\s*)?(?:override\s*)?(?:final\s*)?\{",
        )
        .unwrap(),
    ),
    // `class` / `struct` declarations, optionally templated. One extra word is
    // allowed between the keyword and the name (group 1), and a single
    // `: public|private|protected Base` clause may follow.
    class_pattern: Some(
        Regex::new(
            r"(?m)^(?:template\s*<[^>]*>\s*)?(?:class|struct)\s+(?:\w+\s+)?(\w+)(?:\s*:\s*(?:public|private|protected)\s+\w+)?",
        )
        .unwrap(),
    ),
    struct_pattern: None,
    // `namespace Name`; group 1 is the namespace name.
    module_pattern: Some(
        Regex::new(r"(?m)^namespace\s+(\w+)").unwrap(),
    ),
}
});

/// Regex-based chunker for languages without tree-sitter support
///
/// Finds semantic boundaries with the per-language pattern tables in this
/// module and can split oversized units into overlapping chunks.
pub struct RegexChunker {
    /// Maximum chunk size, compared against the byte length of a unit's content
    max_chunk_size: usize,
    /// Overlap between consecutive chunks, measured in bytes of the previous chunk
    overlap: usize,
}

impl RegexChunker {
    /// Create a new regex chunker
    pub fn new(max_chunk_size: usize, overlap: usize) -> Self {
        Self {
            max_chunk_size,
            overlap,
        }
    }

    /// Get the patterns for a language
    fn get_patterns(language: Language) -> Option<&'static LanguagePatterns> {
        match language {
            Language::Rust => Some(&*RUST_PATTERNS),
            Language::Python => Some(&*PYTHON_PATTERNS),
            Language::JavaScript | Language::Jsx => Some(&*JAVASCRIPT_PATTERNS),
            Language::TypeScript | Language::Tsx => Some(&*TYPESCRIPT_PATTERNS),
            Language::Go => Some(&*GO_PATTERNS),
            Language::Java => Some(&*JAVA_PATTERNS),
            Language::C => Some(&*C_PATTERNS),
            Language::Cpp => Some(&*CPP_PATTERNS),
            _ => None,
        }
    }

    /// Extract semantic boundaries using regex patterns
    pub fn extract_boundaries(
        &self,
        content: &str,
        language: Language,
    ) -> Result<Vec<SemanticUnit>, RegexChunkError> {
        let patterns = Self::get_patterns(language).ok_or(RegexChunkError::UnsupportedLanguage)?;

        let mut boundaries: Vec<(usize, String, SemanticUnitType)> = Vec::new();

        // Find all function matches
        if let Some(ref pattern) = patterns.function_pattern {
            for cap in pattern.captures_iter(content) {
                let full_match = cap.get(0).unwrap();
                let name = cap.get(1).map(|m| m.as_str().to_string());
                boundaries.push((
                    full_match.start(),
                    name.unwrap_or_else(|| "anonymous".to_string()),
                    SemanticUnitType::Function,
                ));
            }
        }

        // Find all class matches
        if let Some(ref pattern) = patterns.class_pattern {
            for cap in pattern.captures_iter(content) {
                let full_match = cap.get(0).unwrap();
                let name = cap.get(1).map(|m| m.as_str().to_string());
                boundaries.push((
                    full_match.start(),
                    name.unwrap_or_else(|| "anonymous".to_string()),
                    SemanticUnitType::Class,
                ));
            }
        }

        // Find all struct/interface matches
        if let Some(ref pattern) = patterns.struct_pattern {
            for cap in pattern.captures_iter(content) {
                let full_match = cap.get(0).unwrap();
                let name = cap.get(1).map(|m| m.as_str().to_string());
                boundaries.push((
                    full_match.start(),
                    name.unwrap_or_else(|| "anonymous".to_string()),
                    SemanticUnitType::Struct,
                ));
            }
        }

        // Find all module/namespace matches
        if let Some(ref pattern) = patterns.module_pattern {
            for cap in pattern.captures_iter(content) {
                let full_match = cap.get(0).unwrap();
                let name = cap.get(1).map(|m| m.as_str().to_string());
                boundaries.push((
                    full_match.start(),
                    name.unwrap_or_else(|| "anonymous".to_string()),
                    SemanticUnitType::Module,
                ));
            }
        }

        if boundaries.is_empty() {
            return Err(RegexChunkError::NoBoundariesFound);
        }

        // Sort by position
        boundaries.sort_by_key(|(pos, _, _)| *pos);

        // Convert boundaries to semantic units
        self.boundaries_to_units(content, boundaries)
    }

    /// Convert boundary positions to semantic units.
    ///
    /// `boundaries` holds `(byte offset, symbol name, unit type)` entries and
    /// must be sorted by ascending offset; each offset must be a valid slice
    /// index into `content` (slicing panics otherwise). Every boundary yields
    /// one unit spanning from its offset up to the next boundary (or the end
    /// of `content`), with trailing whitespace removed. Boundaries whose span
    /// is empty or whitespace-only are dropped.
    ///
    /// Any non-blank text before the first boundary is emitted first as a
    /// [`SemanticUnitType::Other`] unit without a symbol name.
    ///
    /// Line numbers are 1-based. This implementation always returns `Ok`.
    fn boundaries_to_units(
        &self,
        content: &str,
        boundaries: Vec<(usize, String, SemanticUnitType)>,
    ) -> Result<Vec<SemanticUnit>, RegexChunkError> {
        let mut units = Vec::with_capacity(boundaries.len() + 1);

        // Byte offset at which each line starts; entry `k` belongs to line `k + 1`.
        let line_starts: Vec<usize> = std::iter::once(0)
            .chain(content.match_indices('\n').map(|(i, _)| i + 1))
            .collect();

        // 1-based line containing `byte_pos`: the number of line starts at or
        // before it. `line_starts` is ascending, so a binary search suffices.
        let byte_to_line =
            |byte_pos: usize| line_starts.partition_point(|&start| start <= byte_pos);

        // Leading content before the first boundary (imports, comments, ...).
        if let Some(first_pos) = boundaries.first().map(|(pos, _, _)| *pos) {
            let leading = content[..first_pos].trim_end();
            if !leading.is_empty() {
                units.push(SemanticUnit {
                    content: leading.to_string(),
                    start_line: 1,
                    // Last line of the *trimmed* text, so the range neither
                    // covers trailing blank lines nor the first boundary's
                    // own line. Computed like the other units' end lines.
                    end_line: leading.lines().count(),
                    unit_type: SemanticUnitType::Other,
                    symbol_name: None,
                });
            }
        }

        // Each unit ends where the next boundary starts; the last one runs to
        // the end of the content.
        let end_positions: Vec<usize> = boundaries
            .iter()
            .skip(1)
            .map(|(pos, _, _)| *pos)
            .chain(std::iter::once(content.len()))
            .collect();

        for ((start_pos, name, unit_type), end_pos) in boundaries.into_iter().zip(end_positions) {
            // Trim trailing whitespace but keep the semantic content.
            let trimmed = content[start_pos..end_pos].trim_end();
            if trimmed.is_empty() {
                // Happens when two boundaries share the same offset.
                continue;
            }

            let start_line = byte_to_line(start_pos);
            let end_line = start_line + trimmed.lines().count().saturating_sub(1);

            units.push(SemanticUnit {
                content: trimmed.to_string(),
                start_line,
                end_line,
                unit_type,
                symbol_name: Some(name),
            });
        }

        Ok(units)
    }

    /// Break an oversized unit into line-aligned chunks that each carry context.
    ///
    /// A unit whose content is at most `max_chunk_size` bytes is returned
    /// unchanged as a single-element vector. Otherwise lines are accumulated
    /// until adding the next one would exceed the limit; the accumulated text
    /// is then emitted and the tail of it (see `find_overlap_start`) is
    /// carried over as the beginning of the next chunk. Every chunk after the
    /// first is prefixed with the unit's signature plus a newline, and that
    /// prefix counts towards the size limit.
    ///
    /// A single line longer than the limit is never split, so chunks can
    /// exceed `max_chunk_size`. All chunks inherit the unit's type and symbol
    /// name.
    pub fn split_large_unit(&self, unit: &SemanticUnit) -> Vec<SemanticUnit> {
        let budget = self.max_chunk_size;
        if unit.content.len() <= budget {
            return vec![unit.clone()];
        }

        let source_lines: Vec<&str> = unit.content.lines().collect();
        let Some(last_index) = source_lines.len().checked_sub(1) else {
            return vec![unit.clone()];
        };

        let signature = self.extract_signature(&unit.content);

        // Text of a chunk: the first one is emitted verbatim, later ones get
        // the signature prepended.
        let render = |body: &str, is_first: bool| -> String {
            if is_first {
                body.to_string()
            } else {
                format!("{}\n{}", signature, body)
            }
        };
        let piece = |content: String, start_line: usize, end_line: usize| SemanticUnit {
            content,
            start_line,
            end_line,
            unit_type: unit.unit_type,
            symbol_name: unit.symbol_name.clone(),
        };

        let mut pieces = Vec::new();
        let mut buffer = String::new();
        let mut piece_start = unit.start_line;
        let mut on_first_piece = true;

        for (idx, text) in source_lines.iter().enumerate() {
            // Re-attach the newline that `lines()` stripped, except after the
            // final line.
            let mut addition = String::with_capacity(text.len() + 1);
            addition.push_str(text);
            if idx < last_index {
                addition.push('\n');
            }

            // Later pieces also pay for the signature line and its newline.
            let overhead = if on_first_piece { 0 } else { signature.len() + 1 };
            let over_budget = overhead + buffer.len() + addition.len() > budget;

            if over_budget && !buffer.is_empty() {
                // The buffer holds lines up to `idx - 1`.
                pieces.push(piece(
                    render(&buffer, on_first_piece),
                    piece_start,
                    unit.start_line + idx - 1,
                ));

                // Keep only the overlapping tail for the next piece.
                let keep_from = self.find_overlap_start(&buffer);
                buffer = buffer.split_off(keep_from);
                piece_start = unit.start_line + idx - buffer.lines().count();
                on_first_piece = false;
            }

            buffer.push_str(&addition);
        }

        // Whatever is left becomes the final piece, unless it is blank.
        if !buffer.trim().is_empty() {
            pieces.push(piece(
                render(&buffer, on_first_piece),
                piece_start,
                unit.end_line,
            ));
        }

        pieces
    }

    /// Return the leading lines of `content` that act as its signature.
    ///
    /// Lines are taken from the top until one ends (ignoring surrounding
    /// whitespace) with `{` or `:`, or until three lines have been taken,
    /// whichever comes first. The selected lines are joined with `\n`; empty
    /// input yields an empty string.
    fn extract_signature(&self, content: &str) -> String {
        let mut header: Vec<&str> = Vec::with_capacity(3);

        for line in content.lines().take(3) {
            header.push(line);

            let tail = line.trim_end();
            if tail.ends_with('{') || tail.ends_with(':') {
                break;
            }
        }

        header.join("\n")
    }

    /// Compute the byte offset in `content` where the overlap tail begins.
    ///
    /// Returns `0` (keep everything) when `content` is no longer than the
    /// configured overlap. Otherwise the offset starts `overlap` bytes before
    /// the end, is moved back to the nearest char boundary, and is then
    /// snapped back to just after the preceding newline when one exists, so
    /// the overlap begins on a whole line.
    fn find_overlap_start(&self, content: &str) -> usize {
        let Some(mut cut) = content
            .len()
            .checked_sub(self.overlap)
            .filter(|&offset| offset > 0)
        else {
            return 0;
        };

        // Never slice through the middle of a multi-byte character.
        while cut > 0 && !content.is_char_boundary(cut) {
            cut -= 1;
        }

        match content[..cut].rfind('\n') {
            Some(newline) => newline + 1,
            None => cut,
        }
    }
}

/// Reasons regex-based chunking can fail.
#[derive(Debug, Clone)]
pub enum RegexChunkError {
    /// No regex pattern table is defined for the requested language
    UnsupportedLanguage,
    /// None of the language's patterns matched anywhere in the content
    NoBoundariesFound,
}

impl std::fmt::Display for RegexChunkError {
    /// Writes the fixed human-readable message for the variant.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            Self::UnsupportedLanguage => "Language not supported for regex chunking",
            Self::NoBoundariesFound => "No semantic boundaries found in content",
        };
        f.write_str(message)
    }
}

impl std::error::Error for RegexChunkError {}

// Unit tests for the regex fallback chunker. The detection tests feed a small
// source snippet per language through `extract_boundaries` and check how many
// units of each kind come back; thresholds use `>=` where the fixture could
// legitimately yield extra matches.
#[cfg(test)]
mod tests {
    use super::*;

    // Rust: plain, `async` and `pub(crate)` functions must all be detected.
    #[test]
    fn test_rust_function_detection() {
        let content = r#"
pub fn hello_world() {
    println!("Hello");
}

async fn async_func() {
    todo!()
}

pub(crate) fn restricted() {}
"#;

        let chunker = RegexChunker::new(1000, 100);
        let units = chunker.extract_boundaries(content, Language::Rust).unwrap();

        let functions: Vec<_> = units
            .iter()
            .filter(|u| u.unit_type == SemanticUnitType::Function)
            .collect();

        assert!(functions.len() >= 3);
    }

    // Python: top-level `def` / `async def` plus an indented method, and
    // exactly one class.
    #[test]
    fn test_python_detection() {
        let content = r#"
def hello():
    print("hello")

class MyClass:
    def method(self):
        pass

async def async_func():
    await something()
"#;

        let chunker = RegexChunker::new(1000, 100);
        let units = chunker
            .extract_boundaries(content, Language::Python)
            .unwrap();

        let functions: Vec<_> = units
            .iter()
            .filter(|u| u.unit_type == SemanticUnitType::Function)
            .collect();

        assert!(functions.len() >= 2);

        let classes: Vec<_> = units
            .iter()
            .filter(|u| u.unit_type == SemanticUnitType::Class)
            .collect();

        assert_eq!(classes.len(), 1);
    }

    // JavaScript: smoke test only — asserts that some units are produced.
    #[test]
    fn test_javascript_detection() {
        let content = r#"
function hello() {
    console.log("hello");
}

async function asyncFunc() {
    await fetch();
}

class MyClass {
    constructor() {}
}

export default function exportedFunc() {}
"#;

        let chunker = RegexChunker::new(1000, 100);
        let units = chunker
            .extract_boundaries(content, Language::JavaScript)
            .unwrap();

        assert!(!units.is_empty());
    }

    // Go: a free function and a method with a receiver both count as functions.
    #[test]
    fn test_go_detection() {
        let content = r#"
func main() {
    fmt.Println("hello")
}

func (s *Server) Handle() {
    // method
}

type Config struct {
    Name string
}
"#;

        let chunker = RegexChunker::new(1000, 100);
        let units = chunker.extract_boundaries(content, Language::Go).unwrap();

        let functions: Vec<_> = units
            .iter()
            .filter(|u| u.unit_type == SemanticUnitType::Function)
            .collect();

        assert!(functions.len() >= 2);
    }

    // A 52-line function with a 100-byte limit must be split into several chunks.
    #[test]
    fn test_split_large_unit() {
        let chunker = RegexChunker::new(100, 20);

        let content = (0..50)
            .map(|i| format!("    line{};", i))
            .collect::<Vec<_>>()
            .join("\n");

        let large_unit = SemanticUnit {
            content: format!("fn large() {{\n{}\n}}", content),
            start_line: 1,
            end_line: 52,
            unit_type: SemanticUnitType::Function,
            symbol_name: Some("large".to_string()),
        };

        let chunks = chunker.split_large_unit(&large_unit);
        assert!(chunks.len() > 1);
    }

    // Languages without a pattern table are rejected up front.
    #[test]
    fn test_unsupported_language() {
        let chunker = RegexChunker::new(1000, 100);
        let result = chunker.extract_boundaries("some content", Language::Unknown);
        assert!(matches!(result, Err(RegexChunkError::UnsupportedLanguage)));
    }
}