use crate::indexer::ast_chunker::{SemanticUnit, SemanticUnitType};
use crate::indexer::language::Language;
use regex::Regex;
use std::sync::LazyLock;
/// Regular expressions used to locate where semantic units start in one language.
///
/// A field is `None` when the language defines no pattern for that kind of unit.
/// `RegexChunker::extract_boundaries` uses the start offset of each match as a unit
/// boundary and capture group 1, when it participates in the match, as the symbol name.
struct LanguagePatterns {
/// Pattern for function (or method) definitions; matches are reported as
/// `SemanticUnitType::Function`.
function_pattern: Option<Regex>,
/// Pattern for class-like declarations; matches are reported as `SemanticUnitType::Class`.
class_pattern: Option<Regex>,
/// Pattern for struct-like type declarations (for example Rust `struct`/`enum` or
/// TypeScript `interface`/`type`); matches are reported as `SemanticUnitType::Struct`.
struct_pattern: Option<Regex>,
/// Pattern for module or namespace declarations; matches are reported as
/// `SemanticUnitType::Module`.
module_pattern: Option<Regex>,
}
/// Boundary patterns for Rust source, compiled once on first use.
///
/// All three patterns allow leading spaces/tabs and an optional visibility
/// (`pub`, `pub(crate)`, `pub(in path)`, ...) before the item keyword, and capture
/// the item name in group 1.
static RUST_PATTERNS: LazyLock<LanguagePatterns> = LazyLock::new(|| {
    // Qualifiers are accepted in the order the language requires them:
    // `const`, `async`, `unsafe`, `extern "ABI"`. `const` was previously missing, so
    // `const fn` / `pub const fn` items were never reported as boundaries.
    let function = Regex::new(
        r#"(?m)^[ \t]*(?:pub(?:\([^)]*\))?\s+)?(?:const\s+)?(?:async\s+)?(?:unsafe\s+)?(?:extern\s+(?:"[^"]*"\s+)?)?fn\s+(\w+)"#,
    )
    .unwrap();
    // `struct` and `enum` declarations are both reported through `struct_pattern`.
    let type_decl =
        Regex::new(r"(?m)^[ \t]*(?:pub(?:\([^)]*\))?\s+)?(?:struct|enum)\s+(\w+)").unwrap();
    let module = Regex::new(r"(?m)^[ \t]*(?:pub(?:\([^)]*\))?\s+)?mod\s+(\w+)").unwrap();

    LanguagePatterns {
        function_pattern: Some(function),
        // Rust has no classes.
        class_pattern: None,
        struct_pattern: Some(type_decl),
        module_pattern: Some(module),
    }
});
/// Boundary patterns for Python source, compiled once on first use.
///
/// Both patterns tolerate leading spaces/tabs, so indented (nested) `def` and
/// `class` statements match as well. Group 1 captures the name.
static PYTHON_PATTERNS: LazyLock<LanguagePatterns> = LazyLock::new(|| {
    // `def name` with an optional `async` prefix.
    let function = Regex::new(r"(?m)^[ \t]*(?:async\s+)?def\s+(\w+)").unwrap();
    let class = Regex::new(r"(?m)^[ \t]*class\s+(\w+)").unwrap();

    LanguagePatterns {
        function_pattern: Some(function),
        class_pattern: Some(class),
        struct_pattern: None,
        module_pattern: None,
    }
});
/// Boundary patterns for JavaScript (also used for JSX), compiled once on first use.
///
/// Both patterns allow leading spaces/tabs and optional `export` / `default`
/// prefixes. Group 1 captures the name; for the function pattern the name is
/// optional, so anonymous `function (...)` declarations match without a name.
static JAVASCRIPT_PATTERNS: LazyLock<LanguagePatterns> = LazyLock::new(|| {
    // `\b` after `function` requires the keyword to end there. Without it, a line
    // starting with an identifier such as `functionCall()` or `functions.push(x)`
    // matched as a function named `Call` / `s`. `function*gen` and `function(`
    // still match because `*` and `(` are not word characters.
    let function = Regex::new(
        r"(?m)^[ \t]*(?:export\s+)?(?:default\s+)?(?:async\s+)?function\b\s*\*?\s*(\w+)?",
    )
    .unwrap();
    let class =
        Regex::new(r"(?m)^[ \t]*(?:export\s+)?(?:default\s+)?class\s+(\w+)").unwrap();

    LanguagePatterns {
        function_pattern: Some(function),
        class_pattern: Some(class),
        struct_pattern: None,
        module_pattern: None,
    }
});
/// Boundary patterns for TypeScript (also used for TSX), compiled once on first use.
///
/// All patterns allow leading spaces/tabs and an optional `export` prefix. Group 1
/// captures the name; for the function pattern the name is optional, so anonymous
/// `function (...)` declarations match without a name.
static TYPESCRIPT_PATTERNS: LazyLock<LanguagePatterns> = LazyLock::new(|| {
    // `\b` after `function` requires the keyword to end there. Without it, a line
    // starting with an identifier such as `functionCall()` or `functions.push(x)`
    // matched as a function named `Call` / `s`. `function*gen` and `function(`
    // still match because `*` and `(` are not word characters.
    let function = Regex::new(
        r"(?m)^[ \t]*(?:export\s+)?(?:default\s+)?(?:async\s+)?function\b\s*\*?\s*(\w+)?",
    )
    .unwrap();
    // Classes may additionally be marked `abstract`.
    let class = Regex::new(
        r"(?m)^[ \t]*(?:export\s+)?(?:default\s+)?(?:abstract\s+)?class\s+(\w+)",
    )
    .unwrap();
    // `interface` and `type` declarations are reported through `struct_pattern`.
    let type_decl =
        Regex::new(r"(?m)^[ \t]*(?:export\s+)?(?:interface|type)\s+(\w+)").unwrap();
    // `namespace` / `module` blocks, optionally prefixed with `declare`.
    let module = Regex::new(
        r"(?m)^[ \t]*(?:export\s+)?(?:declare\s+)?(?:namespace|module)\s+(\w+)",
    )
    .unwrap();

    LanguagePatterns {
        function_pattern: Some(function),
        class_pattern: Some(class),
        struct_pattern: Some(type_decl),
        module_pattern: Some(module),
    }
});
/// Boundary patterns for Go source, compiled once on first use.
///
/// Both patterns are anchored at the very start of a line (no leading whitespace
/// is accepted). Group 1 captures the name.
static GO_PATTERNS: LazyLock<LanguagePatterns> = LazyLock::new(|| {
    // `func Name` or, with a receiver, `func (r *T) Name`.
    let function = Regex::new(r"(?m)^func\s+(?:\([^)]+\)\s+)?(\w+)").unwrap();
    // `type Name struct` / `type Name interface` are reported through `struct_pattern`.
    let type_decl = Regex::new(r"(?m)^type\s+(\w+)\s+(?:struct|interface)").unwrap();

    LanguagePatterns {
        function_pattern: Some(function),
        class_pattern: None,
        struct_pattern: Some(type_decl),
        module_pattern: None,
    }
});
/// Boundary patterns for Java source, compiled once on first use.
///
/// Both patterns allow leading spaces/tabs and any number of leading annotations
/// (`@Name` or `@Name(...)`). Group 1 captures the name.
static JAVA_PATTERNS: LazyLock<LanguagePatterns> = LazyLock::new(|| {
    // Optional modifiers, then one or more whitespace-separated words (each optionally
    // followed by `<...>`), then the captured name directly before `(`.
    let method = Regex::new(
        r"(?m)^[ \t]*(?:@\w+(?:\([^)]*\))?\s+)*(?:public|private|protected)?\s*(?:static\s+)?(?:final\s+)?(?:synchronized\s+)?(?:native\s+)?(?:\w+(?:<[^>]+>)?\s+)+(\w+)\s*\(",
    )
    .unwrap();
    // `class`, `interface` and `enum` declarations are all reported through `class_pattern`.
    let class = Regex::new(
        r"(?m)^[ \t]*(?:@\w+(?:\([^)]*\))?\s+)*(?:public\s+)?(?:abstract\s+)?(?:final\s+)?(?:class|interface|enum)\s+(\w+)",
    )
    .unwrap();

    LanguagePatterns {
        function_pattern: Some(method),
        class_pattern: Some(class),
        struct_pattern: None,
        module_pattern: None,
    }
});
/// Boundary patterns for C source, compiled once on first use.
///
/// Both patterns are anchored at the very start of a line (no leading whitespace
/// is accepted). Group 1 captures the name; it is optional in the struct pattern.
static C_PATTERNS: LazyLock<LanguagePatterns> = LazyLock::new(|| {
    // A definition, not a prototype: the parameter list must be followed (after
    // optional whitespace) by the opening `{`.
    let function = Regex::new(
        r"(?m)^(?:static\s+)?(?:inline\s+)?(?:\w+\s+)+\*?\s*(\w+)\s*\([^)]*\)\s*\{",
    )
    .unwrap();
    // `struct`, `union` and `enum`, optionally behind `typedef`; the tag may be absent.
    let aggregate =
        Regex::new(r"(?m)^(?:typedef\s+)?(?:struct|union|enum)\s+(\w+)?").unwrap();

    LanguagePatterns {
        function_pattern: Some(function),
        class_pattern: None,
        struct_pattern: Some(aggregate),
        module_pattern: None,
    }
});
/// Boundary patterns for C++ source, compiled once on first use.
///
/// All patterns are anchored at the very start of a line (no leading whitespace
/// is accepted). Group 1 captures the name.
static CPP_PATTERNS: LazyLock<LanguagePatterns> = LazyLock::new(|| {
    // A definition, not a declaration: after the parameter list and optional
    // `const` / `override` / `final`, the opening `{` is required.
    let function = Regex::new(
        r"(?m)^(?:template\s*<[^>]*>\s*)?(?:static\s+)?(?:inline\s+)?(?:virtual\s+)?(?:explicit\s+)?(?:\w+\s+)+\*?\s*(\w+)\s*\([^)]*\)\s*(?:const\s*)?(?:override\s*)?(?:final\s*)?\{",
    )
    .unwrap();
    // `class` and `struct` are both reported through `class_pattern`; one extra word
    // may precede the name, and a single base-class clause may follow it.
    let class = Regex::new(
        r"(?m)^(?:template\s*<[^>]*>\s*)?(?:class|struct)\s+(?:\w+\s+)?(\w+)(?:\s*:\s*(?:public|private|protected)\s+\w+)?",
    )
    .unwrap();
    let namespace = Regex::new(r"(?m)^namespace\s+(\w+)").unwrap();

    LanguagePatterns {
        function_pattern: Some(function),
        class_pattern: Some(class),
        struct_pattern: None,
        module_pattern: Some(namespace),
    }
});
/// Splits source text into semantic units using per-language regular expressions.
///
/// `extract_boundaries` finds unit starts with the language's patterns, and
/// `split_large_unit` breaks a unit whose content is longer than `max_chunk_size`
/// into line-aligned chunks.
pub struct RegexChunker {
/// Size threshold in bytes (`str::len`) that `split_large_unit` compares chunk
/// content against.
max_chunk_size: usize,
/// Number of trailing bytes of a finished chunk that `find_overlap_start` aims to
/// carry over into the next chunk, before snapping back to a line start.
overlap: usize,
}
impl RegexChunker {
pub fn new(max_chunk_size: usize, overlap: usize) -> Self {
Self {
max_chunk_size,
overlap,
}
}
fn get_patterns(language: Language) -> Option<&'static LanguagePatterns> {
match language {
Language::Rust => Some(&*RUST_PATTERNS),
Language::Python => Some(&*PYTHON_PATTERNS),
Language::JavaScript | Language::Jsx => Some(&*JAVASCRIPT_PATTERNS),
Language::TypeScript | Language::Tsx => Some(&*TYPESCRIPT_PATTERNS),
Language::Go => Some(&*GO_PATTERNS),
Language::Java => Some(&*JAVA_PATTERNS),
Language::C => Some(&*C_PATTERNS),
Language::Cpp => Some(&*CPP_PATTERNS),
_ => None,
}
}
pub fn extract_boundaries(
&self,
content: &str,
language: Language,
) -> Result<Vec<SemanticUnit>, RegexChunkError> {
let patterns = Self::get_patterns(language).ok_or(RegexChunkError::UnsupportedLanguage)?;
let mut boundaries: Vec<(usize, String, SemanticUnitType)> = Vec::new();
if let Some(ref pattern) = patterns.function_pattern {
for cap in pattern.captures_iter(content) {
let full_match = cap.get(0).unwrap();
let name = cap.get(1).map(|m| m.as_str().to_string());
boundaries.push((
full_match.start(),
name.unwrap_or_else(|| "anonymous".to_string()),
SemanticUnitType::Function,
));
}
}
if let Some(ref pattern) = patterns.class_pattern {
for cap in pattern.captures_iter(content) {
let full_match = cap.get(0).unwrap();
let name = cap.get(1).map(|m| m.as_str().to_string());
boundaries.push((
full_match.start(),
name.unwrap_or_else(|| "anonymous".to_string()),
SemanticUnitType::Class,
));
}
}
if let Some(ref pattern) = patterns.struct_pattern {
for cap in pattern.captures_iter(content) {
let full_match = cap.get(0).unwrap();
let name = cap.get(1).map(|m| m.as_str().to_string());
boundaries.push((
full_match.start(),
name.unwrap_or_else(|| "anonymous".to_string()),
SemanticUnitType::Struct,
));
}
}
if let Some(ref pattern) = patterns.module_pattern {
for cap in pattern.captures_iter(content) {
let full_match = cap.get(0).unwrap();
let name = cap.get(1).map(|m| m.as_str().to_string());
boundaries.push((
full_match.start(),
name.unwrap_or_else(|| "anonymous".to_string()),
SemanticUnitType::Module,
));
}
}
if boundaries.is_empty() {
return Err(RegexChunkError::NoBoundariesFound);
}
boundaries.sort_by_key(|(pos, _, _)| *pos);
self.boundaries_to_units(content, boundaries)
}
fn boundaries_to_units(
&self,
content: &str,
boundaries: Vec<(usize, String, SemanticUnitType)>,
) -> Result<Vec<SemanticUnit>, RegexChunkError> {
let mut units = Vec::new();
let line_starts: Vec<usize> = std::iter::once(0)
.chain(content.match_indices('\n').map(|(i, _)| i + 1))
.collect();
let byte_to_line = |byte_pos: usize| -> usize {
line_starts
.iter()
.take_while(|&&start| start <= byte_pos)
.count()
};
if let Some((first_pos, _, _)) = boundaries.first() {
if *first_pos > 0 {
let leading_content = &content[..*first_pos];
if !leading_content.trim().is_empty() {
let end_line = byte_to_line(*first_pos);
units.push(SemanticUnit {
content: leading_content.trim_end().to_string(),
start_line: 1,
end_line,
unit_type: SemanticUnitType::Other,
symbol_name: None,
});
}
}
}
for (i, (start_pos, name, unit_type)) in boundaries.iter().enumerate() {
let end_pos = boundaries
.get(i + 1)
.map(|(pos, _, _)| *pos)
.unwrap_or(content.len());
let unit_content = &content[*start_pos..end_pos];
let trimmed = unit_content.trim_end();
if trimmed.is_empty() {
continue;
}
let start_line = byte_to_line(*start_pos);
let end_line = start_line + trimmed.lines().count().saturating_sub(1);
units.push(SemanticUnit {
content: trimmed.to_string(),
start_line,
end_line: end_line.max(start_line),
unit_type: *unit_type,
symbol_name: Some(name.clone()),
});
}
Ok(units)
}
pub fn split_large_unit(&self, unit: &SemanticUnit) -> Vec<SemanticUnit> {
if unit.content.len() <= self.max_chunk_size {
return vec![unit.clone()];
}
let lines: Vec<&str> = unit.content.lines().collect();
if lines.is_empty() {
return vec![unit.clone()];
}
let signature = self.extract_signature(&unit.content);
let mut chunks = Vec::new();
let mut current_chunk = String::new();
let mut chunk_start_line = unit.start_line;
let mut is_first_chunk = true;
for (i, line) in lines.iter().enumerate() {
let line_with_newline = if i < lines.len() - 1 {
format!("{}\n", line)
} else {
line.to_string()
};
let would_exceed = if is_first_chunk {
current_chunk.len() + line_with_newline.len() > self.max_chunk_size
} else {
signature.len() + 1 + current_chunk.len() + line_with_newline.len()
> self.max_chunk_size
};
if would_exceed && !current_chunk.is_empty() {
let chunk_content = if is_first_chunk {
current_chunk.clone()
} else {
format!("{}\n{}", signature, current_chunk)
};
chunks.push(SemanticUnit {
content: chunk_content,
start_line: chunk_start_line,
end_line: unit.start_line + i - 1,
unit_type: unit.unit_type,
symbol_name: unit.symbol_name.clone(),
});
let overlap_start = self.find_overlap_start(¤t_chunk);
current_chunk = current_chunk[overlap_start..].to_string();
let overlap_lines = current_chunk.lines().count();
chunk_start_line = unit.start_line + i - overlap_lines;
is_first_chunk = false;
}
current_chunk.push_str(&line_with_newline);
}
if !current_chunk.trim().is_empty() {
let chunk_content = if is_first_chunk {
current_chunk
} else {
format!("{}\n{}", signature, current_chunk)
};
chunks.push(SemanticUnit {
content: chunk_content,
start_line: chunk_start_line,
end_line: unit.end_line,
unit_type: unit.unit_type,
symbol_name: unit.symbol_name.clone(),
});
}
chunks
}
fn extract_signature(&self, content: &str) -> String {
let lines: Vec<&str> = content.lines().collect();
if lines.is_empty() {
return String::new();
}
let mut signature_lines = Vec::new();
for line in &lines {
signature_lines.push(*line);
let trimmed = line.trim();
if trimmed.ends_with('{') || trimmed.ends_with(':') {
break;
}
if signature_lines.len() >= 3 {
break;
}
}
signature_lines.join("\n")
}
fn find_overlap_start(&self, content: &str) -> usize {
if content.len() <= self.overlap {
return 0;
}
let target_start = content.len() - self.overlap;
let mut start = target_start;
while start > 0 && !content.is_char_boundary(start) {
start -= 1;
}
if let Some(pos) = content[..start].rfind('\n') {
return pos + 1;
}
start
}
}
/// Reasons why regex-based boundary extraction can fail.
#[derive(Debug, Clone)]
pub enum RegexChunkError {
    /// No pattern set exists for the requested language.
    UnsupportedLanguage,
    /// None of the language's patterns matched anywhere in the content.
    NoBoundariesFound,
}

impl std::fmt::Display for RegexChunkError {
    /// Writes the fixed, human-readable message for the variant.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            Self::UnsupportedLanguage => "Language not supported for regex chunking",
            Self::NoBoundariesFound => "No semantic boundaries found in content",
        };
        f.write_str(message)
    }
}

impl std::error::Error for RegexChunkError {}
// Unit tests for `RegexChunker`: boundary detection for several languages, splitting
// of oversized units, and the unsupported-language error path.
#[cfg(test)]
mod tests {
use super::*;
// Plain, `async` and `pub(crate)` functions must each yield a Function unit.
#[test]
fn test_rust_function_detection() {
let content = r#"
pub fn hello_world() {
println!("Hello");
}
async fn async_func() {
todo!()
}
pub(crate) fn restricted() {}
"#;
let chunker = RegexChunker::new(1000, 100);
let units = chunker.extract_boundaries(content, Language::Rust).unwrap();
let functions: Vec<_> = units
.iter()
.filter(|u| u.unit_type == SemanticUnitType::Function)
.collect();
assert!(functions.len() >= 3);
}
// `def` / `async def` must yield Function units and `class` exactly one Class unit.
#[test]
fn test_python_detection() {
let content = r#"
def hello():
print("hello")
class MyClass:
def method(self):
pass
async def async_func():
await something()
"#;
let chunker = RegexChunker::new(1000, 100);
let units = chunker
.extract_boundaries(content, Language::Python)
.unwrap();
let functions: Vec<_> = units
.iter()
.filter(|u| u.unit_type == SemanticUnitType::Function)
.collect();
assert!(functions.len() >= 2);
let classes: Vec<_> = units
.iter()
.filter(|u| u.unit_type == SemanticUnitType::Class)
.collect();
assert_eq!(classes.len(), 1);
}
// Smoke test: JavaScript functions and classes produce at least one unit.
#[test]
fn test_javascript_detection() {
let content = r#"
function hello() {
console.log("hello");
}
async function asyncFunc() {
await fetch();
}
class MyClass {
constructor() {}
}
export default function exportedFunc() {}
"#;
let chunker = RegexChunker::new(1000, 100);
let units = chunker
.extract_boundaries(content, Language::JavaScript)
.unwrap();
assert!(!units.is_empty());
}
// A free function and a method with a receiver must both be detected as functions.
#[test]
fn test_go_detection() {
let content = r#"
func main() {
fmt.Println("hello")
}
func (s *Server) Handle() {
// method
}
type Config struct {
Name string
}
"#;
let chunker = RegexChunker::new(1000, 100);
let units = chunker.extract_boundaries(content, Language::Go).unwrap();
let functions: Vec<_> = units
.iter()
.filter(|u| u.unit_type == SemanticUnitType::Function)
.collect();
assert!(functions.len() >= 2);
}
// A 52-line unit with a 100-byte limit must be split into more than one chunk.
#[test]
fn test_split_large_unit() {
let chunker = RegexChunker::new(100, 20);
let content = (0..50)
.map(|i| format!(" line{};", i))
.collect::<Vec<_>>()
.join("\n");
let large_unit = SemanticUnit {
content: format!("fn large() {{\n{}\n}}", content),
start_line: 1,
end_line: 52,
unit_type: SemanticUnitType::Function,
symbol_name: Some("large".to_string()),
};
let chunks = chunker.split_large_unit(&large_unit);
assert!(chunks.len() > 1);
}
// A language without a pattern set must be rejected with `UnsupportedLanguage`.
#[test]
fn test_unsupported_language() {
let chunker = RegexChunker::new(1000, 100);
let result = chunker.extract_boundaries("some content", Language::Unknown);
assert!(matches!(result, Err(RegexChunkError::UnsupportedLanguage)));
}
}