pub mod brace_family;
pub mod config_files;
pub mod python;
pub mod scanner;
use std::collections::{HashMap, HashSet};
use crate::context::FileContext;
/// Rough characters-per-token ratio: `group_semantic_batches` divides a unit's
/// content length (`str::len`, i.e. bytes) by this to estimate its token cost.
const CHARS_PER_TOKEN: u32 = 4;
/// Default token budget for semantic batching.
///
/// Not referenced elsewhere in this file; presumably passed by callers as the
/// `max_tokens` argument of `group_semantic_batches` (confirm at call sites).
pub const DEFAULT_MAX_SEMANTIC_TOKENS: u32 = 4000;
/// Category tag stored in the `kind` field of every [`SemanticUnit`].
///
/// Outside the tests, this file only ever produces `TopLevel` (in
/// `fallback_extract`). The remaining variants are presumably emitted by the
/// language-specific extractors in the submodules; check there for the exact
/// meaning of each one.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum UnitKind {
Function,
Method,
Class,
Struct,
Enum,
Trait,
Impl,
DeclarationGroup,
/// Used by `fallback_extract` for its line-window unit around the changed lines.
TopLevel,
}
/// A span of a source file together with the changed lines that fall inside it.
///
/// Units are what `group_semantic_batches` packs into [`SemanticBatch`]es.
#[derive(Debug, Clone)]
pub struct SemanticUnit {
/// Category of the unit.
pub kind: UnitKind,
/// Label for the unit; printed in the oversized-unit warning of
/// `group_semantic_batches`. `fallback_extract` uses `"lines {start}-{end}"`.
pub name: String,
/// Path of the file the unit came from (`fallback_extract` copies its
/// `file_path` argument here).
pub file: String,
/// First line of the unit. `fallback_extract` treats this as 1-based and
/// inclusive; other extractors presumably follow the same convention (confirm).
pub start_line: u32,
/// Last line of the unit (inclusive in `fallback_extract`).
pub end_line: u32,
/// Text of the unit; its byte length drives the token estimate used for batching.
pub content: String,
/// Changed line numbers belonging to this unit. `fallback_extract` stores
/// them in ascending order, restricted to `start_line..=end_line`.
pub changed_lines: Vec<u32>,
/// File-level context attached to the unit (`fallback_extract` clones the
/// caller-supplied value).
pub context: FileContext,
}
/// A group of units produced by `group_semantic_batches`.
#[derive(Debug)]
pub struct SemanticBatch {
/// Units in this batch, in the order they were supplied.
pub units: Vec<SemanticUnit>,
/// Sum of the per-unit token estimates of `units`; may exceed the requested
/// limit when the batch holds a single oversized unit.
pub estimated_tokens: u32,
}
/// Common interface of the per-language extractors dispatched through
/// [`ExtractorKind`].
pub trait LanguageExtractor {
/// Returns the extractor's language identifier string.
fn language_id(&self) -> &str;
/// Returns the file extensions this extractor is registered for.
///
/// `LanguageRegistry` uses these strings verbatim as lookup keys and matches
/// them against the text after the final `.` of a path, so they need to be
/// written without a leading dot to ever match.
fn extensions(&self) -> &[&str];
/// Extracts semantic units from `content`.
///
/// * `content` - full text to analyse.
/// * `file_path` - path associated with `content`.
/// * `changed_lines` - line numbers considered changed; presumably 1-based,
///   as in `fallback_extract` (confirm in the implementations).
///
/// How units are selected is up to each implementation and is not visible
/// from this file.
fn extract_units(
&self,
content: &str,
file_path: &str,
changed_lines: &HashSet<u32>,
) -> Vec<SemanticUnit>;
}
/// Closed set of the extractors known to this module, stored by value in
/// `LanguageRegistry`.
///
/// Implements [`LanguageExtractor`] by matching on the variant and calling the
/// wrapped extractor.
#[derive(Clone)]
pub enum ExtractorKind {
/// One of the extractors returned by `brace_family::all_extractors()`.
BraceFamily(brace_family::BraceFamilyExtractor),
/// The Python extractor.
Python(python::PythonExtractor),
/// The configuration-file extractor.
Config(config_files::ConfigExtractor),
}
impl LanguageExtractor for ExtractorKind {
fn language_id(&self) -> &str {
match self {
Self::BraceFamily(e) => e.language_id(),
Self::Python(e) => e.language_id(),
Self::Config(e) => e.language_id(),
}
}
fn extensions(&self) -> &[&str] {
match self {
Self::BraceFamily(_) => &[],
Self::Python(e) => e.extensions(),
Self::Config(e) => e.extensions(),
}
}
fn extract_units(
&self,
content: &str,
file_path: &str,
changed_lines: &HashSet<u32>,
) -> Vec<SemanticUnit> {
match self {
Self::BraceFamily(e) => e.extract_units(content, file_path, changed_lines),
Self::Python(e) => e.extract_units(content, file_path, changed_lines),
Self::Config(e) => e.extract_units(content, file_path, changed_lines),
}
}
}
/// Lookup table from file extension to the extractor that handles it.
pub struct LanguageRegistry {
// Keyed by the extension strings reported by each extractor; `get` looks up
// the text after the final `.` of a path.
extractors: HashMap<String, ExtractorKind>,
}
impl LanguageRegistry {
    /// Builds a registry with every built-in extractor keyed by its extensions.
    ///
    /// Registration order is brace-family, then Python, then config files; a
    /// later registration replaces an earlier one for the same extension key.
    pub fn new() -> Self {
        let mut by_extension: HashMap<String, ExtractorKind> = HashMap::new();

        for brace in brace_family::all_extractors() {
            for suffix in brace.extensions() {
                by_extension.insert(
                    suffix.to_string(),
                    ExtractorKind::BraceFamily(brace.clone()),
                );
            }
        }

        let python_extractor = python::PythonExtractor;
        for suffix in python_extractor.extensions() {
            by_extension.insert(suffix.to_string(), ExtractorKind::Python(python_extractor));
        }

        let config_extractor = config_files::ConfigExtractor;
        for suffix in config_extractor.extensions() {
            by_extension.insert(suffix.to_string(), ExtractorKind::Config(config_extractor));
        }

        Self {
            extractors: by_extension,
        }
    }

    /// Returns the extractor registered for the text after the last `.` in
    /// `file_path`, or `None` when nothing is registered under that key.
    ///
    /// The comparison is exact (case-sensitive). A path without any `.` is
    /// looked up in full, because `rsplit` then yields the whole string.
    pub fn get(&self, file_path: &str) -> Option<&ExtractorKind> {
        file_path
            .rsplit('.')
            .next()
            .and_then(|suffix| self.extractors.get(suffix))
    }
}
impl Default for LanguageRegistry {
fn default() -> Self {
Self::new()
}
}
/// Packs `units` into consecutive batches whose estimated token total stays
/// within `max_tokens`.
///
/// Each unit's cost is estimated as its content byte length divided by
/// `CHARS_PER_TOKEN` (rounded down). Units keep their input order. A unit whose
/// own estimate exceeds `max_tokens` closes the batch in progress, triggers a
/// warning on stderr, and is emitted as a batch of its own.
pub fn group_semantic_batches(units: Vec<SemanticUnit>, max_tokens: u32) -> Vec<SemanticBatch> {
    /// Moves the pending units (if any) into a finished batch and zeroes the
    /// running total.
    fn flush(
        finished: &mut Vec<SemanticBatch>,
        pending: &mut Vec<SemanticUnit>,
        pending_tokens: &mut u32,
    ) {
        if pending.is_empty() {
            return;
        }
        finished.push(SemanticBatch {
            units: std::mem::take(pending),
            estimated_tokens: *pending_tokens,
        });
        *pending_tokens = 0;
    }

    let mut finished: Vec<SemanticBatch> = Vec::new();
    let mut pending: Vec<SemanticUnit> = Vec::new();
    let mut pending_tokens: u32 = 0;

    for unit in units {
        // Lengths that do not fit in u32 are clamped before the division.
        let byte_len = u32::try_from(unit.content.len()).unwrap_or(u32::MAX);
        let cost = byte_len / CHARS_PER_TOKEN;

        if cost > max_tokens {
            // Too large to share a batch: close the current one and emit it solo.
            flush(&mut finished, &mut pending, &mut pending_tokens);
            eprintln!(
                "Warning: oversized semantic unit '{}' ({} tokens > {} limit)",
                unit.name, cost, max_tokens
            );
            finished.push(SemanticBatch {
                estimated_tokens: cost,
                units: vec![unit],
            });
        } else {
            // Start a new batch when this unit would push the current one over budget.
            if pending_tokens + cost > max_tokens {
                flush(&mut finished, &mut pending, &mut pending_tokens);
            }
            pending_tokens += cost;
            pending.push(unit);
        }
    }

    flush(&mut finished, &mut pending, &mut pending_tokens);
    finished
}
/// Builds a single `TopLevel` unit covering the changed lines plus a margin of
/// surrounding lines, for files that have no dedicated extractor.
///
/// * `content` - full text of the file.
/// * `file_path` - path recorded in the resulting unit.
/// * `changed_lines` - 1-based line numbers that changed.
/// * `context` - file context cloned into the resulting unit.
///
/// The window runs from 20 lines before the smallest changed line to 20 lines
/// after the largest, clamped to `1..=line_count`. The unit's `changed_lines`
/// are the inputs that fall inside that window, in ascending order.
///
/// Returns an empty vector when `changed_lines` is empty, or when the window
/// lies entirely past the end of `content` (including empty `content`), since
/// there is no text to put in a unit in that case.
pub fn fallback_extract(
    content: &str,
    file_path: &str,
    changed_lines: &HashSet<u32>,
    context: &FileContext,
) -> Vec<SemanticUnit> {
    /// Number of unchanged lines kept on each side of the changed range.
    const CONTEXT_LINES: u32 = 20;

    let (min_line, max_line) = match (changed_lines.iter().min(), changed_lines.iter().max()) {
        (Some(&lo), Some(&hi)) => (lo, hi),
        _ => return Vec::new(),
    };

    let lines: Vec<&str> = content.lines().collect();
    let total = u32::try_from(lines.len()).unwrap_or(u32::MAX);

    let start = min_line.saturating_sub(CONTEXT_LINES).max(1);
    // Saturating add: a line number near u32::MAX must not overflow.
    let end = max_line.saturating_add(CONTEXT_LINES).min(total);

    // Every changed line is beyond the last line of `content`; a unit with
    // start_line > end_line and empty text would be meaningless.
    if start > end {
        return Vec::new();
    }

    // Convert the 1-based inclusive window into a 0-based half-open slice range.
    let start_idx = (start - 1) as usize;
    let end_idx = (end as usize).min(lines.len());
    let unit_content = lines
        .get(start_idx..end_idx)
        .map(|window| window.join("\n"))
        .unwrap_or_default();

    let mut sorted_changed: Vec<u32> = changed_lines
        .iter()
        .copied()
        .filter(|line| (start..=end).contains(line))
        .collect();
    sorted_changed.sort_unstable();

    vec![SemanticUnit {
        kind: UnitKind::TopLevel,
        name: format!("lines {}-{}", start, end),
        file: file_path.to_string(),
        start_line: start,
        end_line: end,
        content: unit_content,
        changed_lines: sorted_changed,
        context: context.clone(),
    }]
}
// Unit tests for batching and for extension-based registry dispatch.
#[cfg(test)]
mod tests {
use super::*;
// Builds a `Function` unit whose content is `content_len` bytes of 'x', so its
// token estimate is `content_len / CHARS_PER_TOKEN`.
fn make_unit(name: &str, content_len: usize) -> SemanticUnit {
SemanticUnit {
kind: UnitKind::Function,
name: name.to_string(),
file: "test.rs".to_string(),
start_line: 1,
end_line: 10,
content: "x".repeat(content_len),
changed_lines: vec![5],
context: FileContext::default(),
}
}
// Two 100-token units cannot share a 150-token batch, so each gets its own.
#[test]
fn group_semantic_batches_respects_limit() {
let units = vec![make_unit("a", 400), make_unit("b", 400)];
let batches = group_semantic_batches(units, 150);
assert_eq!(batches.len(), 2, "Should split into 2 batches");
assert_eq!(batches[0].units.len(), 1);
assert_eq!(batches[1].units.len(), 1);
}
// A 2000-token unit exceeds the 500-token limit and must be emitted alone.
#[test]
fn group_semantic_batches_oversized_unit_solo() {
let units = vec![
make_unit("small1", 400),
make_unit("big", 8000),
make_unit("small2", 400),
];
let batches = group_semantic_batches(units, 500);
assert!(
batches.len() >= 2,
"Oversized unit should be in its own batch"
);
let big_batch = batches
.iter()
.find(|b| b.units.iter().any(|u| u.name == "big"))
.expect("Should find batch with big unit");
assert_eq!(
big_batch.units.len(),
1,
"Oversized unit should be alone in its batch"
);
}
// A `.rs` path resolves to a brace-family extractor.
#[test]
fn registry_dispatches_by_extension() {
let registry = LanguageRegistry::new();
let ext = registry.get("src/main.rs");
assert!(ext.is_some(), ".rs should be registered");
assert!(
matches!(ext.unwrap(), ExtractorKind::BraceFamily(_)),
".rs should use BraceFamily extractor"
);
}
// An extension nobody registered yields `None`.
#[test]
fn registry_returns_none_for_unknown() {
let registry = LanguageRegistry::new();
assert!(
registry.get("data.xyz").is_none(),
"Unknown extension should return None"
);
}
}