use std::collections::HashSet;
use super::scanner::{find_closing_brace, CodeScanner, LexicalRules, ScannedLine};
use super::{LanguageExtractor, SemanticUnit, UnitKind};
use crate::context::FileContext;
#[derive(Clone)]
pub struct BraceFamilyExtractor {
language: String,
exts: Vec<String>,
function_keywords: Vec<String>,
block_keywords: Vec<String>,
declaration_keywords: Vec<String>,
doc_comment_prefixes: Vec<String>,
attribute_prefixes: Vec<String>,
lexical_rules: LexicalRules,
}
impl LanguageExtractor for BraceFamilyExtractor {
fn language_id(&self) -> &str {
&self.language
}
fn extensions(&self) -> &[&str] {
&[]
}
fn extract_units(
&self,
content: &str,
file_path: &str,
changed_lines: &HashSet<u32>,
) -> Vec<SemanticUnit> {
if changed_lines.is_empty() {
return vec![];
}
let lines: Vec<&str> = content.lines().collect();
let mut scanner = CodeScanner::new(self.lexical_rules.clone());
let scanned = scanner.scan_all(content);
let constructs = self.detect_constructs(&lines, &scanned);
let mut units: Vec<SemanticUnit> = Vec::new();
let mut covered_lines: HashSet<u32> = HashSet::new();
for construct in &constructs {
let overlaps = changed_lines
.iter()
.any(|&l| l >= construct.start_line && l <= construct.end_line);
if overlaps {
let changed_in_unit: Vec<u32> = changed_lines
.iter()
.copied()
.filter(|&l| l >= construct.start_line && l <= construct.end_line)
.collect();
for &l in &changed_in_unit {
covered_lines.insert(l);
}
let start_idx = (construct.start_line - 1) as usize;
let end_idx = construct.end_line as usize;
let unit_content = lines
.get(start_idx..end_idx.min(lines.len()))
.unwrap_or(&[])
.join("\n");
units.push(SemanticUnit {
kind: construct.kind,
name: construct.name.clone(),
file: file_path.to_string(),
start_line: construct.start_line,
end_line: construct.end_line,
content: unit_content,
changed_lines: changed_in_unit,
context: FileContext::default(),
});
}
}
let orphans: Vec<u32> = changed_lines
.iter()
.copied()
.filter(|l| !covered_lines.contains(l))
.collect();
if !orphans.is_empty() {
let total_lines = u32::try_from(lines.len()).unwrap_or(u32::MAX);
let min_orphan = *orphans.iter().min().unwrap_or(&1);
let max_orphan = *orphans.iter().max().unwrap_or(&total_lines);
let start = self.find_gap_start(&constructs, min_orphan);
let end = self.find_gap_end(&constructs, max_orphan, total_lines);
let start_idx = start - 1;
let end_idx = end as usize;
let unit_content = lines
.get(start_idx..end_idx.min(lines.len()))
.unwrap_or(&[])
.join("\n");
let mut sorted_orphans = orphans;
sorted_orphans.sort_unstable();
let start_u32 = u32::try_from(start).unwrap_or(1);
units.push(SemanticUnit {
kind: UnitKind::TopLevel,
name: format!("lines {}-{}", start_u32, end),
file: file_path.to_string(),
start_line: start_u32,
end_line: end,
content: unit_content,
changed_lines: sorted_orphans,
context: FileContext::default(),
});
}
units
}
}
struct Construct {
kind: UnitKind,
name: String,
start_line: u32,
end_line: u32,
}
impl BraceFamilyExtractor {
fn detect_constructs(&self, lines: &[&str], scanned: &[ScannedLine]) -> Vec<Construct> {
let mut constructs = Vec::new();
let mut i = 0;
while i < lines.len() {
if i < scanned.len() && scanned[i].is_non_code {
i += 1;
continue;
}
let trimmed = lines[i].trim();
let line_num = u32::try_from(i + 1).unwrap_or(u32::MAX);
if let Some(kind) = self.match_function_keyword(trimmed) {
let name = extract_name_from_line(trimmed, &self.function_keywords);
let start = self.scan_back_for_docs(lines, i);
let end = if i < scanned.len() && scanned[i].brace_delta > 0 {
let close_idx = find_closing_brace(scanned, i);
u32::try_from(close_idx + 1).unwrap_or(line_num)
} else {
let mut brace_line = i;
for j in (i + 1)..lines.len().min(i + 3) {
if j < scanned.len() && scanned[j].brace_delta > 0 {
brace_line = j;
break;
}
}
if brace_line != i {
let close_idx = find_closing_brace(scanned, brace_line);
u32::try_from(close_idx + 1).unwrap_or(line_num)
} else {
line_num }
};
constructs.push(Construct {
kind,
name,
start_line: u32::try_from(start + 1).unwrap_or(1),
end_line: end,
});
i = (end as usize).max(i + 1);
continue;
}
if let Some(kind) = self.match_block_keyword(trimmed) {
let name = extract_name_from_line(trimmed, &self.block_keywords);
let start = self.scan_back_for_docs(lines, i);
let end = if i < scanned.len() && scanned[i].brace_delta > 0 {
let close_idx = find_closing_brace(scanned, i);
u32::try_from(close_idx + 1).unwrap_or(line_num)
} else {
let mut brace_line = i;
for j in (i + 1)..lines.len().min(i + 3) {
if j < scanned.len() && scanned[j].brace_delta > 0 {
brace_line = j;
break;
}
}
if brace_line != i {
let close_idx = find_closing_brace(scanned, brace_line);
u32::try_from(close_idx + 1).unwrap_or(line_num)
} else {
line_num
}
};
constructs.push(Construct {
kind,
name,
start_line: u32::try_from(start + 1).unwrap_or(1),
end_line: end,
});
i = (end as usize).max(i + 1);
continue;
}
if self.match_declaration_keyword(trimmed) {
let group_start = i;
let mut group_end = i;
while group_end + 1 < lines.len() {
let next = lines[group_end + 1].trim();
if self.match_declaration_keyword(next) {
group_end += 1;
} else if next.is_empty() {
if group_end + 2 < lines.len()
&& self.match_declaration_keyword(lines[group_end + 2].trim())
{
group_end += 2;
} else {
break;
}
} else {
break;
}
}
let doc_start = self.scan_back_for_docs(lines, group_start);
let name = self.declaration_group_name(trimmed);
constructs.push(Construct {
kind: UnitKind::DeclarationGroup,
name,
start_line: u32::try_from(doc_start + 1).unwrap_or(1),
end_line: u32::try_from(group_end + 1).unwrap_or(1),
});
i = group_end + 1;
continue;
}
i += 1;
}
constructs
}
fn match_function_keyword(&self, trimmed: &str) -> Option<UnitKind> {
for kw in &self.function_keywords {
if trimmed.starts_with(kw.as_str()) {
return Some(UnitKind::Function);
}
}
None
}
fn match_block_keyword(&self, trimmed: &str) -> Option<UnitKind> {
for kw in &self.block_keywords {
if trimmed.starts_with(kw.as_str()) {
return match kw.trim() {
"struct" => Some(UnitKind::Struct),
"pub struct" => Some(UnitKind::Struct),
"pub(crate) struct" => Some(UnitKind::Struct),
"enum" | "pub enum" | "pub(crate) enum" => Some(UnitKind::Enum),
"trait" | "pub trait" | "pub(crate) trait" => Some(UnitKind::Trait),
"impl" => Some(UnitKind::Impl),
"class" | "pub class" | "export class" => Some(UnitKind::Class),
"mod" | "pub mod" | "pub(crate) mod" => Some(UnitKind::Function), "interface" | "pub interface" | "export interface" => Some(UnitKind::Trait),
"namespace" => Some(UnitKind::Class),
"union" | "type" => Some(UnitKind::Struct),
_ => Some(UnitKind::Class),
};
}
}
None
}
fn match_declaration_keyword(&self, trimmed: &str) -> bool {
for kw in &self.declaration_keywords {
if trimmed.starts_with(kw.as_str()) {
return true;
}
}
false
}
fn scan_back_for_docs(&self, lines: &[&str], idx: usize) -> usize {
let mut start = idx;
while start > 0 {
let prev = lines[start - 1].trim();
let is_doc = self
.doc_comment_prefixes
.iter()
.any(|p| prev.starts_with(p.as_str()));
let is_attr = self
.attribute_prefixes
.iter()
.any(|p| prev.starts_with(p.as_str()));
if is_doc || is_attr {
start -= 1;
} else {
break;
}
}
start
}
fn declaration_group_name(&self, first_line: &str) -> String {
let trimmed = first_line.trim();
if trimmed.starts_with("use ")
|| trimmed.starts_with("pub use ")
|| trimmed.starts_with("import ")
|| trimmed.starts_with("from ")
{
"imports".to_string()
} else if trimmed.starts_with("const ") || trimmed.starts_with("pub const ") {
"constants".to_string()
} else if trimmed.starts_with("static ") || trimmed.starts_with("pub static ") {
"statics".to_string()
} else if trimmed.starts_with("#define ") {
"defines".to_string()
} else if trimmed.starts_with("typedef ") {
"typedefs".to_string()
} else if trimmed.starts_with("extern ") {
"externs".to_string()
} else if trimmed.starts_with("type ") || trimmed.starts_with("pub type ") {
"type aliases".to_string()
} else if trimmed.starts_with("let ") || trimmed.starts_with("var ") {
"variables".to_string()
} else if trimmed.starts_with("export ") {
"exports".to_string()
} else if trimmed.starts_with("package ") {
"package".to_string()
} else {
"declarations".to_string()
}
}
fn find_gap_start(&self, constructs: &[Construct], line: u32) -> usize {
let mut start = 1u32;
for c in constructs {
if c.end_line < line {
start = start.max(c.end_line + 1);
}
}
start as usize
}
fn find_gap_end(&self, constructs: &[Construct], line: u32, total: u32) -> u32 {
let mut end = total;
for c in constructs {
if c.start_line > line && c.start_line < end {
end = c.start_line - 1;
}
}
end
}
}
fn extract_name_from_line(trimmed: &str, keywords: &[String]) -> String {
for kw in keywords {
if let Some(rest) = trimmed.strip_prefix(kw.as_str()) {
let name: String = rest
.chars()
.take_while(|c| c.is_alphanumeric() || *c == '_')
.collect();
if !name.is_empty() {
return name;
}
}
}
trimmed
.split_whitespace()
.nth(1)
.unwrap_or("unnamed")
.chars()
.take_while(|c| c.is_alphanumeric() || *c == '_')
.collect::<String>()
}
pub fn all_extractors() -> Vec<BraceFamilyExtractor> {
vec![
BraceFamilyExtractor::rust(),
BraceFamilyExtractor::c(),
BraceFamilyExtractor::cpp(),
BraceFamilyExtractor::go(),
BraceFamilyExtractor::java(),
BraceFamilyExtractor::javascript(),
BraceFamilyExtractor::typescript(),
]
}
impl BraceFamilyExtractor {
pub fn rust() -> Self {
Self {
language: "rust".to_string(),
exts: vec!["rs".to_string()],
function_keywords: vec![
"pub async fn ".to_string(),
"pub(crate) async fn ".to_string(),
"async fn ".to_string(),
"pub(crate) fn ".to_string(),
"pub fn ".to_string(),
"fn ".to_string(),
],
block_keywords: vec![
"pub(crate) struct ".to_string(),
"pub struct ".to_string(),
"struct ".to_string(),
"pub(crate) enum ".to_string(),
"pub enum ".to_string(),
"enum ".to_string(),
"pub(crate) trait ".to_string(),
"pub trait ".to_string(),
"trait ".to_string(),
"impl ".to_string(),
"pub(crate) mod ".to_string(),
"pub mod ".to_string(),
"mod ".to_string(),
],
declaration_keywords: vec![
"pub(crate) const ".to_string(),
"pub const ".to_string(),
"const ".to_string(),
"pub(crate) static ".to_string(),
"pub static ".to_string(),
"static ".to_string(),
"pub(crate) type ".to_string(),
"pub type ".to_string(),
"type ".to_string(),
"pub use ".to_string(),
"use ".to_string(),
],
doc_comment_prefixes: vec!["///".to_string(), "//!".to_string()],
attribute_prefixes: vec!["#[".to_string()],
lexical_rules: LexicalRules::rust(),
}
}
pub fn c() -> Self {
Self {
language: "c".to_string(),
exts: vec!["c".to_string(), "h".to_string()],
function_keywords: vec![],
block_keywords: vec![
"struct ".to_string(),
"union ".to_string(),
"enum ".to_string(),
],
declaration_keywords: vec![
"#define ".to_string(),
"typedef ".to_string(),
"extern ".to_string(),
"#include ".to_string(),
],
doc_comment_prefixes: vec!["/**".to_string(), "///".to_string()],
attribute_prefixes: vec![],
lexical_rules: LexicalRules::c(),
}
}
pub fn cpp() -> Self {
let mut ext = Self::c();
ext.language = "cpp".to_string();
ext.exts = vec![
"cpp".to_string(),
"cxx".to_string(),
"cc".to_string(),
"hpp".to_string(),
"hxx".to_string(),
];
ext.block_keywords
.extend(["class ".to_string(), "namespace ".to_string()]);
ext.lexical_rules = LexicalRules::cpp();
ext
}
pub fn go() -> Self {
Self {
language: "go".to_string(),
exts: vec!["go".to_string()],
function_keywords: vec!["func ".to_string()],
block_keywords: vec!["type ".to_string()],
declaration_keywords: vec![
"import ".to_string(),
"var ".to_string(),
"const ".to_string(),
],
doc_comment_prefixes: vec!["//".to_string()],
attribute_prefixes: vec![],
lexical_rules: LexicalRules::go(),
}
}
pub fn java() -> Self {
Self {
language: "java".to_string(),
exts: vec!["java".to_string()],
function_keywords: vec![
"public ".to_string(),
"private ".to_string(),
"protected ".to_string(),
"static ".to_string(),
],
block_keywords: vec![
"class ".to_string(),
"interface ".to_string(),
"enum ".to_string(),
"public class ".to_string(),
"public interface ".to_string(),
"public enum ".to_string(),
],
declaration_keywords: vec!["import ".to_string(), "package ".to_string()],
doc_comment_prefixes: vec!["/**".to_string(), "///".to_string()],
attribute_prefixes: vec!["@".to_string()],
lexical_rules: LexicalRules::java(),
}
}
pub fn javascript() -> Self {
Self {
language: "javascript".to_string(),
exts: vec!["js".to_string(), "jsx".to_string(), "mjs".to_string()],
function_keywords: vec![
"function ".to_string(),
"async function ".to_string(),
"export function ".to_string(),
"export async function ".to_string(),
"export default function ".to_string(),
],
block_keywords: vec![
"class ".to_string(),
"export class ".to_string(),
"export default class ".to_string(),
],
declaration_keywords: vec![
"import ".to_string(),
"export ".to_string(),
"const ".to_string(),
"let ".to_string(),
"var ".to_string(),
],
doc_comment_prefixes: vec!["/**".to_string(), "///".to_string()],
attribute_prefixes: vec!["@".to_string()],
lexical_rules: LexicalRules::javascript(),
}
}
pub fn typescript() -> Self {
let mut ext = Self::javascript();
ext.language = "typescript".to_string();
ext.exts = vec!["ts".to_string(), "tsx".to_string(), "mts".to_string()];
ext.block_keywords.push("interface ".to_string());
ext.block_keywords.push("export interface ".to_string());
ext.declaration_keywords.push("type ".to_string());
ext.declaration_keywords.push("export type ".to_string());
ext.lexical_rules = LexicalRules::typescript();
ext
}
pub fn extensions(&self) -> &[String] {
&self.exts
}
}
#[cfg(test)]
mod tests {
use super::*;
fn rust_extractor() -> BraceFamilyExtractor {
BraceFamilyExtractor::rust()
}
fn c_extractor() -> BraceFamilyExtractor {
BraceFamilyExtractor::c()
}
fn extract(ext: &BraceFamilyExtractor, code: &str, changed: &[u32]) -> Vec<SemanticUnit> {
let changed_set: HashSet<u32> = changed.iter().copied().collect();
ext.extract_units(code, "test.rs", &changed_set)
}
#[test]
fn rust_extracts_function_with_changed_line() {
let code = "fn foo() {\n let x = 1;\n let y = 2;\n}";
let units = extract(&rust_extractor(), code, &[2]);
assert_eq!(units.len(), 1, "Should extract one function");
assert_eq!(units[0].kind, UnitKind::Function);
assert_eq!(units[0].name, "foo");
assert!(units[0].content.contains("fn foo()"));
}
#[test]
fn rust_extracts_struct_with_changed_field() {
let code = "pub struct Config {\n pub model: String,\n pub backend: String,\n}";
let units = extract(&rust_extractor(), code, &[2]);
assert_eq!(units.len(), 1, "Should extract one struct");
assert_eq!(units[0].kind, UnitKind::Struct);
assert_eq!(units[0].name, "Config");
}
#[test]
fn rust_extracts_impl_block() {
let code = "impl Config {\n fn new() -> Self {\n Self {}\n }\n}";
let units = extract(&rust_extractor(), code, &[2]);
assert_eq!(units.len(), 1, "Should extract one impl block");
assert_eq!(units[0].kind, UnitKind::Impl);
assert!(units[0].name.starts_with("Config"));
}
#[test]
fn rust_extracts_const_declaration_group() {
let code = "const MAX: u32 = 100;\nconst MIN: u32 = 0;\nconst DEFAULT: u32 = 50;";
let units = extract(&rust_extractor(), code, &[2]);
assert_eq!(units.len(), 1, "Should extract one declaration group");
assert_eq!(units[0].kind, UnitKind::DeclarationGroup);
assert_eq!(units[0].name, "constants");
}
#[test]
fn rust_extracts_use_import_group() {
let code =
"use std::collections::HashMap;\nuse std::collections::HashSet;\nuse std::path::Path;";
let units = extract(&rust_extractor(), code, &[2]);
assert_eq!(units.len(), 1, "Should extract one import group");
assert_eq!(units[0].kind, UnitKind::DeclarationGroup);
assert_eq!(units[0].name, "imports");
}
#[test]
fn rust_includes_doc_comments_and_attributes() {
let code =
"/// A config struct.\n#[derive(Debug)]\npub struct Config {\n pub x: u32,\n}";
let units = extract(&rust_extractor(), code, &[4]);
assert_eq!(units.len(), 1);
assert!(
units[0].content.contains("/// A config struct."),
"Should include doc comment"
);
assert!(
units[0].content.contains("#[derive(Debug)]"),
"Should include attribute"
);
}
#[test]
fn rust_skips_unchanged_function() {
let code = "fn foo() {\n let x = 1;\n}\nfn bar() {\n let y = 2;\n}";
let units = extract(&rust_extractor(), code, &[5]);
assert_eq!(units.len(), 1, "Should only extract changed function");
assert_eq!(units[0].name, "bar");
}
#[test]
fn rust_orphan_line_between_functions() {
let code = "fn foo() {\n bar();\n}\n\n// orphan comment\n\nfn baz() {\n qux();\n}";
let units = extract(&rust_extractor(), code, &[5]);
assert!(
units.iter().any(|u| u.kind == UnitKind::TopLevel),
"Orphan line should produce a TopLevel unit"
);
}
#[test]
fn c_extracts_global_variable() {
let code = "extern int count;\nextern int total;";
let units = extract(&c_extractor(), code, &[1]);
assert_eq!(units.len(), 1);
assert_eq!(units[0].kind, UnitKind::DeclarationGroup);
}
#[test]
fn c_extracts_define_group() {
let code = "#define MAX_SIZE 100\n#define MIN_SIZE 10\n#define DEFAULT_SIZE 50";
let units = extract(&c_extractor(), code, &[2]);
assert_eq!(units.len(), 1, "Should extract one define group");
assert_eq!(units[0].kind, UnitKind::DeclarationGroup);
assert_eq!(units[0].name, "defines");
assert!(
units[0].content.contains("#define MAX_SIZE"),
"Group should include all consecutive defines"
);
assert!(
units[0].content.contains("#define DEFAULT_SIZE"),
"Group should include all consecutive defines"
);
}
}