// lean_ctx/core/semantic_chunks.rs
//! Semantic Chunking with Attention Bridges.
//!
//! Groups content into semantic chunks (function bodies, import blocks, type
//! definitions) rather than treating lines independently. Orders chunks for
//! optimal LLM attention flow:
//!
//! 1. Most relevant chunk FIRST (high-attention position)
//! 2. Its immediate dependencies (imports, types it uses) adjacent
//! 3. Supporting context in the middle
//! 4. Tail anchor: brief reference back to the primary chunk (attention bridge)
//!
//! This exploits how transformer attention actually works:
//! local coherence + global anchors beats scattered high-importance lines.
use std::collections::HashSet;
/// A contiguous group of source lines treated as one semantic unit
/// (e.g. a function body, an import block, or a run of plain logic).
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// The chunk's lines, verbatim (original indentation preserved).
    pub lines: Vec<String>,
    /// Coarse classification of what the chunk contains.
    pub kind: ChunkKind,
    /// Task-relevance score; `detect_chunks` leaves this at 0.0 and
    /// `order_for_attention` fills it in.
    pub relevance: f64,
    /// 0-based index of the chunk's first line within the original content.
    pub start_line: usize,
    /// Name of the function/type this chunk defines, when detectable.
    pub identifier: Option<String>,
}
/// Coarse classification of a chunk's content, used for relevance weighting.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ChunkKind {
    /// `use` / `import` / `from` / `#include` statements.
    Imports,
    /// struct/enum/trait/type/class/interface definitions.
    TypeDefinition,
    /// Function/method definitions.
    FunctionDef,
    /// Any other non-empty line.
    Logic,
    /// Blank lines (also the initial state before any line is seen).
    Empty,
}
35/// Detect semantic boundaries in content and group lines into chunks.
36pub fn detect_chunks(content: &str) -> Vec<SemanticChunk> {
37    let lines: Vec<&str> = content.lines().collect();
38    if lines.is_empty() {
39        return Vec::new();
40    }
41
42    let mut chunks: Vec<SemanticChunk> = Vec::new();
43    let mut current_lines: Vec<String> = Vec::new();
44    let mut current_kind = ChunkKind::Empty;
45    let mut current_start = 0;
46    let mut current_ident: Option<String> = None;
47    let mut brace_depth: i32 = 0;
48    let mut in_block = false;
49
50    for (i, &line) in lines.iter().enumerate() {
51        let trimmed = line.trim();
52        let line_kind = classify_line(trimmed);
53
54        let opens = trimmed.matches('{').count() as i32;
55        let closes = trimmed.matches('}').count() as i32;
56
57        if !in_block && is_block_start(trimmed) {
58            if !current_lines.is_empty() {
59                chunks.push(SemanticChunk {
60                    lines: current_lines.clone(),
61                    kind: current_kind,
62                    relevance: 0.0,
63                    start_line: current_start,
64                    identifier: current_ident.take(),
65                });
66                current_lines.clear();
67            }
68            current_start = i;
69            current_kind = line_kind;
70            current_ident = extract_identifier(trimmed);
71            in_block = opens > closes;
72            brace_depth = opens - closes;
73            current_lines.push(line.to_string());
74            continue;
75        }
76
77        if in_block {
78            brace_depth += opens - closes;
79            current_lines.push(line.to_string());
80            if brace_depth <= 0 {
81                in_block = false;
82                chunks.push(SemanticChunk {
83                    lines: current_lines.clone(),
84                    kind: current_kind,
85                    relevance: 0.0,
86                    start_line: current_start,
87                    identifier: current_ident.take(),
88                });
89                current_lines.clear();
90            }
91            continue;
92        }
93
94        // Boundary detection: blank lines or kind changes
95        let is_boundary =
96            trimmed.is_empty() || (line_kind != current_kind && !current_lines.is_empty());
97
98        if is_boundary && !current_lines.is_empty() {
99            chunks.push(SemanticChunk {
100                lines: current_lines.clone(),
101                kind: current_kind,
102                relevance: 0.0,
103                start_line: current_start,
104                identifier: current_ident.take(),
105            });
106            current_lines.clear();
107        }
108
109        if !trimmed.is_empty() {
110            if current_lines.is_empty() {
111                current_start = i;
112                current_kind = line_kind;
113            }
114            current_lines.push(line.to_string());
115        }
116    }
117
118    if !current_lines.is_empty() {
119        chunks.push(SemanticChunk {
120            lines: current_lines,
121            kind: current_kind,
122            relevance: 0.0,
123            start_line: current_start,
124            identifier: current_ident,
125        });
126    }
127
128    chunks
129}
130
131/// Score chunks by task relevance and reorder for optimal attention flow.
132pub fn order_for_attention(
133    mut chunks: Vec<SemanticChunk>,
134    task_keywords: &[String],
135) -> Vec<SemanticChunk> {
136    if chunks.is_empty() {
137        return chunks;
138    }
139
140    let kw_lower: Vec<String> = task_keywords.iter().map(|k| k.to_lowercase()).collect();
141
142    // Score each chunk
143    for chunk in &mut chunks {
144        let text = chunk.lines.join(" ").to_lowercase();
145        let keyword_score: f64 = kw_lower
146            .iter()
147            .filter(|kw| text.contains(kw.as_str()))
148            .count() as f64;
149
150        let kind_weight = match chunk.kind {
151            ChunkKind::FunctionDef => 2.0,
152            ChunkKind::TypeDefinition => 1.8,
153            ChunkKind::Imports => 1.0,
154            ChunkKind::Logic => 0.8,
155            ChunkKind::Empty => 0.1,
156        };
157
158        let size_factor = (chunk.lines.len() as f64 / 5.0).min(1.5);
159
160        chunk.relevance = keyword_score * 2.0 + kind_weight + size_factor * 0.3;
161    }
162
163    // Sort by relevance (most relevant first)
164    chunks.sort_by(|a, b| {
165        b.relevance
166            .partial_cmp(&a.relevance)
167            .unwrap_or(std::cmp::Ordering::Equal)
168    });
169
170    if chunks.len() <= 2 {
171        return chunks;
172    }
173
174    // Reorder: primary chunk first, then its dependencies, then rest
175    let primary = &chunks[0];
176    let primary_tokens: HashSet<String> = primary
177        .lines
178        .iter()
179        .flat_map(|l| l.split_whitespace().map(|w| w.to_lowercase()))
180        .collect();
181
182    let (mut deps, mut rest): (Vec<_>, Vec<_>) = chunks[1..].iter().cloned().partition(|chunk| {
183        if chunk.kind == ChunkKind::Imports || chunk.kind == ChunkKind::TypeDefinition {
184            let chunk_tokens: HashSet<String> = chunk
185                .lines
186                .iter()
187                .flat_map(|l| l.split_whitespace().map(|w| w.to_lowercase()))
188                .collect();
189            let overlap = primary_tokens.intersection(&chunk_tokens).count();
190            overlap >= 2
191        } else {
192            false
193        }
194    });
195
196    deps.sort_by(|a, b| {
197        b.relevance
198            .partial_cmp(&a.relevance)
199            .unwrap_or(std::cmp::Ordering::Equal)
200    });
201    rest.sort_by(|a, b| {
202        b.relevance
203            .partial_cmp(&a.relevance)
204            .unwrap_or(std::cmp::Ordering::Equal)
205    });
206
207    let mut ordered = Vec::with_capacity(chunks.len());
208    ordered.push(chunks[0].clone());
209    ordered.extend(deps);
210    ordered.extend(rest);
211
212    ordered
213}
214
215/// Render chunks back to text with attention bridges.
216pub fn render_with_bridges(chunks: &[SemanticChunk]) -> String {
217    if chunks.is_empty() {
218        return String::new();
219    }
220
221    let mut output = Vec::new();
222
223    for (i, chunk) in chunks.iter().enumerate() {
224        if i > 0 {
225            output.push(String::new());
226        }
227        for line in &chunk.lines {
228            output.push(line.clone());
229        }
230    }
231
232    // Tail anchor: reference back to primary chunk
233    if chunks.len() > 2 {
234        if let Some(ref ident) = chunks[0].identifier {
235            output.push(String::new());
236            output.push(format!("[primary: {ident}]"));
237        }
238    }
239
240    output.join("\n")
241}
242
243fn classify_line(trimmed: &str) -> ChunkKind {
244    if trimmed.is_empty() {
245        return ChunkKind::Empty;
246    }
247    if is_import(trimmed) {
248        return ChunkKind::Imports;
249    }
250    if is_type_def(trimmed) {
251        return ChunkKind::TypeDefinition;
252    }
253    if is_fn_start(trimmed) {
254        return ChunkKind::FunctionDef;
255    }
256    ChunkKind::Logic
257}
258
259fn is_block_start(trimmed: &str) -> bool {
260    is_fn_start(trimmed) || is_type_def(trimmed)
261}
262
/// True when `line` begins a function definition. Prefix-based and
/// multi-language: covers Rust (`fn`, with visibility/async modifiers),
/// JavaScript/TypeScript (`function`), Python (`def`), and Go (`func`).
fn is_fn_start(line: &str) -> bool {
    const FN_PREFIXES: &[&str] = &[
        "fn ",
        "pub fn ",
        "async fn ",
        "pub async fn ",
        "function ",
        "export function ",
        "async function ",
        "def ",
        "async def ",
        "func ",
        "pub(crate) fn ",
        "pub(super) fn ",
    ];
    for prefix in FN_PREFIXES {
        if line.starts_with(prefix) {
            return true;
        }
    }
    false
}
/// True when `line` begins a type definition: Rust struct/enum/trait/type
/// (with optional `pub`), or TS/class-language interface/class forms.
fn is_type_def(line: &str) -> bool {
    const TYPE_PREFIXES: &[&str] = &[
        "struct ",
        "pub struct ",
        "enum ",
        "pub enum ",
        "trait ",
        "pub trait ",
        "type ",
        "pub type ",
        "interface ",
        "export interface ",
        "class ",
        "export class ",
    ];
    for prefix in TYPE_PREFIXES {
        if line.starts_with(prefix) {
            return true;
        }
    }
    false
}
/// True when `line` is an import-like statement: Rust `use`, JS/Python
/// `import`/`from`, or a C-family `#include`.
fn is_import(line: &str) -> bool {
    const IMPORT_PREFIXES: &[&str] = &["use ", "import ", "from ", "#include"];
    IMPORT_PREFIXES.iter().any(|prefix| line.starts_with(prefix))
}
/// Extract the declared name from a definition line, e.g.
/// `"pub fn validate_token() {"` -> `Some("validate_token")`.
///
/// Leading visibility/async/export modifiers are stripped from the *front*
/// of the line only. The original used `str::replace`, which removed those
/// words anywhere in the line and — inconsistently with `is_fn_start` —
/// never handled `pub(crate)` / `pub(super)`, so such functions got no
/// identifier. Returns `None` when no known definition keyword follows.
fn extract_identifier(line: &str) -> Option<String> {
    const MODIFIERS: &[&str] = &["pub(crate) ", "pub(super) ", "pub ", "async ", "export "];
    const DEF_KEYWORDS: &[&str] = &[
        "fn ",
        "struct ",
        "enum ",
        "trait ",
        "type ",
        "class ",
        "interface ",
        "function ",
        "def ",
        "func ",
    ];

    // Strip stacked leading modifiers, e.g. "pub async fn name".
    let mut rest = line.trim_start();
    loop {
        let mut stripped = false;
        for modifier in MODIFIERS {
            if let Some(tail) = rest.strip_prefix(*modifier) {
                rest = tail.trim_start();
                stripped = true;
            }
        }
        if !stripped {
            break;
        }
    }

    for keyword in DEF_KEYWORDS {
        if let Some(tail) = rest.strip_prefix(*keyword) {
            // The identifier is the leading alphanumeric/underscore run.
            let name: String = tail
                .chars()
                .take_while(|c| c.is_alphanumeric() || *c == '_')
                .collect();
            if !name.is_empty() {
                return Some(name);
            }
        }
    }
    None
}
#[cfg(test)]
mod tests {
    use super::*;

    // An import block plus two functions should split into multiple chunks.
    #[test]
    fn detect_chunks_basic() {
        let content = "use std::io;\nuse std::fs;\n\nfn main() {\n    let x = 1;\n}\n\nfn helper() {\n    let y = 2;\n}";
        let chunks = detect_chunks(content);
        assert!(
            chunks.len() >= 2,
            "should detect multiple chunks, got {}",
            chunks.len()
        );
    }

    // A brace-delimited `fn` body must be classified as FunctionDef.
    #[test]
    fn detect_chunks_identifies_functions() {
        let content = "fn main() {\n    println!(\"hello\");\n}";
        let chunks = detect_chunks(content);
        assert!(
            chunks.iter().any(|c| c.kind == ChunkKind::FunctionDef),
            "should detect function definition"
        );
    }

    // Keyword match ("validate") should outrank an otherwise-equal chunk.
    #[test]
    fn order_puts_relevant_first() {
        let content =
            "fn unrelated() {\n    let x = 1;\n}\n\nfn validate_token() {\n    check();\n}";
        let chunks = detect_chunks(content);
        let ordered = order_for_attention(chunks, &["validate".to_string()]);
        assert!(
            ordered[0].identifier.as_deref() == Some("validate_token"),
            "most relevant chunk should be first"
        );
    }

    // With > 2 chunks and a named primary, rendering appends the tail anchor.
    #[test]
    fn render_with_bridges_adds_anchor() {
        let chunks = vec![
            SemanticChunk {
                lines: vec!["fn main() {".into(), "  let x = 1;".into(), "}".into()],
                kind: ChunkKind::FunctionDef,
                relevance: 5.0,
                start_line: 0,
                identifier: Some("main".into()),
            },
            SemanticChunk {
                lines: vec!["use std::io;".into()],
                kind: ChunkKind::Imports,
                relevance: 1.0,
                start_line: 5,
                identifier: None,
            },
            SemanticChunk {
                lines: vec!["fn helper() {".into(), "}".into()],
                kind: ChunkKind::FunctionDef,
                relevance: 0.5,
                start_line: 8,
                identifier: Some("helper".into()),
            },
        ];
        let result = render_with_bridges(&chunks);
        assert!(
            result.contains("[primary: main]"),
            "should have tail anchor"
        );
    }

    // Identifier extraction: modifier stripping, type defs, and a non-definition.
    #[test]
    fn extract_identifier_fn() {
        assert_eq!(
            extract_identifier("pub fn validate_token() {"),
            Some("validate_token".into())
        );
        assert_eq!(extract_identifier("struct Config {"), Some("Config".into()));
        assert_eq!(extract_identifier("let x = 1;"), None);
    }
}