Skip to main content

normalize_semantic/
chunks.rs

1//! Context window construction from the structural index.
2//!
3//! Each chunk is a rich text window centered on a symbol:
4//!
5//!   symbol name + signature
6//!   + doc comment (tree-sitter extracted)
7//!   + parent module/crate path
8//!   + callers (top N by frequency)
9//!   + callees
10//!   + co-change neighbors
11//!
12//! Additional source types:
13//!
14//! - **`doc`**: Markdown files chunked by heading section. Each section becomes
15//!   one chunk with a breadcrumb of parent headings prepended for context.
16//! - **`commit`**: Git commit messages (subject + body), keyed by commit hash.
17//!
18//! The quality of the embedding is directly upstream of the quality of the
19//! index. Better extraction -> better context windows -> better embeddings.
20
/// One unit of text queued for embedding; maps one-to-one onto a row of the
/// `embeddings` table.
#[derive(Clone, Debug)]
pub struct Chunk {
    /// Tag naming the kind of source: "symbol", "doc", "commit", or "cluster".
    pub source_type: String,
    /// Path of the file the source lives in, relative rather than absolute.
    pub source_path: String,
    /// Rowid in the `symbols` table (foreign key) when this chunk describes a
    /// symbol.
    pub source_id: Option<i64>,
    /// Exact text handed to the embedding model.
    pub text: String,
    /// Git HEAD SHA when the chunk was built; used for incremental
    /// invalidation.
    pub last_commit: Option<String>,
    /// Score in [0, 1] where larger values mean the chunk is less
    /// trustworthy.
    pub staleness: f32,
}
38
/// One record of the symbols table, carrying the fields needed to assemble a
/// symbol chunk.
#[derive(Clone, Debug)]
pub struct SymbolRow {
    /// Row identifier of this record (presumably the value stored in
    /// `Chunk::source_id` -- confirm against the caller).
    pub rowid: i64,
    /// File path printed in the chunk's header line.
    pub file: String,
    /// Symbol name printed in the chunk's header line.
    pub name: String,
    /// Kind label rendered inside the square brackets of the header line.
    pub kind: String,
    /// Line number printed after the file path in the header line.
    pub start_line: i64,
    /// Presumably the last line of the definition; not read in this file.
    pub end_line: i64,
    /// Enclosing item's name, rendered as a `parent.` prefix when present.
    pub parent: Option<String>,
}
50
51/// Build the chunk text for a symbol given its context.
52///
53/// The context window format:
54/// ```text
55/// [kind] name (parent if any)
56/// path/to/file.rs:start_line
57/// <doc comment prose>
58///
59/// Callers: foo, bar, baz
60/// Callees: qux, quux
61/// Co-changes with: some_file.rs, other_file.rs
62/// ```
63pub fn build_symbol_chunk(
64    symbol: &SymbolRow,
65    doc_comment: Option<&str>,
66    callers: &[String],
67    callees: &[String],
68    co_change_files: &[String],
69) -> String {
70    let mut parts: Vec<String> = Vec::new();
71
72    // Signature line
73    let sig = if let Some(parent) = &symbol.parent {
74        format!(
75            "[{}] {}.{} -- {}:{}",
76            symbol.kind, parent, symbol.name, symbol.file, symbol.start_line
77        )
78    } else {
79        format!(
80            "[{}] {} -- {}:{}",
81            symbol.kind, symbol.name, symbol.file, symbol.start_line
82        )
83    };
84    parts.push(sig);
85
86    // Doc comment prose (comment markers already stripped by caller)
87    if let Some(doc) = doc_comment {
88        let prose = collapse_line_wraps(doc.trim());
89        if !prose.is_empty() {
90            parts.push(prose);
91        }
92    }
93
94    // Callers
95    if !callers.is_empty() {
96        let top: Vec<&String> = callers.iter().take(5).collect();
97        parts.push(format!(
98            "Callers: {}",
99            top.iter()
100                .map(|s| s.as_str())
101                .collect::<Vec<_>>()
102                .join(", ")
103        ));
104    }
105
106    // Callees
107    if !callees.is_empty() {
108        let top: Vec<&String> = callees.iter().take(5).collect();
109        parts.push(format!(
110            "Callees: {}",
111            top.iter()
112                .map(|s| s.as_str())
113                .collect::<Vec<_>>()
114                .join(", ")
115        ));
116    }
117
118    // Co-change neighbors
119    if !co_change_files.is_empty() {
120        let top: Vec<&String> = co_change_files.iter().take(5).collect();
121        parts.push(format!(
122            "Co-changes with: {}",
123            top.iter()
124                .map(|s| s.as_str())
125                .collect::<Vec<_>>()
126                .join(", ")
127        ));
128    }
129
130    parts.join("\n")
131}
132
/// Assemble the chunk text for one heading section of a markdown file.
///
/// The result always starts with a `[doc] <path>` line. A non-empty
/// `heading_breadcrumb` follows on its own line, and a non-blank `body`
/// (trimmed) is appended after one blank line:
///
/// ```text
/// [doc] path/to/README.md
/// Parent Heading > Section Heading
///
/// Section body text here.
/// ```
pub fn build_markdown_chunk(path: &str, heading_breadcrumb: &str, body: &str) -> String {
    let header = format!("[doc] {path}");
    let section_text = body.trim();

    let titled = if heading_breadcrumb.is_empty() {
        header
    } else {
        header + "\n" + heading_breadcrumb
    };

    if section_text.is_empty() {
        titled
    } else {
        titled + "\n\n" + section_text
    }
}
155
/// Parse a markdown file into (heading_breadcrumb, body) section pairs.
///
/// Each ATX heading starts a new section. A line counts as a heading only when
/// it begins at column 0 with one to six `#` characters followed by a space, a
/// tab, or the end of the line; an optional closing run of `#` (as in
/// `## Title ##`) is not part of the title. Lines inside fenced code blocks
/// (fences of three or more backticks or tildes) are always body text, so
/// shell or Python comments in code samples do not split sections.
///
/// The breadcrumb joins the active heading hierarchy with ` > `. The root
/// section (text before the first heading) uses an empty breadcrumb and is
/// only emitted when it has non-blank text. A headed section is emitted even
/// when its body is empty. Blank lines directly after a heading are dropped;
/// every kept body line ends with `\n`.
pub fn split_markdown_sections(content: &str) -> Vec<(String, String)> {
    let mut sections: Vec<(String, String)> = Vec::new();
    // Stack holds (level, title) for the headings enclosing the current section.
    let mut heading_stack: Vec<(usize, String)> = Vec::new();
    let mut current_body = String::new();
    let mut current_breadcrumb = String::new();
    // (fence character, fence length) of the code block we are inside, if any.
    let mut open_fence: Option<(char, usize)> = None;

    for line in content.lines() {
        if let Some((marker, width)) = open_fence {
            // Only a bare fence of the same character, at least as long as the
            // opener, ends the block.
            let closes = matches!(
                fence_run(line),
                Some((m, n, rest)) if m == marker && n >= width && rest.trim().is_empty()
            );
            if closes {
                open_fence = None;
            }
            push_body_line(&mut current_body, line);
            continue;
        }

        if let Some((marker, width, info)) = fence_run(line) {
            // A backtick fence may not carry backticks in its info string;
            // such a line is inline code, not a fence opener.
            if marker != '`' || !info.contains('`') {
                open_fence = Some((marker, width));
                push_body_line(&mut current_body, line);
                continue;
            }
        }

        match parse_atx_heading(line) {
            Some((level, title)) => {
                // Flush the section that this heading terminates.
                if !current_body.trim().is_empty() || !current_breadcrumb.is_empty() {
                    sections.push((
                        std::mem::take(&mut current_breadcrumb),
                        std::mem::take(&mut current_body),
                    ));
                } else {
                    current_body.clear();
                }

                // Drop headings of equal or deeper level, then descend.
                heading_stack.retain(|(l, _)| *l < level);
                heading_stack.push((level, title.to_string()));

                current_breadcrumb = heading_stack
                    .iter()
                    .map(|(_, t)| t.as_str())
                    .collect::<Vec<_>>()
                    .join(" > ");
            }
            None => push_body_line(&mut current_body, line),
        }
    }

    // Flush final section
    if !current_body.trim().is_empty() || !current_breadcrumb.is_empty() {
        sections.push((current_breadcrumb, current_body));
    }

    sections
}

/// Append `line` plus a newline to `body`, skipping blank lines while the body
/// is still empty.
fn push_body_line(body: &mut String, line: &str) {
    if !body.is_empty() || !line.trim().is_empty() {
        body.push_str(line);
        body.push('\n');
    }
}

/// If `line` starts (after at most three spaces) with a run of three or more
/// backticks or tildes, return (fence char, run length, text after the run).
fn fence_run(line: &str) -> Option<(char, usize, &str)> {
    let stripped = line.trim_start_matches(' ');
    if line.len() - stripped.len() > 3 {
        return None;
    }
    let marker = stripped
        .chars()
        .next()
        .filter(|&c| c == '`' || c == '~')?;
    let run = stripped.chars().take_while(|&c| c == marker).count();
    if run < 3 {
        return None;
    }
    // The marker is ASCII, so `run` is also a valid byte offset.
    Some((marker, run, &stripped[run..]))
}

/// Recognize an ATX heading at column 0 and return (level, title) with the
/// optional closing `#` sequence removed.
fn parse_atx_heading(line: &str) -> Option<(usize, &str)> {
    let level = line.bytes().take_while(|&b| b == b'#').count();
    if level == 0 || level > 6 {
        return None;
    }
    let rest = &line[level..];
    let is_blank = |c: char| c == ' ' || c == '\t';
    if !(rest.is_empty() || rest.starts_with(is_blank)) {
        return None;
    }

    let title = rest.trim();
    // A trailing run of '#' is a closing sequence only when it stands alone or
    // is separated from the title by whitespace ("C#" keeps its '#').
    let without_closer = title.trim_end_matches('#');
    let has_closer = without_closer.len() < title.len()
        && (without_closer.is_empty() || without_closer.ends_with(is_blank));
    let title = if has_closer {
        without_closer.trim_end()
    } else {
        title
    };
    Some((level, title))
}
203
/// Assemble the chunk text for a single git commit.
///
/// The hash, date and subject always appear; the trimmed `body` is appended
/// after one blank line only when it contains non-whitespace text:
///
/// ```text
/// [commit] <hash>
/// Date: <ISO date>
/// <subject line>
///
/// <body>
/// ```
pub fn build_commit_chunk(hash: &str, date_str: &str, subject: &str, body: &str) -> String {
    let header = format!("[commit] {hash}\nDate: {date_str}\n{subject}");
    match body.trim() {
        "" => header,
        details => header + "\n\n" + details,
    }
}
223
/// Collapse soft line-wraps in prose.
///
/// Every line is trimmed. Consecutive non-blank lines are joined with a single
/// space; any run of one or more blank lines between two non-blank lines
/// becomes exactly one paragraph break (`"\n\n"`). Blank lines before the
/// first or after the last non-blank line contribute nothing, so the result
/// never starts or ends with a newline.
fn collapse_line_wraps(text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    // Set once a blank line has been seen after some text; consumed by the
    // next non-blank line, so trailing blank lines are never emitted.
    let mut pending_break = false;

    for line in text.lines().map(str::trim) {
        if line.is_empty() {
            // Leading blank lines (nothing written yet) are ignored.
            pending_break = !result.is_empty();
            continue;
        }
        if !result.is_empty() {
            result.push_str(if pending_break { "\n\n" } else { " " });
        }
        result.push_str(line);
        pending_break = false;
    }
    result
}
249
#[cfg(test)]
mod tests {
    use super::*;

    /// Symbol fixture shared by the symbol-chunk tests.
    fn sample_symbol(kind: &str, parent: Option<&str>) -> SymbolRow {
        SymbolRow {
            rowid: 1,
            file: "src/lib.rs".to_string(),
            name: "open".to_string(),
            kind: kind.to_string(),
            start_line: 42,
            end_line: 60,
            parent: parent.map(str::to_string),
        }
    }

    #[test]
    fn test_collapse_line_wraps() {
        let text = "This is a long\nsentence that wraps.\n\nNew paragraph here.";
        // Exact match pins both the space join and the paragraph break.
        assert_eq!(
            collapse_line_wraps(text),
            "This is a long sentence that wraps.\n\nNew paragraph here."
        );
    }

    #[test]
    fn test_build_symbol_chunk() {
        let sym = sample_symbol("function", None);
        let chunk = build_symbol_chunk(
            &sym,
            Some("Opens the database connection."),
            &["main".to_string()],
            &["connect".to_string()],
            &[],
        );
        // The empty co-change list must not leave a dangling line.
        assert_eq!(
            chunk,
            "[function] open -- src/lib.rs:42\n\
             Opens the database connection.\n\
             Callers: main\n\
             Callees: connect"
        );
    }

    #[test]
    fn test_build_symbol_chunk_parent_and_cap() {
        let sym = sample_symbol("method", Some("Db"));
        let callers: Vec<String> = (1..=6).map(|i| format!("c{i}")).collect();
        let chunk = build_symbol_chunk(&sym, None, &callers, &[], &["a.rs".to_string()]);
        // Only the first five callers are listed; the name is parent-qualified.
        assert_eq!(
            chunk,
            "[method] Db.open -- src/lib.rs:42\n\
             Callers: c1, c2, c3, c4, c5\n\
             Co-changes with: a.rs"
        );
    }

    #[test]
    fn test_split_markdown_sections_basic() {
        let md = "# Title\n\nIntro text.\n\n## Section A\n\nBody A.\n\n## Section B\n\nBody B.\n";
        let sections = split_markdown_sections(md);
        let summary: Vec<(&str, &str)> = sections
            .iter()
            .map(|(crumb, body)| (crumb.as_str(), body.trim()))
            .collect();
        assert_eq!(
            summary,
            vec![
                ("Title", "Intro text."),
                ("Title > Section A", "Body A."),
                ("Title > Section B", "Body B."),
            ]
        );
    }

    #[test]
    fn test_split_markdown_sections_empty() {
        let sections = split_markdown_sections("");
        assert!(sections.is_empty());
    }

    #[test]
    fn test_build_markdown_chunk() {
        let chunk = build_markdown_chunk("docs/README.md", "Title > Section A", "Body text.");
        assert_eq!(
            chunk,
            "[doc] docs/README.md\nTitle > Section A\n\nBody text."
        );
    }

    #[test]
    fn test_build_commit_chunk() {
        let chunk = build_commit_chunk(
            "abc1234",
            "2026-01-15",
            "feat: add semantic search",
            "Longer description here.",
        );
        assert_eq!(
            chunk,
            "[commit] abc1234\nDate: 2026-01-15\nfeat: add semantic search\n\nLonger description here."
        );
    }
}