Skip to main content

roder_code_index/
chunk.rs

1use std::fs;
2use std::path::{Path, PathBuf};
3
4use anyhow::Context;
5use roder_api::code_index::{CodeByteRange, CodeChunk, CodeLineRange};
6
7use crate::hex_sha256;
8use crate::merkle::{FileManifestEntry, hash_path};
9
/// Upper bound on the number of lines in a single chunk.
///
/// `push_fallback_boundaries` adds a chunk boundary at every multiple of this
/// value, so consecutive boundaries are never more than this many lines apart
/// even when no symbol prefix is found in the text.
const MAX_CHUNK_LINES: usize = 80;
11
12#[derive(Debug, Clone, PartialEq, Eq)]
13pub struct ChunkedFile {
14    pub path: PathBuf,
15    pub chunks: Vec<CodeChunk>,
16}
17
18pub fn chunk_workspace(
19    workspace_root: impl AsRef<Path>,
20    files: &[FileManifestEntry],
21) -> anyhow::Result<Vec<CodeChunk>> {
22    let workspace_root = workspace_root.as_ref();
23    let mut chunks = Vec::new();
24    for file in files {
25        let path = workspace_root.join(&file.path);
26        let bytes = fs::read(&path)
27            .with_context(|| format!("read source chunk file {}", path.display()))?;
28        let Ok(text) = String::from_utf8(bytes) else {
29            continue;
30        };
31        chunks.extend(chunk_text_file(&file.path, &text));
32    }
33    Ok(chunks)
34}
35
36pub fn chunk_text_file(path: impl AsRef<Path>, text: &str) -> Vec<CodeChunk> {
37    let path = path.as_ref();
38    if text.is_empty() {
39        return Vec::new();
40    }
41
42    let lines = line_spans(text);
43    if lines.is_empty() {
44        return Vec::new();
45    }
46
47    let mut boundaries = vec![0usize];
48    for (idx, line) in lines.iter().enumerate().skip(1) {
49        if is_symbol_boundary(&text[line.start..line.end]) {
50            boundaries.push(idx);
51        }
52    }
53    push_fallback_boundaries(&mut boundaries, lines.len());
54    boundaries.sort_unstable();
55    boundaries.dedup();
56
57    let mut chunks = Vec::new();
58    for (boundary_idx, start_line_idx) in boundaries.iter().enumerate() {
59        let end_line_idx = boundaries
60            .get(boundary_idx + 1)
61            .copied()
62            .unwrap_or(lines.len());
63        if end_line_idx <= *start_line_idx {
64            continue;
65        }
66        let start_byte = lines[*start_line_idx].start;
67        let end_byte = lines[end_line_idx - 1].end_with_newline;
68        if start_byte >= end_byte {
69            continue;
70        }
71        let source = &text[start_byte..end_byte];
72        let content_hash = hex_sha256(source);
73        let language = language_for_path(path);
74        let symbol_hint = symbol_hint(source);
75        let path_hash = hash_path(path);
76        let chunk_material = format!(
77            "{}\0{}\0{}\0{}",
78            path.display(),
79            start_byte,
80            end_byte,
81            content_hash
82        );
83        chunks.push(CodeChunk {
84            chunk_hash: hex_sha256(chunk_material),
85            path: path.to_path_buf(),
86            path_hash,
87            byte_range: CodeByteRange {
88                start: start_byte as u64,
89                end: end_byte as u64,
90            },
91            line_range: CodeLineRange {
92                start: (*start_line_idx + 1) as u32,
93                end: end_line_idx as u32,
94            },
95            content_hash,
96            language: language.map(str::to_string),
97            symbol_hint,
98        });
99    }
100
101    chunks
102}
103
/// Byte offsets describing one line of a text buffer.
#[derive(Debug, Clone, Copy)]
struct LineSpan {
    /// Offset of the first byte of the line.
    start: usize,
    /// Offset just past the line's content. For a line terminated by `\n`,
    /// the `\n` and any `\r` characters directly before it are excluded.
    end: usize,
    /// Offset just past the line including its terminator; equal to `end`
    /// for a final line that has no trailing `\n`.
    end_with_newline: usize,
}

/// Computes a [`LineSpan`] for every line of `text`, in order.
///
/// Lines are delimited by `\n`. The spans tile the whole input: each span's
/// `start` equals the previous span's `end_with_newline`, and the last span
/// ends at `text.len()`. Empty input yields no spans.
fn line_spans(text: &str) -> Vec<LineSpan> {
    let mut start = 0usize;
    // `split_inclusive` already yields a final unterminated line, so no
    // separate handling of trailing text is needed after the iteration.
    text.split_inclusive('\n')
        .map(|line| {
            let end_with_newline = start + line.len();
            // Only lines that actually end in `\n` get `\r` trimmed; a final
            // line without a terminator is kept as-is.
            let content = match line.strip_suffix('\n') {
                Some(body) => body.trim_end_matches('\r'),
                None => line,
            };
            let span = LineSpan {
                start,
                end: start + content.len(),
                end_with_newline,
            };
            start = end_with_newline;
            span
        })
        .collect()
}
135
136fn push_fallback_boundaries(boundaries: &mut Vec<usize>, line_count: usize) {
137    let mut cursor = MAX_CHUNK_LINES;
138    while cursor < line_count {
139        boundaries.push(cursor);
140        cursor += MAX_CHUNK_LINES;
141    }
142}
143
144fn is_symbol_boundary(line: &str) -> bool {
145    let trimmed = line.trim_start();
146    SYMBOL_PREFIXES
147        .iter()
148        .any(|prefix| trimmed.starts_with(prefix))
149}
150
151fn symbol_hint(source: &str) -> Option<String> {
152    source.lines().find_map(|line| {
153        let trimmed = line.trim_start();
154        let matched = SYMBOL_PREFIXES
155            .iter()
156            .find(|prefix| trimmed.starts_with(**prefix))?;
157        let rest = trimmed.trim_start_matches(matched).trim_start();
158        let name = rest
159            .split(|ch: char| !(ch.is_ascii_alphanumeric() || ch == '_'))
160            .find(|part| !part.is_empty())?;
161        Some(name.to_string())
162    })
163}
164
/// Maps the file extension of `path` to a language label.
///
/// The comparison is case-sensitive. Returns `None` when the path has no
/// extension, the extension is not valid UTF-8, or it is not one of the
/// extensions listed below.
fn language_for_path(path: &Path) -> Option<&'static str> {
    let extension = path.extension()?.to_str()?;
    let language = match extension {
        "rs" => "rust",
        "go" => "go",
        "ts" => "typescript",
        "tsx" => "tsx",
        "js" => "javascript",
        "jsx" => "jsx",
        "py" => "python",
        "java" => "java",
        "kt" => "kotlin",
        "swift" => "swift",
        _ => return None,
    };
    Some(language)
}
180
/// Line prefixes that mark the start of a symbol definition.
///
/// `is_symbol_boundary` and `symbol_hint` compare these against a line after
/// stripping its leading whitespace. Every entry ends with a space, so the
/// keyword must be followed by a space to match (a bare `fn` or `impl<T>`
/// does not).
//
// NOTE(review): `language_for_path` recognises Go, Kotlin and Swift files,
// but no entry here matches their function keywords (`func `, `fun `) —
// confirm whether that omission is intentional.
const SYMBOL_PREFIXES: &[&str] = &[
    "pub async fn ",
    "pub fn ",
    "async fn ",
    "fn ",
    "pub struct ",
    "struct ",
    "pub enum ",
    "enum ",
    "pub trait ",
    "trait ",
    "impl ",
    "class ",
    "def ",
    "function ",
    "export function ",
    "const ",
    "let ",
];
200
#[cfg(test)]
mod tests {
    use super::*;

    /// Lines that start a symbol open a new chunk, and each chunk reports
    /// 1-based line numbers plus the name of the symbol it starts with.
    #[test]
    fn chunk_uses_symbol_boundaries_and_line_ranges() {
        let chunks = chunk_text_file(
            "src/lib.rs",
            "pub fn first() {\n}\n\npub struct Second;\nimpl Second {\n}\n",
        );

        let [first, second, third] = chunks.as_slice() else {
            panic!("expected exactly 3 chunks, got {}", chunks.len());
        };

        assert_eq!((first.line_range.start, first.line_range.end), (1, 3));

        assert_eq!(second.line_range.start, 4);
        assert_eq!(second.symbol_hint.as_deref(), Some("Second"));

        assert_eq!(third.line_range.start, 5);
        assert_eq!(third.symbol_hint.as_deref(), Some("Second"));
    }

    /// Text without any symbol prefix is still cut every 80 lines, with the
    /// remainder forming a shorter final chunk.
    #[test]
    fn chunk_falls_back_to_bounded_line_ranges() {
        let source: String = (0..170).map(|idx| format!("// line {idx}\n")).collect();

        let ranges: Vec<_> = chunk_text_file("notes.txt", &source)
            .iter()
            .map(|chunk| (chunk.line_range.start, chunk.line_range.end))
            .collect();

        assert_eq!(ranges, vec![(1, 80), (81, 160), (161, 170)]);
    }
}