1use std::fs;
2use std::path::{Path, PathBuf};
3
4use anyhow::Context;
5use roder_api::code_index::{CodeByteRange, CodeChunk, CodeLineRange};
6
7use crate::hex_sha256;
8use crate::merkle::{FileManifestEntry, hash_path};
9
/// Spacing, in lines, of the fallback chunk boundaries: a boundary is forced at
/// every line index that is a multiple of this value, so no chunk spans more
/// lines than this.
const MAX_CHUNK_LINES: usize = 80;
11
12#[derive(Debug, Clone, PartialEq, Eq)]
13pub struct ChunkedFile {
14 pub path: PathBuf,
15 pub chunks: Vec<CodeChunk>,
16}
17
18pub fn chunk_workspace(
19 workspace_root: impl AsRef<Path>,
20 files: &[FileManifestEntry],
21) -> anyhow::Result<Vec<CodeChunk>> {
22 let workspace_root = workspace_root.as_ref();
23 let mut chunks = Vec::new();
24 for file in files {
25 let path = workspace_root.join(&file.path);
26 let bytes = fs::read(&path)
27 .with_context(|| format!("read source chunk file {}", path.display()))?;
28 let Ok(text) = String::from_utf8(bytes) else {
29 continue;
30 };
31 chunks.extend(chunk_text_file(&file.path, &text));
32 }
33 Ok(chunks)
34}
35
36pub fn chunk_text_file(path: impl AsRef<Path>, text: &str) -> Vec<CodeChunk> {
37 let path = path.as_ref();
38 if text.is_empty() {
39 return Vec::new();
40 }
41
42 let lines = line_spans(text);
43 if lines.is_empty() {
44 return Vec::new();
45 }
46
47 let mut boundaries = vec![0usize];
48 for (idx, line) in lines.iter().enumerate().skip(1) {
49 if is_symbol_boundary(&text[line.start..line.end]) {
50 boundaries.push(idx);
51 }
52 }
53 push_fallback_boundaries(&mut boundaries, lines.len());
54 boundaries.sort_unstable();
55 boundaries.dedup();
56
57 let mut chunks = Vec::new();
58 for (boundary_idx, start_line_idx) in boundaries.iter().enumerate() {
59 let end_line_idx = boundaries
60 .get(boundary_idx + 1)
61 .copied()
62 .unwrap_or(lines.len());
63 if end_line_idx <= *start_line_idx {
64 continue;
65 }
66 let start_byte = lines[*start_line_idx].start;
67 let end_byte = lines[end_line_idx - 1].end_with_newline;
68 if start_byte >= end_byte {
69 continue;
70 }
71 let source = &text[start_byte..end_byte];
72 let content_hash = hex_sha256(source);
73 let language = language_for_path(path);
74 let symbol_hint = symbol_hint(source);
75 let path_hash = hash_path(path);
76 let chunk_material = format!(
77 "{}\0{}\0{}\0{}",
78 path.display(),
79 start_byte,
80 end_byte,
81 content_hash
82 );
83 chunks.push(CodeChunk {
84 chunk_hash: hex_sha256(chunk_material),
85 path: path.to_path_buf(),
86 path_hash,
87 byte_range: CodeByteRange {
88 start: start_byte as u64,
89 end: end_byte as u64,
90 },
91 line_range: CodeLineRange {
92 start: (*start_line_idx + 1) as u32,
93 end: end_line_idx as u32,
94 },
95 content_hash,
96 language: language.map(str::to_string),
97 symbol_hint,
98 });
99 }
100
101 chunks
102}
103
/// Byte offsets of a single line inside the text it was taken from.
#[derive(Debug, Clone, Copy)]
struct LineSpan {
    /// Offset of the line's first byte.
    start: usize,
    /// Offset just past the line's content: excludes the trailing `\n` and any
    /// run of `\r` directly before it.
    end: usize,
    /// Offset just past the line terminator. Equals `end` for a final line
    /// that has no trailing `\n`.
    end_with_newline: usize,
}

/// Computes the [`LineSpan`] of every line in `text`, in order.
///
/// Lines are delimited by `\n`. The spans are contiguous and together cover
/// the whole text: each span's `end_with_newline` is the next span's `start`,
/// and the last one equals `text.len()`. A trailing `\n` does not produce an
/// extra empty line, and empty text yields no spans.
fn line_spans(text: &str) -> Vec<LineSpan> {
    // `split_inclusive` yields every byte of `text` exactly once, so a running
    // offset is enough to place each line; no tail handling is needed after it.
    let mut offset = 0usize;
    text.split_inclusive('\n')
        .map(|raw| {
            let start = offset;
            let end_with_newline = start + raw.len();
            let end = match raw.strip_suffix('\n') {
                Some(body) => start + body.trim_end_matches('\r').len(),
                // Final line without a terminator.
                None => end_with_newline,
            };
            offset = end_with_newline;
            LineSpan {
                start,
                end,
                end_with_newline,
            }
        })
        .collect()
}
135
136fn push_fallback_boundaries(boundaries: &mut Vec<usize>, line_count: usize) {
137 let mut cursor = MAX_CHUNK_LINES;
138 while cursor < line_count {
139 boundaries.push(cursor);
140 cursor += MAX_CHUNK_LINES;
141 }
142}
143
144fn is_symbol_boundary(line: &str) -> bool {
145 let trimmed = line.trim_start();
146 SYMBOL_PREFIXES
147 .iter()
148 .any(|prefix| trimmed.starts_with(prefix))
149}
150
151fn symbol_hint(source: &str) -> Option<String> {
152 source.lines().find_map(|line| {
153 let trimmed = line.trim_start();
154 let matched = SYMBOL_PREFIXES
155 .iter()
156 .find(|prefix| trimmed.starts_with(**prefix))?;
157 let rest = trimmed.trim_start_matches(matched).trim_start();
158 let name = rest
159 .split(|ch: char| !(ch.is_ascii_alphanumeric() || ch == '_'))
160 .find(|part| !part.is_empty())?;
161 Some(name.to_string())
162 })
163}
164
/// Maps the extension of `path` to a language identifier.
///
/// The comparison is exact (case-sensitive). Returns `None` when the path has
/// no extension, the extension is not valid UTF-8, or it is not in the table.
fn language_for_path(path: &Path) -> Option<&'static str> {
    /// `(file extension, language identifier)` pairs.
    const EXTENSION_LANGUAGES: &[(&str, &str)] = &[
        ("rs", "rust"),
        ("go", "go"),
        ("ts", "typescript"),
        ("tsx", "tsx"),
        ("js", "javascript"),
        ("jsx", "jsx"),
        ("py", "python"),
        ("java", "java"),
        ("kt", "kotlin"),
        ("swift", "swift"),
    ];

    let extension = path.extension()?.to_str()?;
    EXTENSION_LANGUAGES
        .iter()
        .find(|(known, _)| *known == extension)
        .map(|(_, language)| *language)
}
180
/// Keywords that mark the start of a symbol declaration.
///
/// `is_symbol_boundary` and `symbol_hint` compare these against a line with
/// its leading whitespace removed. Every entry ends with a space, so an
/// identifier that merely begins with a keyword (e.g. `constant`) does not
/// match.
const SYMBOL_PREFIXES: &[&str] = &[
    // Functions (Rust).
    "pub async fn ",
    "pub fn ",
    "async fn ",
    "fn ",
    // Type-level items (Rust).
    "pub struct ",
    "struct ",
    "pub enum ",
    "enum ",
    "pub trait ",
    "trait ",
    "impl ",
    // Declarations common in other languages.
    "class ",
    "def ",
    "function ",
    "export function ",
    // Bindings.
    "const ",
    "let ",
];
200
#[cfg(test)]
mod tests {
    use super::*;

    /// Declaration keywords open new chunks, and each chunk reports 1-based
    /// line numbers plus the name of the symbol it starts with.
    #[test]
    fn chunk_uses_symbol_boundaries_and_line_ranges() {
        let source = "pub fn first() {\n}\n\npub struct Second;\nimpl Second {\n}\n";
        let chunks = chunk_text_file("src/lib.rs", source);

        assert_eq!(chunks.len(), 3);

        let first = &chunks[0];
        assert_eq!((first.line_range.start, first.line_range.end), (1, 3));

        let second = &chunks[1];
        assert_eq!(second.line_range.start, 4);
        assert_eq!(second.symbol_hint.as_deref(), Some("Second"));

        let third = &chunks[2];
        assert_eq!(third.line_range.start, 5);
        assert_eq!(third.symbol_hint.as_deref(), Some("Second"));
    }

    /// Text without any declaration keyword is cut into fixed-size line
    /// windows, with a shorter final window.
    #[test]
    fn chunk_falls_back_to_bounded_line_ranges() {
        let mut source = String::new();
        for idx in 0..170 {
            source.push_str(&format!("// line {idx}\n"));
        }
        let chunks = chunk_text_file("notes.txt", &source);

        assert_eq!(chunks.len(), 3);
        let ranges: Vec<(u32, u32)> = chunks
            .iter()
            .map(|chunk| (chunk.line_range.start, chunk.line_range.end))
            .collect();
        assert_eq!(ranges, [(1, 80), (81, 160), (161, 170)]);
    }
}