gobby_code/index/
chunker.rs1use crate::models::ContentChunk;
14
/// Upper bound on the number of lines placed in one chunk window.
const CHUNK_SIZE: usize = 100;
/// Number of lines subtracted from `CHUNK_SIZE` to get the stride between window starts.
const CHUNK_OVERLAP: usize = 10;
17
18pub fn chunk_file_content(
20 source: &[u8],
21 rel_path: &str,
22 project_id: &str,
23 language: Option<&str>,
24) -> Vec<ContentChunk> {
25 let text = String::from_utf8_lossy(source);
26 let lines: Vec<&str> = text.split('\n').collect();
27 if lines.is_empty() {
28 return Vec::new();
29 }
30
31 let step = CHUNK_SIZE.saturating_sub(CHUNK_OVERLAP).max(1);
32 let mut chunks = Vec::new();
33 let mut chunk_index: usize = 0;
34 let mut start = 0;
35
36 while start < lines.len() {
37 let end = (start + CHUNK_SIZE).min(lines.len());
38 let chunk_content: String = lines[start..end].join("\n");
39
40 if !chunk_content.trim().is_empty() {
41 chunks.push(ContentChunk {
42 id: ContentChunk::make_id(project_id, rel_path, chunk_index),
43 project_id: project_id.to_string(),
44 file_path: rel_path.to_string(),
45 chunk_index,
46 line_start: start + 1,
47 line_end: end,
48 content: chunk_content,
49 language: language.unwrap_or("unknown").to_string(),
50 created_at: epoch_secs_str(),
51 });
52 chunk_index += 1;
53 }
54
55 if end >= lines.len() {
56 break;
57 }
58 start += step;
59 }
60
61 chunks
62}
63
/// Returns the current Unix time, in whole seconds, as a decimal string.
///
/// If the system clock reads earlier than the Unix epoch, `duration_since` fails
/// and the zero duration is used instead, so the result is `"0"` rather than an error.
fn epoch_secs_str() -> String {
    use std::time::SystemTime;

    SystemTime::now()
        .duration_since(SystemTime::UNIX_EPOCH)
        .unwrap_or_default()
        .as_secs()
        .to_string()
}
73
#[cfg(test)]
mod tests {
    /// Checks this module's own source text: one documentation phrase must be
    /// present, and none of a fixed set of import lines may appear.
    #[test]
    fn chunker_stays_gcode_owned_with_documented_narrowing() {
        let own_source = include_str!("chunker.rs");

        // Every needle is assembled from fragments at runtime so that this test's
        // own text never contains a needle verbatim.
        let required_phrase = format!("{}{}", "line-based `ContentChunk`", " records");
        assert!(own_source.contains(&required_phrase));

        let import_prefix = "use gobby_core";
        let banned_tails = [
            "::indexing::Chunk",
            "::indexing::ChunkIdentity",
            "::indexing::IndexEvent",
            "::indexing::index_events_from_hashes",
        ];
        for tail in banned_tails {
            let banned_import = format!("{import_prefix}{tail}");
            assert!(!own_source.contains(&banned_import));
        }
    }
}