starpod_memory/
indexer.rs1use sqlx::SqlitePool;
2
3use starpod_core::StarpodError;
4
/// A contiguous, line-aligned piece of a source document.
///
/// Values of this type are produced by `chunk_text` and stored one row per
/// chunk by `reindex_source`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    /// Identifier of the document this chunk was cut from (the `source`
    /// argument given to `chunk_text`, e.g. `"test.md"`).
    pub source: String,
    /// The chunk's lines joined with `\n`.
    pub text: String,
    /// 1-based number of the first line contained in the chunk.
    pub line_start: usize,
    /// 1-based number of the last line contained in the chunk (inclusive).
    pub line_end: usize,
}
13
/// Stock value for the `chunk_size` argument of `chunk_text`.
///
/// Measured the way `chunk_text` measures a chunk: the byte length of each
/// line plus one for its newline.
pub const CHUNK_SIZE: usize = 1_600;

/// Stock value for the `chunk_overlap` argument of `chunk_text`, in the same
/// unit as [`CHUNK_SIZE`].
pub const CHUNK_OVERLAP: usize = 320;
18
19pub fn chunk_text(source: &str, text: &str, chunk_size: usize, chunk_overlap: usize) -> Vec<Chunk> {
24 let lines: Vec<&str> = text.lines().collect();
25 if lines.is_empty() {
26 return Vec::new();
27 }
28
29 let mut chunks = Vec::new();
30 let mut start_line = 0;
31
32 while start_line < lines.len() {
33 let mut char_count = 0;
34 let mut end_line = start_line;
35
36 while end_line < lines.len() && char_count < chunk_size {
38 char_count += lines[end_line].len() + 1; end_line += 1;
40 }
41
42 let chunk_text: String = lines[start_line..end_line].join("\n");
43 if !chunk_text.trim().is_empty() {
44 chunks.push(Chunk {
45 source: source.to_string(),
46 text: chunk_text,
47 line_start: start_line + 1, line_end: end_line,
49 });
50 }
51
52 let mut overlap_chars = 0;
54 let mut overlap_lines = 0;
55 for i in (start_line..end_line).rev() {
56 overlap_chars += lines[i].len() + 1;
57 overlap_lines += 1;
58 if overlap_chars >= chunk_overlap {
59 break;
60 }
61 }
62
63 let advance = end_line - start_line;
64 if advance <= overlap_lines {
65 start_line = end_line;
67 } else {
68 start_line = end_line - overlap_lines;
69 }
70 }
71
72 chunks
73}
74
75pub async fn reindex_source(
77 pool: &SqlitePool,
78 source: &str,
79 text: &str,
80 chunk_size: usize,
81 chunk_overlap: usize,
82) -> Result<(), StarpodError> {
83 sqlx::query("DELETE FROM memory_fts WHERE source = ?1")
85 .bind(source)
86 .execute(pool)
87 .await
88 .map_err(|e| StarpodError::Database(format!("Failed to delete old chunks: {}", e)))?;
89
90 let chunks = chunk_text(source, text, chunk_size, chunk_overlap);
92 for chunk in &chunks {
93 sqlx::query("INSERT INTO memory_fts (source, chunk_text, line_start, line_end) VALUES (?1, ?2, ?3, ?4)")
94 .bind(&chunk.source)
95 .bind(&chunk.text)
96 .bind(chunk.line_start as i64)
97 .bind(chunk.line_end as i64)
98 .execute(pool)
99 .await
100 .map_err(|e| StarpodError::Database(format!("Failed to insert chunk: {}", e)))?;
101 }
102
103 Ok(())
104}
105
/// Unit tests for `chunk_text`.
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds `count` distinct lines of exactly 20 bytes each, so that tests
    /// can tell which line ended up in which chunk.
    fn numbered_lines(count: usize) -> String {
        (0..count)
            .map(|i| format!("line-{:02}-{}", i, "a".repeat(12)))
            .collect::<Vec<_>>()
            .join("\n")
    }

    #[test]
    fn test_chunk_text_small() {
        let text = "line one\nline two\nline three";
        let chunks = chunk_text("test.md", text, CHUNK_SIZE, CHUNK_OVERLAP);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].source, "test.md");
        assert_eq!(chunks[0].text, text);
        assert_eq!(chunks[0].line_start, 1);
        assert_eq!(chunks[0].line_end, 3);
    }

    #[test]
    fn test_chunk_text_empty() {
        let chunks = chunk_text("test.md", "", CHUNK_SIZE, CHUNK_OVERLAP);
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_chunk_text_whitespace_only() {
        // Chunks whose text trims to nothing must not be emitted.
        let chunks = chunk_text("test.md", "   \n\t\n  ", CHUNK_SIZE, CHUNK_OVERLAP);
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_chunk_text_large() {
        let line = "x".repeat(200);
        let text: String = (0..20)
            .map(|_| line.as_str())
            .collect::<Vec<_>>()
            .join("\n");
        let chunks = chunk_text("big.md", &text, CHUNK_SIZE, CHUNK_OVERLAP);
        assert!(chunks.len() > 1, "Should produce multiple chunks");

        // Together the chunks must span the whole input.
        assert_eq!(chunks[0].line_start, 1);
        assert_eq!(chunks[chunks.len() - 1].line_end, 20);

        for chunk in &chunks {
            assert!(!chunk.text.trim().is_empty());
            assert_eq!(chunk.source, "big.md");
            assert!(chunk.line_start <= chunk.line_end);
            // The recorded line range must match the text actually stored.
            assert_eq!(
                chunk.text.lines().count(),
                chunk.line_end - chunk.line_start + 1
            );
        }
    }

    #[test]
    fn test_chunk_text_custom_sizes() {
        // Distinct lines: an overlap check on identical lines would pass
        // even if consecutive chunks shared nothing.
        let long_text = numbered_lines(20);

        let chunks_default = chunk_text("test.md", &long_text, CHUNK_SIZE, CHUNK_OVERLAP);
        // 21 bytes per line (incl. newline): 5 lines per chunk, 2 lines of overlap.
        let chunks_small = chunk_text("test.md", &long_text, 100, 25);

        assert!(
            chunks_small.len() > chunks_default.len(),
            "Small chunk_size ({} chunks) should produce more chunks than default ({} chunks)",
            chunks_small.len(),
            chunks_default.len(),
        );
        assert!(chunks_small.len() >= 2, "Need at least two chunks to check overlap");

        for (i, pair) in chunks_small.windows(2).enumerate() {
            let (current, next) = (&pair[0], &pair[1]);

            assert!(
                next.line_start > current.line_start,
                "Chunk {} does not advance past chunk {}",
                i + 1,
                i,
            );
            assert!(
                next.line_start <= current.line_end,
                "Line ranges of chunk {} and chunk {} do not overlap",
                i,
                i + 1,
            );

            let first_next_line = next.text.lines().next().expect("chunk text is non-empty");
            assert!(
                current.text.lines().any(|l| l == first_next_line),
                "Overlap not respected between chunk {} and chunk {}: first line of next chunk not found in current chunk",
                i,
                i + 1,
            );
        }
    }
}