1use crate::embedder::f32_to_bytes;
10use crate::errors::AppError;
11use rusqlite::{params, Connection};
12
/// One row of the `memory_chunks` table: a slice of a memory's body together
/// with its position and size metadata.
#[derive(Clone, Debug)]
pub struct Chunk {
    /// Value stored in `memory_chunks.memory_id`; identifies the owning memory.
    pub memory_id: i64,
    /// Position of this chunk within its memory; `get_chunks_by_memory`
    /// orders its results by this field.
    pub chunk_idx: i32,
    /// Text content of the chunk.
    pub chunk_text: String,
    /// Start offset of the chunk.
    /// NOTE(review): presumably an offset into the memory body; the unit
    /// (bytes vs. characters) is not visible in this file.
    pub start_offset: i32,
    /// End offset of the chunk (same caveat as `start_offset`).
    pub end_offset: i32,
    /// Token count recorded for the chunk.
    pub token_count: i32,
}
22
23pub fn insert_chunks(conn: &Connection, chunks: &[Chunk]) -> Result<(), AppError> {
24 for chunk in chunks {
25 conn.execute(
26 "INSERT INTO memory_chunks (memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count)
27 VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
28 params![
29 chunk.memory_id,
30 chunk.chunk_idx,
31 chunk.chunk_text,
32 chunk.start_offset,
33 chunk.end_offset,
34 chunk.token_count,
35 ],
36 )?;
37 }
38 Ok(())
39}
40
41pub fn insert_chunk_slices(
42 conn: &Connection,
43 memory_id: i64,
44 body: &str,
45 chunks: &[crate::chunking::Chunk],
46) -> Result<(), AppError> {
47 for (chunk_idx, chunk) in chunks.iter().enumerate() {
48 conn.execute(
49 "INSERT INTO memory_chunks (memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count)
50 VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
51 params![
52 memory_id,
53 chunk_idx as i32,
54 crate::chunking::chunk_text(body, chunk),
55 chunk.start_offset as i32,
56 chunk.end_offset as i32,
57 chunk.token_count_approx as i32,
58 ],
59 )?;
60 }
61 Ok(())
62}
63
64pub fn upsert_chunk_vec(
65 conn: &Connection,
66 _rowid: i64,
67 memory_id: i64,
68 chunk_idx: i32,
69 embedding: &[f32],
70) -> Result<(), AppError> {
71 conn.execute(
72 "INSERT OR REPLACE INTO chunk_embeddings(chunk_id, memory_id, embedding, source, model, dim)
73 VALUES (
74 (SELECT id FROM memory_chunks WHERE memory_id = ?1 AND chunk_idx = ?2),
75 ?1, ?3, 'llm-headless', ?4, ?5
76 )",
77 params![
78 memory_id,
79 chunk_idx,
80 f32_to_bytes(embedding),
81 crate::constants::SQLITE_GRAPHRAG_VERSION,
82 crate::constants::EMBEDDING_DIM as i64,
83 ],
84 )?;
85 Ok(())
86}
87
88pub fn delete_chunks(conn: &Connection, memory_id: i64) -> Result<(), AppError> {
89 conn.execute(
90 "DELETE FROM memory_chunks WHERE memory_id = ?1",
91 params![memory_id],
92 )?;
93 Ok(())
94}
95
96pub fn knn_search_chunks(
97 conn: &Connection,
98 embedding: &[f32],
99 k: usize,
100) -> Result<Vec<(i64, i32, f32)>, AppError> {
101 if embedding.len() != crate::constants::EMBEDDING_DIM {
102 return Err(AppError::Embedding(format!(
103 "knn_search_chunks embedding has {} dims, expected {}",
104 embedding.len(),
105 crate::constants::EMBEDDING_DIM
106 )));
107 }
108 let mut stmt =
112 conn.prepare_cached("SELECT chunk_id, memory_id, embedding FROM chunk_embeddings")?;
113 let mut scored: Vec<(i64, i32, f32)> = stmt
114 .query_map([], |r| {
115 let chunk_id: i64 = r.get(0)?;
116 let memory_id: i64 = r.get(1)?;
117 let bytes: Vec<u8> = r.get(2)?;
118 Ok((chunk_id, memory_id, bytes))
119 })?
120 .filter_map(|row| {
121 row.ok().and_then(|(_, memory_id, bytes)| {
122 let stored = crate::embedder::bytes_to_f32(&bytes);
123 if stored.len() != embedding.len() {
124 return None;
125 }
126 let score = crate::similarity::cosine_similarity(embedding, &stored);
127 Some((memory_id, 0, score))
128 })
129 })
130 .collect();
131 scored.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
132 scored.truncate(k);
133 Ok(scored)
134}
135
136pub fn get_chunks_by_memory(conn: &Connection, memory_id: i64) -> Result<Vec<Chunk>, AppError> {
137 let mut stmt = conn.prepare_cached(
138 "SELECT memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count
139 FROM memory_chunks WHERE memory_id = ?1 ORDER BY chunk_idx",
140 )?;
141 let rows = stmt
142 .query_map(params![memory_id], |r| {
143 Ok(Chunk {
144 memory_id: r.get(0)?,
145 chunk_idx: r.get(1)?,
146 chunk_text: r.get(2)?,
147 start_offset: r.get(3)?,
148 end_offset: r.get(4)?,
149 token_count: r.get(5)?,
150 })
151 })?
152 .collect::<Result<Vec<_>, _>>()?;
153 Ok(rows)
154}
155
#[cfg(test)]
mod tests {
    use super::*;
    use crate::constants::EMBEDDING_DIM;
    use crate::storage::connection::register_vec_extension;
    use rusqlite::Connection;
    use tempfile::TempDir;

    /// Opens a fresh database file inside a temporary directory and runs the
    /// project migrations on it. The `TempDir` is returned alongside the
    /// connection so the directory stays alive for the duration of the test.
    fn setup_db() -> (TempDir, Connection) {
        register_vec_extension();
        let dir = TempDir::new().unwrap();
        let db_file = dir.path().join("test.db");
        let mut conn = Connection::open(&db_file).unwrap();
        crate::migrations::runner().run(&mut conn).unwrap();
        (dir, conn)
    }

    /// Inserts a single fixture row into `memories` and returns its rowid.
    fn insert_memory(conn: &Connection) -> i64 {
        conn.execute(
            "INSERT INTO memories (namespace, name, type, description, body, body_hash)
             VALUES ('global', 'test-mem', 'user', 'desc', 'body', 'hash1')",
            [],
        )
        .unwrap();
        conn.last_insert_rowid()
    }

    /// Builds a `Chunk` fixture without repeating the struct literal.
    fn make_chunk(
        memory_id: i64,
        chunk_idx: i32,
        text: &str,
        start_offset: i32,
        end_offset: i32,
        token_count: i32,
    ) -> Chunk {
        Chunk {
            memory_id,
            chunk_idx,
            chunk_text: text.to_string(),
            start_offset,
            end_offset,
            token_count,
        }
    }

    #[test]
    fn test_insert_chunks_empty_ok() {
        let (_dir, conn) = setup_db();
        assert!(insert_chunks(&conn, &[]).is_ok());
    }

    #[test]
    fn test_insert_chunks_and_get_by_memory() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        let fixtures = vec![
            make_chunk(memory_id, 0, "primeiro chunk", 0, 14, 3),
            make_chunk(memory_id, 1, "segundo chunk", 15, 28, 3),
        ];
        insert_chunks(&conn, &fixtures).unwrap();

        let fetched = get_chunks_by_memory(&conn, memory_id).unwrap();
        assert_eq!(fetched.len(), 2);

        let first = &fetched[0];
        assert_eq!(first.chunk_idx, 0);
        assert_eq!(first.chunk_text, "primeiro chunk");
        assert_eq!(first.start_offset, 0);
        assert_eq!(first.end_offset, 14);
        assert_eq!(first.token_count, 3);

        let second = &fetched[1];
        assert_eq!(second.chunk_idx, 1);
        assert_eq!(second.chunk_text, "segundo chunk");
    }

    #[test]
    fn test_get_chunks_missing_memory_returns_empty() {
        let (_dir, conn) = setup_db();
        let fetched = get_chunks_by_memory(&conn, 9999).unwrap();
        assert!(fetched.is_empty());
    }

    #[test]
    fn test_delete_chunks_removes_all() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        let fixtures = vec![
            make_chunk(memory_id, 0, "chunk a", 0, 7, 2),
            make_chunk(memory_id, 1, "chunk b", 8, 15, 2),
        ];
        insert_chunks(&conn, &fixtures).unwrap();

        delete_chunks(&conn, memory_id).unwrap();

        let remaining = get_chunks_by_memory(&conn, memory_id).unwrap();
        assert!(remaining.is_empty());
    }

    #[test]
    fn test_delete_chunks_memory_without_chunks_ok() {
        let (_dir, conn) = setup_db();
        assert!(delete_chunks(&conn, 9999).is_ok());
    }

    #[test]
    fn test_get_chunks_ordered_by_chunk_idx() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        // Deliberately inserted out of order: 2, 0, 1.
        let fixtures = vec![
            make_chunk(memory_id, 2, "terceiro", 20, 28, 1),
            make_chunk(memory_id, 0, "primeiro", 0, 8, 1),
            make_chunk(memory_id, 1, "segundo", 9, 16, 1),
        ];
        insert_chunks(&conn, &fixtures).unwrap();

        let fetched = get_chunks_by_memory(&conn, memory_id).unwrap();
        let order: Vec<i32> = fetched.iter().map(|c| c.chunk_idx).collect();
        assert_eq!(order, vec![0, 1, 2]);
    }

    #[test]
    fn test_upsert_chunk_vec_and_knn_search() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        insert_chunks(&conn, &[make_chunk(memory_id, 0, "embedding test", 0, 14, 2)]).unwrap();

        // Unit vector along the first axis.
        let mut embedding = vec![0.0f32; EMBEDDING_DIM];
        embedding[0] = 1.0;

        let chunk_id: i64 = conn
            .query_row(
                "SELECT id FROM memory_chunks WHERE memory_id = ?1 AND chunk_idx = 0",
                params![memory_id],
                |r| r.get(0),
            )
            .unwrap();

        upsert_chunk_vec(&conn, chunk_id, memory_id, 0, &embedding).unwrap();

        let hits = knn_search_chunks(&conn, &embedding, 1).unwrap();
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].0, memory_id);
        assert_eq!(hits[0].1, 0);
    }

    #[test]
    fn test_knn_search_chunks_without_data_returns_empty() {
        let (_dir, conn) = setup_db();
        let query = vec![0.0f32; EMBEDDING_DIM];
        let hits = knn_search_chunks(&conn, &query, 5).unwrap();
        assert!(hits.is_empty());
    }

    #[test]
    fn test_insert_chunks_invalid_fk_fails() {
        let (_dir, conn) = setup_db();
        // No `memories` row with this id exists.
        let orphan = make_chunk(99999, 0, "sem pai", 0, 7, 1);
        assert!(insert_chunks(&conn, &[orphan]).is_err());
    }
}