1use crate::embedder::f32_to_bytes;
10use crate::errors::AppError;
11use rusqlite::{params, Connection};
12
/// One row of the `memory_chunks` table: a slice of a memory body plus its bookkeeping.
///
/// The fields map one-to-one onto the columns written by `insert_chunks` and read back by
/// `get_chunks_by_memory`.
#[derive(Clone, Debug)]
pub struct Chunk {
    /// Id of the memory this chunk belongs to (`memory_chunks.memory_id`).
    pub memory_id: i64,
    /// Position of the chunk within its memory; reads are ordered by this column.
    pub chunk_idx: i32,
    /// Text content of the chunk.
    pub chunk_text: String,
    /// Start offset of the chunk in the memory body.
    // NOTE(review): the unit (bytes vs. chars) is not visible from this file; confirm with the chunker.
    pub start_offset: i32,
    /// End offset of the chunk in the memory body (same unit as `start_offset`).
    pub end_offset: i32,
    /// Token count recorded for the chunk.
    pub token_count: i32,
}
22
23pub fn insert_chunks(conn: &Connection, chunks: &[Chunk]) -> Result<(), AppError> {
24 for chunk in chunks {
25 conn.execute(
26 "INSERT INTO memory_chunks (memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count)
27 VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
28 params![
29 chunk.memory_id,
30 chunk.chunk_idx,
31 chunk.chunk_text,
32 chunk.start_offset,
33 chunk.end_offset,
34 chunk.token_count,
35 ],
36 )?;
37 }
38 Ok(())
39}
40
41pub fn insert_chunk_slices(
42 conn: &Connection,
43 memory_id: i64,
44 body: &str,
45 chunks: &[crate::chunking::Chunk],
46) -> Result<(), AppError> {
47 for (chunk_idx, chunk) in chunks.iter().enumerate() {
48 conn.execute(
49 "INSERT INTO memory_chunks (memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count)
50 VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
51 params![
52 memory_id,
53 chunk_idx as i32,
54 crate::chunking::chunk_text(body, chunk),
55 chunk.start_offset as i32,
56 chunk.end_offset as i32,
57 chunk.token_count_approx as i32,
58 ],
59 )?;
60 }
61 Ok(())
62}
63
64pub fn upsert_chunk_vec(
65 conn: &Connection,
66 _rowid: i64,
67 memory_id: i64,
68 chunk_idx: i32,
69 embedding: &[f32],
70) -> Result<(), AppError> {
71 if embedding.is_empty() {
74 tracing::debug!(
75 memory_id,
76 chunk_idx,
77 "empty chunk embedding: skipping chunk_embeddings row (backfill via enrich re-embed --target chunks)"
78 );
79 return Ok(());
80 }
81 conn.execute(
82 "INSERT OR REPLACE INTO chunk_embeddings(chunk_id, memory_id, embedding, source, model, dim)
83 VALUES (
84 (SELECT id FROM memory_chunks WHERE memory_id = ?1 AND chunk_idx = ?2),
85 ?1, ?3, 'llm-headless', ?4, ?5
86 )",
87 params![
88 memory_id,
89 chunk_idx,
90 f32_to_bytes(embedding),
91 crate::constants::SQLITE_GRAPHRAG_VERSION,
92 crate::constants::embedding_dim() as i64,
93 ],
94 )?;
95 Ok(())
96}
97
98pub fn delete_chunks(conn: &Connection, memory_id: i64) -> Result<(), AppError> {
99 conn.execute(
100 "DELETE FROM memory_chunks WHERE memory_id = ?1",
101 params![memory_id],
102 )?;
103 Ok(())
104}
105
106pub fn count_for_memory(conn: &Connection, memory_id: i64) -> Result<usize, AppError> {
113 let n: i64 = conn.query_row(
114 "SELECT COUNT(*) FROM memory_chunks WHERE memory_id = ?1",
115 params![memory_id],
116 |r| r.get(0),
117 )?;
118 Ok(n as usize)
119}
120
121pub fn knn_search_chunks(
122 conn: &Connection,
123 embedding: &[f32],
124 k: usize,
125) -> Result<Vec<(i64, i32, f32)>, AppError> {
126 if embedding.len() != crate::constants::embedding_dim() {
127 return Err(AppError::Embedding(format!(
128 "knn_search_chunks embedding has {} dims, expected {}",
129 embedding.len(),
130 crate::constants::embedding_dim()
131 )));
132 }
133 let mut stmt =
137 conn.prepare_cached("SELECT chunk_id, memory_id, embedding FROM chunk_embeddings")?;
138 let mut scored: Vec<(i64, i32, f32)> = stmt
139 .query_map([], |r| {
140 let chunk_id: i64 = r.get(0)?;
141 let memory_id: i64 = r.get(1)?;
142 let bytes: Vec<u8> = r.get(2)?;
143 Ok((chunk_id, memory_id, bytes))
144 })?
145 .filter_map(|row| {
146 row.ok().and_then(|(_, memory_id, bytes)| {
147 let stored = crate::embedder::bytes_to_f32(&bytes);
148 if stored.len() != embedding.len() {
149 return None;
150 }
151 let score = crate::similarity::cosine_similarity(embedding, &stored);
152 Some((memory_id, 0, score))
153 })
154 })
155 .collect();
156 scored.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
157 scored.truncate(k);
158 Ok(scored)
159}
160
161pub fn get_chunks_by_memory(conn: &Connection, memory_id: i64) -> Result<Vec<Chunk>, AppError> {
162 let mut stmt = conn.prepare_cached(
163 "SELECT memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count
164 FROM memory_chunks WHERE memory_id = ?1 ORDER BY chunk_idx",
165 )?;
166 let rows = stmt
167 .query_map(params![memory_id], |r| {
168 Ok(Chunk {
169 memory_id: r.get(0)?,
170 chunk_idx: r.get(1)?,
171 chunk_text: r.get(2)?,
172 start_offset: r.get(3)?,
173 end_offset: r.get(4)?,
174 token_count: r.get(5)?,
175 })
176 })?
177 .collect::<Result<Vec<_>, _>>()?;
178 Ok(rows)
179}
180
#[cfg(test)]
mod tests {
    use super::*;
    use crate::constants::embedding_dim;
    use crate::storage::connection::register_vec_extension;
    use rusqlite::Connection;
    use tempfile::TempDir;

    /// Opens a fresh database file inside a temporary directory and applies all migrations.
    ///
    /// The `TempDir` is handed back so the directory stays alive as long as the connection.
    fn setup_db() -> (TempDir, Connection) {
        register_vec_extension();
        let tmp = TempDir::new().unwrap();
        let mut conn = Connection::open(tmp.path().join("test.db")).unwrap();
        crate::migrations::runner().run(&mut conn).unwrap();
        (tmp, conn)
    }

    /// Inserts a minimal parent row into `memories` and returns its rowid.
    fn insert_memory(conn: &Connection) -> i64 {
        conn.execute(
            "INSERT INTO memories (namespace, name, type, description, body, body_hash)
             VALUES ('global', 'test-mem', 'user', 'desc', 'body', 'hash1')",
            [],
        )
        .unwrap();
        conn.last_insert_rowid()
    }

    /// Builds a storage `Chunk` fixture from its six column values.
    fn make_chunk(
        memory_id: i64,
        chunk_idx: i32,
        text: &str,
        start_offset: i32,
        end_offset: i32,
        token_count: i32,
    ) -> Chunk {
        Chunk {
            memory_id,
            chunk_idx,
            chunk_text: text.to_string(),
            start_offset,
            end_offset,
            token_count,
        }
    }

    #[test]
    fn test_insert_chunks_empty_ok() {
        let (_tmp, conn) = setup_db();
        let outcome = insert_chunks(&conn, &[]);
        assert!(outcome.is_ok());
    }

    #[test]
    fn test_insert_chunks_and_get_by_memory() {
        let (_tmp, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        let batch = [
            make_chunk(memory_id, 0, "primeiro chunk", 0, 14, 3),
            make_chunk(memory_id, 1, "segundo chunk", 15, 28, 3),
        ];
        insert_chunks(&conn, &batch).unwrap();

        let stored = get_chunks_by_memory(&conn, memory_id).unwrap();
        assert_eq!(stored.len(), 2);

        let first = &stored[0];
        assert_eq!(first.chunk_idx, 0);
        assert_eq!(first.chunk_text, "primeiro chunk");
        assert_eq!(first.start_offset, 0);
        assert_eq!(first.end_offset, 14);
        assert_eq!(first.token_count, 3);

        let second = &stored[1];
        assert_eq!(second.chunk_idx, 1);
        assert_eq!(second.chunk_text, "segundo chunk");
    }

    #[test]
    fn test_get_chunks_missing_memory_returns_empty() {
        let (_tmp, conn) = setup_db();
        let stored = get_chunks_by_memory(&conn, 9999).unwrap();
        assert!(stored.is_empty());
    }

    #[test]
    fn test_count_for_memory_reflects_persisted_rows() {
        let (_tmp, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        // Nothing has been written for this memory yet.
        assert_eq!(count_for_memory(&conn, memory_id).unwrap(), 0);

        let batch = [
            make_chunk(memory_id, 0, "a", 0, 1, 1),
            make_chunk(memory_id, 1, "b", 1, 2, 1),
        ];
        insert_chunks(&conn, &batch).unwrap();
        assert_eq!(count_for_memory(&conn, memory_id).unwrap(), 2);

        // An id that was never inserted still counts as zero.
        assert_eq!(count_for_memory(&conn, 9999).unwrap(), 0);
    }

    #[test]
    fn test_delete_chunks_removes_all() {
        let (_tmp, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        let batch = [
            make_chunk(memory_id, 0, "chunk a", 0, 7, 2),
            make_chunk(memory_id, 1, "chunk b", 8, 15, 2),
        ];
        insert_chunks(&conn, &batch).unwrap();

        delete_chunks(&conn, memory_id).unwrap();

        let remaining = get_chunks_by_memory(&conn, memory_id).unwrap();
        assert!(remaining.is_empty());
    }

    #[test]
    fn test_delete_chunks_memory_without_chunks_ok() {
        let (_tmp, conn) = setup_db();
        let outcome = delete_chunks(&conn, 9999);
        assert!(outcome.is_ok());
    }

    #[test]
    fn test_get_chunks_ordered_by_chunk_idx() {
        let (_tmp, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        // Deliberately inserted out of order; the read must sort by `chunk_idx`.
        let batch = [
            make_chunk(memory_id, 2, "terceiro", 20, 28, 1),
            make_chunk(memory_id, 0, "primeiro", 0, 8, 1),
            make_chunk(memory_id, 1, "segundo", 9, 16, 1),
        ];
        insert_chunks(&conn, &batch).unwrap();

        let stored = get_chunks_by_memory(&conn, memory_id).unwrap();
        assert_eq!(stored.len(), 3);
        assert_eq!(stored[0].chunk_idx, 0);
        assert_eq!(stored[1].chunk_idx, 1);
        assert_eq!(stored[2].chunk_idx, 2);
    }

    #[test]
    #[serial_test::serial(env)]
    fn test_upsert_chunk_vec_and_knn_search() {
        let (_tmp, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        insert_chunks(
            &conn,
            &[make_chunk(memory_id, 0, "embedding test", 0, 14, 2)],
        )
        .unwrap();

        // Unit vector along the first axis, sized to the configured dimensionality.
        let mut embedding = vec![0.0f32; embedding_dim()];
        embedding[0] = 1.0;

        let chunk_id: i64 = conn
            .query_row(
                "SELECT id FROM memory_chunks WHERE memory_id = ?1 AND chunk_idx = 0",
                params![memory_id],
                |row| row.get(0),
            )
            .unwrap();

        upsert_chunk_vec(&conn, chunk_id, memory_id, 0, &embedding).unwrap();

        let hits = knn_search_chunks(&conn, &embedding, 1).unwrap();
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].0, memory_id);
        assert_eq!(hits[0].1, 0);
    }

    #[test]
    #[serial_test::serial(env)]
    fn test_knn_search_chunks_without_data_returns_empty() {
        let (_tmp, conn) = setup_db();
        let embedding = vec![0.0f32; embedding_dim()];
        let hits = knn_search_chunks(&conn, &embedding, 5).unwrap();
        assert!(hits.is_empty());
    }

    #[test]
    fn test_upsert_chunk_vec_empty_embedding_skips_row() {
        let (_tmp, conn) = setup_db();
        let memory_id = insert_memory(&conn);
        insert_chunks(&conn, &[make_chunk(memory_id, 0, "sem vetor", 0, 9, 2)]).unwrap();

        upsert_chunk_vec(&conn, 0, memory_id, 0, &[]).unwrap();

        let count: i64 = conn
            .query_row(
                "SELECT COUNT(*) FROM chunk_embeddings WHERE memory_id = ?1",
                params![memory_id],
                |row| row.get(0),
            )
            .unwrap();
        assert_eq!(count, 0, "empty embedding must not persist a chunk row");
    }

    #[test]
    fn test_insert_chunks_invalid_fk_fails() {
        let (_tmp, conn) = setup_db();
        // No parent row with this id exists in `memories`.
        let orphan = make_chunk(99999, 0, "sem pai", 0, 7, 1);
        let outcome = insert_chunks(&conn, &[orphan]);
        assert!(outcome.is_err());
    }
}