1use crate::embedder::f32_to_bytes;
10use crate::errors::AppError;
11use rusqlite::{params, Connection};
12
/// One row of the `memory_chunks` table, as written by [`insert_chunks`] and
/// read back by [`get_chunks_by_memory`].
#[derive(Debug, Clone)]
pub struct Chunk {
    /// Value of the `memory_id` column: the memory this chunk belongs to.
    pub memory_id: i64,
    /// Value of the `chunk_idx` column; `get_chunks_by_memory` orders by it.
    pub chunk_idx: i32,
    /// Text of the chunk, stored in the `chunk_text` column.
    pub chunk_text: String,
    /// Value of the `start_offset` column.
    /// NOTE(review): presumably the start position of the chunk inside the
    /// memory body — confirm unit (bytes vs. chars) against the chunker.
    pub start_offset: i32,
    /// Value of the `end_offset` column.
    /// NOTE(review): presumably the end position of the chunk inside the
    /// memory body — confirm whether it is inclusive or exclusive.
    pub end_offset: i32,
    /// Value of the `token_count` column.
    pub token_count: i32,
}
22
23pub fn insert_chunks(conn: &Connection, chunks: &[Chunk]) -> Result<(), AppError> {
24 for chunk in chunks {
25 conn.execute(
26 "INSERT INTO memory_chunks (memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count)
27 VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
28 params![
29 chunk.memory_id,
30 chunk.chunk_idx,
31 chunk.chunk_text,
32 chunk.start_offset,
33 chunk.end_offset,
34 chunk.token_count,
35 ],
36 )?;
37 }
38 Ok(())
39}
40
41pub fn insert_chunk_slices(
42 conn: &Connection,
43 memory_id: i64,
44 body: &str,
45 chunks: &[crate::chunking::Chunk],
46) -> Result<(), AppError> {
47 for (chunk_idx, chunk) in chunks.iter().enumerate() {
48 conn.execute(
49 "INSERT INTO memory_chunks (memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count)
50 VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
51 params![
52 memory_id,
53 chunk_idx as i32,
54 crate::chunking::chunk_text(body, chunk),
55 chunk.start_offset as i32,
56 chunk.end_offset as i32,
57 chunk.token_count_approx as i32,
58 ],
59 )?;
60 }
61 Ok(())
62}
63
64pub fn upsert_chunk_vec(
65 conn: &Connection,
66 _rowid: i64,
67 memory_id: i64,
68 chunk_idx: i32,
69 embedding: &[f32],
70) -> Result<(), AppError> {
71 conn.execute(
72 "INSERT OR REPLACE INTO chunk_embeddings(chunk_id, memory_id, embedding, source, model, dim)
73 VALUES (
74 (SELECT id FROM memory_chunks WHERE memory_id = ?1 AND chunk_idx = ?2),
75 ?1, ?3, 'llm-headless', ?4, ?5
76 )",
77 params![
78 memory_id,
79 chunk_idx,
80 f32_to_bytes(embedding),
81 crate::constants::SQLITE_GRAPHRAG_VERSION,
82 crate::constants::embedding_dim() as i64,
83 ],
84 )?;
85 Ok(())
86}
87
88pub fn delete_chunks(conn: &Connection, memory_id: i64) -> Result<(), AppError> {
89 conn.execute(
90 "DELETE FROM memory_chunks WHERE memory_id = ?1",
91 params![memory_id],
92 )?;
93 Ok(())
94}
95
96pub fn count_for_memory(conn: &Connection, memory_id: i64) -> Result<usize, AppError> {
103 let n: i64 = conn.query_row(
104 "SELECT COUNT(*) FROM memory_chunks WHERE memory_id = ?1",
105 params![memory_id],
106 |r| r.get(0),
107 )?;
108 Ok(n as usize)
109}
110
111pub fn knn_search_chunks(
112 conn: &Connection,
113 embedding: &[f32],
114 k: usize,
115) -> Result<Vec<(i64, i32, f32)>, AppError> {
116 if embedding.len() != crate::constants::embedding_dim() {
117 return Err(AppError::Embedding(format!(
118 "knn_search_chunks embedding has {} dims, expected {}",
119 embedding.len(),
120 crate::constants::embedding_dim()
121 )));
122 }
123 let mut stmt =
127 conn.prepare_cached("SELECT chunk_id, memory_id, embedding FROM chunk_embeddings")?;
128 let mut scored: Vec<(i64, i32, f32)> = stmt
129 .query_map([], |r| {
130 let chunk_id: i64 = r.get(0)?;
131 let memory_id: i64 = r.get(1)?;
132 let bytes: Vec<u8> = r.get(2)?;
133 Ok((chunk_id, memory_id, bytes))
134 })?
135 .filter_map(|row| {
136 row.ok().and_then(|(_, memory_id, bytes)| {
137 let stored = crate::embedder::bytes_to_f32(&bytes);
138 if stored.len() != embedding.len() {
139 return None;
140 }
141 let score = crate::similarity::cosine_similarity(embedding, &stored);
142 Some((memory_id, 0, score))
143 })
144 })
145 .collect();
146 scored.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
147 scored.truncate(k);
148 Ok(scored)
149}
150
151pub fn get_chunks_by_memory(conn: &Connection, memory_id: i64) -> Result<Vec<Chunk>, AppError> {
152 let mut stmt = conn.prepare_cached(
153 "SELECT memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count
154 FROM memory_chunks WHERE memory_id = ?1 ORDER BY chunk_idx",
155 )?;
156 let rows = stmt
157 .query_map(params![memory_id], |r| {
158 Ok(Chunk {
159 memory_id: r.get(0)?,
160 chunk_idx: r.get(1)?,
161 chunk_text: r.get(2)?,
162 start_offset: r.get(3)?,
163 end_offset: r.get(4)?,
164 token_count: r.get(5)?,
165 })
166 })?
167 .collect::<Result<Vec<_>, _>>()?;
168 Ok(rows)
169}
170
#[cfg(test)]
mod tests {
    use super::*;
    use crate::constants::embedding_dim;
    use crate::storage::connection::register_vec_extension;
    use rusqlite::Connection;
    use tempfile::TempDir;

    /// Opens a freshly migrated database inside a temporary directory.
    ///
    /// The `TempDir` is returned alongside the connection so the directory
    /// stays alive for the duration of the test.
    fn setup_db() -> (TempDir, Connection) {
        register_vec_extension();
        let dir = TempDir::new().unwrap();
        let mut conn = Connection::open(dir.path().join("test.db")).unwrap();
        crate::migrations::runner().run(&mut conn).unwrap();
        (dir, conn)
    }

    /// Inserts one row into `memories` and returns its rowid.
    fn insert_memory(conn: &Connection) -> i64 {
        conn.execute(
            "INSERT INTO memories (namespace, name, type, description, body, body_hash)
             VALUES ('global', 'test-mem', 'user', 'desc', 'body', 'hash1')",
            [],
        )
        .unwrap();
        conn.last_insert_rowid()
    }

    /// Builds a `Chunk` fixture from its six column values.
    fn new_chunk(
        memory_id: i64,
        chunk_idx: i32,
        text: &str,
        start_offset: i32,
        end_offset: i32,
        token_count: i32,
    ) -> Chunk {
        Chunk {
            memory_id,
            chunk_idx,
            chunk_text: text.to_string(),
            start_offset,
            end_offset,
            token_count,
        }
    }

    #[test]
    fn test_insert_chunks_empty_ok() {
        let (_tmp, conn) = setup_db();
        assert!(insert_chunks(&conn, &[]).is_ok());
    }

    #[test]
    fn test_insert_chunks_and_get_by_memory() {
        let (_tmp, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        let chunks = vec![
            new_chunk(memory_id, 0, "primeiro chunk", 0, 14, 3),
            new_chunk(memory_id, 1, "segundo chunk", 15, 28, 3),
        ];
        insert_chunks(&conn, &chunks).unwrap();

        let fetched = get_chunks_by_memory(&conn, memory_id).unwrap();
        assert_eq!(fetched.len(), 2);

        let first = &fetched[0];
        assert_eq!(first.chunk_idx, 0);
        assert_eq!(first.chunk_text, "primeiro chunk");
        assert_eq!(first.start_offset, 0);
        assert_eq!(first.end_offset, 14);
        assert_eq!(first.token_count, 3);

        let second = &fetched[1];
        assert_eq!(second.chunk_idx, 1);
        assert_eq!(second.chunk_text, "segundo chunk");
    }

    #[test]
    fn test_get_chunks_missing_memory_returns_empty() {
        let (_tmp, conn) = setup_db();
        let fetched = get_chunks_by_memory(&conn, 9999).unwrap();
        assert!(fetched.is_empty());
    }

    #[test]
    fn test_count_for_memory_reflects_persisted_rows() {
        let (_tmp, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        // Nothing stored yet.
        assert_eq!(count_for_memory(&conn, memory_id).unwrap(), 0);

        let chunks = vec![
            new_chunk(memory_id, 0, "a", 0, 1, 1),
            new_chunk(memory_id, 1, "b", 1, 2, 1),
        ];
        insert_chunks(&conn, &chunks).unwrap();
        assert_eq!(count_for_memory(&conn, memory_id).unwrap(), 2);

        // An id that owns no chunks still counts as zero.
        assert_eq!(count_for_memory(&conn, 9999).unwrap(), 0);
    }

    #[test]
    fn test_delete_chunks_removes_all() {
        let (_tmp, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        let chunks = vec![
            new_chunk(memory_id, 0, "chunk a", 0, 7, 2),
            new_chunk(memory_id, 1, "chunk b", 8, 15, 2),
        ];
        insert_chunks(&conn, &chunks).unwrap();

        delete_chunks(&conn, memory_id).unwrap();

        let remaining = get_chunks_by_memory(&conn, memory_id).unwrap();
        assert!(remaining.is_empty());
    }

    #[test]
    fn test_delete_chunks_memory_without_chunks_ok() {
        let (_tmp, conn) = setup_db();
        assert!(delete_chunks(&conn, 9999).is_ok());
    }

    #[test]
    fn test_get_chunks_ordered_by_chunk_idx() {
        let (_tmp, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        // Deliberately inserted out of order.
        let chunks = vec![
            new_chunk(memory_id, 2, "terceiro", 20, 28, 1),
            new_chunk(memory_id, 0, "primeiro", 0, 8, 1),
            new_chunk(memory_id, 1, "segundo", 9, 16, 1),
        ];
        insert_chunks(&conn, &chunks).unwrap();

        let fetched = get_chunks_by_memory(&conn, memory_id).unwrap();
        let order: Vec<i32> = fetched.iter().map(|c| c.chunk_idx).collect();
        assert_eq!(order, vec![0, 1, 2]);
    }

    #[test]
    #[serial_test::serial(env)]
    fn test_upsert_chunk_vec_and_knn_search() {
        let (_tmp, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        insert_chunks(&conn, &[new_chunk(memory_id, 0, "embedding test", 0, 14, 2)]).unwrap();

        // Unit vector along the first axis.
        let mut embedding = vec![0.0f32; embedding_dim()];
        embedding[0] = 1.0;

        let chunk_id: i64 = conn
            .query_row(
                "SELECT id FROM memory_chunks WHERE memory_id = ?1 AND chunk_idx = 0",
                params![memory_id],
                |row| row.get(0),
            )
            .unwrap();

        upsert_chunk_vec(&conn, chunk_id, memory_id, 0, &embedding).unwrap();

        let hits = knn_search_chunks(&conn, &embedding, 1).unwrap();
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].0, memory_id);
        assert_eq!(hits[0].1, 0);
    }

    #[test]
    #[serial_test::serial(env)]
    fn test_knn_search_chunks_without_data_returns_empty() {
        let (_tmp, conn) = setup_db();
        let embedding = vec![0.0f32; embedding_dim()];
        let hits = knn_search_chunks(&conn, &embedding, 5).unwrap();
        assert!(hits.is_empty());
    }

    #[test]
    fn test_insert_chunks_invalid_fk_fails() {
        let (_tmp, conn) = setup_db();
        // No memory row with this id exists.
        let orphan = new_chunk(99999, 0, "sem pai", 0, 7, 1);
        assert!(insert_chunks(&conn, &[orphan]).is_err());
    }
}