Skip to main content

sqlite_graphrag/storage/
chunks.rs

//! Chunk storage CRUD for multi-chunk memories.
//!
//! Manages the `memory_chunks` table and the `chunk_embeddings` vectors that
//! belong to it: inserts chunk rows and embeddings for bodies that exceed the
//! 512-token E5 limit, and queries the stored chunks for vector search.

// src/storage/chunks.rs
// Chunk storage for bodies exceeding the 512-token E5 limit
8
9use crate::embedder::f32_to_bytes;
10use crate::errors::AppError;
11use rusqlite::{params, Connection};
12
/// One row of the `memory_chunks` table.
///
/// The fields map one-to-one onto the columns written by `insert_chunks` and
/// read back by `get_chunks_by_memory`. Equality is field-wise, so two values
/// can be compared directly (for example in assertions).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    /// Id of the memory this chunk belongs to (`memory_chunks.memory_id`).
    pub memory_id: i64,
    /// Position of the chunk within its memory; reads are ordered by this value.
    pub chunk_idx: i32,
    /// Text content of the chunk.
    pub chunk_text: String,
    /// Start offset of the chunk, stored verbatim.
    /// NOTE(review): presumably an offset into the memory body — confirm the unit.
    pub start_offset: i32,
    /// End offset of the chunk, stored verbatim (same unit as `start_offset`).
    pub end_offset: i32,
    /// Token count recorded for the chunk.
    pub token_count: i32,
}
22
23pub fn insert_chunks(conn: &Connection, chunks: &[Chunk]) -> Result<(), AppError> {
24    for chunk in chunks {
25        conn.execute(
26            "INSERT INTO memory_chunks (memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count)
27             VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
28            params![
29                chunk.memory_id,
30                chunk.chunk_idx,
31                chunk.chunk_text,
32                chunk.start_offset,
33                chunk.end_offset,
34                chunk.token_count,
35            ],
36        )?;
37    }
38    Ok(())
39}
40
41pub fn insert_chunk_slices(
42    conn: &Connection,
43    memory_id: i64,
44    body: &str,
45    chunks: &[crate::chunking::Chunk],
46) -> Result<(), AppError> {
47    for (chunk_idx, chunk) in chunks.iter().enumerate() {
48        conn.execute(
49            "INSERT INTO memory_chunks (memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count)
50             VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
51            params![
52                memory_id,
53                chunk_idx as i32,
54                crate::chunking::chunk_text(body, chunk),
55                chunk.start_offset as i32,
56                chunk.end_offset as i32,
57                chunk.token_count_approx as i32,
58            ],
59        )?;
60    }
61    Ok(())
62}
63
64pub fn upsert_chunk_vec(
65    conn: &Connection,
66    _rowid: i64,
67    memory_id: i64,
68    chunk_idx: i32,
69    embedding: &[f32],
70) -> Result<(), AppError> {
71    // v1.1.1 (P1): skip empty vectors so the chunk stays visible to the
72    // re-embed backfill scanner instead of persisting a vector-less row.
73    if embedding.is_empty() {
74        tracing::debug!(
75            memory_id,
76            chunk_idx,
77            "empty chunk embedding: skipping chunk_embeddings row (backfill via enrich re-embed --target chunks)"
78        );
79        return Ok(());
80    }
81    conn.execute(
82        "INSERT OR REPLACE INTO chunk_embeddings(chunk_id, memory_id, embedding, source, model, dim)
83         VALUES (
84             (SELECT id FROM memory_chunks WHERE memory_id = ?1 AND chunk_idx = ?2),
85             ?1, ?3, 'llm-headless', ?4, ?5
86         )",
87        params![
88            memory_id,
89            chunk_idx,
90            f32_to_bytes(embedding),
91            crate::constants::SQLITE_GRAPHRAG_VERSION,
92            crate::constants::embedding_dim() as i64,
93        ],
94    )?;
95    Ok(())
96}
97
98pub fn delete_chunks(conn: &Connection, memory_id: i64) -> Result<(), AppError> {
99    conn.execute(
100        "DELETE FROM memory_chunks WHERE memory_id = ?1",
101        params![memory_id],
102    )?;
103    Ok(())
104}
105
106/// GAP-SG-40: counts the rows actually persisted in `memory_chunks` for a
107/// memory. Single-chunk bodies are stored inline in the `memories` row and
108/// append no chunk rows, so this returns `0` for them; multi-chunk bodies
109/// return the exact number of persisted chunk rows. Callers query this AFTER
110/// the transaction commits so the reported `chunks_persisted` reflects the
111/// observable database state rather than a pre-commit estimate.
112pub fn count_for_memory(conn: &Connection, memory_id: i64) -> Result<usize, AppError> {
113    let n: i64 = conn.query_row(
114        "SELECT COUNT(*) FROM memory_chunks WHERE memory_id = ?1",
115        params![memory_id],
116        |r| r.get(0),
117    )?;
118    Ok(n as usize)
119}
120
121pub fn knn_search_chunks(
122    conn: &Connection,
123    embedding: &[f32],
124    k: usize,
125) -> Result<Vec<(i64, i32, f32)>, AppError> {
126    if embedding.len() != crate::constants::embedding_dim() {
127        return Err(AppError::Embedding(format!(
128            "knn_search_chunks embedding has {} dims, expected {}",
129            embedding.len(),
130            crate::constants::embedding_dim()
131        )));
132    }
133    // v1.0.76: full table scan + in-process cosine similarity. The
134    // `chunk_embeddings` table no longer has a `distance` column;
135    // similarity is computed in Rust.
136    let mut stmt =
137        conn.prepare_cached("SELECT chunk_id, memory_id, embedding FROM chunk_embeddings")?;
138    let mut scored: Vec<(i64, i32, f32)> = stmt
139        .query_map([], |r| {
140            let chunk_id: i64 = r.get(0)?;
141            let memory_id: i64 = r.get(1)?;
142            let bytes: Vec<u8> = r.get(2)?;
143            Ok((chunk_id, memory_id, bytes))
144        })?
145        .filter_map(|row| {
146            row.ok().and_then(|(_, memory_id, bytes)| {
147                let stored = crate::embedder::bytes_to_f32(&bytes);
148                if stored.len() != embedding.len() {
149                    return None;
150                }
151                let score = crate::similarity::cosine_similarity(embedding, &stored);
152                Some((memory_id, 0, score))
153            })
154        })
155        .collect();
156    scored.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
157    scored.truncate(k);
158    Ok(scored)
159}
160
161pub fn get_chunks_by_memory(conn: &Connection, memory_id: i64) -> Result<Vec<Chunk>, AppError> {
162    let mut stmt = conn.prepare_cached(
163        "SELECT memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count
164         FROM memory_chunks WHERE memory_id = ?1 ORDER BY chunk_idx",
165    )?;
166    let rows = stmt
167        .query_map(params![memory_id], |r| {
168            Ok(Chunk {
169                memory_id: r.get(0)?,
170                chunk_idx: r.get(1)?,
171                chunk_text: r.get(2)?,
172                start_offset: r.get(3)?,
173                end_offset: r.get(4)?,
174                token_count: r.get(5)?,
175            })
176        })?
177        .collect::<Result<Vec<_>, _>>()?;
178    Ok(rows)
179}
180
#[cfg(test)]
mod tests {
    use super::*;
    use crate::constants::embedding_dim;
    use crate::storage::connection::register_vec_extension;
    use rusqlite::Connection;
    use tempfile::TempDir;

    /// Opens a freshly migrated database inside a temporary directory.
    ///
    /// The `TempDir` is handed back so the directory stays alive for as long
    /// as the test holds the connection.
    fn setup_db() -> (TempDir, Connection) {
        register_vec_extension();
        let dir = TempDir::new().unwrap();
        let mut conn = Connection::open(dir.path().join("test.db")).unwrap();
        crate::migrations::runner().run(&mut conn).unwrap();
        (dir, conn)
    }

    /// Inserts one parent row into `memories` and returns its rowid.
    fn insert_memory(conn: &Connection) -> i64 {
        let inserted = conn.execute(
            "INSERT INTO memories (namespace, name, type, description, body, body_hash)
             VALUES ('global', 'test-mem', 'user', 'desc', 'body', 'hash1')",
            [],
        );
        inserted.unwrap();
        conn.last_insert_rowid()
    }

    /// Builds a `Chunk` fixture without repeating the struct literal.
    fn chunk_row(
        memory_id: i64,
        chunk_idx: i32,
        text: &str,
        start_offset: i32,
        end_offset: i32,
        token_count: i32,
    ) -> Chunk {
        Chunk {
            memory_id,
            chunk_idx,
            chunk_text: text.to_string(),
            start_offset,
            end_offset,
            token_count,
        }
    }

    #[test]
    fn test_insert_chunks_empty_ok() {
        let (_dir, conn) = setup_db();
        assert!(insert_chunks(&conn, &[]).is_ok());
    }

    #[test]
    fn test_insert_chunks_and_get_by_memory() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        let fixtures = vec![
            chunk_row(memory_id, 0, "primeiro chunk", 0, 14, 3),
            chunk_row(memory_id, 1, "segundo chunk", 15, 28, 3),
        ];
        insert_chunks(&conn, &fixtures).unwrap();

        let loaded = get_chunks_by_memory(&conn, memory_id).unwrap();
        assert_eq!(loaded.len(), 2);

        let first = &loaded[0];
        assert_eq!(first.chunk_idx, 0);
        assert_eq!(first.chunk_text, "primeiro chunk");
        assert_eq!(first.start_offset, 0);
        assert_eq!(first.end_offset, 14);
        assert_eq!(first.token_count, 3);

        let second = &loaded[1];
        assert_eq!(second.chunk_idx, 1);
        assert_eq!(second.chunk_text, "segundo chunk");
    }

    #[test]
    fn test_get_chunks_missing_memory_returns_empty() {
        let (_dir, conn) = setup_db();
        let loaded = get_chunks_by_memory(&conn, 9999).unwrap();
        assert!(loaded.is_empty());
    }

    // GAP-SG-40: count_for_memory reports the real persisted chunk-row count.
    #[test]
    fn test_count_for_memory_reflects_persisted_rows() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        // Nothing inserted yet (single-chunk bodies live inline) → 0.
        assert_eq!(count_for_memory(&conn, memory_id).unwrap(), 0);

        let fixtures = vec![
            chunk_row(memory_id, 0, "a", 0, 1, 1),
            chunk_row(memory_id, 1, "b", 1, 2, 1),
        ];
        insert_chunks(&conn, &fixtures).unwrap();
        assert_eq!(count_for_memory(&conn, memory_id).unwrap(), 2);

        // A memory id that does not exist → 0.
        assert_eq!(count_for_memory(&conn, 9999).unwrap(), 0);
    }

    #[test]
    fn test_delete_chunks_removes_all() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        let fixtures = vec![
            chunk_row(memory_id, 0, "chunk a", 0, 7, 2),
            chunk_row(memory_id, 1, "chunk b", 8, 15, 2),
        ];
        insert_chunks(&conn, &fixtures).unwrap();

        delete_chunks(&conn, memory_id).unwrap();

        let remaining = get_chunks_by_memory(&conn, memory_id).unwrap();
        assert!(remaining.is_empty());
    }

    #[test]
    fn test_delete_chunks_memory_without_chunks_ok() {
        let (_dir, conn) = setup_db();
        assert!(delete_chunks(&conn, 9999).is_ok());
    }

    #[test]
    fn test_get_chunks_ordered_by_chunk_idx() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        // Deliberately inserted out of order: 2, 0, 1.
        let fixtures = vec![
            chunk_row(memory_id, 2, "terceiro", 20, 28, 1),
            chunk_row(memory_id, 0, "primeiro", 0, 8, 1),
            chunk_row(memory_id, 1, "segundo", 9, 16, 1),
        ];
        insert_chunks(&conn, &fixtures).unwrap();

        let loaded = get_chunks_by_memory(&conn, memory_id).unwrap();
        assert_eq!(loaded.len(), 3);
        assert_eq!(loaded[0].chunk_idx, 0);
        assert_eq!(loaded[1].chunk_idx, 1);
        assert_eq!(loaded[2].chunk_idx, 2);
    }

    #[test]
    #[serial_test::serial(env)]
    fn test_upsert_chunk_vec_and_knn_search() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        insert_chunks(&conn, &[chunk_row(memory_id, 0, "embedding test", 0, 14, 2)]).unwrap();

        // Unit vector along the first axis.
        let mut query_vec = vec![0.0f32; embedding_dim()];
        query_vec[0] = 1.0;

        let chunk_id: i64 = conn
            .query_row(
                "SELECT id FROM memory_chunks WHERE memory_id = ?1 AND chunk_idx = 0",
                params![memory_id],
                |row| row.get(0),
            )
            .unwrap();

        upsert_chunk_vec(&conn, chunk_id, memory_id, 0, &query_vec).unwrap();

        let hits = knn_search_chunks(&conn, &query_vec, 1).unwrap();
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].0, memory_id);
        assert_eq!(hits[0].1, 0);
    }

    #[test]
    #[serial_test::serial(env)]
    fn test_knn_search_chunks_without_data_returns_empty() {
        let (_dir, conn) = setup_db();
        let query_vec = vec![0.0f32; embedding_dim()];
        let hits = knn_search_chunks(&conn, &query_vec, 5).unwrap();
        assert!(hits.is_empty());
    }

    // v1.1.1 (P1): an empty embedding must NOT create a chunk vector row, so
    // the chunk stays visible to `enrich re-embed --target chunks`.
    #[test]
    fn test_upsert_chunk_vec_empty_embedding_skips_row() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);
        insert_chunks(&conn, &[chunk_row(memory_id, 0, "sem vetor", 0, 9, 2)]).unwrap();

        upsert_chunk_vec(&conn, 0, memory_id, 0, &[]).unwrap();

        let count: i64 = conn
            .query_row(
                "SELECT COUNT(*) FROM chunk_embeddings WHERE memory_id = ?1",
                params![memory_id],
                |row| row.get(0),
            )
            .unwrap();
        assert_eq!(count, 0, "empty embedding must not persist a chunk row");
    }

    #[test]
    fn test_insert_chunks_invalid_fk_fails() {
        let (_dir, conn) = setup_db();
        // No `memories` row with this id exists.
        let orphan = chunk_row(99999, 0, "sem pai", 0, 7, 1);
        assert!(insert_chunks(&conn, &[orphan]).is_err());
    }
}