Skip to main content

sqlite_graphrag/storage/
chunks.rs

1//! Chunk storage CRUD for multi-chunk memories.
2//!
3//! Manages the `memory_chunks` table: insert embeddings for bodies that
4//! exceed the 512-token E5 limit and query chunks for vector search.
5
6// src/storage/chunks.rs
7// Chunk storage for bodies exceeding 512 tokens E5 limit
8
9use crate::embedder::f32_to_bytes;
10use crate::errors::AppError;
11use rusqlite::{params, Connection};
12
/// One persisted row of the `memory_chunks` table.
///
/// Field names mirror the table columns one-to-one; see [`insert_chunks`] and
/// [`get_chunks_by_memory`] for the write and read paths.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    /// Id of the `memories` row this chunk belongs to.
    pub memory_id: i64,
    /// Position of the chunk within its memory; reads are ordered by it.
    pub chunk_idx: i32,
    /// Text content of the chunk.
    pub chunk_text: String,
    /// Start of the chunk within the memory body.
    /// NOTE(review): unit (bytes vs. chars) is not established in this file.
    pub start_offset: i32,
    /// End of the chunk within the memory body (same unit as `start_offset`).
    pub end_offset: i32,
    /// Token count recorded for the chunk; the slice-based insert path stores
    /// an approximate count here.
    pub token_count: i32,
}
22
23pub fn insert_chunks(conn: &Connection, chunks: &[Chunk]) -> Result<(), AppError> {
24    for chunk in chunks {
25        conn.execute(
26            "INSERT INTO memory_chunks (memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count)
27             VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
28            params![
29                chunk.memory_id,
30                chunk.chunk_idx,
31                chunk.chunk_text,
32                chunk.start_offset,
33                chunk.end_offset,
34                chunk.token_count,
35            ],
36        )?;
37    }
38    Ok(())
39}
40
41pub fn insert_chunk_slices(
42    conn: &Connection,
43    memory_id: i64,
44    body: &str,
45    chunks: &[crate::chunking::Chunk],
46) -> Result<(), AppError> {
47    for (chunk_idx, chunk) in chunks.iter().enumerate() {
48        conn.execute(
49            "INSERT INTO memory_chunks (memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count)
50             VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
51            params![
52                memory_id,
53                chunk_idx as i32,
54                crate::chunking::chunk_text(body, chunk),
55                chunk.start_offset as i32,
56                chunk.end_offset as i32,
57                chunk.token_count_approx as i32,
58            ],
59        )?;
60    }
61    Ok(())
62}
63
64pub fn upsert_chunk_vec(
65    conn: &Connection,
66    _rowid: i64,
67    memory_id: i64,
68    chunk_idx: i32,
69    embedding: &[f32],
70) -> Result<(), AppError> {
71    conn.execute(
72        "INSERT OR REPLACE INTO chunk_embeddings(chunk_id, memory_id, embedding, source, model, dim)
73         VALUES (
74             (SELECT id FROM memory_chunks WHERE memory_id = ?1 AND chunk_idx = ?2),
75             ?1, ?3, 'llm-headless', ?4, ?5
76         )",
77        params![
78            memory_id,
79            chunk_idx,
80            f32_to_bytes(embedding),
81            crate::constants::SQLITE_GRAPHRAG_VERSION,
82            crate::constants::embedding_dim() as i64,
83        ],
84    )?;
85    Ok(())
86}
87
88pub fn delete_chunks(conn: &Connection, memory_id: i64) -> Result<(), AppError> {
89    conn.execute(
90        "DELETE FROM memory_chunks WHERE memory_id = ?1",
91        params![memory_id],
92    )?;
93    Ok(())
94}
95
96/// GAP-SG-40: counts the rows actually persisted in `memory_chunks` for a
97/// memory. Single-chunk bodies are stored inline in the `memories` row and
98/// append no chunk rows, so this returns `0` for them; multi-chunk bodies
99/// return the exact number of persisted chunk rows. Callers query this AFTER
100/// the transaction commits so the reported `chunks_persisted` reflects the
101/// observable database state rather than a pre-commit estimate.
102pub fn count_for_memory(conn: &Connection, memory_id: i64) -> Result<usize, AppError> {
103    let n: i64 = conn.query_row(
104        "SELECT COUNT(*) FROM memory_chunks WHERE memory_id = ?1",
105        params![memory_id],
106        |r| r.get(0),
107    )?;
108    Ok(n as usize)
109}
110
111pub fn knn_search_chunks(
112    conn: &Connection,
113    embedding: &[f32],
114    k: usize,
115) -> Result<Vec<(i64, i32, f32)>, AppError> {
116    if embedding.len() != crate::constants::embedding_dim() {
117        return Err(AppError::Embedding(format!(
118            "knn_search_chunks embedding has {} dims, expected {}",
119            embedding.len(),
120            crate::constants::embedding_dim()
121        )));
122    }
123    // v1.0.76: full table scan + in-process cosine similarity. The
124    // `chunk_embeddings` table no longer has a `distance` column;
125    // similarity is computed in Rust.
126    let mut stmt =
127        conn.prepare_cached("SELECT chunk_id, memory_id, embedding FROM chunk_embeddings")?;
128    let mut scored: Vec<(i64, i32, f32)> = stmt
129        .query_map([], |r| {
130            let chunk_id: i64 = r.get(0)?;
131            let memory_id: i64 = r.get(1)?;
132            let bytes: Vec<u8> = r.get(2)?;
133            Ok((chunk_id, memory_id, bytes))
134        })?
135        .filter_map(|row| {
136            row.ok().and_then(|(_, memory_id, bytes)| {
137                let stored = crate::embedder::bytes_to_f32(&bytes);
138                if stored.len() != embedding.len() {
139                    return None;
140                }
141                let score = crate::similarity::cosine_similarity(embedding, &stored);
142                Some((memory_id, 0, score))
143            })
144        })
145        .collect();
146    scored.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
147    scored.truncate(k);
148    Ok(scored)
149}
150
151pub fn get_chunks_by_memory(conn: &Connection, memory_id: i64) -> Result<Vec<Chunk>, AppError> {
152    let mut stmt = conn.prepare_cached(
153        "SELECT memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count
154         FROM memory_chunks WHERE memory_id = ?1 ORDER BY chunk_idx",
155    )?;
156    let rows = stmt
157        .query_map(params![memory_id], |r| {
158            Ok(Chunk {
159                memory_id: r.get(0)?,
160                chunk_idx: r.get(1)?,
161                chunk_text: r.get(2)?,
162                start_offset: r.get(3)?,
163                end_offset: r.get(4)?,
164                token_count: r.get(5)?,
165            })
166        })?
167        .collect::<Result<Vec<_>, _>>()?;
168    Ok(rows)
169}
170
#[cfg(test)]
mod tests {
    use super::*;
    use crate::constants::embedding_dim;
    use crate::storage::connection::register_vec_extension;
    use rusqlite::Connection;
    use tempfile::TempDir;

    /// Opens a fresh on-disk database inside a temp dir and runs all
    /// migrations. The `TempDir` is returned so the directory stays alive for
    /// as long as the connection is in use.
    fn setup_db() -> (TempDir, Connection) {
        register_vec_extension();
        let dir = TempDir::new().unwrap();
        let mut conn = Connection::open(dir.path().join("test.db")).unwrap();
        crate::migrations::runner().run(&mut conn).unwrap();
        (dir, conn)
    }

    /// Inserts one fixed `memories` row and returns its rowid.
    fn insert_memory(conn: &Connection) -> i64 {
        let sql = "INSERT INTO memories (namespace, name, type, description, body, body_hash)
             VALUES ('global', 'test-mem', 'user', 'desc', 'body', 'hash1')";
        conn.execute(sql, []).unwrap();
        conn.last_insert_rowid()
    }

    /// Builds a `Chunk` fixture without repeating the struct literal.
    fn make_chunk(
        memory_id: i64,
        chunk_idx: i32,
        text: &str,
        start_offset: i32,
        end_offset: i32,
        token_count: i32,
    ) -> Chunk {
        Chunk {
            memory_id,
            chunk_idx,
            chunk_text: text.to_string(),
            start_offset,
            end_offset,
            token_count,
        }
    }

    #[test]
    fn test_insert_chunks_empty_ok() {
        let (_dir, conn) = setup_db();
        assert!(insert_chunks(&conn, &[]).is_ok());
    }

    #[test]
    fn test_insert_chunks_and_get_by_memory() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        let fixtures = vec![
            make_chunk(memory_id, 0, "primeiro chunk", 0, 14, 3),
            make_chunk(memory_id, 1, "segundo chunk", 15, 28, 3),
        ];
        insert_chunks(&conn, &fixtures).unwrap();

        let fetched = get_chunks_by_memory(&conn, memory_id).unwrap();
        assert_eq!(fetched.len(), 2);

        let first = &fetched[0];
        assert_eq!(first.chunk_idx, 0);
        assert_eq!(first.chunk_text, "primeiro chunk");
        assert_eq!(first.start_offset, 0);
        assert_eq!(first.end_offset, 14);
        assert_eq!(first.token_count, 3);

        let second = &fetched[1];
        assert_eq!(second.chunk_idx, 1);
        assert_eq!(second.chunk_text, "segundo chunk");
    }

    #[test]
    fn test_get_chunks_missing_memory_returns_empty() {
        let (_dir, conn) = setup_db();
        let fetched = get_chunks_by_memory(&conn, 9999).unwrap();
        assert!(fetched.is_empty());
    }

    // GAP-SG-40: count_for_memory reports the real persisted chunk-row count.
    #[test]
    fn test_count_for_memory_reflects_persisted_rows() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        // Before any chunk rows exist (single-chunk bodies live inline) → 0.
        assert_eq!(count_for_memory(&conn, memory_id).unwrap(), 0);

        let fixtures = vec![
            make_chunk(memory_id, 0, "a", 0, 1, 1),
            make_chunk(memory_id, 1, "b", 1, 2, 1),
        ];
        insert_chunks(&conn, &fixtures).unwrap();
        assert_eq!(count_for_memory(&conn, memory_id).unwrap(), 2);

        // An id that was never inserted → 0.
        assert_eq!(count_for_memory(&conn, 9999).unwrap(), 0);
    }

    #[test]
    fn test_delete_chunks_removes_all() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        let fixtures = vec![
            make_chunk(memory_id, 0, "chunk a", 0, 7, 2),
            make_chunk(memory_id, 1, "chunk b", 8, 15, 2),
        ];
        insert_chunks(&conn, &fixtures).unwrap();

        delete_chunks(&conn, memory_id).unwrap();

        let remaining = get_chunks_by_memory(&conn, memory_id).unwrap();
        assert!(remaining.is_empty());
    }

    #[test]
    fn test_delete_chunks_memory_without_chunks_ok() {
        let (_dir, conn) = setup_db();
        assert!(delete_chunks(&conn, 9999).is_ok());
    }

    #[test]
    fn test_get_chunks_ordered_by_chunk_idx() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        // Inserted deliberately out of order; the read must sort them.
        let fixtures = vec![
            make_chunk(memory_id, 2, "terceiro", 20, 28, 1),
            make_chunk(memory_id, 0, "primeiro", 0, 8, 1),
            make_chunk(memory_id, 1, "segundo", 9, 16, 1),
        ];
        insert_chunks(&conn, &fixtures).unwrap();

        let fetched = get_chunks_by_memory(&conn, memory_id).unwrap();
        let order: Vec<i32> = fetched.iter().map(|c| c.chunk_idx).collect();
        assert_eq!(order, vec![0, 1, 2]);
    }

    #[test]
    #[serial_test::serial(env)]
    fn test_upsert_chunk_vec_and_knn_search() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        insert_chunks(&conn, &[make_chunk(memory_id, 0, "embedding test", 0, 14, 2)]).unwrap();

        // Unit vector along the first axis.
        let mut embedding = vec![0.0f32; embedding_dim()];
        embedding[0] = 1.0;

        let chunk_id: i64 = conn
            .query_row(
                "SELECT id FROM memory_chunks WHERE memory_id = ?1 AND chunk_idx = 0",
                params![memory_id],
                |row| row.get(0),
            )
            .unwrap();

        upsert_chunk_vec(&conn, chunk_id, memory_id, 0, &embedding).unwrap();

        let hits = knn_search_chunks(&conn, &embedding, 1).unwrap();
        assert_eq!(hits.len(), 1);
        let (hit_memory, hit_idx, _score) = hits[0];
        assert_eq!(hit_memory, memory_id);
        assert_eq!(hit_idx, 0);
    }

    #[test]
    #[serial_test::serial(env)]
    fn test_knn_search_chunks_without_data_returns_empty() {
        let (_dir, conn) = setup_db();
        let embedding = vec![0.0f32; embedding_dim()];
        let hits = knn_search_chunks(&conn, &embedding, 5).unwrap();
        assert!(hits.is_empty());
    }

    #[test]
    fn test_insert_chunks_invalid_fk_fails() {
        let (_dir, conn) = setup_db();
        // No `memories` row with this id exists.
        let orphan = make_chunk(99999, 0, "sem pai", 0, 7, 1);
        assert!(insert_chunks(&conn, &[orphan]).is_err());
    }
}
419}