Skip to main content

sqlite_graphrag/storage/
chunks.rs

1//! Chunk storage CRUD for multi-chunk memories.
2//!
3//! Manages the `memory_chunks` table: insert embeddings for bodies that
4//! exceed the 512-token E5 limit and query chunks for vector search.
5
6// src/storage/chunks.rs
7// Chunk storage for bodies exceeding 512 tokens E5 limit
8
9use crate::embedder::f32_to_bytes;
10use crate::errors::AppError;
11use rusqlite::{params, Connection};
12
/// One stored row of the `memory_chunks` table.
///
/// This is the shape written by [`insert_chunks`] and read back by
/// [`get_chunks_by_memory`]. Equality compares every field, which makes
/// round-trip checks (`inserted == fetched`) straightforward.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    /// Id of the owning row in `memories`.
    pub memory_id: i64,
    /// Position of this chunk within its memory; reads are ordered by it.
    pub chunk_idx: i32,
    /// Text content of the chunk.
    pub chunk_text: String,
    /// Start of the chunk within the memory body.
    /// NOTE(review): unit (bytes vs. chars) is not visible here — confirm
    /// against `crate::chunking`.
    pub start_offset: i32,
    /// End of the chunk within the memory body (same unit as `start_offset`).
    pub end_offset: i32,
    /// Token count recorded for the chunk.
    pub token_count: i32,
}
22
23pub fn insert_chunks(conn: &Connection, chunks: &[Chunk]) -> Result<(), AppError> {
24    for chunk in chunks {
25        conn.execute(
26            "INSERT INTO memory_chunks (memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count)
27             VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
28            params![
29                chunk.memory_id,
30                chunk.chunk_idx,
31                chunk.chunk_text,
32                chunk.start_offset,
33                chunk.end_offset,
34                chunk.token_count,
35            ],
36        )?;
37    }
38    Ok(())
39}
40
41pub fn insert_chunk_slices(
42    conn: &Connection,
43    memory_id: i64,
44    body: &str,
45    chunks: &[crate::chunking::Chunk],
46) -> Result<(), AppError> {
47    for (chunk_idx, chunk) in chunks.iter().enumerate() {
48        conn.execute(
49            "INSERT INTO memory_chunks (memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count)
50             VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
51            params![
52                memory_id,
53                chunk_idx as i32,
54                crate::chunking::chunk_text(body, chunk),
55                chunk.start_offset as i32,
56                chunk.end_offset as i32,
57                chunk.token_count_approx as i32,
58            ],
59        )?;
60    }
61    Ok(())
62}
63
64pub fn upsert_chunk_vec(
65    conn: &Connection,
66    _rowid: i64,
67    memory_id: i64,
68    chunk_idx: i32,
69    embedding: &[f32],
70) -> Result<(), AppError> {
71    conn.execute(
72        "INSERT OR REPLACE INTO vec_chunks(rowid, memory_id, chunk_idx, embedding)
73         VALUES (
74             (SELECT id FROM memory_chunks WHERE memory_id = ?1 AND chunk_idx = ?2),
75             ?1, ?2, ?3
76         )",
77        params![memory_id, chunk_idx, f32_to_bytes(embedding)],
78    )?;
79    Ok(())
80}
81
82pub fn delete_chunks(conn: &Connection, memory_id: i64) -> Result<(), AppError> {
83    conn.execute(
84        "DELETE FROM memory_chunks WHERE memory_id = ?1",
85        params![memory_id],
86    )?;
87    Ok(())
88}
89
90pub fn knn_search_chunks(
91    conn: &Connection,
92    embedding: &[f32],
93    k: usize,
94) -> Result<Vec<(i64, i32, f32)>, AppError> {
95    let bytes = f32_to_bytes(embedding);
96    let mut stmt = conn.prepare(
97        "SELECT memory_id, chunk_idx, distance FROM vec_chunks
98         WHERE embedding MATCH ?1
99         ORDER BY distance LIMIT ?2",
100    )?;
101    let rows = stmt
102        .query_map(params![bytes, k as i64], |r| {
103            Ok((
104                r.get::<_, i64>(0)?,
105                r.get::<_, i32>(1)?,
106                r.get::<_, f32>(2)?,
107            ))
108        })?
109        .collect::<Result<Vec<_>, _>>()?;
110    Ok(rows)
111}
112
113pub fn get_chunks_by_memory(conn: &Connection, memory_id: i64) -> Result<Vec<Chunk>, AppError> {
114    let mut stmt = conn.prepare(
115        "SELECT memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count
116         FROM memory_chunks WHERE memory_id = ?1 ORDER BY chunk_idx",
117    )?;
118    let rows = stmt
119        .query_map(params![memory_id], |r| {
120            Ok(Chunk {
121                memory_id: r.get(0)?,
122                chunk_idx: r.get(1)?,
123                chunk_text: r.get(2)?,
124                start_offset: r.get(3)?,
125                end_offset: r.get(4)?,
126                token_count: r.get(5)?,
127            })
128        })?
129        .collect::<Result<Vec<_>, _>>()?;
130    Ok(rows)
131}
132
#[cfg(test)]
mod tests {
    use super::*;
    use crate::constants::EMBEDDING_DIM;
    use crate::storage::connection::register_vec_extension;
    use rusqlite::Connection;
    use tempfile::TempDir;

    /// Opens a fresh database file inside a temporary directory and runs the
    /// project migrations on it. The `TempDir` is returned alongside the
    /// connection so the caller keeps it alive for the duration of the test.
    fn setup_db() -> (TempDir, Connection) {
        register_vec_extension();
        let dir = TempDir::new().unwrap();
        let mut conn = Connection::open(dir.path().join("test.db")).unwrap();
        crate::migrations::runner().run(&mut conn).unwrap();
        (dir, conn)
    }

    /// Inserts a minimal parent row into `memories` and returns its rowid.
    fn insert_memory(conn: &Connection) -> i64 {
        conn.execute(
            "INSERT INTO memories (namespace, name, type, description, body, body_hash)
             VALUES ('global', 'test-mem', 'user', 'desc', 'body', 'hash1')",
            [],
        )
        .unwrap();
        conn.last_insert_rowid()
    }

    /// Builds a `Chunk` from positional values to keep the fixtures compact.
    fn chunk_row(
        memory_id: i64,
        chunk_idx: i32,
        text: &str,
        start_offset: i32,
        end_offset: i32,
        token_count: i32,
    ) -> Chunk {
        Chunk {
            memory_id,
            chunk_idx,
            chunk_text: text.to_string(),
            start_offset,
            end_offset,
            token_count,
        }
    }

    // Inserting an empty slice succeeds.
    #[test]
    fn test_insert_chunks_vazia_ok() {
        let (_dir, conn) = setup_db();
        assert!(insert_chunks(&conn, &[]).is_ok());
    }

    // Inserted chunks are returned with all their fields intact.
    #[test]
    fn test_insert_chunks_e_get_por_memory() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        let fixtures = vec![
            chunk_row(memory_id, 0, "primeiro chunk", 0, 14, 3),
            chunk_row(memory_id, 1, "segundo chunk", 15, 28, 3),
        ];
        insert_chunks(&conn, &fixtures).unwrap();

        let fetched = get_chunks_by_memory(&conn, memory_id).unwrap();
        assert_eq!(fetched.len(), 2);

        let first = &fetched[0];
        assert_eq!(first.chunk_idx, 0);
        assert_eq!(first.chunk_text, "primeiro chunk");
        assert_eq!(first.start_offset, 0);
        assert_eq!(first.end_offset, 14);
        assert_eq!(first.token_count, 3);

        let second = &fetched[1];
        assert_eq!(second.chunk_idx, 1);
        assert_eq!(second.chunk_text, "segundo chunk");
    }

    // Looking up an unknown memory yields an empty list, not an error.
    #[test]
    fn test_get_chunks_memory_inexistente_retorna_vazio() {
        let (_dir, conn) = setup_db();
        let fetched = get_chunks_by_memory(&conn, 9999).unwrap();
        assert!(fetched.is_empty());
    }

    // delete_chunks removes every chunk of the given memory.
    #[test]
    fn test_delete_chunks_remove_todos() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        let fixtures = vec![
            chunk_row(memory_id, 0, "chunk a", 0, 7, 2),
            chunk_row(memory_id, 1, "chunk b", 8, 15, 2),
        ];
        insert_chunks(&conn, &fixtures).unwrap();

        delete_chunks(&conn, memory_id).unwrap();

        let remaining = get_chunks_by_memory(&conn, memory_id).unwrap();
        assert!(remaining.is_empty());
    }

    // Deleting for a memory without chunks succeeds.
    #[test]
    fn test_delete_chunks_memory_sem_chunks_ok() {
        let (_dir, conn) = setup_db();
        assert!(delete_chunks(&conn, 9999).is_ok());
    }

    // Chunks come back sorted by chunk_idx regardless of insertion order.
    #[test]
    fn test_get_chunks_ordenados_por_chunk_idx() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        let fixtures = vec![
            chunk_row(memory_id, 2, "terceiro", 20, 28, 1),
            chunk_row(memory_id, 0, "primeiro", 0, 8, 1),
            chunk_row(memory_id, 1, "segundo", 9, 16, 1),
        ];
        insert_chunks(&conn, &fixtures).unwrap();

        let fetched = get_chunks_by_memory(&conn, memory_id).unwrap();
        let order: Vec<i32> = fetched.iter().map(|c| c.chunk_idx).collect();
        assert_eq!(order, vec![0, 1, 2]);
    }

    // A stored embedding is found by a KNN query with the same vector.
    #[test]
    fn test_upsert_chunk_vec_e_knn_search() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        insert_chunks(&conn, &[chunk_row(memory_id, 0, "embedding test", 0, 14, 2)]).unwrap();

        let mut embedding = vec![0.0f32; EMBEDDING_DIM];
        embedding[0] = 1.0;

        let chunk_id: i64 = conn
            .query_row(
                "SELECT id FROM memory_chunks WHERE memory_id = ?1 AND chunk_idx = 0",
                params![memory_id],
                |row| row.get(0),
            )
            .unwrap();

        upsert_chunk_vec(&conn, chunk_id, memory_id, 0, &embedding).unwrap();

        let hits = knn_search_chunks(&conn, &embedding, 1).unwrap();
        assert_eq!(hits.len(), 1);
        let (hit_memory_id, hit_chunk_idx, _distance) = hits[0];
        assert_eq!(hit_memory_id, memory_id);
        assert_eq!(hit_chunk_idx, 0);
    }

    // A KNN query over an empty vector table returns no rows.
    #[test]
    fn test_knn_search_chunks_sem_dados_retorna_vazio() {
        let (_dir, conn) = setup_db();
        let embedding = vec![0.0f32; EMBEDDING_DIM];
        let hits = knn_search_chunks(&conn, &embedding, 5).unwrap();
        assert!(hits.is_empty());
    }

    // Inserting a chunk whose memory_id has no parent row must fail.
    #[test]
    fn test_insert_chunks_fk_invalida_falha() {
        let (_dir, conn) = setup_db();
        let orphan = chunk_row(99999, 0, "sem pai", 0, 7, 1);
        let outcome = insert_chunks(&conn, &[orphan]);
        assert!(outcome.is_err());
    }
}