Skip to main content

sqlite_graphrag/storage/
chunks.rs

1// src/storage/chunks.rs
// Chunk storage for bodies that exceed the 512-token E5 limit
3
4use crate::embedder::f32_to_bytes;
5use crate::errors::AppError;
6use rusqlite::{params, Connection};
7
/// One persisted chunk of a memory body, mirroring a row of `memory_chunks`.
#[derive(Clone, Debug)]
pub struct Chunk {
    /// Id of the memory this chunk belongs to (`memory_chunks.memory_id`).
    pub memory_id: i64,
    /// Position of the chunk within its memory; reads are ordered by this value.
    pub chunk_idx: i32,
    /// Text content of the chunk.
    pub chunk_text: String,
    /// Start offset of the chunk in the source body (unit is not fixed by this file).
    pub start_offset: i32,
    /// End offset of the chunk in the source body (unit is not fixed by this file).
    pub end_offset: i32,
    /// Token count recorded for the chunk.
    pub token_count: i32,
}
17
18pub fn insert_chunks(conn: &Connection, chunks: &[Chunk]) -> Result<(), AppError> {
19    for chunk in chunks {
20        conn.execute(
21            "INSERT INTO memory_chunks (memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count)
22             VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
23            params![
24                chunk.memory_id,
25                chunk.chunk_idx,
26                chunk.chunk_text,
27                chunk.start_offset,
28                chunk.end_offset,
29                chunk.token_count,
30            ],
31        )?;
32    }
33    Ok(())
34}
35
36pub fn insert_chunk_slices(
37    conn: &Connection,
38    memory_id: i64,
39    body: &str,
40    chunks: &[crate::chunking::Chunk],
41) -> Result<(), AppError> {
42    for (chunk_idx, chunk) in chunks.iter().enumerate() {
43        conn.execute(
44            "INSERT INTO memory_chunks (memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count)
45             VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
46            params![
47                memory_id,
48                chunk_idx as i32,
49                crate::chunking::chunk_text(body, chunk),
50                chunk.start_offset as i32,
51                chunk.end_offset as i32,
52                chunk.token_count_approx as i32,
53            ],
54        )?;
55    }
56    Ok(())
57}
58
59pub fn upsert_chunk_vec(
60    conn: &Connection,
61    _rowid: i64,
62    memory_id: i64,
63    chunk_idx: i32,
64    embedding: &[f32],
65) -> Result<(), AppError> {
66    conn.execute(
67        "INSERT OR REPLACE INTO vec_chunks(rowid, memory_id, chunk_idx, embedding)
68         VALUES (
69             (SELECT id FROM memory_chunks WHERE memory_id = ?1 AND chunk_idx = ?2),
70             ?1, ?2, ?3
71         )",
72        params![memory_id, chunk_idx, f32_to_bytes(embedding)],
73    )?;
74    Ok(())
75}
76
77pub fn delete_chunks(conn: &Connection, memory_id: i64) -> Result<(), AppError> {
78    conn.execute(
79        "DELETE FROM memory_chunks WHERE memory_id = ?1",
80        params![memory_id],
81    )?;
82    Ok(())
83}
84
85pub fn knn_search_chunks(
86    conn: &Connection,
87    embedding: &[f32],
88    k: usize,
89) -> Result<Vec<(i64, i32, f32)>, AppError> {
90    let bytes = f32_to_bytes(embedding);
91    let mut stmt = conn.prepare(
92        "SELECT memory_id, chunk_idx, distance FROM vec_chunks
93         WHERE embedding MATCH ?1
94         ORDER BY distance LIMIT ?2",
95    )?;
96    let rows = stmt
97        .query_map(params![bytes, k as i64], |r| {
98            Ok((
99                r.get::<_, i64>(0)?,
100                r.get::<_, i32>(1)?,
101                r.get::<_, f32>(2)?,
102            ))
103        })?
104        .collect::<Result<Vec<_>, _>>()?;
105    Ok(rows)
106}
107
108pub fn get_chunks_by_memory(conn: &Connection, memory_id: i64) -> Result<Vec<Chunk>, AppError> {
109    let mut stmt = conn.prepare(
110        "SELECT memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count
111         FROM memory_chunks WHERE memory_id = ?1 ORDER BY chunk_idx",
112    )?;
113    let rows = stmt
114        .query_map(params![memory_id], |r| {
115            Ok(Chunk {
116                memory_id: r.get(0)?,
117                chunk_idx: r.get(1)?,
118                chunk_text: r.get(2)?,
119                start_offset: r.get(3)?,
120                end_offset: r.get(4)?,
121                token_count: r.get(5)?,
122            })
123        })?
124        .collect::<Result<Vec<_>, _>>()?;
125    Ok(rows)
126}
127
#[cfg(test)]
mod tests {
    use super::*;
    use crate::constants::EMBEDDING_DIM;
    use crate::storage::connection::register_vec_extension;
    use rusqlite::Connection;
    use tempfile::TempDir;

    /// Opens a fresh database file inside a temporary directory and runs all
    /// migrations on it. The `TempDir` is returned alongside the connection so the
    /// directory stays alive for the duration of the test.
    fn setup_db() -> (TempDir, Connection) {
        register_vec_extension();
        let dir = TempDir::new().unwrap();
        let mut conn = Connection::open(dir.path().join("test.db")).unwrap();
        crate::migrations::runner().run(&mut conn).unwrap();
        (dir, conn)
    }

    /// Inserts a fixed parent row into `memories` and returns its rowid.
    fn insert_memory(conn: &Connection) -> i64 {
        conn.execute(
            "INSERT INTO memories (namespace, name, type, description, body, body_hash)
             VALUES ('global', 'test-mem', 'user', 'desc', 'body', 'hash1')",
            [],
        )
        .unwrap();
        conn.last_insert_rowid()
    }

    /// Builds a `Chunk` fixture from its six field values.
    fn chunk(
        memory_id: i64,
        chunk_idx: i32,
        text: &str,
        start_offset: i32,
        end_offset: i32,
        token_count: i32,
    ) -> Chunk {
        Chunk {
            memory_id,
            chunk_idx,
            chunk_text: text.to_string(),
            start_offset,
            end_offset,
            token_count,
        }
    }

    // Inserting an empty slice succeeds.
    #[test]
    fn test_insert_chunks_vazia_ok() {
        let (_dir, conn) = setup_db();
        assert!(insert_chunks(&conn, &[]).is_ok());
    }

    // Inserted chunks can be read back with all fields intact.
    #[test]
    fn test_insert_chunks_e_get_por_memory() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        let fixtures = [
            chunk(memory_id, 0, "primeiro chunk", 0, 14, 3),
            chunk(memory_id, 1, "segundo chunk", 15, 28, 3),
        ];
        insert_chunks(&conn, &fixtures).unwrap();

        let stored = get_chunks_by_memory(&conn, memory_id).unwrap();
        assert_eq!(stored.len(), 2);

        let first = &stored[0];
        assert_eq!(first.chunk_idx, 0);
        assert_eq!(first.chunk_text, "primeiro chunk");
        assert_eq!(first.start_offset, 0);
        assert_eq!(first.end_offset, 14);
        assert_eq!(first.token_count, 3);

        let second = &stored[1];
        assert_eq!(second.chunk_idx, 1);
        assert_eq!(second.chunk_text, "segundo chunk");
    }

    // Reading chunks of an unknown memory yields an empty list.
    #[test]
    fn test_get_chunks_memory_inexistente_retorna_vazio() {
        let (_dir, conn) = setup_db();
        let stored = get_chunks_by_memory(&conn, 9999).unwrap();
        assert!(stored.is_empty());
    }

    // Deleting by memory id removes every chunk of that memory.
    #[test]
    fn test_delete_chunks_remove_todos() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        let fixtures = [
            chunk(memory_id, 0, "chunk a", 0, 7, 2),
            chunk(memory_id, 1, "chunk b", 8, 15, 2),
        ];
        insert_chunks(&conn, &fixtures).unwrap();

        delete_chunks(&conn, memory_id).unwrap();

        let remaining = get_chunks_by_memory(&conn, memory_id).unwrap();
        assert!(remaining.is_empty());
    }

    // Deleting chunks of a memory that has none is not an error.
    #[test]
    fn test_delete_chunks_memory_sem_chunks_ok() {
        let (_dir, conn) = setup_db();
        assert!(delete_chunks(&conn, 9999).is_ok());
    }

    // Chunks come back sorted by `chunk_idx` regardless of insertion order.
    #[test]
    fn test_get_chunks_ordenados_por_chunk_idx() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        let shuffled = [
            chunk(memory_id, 2, "terceiro", 20, 28, 1),
            chunk(memory_id, 0, "primeiro", 0, 8, 1),
            chunk(memory_id, 1, "segundo", 9, 16, 1),
        ];
        insert_chunks(&conn, &shuffled).unwrap();

        let stored = get_chunks_by_memory(&conn, memory_id).unwrap();
        let order: Vec<i32> = stored.iter().map(|c| c.chunk_idx).collect();
        assert_eq!(order, vec![0, 1, 2]);
    }

    // An upserted embedding is returned by a KNN search for the same vector.
    #[test]
    fn test_upsert_chunk_vec_e_knn_search() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        insert_chunks(&conn, &[chunk(memory_id, 0, "embedding test", 0, 14, 2)]).unwrap();

        let mut embedding = vec![0.0f32; EMBEDDING_DIM];
        embedding[0] = 1.0;

        let chunk_id: i64 = conn
            .query_row(
                "SELECT id FROM memory_chunks WHERE memory_id = ?1 AND chunk_idx = 0",
                params![memory_id],
                |row| row.get(0),
            )
            .unwrap();

        upsert_chunk_vec(&conn, chunk_id, memory_id, 0, &embedding).unwrap();

        let hits = knn_search_chunks(&conn, &embedding, 1).unwrap();
        assert_eq!(hits.len(), 1);
        let (hit_memory_id, hit_chunk_idx, _distance) = hits[0];
        assert_eq!(hit_memory_id, memory_id);
        assert_eq!(hit_chunk_idx, 0);
    }

    // A KNN search over an empty vector table returns no hits.
    #[test]
    fn test_knn_search_chunks_sem_dados_retorna_vazio() {
        let (_dir, conn) = setup_db();
        let embedding = vec![0.0f32; EMBEDDING_DIM];
        let hits = knn_search_chunks(&conn, &embedding, 5).unwrap();
        assert!(hits.is_empty());
    }

    // Inserting a chunk whose memory does not exist is rejected.
    #[test]
    fn test_insert_chunks_fk_invalida_falha() {
        let (_dir, conn) = setup_db();
        let orphan = chunk(99999, 0, "sem pai", 0, 7, 1);
        assert!(insert_chunks(&conn, &[orphan]).is_err());
    }
}
340}