Skip to main content

sqlite_graphrag/storage/
chunks.rs

// src/storage/chunks.rs
// Chunk storage for memory bodies that exceed the 512-token E5 limit.
3
4use crate::embedder::f32_to_bytes;
5use crate::errors::AppError;
6use rusqlite::{params, Connection};
7
/// One stored chunk of a memory body, mirroring a row of the `memory_chunks` table.
///
/// Values of this type are written by [`insert_chunks`] and read back by
/// [`get_chunks_by_memory`]. Equality compares every field, which lets callers
/// and tests compare whole chunks directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    /// Value of the `memory_id` column: the memory this chunk belongs to.
    pub memory_id: i64,
    /// Value of the `chunk_idx` column; `get_chunks_by_memory` orders by it.
    pub chunk_idx: i32,
    /// Text content of the chunk (`chunk_text` column).
    pub chunk_text: String,
    /// Start offset of the chunk (`start_offset` column).
    // NOTE(review): unit (bytes vs. chars) is not established in this file — confirm.
    pub start_offset: i32,
    /// End offset of the chunk (`end_offset` column); same unit as `start_offset`.
    pub end_offset: i32,
    /// Token count recorded for the chunk (`token_count` column).
    pub token_count: i32,
}
17
18pub fn insert_chunks(conn: &Connection, chunks: &[Chunk]) -> Result<(), AppError> {
19    for chunk in chunks {
20        conn.execute(
21            "INSERT INTO memory_chunks (memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count)
22             VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
23            params![
24                chunk.memory_id,
25                chunk.chunk_idx,
26                chunk.chunk_text,
27                chunk.start_offset,
28                chunk.end_offset,
29                chunk.token_count,
30            ],
31        )?;
32    }
33    Ok(())
34}
35
36pub fn insert_chunk_slices(
37    conn: &Connection,
38    memory_id: i64,
39    chunks: &[crate::chunking::Chunk],
40) -> Result<(), AppError> {
41    for (chunk_idx, chunk) in chunks.iter().enumerate() {
42        conn.execute(
43            "INSERT INTO memory_chunks (memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count)
44             VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
45            params![
46                memory_id,
47                chunk_idx as i32,
48                chunk.text,
49                chunk.start_offset as i32,
50                chunk.end_offset as i32,
51                chunk.token_count_approx as i32,
52            ],
53        )?;
54    }
55    Ok(())
56}
57
58pub fn upsert_chunk_vec(
59    conn: &Connection,
60    _rowid: i64,
61    memory_id: i64,
62    chunk_idx: i32,
63    embedding: &[f32],
64) -> Result<(), AppError> {
65    conn.execute(
66        "INSERT OR REPLACE INTO vec_chunks(rowid, memory_id, chunk_idx, embedding)
67         VALUES (
68             (SELECT id FROM memory_chunks WHERE memory_id = ?1 AND chunk_idx = ?2),
69             ?1, ?2, ?3
70         )",
71        params![memory_id, chunk_idx, f32_to_bytes(embedding)],
72    )?;
73    Ok(())
74}
75
76pub fn delete_chunks(conn: &Connection, memory_id: i64) -> Result<(), AppError> {
77    conn.execute(
78        "DELETE FROM memory_chunks WHERE memory_id = ?1",
79        params![memory_id],
80    )?;
81    Ok(())
82}
83
84pub fn knn_search_chunks(
85    conn: &Connection,
86    embedding: &[f32],
87    k: usize,
88) -> Result<Vec<(i64, i32, f32)>, AppError> {
89    let bytes = f32_to_bytes(embedding);
90    let mut stmt = conn.prepare(
91        "SELECT memory_id, chunk_idx, distance FROM vec_chunks
92         WHERE embedding MATCH ?1
93         ORDER BY distance LIMIT ?2",
94    )?;
95    let rows = stmt
96        .query_map(params![bytes, k as i64], |r| {
97            Ok((
98                r.get::<_, i64>(0)?,
99                r.get::<_, i32>(1)?,
100                r.get::<_, f32>(2)?,
101            ))
102        })?
103        .collect::<Result<Vec<_>, _>>()?;
104    Ok(rows)
105}
106
107pub fn get_chunks_by_memory(conn: &Connection, memory_id: i64) -> Result<Vec<Chunk>, AppError> {
108    let mut stmt = conn.prepare(
109        "SELECT memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count
110         FROM memory_chunks WHERE memory_id = ?1 ORDER BY chunk_idx",
111    )?;
112    let rows = stmt
113        .query_map(params![memory_id], |r| {
114            Ok(Chunk {
115                memory_id: r.get(0)?,
116                chunk_idx: r.get(1)?,
117                chunk_text: r.get(2)?,
118                start_offset: r.get(3)?,
119                end_offset: r.get(4)?,
120                token_count: r.get(5)?,
121            })
122        })?
123        .collect::<Result<Vec<_>, _>>()?;
124    Ok(rows)
125}
126
#[cfg(test)]
mod tests {
    use super::*;
    use crate::constants::EMBEDDING_DIM;
    use crate::storage::connection::register_vec_extension;
    use rusqlite::Connection;
    use tempfile::TempDir;

    /// Opens a fresh database file in a temporary directory, with the vector
    /// extension registered and all migrations applied.
    ///
    /// The `TempDir` is returned alongside the connection so the directory is
    /// kept alive for the duration of the test.
    fn setup_db() -> (TempDir, Connection) {
        register_vec_extension();
        let dir = TempDir::new().unwrap();
        let mut conn = Connection::open(dir.path().join("test.db")).unwrap();
        crate::migrations::runner().run(&mut conn).unwrap();
        (dir, conn)
    }

    /// Inserts a minimal row into `memories` and returns its row id.
    fn insert_memory(conn: &Connection) -> i64 {
        conn.execute(
            "INSERT INTO memories (namespace, name, type, description, body, body_hash)
             VALUES ('global', 'test-mem', 'user', 'desc', 'body', 'hash1')",
            [],
        )
        .unwrap();
        conn.last_insert_rowid()
    }

    /// Builds a `Chunk` from positional values to keep the test bodies short.
    fn make_chunk(
        memory_id: i64,
        chunk_idx: i32,
        text: &str,
        start_offset: i32,
        end_offset: i32,
        token_count: i32,
    ) -> Chunk {
        Chunk {
            memory_id,
            chunk_idx,
            chunk_text: text.to_string(),
            start_offset,
            end_offset,
            token_count,
        }
    }

    // Inserting an empty slice succeeds.
    #[test]
    fn test_insert_chunks_vazia_ok() {
        let (_dir, conn) = setup_db();
        assert!(insert_chunks(&conn, &[]).is_ok());
    }

    // Inserted chunks can be read back with all fields intact.
    #[test]
    fn test_insert_chunks_e_get_por_memory() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        let input = [
            make_chunk(memory_id, 0, "primeiro chunk", 0, 14, 3),
            make_chunk(memory_id, 1, "segundo chunk", 15, 28, 3),
        ];
        insert_chunks(&conn, &input).unwrap();

        let stored = get_chunks_by_memory(&conn, memory_id).unwrap();
        assert_eq!(stored.len(), 2);

        let first = &stored[0];
        assert_eq!(first.chunk_idx, 0);
        assert_eq!(first.chunk_text, "primeiro chunk");
        assert_eq!(
            (first.start_offset, first.end_offset, first.token_count),
            (0, 14, 3)
        );

        let second = &stored[1];
        assert_eq!(second.chunk_idx, 1);
        assert_eq!(second.chunk_text, "segundo chunk");
    }

    // Looking up a memory id with no rows yields an empty list.
    #[test]
    fn test_get_chunks_memory_inexistente_retorna_vazio() {
        let (_dir, conn) = setup_db();
        let stored = get_chunks_by_memory(&conn, 9999).unwrap();
        assert!(stored.is_empty());
    }

    // Deleting by memory id removes every chunk of that memory.
    #[test]
    fn test_delete_chunks_remove_todos() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        let input = [
            make_chunk(memory_id, 0, "chunk a", 0, 7, 2),
            make_chunk(memory_id, 1, "chunk b", 8, 15, 2),
        ];
        insert_chunks(&conn, &input).unwrap();

        delete_chunks(&conn, memory_id).unwrap();

        let remaining = get_chunks_by_memory(&conn, memory_id).unwrap();
        assert!(remaining.is_empty());
    }

    // Deleting for a memory id with no chunks is not an error.
    #[test]
    fn test_delete_chunks_memory_sem_chunks_ok() {
        let (_dir, conn) = setup_db();
        assert!(delete_chunks(&conn, 9999).is_ok());
    }

    // Chunks inserted out of order come back sorted by `chunk_idx`.
    #[test]
    fn test_get_chunks_ordenados_por_chunk_idx() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        let input = [
            make_chunk(memory_id, 2, "terceiro", 20, 28, 1),
            make_chunk(memory_id, 0, "primeiro", 0, 8, 1),
            make_chunk(memory_id, 1, "segundo", 9, 16, 1),
        ];
        insert_chunks(&conn, &input).unwrap();

        let stored = get_chunks_by_memory(&conn, memory_id).unwrap();
        assert_eq!(stored.len(), 3);
        let order: Vec<i32> = stored.iter().map(|c| c.chunk_idx).collect();
        assert_eq!(order, vec![0, 1, 2]);
    }

    // A stored embedding is returned by a KNN search for the same vector.
    #[test]
    fn test_upsert_chunk_vec_e_knn_search() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        insert_chunks(&conn, &[make_chunk(memory_id, 0, "embedding test", 0, 14, 2)]).unwrap();

        let mut embedding = vec![0.0f32; EMBEDDING_DIM];
        embedding[0] = 1.0;

        let chunk_id: i64 = conn
            .query_row(
                "SELECT id FROM memory_chunks WHERE memory_id = ?1 AND chunk_idx = 0",
                params![memory_id],
                |row| row.get(0),
            )
            .unwrap();

        upsert_chunk_vec(&conn, chunk_id, memory_id, 0, &embedding).unwrap();

        let hits = knn_search_chunks(&conn, &embedding, 1).unwrap();
        assert_eq!(hits.len(), 1);
        let (hit_memory_id, hit_chunk_idx, _distance) = hits[0];
        assert_eq!(hit_memory_id, memory_id);
        assert_eq!(hit_chunk_idx, 0);
    }

    // Searching with no stored vectors yields no hits.
    #[test]
    fn test_knn_search_chunks_sem_dados_retorna_vazio() {
        let (_dir, conn) = setup_db();
        let embedding = vec![0.0f32; EMBEDDING_DIM];
        let hits = knn_search_chunks(&conn, &embedding, 5).unwrap();
        assert!(hits.is_empty());
    }

    // A chunk pointing at a memory id that was never inserted is rejected.
    #[test]
    fn test_insert_chunks_fk_invalida_falha() {
        let (_dir, conn) = setup_db();
        let orphan = make_chunk(99999, 0, "sem pai", 0, 7, 1);
        assert!(insert_chunks(&conn, &[orphan]).is_err());
    }
}