Skip to main content

sqlite_graphrag/storage/
chunks.rs

//! Chunk storage CRUD for multi-chunk memories.
//!
//! Manages the `memory_chunks` table: inserts embeddings for bodies that
//! exceed the 512-token E5 limit and queries chunks for vector search.

// src/storage/chunks.rs
// Chunk storage for bodies exceeding the 512-token E5 limit.
8
9use crate::embedder::f32_to_bytes;
10use crate::errors::AppError;
11use rusqlite::{params, Connection};
12
/// One row of the `memory_chunks` table: a slice of a memory body.
///
/// The fields map one-to-one onto the columns written by [`insert_chunks`]
/// and read back by [`get_chunks_by_memory`].
#[derive(Debug, Clone)]
pub struct Chunk {
    /// Id of the memory this chunk belongs to (`memory_chunks.memory_id`).
    pub memory_id: i64,
    /// Position of the chunk within its memory; reads are ordered by this.
    pub chunk_idx: i32,
    /// Text content of the chunk.
    pub chunk_text: String,
    /// Start of the chunk within the memory body.
    // NOTE(review): presumably a byte offset into the body; confirm against `crate::chunking`.
    pub start_offset: i32,
    /// End of the chunk within the memory body (same unit as `start_offset`).
    pub end_offset: i32,
    /// Number of tokens attributed to the chunk.
    pub token_count: i32,
}
22
23pub fn insert_chunks(conn: &Connection, chunks: &[Chunk]) -> Result<(), AppError> {
24    for chunk in chunks {
25        conn.execute(
26            "INSERT INTO memory_chunks (memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count)
27             VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
28            params![
29                chunk.memory_id,
30                chunk.chunk_idx,
31                chunk.chunk_text,
32                chunk.start_offset,
33                chunk.end_offset,
34                chunk.token_count,
35            ],
36        )?;
37    }
38    Ok(())
39}
40
41pub fn insert_chunk_slices(
42    conn: &Connection,
43    memory_id: i64,
44    body: &str,
45    chunks: &[crate::chunking::Chunk],
46) -> Result<(), AppError> {
47    for (chunk_idx, chunk) in chunks.iter().enumerate() {
48        conn.execute(
49            "INSERT INTO memory_chunks (memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count)
50             VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
51            params![
52                memory_id,
53                chunk_idx as i32,
54                crate::chunking::chunk_text(body, chunk),
55                chunk.start_offset as i32,
56                chunk.end_offset as i32,
57                chunk.token_count_approx as i32,
58            ],
59        )?;
60    }
61    Ok(())
62}
63
64pub fn upsert_chunk_vec(
65    conn: &Connection,
66    _rowid: i64,
67    memory_id: i64,
68    chunk_idx: i32,
69    embedding: &[f32],
70) -> Result<(), AppError> {
71    conn.execute(
72        "INSERT OR REPLACE INTO chunk_embeddings(chunk_id, memory_id, embedding, source, model, dim)
73         VALUES (
74             (SELECT id FROM memory_chunks WHERE memory_id = ?1 AND chunk_idx = ?2),
75             ?1, ?3, 'llm-headless', ?4, ?5
76         )",
77        params![
78            memory_id,
79            chunk_idx,
80            f32_to_bytes(embedding),
81            crate::constants::SQLITE_GRAPHRAG_VERSION,
82            crate::constants::EMBEDDING_DIM as i64,
83        ],
84    )?;
85    Ok(())
86}
87
88pub fn delete_chunks(conn: &Connection, memory_id: i64) -> Result<(), AppError> {
89    conn.execute(
90        "DELETE FROM memory_chunks WHERE memory_id = ?1",
91        params![memory_id],
92    )?;
93    Ok(())
94}
95
96pub fn knn_search_chunks(
97    conn: &Connection,
98    embedding: &[f32],
99    k: usize,
100) -> Result<Vec<(i64, i32, f32)>, AppError> {
101    if embedding.len() != crate::constants::EMBEDDING_DIM {
102        return Err(AppError::Embedding(format!(
103            "knn_search_chunks embedding has {} dims, expected {}",
104            embedding.len(),
105            crate::constants::EMBEDDING_DIM
106        )));
107    }
108    // v1.0.76: full table scan + in-process cosine similarity. The
109    // `chunk_embeddings` table no longer has a `distance` column;
110    // similarity is computed in Rust.
111    let mut stmt =
112        conn.prepare_cached("SELECT chunk_id, memory_id, embedding FROM chunk_embeddings")?;
113    let mut scored: Vec<(i64, i32, f32)> = stmt
114        .query_map([], |r| {
115            let chunk_id: i64 = r.get(0)?;
116            let memory_id: i64 = r.get(1)?;
117            let bytes: Vec<u8> = r.get(2)?;
118            Ok((chunk_id, memory_id, bytes))
119        })?
120        .filter_map(|row| {
121            row.ok().and_then(|(_, memory_id, bytes)| {
122                let stored = crate::embedder::bytes_to_f32(&bytes);
123                if stored.len() != embedding.len() {
124                    return None;
125                }
126                let score = crate::similarity::cosine_similarity(embedding, &stored);
127                Some((memory_id, 0, score))
128            })
129        })
130        .collect();
131    scored.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
132    scored.truncate(k);
133    Ok(scored)
134}
135
136pub fn get_chunks_by_memory(conn: &Connection, memory_id: i64) -> Result<Vec<Chunk>, AppError> {
137    let mut stmt = conn.prepare_cached(
138        "SELECT memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count
139         FROM memory_chunks WHERE memory_id = ?1 ORDER BY chunk_idx",
140    )?;
141    let rows = stmt
142        .query_map(params![memory_id], |r| {
143            Ok(Chunk {
144                memory_id: r.get(0)?,
145                chunk_idx: r.get(1)?,
146                chunk_text: r.get(2)?,
147                start_offset: r.get(3)?,
148                end_offset: r.get(4)?,
149                token_count: r.get(5)?,
150            })
151        })?
152        .collect::<Result<Vec<_>, _>>()?;
153    Ok(rows)
154}
155
#[cfg(test)]
mod tests {
    use super::*;
    use crate::constants::EMBEDDING_DIM;
    use crate::storage::connection::register_vec_extension;
    use rusqlite::Connection;
    use tempfile::TempDir;

    /// Opens a migrated database file inside a fresh temporary directory.
    ///
    /// The `TempDir` is returned alongside the connection so the caller keeps
    /// it alive for the duration of the test.
    fn setup_db() -> (TempDir, Connection) {
        register_vec_extension();
        let dir = TempDir::new().unwrap();
        let path = dir.path().join("test.db");
        let mut conn = Connection::open(&path).unwrap();
        crate::migrations::runner().run(&mut conn).unwrap();
        (dir, conn)
    }

    /// Inserts a fixed parent row into `memories` and returns its rowid.
    fn insert_memory(conn: &Connection) -> i64 {
        conn.execute(
            "INSERT INTO memories (namespace, name, type, description, body, body_hash)
             VALUES ('global', 'test-mem', 'user', 'desc', 'body', 'hash1')",
            [],
        )
        .unwrap();
        conn.last_insert_rowid()
    }

    /// Builds a `Chunk` from positional values to keep the fixtures compact.
    fn make_chunk(
        memory_id: i64,
        chunk_idx: i32,
        text: &str,
        start_offset: i32,
        end_offset: i32,
        token_count: i32,
    ) -> Chunk {
        Chunk {
            memory_id,
            chunk_idx,
            chunk_text: text.to_string(),
            start_offset,
            end_offset,
            token_count,
        }
    }

    #[test]
    fn test_insert_chunks_empty_ok() {
        let (_dir, conn) = setup_db();
        assert!(insert_chunks(&conn, &[]).is_ok());
    }

    #[test]
    fn test_insert_chunks_and_get_by_memory() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        let chunks = vec![
            make_chunk(memory_id, 0, "primeiro chunk", 0, 14, 3),
            make_chunk(memory_id, 1, "segundo chunk", 15, 28, 3),
        ];
        insert_chunks(&conn, &chunks).unwrap();

        let stored = get_chunks_by_memory(&conn, memory_id).unwrap();
        assert_eq!(stored.len(), 2);

        let first = &stored[0];
        assert_eq!(first.chunk_idx, 0);
        assert_eq!(first.chunk_text, "primeiro chunk");
        assert_eq!(first.start_offset, 0);
        assert_eq!(first.end_offset, 14);
        assert_eq!(first.token_count, 3);

        let second = &stored[1];
        assert_eq!(second.chunk_idx, 1);
        assert_eq!(second.chunk_text, "segundo chunk");
    }

    #[test]
    fn test_get_chunks_missing_memory_returns_empty() {
        let (_dir, conn) = setup_db();
        let stored = get_chunks_by_memory(&conn, 9999).unwrap();
        assert!(stored.is_empty());
    }

    #[test]
    fn test_delete_chunks_removes_all() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        let chunks = vec![
            make_chunk(memory_id, 0, "chunk a", 0, 7, 2),
            make_chunk(memory_id, 1, "chunk b", 8, 15, 2),
        ];
        insert_chunks(&conn, &chunks).unwrap();

        delete_chunks(&conn, memory_id).unwrap();

        let stored = get_chunks_by_memory(&conn, memory_id).unwrap();
        assert!(stored.is_empty());
    }

    #[test]
    fn test_delete_chunks_memory_without_chunks_ok() {
        let (_dir, conn) = setup_db();
        assert!(delete_chunks(&conn, 9999).is_ok());
    }

    #[test]
    fn test_get_chunks_ordered_by_chunk_idx() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        // Deliberately inserted out of order; the read must sort by chunk_idx.
        let chunks = vec![
            make_chunk(memory_id, 2, "terceiro", 20, 28, 1),
            make_chunk(memory_id, 0, "primeiro", 0, 8, 1),
            make_chunk(memory_id, 1, "segundo", 9, 16, 1),
        ];
        insert_chunks(&conn, &chunks).unwrap();

        let stored = get_chunks_by_memory(&conn, memory_id).unwrap();
        assert_eq!(stored.len(), 3);
        assert_eq!(stored[0].chunk_idx, 0);
        assert_eq!(stored[1].chunk_idx, 1);
        assert_eq!(stored[2].chunk_idx, 2);
    }

    #[test]
    fn test_upsert_chunk_vec_and_knn_search() {
        let (_dir, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        insert_chunks(
            &conn,
            &[make_chunk(memory_id, 0, "embedding test", 0, 14, 2)],
        )
        .unwrap();

        // Unit vector along the first axis.
        let mut embedding = vec![0.0f32; EMBEDDING_DIM];
        embedding[0] = 1.0;

        let chunk_id: i64 = conn
            .query_row(
                "SELECT id FROM memory_chunks WHERE memory_id = ?1 AND chunk_idx = 0",
                params![memory_id],
                |row| row.get(0),
            )
            .unwrap();

        upsert_chunk_vec(&conn, chunk_id, memory_id, 0, &embedding).unwrap();

        let hits = knn_search_chunks(&conn, &embedding, 1).unwrap();
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].0, memory_id);
        assert_eq!(hits[0].1, 0);
    }

    #[test]
    fn test_knn_search_chunks_without_data_returns_empty() {
        let (_dir, conn) = setup_db();
        let embedding = vec![0.0f32; EMBEDDING_DIM];
        let hits = knn_search_chunks(&conn, &embedding, 5).unwrap();
        assert!(hits.is_empty());
    }

    #[test]
    fn test_insert_chunks_invalid_fk_fails() {
        let (_dir, conn) = setup_db();
        // No `memories` row with this id exists, so the insert must be rejected.
        let orphan = make_chunk(99999, 0, "sem pai", 0, 7, 1);
        assert!(insert_chunks(&conn, &[orphan]).is_err());
    }
}
368}