Skip to main content

starpod_memory/
indexer.rs

1use sqlx::SqlitePool;
2
3use starpod_core::StarpodError;
4
/// A chunk of text extracted from a markdown file for FTS indexing.
///
/// Chunks are produced by [`chunk_text`]; consecutive chunks of the same
/// source may share lines because of the configured overlap.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    /// Identifier of the document the chunk came from, copied verbatim from
    /// the `source` argument given to the chunker.
    pub source: String,
    /// The chunk's lines joined with `\n` (no trailing newline).
    pub text: String,
    /// 1-indexed number of the first line contained in the chunk.
    pub line_start: usize,
    /// 1-indexed number of the last line contained in the chunk (inclusive).
    pub line_end: usize,
}
13
/// Default target chunk size in characters: about 400 tokens at roughly
/// 4 characters per token (1600 chars).
pub const CHUNK_SIZE: usize = 400 * 4;
/// Default overlap between consecutive chunks in characters: about 80 tokens
/// at roughly 4 characters per token (320 chars).
pub const CHUNK_OVERLAP: usize = 80 * 4;
18
19/// Split text into chunks with overlap, splitting at line boundaries.
20///
21/// `chunk_size` and `chunk_overlap` control the target chunk size and overlap
22/// in characters. Pass [`CHUNK_SIZE`] and [`CHUNK_OVERLAP`] for the defaults.
23pub fn chunk_text(source: &str, text: &str, chunk_size: usize, chunk_overlap: usize) -> Vec<Chunk> {
24    let lines: Vec<&str> = text.lines().collect();
25    if lines.is_empty() {
26        return Vec::new();
27    }
28
29    let mut chunks = Vec::new();
30    let mut start_line = 0;
31
32    while start_line < lines.len() {
33        let mut char_count = 0;
34        let mut end_line = start_line;
35
36        // Accumulate lines until we reach the chunk size
37        while end_line < lines.len() && char_count < chunk_size {
38            char_count += lines[end_line].len() + 1; // +1 for newline
39            end_line += 1;
40        }
41
42        let chunk_text: String = lines[start_line..end_line].join("\n");
43        if !chunk_text.trim().is_empty() {
44            chunks.push(Chunk {
45                source: source.to_string(),
46                text: chunk_text,
47                line_start: start_line + 1, // 1-indexed
48                line_end: end_line,
49            });
50        }
51
52        // Advance past the chunk, minus overlap
53        let mut overlap_chars = 0;
54        let mut overlap_lines = 0;
55        for i in (start_line..end_line).rev() {
56            overlap_chars += lines[i].len() + 1;
57            overlap_lines += 1;
58            if overlap_chars >= chunk_overlap {
59                break;
60            }
61        }
62
63        let advance = end_line - start_line;
64        if advance <= overlap_lines {
65            // Can't make progress — move forward by at least 1 line
66            start_line = end_line;
67        } else {
68            start_line = end_line - overlap_lines;
69        }
70    }
71
72    chunks
73}
74
75/// Delete all FTS entries for a given source, then insert new chunks.
76pub async fn reindex_source(
77    pool: &SqlitePool,
78    source: &str,
79    text: &str,
80    chunk_size: usize,
81    chunk_overlap: usize,
82) -> Result<(), StarpodError> {
83    // Delete old entries for this source
84    sqlx::query("DELETE FROM memory_fts WHERE source = ?1")
85        .bind(source)
86        .execute(pool)
87        .await
88        .map_err(|e| StarpodError::Database(format!("Failed to delete old chunks: {}", e)))?;
89
90    // Chunk and insert
91    let chunks = chunk_text(source, text, chunk_size, chunk_overlap);
92    for chunk in &chunks {
93        sqlx::query("INSERT INTO memory_fts (source, chunk_text, line_start, line_end) VALUES (?1, ?2, ?3, ?4)")
94            .bind(&chunk.source)
95            .bind(&chunk.text)
96            .bind(chunk.line_start as i64)
97            .bind(chunk.line_end as i64)
98            .execute(pool)
99            .await
100            .map_err(|e| StarpodError::Database(format!("Failed to insert chunk: {}", e)))?;
101    }
102
103    Ok(())
104}
105
#[cfg(test)]
mod tests {
    use super::*;

    /// Build `count` lines of exactly `width` ASCII characters each, every
    /// line distinct (its zero-padded index), so that assertions about which
    /// lines a chunk contains cannot pass by accident.
    fn numbered_lines(count: usize, width: usize) -> String {
        (0..count)
            .map(|i| format!("{:0width$}", i, width = width))
            .collect::<Vec<_>>()
            .join("\n")
    }

    #[test]
    fn test_chunk_text_small() {
        let text = "line one\nline two\nline three";
        let chunks = chunk_text("test.md", text, CHUNK_SIZE, CHUNK_OVERLAP);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].source, "test.md");
        assert_eq!(chunks[0].text, text);
        assert_eq!(chunks[0].line_start, 1);
        assert_eq!(chunks[0].line_end, 3);
    }

    #[test]
    fn test_chunk_text_empty() {
        let chunks = chunk_text("test.md", "", CHUNK_SIZE, CHUNK_OVERLAP);
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_chunk_text_large() {
        // 20 lines of 200 chars: larger than CHUNK_SIZE
        let text = numbered_lines(20, 200);
        let chunks = chunk_text("big.md", &text, CHUNK_SIZE, CHUNK_OVERLAP);
        assert!(chunks.len() > 1, "Should produce multiple chunks");
        // Every chunk should have content and a sane line range
        for chunk in &chunks {
            assert!(!chunk.text.trim().is_empty());
            assert!(chunk.line_start <= chunk.line_end);
        }
        // The chunks must span the whole input
        assert_eq!(chunks[0].line_start, 1);
        assert_eq!(chunks[chunks.len() - 1].line_end, 20);
    }

    #[test]
    fn test_chunk_text_custom_sizes() {
        // 20 lines of 200 chars each (~4000 chars)
        let long_text = numbered_lines(20, 200);

        let chunks_default = chunk_text("test.md", &long_text, CHUNK_SIZE, CHUNK_OVERLAP);
        let chunks_small = chunk_text("test.md", &long_text, 200, 50);

        // A smaller chunk_size must produce MORE chunks
        assert!(
            chunks_small.len() > chunks_default.len(),
            "Small chunk_size ({} chunks) should produce more chunks than default ({} chunks)",
            chunks_small.len(),
            chunks_default.len(),
        );
    }

    #[test]
    fn test_chunk_text_overlap() {
        // With 200-char lines, chunk_size=600 gives three lines per chunk and
        // overlap=200 makes each chunk repeat the previous chunk's last line.
        // Lines are distinct, so the containment check below is meaningful.
        let text = numbered_lines(20, 200);
        let chunks = chunk_text("test.md", &text, 600, 200);
        assert!(chunks.len() >= 2, "Should produce multiple chunks");

        for pair in chunks.windows(2) {
            let (current, next) = (&pair[0], &pair[1]);
            assert!(
                next.line_start <= current.line_end,
                "Chunk starting at line {} does not overlap the chunk ending at line {}",
                next.line_start,
                current.line_end,
            );
            let first_next_line = next.text.lines().next().expect("chunk has at least one line");
            assert!(
                current.text.lines().any(|line| line == first_next_line),
                "Overlap not respected: first line of the chunk starting at line {} not found in the previous chunk",
                next.line_start,
            );
        }
    }
}