Skip to main content

mirror_log/
chunk.rs

1use rusqlite::{Connection, Result};
2use uuid::Uuid;
3
/// One row of the `chunks` table: a contiguous slice of an event's content.
///
/// `Clone`, `PartialEq` and `Eq` are derived so query results can be copied
/// and compared directly (for example in tests).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    /// Identifier of the chunk row (`create_chunks` generates a UUID v4 string).
    pub id: String,
    /// Identifier of the event this chunk was cut from.
    pub event_id: String,
    /// Zero-based position of the chunk within its event.
    pub chunk_index: i64,
    /// Text of the chunk.
    pub content: String,
    /// Value of the `start_offset` column; `create_chunks` stores the byte
    /// offset in the event content where the chunk begins.
    pub start_offset: i64,
    /// Value of the `end_offset` column; `create_chunks` stores the byte
    /// offset in the event content just past the end of the chunk.
    pub end_offset: i64,
}
13
/// Split `content` into consecutive pieces of at most `max_chunk_size` bytes.
///
/// Every returned tuple is `(start, end, text)`: `start..end` is the byte
/// range inside `content` and `text` is an owned copy of that slice. The
/// ranges are contiguous and together cover the whole input.
///
/// Cuts always fall on UTF-8 character boundaries. When the size-limited
/// window contains whitespace after its first character, the cut is placed
/// immediately before the last such whitespace, so the following piece starts
/// with it. A single character wider than `max_chunk_size` is emitted whole,
/// which is the only case where a piece exceeds the limit.
///
/// Returns an empty vector when `content` is empty or `max_chunk_size` is 0.
pub fn chunk_content(content: &str, max_chunk_size: usize) -> Vec<(usize, usize, String)> {
    let total = content.len();
    let mut pieces = Vec::new();

    if total == 0 || max_chunk_size == 0 {
        return pieces;
    }

    let mut cursor = 0;
    while cursor < total {
        // Whatever is left fits in one piece: emit it and stop.
        if total - cursor <= max_chunk_size {
            pieces.push((cursor, total, content[cursor..].to_string()));
            break;
        }

        // Walk back from the size limit to the nearest character boundary.
        // `cursor` is itself a boundary, so this loop always terminates.
        let mut limit = cursor + max_chunk_size;
        while !content.is_char_boundary(limit) {
            limit -= 1;
        }
        if limit == cursor {
            // The first character alone is wider than the limit; take it
            // whole so the loop still advances.
            limit += content[cursor..]
                .chars()
                .next()
                .map_or(0, char::len_utf8);
        }

        // Prefer the last whitespace inside the window (ignoring position 0,
        // which would produce an empty piece); otherwise cut at the limit.
        let cut = content[cursor..limit]
            .char_indices()
            .rev()
            .find(|&(pos, ch)| pos > 0 && ch.is_whitespace())
            .map_or(limit, |(pos, _)| cursor + pos);

        pieces.push((cursor, cut, content[cursor..cut].to_string()));
        cursor = cut;
    }

    pieces
}
65
66/// Create chunks for an event
67pub fn create_chunks(
68    conn: &Connection,
69    event_id: &str,
70    content: &str,
71    timestamp: i64,
72    max_chunk_size: usize,
73) -> Result<usize> {
74    let chunks = chunk_content(content, max_chunk_size);
75
76    let mut count = 0;
77    for (idx, (start, end, chunk_content)) in chunks.iter().enumerate() {
78        let chunk_id = Uuid::new_v4().to_string();
79
80        conn.execute(
81            "INSERT INTO chunks (id, event_id, chunk_index, content, start_offset, end_offset, timestamp)
82             VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
83            (
84                &chunk_id,
85                event_id,
86                idx as i64,
87                chunk_content,
88                *start as i64,
89                *end as i64,
90                timestamp,
91            ),
92        )?;
93        count += 1;
94    }
95
96    Ok(count)
97}
98
99/// Search chunks
100pub fn search_chunks(conn: &Connection, term: &str, limit: Option<i64>) -> Result<Vec<Chunk>> {
101    let like = format!("%{}%", term);
102
103    let query = if let Some(lim) = limit {
104        format!(
105            "SELECT id, event_id, chunk_index, content, start_offset, end_offset
106             FROM chunks
107             WHERE content LIKE ?1
108             AND NOT EXISTS (
109                 SELECT 1 FROM shadow_state s WHERE s.event_id = chunks.event_id
110             )
111             ORDER BY timestamp DESC
112             LIMIT {}",
113            lim
114        )
115    } else {
116        "SELECT id, event_id, chunk_index, content, start_offset, end_offset
117         FROM chunks
118         WHERE content LIKE ?1
119         AND NOT EXISTS (
120             SELECT 1 FROM shadow_state s WHERE s.event_id = chunks.event_id
121         )
122         ORDER BY timestamp DESC"
123            .to_string()
124    };
125
126    let mut stmt = conn.prepare(&query)?;
127
128    let rows = stmt.query_map([like], |row| {
129        Ok(Chunk {
130            id: row.get(0)?,
131            event_id: row.get(1)?,
132            chunk_index: row.get(2)?,
133            content: row.get(3)?,
134            start_offset: row.get(4)?,
135            end_offset: row.get(5)?,
136        })
137    })?;
138
139    rows.collect()
140}
141
142/// List all chunks for an event
143pub fn list_chunks(conn: &Connection, event_id: &str) -> Result<Vec<Chunk>> {
144    let mut stmt = conn.prepare(
145        "SELECT id, event_id, chunk_index, content, start_offset, end_offset
146         FROM chunks
147         WHERE event_id = ?1
148         AND NOT EXISTS (
149             SELECT 1 FROM shadow_state s WHERE s.event_id = chunks.event_id
150         )
151         ORDER BY chunk_index ASC",
152    )?;
153
154    let rows = stmt.query_map([event_id], |row| {
155        Ok(Chunk {
156            id: row.get(0)?,
157            event_id: row.get(1)?,
158            chunk_index: row.get(2)?,
159            content: row.get(3)?,
160            start_offset: row.get(4)?,
161            end_offset: row.get(5)?,
162        })
163    })?;
164
165    rows.collect()
166}