mirror_log/
chunk.rs

1use rusqlite::{Connection, Result};
2use uuid::Uuid;
3
/// One stored piece of an event's content, as read back from the `chunks` table.
///
/// `Clone`, `PartialEq` and `Eq` are derived so callers can duplicate and compare
/// search results directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    /// Unique identifier of the chunk row.
    pub id: String,
    /// Identifier of the event this chunk was cut from.
    pub event_id: String,
    /// Zero-based position of this chunk within its event.
    pub chunk_index: i64,
    /// Text of the chunk.
    pub content: String,
    /// Offset into the event content at which this chunk starts, as recorded at insert time.
    pub start_offset: i64,
    /// Offset into the event content at which this chunk ends, as recorded at insert time.
    pub end_offset: i64,
}
13
/// Returns the end index (exclusive) of the piece of `text` that begins at `start`.
///
/// The piece is at most `max_len` bytes long and always ends on a UTF-8 character
/// boundary. It always contains at least one character, even when `max_len` is
/// smaller than that character, so repeated calls are guaranteed to make progress.
/// `start` must be a character boundary strictly inside `text`.
fn size_limited_end(text: &str, start: usize, max_len: usize) -> usize {
    let mut end = start.saturating_add(max_len).min(text.len());
    // Back off to the nearest character boundary so slicing cannot panic.
    while end > start && !text.is_char_boundary(end) {
        end -= 1;
    }
    if end == start {
        // The limit is smaller than the next character: take exactly that character.
        end = start + text[start..].chars().next().map_or(0, char::len_utf8);
    }
    end
}

/// Split content into chunks based on paragraphs or size.
///
/// Paragraphs are the pieces between blank lines (`"\n\n"`). Consecutive paragraphs
/// are packed into one chunk for as long as the chunk, plus the next paragraph and
/// its separator, stays within `max_chunk_size` bytes. A single paragraph that is
/// longer than `max_chunk_size` is cut into size-limited pieces on UTF-8 character
/// boundaries, each piece becoming its own chunk.
///
/// # Arguments
///
/// * `content` - the text to split.
/// * `max_chunk_size` - size limit in bytes. With `0`, paragraphs are never packed
///   together and never cut by size, so every non-empty paragraph is its own chunk.
///
/// # Returns
///
/// A vector of `(start, end, text)` tuples in document order. `start` and `end` are
/// byte offsets into `content`; `end` is where the following paragraph begins (the
/// separator after a chunk is attributed to that chunk) and never exceeds
/// `content.len()`. `text` is the chunk content with surrounding whitespace trimmed.
/// Empty `content` yields an empty vector.
pub fn chunk_content(content: &str, max_chunk_size: usize) -> Vec<(usize, usize, String)> {
    const SEPARATOR: &str = "\n\n";

    let mut chunks = Vec::new();
    let mut current_chunk = String::new();
    let mut chunk_start = 0;
    let mut current_pos = 0;

    for para in content.split(SEPARATOR) {
        // Length of the paragraph including the separator we split on.
        let para_len = para.len() + SEPARATOR.len();
        // Offset at which the next paragraph would start. After the last paragraph
        // this lies past the end of `content`, so it is clamped wherever it is stored.
        let next_pos = current_pos + para_len;
        let oversized = max_chunk_size > 0 && para.len() > max_chunk_size;

        // Close the pending chunk if this paragraph does not fit into it, or if the
        // paragraph has to be cut by size and therefore cannot be merged at all.
        if !current_chunk.is_empty()
            && (oversized || current_chunk.len() + para_len > max_chunk_size)
        {
            chunks.push((chunk_start, current_pos, current_chunk.trim().to_string()));
            current_chunk.clear();
            chunk_start = current_pos;
        }

        if oversized {
            let mut piece_start = 0;
            while piece_start < para.len() {
                let piece_end = size_limited_end(para, piece_start, max_chunk_size);
                // The last piece also covers the separator that follows the paragraph,
                // matching how whole-paragraph chunks report their end.
                let end_offset = if piece_end == para.len() {
                    next_pos.min(content.len())
                } else {
                    current_pos + piece_end
                };
                chunks.push((
                    current_pos + piece_start,
                    end_offset,
                    para[piece_start..piece_end].trim().to_string(),
                ));
                piece_start = piece_end;
            }
            chunk_start = next_pos;
        } else {
            if !current_chunk.is_empty() {
                current_chunk.push_str(SEPARATOR);
            }
            current_chunk.push_str(para);
        }

        current_pos = next_pos;
    }

    // Don't forget the last chunk. There is no separator after the final paragraph,
    // so the end offset is clamped to the real length of the content.
    if !current_chunk.is_empty() {
        chunks.push((
            chunk_start,
            current_pos.min(content.len()),
            current_chunk.trim().to_string(),
        ));
    }

    // Only reachable when `content` consists solely of paragraph separators: such
    // input is still returned, cut by size and left untrimmed.
    if chunks.is_empty() && !content.is_empty() {
        let mut start = 0;
        while start < content.len() {
            let end = size_limited_end(content, start, max_chunk_size);
            chunks.push((start, end, content[start..end].to_string()));
            start = end;
        }
    }

    chunks
}
60
61/// Create chunks for an event
62pub fn create_chunks(
63    conn: &Connection,
64    event_id: &str,
65    content: &str,
66    timestamp: i64,
67    max_chunk_size: usize,
68) -> Result<usize> {
69    let chunks = chunk_content(content, max_chunk_size);
70
71    let mut count = 0;
72    for (idx, (start, end, chunk_content)) in chunks.iter().enumerate() {
73        let chunk_id = Uuid::new_v4().to_string();
74
75        conn.execute(
76            "INSERT INTO chunks (id, event_id, chunk_index, content, start_offset, end_offset, timestamp)
77             VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
78            (
79                &chunk_id,
80                event_id,
81                idx as i64,
82                chunk_content,
83                *start as i64,
84                *end as i64,
85                timestamp,
86            ),
87        )?;
88        count += 1;
89    }
90
91    Ok(count)
92}
93
94/// Search chunks
95pub fn search_chunks(conn: &Connection, term: &str, limit: Option<i64>) -> Result<Vec<Chunk>> {
96    let like = format!("%{}%", term);
97
98    let query = if let Some(lim) = limit {
99        format!(
100            "SELECT id, event_id, chunk_index, content, start_offset, end_offset
101             FROM chunks
102             WHERE content LIKE ?1
103             ORDER BY timestamp DESC
104             LIMIT {}",
105            lim
106        )
107    } else {
108        "SELECT id, event_id, chunk_index, content, start_offset, end_offset
109         FROM chunks
110         WHERE content LIKE ?1
111         ORDER BY timestamp DESC"
112            .to_string()
113    };
114
115    let mut stmt = conn.prepare(&query)?;
116
117    let rows = stmt.query_map([like], |row| {
118        Ok(Chunk {
119            id: row.get(0)?,
120            event_id: row.get(1)?,
121            chunk_index: row.get(2)?,
122            content: row.get(3)?,
123            start_offset: row.get(4)?,
124            end_offset: row.get(5)?,
125        })
126    })?;
127
128    Ok(rows.filter_map(Result::ok).collect())
129}