// lantern 0.2.2
//
// Local-first, provenance-aware semantic search for agent activity
// Documentation
//! Deterministic text chunker.
//!
//! Chunks are produced by scanning the input once and picking a break point
//! inside a fixed character window. Given identical input and options the
//! output is byte-for-byte identical. Char boundaries are respected so
//! multibyte UTF-8 input is safe.

/// One contiguous slice of an input text, as produced by [`chunk_text`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    /// Zero-based position of this chunk in the sequence returned by
    /// [`chunk_text`].
    pub ordinal: usize,
    /// Byte offset into the source text at which this chunk starts (inclusive).
    pub byte_start: usize,
    /// Byte offset into the source text at which this chunk ends (exclusive).
    pub byte_end: usize,
    /// Owned copy of the source text in `byte_start..byte_end`.
    pub text: String,
    // NOTE(review): the remaining fields are always `None` on chunks coming
    // out of `chunk_text`; nothing in this file populates them. They look like
    // provenance metadata attached by callers after chunking — confirm
    // against the call sites.
    /// Optional role label; not set by [`chunk_text`].
    pub role: Option<String>,
    /// Optional session identifier; not set by [`chunk_text`].
    pub session_id: Option<String>,
    /// Optional turn identifier; not set by [`chunk_text`].
    pub turn_id: Option<String>,
    /// Optional tool name; not set by [`chunk_text`].
    pub tool_name: Option<String>,
    /// Optional timestamp; not set by [`chunk_text`]. Presumably a Unix
    /// timestamp — the unit (seconds vs. milliseconds) is not established in
    /// this file; verify against the producer.
    pub timestamp_unix: Option<i64>,
}

/// Size limits that steer [`chunk_text`].
///
/// Both limits are counted in `char`s (Unicode scalar values), not bytes.
/// [`chunk_text`] asserts that `max_chars > 0` and `max_chars >= min_chars`.
#[derive(Debug, Clone, Copy)]
pub struct ChunkOptions {
    /// Width of the window, in chars, inside which a break point is chosen;
    /// no chunk is longer than this.
    pub max_chars: usize,
    /// Number of chars a chunk must hold before a soft break (paragraph,
    /// line or whitespace) is considered. The last chunk of a text takes
    /// whatever remains and may therefore be shorter.
    pub min_chars: usize,
}

impl Default for ChunkOptions {
    /// Windows of at most 1200 chars with soft breaks allowed from 200 chars on.
    fn default() -> Self {
        ChunkOptions {
            min_chars: 200,
            max_chars: 1200,
        }
    }
}

/// Split `text` into consecutive chunks of at most `opts.max_chars` chars.
///
/// The text is walked front to back. For every chunk a window of
/// `opts.max_chars` chars is laid over the remaining input; if the window
/// reaches the end of the text the remainder becomes the final chunk,
/// otherwise `pick_break` chooses where inside the window to cut. Each chunk
/// starts exactly where the previous one ended, so concatenating the `text`
/// of all chunks in order reproduces the input. Cuts are made at char
/// positions, so `byte_start` / `byte_end` always fall on UTF-8 boundaries.
///
/// The metadata fields of every returned [`Chunk`] (`role`, `session_id`,
/// `turn_id`, `tool_name`, `timestamp_unix`) are `None`.
///
/// An empty `text` yields an empty vector.
///
/// # Panics
///
/// Panics if `opts.max_chars` is zero or smaller than `opts.min_chars`.
pub fn chunk_text(text: &str, opts: ChunkOptions) -> Vec<Chunk> {
    assert!(
        opts.max_chars >= opts.min_chars && opts.max_chars > 0,
        "invalid chunk options: max={}, min={}",
        opts.max_chars,
        opts.min_chars
    );

    // (byte offset, char) for every char; indexing this by char position is
    // what lets the window be measured in chars while slicing by bytes.
    let positions: Vec<(usize, char)> = text.char_indices().collect();
    let total = positions.len();

    // Byte offset of the char at `idx`; the one-past-the-end position maps to
    // the length of the text.
    let byte_at = |idx: usize| positions.get(idx).map_or(text.len(), |&(offset, _)| offset);

    let mut out = Vec::new();
    let mut cursor = 0usize;

    // With empty input `total` is 0 and the loop body never runs.
    while cursor < total {
        let window_end = total.min(cursor + opts.max_chars);
        let cut = if window_end < total {
            pick_break(&positions, cursor, window_end, opts.min_chars)
        } else {
            // The window covers everything that is left: take it all.
            total
        };

        let byte_start = byte_at(cursor);
        let byte_end = byte_at(cut);
        let ordinal = out.len();

        out.push(Chunk {
            ordinal,
            byte_start,
            byte_end,
            text: text[byte_start..byte_end].to_string(),
            role: None,
            session_id: None,
            turn_id: None,
            tool_name: None,
            timestamp_unix: None,
        });

        cursor = cut;
    }

    out
}

/// Pick the char index at which the chunk starting at `start_char` ends.
///
/// Candidate cut positions are the indices `i` in `[lower, end_target)`,
/// where `lower` is `start_char + max(min_chars, 1)` clamped to `end_target`.
/// A candidate qualifies when the char(s) immediately *before* it form a
/// break, so the break characters stay at the tail of the chunk being cut.
/// The kinds of break are tried in this order, each scanned backwards from
/// the end of the window so that the latest qualifying position wins:
///
/// 1. paragraph break — `chars[i - 2]` and `chars[i - 1]` are both `'\n'`,
/// 2. line break — `chars[i - 1]` is `'\n'`,
/// 3. whitespace — `chars[i - 1].is_whitespace()`.
///
/// If no candidate qualifies the chunk is hard-cut at `end_target`.
///
/// `chars` is the `(byte offset, char)` list of the whole text; only the
/// `char` component is inspected. `end_target` must not exceed `chars.len()`.
///
/// For `end_target > start_char` the result is always strictly greater than
/// `start_char`. A `min_chars` of zero is therefore treated as one: cutting
/// at `start_char` itself would produce an empty chunk and the caller's scan
/// would never advance.
fn pick_break(
    chars: &[(usize, char)],
    start_char: usize,
    end_target: usize,
    min_chars: usize,
) -> usize {
    // At least one char must be consumed, whatever `min_chars` says.
    let lower = start_char
        .saturating_add(min_chars.max(1))
        .min(end_target);

    // Whenever the range below is non-empty, `lower >= 1`, so `i - 1` cannot
    // underflow; `i - 2` still needs its own guard for `i == 1`.
    let candidates = || (lower..end_target).rev();
    let is_newline = |idx: usize| chars[idx].1 == '\n';

    candidates()
        .find(|&i| i >= 2 && is_newline(i - 1) && is_newline(i - 2))
        .or_else(|| candidates().find(|&i| is_newline(i - 1)))
        .or_else(|| candidates().find(|&i| chars[i - 1].1.is_whitespace()))
        .unwrap_or(end_target)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Glue the chunk texts back together in output order.
    fn reassemble(chunks: &[Chunk]) -> String {
        chunks.iter().map(|chunk| chunk.text.as_str()).collect()
    }

    /// Empty input produces no chunks at all.
    #[test]
    fn empty_input_yields_no_chunks() {
        let chunks = chunk_text("", ChunkOptions::default());
        assert!(chunks.is_empty());
    }

    /// Input shorter than the window comes back as one chunk spanning it.
    #[test]
    fn short_input_is_single_chunk() {
        let chunks = chunk_text("hello world", ChunkOptions::default());
        assert_eq!(chunks.len(), 1);

        let only = &chunks[0];
        assert_eq!(only.ordinal, 0);
        assert_eq!(only.text, "hello world");
        assert_eq!(only.byte_start, 0);
        assert_eq!(only.byte_end, 11);
    }

    /// Two runs over the same input and options give equal output.
    #[test]
    fn output_is_deterministic() {
        let input = "foo bar baz ".repeat(500);
        let first = chunk_text(&input, ChunkOptions::default());
        let second = chunk_text(&input, ChunkOptions::default());
        assert_eq!(first, second);
    }

    /// A blank line inside the window is chosen as the cut, and the two
    /// newlines stay with the first chunk.
    #[test]
    fn prefers_paragraph_break() {
        let opts = ChunkOptions {
            max_chars: 50,
            min_chars: 10,
        };
        let input = format!("{}\n\n{}", "a".repeat(20), "b".repeat(40));
        let chunks = chunk_text(&input, opts);
        assert!(chunks.len() >= 2);
        assert!(chunks[0].text.ends_with("\n\n"));
        assert!(chunks[1].text.starts_with('b'));
    }

    /// Without a blank line, a single newline inside the window is the cut.
    #[test]
    fn prefers_line_break_when_no_paragraph() {
        let opts = ChunkOptions {
            max_chars: 30,
            min_chars: 5,
        };
        let line = "x".repeat(20);
        let input = [line.as_str(), line.as_str()].join("\n");
        let chunks = chunk_text(&input, opts);
        assert!(chunks[0].text.ends_with('\n'));
    }

    /// Chunks tile the input: no gaps, no overlap, ordinals count up by one.
    #[test]
    fn chunks_cover_full_text() {
        let input: String = ('a'..='z').cycle().take(3000).collect();
        let chunks = chunk_text(&input, ChunkOptions::default());
        assert_eq!(reassemble(&chunks), input);

        for (prev, next) in chunks.iter().zip(chunks.iter().skip(1)) {
            assert_eq!(prev.byte_end, next.byte_start);
            assert_eq!(prev.ordinal + 1, next.ordinal);
        }
    }

    /// Multibyte input is never cut in the middle of a character.
    #[test]
    fn handles_multibyte_utf8() {
        let input = "世界".repeat(1000);
        let chunks = chunk_text(&input, ChunkOptions::default());
        assert_eq!(reassemble(&chunks), input);

        for chunk in &chunks {
            assert!(input.is_char_boundary(chunk.byte_start));
            assert!(input.is_char_boundary(chunk.byte_end));
        }
    }

    /// With no whitespace in the window the cut falls at `max_chars`.
    #[test]
    fn hard_cut_when_no_whitespace_in_window() {
        let opts = ChunkOptions {
            max_chars: 10,
            min_chars: 5,
        };
        let chunks = chunk_text("abcdefghijklmno", opts);
        let texts: Vec<&str> = chunks.iter().map(|chunk| chunk.text.as_str()).collect();
        assert_eq!(texts, vec!["abcdefghij", "klmno"]);
    }
}