// julienne 0.1.0
//
// Range-preserving Rust text chunkers for retrieval and embedding pipelines.
// Documentation
use std::collections::BTreeMap;

/// Key/value annotations that may optionally accompany a chunk.
pub type ChunkMetadata = BTreeMap<String, String>;

/// A borrowed slice of a source string together with its position in that string.
///
/// The chunk never copies text: `text` points directly into the input it was
/// built from, so the following always holds:
///
/// ```text
/// &input[chunk.start_byte..chunk.end_byte] == chunk.text
/// ```
///
/// `start_char` / `end_char` describe the same range, but counted in `char`s
/// from the beginning of that input rather than in bytes.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TextChunk<'a> {
    /// The chunk's text, borrowed from the source input.
    pub text: &'a str,
    /// Byte offset in the source input where `text` begins.
    pub start_byte: usize,
    /// Byte offset in the source input just past the end of `text`.
    pub end_byte: usize,
    /// Number of `char`s in the source input that precede `text`.
    pub start_char: usize,
    /// `start_char` plus the number of `char`s in `text`.
    pub end_char: usize,
    /// Caller-supplied length of the chunk; stored as given.
    pub measured_length: usize,
    /// Optional metadata; `None` until set with [`TextChunk::with_metadata`].
    pub metadata: Option<ChunkMetadata>,
}

impl<'a> TextChunk<'a> {
    /// Builds a chunk covering `input[start_byte..end_byte]`.
    ///
    /// Character offsets are derived by counting the `char`s before
    /// `start_byte` and inside the range. `measured_length` is stored
    /// unchanged and `metadata` starts out as `None`.
    ///
    /// # Panics
    ///
    /// Panics if `start_byte > end_byte`, or if either offset is not a char
    /// boundary of `input` (which includes offsets past the end of `input`).
    pub fn from_byte_range(
        input: &'a str,
        start_byte: usize,
        end_byte: usize,
        measured_length: usize,
    ) -> Self {
        assert!(start_byte <= end_byte, "chunk start must be before end");
        assert!(
            input.is_char_boundary(start_byte),
            "chunk start must be a char boundary"
        );
        assert!(
            input.is_char_boundary(end_byte),
            "chunk end must be a char boundary"
        );

        // Both offsets were validated above, so neither the split nor the
        // slice below can panic.
        let (before, from_start) = input.split_at(start_byte);
        let text = &from_start[..end_byte - start_byte];

        let start_char = before.chars().count();
        let end_char = start_char + text.chars().count();

        Self {
            text,
            start_byte,
            end_byte,
            start_char,
            end_char,
            measured_length,
            metadata: None,
        }
    }

    /// Returns the chunk with its metadata replaced by `metadata`.
    pub fn with_metadata(self, metadata: ChunkMetadata) -> Self {
        Self {
            metadata: Some(metadata),
            ..self
        }
    }
}

/// A half-open byte range `start..end` into some source string.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) struct TextSpan {
    pub start: usize,
    pub end: usize,
}

impl TextSpan {
    /// Creates a span from its two byte offsets; no validation is performed.
    pub(crate) fn new(start: usize, end: usize) -> Self {
        TextSpan { start, end }
    }

    /// Returns the slice of `input` this span covers.
    ///
    /// Panics (via string indexing) if the range is not valid for `input`.
    pub(crate) fn text<'a>(&self, input: &'a str) -> &'a str {
        let Self { start, end } = *self;
        &input[start..end]
    }

    /// Applies `length_fn` to the span's text and returns the result.
    pub(crate) fn len_with(&self, input: &str, length_fn: &dyn Fn(&str) -> usize) -> usize {
        let covered = self.text(input);
        length_fn(covered)
    }

    /// Shrinks the span so that it excludes leading and trailing whitespace.
    ///
    /// Returns `None` when the span's text is empty or whitespace-only.
    pub(crate) fn trim(self, input: &str) -> Option<Self> {
        let raw = self.text(input);
        let without_leading = raw.trim_start();
        if without_leading.is_empty() {
            return None;
        }

        // Bytes dropped at the front shift the start; the end then follows
        // from the length of what remains once the tail is trimmed as well.
        let start = self.start + (raw.len() - without_leading.len());
        let end = start + without_leading.trim_end().len();
        Some(Self { start, end })
    }
}

/// A [`TextSpan`] paired with the length that was measured for its text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) struct MeasuredSpan {
    pub span: TextSpan,
    pub measured_length: usize,
}

impl MeasuredSpan {
    /// Measures `span`'s text within `input` using `length_fn` and stores the
    /// result alongside the span.
    pub(crate) fn new(input: &str, span: TextSpan, length_fn: &dyn Fn(&str) -> usize) -> Self {
        let measured_length = span.len_with(input, length_fn);
        Self {
            span,
            measured_length,
        }
    }
}

/// Iterator adapter that converts [`MeasuredSpan`]s into [`TextChunk`]s
/// borrowed from `input`.
///
/// `TextChunk::from_byte_range` counts every `char` that precedes the chunk,
/// so calling it on the full input for each span rescans the same prefix over
/// and over. To avoid that, the iterator remembers where the previously
/// yielded chunk started (in bytes and in chars) and only counts forward from
/// there. A span that starts before the previous one simply falls back to
/// counting from the beginning of the input.
pub struct TextChunkIter<'a, I> {
    input: &'a str,
    spans: I,
    // Start of the most recently yielded chunk: byte offset and the matching
    // char offset. Always a validated char boundary of `input` (or 0).
    anchor_byte: usize,
    anchor_char: usize,
}

impl<'a, I> TextChunkIter<'a, I> {
    /// Wraps `spans`, whose byte ranges are interpreted relative to `input`.
    pub(crate) fn new(input: &'a str, spans: I) -> Self {
        Self {
            input,
            spans,
            anchor_byte: 0,
            anchor_char: 0,
        }
    }
}

impl<'a, I> Iterator for TextChunkIter<'a, I>
where
    I: Iterator<Item = MeasuredSpan>,
{
    type Item = TextChunk<'a>;

    /// Yields the chunk for the next span, or `None` once the spans run out.
    ///
    /// # Panics
    ///
    /// Panics under the same conditions, and with the same messages, as
    /// `TextChunk::from_byte_range` when a span is reversed or does not lie
    /// on char boundaries of the input.
    fn next(&mut self) -> Option<Self::Item> {
        let measured = self.spans.next()?;
        let (start, end) = (measured.span.start, measured.span.end);

        // Only resume from the anchor when the span is well-formed and does
        // not start before it; otherwise use base 0, which reproduces a plain
        // `from_byte_range` call on the whole input (including its panics).
        let can_resume = self.anchor_byte <= start && start <= end;
        let (base_byte, base_char) = if can_resume {
            (self.anchor_byte, self.anchor_char)
        } else {
            (0, 0)
        };

        // The base is either 0 or the start of a chunk that already passed
        // validation, so slicing here cannot panic. Char-boundary checks on
        // the suffix agree with checks on the full input for offsets >= base.
        let input: &'a str = self.input;
        let mut chunk = TextChunk::from_byte_range(
            &input[base_byte..],
            start - base_byte,
            end - base_byte,
            measured.measured_length,
        );

        // Translate suffix-relative offsets back to offsets into `input`.
        chunk.start_byte += base_byte;
        chunk.end_byte += base_byte;
        chunk.start_char += base_char;
        chunk.end_char += base_char;

        self.anchor_byte = chunk.start_byte;
        self.anchor_char = chunk.start_char;
        Some(chunk)
    }
}

/// Lazily pairs every span from `spans` with the length `length_fn` reports
/// for its text in `input`.
pub(crate) fn measured_spans<'a>(
    input: &'a str,
    spans: impl Iterator<Item = TextSpan> + 'a,
    length_fn: &'a dyn Fn(&str) -> usize,
) -> impl Iterator<Item = MeasuredSpan> + 'a {
    // Both captures are plain `'a` references, so moving them into the
    // closure just copies the references.
    let measure = move |span: TextSpan| MeasuredSpan::new(input, span, length_fn);
    spans.map(measure)
}

/// Measures each span with `length_fn` and collects the resulting chunks of
/// `input` into a vector, in the order the spans were supplied.
pub(crate) fn chunks_from_spans<'a>(
    input: &'a str,
    spans: impl IntoIterator<Item = TextSpan>,
    length_fn: &dyn Fn(&str) -> usize,
) -> Vec<TextChunk<'a>> {
    let measured = measured_spans(input, spans.into_iter(), length_fn);
    let chunks = TextChunkIter::new(input, measured);
    chunks.collect()
}