julienne 0.1.0

Range-preserving Rust text chunkers for retrieval and embedding pipelines
Documentation
use regex::Regex;

use crate::chunk::TextSpan;

/// Where to attach the separator when keeping it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum KeepSeparator {
    /// The separator becomes the prefix of the piece that follows it
    /// (`"a b"` split on `" "` gives `"a"`, `" b"`).
    Start,
    /// The separator becomes the suffix of the piece that precedes it
    /// (`"a b"` split on `" "` gives `"a "`, `"b"`).
    End,
}

/// Splits `text` on every match of the regular expression `separator_pattern`.
///
/// - An empty `separator_pattern` yields one `String` per `char` of `text`.
/// - Otherwise the pattern is compiled and the split is delegated to
///   `split_text_with_compiled_regex`, where `keep_separator` decides whether
///   each separator is discarded (`None`) or kept at the start or end of a
///   piece.
///
/// # Panics
///
/// Panics with `"invalid regex pattern"` when `separator_pattern` is not a
/// valid regular expression.
pub fn split_text_with_regex(
    text: &str,
    separator_pattern: &str,
    keep_separator: Option<KeepSeparator>,
) -> Vec<String> {
    if !separator_pattern.is_empty() {
        let compiled = Regex::new(separator_pattern).expect("invalid regex pattern");
        return split_text_with_compiled_regex(text, &compiled, keep_separator);
    }

    // No pattern at all: fall back to character-level splitting.
    text.chars().map(String::from).collect()
}

/// Splits `text` on every match of the already-compiled regex `re`.
///
/// `keep_separator` selects what happens to the matched separators:
///
/// - `None`: separators are discarded.
/// - `Some(KeepSeparator::End)`: each separator stays attached to the end of
///   the piece that precedes it.
/// - `Some(KeepSeparator::Start)`: each separator stays attached to the start
///   of the piece that follows it.
///
/// Empty pieces are never returned. In particular an empty `text` produces an
/// empty vector in every mode, and a non-empty `text` without any match is
/// returned as a single piece.
pub(crate) fn split_text_with_compiled_regex(
    text: &str,
    re: &Regex,
    keep_separator: Option<KeepSeparator>,
) -> Vec<String> {
    let Some(position) = keep_separator else {
        return re
            .split(text)
            .filter(|piece| !piece.is_empty())
            .map(str::to_owned)
            .collect();
    };

    // Every piece ends at a "cut": the start of a separator when it is kept at
    // the start of the next piece, or its end when it is kept at the end of
    // the current piece. Walking the matches once covers both modes and needs
    // no intermediate list of matches.
    let mut pieces = Vec::new();
    let mut piece_start = 0usize;
    for mat in re.find_iter(text) {
        let cut = match position {
            KeepSeparator::Start => mat.start(),
            KeepSeparator::End => mat.end(),
        };
        if piece_start < cut {
            pieces.push(text[piece_start..cut].to_owned());
        }
        piece_start = cut;
    }

    // Whatever follows the last cut (or the whole text when nothing matched).
    if piece_start < text.len() {
        pieces.push(text[piece_start..].to_owned());
    }

    pieces
}

/// Computes the byte ranges that splitting `text` with `re` would produce.
///
/// Nothing is copied out of `text`: every returned `TextSpan` is built from
/// byte offsets into the original input, and empty ranges are skipped.
///
/// - `re == None`: one span per character.
/// - `keep_separator == None`: separators are excluded from the spans.
/// - `Some(KeepSeparator::End)`: each span ends with the separator after it.
/// - `Some(KeepSeparator::Start)`: each span begins with the separator
///   before it.
///
/// Empty `text` gives an empty vector; non-empty `text` without any match
/// gives a single span covering all of it.
pub(crate) fn split_spans_with_compiled_regex(
    text: &str,
    re: Option<&Regex>,
    keep_separator: Option<KeepSeparator>,
) -> Vec<TextSpan> {
    let total = text.len();
    if total == 0 {
        return Vec::new();
    }

    let Some(re) = re else {
        let mut per_char = Vec::new();
        for (offset, ch) in text.char_indices() {
            per_char.push(TextSpan::new(offset, offset + ch.len_utf8()));
        }
        return per_char;
    };

    let mut spans = Vec::new();
    let mut piece_start = 0usize;
    for mat in re.find_iter(text) {
        // `piece_end` closes the current span; `next_start` opens the next one.
        let (piece_end, next_start) = match keep_separator {
            None => (mat.start(), mat.end()),
            Some(KeepSeparator::End) => (mat.end(), mat.end()),
            Some(KeepSeparator::Start) => (mat.start(), mat.start()),
        };
        if piece_start < piece_end {
            spans.push(TextSpan::new(piece_start, piece_end));
        }
        piece_start = next_start;
    }

    // Trailing text after the last separator, or the whole input when the
    // regex never matched.
    if piece_start < total {
        spans.push(TextSpan::new(piece_start, total));
    }

    spans
}

/// Iterator that yields the `TextSpan`s of a regex split one at a time.
pub(crate) struct RegexSpanIter<'a> {
    /// The input being split; all yielded spans are byte ranges into it.
    text: &'a str,
    /// Separator regex; `None` makes the iterator yield one span per character.
    regex: Option<Regex>,
    /// Whether matched separators are dropped (`None`) or kept at the start
    /// or end of a span.
    keep_separator: Option<KeepSeparator>,
    /// Byte offset into `text` from which the next call continues.
    cursor: usize,
    /// A span queued to be returned by the next call before anything else.
    pending: Option<TextSpan>,
    /// Set once no further spans can be produced.
    done: bool,
}

impl<'a> RegexSpanIter<'a> {
    pub(crate) fn from_regex(
        text: &'a str,
        regex: Option<Regex>,
        keep_separator: Option<KeepSeparator>,
    ) -> Self {
        Self {
            text,
            regex,
            keep_separator,
            cursor: 0,
            pending: None,
            done: text.is_empty(),
        }
    }
}

impl Iterator for RegexSpanIter<'_> {
    type Item = TextSpan;

    fn next(&mut self) -> Option<Self::Item> {
        if let Some(span) = self.pending.take() {
            return Some(span);
        }
        if self.done {
            return None;
        }

        let Some(regex) = &self.regex else {
            if let Some((start, ch)) = self.text[self.cursor..].char_indices().next() {
                let start = self.cursor + start;
                self.cursor = start + ch.len_utf8();
                if self.cursor >= self.text.len() {
                    self.done = true;
                }
                return Some(TextSpan::new(start, self.cursor));
            }
            self.done = true;
            return None;
        };

        let mat = regex.find_at(self.text, self.cursor);
        match (self.keep_separator, mat) {
            (None, Some(mat)) => {
                if self.cursor < mat.start() {
                    let span = TextSpan::new(self.cursor, mat.start());
                    self.cursor = mat.end();
                    Some(span)
                } else {
                    self.cursor = mat.end();
                    self.next()
                }
            }
            (None, None) => {
                self.done = true;
                if self.cursor < self.text.len() {
                    Some(TextSpan::new(self.cursor, self.text.len()))
                } else {
                    None
                }
            }
            (Some(KeepSeparator::End), Some(mat)) => {
                if self.cursor < mat.end() {
                    let span = TextSpan::new(self.cursor, mat.end());
                    self.cursor = mat.end();
                    Some(span)
                } else {
                    self.cursor = mat.end();
                    self.next()
                }
            }
            (Some(KeepSeparator::End), None) => {
                self.done = true;
                if self.cursor < self.text.len() {
                    Some(TextSpan::new(self.cursor, self.text.len()))
                } else {
                    None
                }
            }
            (Some(KeepSeparator::Start), Some(mat)) => {
                if self.cursor < mat.start() {
                    let next = regex.find_at(self.text, mat.end());
                    let span = TextSpan::new(self.cursor, mat.start());
                    let end = next.map(|next| next.start()).unwrap_or(self.text.len());
                    self.pending = (mat.start() < end).then(|| TextSpan::new(mat.start(), end));
                    self.cursor = end;
                    if self.cursor >= self.text.len() {
                        self.done = true;
                    }
                    Some(span)
                } else {
                    let next = regex.find_at(self.text, mat.end());
                    let end = next.map(|next| next.start()).unwrap_or(self.text.len());
                    self.cursor = end;
                    if self.cursor >= self.text.len() {
                        self.done = true;
                    }
                    (mat.start() < end).then(|| TextSpan::new(mat.start(), end))
                }
            }
            (Some(KeepSeparator::Start), None) => {
                self.done = true;
                if self.cursor < self.text.len() {
                    Some(TextSpan::new(self.cursor, self.text.len()))
                } else {
                    None
                }
            }
        }
    }
}

// Unit tests for `split_text_with_regex`, covering each separator mode and
// the main edge cases.
#[cfg(test)]
mod tests {
    use super::*;

    // Separators are dropped when `keep_separator` is `None`.
    #[test]
    fn test_split_no_keep() {
        let result = split_text_with_regex("hello world foo", " ", None);
        assert_eq!(result, vec!["hello", "world", "foo"]);
    }

    // `KeepSeparator::Start` prefixes each following piece with the separator.
    #[test]
    fn test_split_keep_start() {
        let result = split_text_with_regex("hello world foo", " ", Some(KeepSeparator::Start));
        assert_eq!(result, vec!["hello", " world", " foo"]);
    }

    // `KeepSeparator::End` suffixes each preceding piece with the separator.
    #[test]
    fn test_split_keep_end() {
        let result = split_text_with_regex("hello world foo", " ", Some(KeepSeparator::End));
        assert_eq!(result, vec!["hello ", "world ", "foo"]);
    }

    // An empty pattern splits into individual characters.
    #[test]
    fn test_split_empty_separator() {
        let result = split_text_with_regex("abc", "", None);
        assert_eq!(result, vec!["a", "b", "c"]);
    }

    // Text without any separator comes back as a single piece.
    #[test]
    fn test_split_no_match() {
        let result = split_text_with_regex("hello", "X", None);
        assert_eq!(result, vec!["hello"]);
    }

    // Adjacent separators do not produce empty pieces.
    #[test]
    fn test_split_filters_empty() {
        let result = split_text_with_regex("a  b", " ", None);
        assert_eq!(result, vec!["a", "b"]);
    }

    // The separator is a real regex, not a literal string.
    #[test]
    fn test_split_regex_pattern() {
        let result = split_text_with_regex("foo123bar456baz", r"\d+", None);
        assert_eq!(result, vec!["foo", "bar", "baz"]);
    }
}