use std::collections::BTreeMap;
/// String key/value metadata that can be attached to a [`TextChunk`].
///
/// Backed by a [`BTreeMap`], so entries are kept sorted by key.
pub type ChunkMetadata = BTreeMap<String, String>;
/// A borrowed slice of an input string together with its position in that input.
///
/// Offsets are recorded both in bytes and in `char`s (Unicode scalar values),
/// and both ranges are half-open (`start..end`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TextChunk<'a> {
    /// The chunk's text, borrowed from the original input.
    pub text: &'a str,
    /// Byte offset in the input where the chunk begins (inclusive).
    pub start_byte: usize,
    /// Byte offset in the input where the chunk ends (exclusive).
    pub end_byte: usize,
    /// Number of `char`s in the input that precede the chunk.
    pub start_char: usize,
    /// `start_char` plus the number of `char`s in `text`.
    pub end_char: usize,
    /// Length supplied by the caller when the chunk was built; stored as given.
    pub measured_length: usize,
    /// Optional key/value metadata; `None` for chunks fresh out of
    /// [`TextChunk::from_byte_range`].
    pub metadata: Option<ChunkMetadata>,
}

impl<'a> TextChunk<'a> {
    /// Builds a chunk covering `input[start_byte..end_byte]`.
    ///
    /// The character offsets are obtained by counting `char`s, which scans
    /// `input` from its beginning up to `end_byte`. `measured_length` is stored
    /// unchanged and `metadata` starts out as `None`.
    ///
    /// # Panics
    ///
    /// Panics if `start_byte > end_byte`, or if either offset is not a `char`
    /// boundary of `input` (offsets past the end of `input` are not boundaries).
    pub fn from_byte_range(
        input: &'a str,
        start_byte: usize,
        end_byte: usize,
        measured_length: usize,
    ) -> Self {
        assert!(start_byte <= end_byte, "chunk start must be before end");
        assert!(
            input.is_char_boundary(start_byte),
            "chunk start must be a char boundary"
        );
        assert!(
            input.is_char_boundary(end_byte),
            "chunk end must be a char boundary"
        );

        // Both offsets were validated above, so the slicing below cannot panic.
        let text = &input[start_byte..end_byte];
        let chars_before = input[..start_byte].chars().count();
        let chars_inside = text.chars().count();

        TextChunk {
            text,
            start_byte,
            end_byte,
            start_char: chars_before,
            end_char: chars_before + chars_inside,
            measured_length,
            metadata: None,
        }
    }

    /// Returns the chunk with `metadata` attached, replacing any previous metadata.
    pub fn with_metadata(self, metadata: ChunkMetadata) -> Self {
        Self {
            metadata: Some(metadata),
            ..self
        }
    }
}
/// Half-open byte range `start..end` into some input string.
///
/// The span does not hold the input itself; the input is passed to every
/// method that needs to look at the text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) struct TextSpan {
    /// Byte offset of the first byte covered by the span.
    pub start: usize,
    /// Byte offset one past the last byte covered by the span.
    pub end: usize,
}

impl TextSpan {
    /// Creates a span from raw byte offsets; the offsets are not validated here.
    pub(crate) fn new(start: usize, end: usize) -> Self {
        TextSpan { start, end }
    }

    /// Returns the slice of `input` covered by this span.
    ///
    /// # Panics
    ///
    /// Panics if the range is reversed, out of bounds, or does not fall on
    /// `char` boundaries of `input`.
    pub(crate) fn text<'a>(&self, input: &'a str) -> &'a str {
        let TextSpan { start, end } = *self;
        &input[start..end]
    }

    /// Applies `length_fn` to the text covered by this span and returns its result.
    pub(crate) fn len_with(&self, input: &str, length_fn: &dyn Fn(&str) -> usize) -> usize {
        let covered = self.text(input);
        length_fn(covered)
    }

    /// Shrinks the span so it no longer covers leading or trailing whitespace.
    ///
    /// Returns `None` when the covered text is empty or whitespace only.
    pub(crate) fn trim(self, input: &str) -> Option<Self> {
        let raw = self.text(input);
        let without_leading = raw.trim_start();
        let core = without_leading.trim_end();

        if core.is_empty() {
            None
        } else {
            // Bytes dropped from the front move `start` forward; the new end is
            // then simply the new start plus what is left after trimming.
            let start = self.start + (raw.len() - without_leading.len());
            Some(TextSpan {
                start,
                end: start + core.len(),
            })
        }
    }
}
/// A [`TextSpan`] paired with the length a measuring function reported for it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) struct MeasuredSpan {
    /// The byte range that was measured.
    pub span: TextSpan,
    /// Result of the measuring function for the text covered by `span`.
    pub measured_length: usize,
}

impl MeasuredSpan {
    /// Measures the text that `span` covers in `input` with `length_fn` and
    /// stores the result next to the span.
    pub(crate) fn new(input: &str, span: TextSpan, length_fn: &dyn Fn(&str) -> usize) -> Self {
        let measured_length = span.len_with(input, length_fn);
        MeasuredSpan {
            span,
            measured_length,
        }
    }
}
/// Iterator adapter that turns [`MeasuredSpan`]s into [`TextChunk`]s borrowed
/// from a single input string.
///
/// Exactly one chunk is produced per span, in the order the spans arrive.
pub struct TextChunkIter<'a, I> {
    /// The text every span is resolved against.
    input: &'a str,
    /// Source of measured spans.
    spans: I,
}

impl<'a, I> TextChunkIter<'a, I> {
    /// Wraps `spans` so that each one is resolved against `input` on demand.
    pub(crate) fn new(input: &'a str, spans: I) -> Self {
        Self { input, spans }
    }
}

impl<'a, I> Iterator for TextChunkIter<'a, I>
where
    I: Iterator<Item = MeasuredSpan>,
{
    type Item = TextChunk<'a>;

    /// Pulls the next span and builds its chunk via
    /// [`TextChunk::from_byte_range`], forwarding the span's byte offsets and
    /// measured length. Returns `None` once the span source is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        let measured = self.spans.next()?;
        Some(TextChunk::from_byte_range(
            self.input,
            measured.span.start,
            measured.span.end,
            measured.measured_length,
        ))
    }

    /// Chunks map one-to-one onto spans, so the inner iterator's hint is exact
    /// for this iterator too. Forwarding it lets `collect()` and `extend()`
    /// reserve capacity up front instead of growing incrementally.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.spans.size_hint()
    }
}
/// Lazily pairs every span from `spans` with the length `length_fn` reports
/// for the text that span covers in `input`.
///
/// Nothing is measured until the returned iterator is advanced; each call to
/// `next` measures exactly one span via [`MeasuredSpan::new`].
pub(crate) fn measured_spans<'a>(
    input: &'a str,
    spans: impl Iterator<Item = TextSpan> + 'a,
    length_fn: &'a dyn Fn(&str) -> usize,
) -> impl Iterator<Item = MeasuredSpan> + 'a {
    // `move` copies the two `'a` references into the closure, so the returned
    // iterator never borrows this function's own parameters. Without it the
    // closure is only accepted under edition-2021 disjoint closure captures.
    spans.map(move |span| MeasuredSpan::new(input, span, length_fn))
}
/// Eagerly converts `spans` into [`TextChunk`]s borrowed from `input`.
///
/// Each span is first measured with `length_fn` (through [`measured_spans`])
/// and then turned into a chunk by [`TextChunkIter`]; the results are
/// collected in span order.
pub(crate) fn chunks_from_spans<'a>(
    input: &'a str,
    spans: impl IntoIterator<Item = TextSpan>,
    length_fn: &dyn Fn(&str) -> usize,
) -> Vec<TextChunk<'a>> {
    let measured = measured_spans(input, spans.into_iter(), length_fn);
    let chunks = TextChunkIter::new(input, measured);
    chunks.collect()
}