pub(super) fn recurse_with_metric(
text: &str,
separators: &[String],
chunk_size: usize,
measure: &dyn Fn(&str) -> usize,
fallback: &dyn Fn(&str, usize) -> Vec<String>,
) -> Vec<String> {
if text.is_empty() {
return Vec::new();
}
if measure(text) <= chunk_size {
return vec![text.to_owned()];
}
let (sep, rest) = separators
.iter()
.enumerate()
.find(|(_, sep)| sep.is_empty() || text.contains(sep.as_str()))
.map_or((None, separators), |(idx, sep)| {
let rest = separators.get(idx + 1..).unwrap_or(&[]);
(Some(sep.as_str()), rest)
});
let pieces: Vec<String> = match sep {
Some("") | None => fallback(text, chunk_size),
Some(sep) => split_keeping_separator(text, sep),
};
let mut out = Vec::with_capacity(pieces.len());
for piece in pieces {
if measure(&piece) <= chunk_size {
if !piece.is_empty() {
out.push(piece);
}
} else {
out.extend(recurse_with_metric(
&piece, rest, chunk_size, measure, fallback,
));
}
}
out
}
pub(super) fn merge_with_overlap_metric(
segments: Vec<String>,
chunk_size: usize,
chunk_overlap: usize,
measure: &dyn Fn(&str) -> usize,
take_tail: &dyn Fn(&str, usize) -> String,
) -> Vec<String> {
if segments.is_empty() {
return Vec::new();
}
let mut chunks: Vec<String> = Vec::new();
let mut current = String::new();
let mut current_size: usize = 0;
for segment in segments {
let seg_size = measure(&segment);
if current_size + seg_size > chunk_size && current_size > 0 {
chunks.push(std::mem::take(&mut current));
if chunk_overlap > 0
&& let Some(last) = chunks.last()
{
let tail = take_tail(last, chunk_overlap);
current.push_str(&tail);
current_size = measure(¤t);
} else {
current_size = 0;
}
}
current.push_str(&segment);
current_size += seg_size;
}
if !current.is_empty() {
chunks.push(current);
}
chunks
}
/// Splits `text` at every non-overlapping occurrence of `separator`, leaving
/// each occurrence attached to the end of the piece that precedes it.
///
/// Concatenating the returned pieces reproduces `text`. Text after the last
/// occurrence becomes a final piece only when it is non-empty, so an input that
/// ends with `separator` does not produce a trailing empty string, and an empty
/// `text` produces an empty vector.
pub(super) fn split_keeping_separator(text: &str, separator: &str) -> Vec<String> {
    text.split_inclusive(separator)
        .map(str::to_owned)
        .collect()
}