use super::types::{conservative_token_estimate, truncate_to_conservative_tokens};
const OVERLAP_PERCENT: u32 = 12;
const OVERLAP_MAX_PERCENT: u32 = 40;
pub(crate) fn split_by_token_budget(text: &str, max_tokens: u32) -> Vec<String> {
let budget = max_tokens.max(1);
if text.is_empty() {
return vec![String::new()];
}
if conservative_token_estimate(text) <= budget {
return vec![text.to_string()];
}
let overlap_budget = (budget * OVERLAP_PERCENT / 100).max(1);
let overlap_cap = (budget * OVERLAP_MAX_PERCENT / 100).max(overlap_budget);
let seg_budget = budget.saturating_sub(overlap_cap).max(1);
let mut segments: Vec<&str> = Vec::new();
push_atomic_segments(text, seg_budget, &mut segments);
if segments.len() <= 1 {
return vec![text.to_string()];
}
let mut chunks: Vec<String> = Vec::new();
let mut cur: Vec<&str> = Vec::new();
let mut cur_tokens = 0u32;
for seg in &segments {
let seg_tokens = conservative_token_estimate(seg);
if !cur.is_empty() && cur_tokens.saturating_add(seg_tokens) > budget {
chunks.push(join_segments(&cur));
let overlap = tail_overlap(&cur, overlap_budget, overlap_cap);
cur_tokens = overlap.iter().map(|s| conservative_token_estimate(s)).sum();
cur = overlap;
}
cur.push(seg);
cur_tokens = cur_tokens.saturating_add(seg_tokens);
}
if !cur.is_empty() {
chunks.push(join_segments(&cur));
}
if chunks.is_empty() {
chunks.push(String::new());
}
chunks
}
fn push_atomic_segments<'a>(text: &'a str, budget: u32, out: &mut Vec<&'a str>) {
if text.is_empty() {
return;
}
if conservative_token_estimate(text) <= budget {
out.push(text);
return;
}
if split_on_separator(text, "\n\n", budget, out)
|| split_on_sentences(text, budget, out)
|| split_on_whitespace(text, budget, out)
{
return;
}
hard_split(text, budget, out);
}
fn split_on_separator<'a>(text: &'a str, sep: &str, budget: u32, out: &mut Vec<&'a str>) -> bool {
if sep.is_empty() || !text.contains(sep) {
return false;
}
let sep_len = sep.len();
let mut pieces: Vec<&str> = Vec::new();
let mut start = 0usize;
let mut search = 0usize;
while let Some(rel) = text[search..].find(sep) {
let end = search + rel + sep_len; pieces.push(&text[start..end]);
start = end;
search = end;
}
if start < text.len() {
pieces.push(&text[start..]);
}
if pieces.len() <= 1 {
return false;
}
for p in pieces {
push_atomic_segments(p, budget, out);
}
true
}
fn split_on_sentences<'a>(text: &'a str, budget: u32, out: &mut Vec<&'a str>) -> bool {
let bytes = text.as_bytes();
let mut pieces: Vec<&str> = Vec::new();
let mut start = 0usize;
for i in 0..bytes.len() {
let c = bytes[i];
let boundary_end = if c == b'\n' {
Some(i + 1)
} else if (c == b'.' || c == b'!' || c == b'?')
&& i + 1 < bytes.len()
&& bytes[i + 1] == b' '
{
Some(i + 1)
} else {
None
};
if let Some(end) = boundary_end {
pieces.push(&text[start..end]);
start = end;
}
}
if start < text.len() {
pieces.push(&text[start..]);
}
if pieces.len() <= 1 {
return false;
}
for p in pieces {
push_atomic_segments(p, budget, out);
}
true
}
fn split_on_whitespace<'a>(text: &'a str, budget: u32, out: &mut Vec<&'a str>) -> bool {
let bytes = text.as_bytes();
let mut pieces: Vec<&str> = Vec::new();
let mut start = 0usize;
for i in 0..bytes.len() {
if bytes[i] == b' ' {
pieces.push(&text[start..=i]);
start = i + 1;
}
}
if start < text.len() {
pieces.push(&text[start..]);
}
if pieces.len() <= 1 {
return false;
}
for p in pieces {
push_atomic_segments(p, budget, out);
}
true
}
fn hard_split<'a>(mut text: &'a str, budget: u32, out: &mut Vec<&'a str>) {
while !text.is_empty() {
let mut piece = truncate_to_conservative_tokens(text, budget);
if piece.is_empty() {
let end = text
.char_indices()
.nth(1)
.map(|(i, _)| i)
.unwrap_or(text.len());
piece = &text[..end];
}
let plen = piece.len();
out.push(&text[..plen]);
text = &text[plen..];
}
}
fn join_segments(segs: &[&str]) -> String {
segs.concat()
}
fn tail_overlap<'a>(cur: &[&'a str], overlap_budget: u32, overlap_cap: u32) -> Vec<&'a str> {
if cur.len() <= 1 {
return Vec::new();
}
let mut acc = 0u32;
let mut take = 0usize;
for seg in cur.iter().rev() {
let t = conservative_token_estimate(seg);
if acc.saturating_add(t) > overlap_cap {
break;
}
acc = acc.saturating_add(t);
take += 1;
if acc >= overlap_budget {
break;
}
}
if take >= cur.len() {
take = cur.len() - 1; }
cur[cur.len() - take..].to_vec()
}