use regex::Regex;
/// Upper bound on the size of one chunk.
///
/// `split_text` compares this against `String::len()` (a byte count), while
/// `split_long_word` uses it as a number of `char`s per fragment.
/// NOTE(review): the two units differ for non-ASCII text — confirm which one is intended.
pub const MAX_MESSAGE_LEN: usize = 4000;
/// Size threshold (compared against `String::len()`, i.e. bytes) used by `split_text`:
/// when the next piece does not fit and the pending chunk is at least this long, the
/// pending chunk is emitted as-is; a shorter pending chunk causes the piece to be
/// split more finely instead.
pub const MIN_CHUNK_SIZE: usize = 3000;
/// Breaks `text` into line-based pieces.
///
/// The input is walked with [`str::lines`]:
///
/// * every non-empty line becomes one piece with a trailing `"\n"` appended;
/// * an empty line becomes a lone `"\n"` separator piece, and consecutive empty
///   lines collapse into a single separator.
///
/// If no piece was produced at all (the input has no lines), the input itself is
/// returned as the only element, so the result is never empty.
pub fn split_paragraphs(text: &str) -> Vec<String> {
    let mut pieces: Vec<String> = Vec::new();

    for line in text.lines() {
        if line.is_empty() {
            // Only emit a separator when the previous piece is not already one.
            let after_separator = pieces.last().map(String::as_str) == Some("\n");
            if !after_separator {
                pieces.push(String::from("\n"));
            }
            continue;
        }
        pieces.push([line, "\n"].concat());
    }

    if pieces.is_empty() {
        pieces.push(text.to_owned());
    }
    pieces
}
/// Splits `text` into sentence-like pieces without dropping any characters.
///
/// A piece is a run of text up to and including its terminating `.`, `!` or `?`
/// characters (one or more), plus any whitespace that follows them. Text after the
/// last terminator is returned as a final piece, so concatenating the returned
/// pieces reproduces `text` exactly.
///
/// For empty input a single empty string is returned, so the result is never empty.
///
/// # Panics
///
/// Only if the hard-coded pattern failed to compile, which would be a programming
/// error rather than a runtime condition.
pub fn split_sentences(text: &str) -> Vec<String> {
    // First alternative: optional body, terminator run, trailing whitespace.
    // Second alternative: the unterminated tail. Every character belongs to one of
    // the two classes, so successive matches are contiguous and cover the input.
    // The previous pattern required a non-empty body *and* a terminator, which
    // silently discarded an unterminated tail and repeated/leading terminators.
    //
    // NOTE(review): the pattern is compiled on every call; hoist it into a static
    // if this shows up in profiles.
    let sentence_re = Regex::new(r"[^.!?]*[.!?]+\s*|[^.!?]+")
        .expect("sentence pattern is a valid regex");

    let mut sentences: Vec<String> = sentence_re
        .find_iter(text)
        .map(|m| m.as_str().to_owned())
        .collect();

    if sentences.is_empty() {
        sentences.push(text.to_owned());
    }
    sentences
}
/// Returns the whitespace-separated words of `text` as owned strings.
///
/// Splitting is done with [`str::split_whitespace`], so runs of whitespace count as
/// one separator and leading/trailing whitespace produces no empty entries.
pub fn split_words(text: &str) -> Vec<String> {
    let tokens = text.split_whitespace();
    tokens.map(String::from).collect()
}
/// Cuts `word` into consecutive fragments of at most `MAX_MESSAGE_LEN` `char`s each.
///
/// Fragments are produced in order and together contain every character of `word`;
/// only the last fragment can be shorter than the limit. An empty `word` yields an
/// empty vector.
///
/// The limit is counted in `char`s, not bytes, so a fragment made of multi-byte
/// characters can have a `len()` larger than `MAX_MESSAGE_LEN`.
pub fn split_long_word(word: &str) -> Vec<String> {
    // Collect the scalar values first so fragments can be cut by character count
    // without ever slicing inside a multi-byte sequence.
    let scalars: Vec<char> = word.chars().collect();
    scalars
        .chunks(MAX_MESSAGE_LEN)
        .map(|fragment| fragment.iter().collect::<String>())
        .collect()
}
pub fn split_text(text: &str, _format: Option<&str>) -> Vec<String> {
let mut result = Vec::new();
let paragraphs = split_paragraphs(text);
let mut current = String::new();
for para in paragraphs {
if current.is_empty() {
current = para;
continue;
}
let candidate = current.clone() + ¶
if candidate.len() <= MAX_MESSAGE_LEN {
current = candidate;
continue;
}
if current.len() >= MIN_CHUNK_SIZE {
result.push(std::mem::take(&mut current));
current = para;
} else {
let sentences = split_sentences(¶);
for sent in sentences {
let cand2 = current.clone() + &sent;
if cand2.len() <= MAX_MESSAGE_LEN {
current = cand2;
continue;
}
if current.len() >= MIN_CHUNK_SIZE {
result.push(std::mem::take(&mut current));
current = sent;
} else {
let words = split_words(&sent);
for w in words {
let cand3 = if current.is_empty() {
w.clone()
} else {
current.clone() + " " + &w
};
if cand3.len() <= MAX_MESSAGE_LEN {
current = cand3;
continue;
}
if current.len() >= MIN_CHUNK_SIZE {
result.push(std::mem::take(&mut current));
current = w.clone();
} else {
let parts = split_long_word(&w);
for part in parts {
let cand4 = if current.is_empty() {
part.clone()
} else {
current.clone() + " " + &part
};
if cand4.len() <= MAX_MESSAGE_LEN {
current = cand4;
} else {
if !current.is_empty() {
result.push(std::mem::take(&mut current));
}
current = part;
}
}
}
}
}
}
}
}
if !current.is_empty() {
result.push(current);
}
result
}