use crate::memory::chunks::chunk_semantic;
/// Splits `content` into trimmed, non-empty chunks using [`chunk_semantic`].
///
/// `max_tokens` is clamped to at least 1 before being handed to the chunker.
/// Every chunk's text is trimmed, and chunks that become empty are discarded.
/// If no chunk survives but `content` still holds non-whitespace text, the
/// whole trimmed `content` is returned as a single chunk. The result is empty
/// only when `content` is empty or whitespace-only and the chunker yields no
/// usable text.
pub(super) fn chunk_document_content(content: &str, max_tokens: usize) -> Vec<String> {
    let budget = max_tokens.max(1);

    let mut pieces = Vec::new();
    for piece in chunk_semantic(content, budget) {
        let text = piece.content.trim();
        if !text.is_empty() {
            pieces.push(text.to_string());
        }
    }

    // Fallback: never lose real content just because the chunker produced
    // nothing usable.
    if pieces.is_empty() {
        let whole = content.trim();
        if !whole.is_empty() {
            pieces.push(whole.to_string());
        }
    }

    pieces
}
/// Collapses every run of Unicode whitespace in `text` into a single ASCII
/// space and removes leading and trailing whitespace.
///
/// Returns an empty string when `text` is empty or whitespace-only.
pub(super) fn collapse_whitespace(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so a non-empty buffer means a word precedes us.
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
pub(super) fn normalize_search_text(text: &str) -> String {
let collapsed = collapse_whitespace(text);
let mut normalized = String::with_capacity(collapsed.len());
for ch in collapsed.chars() {
if ch.is_alphanumeric() {
normalized.extend(ch.to_lowercase());
} else if ch.is_whitespace() || matches!(ch, '_' | '-' | '/' | '.') {
normalized.push(' ');
}
}
normalized.split_whitespace().collect::<Vec<_>>().join(" ")
}
pub(super) fn normalize_graph_predicate(text: &str) -> String {
let mut out = String::new();
let mut last_was_sep = false;
for ch in collapse_whitespace(text.trim()).chars() {
if ch.is_alphanumeric() {
out.extend(ch.to_uppercase());
last_was_sep = false;
} else if !last_was_sep {
out.push('_');
last_was_sep = true;
}
}
out.trim_matches('_').to_string()
}
// Unit tests for this module; compiled only in test builds and loaded from
// `text_tests.rs` via the `#[path]` attribute.
#[cfg(test)]
#[path = "text_tests.rs"]
mod tests;