use regex::Regex;
use unicode_segmentation::UnicodeSegmentation;
/// Normalises free-form text into a single line of space-separated tokens.
///
/// * Control characters that are not whitespace (NUL, BEL, ESC, ...) are
///   removed outright.
/// * Every run of whitespace — including newlines, tabs, carriage returns and
///   form feeds — collapses to one ASCII space.
/// * Leading and trailing whitespace is dropped.
///
/// Whitespace is decided by [`char::is_whitespace`] (the Unicode `White_Space`
/// property), so non-breaking spaces and similar characters are collapsed too.
///
/// Returns an empty string when `text` holds nothing but whitespace and
/// control characters.
pub fn clean_text(text: &str) -> String {
    // Whitespace control characters (`\r`, form feed, vertical tab, NEL) are
    // kept at this stage so they still act as word separators below; deleting
    // them would fuse the neighbouring words together ("a\rb" -> "ab").
    let visible: String = text
        .chars()
        .filter(|c| c.is_whitespace() || !c.is_control())
        .collect();

    // `split_whitespace` both trims the ends and swallows whole whitespace
    // runs, so joining with a single space is the complete normalisation and
    // no regex has to be compiled per call.
    visible.split_whitespace().collect::<Vec<_>>().join(" ")
}
/// Splits `text` into chunks of at most `max_size` characters, preferring to
/// break at the last ASCII space inside each window.
///
/// Sizes are measured in `char`s (Unicode scalar values), not bytes.
///
/// * If the whole text fits in `max_size` characters it is returned as a
///   single chunk.
/// * `max_size == 0` cannot be honoured, so the text is likewise returned as a
///   single unsplit chunk instead of looping forever.
/// * A non-final chunk ends just before the last space of its window. When the
///   window has no space, or its only space is the very first character, the
///   full window is emitted so that every iteration makes progress.
/// * The next window starts `overlap` characters before the end of the chunk
///   just emitted. If that chunk is not longer than `overlap`, no overlap is
///   applied for that step.
///
/// The space a chunk was split at is not consumed: with `overlap == 0` the
/// following chunk begins with that space.
pub fn chunk_text(text: &str, max_size: usize, overlap: usize) -> Vec<String> {
    let chars: Vec<char> = text.chars().collect();
    if max_size == 0 || chars.len() <= max_size {
        return vec![text.to_string()];
    }

    let mut chunks = Vec::new();
    let mut start = 0;

    while start < chars.len() {
        // `max_size < chars.len()` here, so this addition cannot overflow in practice.
        let end = (start + max_size).min(chars.len());
        let window = &chars[start..end];

        // The final window is emitted verbatim; there is nothing left to split for.
        if end == chars.len() {
            chunks.push(window.iter().collect());
            break;
        }

        // Break at the last space, unless that would yield an empty chunk
        // (space at index 0), which would stall `start` and never terminate.
        let chunk_len = match window.iter().rposition(|&c| c == ' ') {
            Some(split_at) if split_at > 0 => split_at,
            _ => window.len(),
        };
        chunks.push(window[..chunk_len].iter().collect());

        // `chunk_len >= 1`, so both branches strictly advance `start`.
        start += if chunk_len > overlap {
            chunk_len - overlap
        } else {
            chunk_len
        };
    }

    chunks
}
pub fn count_words(text: &str) -> usize {
text.unicode_words().count()
}
/// Gives a rough token estimate for `text`, derived from its word count.
///
/// The word count from [`count_words`] is scaled by a fixed factor of 1.33
/// tokens per word and the fractional part is discarded.
pub fn estimate_tokens(text: &str) -> usize {
    // Heuristic scaling factor; not tied to any particular tokenizer here.
    const TOKENS_PER_WORD: f64 = 1.33;

    let words = count_words(text) as f64;
    // The float-to-integer cast truncates toward zero.
    (words * TOKENS_PER_WORD) as usize
}
/// Guesses whether `text` is French or English from a handful of very common
/// function words.
///
/// The text is lower-cased and split into words on every non-alphanumeric
/// character. Matching is done on whole words, so `"table"` or `"simple"` no
/// longer count as the French article `"le"`, and a marker is recognised even
/// at the end of the text or directly before punctuation.
///
/// French markers take precedence: if any French marker word is present the
/// result is `Some("fr")`, otherwise `Some("en")` if any English marker is
/// present, otherwise `None`.
///
/// This is a coarse heuristic, not a real language identifier.
pub fn detect_language(text: &str) -> Option<String> {
    const FRENCH_MARKERS: [&str; 6] = ["le", "la", "les", "des", "pour", "avec"];
    const ENGLISH_MARKERS: [&str; 6] = ["the", "and", "for", "with", "from", "this"];

    let lowered = text.to_lowercase();
    let mut saw_english = false;

    for word in lowered.split(|c: char| !c.is_alphanumeric()) {
        // French wins as soon as one marker shows up, whatever else was seen.
        if FRENCH_MARKERS.contains(&word) {
            return Some("fr".to_string());
        }
        saw_english |= ENGLISH_MARKERS.contains(&word);
    }

    saw_english.then(|| "en".to_string())
}
/// Converts every line ending in `text` to a single `\n`.
///
/// A `\r\n` pair becomes one `\n`, and a `\r` that is not followed by `\n`
/// also becomes `\n`. All other characters are copied unchanged.
pub fn normalize_line_breaks(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    let mut remaining = text.chars().peekable();

    while let Some(current) = remaining.next() {
        if current != '\r' {
            normalized.push(current);
            continue;
        }
        // Swallow the `\n` of a `\r\n` pair so the pair yields one newline.
        if remaining.peek() == Some(&'\n') {
            remaining.next();
        }
        normalized.push('\n');
    }

    normalized
}
pub fn remove_empty_lines(text: &str) -> String {
let empty_lines_regex = Regex::new(r"\n\s*\n").unwrap();
empty_lines_regex.replace_all(text, "\n\n").to_string()
}
/// Computes summary statistics for `text`.
///
/// * `character_count` — number of `char`s (Unicode scalar values), not
///   bytes, so accented or other multi-byte characters count once each.
/// * `word_count` — result of [`count_words`].
/// * `line_count` — number of lines yielded by [`str::lines`].
/// * `estimated_tokens` — result of [`estimate_tokens`].
/// * `detected_language` — result of [`detect_language`].
pub fn extract_text_metadata(text: &str) -> TextMetadata {
    TextMetadata {
        // `text.len()` would report UTF-8 bytes and over-count non-ASCII text.
        character_count: text.chars().count(),
        word_count: count_words(text),
        line_count: text.lines().count(),
        estimated_tokens: estimate_tokens(text),
        detected_language: detect_language(text),
    }
}
/// Summary statistics describing a piece of text, as produced by
/// `extract_text_metadata`.
///
/// Two values compare equal when all of their fields are equal.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TextMetadata {
    /// Size of the text; see `extract_text_metadata` for how it is measured.
    pub character_count: usize,
    /// Number of words in the text.
    pub word_count: usize,
    /// Number of lines in the text.
    pub line_count: usize,
    /// Rough token estimate derived from the word count.
    pub estimated_tokens: usize,
    /// Language code guessed for the text (e.g. `"fr"`, `"en"`), or `None`
    /// when no guess could be made.
    pub detected_language: Option<String>,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Mixed whitespace runs collapse to one space and the ends are trimmed.
    #[test]
    fn test_clean_text() {
        let dirty_text = " Hello\t\tWorld \n\n ";
        let cleaned = clean_text(dirty_text);
        assert_eq!(cleaned, "Hello World");
    }

    /// Every chunk — not just the first — must respect the size limit, and
    /// the limit is measured in characters rather than bytes.
    #[test]
    fn test_chunk_text() {
        let text = "Hello world this is a test";
        let chunks = chunk_text(text, 10, 2);
        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(chunk.chars().count() <= 10, "chunk too long: {:?}", chunk);
        }
    }

    /// Text that already fits in one chunk is returned unchanged.
    #[test]
    fn test_chunk_text_short_input() {
        assert_eq!(chunk_text("short", 10, 2), vec!["short".to_string()]);
    }

    /// Punctuation is not counted as a word.
    #[test]
    fn test_word_count() {
        let text = "Hello world, this is a test!";
        assert_eq!(count_words(text), 6);
    }

    #[test]
    fn test_language_detection() {
        let french_text = "Bonjour le monde, ceci est un test";
        let english_text = "Hello the world, this is a test";
        assert_eq!(detect_language(french_text), Some("fr".to_string()));
        assert_eq!(detect_language(english_text), Some("en".to_string()));
    }

    /// Windows (`\r\n`) and bare `\r` line endings both become `\n`.
    #[test]
    fn test_normalize_line_breaks() {
        assert_eq!(normalize_line_breaks("a\r\nb\rc\n"), "a\nb\nc\n");
    }
}