#![allow(clippy::unused_self)]
use regex::Regex;
/// Configuration for a multi-pass plain-text clean-up.
///
/// Each flag switches one optional pass of [`TextOptimizer::optimize`] on or
/// off. The type is cheap to clone and can be compared, so a configuration can
/// be stored, duplicated, or checked in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TextOptimizer {
    /// When `true`, `optimize` runs the header/footer removal pass.
    remove_headers_footers: bool,
    /// When `true`, `optimize` runs the whitespace normalization pass.
    normalize_whitespace: bool,
    /// When `true`, `optimize` re-joins words hyphenated across a line break.
    remove_hyphenation: bool,
}
impl TextOptimizer {
pub fn new() -> Self {
Self {
remove_headers_footers: true,
normalize_whitespace: true,
remove_hyphenation: true,
}
}
pub fn with_header_footer_removal(mut self, enable: bool) -> Self {
self.remove_headers_footers = enable;
self
}
pub fn with_whitespace_normalization(mut self, enable: bool) -> Self {
self.normalize_whitespace = enable;
self
}
pub fn with_hyphenation_removal(mut self, enable: bool) -> Self {
self.remove_hyphenation = enable;
self
}
pub fn optimize(&self, text: &str) -> String {
let mut result = text.to_string();
if self.remove_hyphenation {
result = self.remove_hyphenation_impl(&result);
}
if self.normalize_whitespace {
result = self.normalize_whitespace_impl(&result);
}
if self.remove_headers_footers {
result = self.remove_headers_footers_impl(&result);
}
result = self.detect_paragraphs(&result);
result
}
fn remove_hyphenation_impl(&self, text: &str) -> String {
let re = Regex::new(r"(\w+)-\s*\n\s*(\w+)").unwrap();
re.replace_all(text, "$1$2").to_string()
}
fn normalize_whitespace_impl(&self, text: &str) -> String {
let mut result = text.to_string();
let re = Regex::new(r"([^\n]) {2,}").unwrap();
result = re.replace_all(&result, "$1 ").to_string();
result = result.replace('\t', " ");
result = result.replace("\r\n", "\n");
result = result.replace('\r', "\n");
result
}
fn remove_headers_footers_impl(&self, text: &str) -> String {
let lines: Vec<&str> = text.lines().collect();
if lines.len() < 10 {
return text.to_string(); }
let mut filtered_lines = Vec::new();
for line in lines {
if line.trim().chars().all(|c| c.is_ascii_digit()) {
continue;
}
if line.trim().len() < 5 {
continue;
}
filtered_lines.push(line);
}
filtered_lines.join("\n")
}
fn detect_paragraphs(&self, text: &str) -> String {
let lines: Vec<&str> = text.lines().collect();
let mut result = Vec::new();
let mut current_paragraph = Vec::new();
for line in lines {
let trimmed = line.trim();
if trimmed.is_empty() {
if !current_paragraph.is_empty() {
result.push(current_paragraph.join(" "));
result.push(String::new()); current_paragraph.clear();
}
} else {
if Self::is_paragraph_start(trimmed) && !current_paragraph.is_empty() {
result.push(current_paragraph.join(" "));
current_paragraph.clear();
}
current_paragraph.push(trimmed.to_string());
}
}
if !current_paragraph.is_empty() {
result.push(current_paragraph.join(" "));
}
result.join("\n")
}
fn is_paragraph_start(line: &str) -> bool {
line.starts_with("• ")
|| line.starts_with("- ")
|| line.starts_with("* ")
|| line.starts_with(|c: char| c.is_ascii_digit() && line.contains(". "))
|| line.starts_with('#')
}
}
impl Default for TextOptimizer {
fn default() -> Self {
Self::new()
}
}
pub fn remove_excessive_whitespace(text: &str) -> String {
let re = Regex::new(r"\s{3,}").unwrap();
re.replace_all(text, " ").to_string()
}
/// Converts Windows (`\r\n`) and bare carriage-return (`\r`) line endings to
/// `\n`, leaving existing `\n` characters as they are.
pub fn normalize_line_breaks(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    let mut chars = text.chars().peekable();
    while let Some(ch) = chars.next() {
        if ch != '\r' {
            normalized.push(ch);
            continue;
        }
        // A CR followed by LF is one line break, so swallow the LF.
        if chars.peek() == Some(&'\n') {
            chars.next();
        }
        normalized.push('\n');
    }
    normalized
}
/// Removes lines that consist solely of ASCII digits (ignoring surrounding
/// whitespace) and joins the remaining lines with `\n`.
///
/// Blank lines are preserved: they carry no digits, so they are not page
/// numbers. A trailing newline in `text` is not reproduced in the result.
#[must_use]
pub fn remove_page_numbers(text: &str) -> String {
    text.lines()
        .filter(|line| {
            let trimmed = line.trim();
            // `all` is vacuously true for an empty string, so emptiness has to
            // be ruled out before a line is treated as a page number.
            trimmed.is_empty() || !trimmed.chars().all(|c| c.is_ascii_digit())
        })
        .collect::<Vec<_>>()
        .join("\n")
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A word split by a hyphen at a line break is joined back together.
    #[test]
    fn test_hyphenation_removal() {
        let optimizer = TextOptimizer::new();
        let text = "This is a long-\nword that continues.";
        let result = optimizer.remove_hyphenation_impl(text);
        assert_eq!(result, "This is a longword that continues.");
    }

    /// Runs of several spaces between words collapse to a single space.
    #[test]
    fn test_whitespace_normalization() {
        let optimizer = TextOptimizer::new();
        // The input must actually contain redundant spaces to exercise the pass.
        let text = "Hello    world   test";
        let result = optimizer.normalize_whitespace_impl(text);
        assert_eq!(result, "Hello world test");
    }

    /// Three or more whitespace characters in a row become one space.
    #[test]
    fn test_remove_excessive_whitespace() {
        let text = "Hello     world";
        let result = remove_excessive_whitespace(text);
        assert_eq!(result, "Hello world");
    }

    /// CRLF and bare CR are both rewritten to LF.
    #[test]
    fn test_normalize_line_breaks() {
        let text = "Line 1\r\nLine 2\rLine 3\nLine 4";
        let result = normalize_line_breaks(text);
        assert_eq!(result, "Line 1\nLine 2\nLine 3\nLine 4");
    }

    /// The full pipeline removes double spaces and hyphenated line breaks and
    /// reduces multiple blank lines to a single paragraph separator.
    #[test]
    fn test_full_optimization() {
        let optimizer = TextOptimizer::new();
        let text = "This  is a test-\ntext with\n\n\n\nmultiple   issues.";
        let result = optimizer.optimize(text);
        // Single spaces between words are expected; only doubled ones are not.
        assert!(!result.contains("  "));
        assert!(!result.contains("-\n"));
        assert_eq!(result, "This is a testtext with\n\nmultiple issues.");
    }
}