#![allow(missing_docs, clippy::unused_self)]
use once_cell::sync::Lazy;
use regex::Regex;
// Matches a word split across a line break by a trailing hyphen, e.g. "hy-\n phen";
// capture groups 1 and 2 hold the two word halves so they can be rejoined.
static HYPHEN_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"(\w+)-\s*\n\s*(\w+)").unwrap());
// Matches any run of two or more whitespace characters (spaces, tabs, newlines).
static MULTI_SPACE_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"\s{2,}").unwrap());
// Matches a single newline flanked by non-newline characters — i.e. a soft line
// break inside a paragraph, not a blank-line paragraph separator.
static LINE_BREAK_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"([^\n])\n([^\n])").unwrap());
/// Character-normalization table applied in order by
/// `TextSanitizer::normalize_characters`: each `(from, to)` pair is handed to
/// `str::replace` over the whole intermediate string. Ordering matters when
/// the output of an earlier entry could match a later key.
const CHAR_NORMALIZATION_MAP: &[(&str, &str)] = &[
    // Fraction slash and division slash.
    ("⁄", "/"),
    ("∕", "/"),
    // Quotation marks and guillemets.
    // NOTE(review): the next two entries look like apostrophe-to-apostrophe
    // identity maps — they were probably curly quotes (U+2018/U+2019) before
    // an encoding round-trip; verify against the original bytes.
    ("'", "'"),
    ("'", "'"),
    ("\u{201C}", "\""),
    ("\u{201D}", "\""),
    ("\u{201E}", "\""),
    ("\u{201F}", "\""),
    ("‹", "<"),
    ("›", ">"),
    ("«", "<<"),
    ("»", ">>"),
    // Dash and hyphen look-alikes, all flattened to ASCII '-'.
    ("–", "-"),
    ("—", "-"),
    ("‐", "-"),
    ("‑", "-"),
    ("−", "-"),
    ("‒", "-"),
    ("―", "-"),
    // Bullet characters.
    ("•", "·"),
    ("‣", "·"),
    ("⁃", "·"),
    ("◦", "o"),
    ("▪", "·"),
    ("▫", "o"),
    // Ellipsis.
    ("…", "..."),
    // Exotic space characters flattened to a plain space.
    // NOTE(review): these five entries render as space-to-space identity maps —
    // the keys were likely NBSP/thin/em-space variants before an encoding
    // round-trip; verify against the original bytes.
    (" ", " "),
    (" ", " "),
    (" ", " "),
    (" ", " "),
    (" ", " "),
    // Arithmetic and comparison operators.
    ("×", "x"),
    ("÷", "/"),
    ("±", "+/-"),
    ("∓", "-/+"),
    ("≤", "<="),
    ("≥", ">="),
    ("≠", "!="),
    ("≈", "~="),
    ("≡", "==="),
    ("≢", "!=="),
    // Mathematical symbols spelled out as words or ASCII names.
    ("∞", "infinity"),
    ("∫", "integral"),
    ("∑", "sum"),
    ("∏", "product"),
    ("√", "sqrt"),
    ("∛", "cbrt"),
    ("∜", "fourthrt"),
    ("∂", "d"),
    ("∆", "delta"),
    ("∇", "nabla"),
    // Arrows.
    ("→", "->"),
    ("←", "<-"),
    ("↑", "^"),
    ("↓", "v"),
    ("↔", "<->"),
    ("⇒", "=>"),
    ("⇐", "<="),
    ("⇔", "<=>"),
    ("↦", "|->"),
    // Superscript digits, signs, and parentheses.
    ("⁰", "0"),
    ("¹", "1"),
    ("²", "2"),
    ("³", "3"),
    ("⁴", "4"),
    ("⁵", "5"),
    ("⁶", "6"),
    ("⁷", "7"),
    ("⁸", "8"),
    ("⁹", "9"),
    ("⁺", "+"),
    ("⁻", "-"),
    ("⁼", "="),
    ("⁽", "("),
    ("⁾", ")"),
    // Subscript digits, signs, and parentheses.
    ("₀", "0"),
    ("₁", "1"),
    ("₂", "2"),
    ("₃", "3"),
    ("₄", "4"),
    ("₅", "5"),
    ("₆", "6"),
    ("₇", "7"),
    ("₈", "8"),
    ("₉", "9"),
    ("₊", "+"),
    ("₋", "-"),
    ("₌", "="),
    ("₍", "("),
    ("₎", ")"),
    // Lowercase Greek letters spelled out.
    ("α", "alpha"),
    ("β", "beta"),
    ("γ", "gamma"),
    ("δ", "delta"),
    ("ε", "epsilon"),
    ("ζ", "zeta"),
    ("η", "eta"),
    ("θ", "theta"),
    ("ι", "iota"),
    ("κ", "kappa"),
    ("λ", "lambda"),
    ("μ", "mu"),
    ("ν", "nu"),
    ("ξ", "xi"),
    ("ο", "omicron"),
    ("π", "pi"),
    ("ρ", "rho"),
    ("σ", "sigma"),
    ("τ", "tau"),
    ("υ", "upsilon"),
    ("φ", "phi"),
    ("χ", "chi"),
    ("ψ", "psi"),
    ("ω", "omega"),
    // Uppercase Greek letters spelled out.
    ("Α", "Alpha"),
    ("Β", "Beta"),
    ("Γ", "Gamma"),
    ("Δ", "Delta"),
    ("Ε", "Epsilon"),
    ("Ζ", "Zeta"),
    ("Η", "Eta"),
    ("Θ", "Theta"),
    ("Ι", "Iota"),
    ("Κ", "Kappa"),
    ("Λ", "Lambda"),
    ("Μ", "Mu"),
    ("Ν", "Nu"),
    ("Ξ", "Xi"),
    ("Ο", "Omicron"),
    ("Π", "Pi"),
    ("Ρ", "Rho"),
    ("Σ", "Sigma"),
    ("Τ", "Tau"),
    ("Υ", "Upsilon"),
    ("Φ", "Phi"),
    ("Χ", "Chi"),
    ("Ψ", "Psi"),
    ("Ω", "Omega"),
    // Typographic ligatures expanded to their component letters.
    // NOTE(review): several of these render as identity maps ("fi" -> "fi") —
    // the keys were probably single ligature codepoints (U+FB00..U+FB06)
    // before an encoding round-trip; verify against the original bytes.
    ("fi", "fi"),
    ("fl", "fl"),
    ("ff", "ff"),
    ("ffi", "ffi"),
    ("ffl", "ffl"),
    ("ſt", "ft"),
    ("st", "st"),
    // Legal marks, typographic symbols, and currency signs.
    ("©", "(c)"),
    ("®", "(R)"),
    ("™", "(TM)"),
    ("°", " degrees"),
    ("§", "section"),
    ("¶", "paragraph"),
    ("†", "+"),
    ("‡", "++"),
    ("¢", "cents"),
    ("£", "GBP"),
    ("¥", "JPY"),
    ("€", "EUR"),
];
/// Configurable cleaner for text extracted from PDFs: rejoins hyphenated
/// words, merges soft line breaks, maps exotic Unicode to ASCII-ish
/// equivalents, and collapses whitespace runs.
///
/// Four independent `bool` flags, so the struct is trivially `Copy`/`Eq`;
/// the extra derives are backward-compatible API additions.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TextSanitizer {
    // Rejoin words split as "hy-\nphen" into "hyphen".
    join_hyphens: bool,
    // Replace a single in-paragraph newline with a space.
    join_lines: bool,
    // Apply the CHAR_NORMALIZATION_MAP replacements.
    normalize_chars: bool,
    // Collapse runs of whitespace into a single space.
    normalize_whitespace: bool,
}
impl TextSanitizer {
    /// Builds a sanitizer with every cleanup pass switched on.
    pub fn new() -> Self {
        Self::with_options(true, true, true, true)
    }

    /// Builds a sanitizer with each cleanup pass toggled individually.
    pub fn with_options(
        join_hyphens: bool,
        join_lines: bool,
        normalize_chars: bool,
        normalize_whitespace: bool,
    ) -> Self {
        Self {
            join_hyphens,
            join_lines,
            normalize_chars,
            normalize_whitespace,
        }
    }

    /// Runs the enabled passes in a fixed order — character normalization,
    /// hyphen joining, line joining, whitespace collapsing — then trims
    /// leading/trailing whitespace from the result.
    pub fn sanitize(&self, text: &str) -> String {
        // Table-driven dispatch keeps the pass order in one obvious place.
        let passes: [(bool, fn(&Self, &str) -> String); 4] = [
            (self.normalize_chars, Self::normalize_characters),
            (self.join_hyphens, Self::join_hyphenated_words),
            (self.join_lines, Self::join_lines_with_space),
            (self.normalize_whitespace, Self::normalize_whitespace_chars),
        ];
        let mut cleaned = text.to_owned();
        for (enabled, pass) in passes {
            if enabled {
                cleaned = pass(self, &cleaned);
            }
        }
        cleaned.trim().to_owned()
    }

    /// Rejoins words that were split across a line break by a trailing hyphen.
    fn join_hyphenated_words(&self, text: &str) -> String {
        HYPHEN_PATTERN.replace_all(text, "$1$2").into_owned()
    }

    /// Replaces a single newline between non-newline characters with a space.
    fn join_lines_with_space(&self, text: &str) -> String {
        LINE_BREAK_PATTERN.replace_all(text, "$1 $2").into_owned()
    }

    /// Applies every `CHAR_NORMALIZATION_MAP` entry, in table order.
    fn normalize_characters(&self, text: &str) -> String {
        CHAR_NORMALIZATION_MAP
            .iter()
            .fold(text.to_owned(), |acc, (from, to)| acc.replace(from, to))
    }

    /// Collapses each run of two or more whitespace characters to one space.
    fn normalize_whitespace_chars(&self, text: &str) -> String {
        MULTI_SPACE_PATTERN.replace_all(text, " ").into_owned()
    }
}
impl Default for TextSanitizer {
fn default() -> Self {
Self::new()
}
}
/// Convenience wrapper: sanitizes `text` with a default (all passes enabled)
/// [`TextSanitizer`].
pub fn sanitize_text(text: &str) -> String {
    TextSanitizer::default().sanitize(text)
}
/// Concatenates text cells, optionally separated by single spaces.
/// An empty slice yields an empty string.
pub fn join_text_cells(texts: &[&str], add_spaces: bool) -> String {
    let separator = if add_spaces { " " } else { "" };
    // `join` on an empty slice returns "", so no explicit empty check needed.
    texts.join(separator)
}
pub fn is_likely_heading(text: &str) -> bool {
let text = text.trim();
if text.is_empty() || text.len() > 100 {
return false;
}
if text.ends_with('.') || text.ends_with('?') || text.ends_with('!') {
return false;
}
let uppercase_ratio = text
.chars()
.filter(|c| c.is_alphabetic())
.filter(|c| c.is_uppercase())
.count() as f32
/ text.chars().filter(|c| c.is_alphabetic()).count().max(1) as f32;
if uppercase_ratio > 0.7 {
return true;
}
let section_number_pattern = Regex::new(r"^\d+(\.\d+)*\.?\s").unwrap();
if section_number_pattern.is_match(text) {
return true;
}
false
}
pub fn extract_section_number(text: &str) -> Option<String> {
let section_pattern = Regex::new(r"^(\d+(\.\d+)*)\.?\s").unwrap();
section_pattern
.captures(text)
.and_then(|caps| caps.get(1))
.map(|m| m.as_str().to_string())
}
/// Returns the nesting depth of a dotted section number:
/// "1" -> 1, "1.2" -> 2, "1.2.3" -> 3.
pub fn calculate_section_level(section_number: &str) -> usize {
    // A dotted number has exactly one more component than it has dots,
    // which is what `split('.').count()` computes as well.
    section_number.matches('.').count() + 1
}
/// Strips invisible PDF-extraction artifacts: zero-width characters
/// (U+200B..U+200D), the BOM (U+FEFF), the soft hyphen (U+00AD), and all C0
/// control characters except newline and tab, which are kept.
pub fn remove_pdf_artifacts(text: &str) -> String {
    text.chars()
        .filter(|&c| {
            // Newline and tab are meaningful whitespace — always keep them.
            if c == '\n' || c == '\t' {
                return true;
            }
            // Drop zero-width chars, BOM, soft hyphen, and other C0 controls.
            !matches!(
                c,
                '\u{200B}' | '\u{200C}' | '\u{200D}' | '\u{FEFF}' | '\u{00AD}' | '\0'..='\u{001F}'
            )
        })
        .collect()
}
#[cfg(test)]
mod tests {
    use super::*;

    // End-to-end: the hyphen pass rejoins a word split across a line break.
    #[test]
    fn test_hyphen_joining() {
        let sanitizer = TextSanitizer::new();
        let text = "This is a hyphen-\nated word.";
        let result = sanitizer.sanitize(text);
        assert_eq!(result, "This is a hyphenated word.");
    }

    // A single in-paragraph newline becomes a space.
    #[test]
    fn test_line_joining() {
        let sanitizer = TextSanitizer::new();
        let text = "Line one\nLine two";
        let result = sanitizer.sanitize(text);
        assert_eq!(result, "Line one Line two");
    }

    // Fraction slash, em dash, and curly double quotes map to ASCII forms.
    #[test]
    fn test_character_normalization() {
        let sanitizer = TextSanitizer::new();
        let text = "Price: $100⁄month — \u{201C}special\u{201D} offer";
        let result = sanitizer.sanitize(text);
        assert_eq!(result, "Price: $100/month - \"special\" offer");
    }

    // NOTE(review): the fixture appears to contain only single spaces, which
    // makes this test vacuous — the original likely had runs of multiple
    // spaces that were collapsed by an encoding round-trip; verify.
    #[test]
    fn test_whitespace_normalization() {
        let sanitizer = TextSanitizer::new();
        let text = "Too many spaces";
        let result = sanitizer.sanitize(text);
        assert_eq!(result, "Too many spaces");
    }

    // Positive cases: section numbers and all-caps; negatives: sentence
    // punctuation and over-length text.
    #[test]
    fn test_is_likely_heading() {
        assert!(is_likely_heading("1. Introduction"));
        assert!(is_likely_heading("CHAPTER 1"));
        assert!(is_likely_heading("1.2.3 Methods"));
        assert!(!is_likely_heading("This is a regular sentence."));
        assert!(!is_likely_heading(
            "This is a very long text that goes on and on and definitely should not be considered a heading because it's way too long."
        ));
    }

    // The trailing dot of "1. " is not part of the extracted number.
    #[test]
    fn test_extract_section_number() {
        assert_eq!(
            extract_section_number("1.2.3 Methods"),
            Some("1.2.3".to_string())
        );
        assert_eq!(
            extract_section_number("1. Introduction"),
            Some("1".to_string())
        );
        assert_eq!(extract_section_number("No number here"), None);
    }

    // Depth equals the number of dotted components.
    #[test]
    fn test_calculate_section_level() {
        assert_eq!(calculate_section_level("1"), 1);
        assert_eq!(calculate_section_level("1.2"), 2);
        assert_eq!(calculate_section_level("1.2.3"), 3);
        assert_eq!(calculate_section_level("1.2.3.4"), 4);
    }

    // Zero-width space and soft hyphen are removed outright.
    #[test]
    fn test_remove_pdf_artifacts() {
        let text = "Hello\u{200B}World\u{00AD}Test";
        let result = remove_pdf_artifacts(text);
        assert_eq!(result, "HelloWorldTest");
    }

    // NOTE(review): input and expected output are identical here — the input's
    // ligature codepoints (U+FB00..) were probably flattened to plain letters
    // by an encoding round-trip, making this test vacuous; verify the fixture.
    #[test]
    fn test_ligature_normalization() {
        let sanitizer = TextSanitizer::new();
        let text = "file with ligatures: ff, fi, fl";
        let result = sanitizer.sanitize(text);
        assert_eq!(result, "file with ligatures: ff, fi, fl");
    }
}