use lazy_static::lazy_static;
use regex::Regex;
// Precompiled patterns shared by the cleanup passes below. All page-artifact
// patterns use `(?m)` so `^`/`$` anchor to individual lines.
lazy_static! {
// Three or more consecutive newlines, i.e. two or more blank lines in a row.
static ref RE_MULTI_NEWLINE: Regex = Regex::new(r"\n{3,}").unwrap();
// A line consisting solely of "Page <number>" (e.g. "Page 12").
static ref RE_PAGE_NUM: Regex = Regex::new(r"(?m)^Page\s+\d+\s*$").unwrap();
// A dash-framed page number on its own line, e.g. "- 3 -".
static ref RE_DASH_PAGE: Regex = Regex::new(r"(?m)^\s*-\s*\d+\s*-\s*$").unwrap();
// A bracketed/parenthesized page number on its own line, e.g. "[3]" or "(3)".
static ref RE_BRACKET_PAGE: Regex = Regex::new(r"(?m)^\s*[\[\(]\d+[\]\)]\s*$").unwrap();
// A bare 1-3 digit number alone on a line (assumed to be a page number).
static ref RE_STANDALONE_NUM: Regex = Regex::new(r"(?m)^\s*\d{1,3}\s*$").unwrap();
// A horizontal-rule line made of 5+ dashes/whitespace, e.g. "-----".
static ref RE_DASH_SEP: Regex = Regex::new(r"(?m)^[\s\-]{5,}$").unwrap();
// A horizontal-rule line made of 5+ equals signs/whitespace, e.g. "=====".
static ref RE_EQUALS_SEP: Regex = Regex::new(r"(?m)^[\s=]{5,}$").unwrap();
}
/// Caps runs of consecutive newlines at three (i.e. at most two blank lines
/// between paragraphs). Shorter runs are left untouched.
pub fn normalize_whitespace(text: &str) -> String {
    // `replace_all` returns a `Cow`; `into_owned` reuses the buffer when a
    // replacement happened and copies only when the input was untouched.
    RE_MULTI_NEWLINE.replace_all(text, "\n\n\n").into_owned()
}
pub fn remove_page_artifacts(text: &str) -> String {
let mut result = text.to_string();
result = RE_PAGE_NUM.replace_all(&result, "").to_string();
result = RE_DASH_PAGE.replace_all(&result, "").to_string();
result = RE_BRACKET_PAGE.replace_all(&result, "").to_string();
result = RE_STANDALONE_NUM.replace_all(&result, "").to_string();
result = RE_DASH_SEP.replace_all(&result, "").to_string();
result = RE_EQUALS_SEP.replace_all(&result, "").to_string();
result
}
/// Merges adjacent bold spans by collapsing the literal `** **` separator
/// into a single space, e.g. `**foo** **bar**` becomes `**foo bar**`.
/// Text without that exact separator is returned unchanged.
pub fn merge_bold_markers(text: &str) -> String {
    // A previous revision also compiled an elaborate RE_BOLD_GAP regex here
    // but never used it; the dead pattern (and its lazy_static block) has
    // been removed — the simple literal replacement below is the entire job.
    text.replace("** **", " ")
}
/// Removes an immediately repeated word (4+ word characters, compared
/// case-insensitively), e.g. "the the cat" -> "the cat".
///
/// Only a whitespace-only gap between the two occurrences counts as a
/// duplicate; punctuation ("very, very") signals intentional repetition and
/// is preserved. The whitespace separator is dropped together with the
/// repeated word, so no doubled or trailing spaces are left behind (the
/// previous version emitted the gap before skipping the word, producing
/// "word  next" / trailing spaces). Words shorter than 4 characters
/// ("a a", "of of") are never touched.
pub fn remove_duplicate_words(text: &str) -> String {
    lazy_static! {
        static ref RE_WORD: Regex = Regex::new(r"\b(\w{4,})\b").unwrap();
    }
    let mut result = String::with_capacity(text.len());
    // Lowercased form of the last emitted word; computed once per word
    // instead of re-lowercasing both sides on every comparison.
    let mut last_word: Option<String> = None;
    let mut last_end = 0;
    for cap in RE_WORD.captures_iter(text) {
        let m = cap.get(0).unwrap();
        let word = m.as_str();
        let gap = &text[last_end..m.start()];
        let lowered = word.to_lowercase();
        let is_duplicate = last_word.as_deref() == Some(lowered.as_str())
            && gap.chars().all(char::is_whitespace);
        if !is_duplicate {
            result.push_str(gap);
            result.push_str(word);
            last_word = Some(lowered);
        }
        // Duplicates drop both the separator and the word; `last_word` stays
        // set so a triple repeat ("the the the") collapses fully.
        last_end = m.end();
    }
    // Copy whatever trails the final match (punctuation, newline, etc.).
    result.push_str(&text[last_end..]);
    result
}
/// Full markdown cleanup: strip page-layout artifacts, then squeeze runs of
/// blank lines down to at most two.
pub fn cleanup_markdown(text: &str) -> String {
    normalize_whitespace(&remove_page_artifacts(text))
}
/// Collapses runs of two or more spaces inside each line to a single space,
/// while leaving each line's leading whitespace (indentation) untouched.
///
/// Fixes over the previous version:
/// - Leading empty lines no longer swallow their newline separator (the old
///   `!result.is_empty()` guard turned "\nfoo" into "foo"); a first-iteration
///   flag is used instead.
/// - Indentation is copied verbatim rather than replaced by an equal *byte*
///   count of ASCII spaces, which corrupted tab- or NBSP-indented lines.
///
/// NOTE(review): `str::lines()` drops a final trailing newline, matching the
/// previous behavior — confirm callers do not rely on preserving it.
pub fn normalize_horizontal_whitespace(text: &str) -> String {
    lazy_static! {
        static ref RE_MULTI_SPACE: Regex = Regex::new(r" {2,}").unwrap();
    }
    let mut result = String::with_capacity(text.len());
    let mut first = true;
    for line in text.lines() {
        if first {
            first = false;
        } else {
            result.push('\n');
        }
        let body = line.trim_start();
        // Slice the original indentation back out of the line; `trim_start`
        // only removes leading whitespace, so this is always a valid split.
        let indent_len = line.len() - body.len();
        result.push_str(&line[..indent_len]);
        result.push_str(&RE_MULTI_SPACE.replace_all(body, " "));
    }
    result
}
/// Full plain-text cleanup pipeline: rejoin soft-hyphenated line breaks,
/// collapse repeated spaces within lines, then squeeze excess blank lines.
pub fn cleanup_plain_text(text: &str) -> String {
    normalize_whitespace(&normalize_horizontal_whitespace(&dehyphenate_line_breaks(text)))
}
/// Rejoins words split by a soft hyphen at a line break: a `-` preceded by
/// an ASCII lowercase letter, followed by optional spaces/tabs, an optional
/// `\r`, a `\n`, optional spaces/tabs, and an ASCII lowercase continuation
/// letter is removed together with the intervening break
/// ("scruti-\nneer" -> "scrutineer"). Anything else — intentional hyphens,
/// capitalized continuations, non-letter continuations — is copied verbatim.
pub fn dehyphenate_line_breaks(text: &str) -> String {
    // Advance past a run of spaces/tabs starting at `idx`.
    fn skip_blanks(b: &[u8], mut idx: usize) -> usize {
        while idx < b.len() && matches!(b[idx], b' ' | b'\t') {
            idx += 1;
        }
        idx
    }
    let b = text.as_bytes();
    let mut out = String::with_capacity(text.len());
    let mut pos = 0usize;
    while pos < b.len() {
        // Candidate soft hyphen: '-' with a lowercase ASCII letter before it.
        if b[pos] == b'-' && pos > 0 && b[pos - 1].is_ascii_lowercase() {
            let mut probe = skip_blanks(b, pos + 1);
            if probe < b.len() && b[probe] == b'\r' {
                probe += 1;
            }
            if probe < b.len() && b[probe] == b'\n' {
                probe = skip_blanks(b, probe + 1);
                if probe < b.len() && b[probe].is_ascii_lowercase() {
                    // Drop hyphen + break; resume at the continuation letter.
                    pos = probe;
                    continue;
                }
            }
        }
        // Copy the current character, multi-byte aware. `pos` only ever lands
        // on char boundaries (advanced by full UTF-8 widths or onto an ASCII
        // byte), so slicing here cannot panic.
        let ch = text[pos..].chars().next().expect("pos is on a char boundary");
        out.push(ch);
        pos += ch.len_utf8();
    }
    out
}
#[cfg(test)]
// Unit tests for the cleanup passes. Each test pins observable behavior of a
// single public function with literal inputs/outputs.
mod tests {
use super::*;
// Soft-hyphen line breaks are rejoined, including \r\n and stray blanks
// around the break.
#[test]
fn dehyphenate_rejoins_soft_hyphen_line_break() {
assert_eq!(dehyphenate_line_breaks("scruti-\nneer"), "scrutineer");
assert_eq!(dehyphenate_line_breaks("scruti-\r\nneer"), "scrutineer");
assert_eq!(dehyphenate_line_breaks("scruti- \n neer"), "scrutineer");
}
// An uppercase continuation (likely a proper noun) must not be merged.
#[test]
fn dehyphenate_does_not_merge_proper_noun_fragment() {
assert_eq!(dehyphenate_line_breaks("co-\nWorker"), "co-\nWorker");
}
// No hyphen, or a non-lowercase continuation (bullet), means no merge.
#[test]
fn dehyphenate_requires_hyphen_and_lowercase_both_sides() {
assert_eq!(dehyphenate_line_breaks("no\nhyphen"), "no\nhyphen");
assert_eq!(dehyphenate_line_breaks("bullet-\n• point"), "bullet-\n• point");
}
// Hyphens inside a line (compound words) are left alone.
#[test]
fn dehyphenate_preserves_intentional_hyphens() {
assert_eq!(
dehyphenate_line_breaks("state-of-the-art solution"),
"state-of-the-art solution"
);
}
// Runs of 3+ newlines are capped at exactly three.
#[test]
fn test_normalize_whitespace_reduces_excessive_blanks() {
let input = "Line 1\n\n\n\n\n\n\n\nLine 2";
let output = normalize_whitespace(input);
assert_eq!(output, "Line 1\n\n\nLine 2");
assert!(!output.contains("\n\n\n\n")); }
// Runs of up to three newlines pass through untouched.
#[test]
fn test_normalize_whitespace_preserves_single_and_double_blanks() {
let input = "A\nB\n\nC\n\n\nD";
let output = normalize_whitespace(input);
assert_eq!(output, "A\nB\n\nC\n\n\nD");
}
// Text with no blank lines at all is returned unchanged.
#[test]
fn test_normalize_whitespace_handles_no_blanks() {
let input = "Line 1\nLine 2\nLine 3";
let output = normalize_whitespace(input);
assert_eq!(output, input);
}
// "Page N" lines are removed; surrounding content survives.
#[test]
fn test_remove_page_artifacts_page_numbers() {
let input = "Content\n\nPage 1\n\nMore content\n\nPage 2\n\nEnd";
let output = remove_page_artifacts(input);
assert!(output.contains("Content"));
assert!(output.contains("More content"));
assert!(output.contains("End"));
assert!(!output.contains("Page 1"));
assert!(!output.contains("Page 2"));
}
// Dash-framed page markers ("- N -") are removed.
#[test]
fn test_remove_page_artifacts_dash_style() {
let input = "Content\n\n- 1 -\n\nMore content\n\n- 2 -\n\nEnd";
let output = remove_page_artifacts(input);
assert!(!output.contains("- 1 -"));
assert!(!output.contains("- 2 -"));
}
// Bracketed/parenthesized page markers ("[N]" / "(N)") are removed.
#[test]
fn test_remove_page_artifacts_bracket_style() {
let input = "Content\n\n[1]\n\nMore\n\n(2)\n\nEnd";
let output = remove_page_artifacts(input);
assert!(!output.contains("[1]"));
assert!(!output.contains("(2)"));
}
// Bare 1-3 digit numbers on their own line are treated as page numbers.
#[test]
fn test_remove_page_artifacts_standalone_numbers() {
let input = "Content\n\n1\n\nMore\n\n42\n\nEnd";
let output = remove_page_artifacts(input);
assert!(!output.contains("\n1\n"));
assert!(!output.contains("\n42\n"));
}
// Numbers embedded in running text are never stripped.
#[test]
fn test_remove_page_artifacts_preserves_inline_numbers() {
let input = "There are 42 items in the list.";
let output = remove_page_artifacts(input);
assert_eq!(output, input);
}
// Dash and equals horizontal-rule lines are removed.
#[test]
fn test_remove_page_artifacts_separators() {
let input = "Section 1\n\n-----------\n\nSection 2\n\n===========\n\nEnd";
let output = remove_page_artifacts(input);
assert!(!output.contains("-----------"));
assert!(!output.contains("==========="));
}
// End-to-end: artifacts removed AND blank-line runs capped in one pass.
#[test]
fn test_cleanup_markdown_full_pipeline() {
let input = "Content\n\n\n\n\n\nPage 1\n\n\n\n\n\nMore content\n\n-----------\n\n\n\n\nEnd";
let output = cleanup_markdown(input);
assert!(!output.contains("Page 1"));
assert!(!output.contains("-----------"));
assert!(!output.contains("\n\n\n\n"));
assert!(output.contains("Content"));
assert!(output.contains("More content"));
assert!(output.contains("End"));
}
// Empty input is the trivial fixed point of the pipeline.
#[test]
fn test_cleanup_markdown_empty_string() {
let output = cleanup_markdown("");
assert_eq!(output, "");
}
// Already-clean input is returned byte-identical.
#[test]
fn test_cleanup_markdown_no_changes_needed() {
let input = "Line 1\n\nLine 2\n\nLine 3";
let output = cleanup_markdown(input);
assert_eq!(output, input);
}
}