use regex::Regex;
use unicode_normalization::UnicodeNormalization;
/// Named bundles of cleanup settings that `CleanupOptions::from_preset` expands
/// into a full `CleanupOptions` value.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum CleanupPreset {
/// Only Unicode normalization and whitespace normalization are enabled
/// (front matter is still preserved); see `CleanupOptions::minimal`.
Minimal,
/// The default preset: every flag is enabled except `detect_mojibake` and
/// `remove_pua`; see `CleanupOptions::standard`.
#[default]
Standard,
/// Every flag is enabled; see `CleanupOptions::aggressive`.
Aggressive,
}
/// Switches controlling which cleanup stages `CleanupPipeline` applies.
///
/// Each `bool` enables one stage; the stages run in a fixed order decided by
/// the pipeline, not by the order of the fields here.
#[derive(Debug, Clone)]
pub struct CleanupOptions {
/// Apply Unicode NFC normalization to the text.
pub normalize_unicode: bool,
/// Replace assorted bullet glyphs (●, ○, ■, …) with `•`.
pub standardize_bullets: bool,
/// Remove lines that consist only of a number, optionally flanked by dashes.
pub remove_page_numbers: bool,
/// NOTE(review): set by the presets but not read by the pipeline code visible
/// in this file — confirm whether a header/footer stage exists elsewhere.
pub remove_headers_footers: bool,
/// Strip table-of-contents dot leaders (`....`) from line ends, keeping a
/// trailing page number as ` (p.N)`.
pub remove_toc: bool,
/// Expand typographic ligature code points (U+FB00..U+FB06) to plain letters.
pub fix_ligatures: bool,
/// Re-join words that were split by a hyphen at a line break.
pub fix_hyphenation: bool,
/// NOTE(review): set by the presets but not read by the pipeline code visible
/// in this file — confirm whether mojibake detection is implemented elsewhere.
pub detect_mojibake: bool,
/// Drop characters from the Unicode private-use ranges.
pub remove_pua: bool,
/// Drop U+FFFD REPLACEMENT CHARACTER.
pub remove_replacement_char: bool,
/// Merge single line breaks into spaces while keeping paragraph breaks.
pub merge_single_newlines: bool,
/// Join a list marker that sits alone on a line with the text that follows it.
pub merge_list_markers: bool,
/// Remove line breaks that fall between CJK characters.
pub merge_cjk_lines: bool,
/// Collapse runs of consecutive spaces.
pub normalize_whitespace: bool,
/// Upper bound on consecutive `\n` characters; `0` disables the limit
/// (the pipeline only applies it when the value is greater than zero).
pub max_consecutive_newlines: u8,
/// When the text starts with a `---` fenced front-matter block, keep that
/// block verbatim and clean only the content after it.
pub preserve_frontmatter: bool,
}
impl CleanupOptions {
    /// Expands a [`CleanupPreset`] into the concrete option set it stands for.
    pub fn from_preset(preset: CleanupPreset) -> Self {
        match preset {
            CleanupPreset::Minimal => Self::minimal(),
            CleanupPreset::Standard => Self::standard(),
            CleanupPreset::Aggressive => Self::aggressive(),
        }
    }

    /// Lightest preset: Unicode and whitespace normalization only.
    ///
    /// Front matter is preserved and no newline cap is applied
    /// (`max_consecutive_newlines` is `0`).
    pub fn minimal() -> Self {
        Self {
            // The only stages that run.
            normalize_unicode: true,
            normalize_whitespace: true,
            preserve_frontmatter: true,
            // Zero means "do not limit consecutive newlines".
            max_consecutive_newlines: 0,
            // Everything else stays off.
            standardize_bullets: false,
            remove_page_numbers: false,
            remove_headers_footers: false,
            remove_toc: false,
            fix_ligatures: false,
            fix_hyphenation: false,
            detect_mojibake: false,
            remove_pua: false,
            remove_replacement_char: false,
            merge_single_newlines: false,
            merge_list_markers: false,
            merge_cjk_lines: false,
        }
    }

    /// Default preset: identical to [`Self::aggressive`] except that mojibake
    /// detection and private-use removal are off and the newline cap is `1`.
    pub fn standard() -> Self {
        Self {
            detect_mojibake: false,
            remove_pua: false,
            // NOTE(review): a cap of 1 collapses blank lines between paragraphs,
            // whereas `aggressive()` allows 2 — confirm this is intended.
            max_consecutive_newlines: 1,
            ..Self::aggressive()
        }
    }

    /// Heaviest preset: every flag enabled, at most two consecutive newlines.
    pub fn aggressive() -> Self {
        Self {
            normalize_unicode: true,
            standardize_bullets: true,
            remove_page_numbers: true,
            remove_headers_footers: true,
            remove_toc: true,
            fix_ligatures: true,
            fix_hyphenation: true,
            detect_mojibake: true,
            remove_pua: true,
            remove_replacement_char: true,
            merge_single_newlines: true,
            merge_list_markers: true,
            merge_cjk_lines: true,
            normalize_whitespace: true,
            max_consecutive_newlines: 2,
            preserve_frontmatter: true,
        }
    }
}
impl Default for CleanupOptions {
fn default() -> Self {
Self::standard()
}
}
/// Applies the cleanup stages selected by a `CleanupOptions` value to text.
///
/// The regexes and the ligature table stored here are built once in `new`.
pub struct CleanupPipeline {
/// Which stages `process` runs.
options: CleanupOptions,
/// Matches a line made up only of a number, optionally flanked by dashes.
page_number_regex: Regex,
/// Matches a trailing run of four or more dots, optionally followed by a
/// page number (capture group 1).
toc_dot_leader_regex: Regex,
/// `(ligature, replacement)` pairs used to expand ligature code points.
ligature_map: Vec<(&'static str, &'static str)>,
}
impl CleanupPipeline {
pub fn new(options: CleanupOptions) -> Self {
Self {
options,
page_number_regex: Regex::new(r"(?m)^[\s]*[-–—]?\s*\d+\s*[-–—]?\s*$").unwrap(),
toc_dot_leader_regex: Regex::new(r"\s*\.{4,}\s*(\d+)?\s*$").unwrap(),
ligature_map: vec![
("\u{FB00}", "ff"), ("\u{FB01}", "fi"), ("\u{FB02}", "fl"), ("\u{FB03}", "ffi"), ("\u{FB04}", "ffl"), ("\u{FB05}", "st"), ("\u{FB06}", "st"), ],
}
}
pub fn from_preset(preset: CleanupPreset) -> Self {
Self::new(CleanupOptions::from_preset(preset))
}
pub fn process(&self, text: &str) -> String {
let mut result = text.to_string();
let frontmatter = if self.options.preserve_frontmatter {
self.extract_frontmatter(&result)
} else {
None
};
if let Some((fm, content)) = frontmatter {
result = content;
result = self.process_content(&result);
result = format!("{}\n{}", fm, result);
} else {
result = self.process_content(&result);
}
result
}
fn process_content(&self, text: &str) -> String {
let mut result = text.to_string();
if self.options.normalize_unicode {
result = result.nfc().collect();
}
if self.options.fix_ligatures {
for (ligature, replacement) in &self.ligature_map {
result = result.replace(ligature, replacement);
}
}
if self.options.standardize_bullets {
result = self.standardize_bullets(&result);
}
if self.options.remove_pua {
result = self.remove_pua_chars(&result);
}
if self.options.remove_replacement_char {
result = result.replace('\u{FFFD}', "");
}
if self.options.remove_page_numbers {
result = self.page_number_regex.replace_all(&result, "").to_string();
}
if self.options.remove_toc {
result = self.remove_toc_dot_leaders(&result);
}
if self.options.fix_hyphenation {
result = self.fix_hyphenation(&result);
}
if self.options.merge_list_markers {
result = self.merge_list_markers(&result);
}
if self.options.merge_cjk_lines {
result = self.merge_cjk_lines(&result);
}
if self.options.merge_single_newlines {
result = self.merge_single_newlines(&result);
}
if self.options.normalize_whitespace {
result = self.normalize_whitespace(&result);
}
if self.options.max_consecutive_newlines > 0 {
result = self.limit_newlines(&result);
}
result.trim().to_string()
}
fn extract_frontmatter(&self, text: &str) -> Option<(String, String)> {
if let Some(stripped) = text.strip_prefix("---\n") {
if let Some(end_pos) = stripped.find("\n---\n") {
let fm_end = 4 + end_pos + 5;
let frontmatter = &text[..fm_end];
let content = &text[fm_end..];
return Some((frontmatter.to_string(), content.to_string()));
}
}
None
}
fn standardize_bullets(&self, text: &str) -> String {
let bullets = ['●', '○', '■', '□', '◆', '◇', '▪', '▫', '►', '▻'];
let mut result = text.to_string();
for bullet in bullets {
result = result.replace(bullet, "•");
}
result
}
fn remove_pua_chars(&self, text: &str) -> String {
text.chars()
.filter(|c| {
let code = *c as u32;
!(0xE000..=0xF8FF).contains(&code)
&& !(0xF0000..=0xFFFFD).contains(&code)
&& !(0x100000..=0x10FFFD).contains(&code)
})
.collect()
}
fn fix_hyphenation(&self, text: &str) -> String {
let re = Regex::new(r"([a-zA-Z])-\s*\n?\s*([a-z])").unwrap();
re.replace_all(text, "$1$2").to_string()
}
fn normalize_whitespace(&self, text: &str) -> String {
let re = Regex::new(r"[ ]{3,}").unwrap();
re.replace_all(text, " ").to_string()
}
fn limit_newlines(&self, text: &str) -> String {
let max = self.options.max_consecutive_newlines as usize;
let pattern = format!(r"\n{{{},}}", max + 1);
let re = Regex::new(&pattern).unwrap();
let replacement = "\n".repeat(max);
re.replace_all(text, replacement.as_str()).to_string()
}
fn merge_single_newlines(&self, text: &str) -> String {
const PARA_PLACEHOLDER: &str = "\u{0000}PARA\u{0000}";
const SENT_PLACEHOLDER: &str = "\u{0000}SENT\u{0000}";
let re_heading_line = Regex::new(r"(?m)^(#{1,6}\s)").unwrap();
let protected = re_heading_line.replace_all(text, |caps: ®ex::Captures| {
format!("\u{0000}H{}", &caps[1])
});
let re_list_line = Regex::new(r"(?m)^([-*]\s|[0-9]+\.\s)").unwrap();
let protected = re_list_line.replace_all(&protected, |caps: ®ex::Captures| {
format!("\u{0000}L{}", &caps[1])
});
let re_table_line = Regex::new(r"(?m)^(\|)").unwrap();
let protected = re_table_line.replace_all(&protected, |caps: ®ex::Captures| {
format!("\u{0000}T{}", &caps[1])
});
let re_para = Regex::new(r"\n{2,}").unwrap();
let protected = re_para.replace_all(&protected, PARA_PLACEHOLDER);
let re_sent = Regex::new(r"([.。!?!?])\s*\n").unwrap();
let protected = re_sent.replace_all(&protected, |caps: ®ex::Captures| {
format!("{}{}", &caps[1], SENT_PLACEHOLDER)
});
let merged = protected.replace('\n', " ");
let merged = merged.replace(SENT_PLACEHOLDER, "\n");
let merged = merged.replace(PARA_PLACEHOLDER, "\n\n");
let merged = merged.replace("\u{0000}H", "\n");
let merged = merged.replace("\u{0000}L", "\n");
merged.replace("\u{0000}T", "\n")
}
fn merge_list_markers(&self, text: &str) -> String {
let mut result = text.to_string();
let re_bullet = Regex::new(r"([•\-■□▪▸►◆◇➤✓✗])\s*\n\s*").unwrap();
result = re_bullet.replace_all(&result, "$1 ").to_string();
let re_number = Regex::new(r"(\d{1,3}[.)]\s*)\n\s*").unwrap();
result = re_number.replace_all(&result, "$1").to_string();
let re_paren_number = Regex::new(r"(\(\d{1,3}\)\s*)\n\s*").unwrap();
result = re_paren_number.replace_all(&result, "$1").to_string();
let re_korean = Regex::new(r"([가-힣][.)]\s*)\n\s*").unwrap();
result = re_korean.replace_all(&result, "$1").to_string();
let re_circled = Regex::new(r"([❶-❿])\s*\n\s*").unwrap();
result = re_circled.replace_all(&result, "$1 ").to_string();
result
}
fn remove_toc_dot_leaders(&self, text: &str) -> String {
text.lines()
.map(|line| {
self.toc_dot_leader_regex
.replace(line, |caps: ®ex::Captures| {
if let Some(page) = caps.get(1) {
format!(" (p.{})", page.as_str())
} else {
String::new()
}
})
.into_owned()
})
.collect::<Vec<_>>()
.join("\n")
}
fn merge_cjk_lines(&self, text: &str) -> String {
const PLACEHOLDER: &str = "\u{0000}CJKPARA\u{0000}";
let re_para = Regex::new(r"\n{2,}").unwrap();
let protected = re_para.replace_all(text, PLACEHOLDER);
let re = Regex::new(
r"([\p{Hangul}\p{Han}\p{Hiragana}\p{Katakana}])([^.。!?!?\n]?)\n([\p{Hangul}\p{Han}\p{Hiragana}\p{Katakana}])"
).unwrap();
let merged = re.replace_all(&protected, "$1$2$3").to_string();
merged.replace(PLACEHOLDER, "\n\n")
}
}
impl Default for CleanupPipeline {
fn default() -> Self {
Self::new(CleanupOptions::default())
}
}
#[cfg(test)]
mod tests {
    use super::*;

    // Inputs containing special code points are written with `\u{…}` escapes
    // so the tests keep exercising them even if this file is re-encoded or
    // normalized.

    #[test]
    fn test_unicode_normalization() {
        let pipeline = CleanupPipeline::from_preset(CleanupPreset::Minimal);
        // 'e' followed by U+0301 COMBINING ACUTE ACCENT (decomposed form).
        let text = "cafe\u{0301}";
        let result = pipeline.process(text);
        // NFC composes the pair into the single code point U+00E9.
        assert_eq!(result, "caf\u{00E9}");
    }

    #[test]
    fn test_ligature_fix() {
        let pipeline = CleanupPipeline::from_preset(CleanupPreset::Standard);
        // U+FB01 is the "fi" ligature, U+FB02 the "fl" ligature.
        let text = "\u{FB01}nding \u{FB02}owers";
        let result = pipeline.process(text);
        assert_eq!(result, "finding flowers");
    }

    #[test]
    fn test_bullet_standardization() {
        let pipeline = CleanupPipeline::from_preset(CleanupPreset::Standard);
        let text = "● Item 1\n○ Item 2\n■ Item 3";
        let result = pipeline.process(text);
        assert!(result.contains("• Item 1"));
        assert!(result.contains("• Item 2"));
        assert!(result.contains("• Item 3"));
    }

    #[test]
    fn test_hyphenation_fix() {
        let pipeline = CleanupPipeline::from_preset(CleanupPreset::Standard);

        let text = "This is infor-\nmation about something.";
        let result = pipeline.process(text);
        assert!(result.contains("information"));

        let text2 = "This is adip- iscing elit.";
        let result2 = pipeline.process(text2);
        assert!(
            result2.contains("adipiscing"),
            "Expected 'adipiscing' but got: {}",
            result2
        );

        let text3 = "con-\n sectetuer";
        let result3 = pipeline.process(text3);
        assert!(
            result3.contains("consectetuer"),
            "Expected 'consectetuer' but got: {}",
            result3
        );
    }

    #[test]
    fn test_frontmatter_preservation() {
        let pipeline = CleanupPipeline::from_preset(CleanupPreset::Aggressive);
        let text = "---\ntitle: Test\n---\n\nContent with    extra spaces.";
        let result = pipeline.process(text);
        assert!(result.starts_with("---\n"));
        assert!(result.contains("title: Test"));
        // The body is cleaned (space run collapsed) while the fence is kept.
        assert!(
            result.ends_with("Content with extra spaces."),
            "Expected cleaned body, got: {}",
            result
        );
    }

    #[test]
    fn test_merge_single_newlines() {
        let pipeline = CleanupPipeline::from_preset(CleanupPreset::Standard);
        let text = "This\nis\na\ntest.\n\nNew paragraph.";
        let result = pipeline.process(text);
        assert!(result.contains("This is a test."));
        assert!(result.contains("New paragraph."));
        // The paragraph boundary must survive as a line break of some length.
        assert!(
            result.contains("test.\n"),
            "Expected a line break after the first paragraph, got: {}",
            result
        );
    }

    #[test]
    fn test_remove_replacement_char() {
        let pipeline = CleanupPipeline::from_preset(CleanupPreset::Standard);
        let text = "Hello\u{FFFD}World";
        let result = pipeline.process(text);
        assert_eq!(result, "HelloWorld");
    }

    #[test]
    fn test_remove_pua() {
        let pipeline = CleanupPipeline::from_preset(CleanupPreset::Aggressive);
        let text = "A\u{E000}B";
        let result = pipeline.process(text);
        assert_eq!(result, "AB");
    }

    #[test]
    fn test_toc_dot_leaders() {
        let pipeline = CleanupPipeline::from_preset(CleanupPreset::Standard);
        let text = "Chapter 1 ........ 12";
        let result = pipeline.process(text);
        assert_eq!(result, "Chapter 1 (p.12)");
    }

    #[test]
    fn test_merge_list_markers_bullet() {
        let pipeline = CleanupPipeline::from_preset(CleanupPreset::Standard);
        let text = "• \n'안전을 위한 주의사항'은 제품을 올바르게 사용하기 위한 것입니다.";
        let result = pipeline.process(text);
        assert!(
            result.starts_with("• '안전을"),
            "Expected bullet merged, got: {}",
            result
        );
    }

    #[test]
    fn test_merge_list_markers_number() {
        let pipeline = CleanupPipeline::from_preset(CleanupPreset::Standard);
        let text = "01. \n인명이나 재산상에 영향이 큰 기기에 사용하지 마십시오.";
        let result = pipeline.process(text);
        assert!(
            result.starts_with("01. 인명이나"),
            "Expected number merged, got: {}",
            result
        );
    }

    #[test]
    fn test_merge_cjk_lines() {
        let pipeline = CleanupPipeline::from_preset(CleanupPreset::Standard);
        let text = "반드시 지키\n십시오.";
        let result = pipeline.process(text);
        assert!(
            result.contains("반드시 지키십시오"),
            "Expected CJK merged, got: {}",
            result
        );
    }

    #[test]
    fn test_merge_cjk_with_space() {
        let pipeline = CleanupPipeline::from_preset(CleanupPreset::Standard);
        let text = "특정조건 하에서\n 위험이 발생할 우려가 있습니다.";
        let result = pipeline.process(text);
        assert!(
            result.contains("하에서") && result.contains("위험이"),
            "Expected proper merge, got: {}",
            result
        );
    }
}