use regex::Regex;
use std::collections::HashMap;
use std::sync::LazyLock;
use unicode_normalization::UnicodeNormalization;
/// Configuration for the four-stage text cleanup pipeline (see [`cleanup`]).
#[derive(Debug, Clone)]
pub struct CleanupOptions {
    /// Stage 1: NFC normalization, control/PUA removal, bullet and
    /// fullwidth-character mapping.
    pub normalize_strings: bool,
    /// Stage 2: drop page numbers, TOC dot-leader lines, placeholders and
    /// frequently repeated header/footer lines.
    pub clean_lines: bool,
    /// Stage 3: markdown round-trip that strips empty emphasis spans.
    pub filter_structure: bool,
    /// Stage 4: whitespace collapsing, orphan-line removal, list/heading fixes.
    pub final_normalize: bool,
    /// Remove Unicode Private Use Area characters during stage 1.
    pub remove_pua: bool,
    /// Remove HWP placeholder lines (e.g. "[EQ]", "[TABLE]") during stage 2.
    pub remove_hwp_placeholders: bool,
    /// Fraction of the estimated page count a line must repeat across to be
    /// treated as a header/footer (see `analyze_line_frequencies`).
    pub header_footer_threshold: f64,
    /// Maximum length in bytes for a line to qualify as a header/footer
    /// candidate (compared against `str::len`).
    pub max_header_footer_length: usize,
    /// Enable mojibake (garbled CJK) detection and trimming in stage 2.
    pub detect_mojibake: bool,
    /// Carve YAML frontmatter out before stage 3 so the markdown parser
    /// cannot mangle it.
    pub preserve_frontmatter: bool,
}
impl Default for CleanupOptions {
fn default() -> Self {
Self {
normalize_strings: true,
clean_lines: true,
filter_structure: true,
final_normalize: true,
remove_pua: true,
remove_hwp_placeholders: true,
header_footer_threshold: 0.8,
max_header_footer_length: 100,
detect_mojibake: true,
preserve_frontmatter: true,
}
}
}
impl CleanupOptions {
    /// A conservative preset: only character-level normalization (stage 1)
    /// and final whitespace cleanup (stage 4); line filtering and structural
    /// filtering are disabled. All other knobs keep their defaults.
    pub fn minimal() -> Self {
        Self {
            clean_lines: false,
            filter_structure: false,
            // Struct-update syntax keeps the remaining fields in sync with
            // `Default` instead of duplicating every value here.
            ..Self::default()
        }
    }

    /// An aggressive preset: every stage enabled, with a lower header/footer
    /// repetition threshold (0.7) and a longer candidate length (150 bytes)
    /// so more repeated lines are classified as headers/footers and removed.
    pub fn aggressive() -> Self {
        Self {
            header_footer_threshold: 0.7,
            max_header_footer_length: 150,
            ..Self::default()
        }
    }
}
/// Replacement table mapping decorative bullet characters to markdown list
/// prefixes. Checkmark glyphs become task-list items; `※` becomes a quoted
/// note; the `\u{F0..}` entries are Wingdings-style Private Use Area bullets
/// commonly left behind by office-document extraction.
const BULLET_MAPPINGS: &[(char, &str)] = &[
    // Filled geometric bullets.
    ('●', "- "),
    ('■', "- "),
    ('◆', "- "),
    ('▶', "- "),
    ('►', "- "),
    ('➢', "- "),
    ('➤', "- "),
    ('•', "- "),
    ('·', "- "),
    // Hollow geometric bullets.
    ('○', "- "),
    ('□', "- "),
    ('◇', "- "),
    ('▷', "- "),
    // Arrows.
    ('→', "- "),
    ('⇒', "- "),
    ('➔', "- "),
    // Reference mark becomes a quoted note; stars become plain items.
    ('※', "> ※ "),
    ('★', "- "),
    ('☆', "- "),
    // Checks and crosses become task-list items.
    ('✓', "- [x] "),
    ('✔', "- [x] "),
    ('✗', "- [ ] "),
    ('✘', "- [ ] "),
    // Sub-bullets and square variants.
    ('◦', " - "),
    ('◼', "- "),
    ('◾', "- "),
    // Private Use Area glyphs (Wingdings/Webdings bullets).
    ('\u{F0A3}', "- "),
    ('\u{F09F}', " - "),
    ('\u{F09E}', "- "),
    ('\u{F020}', "- "),
    ('\u{F076}', "- "),
    ('\u{F0FC}', "- [x] "),
    ('\u{F0A8}', "- "),
];
/// Stage 1: character-level normalization.
///
/// NFC-normalizes the input, drops control/format characters (and,
/// optionally, Private Use Area characters), rewrites decorative bullets to
/// markdown prefixes, folds fullwidth ASCII forms to their halfwidth
/// equivalents, and replaces non-breaking spaces with plain spaces.
pub fn stage1_normalize_string(input: &str, options: &CleanupOptions) -> String {
    let drop_pua = options.remove_pua;
    let mut out = String::with_capacity(input.len());
    for ch in input.nfc() {
        // Characters removed outright.
        if is_control_char(ch) || (drop_pua && is_pua_char(ch)) {
            continue;
        }
        // Characters rewritten to something else, in priority order.
        if let Some(marker) = get_bullet_replacement(ch) {
            out.push_str(marker);
        } else if let Some(ascii) = normalize_fullwidth(ch) {
            out.push(ascii);
        } else if ch == '\u{00A0}' {
            out.push(' '); // non-breaking space -> plain space
        } else {
            out.push(ch);
        }
    }
    out
}
/// True for the control/format characters this pipeline strips outright:
/// NUL, vertical tab, form feed, BOM/zero-width no-break space, the Unicode
/// replacement character, and the soft hyphen.
fn is_control_char(c: char) -> bool {
    match c {
        '\0' | '\x0B' | '\x0C' => true,
        '\u{FEFF}' | '\u{FFFD}' | '\u{00AD}' => true,
        _ => false,
    }
}
/// True when `c` lies in one of the three Unicode Private Use Areas
/// (BMP PUA plus supplementary planes 15 and 16).
fn is_pua_char(c: char) -> bool {
    matches!(
        c,
        '\u{E000}'..='\u{F8FF}'
            | '\u{F0000}'..='\u{FFFFD}'
            | '\u{100000}'..='\u{10FFFD}'
    )
}
/// Looks `c` up in `BULLET_MAPPINGS`, returning its markdown replacement
/// prefix, or `None` when `c` is not a known bullet character.
fn get_bullet_replacement(c: char) -> Option<&'static str> {
    for &(bullet, replacement) in BULLET_MAPPINGS {
        if bullet == c {
            return Some(replacement);
        }
    }
    None
}
/// Maps a fullwidth character to its halfwidth ASCII equivalent:
/// the ideographic space (U+3000) becomes a plain space, and the fullwidth
/// forms U+FF01..=U+FF5E map onto ASCII U+0021..=U+007E. Returns `None`
/// for every other character.
fn normalize_fullwidth(c: char) -> Option<char> {
    if c == '\u{3000}' {
        return Some(' ');
    }
    let code = c as u32;
    if (0xFF01..=0xFF5E).contains(&code) {
        // The fullwidth block is a contiguous copy of printable ASCII.
        char::from_u32(code - 0xFF01 + 0x21)
    } else {
        None
    }
}
/// Strips suspected mojibake (garbled CJK) from the end of a single line.
///
/// Two passes: first `detect_trailing_garbage` looks for a short CJK run
/// tacked onto otherwise Korean/ASCII/URL content and, if found, truncates
/// the line there. Otherwise the line is scanned from the right, dropping
/// characters flagged by `is_suspicious_trailing_char` until the first
/// normal content character is reached.
pub fn clean_line_trailing_mojibake(line: &str) -> String {
    if line.is_empty() {
        return line.to_string();
    }
    let chars: Vec<char> = line.chars().collect();
    let len = chars.len();
    // Pass 1: whole-run garbage detection.
    if let Some(end_pos) = detect_trailing_garbage(&chars) {
        return chars[..end_pos].iter().collect();
    }
    // Pass 2: peel suspicious characters off the tail one at a time;
    // stop at the first normal or non-suspicious character.
    let mut end_pos = len;
    for i in (0..len).rev() {
        let c = chars[i];
        if is_normal_content_char(c) {
            break;
        }
        if is_suspicious_trailing_char(c, &chars, i) {
            end_pos = i;
        } else {
            break;
        }
    }
    if end_pos < len {
        chars[..end_pos].iter().collect()
    } else {
        line.to_string()
    }
}
/// Looks for a short run of CJK ideographs at the end of a line that is
/// otherwise Korean/ASCII/URL content — a typical extraction artifact.
/// Returns the index where the garbage run starts, or `None`.
fn detect_trailing_garbage(chars: &[char]) -> Option<usize> {
    let len = chars.len();
    if len < 3 {
        return None;
    }
    // Only lines that clearly carry real content (Hangul, a URL, or ASCII
    // alphanumerics) are candidates; an all-CJK line is handled separately
    // by `is_entirely_mojibake`.
    let has_korean = chars.iter().any(|&c| is_hangul(c));
    let text: String = chars.iter().collect();
    let has_url = text.contains("://") || text.contains("http");
    let has_content = has_korean || has_url || chars.iter().any(|c| c.is_ascii_alphanumeric());
    if !has_content {
        return None;
    }
    // Scan from the right: accumulate a trailing CJK run, then inspect the
    // character immediately before it.
    let mut garbage_start = None;
    let mut cjk_count = 0;
    let mut trailing_ascii_count = 0;
    for i in (0..len).rev() {
        let c = chars[i];
        if is_cjk_ideograph(c) {
            cjk_count += 1;
            garbage_start = Some(i);
        } else if cjk_count > 0 {
            // Reached the character preceding the CJK run. A short run
            // (<= 3 ideographs) directly after normal content or
            // punctuation is treated as garbage; a longer run looks like
            // legitimate Chinese text and is kept.
            if is_normal_content_char(c) || c.is_ascii_punctuation() {
                if cjk_count <= 3 {
                    return garbage_start;
                }
            }
            break;
        } else if c.is_ascii_alphabetic() && trailing_ascii_count < 3 {
            // Tolerate up to three trailing ASCII letters after the CJK run
            // before giving up the scan.
            trailing_ascii_count += 1;
        } else {
            break;
        }
    }
    None
}
/// True for Korean Hangul: precomposed syllables (U+AC00..), Hangul Jamo
/// (U+1100..), and Hangul Compatibility Jamo (U+3130..).
fn is_hangul(c: char) -> bool {
    matches!(
        c,
        '\u{AC00}'..='\u{D7AF}' | '\u{1100}'..='\u{11FF}' | '\u{3130}'..='\u{318F}'
    )
}
/// True for characters considered "normal" document content: any ASCII,
/// Hangul (syllables and jamo), General Punctuation (U+2000..U+206F), and
/// CJK Symbols and Punctuation (U+3000..U+303F). CJK ideographs are
/// deliberately NOT included — they are what the mojibake heuristics target.
fn is_normal_content_char(c: char) -> bool {
    c.is_ascii()
        || matches!(
            c,
            '\u{AC00}'..='\u{D7AF}'
                | '\u{1100}'..='\u{11FF}'
                | '\u{3130}'..='\u{318F}'
                | '\u{2000}'..='\u{206F}'
                | '\u{3000}'..='\u{303F}'
        )
}
/// Heuristic: is the character `c` (at index `pos` of `chars`) likely a
/// mojibake artifact when it appears at the tail of a line?
fn is_suspicious_trailing_char(c: char, chars: &[char], pos: usize) -> bool {
    let code = c as u32;
    // A common CJK Unified Ideograph is only suspicious when the preceding
    // text looks like ASCII/URL content AND the ideograph is isolated
    // (fewer than 2 CJK neighbors within 3 positions on either side).
    if (0x4E00..=0x9FFF).contains(&code) {
        let has_ascii_before = chars[..pos].iter().any(|&ch| ch.is_ascii_alphanumeric());
        let has_url_pattern = chars[..pos].iter().collect::<String>().contains("://");
        if has_ascii_before || has_url_pattern {
            let cjk_neighbors = count_cjk_neighbors(chars, pos);
            if cjk_neighbors < 2 {
                return true;
            }
        }
    }
    // Rare CJK Extension blocks (A, B, C) are treated as unconditional
    // garbage at a line tail.
    if (0x3400..=0x4DBF).contains(&code)
        || (0x20000..=0x2A6DF).contains(&code)
        || (0x2A700..=0x2B73F).contains(&code)
    {
        return true;
    }
    // The Specials block (U+FFF0..U+FFFF) likewise.
    if (0xFFF0..=0xFFFF).contains(&code) {
        return true;
    }
    false
}
fn count_cjk_neighbors(chars: &[char], pos: usize) -> usize {
let mut count = 0;
for c in chars.iter().take(pos).skip(pos.saturating_sub(3)) {
if is_cjk_ideograph(*c) {
count += 1;
}
}
for c in chars.iter().skip(pos + 1).take(3) {
if is_cjk_ideograph(*c) {
count += 1;
}
}
count
}
/// True for CJK Unified Ideographs: the main block (U+4E00..), Extension A
/// (U+3400..), and Extension B (U+20000..).
fn is_cjk_ideograph(c: char) -> bool {
    matches!(
        c,
        '\u{4E00}'..='\u{9FFF}' | '\u{3400}'..='\u{4DBF}' | '\u{20000}'..='\u{2A6DF}'
    )
}
/// True when a (trimmed) line consists of 1..=5 CJK ideographs and nothing
/// else — short all-ideograph lines in a Korean document are assumed to be
/// extraction garbage rather than real content.
fn is_entirely_mojibake(line: &str) -> bool {
    let trimmed = line.trim();
    let char_count = trimmed.chars().count();
    (1..=5).contains(&char_count) && trimmed.chars().all(is_cjk_ideograph)
}
// Page-number line framed by -, [ or ( — e.g. "- 15 -", "[15]", "(15)".
// Note the opening and closing marks are not required to match each other.
static RE_PAGE_HYPHEN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"^\s*[-\[\(]\s*\d+\s*[-\]\)]\s*$").expect("RE_PAGE_HYPHEN: invalid regex pattern")
});
// Page ratio like "3 / 10", "3 of 10", or "Page 3/10" (case-insensitive).
static RE_PAGE_RATIO: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?i)^\s*(?:Page\s*)?\d+\s*(?:/|of)\s*\d+\s*$")
        .expect("RE_PAGE_RATIO: invalid regex pattern")
});
// Korean page markers such as "12 쪽", "- 3 페이지 -", or "3 페이지 / 10쪽".
static RE_PAGE_KOREAN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r"^\s*(?:-\s*)?\d+\s*(?:쪽|페이지|Page)(?:\s*-)?(?:\s*/\s*\d+(?:쪽|페이지|Page)?)?\s*$",
    )
    .expect("RE_PAGE_KOREAN: invalid regex pattern")
});
// Table-of-contents dot leader: any text, 3+ dots, then a page number.
static RE_TOC_DOTS: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"^.*\.{3,}[\.\s]*\d+\s*$").expect("RE_TOC_DOTS: invalid regex pattern")
});
// A whole line of 3+ separator characters (-, =, *, _).
static RE_SEPARATOR: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^[-=*_]{3,}$").expect("RE_SEPARATOR: invalid regex pattern"));
// HWP-converter placeholder alone on a line: [EQ], [수식], [표], [TABLE],
// [그림], [IMAGE].
static RE_HWP_PLACEHOLDER: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"^\s*\[(?:EQ|수식|표|TABLE|그림|IMAGE)\]\s*$")
        .expect("RE_HWP_PLACEHOLDER: invalid regex pattern")
});
// An empty bracket pair — (), [], {}, <> — possibly with inner whitespace.
static RE_EMPTY_BRACKETS: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"^[\(\[\{<]\s*[\)\]\}>]$").expect("RE_EMPTY_BRACKETS: invalid regex pattern")
});
// Figure caption like "[그림 1 ...]" / "Figure 2." — currently unused.
#[allow(dead_code)]
static RE_FIGURE_CAPTION: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"^\s*\[?(?:그림|Figure|Fig)\.?\s*\d+[^\]]*\]?\s*$")
        .expect("RE_FIGURE_CAPTION: invalid regex pattern")
});
// Table caption like "[표 1 ...]" / "Table 2." — currently unused.
#[allow(dead_code)]
static RE_TABLE_CAPTION: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"^\s*\[?(?:표|Table)\.?\s*\d+[^\]]*\]?\s*$")
        .expect("RE_TABLE_CAPTION: invalid regex pattern")
});
/// Stage 2: line-level cleanup. Removes page numbers, TOC dot-leader lines,
/// HWP placeholders, empty bracket pairs, lines repeated often enough to
/// look like headers/footers, and short all-CJK mojibake lines. Removed
/// lines are replaced by empty lines (so stage 4 can collapse them);
/// separator runs are normalized to a markdown thematic break.
pub fn stage2_clean_lines(input: &str, options: &CleanupOptions) -> String {
    let lines: Vec<&str> = input.lines().collect();
    // Header/footer candidates: lines that repeat across estimated pages.
    let frequent_lines = if options.clean_lines {
        analyze_line_frequencies(&lines, options)
    } else {
        HashMap::new()
    };
    let mut result = Vec::with_capacity(lines.len());
    for line in lines {
        let trimmed = line.trim();
        if trimmed.is_empty() {
            result.push(line.to_string());
            continue;
        }
        // The checks below are ordered: structural removals first, then the
        // frequency-based header/footer filter, then mojibake handling.
        if is_page_number(trimmed) {
            result.push(String::new());
            continue;
        }
        if RE_TOC_DOTS.is_match(trimmed) {
            result.push(String::new());
            continue;
        }
        if options.remove_hwp_placeholders && RE_HWP_PLACEHOLDER.is_match(trimmed) {
            result.push(String::new());
            continue;
        }
        if RE_EMPTY_BRACKETS.is_match(trimmed) {
            result.push(String::new());
            continue;
        }
        // Separator runs become a canonical markdown thematic break.
        if RE_SEPARATOR.is_match(trimmed) {
            result.push("---".to_string());
            continue;
        }
        if frequent_lines.contains_key(trimmed) {
            result.push(String::new());
            continue;
        }
        if options.detect_mojibake && is_entirely_mojibake(trimmed) {
            result.push(String::new());
            continue;
        }
        // Surviving lines still get their trailing mojibake trimmed.
        let cleaned_line = if options.detect_mojibake {
            clean_line_trailing_mojibake(line)
        } else {
            line.to_string()
        };
        result.push(cleaned_line);
    }
    result.join("\n")
}
/// True when the (trimmed) line matches any of the page-number patterns
/// (hyphen/bracket framed, ratio, or Korean page markers).
fn is_page_number(line: &str) -> bool {
    [&RE_PAGE_HYPHEN, &RE_PAGE_RATIO, &RE_PAGE_KOREAN]
        .iter()
        .any(|re| re.is_match(line))
}
/// Counts how often each trimmed, non-empty line (of at most
/// `max_header_footer_length` bytes) occurs, then keeps only lines whose
/// count reaches `header_footer_threshold` times the estimated page count
/// (pages are estimated at 40 lines each), with a floor of 3 occurrences.
/// The surviving lines are treated as repeated headers/footers.
fn analyze_line_frequencies<'a>(
    lines: &[&'a str],
    options: &CleanupOptions,
) -> HashMap<&'a str, usize> {
    let mut counts: HashMap<&str, usize> = HashMap::new();
    for trimmed in lines.iter().map(|l| l.trim()) {
        if trimmed.is_empty() || trimmed.len() > options.max_header_footer_length {
            continue;
        }
        *counts.entry(trimmed).or_default() += 1;
    }
    let estimated_pages = (lines.len() as f64 / 40.0).ceil() as usize;
    let threshold = (estimated_pages as f64 * options.header_footer_threshold) as usize;
    let min_count = threshold.max(3);
    counts.retain(|_, count| *count >= min_count);
    counts
}
/// Stage 3: structural filtering via a markdown round-trip. Parses with
/// pulldown-cmark (tables and strikethrough enabled), removes empty
/// emphasis/strong/strikethrough spans, and re-renders with
/// pulldown-cmark-to-cmark. YAML frontmatter is optionally carved out first
/// so the markdown parser cannot mangle it, then re-attached. If re-rendering
/// fails, the input is returned unchanged.
pub fn stage3_filter_structure(input: &str, options: &CleanupOptions) -> String {
    use pulldown_cmark::{Event, Options, Parser, Tag};
    let (frontmatter, content) = if options.preserve_frontmatter {
        extract_yaml_frontmatter(input)
    } else {
        (None, input)
    };
    let parser_options = Options::ENABLE_TABLES | Options::ENABLE_STRIKETHROUGH;
    let parser = Parser::new_ext(content, parser_options);
    let mut events: Vec<Event> = Vec::new();
    // Stack of currently-open tags paired with the event index where each
    // span began, so an empty span can be dropped wholesale on End.
    let mut tag_stack: Vec<(Tag, usize)> = Vec::new();
    for event in parser {
        match &event {
            Event::Start(tag) => {
                tag_stack.push((tag.clone(), events.len()));
                events.push(event);
            }
            Event::End(tag_end) => {
                if let Some((start_tag, start_idx)) = tag_stack.pop() {
                    if is_empty_emphasis(&start_tag, tag_end, &events, start_idx) {
                        // Discard the Start event and everything inside it.
                        events.truncate(start_idx);
                    } else {
                        events.push(event);
                    }
                } else {
                    // Unbalanced End with no matching Start: keep it as-is.
                    events.push(event);
                }
            }
            _ => {
                events.push(event);
            }
        }
    }
    let mut output = String::new();
    if pulldown_cmark_to_cmark::cmark(events.into_iter(), &mut output).is_err() {
        return input.to_string();
    }
    if let Some(fm) = frontmatter {
        format!("{}\n{}", fm, output.trim_start())
    } else {
        output
    }
}
/// Splits YAML frontmatter ("---" ... "---"/"...") off the start of `input`.
///
/// Returns `(Some(frontmatter_block), remaining_content)` when a complete,
/// properly delimited frontmatter block is found, and `(None, input)`
/// otherwise. The returned frontmatter block is re-wrapped as
/// `"---\n<body>---"`.
///
/// Fix over the previous version: offsets are now tracked with
/// `split_inclusive('\n')`, which keeps the real line terminators, so CRLF
/// ("\r\n") endings are counted exactly. The old `line.len() + 1`
/// accounting (over `str::lines`, which strips "\r") undercounted each CRLF
/// line by one byte, corrupting both the frontmatter slice and the
/// remaining-content offset. The raw-pointer fallback arithmetic is gone.
fn extract_yaml_frontmatter(input: &str) -> (Option<String>, &str) {
    // Tolerate leading whitespace before the opening delimiter.
    let trimmed = input.trim_start();
    let Some(after_opening) = trimmed.strip_prefix("---") else {
        return (None, input);
    };
    // The opening line may carry trailing spaces/tabs, but must end there.
    let after_opening = after_opening.trim_start_matches([' ', '\t']);
    let content_start = if let Some(rest) = after_opening.strip_prefix("\r\n") {
        rest
    } else if let Some(rest) = after_opening.strip_prefix('\n') {
        rest
    } else {
        return (None, input);
    };
    // Walk the body line by line, tracking exact byte offsets.
    let mut body_len = None; // byte length of the frontmatter body
    let mut resume_at = 0; // byte offset just past the closing delimiter line
    let mut offset = 0;
    for line in content_start.split_inclusive('\n') {
        let bare = line.trim_end_matches('\n').trim_end_matches('\r');
        // YAML allows either "---" or "..." as the closing delimiter.
        if matches!(bare.trim(), "---" | "...") {
            body_len = Some(offset);
            resume_at = offset + line.len();
            break;
        }
        offset += line.len();
    }
    let Some(body_len) = body_len else {
        // No closing delimiter: treat everything as plain content.
        return (None, input);
    };
    let frontmatter = format!("---\n{}---", &content_start[..body_len]);
    let remaining = content_start[resume_at..].trim_start_matches(['\n', '\r']);
    (Some(frontmatter), remaining)
}
/// True when a matched Start/End pair is an emphasis-like span
/// (emphasis / strong / strikethrough) containing no visible text.
/// `start_idx` is the index of the Start event within `events`; everything
/// after it belongs to the span's content.
fn is_empty_emphasis(
    start_tag: &pulldown_cmark::Tag,
    end_tag: &pulldown_cmark::TagEnd,
    events: &[pulldown_cmark::Event],
    start_idx: usize,
) -> bool {
    use pulldown_cmark::{Tag, TagEnd};
    let is_emphasis = matches!(
        (start_tag, end_tag),
        (Tag::Emphasis, TagEnd::Emphasis)
            | (Tag::Strong, TagEnd::Strong)
            | (Tag::Strikethrough, TagEnd::Strikethrough)
    );
    if !is_emphasis {
        return false;
    }
    // Only Text events are inspected; any non-whitespace text keeps the
    // span. (Other event kinds inside the span do not count as content.)
    let content_events = &events[start_idx + 1..];
    for event in content_events {
        if let pulldown_cmark::Event::Text(text) = event {
            if !text.trim().is_empty() {
                return false;
            }
        }
    }
    true
}
// Two or more consecutive newlines — used to collapse blank-line runs.
static RE_MULTIPLE_NEWLINES: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\n{2,}").expect("RE_MULTIPLE_NEWLINES: invalid regex pattern"));
// Runs of spaces/tabs — collapsed to a single space within a line.
static RE_MULTIPLE_SPACES: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[ \t]+").expect("RE_MULTIPLE_SPACES: invalid regex pattern"));
/// Stage 4: final whitespace and structure normalization. Collapses newline
/// runs (note: ALL blank lines are removed, so paragraphs end up
/// single-spaced), trims each line, collapses internal space/tab runs,
/// drops orphan punctuation lines, merges list items split by blank lines,
/// and demotes runs of consecutive same-level headings.
pub fn stage4_final_normalize(input: &str, _options: &CleanupOptions) -> String {
    let normalized = RE_MULTIPLE_NEWLINES.replace_all(input, "\n");
    let lines: Vec<&str> = normalized.lines().collect();
    let mut cleaned_lines: Vec<String> = Vec::with_capacity(lines.len());
    for line in lines {
        let trimmed = line.trim();
        if is_orphan_line(trimmed) {
            continue;
        }
        let cleaned = RE_MULTIPLE_SPACES.replace_all(trimmed, " ");
        cleaned_lines.push(cleaned.into_owned());
    }
    let mut result = cleaned_lines.join("\n");
    // Whitespace-only lines trimmed to "" above can create new blank runs,
    // so collapse newlines a second time.
    result = RE_MULTIPLE_NEWLINES.replace_all(&result, "\n").into_owned();
    result = merge_consecutive_list_items(&result);
    result = fix_consecutive_headings(&result);
    result
}
/// Removes blank lines sandwiched between two list items so the list
/// renders as one tight block (e.g. "- a\n\n- b" -> "- a\n- b").
fn merge_consecutive_list_items(input: &str) -> String {
    let lines: Vec<&str> = input.lines().collect();
    let mut result: Vec<String> = Vec::with_capacity(lines.len());
    for (i, line) in lines.iter().enumerate() {
        // Was the nearest preceding non-blank line a list item?
        let prev_was_list = if i > 0 {
            let mut prev_idx = i - 1;
            loop {
                let prev_line = lines[prev_idx].trim();
                if !prev_line.is_empty() {
                    break is_list_line(prev_line);
                }
                if prev_idx == 0 {
                    break false;
                }
                prev_idx -= 1;
            }
        } else {
            false
        };
        if line.trim().is_empty() && prev_was_list {
            // Drop this blank line only when the next non-blank line is
            // also a list item.
            let next_is_list = lines
                .iter()
                .skip(i + 1)
                .find(|l| !l.trim().is_empty())
                .map(|l| is_list_line(l))
                .unwrap_or(false);
            if next_is_list {
                continue;
            }
        }
        result.push(line.to_string());
    }
    result.join("\n")
}
/// Fixes runs of two or more consecutive headings of the same level (a
/// common converter artifact where every line of a block is marked as a
/// heading): the first line keeps its heading marker, the rest of the run
/// is demoted to plain text with the '#' prefix stripped.
fn fix_consecutive_headings(input: &str) -> String {
    let lines: Vec<&str> = input.lines().collect();
    let mut result: Vec<String> = Vec::with_capacity(lines.len());
    let mut i = 0;
    while i < lines.len() {
        let line = lines[i];
        let trimmed = line.trim();
        if let Some(level) = get_heading_level(trimmed) {
            // Count same-level headings that follow immediately. A blank
            // line ends the run (and is consumed by `j`); a different
            // level or plain text ends it without being consumed.
            let mut consecutive_count = 1;
            let mut j = i + 1;
            while j < lines.len() {
                let next_trimmed = lines[j].trim();
                if next_trimmed.is_empty() {
                    j += 1;
                    break;
                }
                if let Some(next_level) = get_heading_level(next_trimmed) {
                    if next_level == level {
                        consecutive_count += 1;
                        j += 1;
                    } else {
                        break;
                    }
                } else {
                    break;
                }
            }
            if consecutive_count >= 2 {
                // Keep the first heading, insert a blank separator, then
                // emit the rest of the run demoted to plain text.
                result.push(line.to_string());
                result.push(String::new());
                for demote_line in lines.iter().take(j).skip(i + 1) {
                    let demote_line = demote_line.trim();
                    if !demote_line.is_empty() {
                        if let Some(content) = strip_heading_prefix(demote_line) {
                            result.push(content.to_string());
                        } else {
                            result.push(demote_line.to_string());
                        }
                    }
                }
                i = j;
                continue;
            }
        }
        result.push(line.to_string());
        i += 1;
    }
    result.join("\n")
}
/// Returns the ATX heading level (1..=6) of a markdown line, or `None` when
/// the line is not a heading. The '#' run must be followed by a space or
/// the end of the line ("#Title" and 7+ hashes are rejected).
fn get_heading_level(line: &str) -> Option<u8> {
    let trimmed = line.trim();
    let hashes = trimmed.chars().take_while(|&c| c == '#').count();
    if hashes == 0 || hashes > 6 {
        return None;
    }
    // '#' is a single byte, so `hashes` is also a valid byte index here.
    match trimmed.as_bytes().get(hashes) {
        None | Some(b' ') => Some(hashes as u8),
        Some(_) => None,
    }
}
/// Removes the leading '#' run (and any following whitespace) from a
/// heading line, returning the bare text. Returns `None` when the trimmed
/// line does not start with '#'. Unlike `get_heading_level`, this accepts
/// any number of hashes.
fn strip_heading_prefix(line: &str) -> Option<&str> {
    let trimmed = line.trim();
    if trimmed.starts_with('#') {
        Some(trimmed.trim_start_matches('#').trim_start())
    } else {
        None
    }
}
/// True when the (trimmed) line looks like a markdown list item: an
/// unordered marker ("- ", "* ", "+ ") or an ordered "N. " prefix where N
/// is one or more ASCII digits.
fn is_list_line(line: &str) -> bool {
    let trimmed = line.trim();
    for marker in ["- ", "* ", "+ "] {
        if trimmed.starts_with(marker) {
            return true;
        }
    }
    match trimmed.find(". ") {
        // Non-empty, all-digit prefix before the first ". ".
        Some(dot_pos) if dot_pos > 0 => trimmed[..dot_pos].bytes().all(|b| b.is_ascii_digit()),
        _ => false,
    }
}
/// True for "orphan" lines worth deleting: stray punctuation left behind by
/// extraction. Thematic breaks ("---", "***", ...), list markers, lines
/// starting with a digit or '#', and anything containing a letter are all
/// preserved.
fn is_orphan_line(line: &str) -> bool {
    let len = line.chars().count();
    if len == 0 {
        return false;
    }
    // Markdown thematic breaks / YAML document markers are meaningful.
    if matches!(line, "---" | "..." | "***" | "___") {
        return false;
    }
    let all_punct_or_ws = line
        .chars()
        .all(|c| c.is_ascii_punctuation() || c.is_whitespace());
    // Multi-char lines with any letter or non-ASCII content are kept.
    if len >= 2 && !all_punct_or_ws {
        return false;
    }
    let first = line.chars().next().unwrap();
    // List markers and numbered lines are structural, not orphans.
    if line.starts_with("- ")
        || line.starts_with("* ")
        || line.starts_with("+ ")
        || first.is_ascii_digit()
    {
        return false;
    }
    if first == '#' {
        return false;
    }
    if len == 1 {
        // A lone ASCII punctuation mark, or a CJK full stop / comma.
        return first.is_ascii_punctuation() || matches!(first, '。' | '、');
    }
    // Here len >= 2 and the line is entirely punctuation/whitespace.
    all_punct_or_ws
}
pub fn cleanup(input: &str, options: &CleanupOptions) -> String {
let mut result = input.to_string();
if options.normalize_strings {
result = stage1_normalize_string(&result, options);
}
if options.clean_lines {
result = stage2_clean_lines(&result, options);
}
if options.filter_structure {
result = stage3_filter_structure(&result, options);
}
if options.final_normalize {
result = stage4_final_normalize(&result, options);
}
result
}
/// Convenience wrapper: runs the full pipeline with `CleanupOptions::default()`.
pub fn cleanup_default(input: &str) -> String {
    let options = CleanupOptions::default();
    cleanup(input, &options)
}
// Unit tests covering each stage in isolation plus the full pipeline.
#[cfg(test)]
mod tests {
    use super::*;

    // --- Stage 1: character-level normalization ---------------------------

    #[test]
    fn test_bullet_mapping() {
        let input = "●첫번째 항목\n■두번째 항목\n▶세번째 항목";
        let result = stage1_normalize_string(input, &CleanupOptions::default());
        assert!(result.contains("- 첫번째"));
        assert!(result.contains("- 두번째"));
        assert!(result.contains("- 세번째"));
    }

    #[test]
    fn test_pua_removal() {
        let input = "정상텍스트\u{E000}PUA문자\u{F000}끝";
        let options = CleanupOptions::default();
        let result = stage1_normalize_string(input, &options);
        assert_eq!(result, "정상텍스트PUA문자끝");
    }

    #[test]
    fn test_control_char_removal() {
        let input = "텍스트\x0B수직탭\x0C폼피드\u{FEFF}BOM끝";
        let result = stage1_normalize_string(input, &CleanupOptions::default());
        assert_eq!(result, "텍스트수직탭폼피드BOM끝");
    }

    #[test]
    fn test_fullwidth_normalization() {
        let input = "전각 스페이스A~Z";
        let result = stage1_normalize_string(input, &CleanupOptions::default());
        assert!(result.contains(' '));
    }

    #[test]
    fn test_nbsp_normalization() {
        let input = "non\u{00A0}breaking\u{00A0}space";
        let result = stage1_normalize_string(input, &CleanupOptions::default());
        assert_eq!(result, "non breaking space");
    }

    // --- Stage 4: list and whitespace normalization ------------------------

    #[test]
    fn test_consecutive_list_items() {
        let input = "- item 1\n\n- item 2\n\n- item 3";
        let result = stage4_final_normalize(input, &CleanupOptions::default());
        assert_eq!(result, "- item 1\n- item 2\n- item 3");
    }

    #[test]
    fn test_ordered_list_items() {
        let input = "1. first\n\n2. second\n\n3. third";
        let result = stage4_final_normalize(input, &CleanupOptions::default());
        assert_eq!(result, "1. first\n2. second\n3. third");
    }

    // --- Stage 2: page numbers, TOC, placeholders --------------------------

    #[test]
    fn test_page_number_hyphen() {
        let input = "본문\n\n- 15 -\n\n다음 내용";
        let result = stage2_clean_lines(input, &CleanupOptions::default());
        assert!(!result.contains("- 15 -"));
    }

    #[test]
    fn test_page_number_ratio() {
        let input = "본문\n\n1 / 20\n\n다음";
        let result = stage2_clean_lines(input, &CleanupOptions::default());
        assert!(!result.contains("1 / 20"));
    }

    #[test]
    fn test_page_number_korean() {
        let input = "본문\n\n12 쪽\n\n다음";
        let result = stage2_clean_lines(input, &CleanupOptions::default());
        assert!(!result.contains("12 쪽"));
    }

    #[test]
    fn test_toc_removal() {
        let input = "목차\n\n서론.......... 5\n제1장 개요...... 12\n\n본문 시작";
        let result = stage2_clean_lines(input, &CleanupOptions::default());
        assert!(!result.contains("서론.......... 5"));
        assert!(!result.contains("제1장 개요...... 12"));
    }

    #[test]
    fn test_hwp_placeholder_removal() {
        let input = "수식:\n\n[EQ]\n\n다음 내용";
        let result = stage2_clean_lines(input, &CleanupOptions::default());
        assert!(!result.contains("[EQ]"));
    }

    #[test]
    fn test_multiple_newlines() {
        let input = "첫번째\n\n\n\n\n두번째";
        let result = stage4_final_normalize(input, &CleanupOptions::default());
        assert!(!result.contains("\n\n"));
        assert!(result.contains("\n"));
        assert_eq!(result, "첫번째\n두번째");
    }

    #[test]
    fn test_orphan_line_removal() {
        let input = "정상 문장입니다.\n.\n,\n다음 문장입니다.";
        let result = stage4_final_normalize(input, &CleanupOptions::default());
        assert!(!result.lines().any(|l| l.trim() == "."));
        assert!(!result.lines().any(|l| l.trim() == ","));
    }

    // --- End-to-end pipeline -----------------------------------------------

    #[test]
    fn test_full_pipeline() {
        let input = concat!(
            "●첫번째 항목\n",
            "- 15 -\n",
            "\u{E000}PUA문자\n",
            "\n\n\n\n",
            "정상 내용입니다.\n",
            "서론.......... 5\n",
            "마지막 내용."
        );
        let result = cleanup(input, &CleanupOptions::default());
        assert!(
            result.contains("첫번째 항목"),
            "Expected '첫번째 항목' in result"
        );
        assert!(!result.contains("- 15 -"), "Page number should be removed");
        assert!(!result.contains('\u{E000}'), "PUA char should be removed");
        assert!(
            !result.contains("\n\n\n\n"),
            "Excessive newlines should be reduced"
        );
        assert!(
            !result.contains("서론.......... 5"),
            "TOC should be removed"
        );
    }

    // --- Option presets -----------------------------------------------------

    #[test]
    fn test_cleanup_options_minimal() {
        let options = CleanupOptions::minimal();
        assert!(options.normalize_strings);
        assert!(!options.clean_lines);
        assert!(!options.filter_structure);
        assert!(options.final_normalize);
    }

    #[test]
    fn test_cleanup_options_aggressive() {
        let options = CleanupOptions::aggressive();
        assert!(options.normalize_strings);
        assert!(options.clean_lines);
        assert!(options.filter_structure);
        assert!(options.final_normalize);
        assert!(options.header_footer_threshold < 0.8);
    }

    // --- Mojibake heuristics -------------------------------------------------

    #[test]
    fn test_mojibake_detection_isolated_cjk() {
        let input = "https://example.com湰灧";
        let result = clean_line_trailing_mojibake(input);
        assert_eq!(result, "https://example.com");
    }

    #[test]
    fn test_mojibake_preserves_legitimate_cjk() {
        let input = "다음은 중국어입니다: 你好世界";
        let result = clean_line_trailing_mojibake(input);
        assert_eq!(result, input);
    }

    #[test]
    fn test_mojibake_preserves_korean() {
        let input = "한글 테스트 문장입니다";
        let result = clean_line_trailing_mojibake(input);
        assert_eq!(result, input);
    }

    #[test]
    fn test_mojibake_single_trailing_cjk() {
        let input = "Normal text 湰";
        let result = clean_line_trailing_mojibake(input);
        assert_eq!(result, "Normal text ");
    }

    // --- YAML frontmatter handling ------------------------------------------

    #[test]
    fn test_frontmatter_extraction_simple() {
        let input = "---\ntitle: \"Test\"\n---\n\nContent here";
        let (fm, content) = extract_yaml_frontmatter(input);
        assert!(fm.is_some());
        assert!(fm.unwrap().contains("title: \"Test\""));
        assert!(content.contains("Content here"));
    }

    #[test]
    fn test_frontmatter_extraction_no_frontmatter() {
        let input = "Just regular content\nNo frontmatter here";
        let (fm, content) = extract_yaml_frontmatter(input);
        assert!(fm.is_none());
        assert_eq!(content, input);
    }

    #[test]
    fn test_frontmatter_preservation_in_pipeline() {
        let input =
            "---\ntitle: \"My Document\"\nformat: \"5.0.4.0\"\n---\n\nDocument content here.";
        let options = CleanupOptions::default();
        let result = stage3_filter_structure(input, &options);
        assert!(result.starts_with("---"), "Should start with ---");
        assert!(
            result.contains("title: \"My Document\""),
            "Title should be preserved"
        );
        assert!(
            result.contains("format: \"5.0.4.0\""),
            "Format should be preserved"
        );
        assert!(result.contains("---\n"), "Closing --- should exist");
    }

    #[test]
    fn test_frontmatter_not_corrupted() {
        let input = "---\nformat: \"5.0.4.0\"\n---\n\n## Heading\n\nParagraph text.";
        let options = CleanupOptions::default();
        let result = stage3_filter_structure(input, &options);
        assert!(
            !result.contains("## format"),
            "Should not convert format to heading"
        );
        assert!(
            result.contains("format: \"5.0.4.0\""),
            "Format value should be preserved"
        );
    }

    // --- Table preservation through the markdown round-trip ------------------

    #[test]
    fn test_table_pipe_preservation() {
        let input = "| Header 1 | Header 2 |\n|----------|----------|\n| Cell 1 | Cell 2 |";
        let options = CleanupOptions::default();
        let result = stage3_filter_structure(input, &options);
        assert!(
            !result.contains("\\|"),
            "Table pipes should not be escaped: {}",
            result
        );
        assert!(result.contains("|"), "Table pipes should be preserved");
    }

    #[test]
    fn test_table_in_full_pipeline() {
        let input =
            "# Title\n\n| Col A | Col B |\n|-------|-------|\n| Data 1 | Data 2 |\n\nParagraph.";
        let options = CleanupOptions::default();
        let result = cleanup(input, &options);
        assert!(
            !result.contains("\\|"),
            "Full pipeline should not escape table pipes: {}",
            result
        );
    }
}