undoc 0.3.0

High-performance Microsoft Office document extraction to Markdown
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
//! Text cleanup and normalization utilities.
//!
//! This module provides text cleaning functionality optimized for
//! LLM training data preparation.

use super::options::CleanupOptions;
use unicode_normalization::UnicodeNormalization;

/// Run the cleanup pipeline over `text`, applying only the stages enabled in `options`.
///
/// Stages run in a fixed order, each consuming the previous stage's output:
/// Unicode normalization, Private Use Area removal, line cleaning (which
/// receives `options.preserve_frontmatter`), structural filtering, and final
/// whitespace normalization. With every stage disabled the result is an owned
/// copy of `text`.
pub fn clean_text(text: &str, options: &CleanupOptions) -> String {
    let stage = if options.normalize_strings {
        normalize_unicode(text)
    } else {
        text.to_owned()
    };

    let stage = if options.remove_pua {
        remove_private_use_area(&stage)
    } else {
        stage
    };

    let stage = if options.clean_lines {
        clean_lines(&stage, options.preserve_frontmatter)
    } else {
        stage
    };

    let stage = if options.filter_structure {
        filter_structure(&stage)
    } else {
        stage
    };

    if options.final_normalize {
        final_normalize(&stage)
    } else {
        stage
    }
}

/// Normalize Unicode strings to NFC form and standardize common elements.
///
/// The input is first recomposed with `.nfc()`, then a fixed sequence of
/// character substitutions is applied in order: bullets, dashes, single
/// quotes, double quotes, spaces, and finally zero-width characters (which
/// are removed outright). A new `String` is returned; `text` is not modified.
fn normalize_unicode(text: &str) -> String {
    let normalized: String = text.nfc().collect();

    // Standardize bullets and dashes
    normalized
        // Various bullet characters
        // NOTE(review): the bullet character literals and the replacement string
        // on the next line appear empty in this view — presumably the original
        // characters were lost in extraction; verify against the upstream file.
        .replace(['', '', '', '', '', '', '', ''], "")
        // Various dashes (en-dash, em-dash, minus sign, figure dash) become an ASCII hyphen
        .replace(['\u{2013}', '\u{2014}', '\u{2212}', '\u{2012}'], "-")
        // Various single quotes (left single, right single) become an ASCII apostrophe
        .replace(['\u{2018}', '\u{2019}'], "'")
        // Various double quotes (left, right, low-9, left guillemet, right guillemet)
        // become an ASCII double quote
        .replace(
            ['\u{201C}', '\u{201D}', '\u{201E}', '\u{00AB}', '\u{00BB}'],
            "\"",
        )
        // Various spaces (non-breaking, en, em, thin, hair, narrow no-break)
        // become a plain ASCII space
        .replace(
            [
                '\u{00A0}', '\u{2002}', '\u{2003}', '\u{2009}', '\u{200A}', '\u{202F}',
            ],
            " ",
        )
        // Zero-width characters (remove: zero-width space, non-joiner, joiner, BOM)
        .replace(['\u{200B}', '\u{200C}', '\u{200D}', '\u{FEFF}'], "")
}

/// Strip every Private Use Area (PUA) code point from `text`.
///
/// Three ranges are removed:
/// - U+E000..=U+F8FF (BMP PUA)
/// - U+F0000..=U+FFFFD (Supplementary PUA-A)
/// - U+100000..=U+10FFFD (Supplementary PUA-B)
///
/// All other characters are kept in their original order.
fn remove_private_use_area(text: &str) -> String {
    /// True when `ch` falls inside one of the three PUA ranges.
    fn is_private_use(ch: char) -> bool {
        matches!(
            ch,
            '\u{E000}'..='\u{F8FF}' | '\u{F0000}'..='\u{FFFFD}' | '\u{100000}'..='\u{10FFFD}'
        )
    }

    let mut kept = text.to_owned();
    kept.retain(|ch| !is_private_use(ch));
    kept
}

/// Drop lines that look like headers, footers, page numbers, or TOC markers.
///
/// When `preserve_frontmatter` is true and the very first line is `---`
/// (ignoring surrounding whitespace), every line up to and including the next
/// `---` line is kept verbatim and never tested for removal. All remaining
/// lines are kept unless `should_skip_line` rejects them. The surviving lines
/// are re-joined with `\n`.
fn clean_lines(text: &str, preserve_frontmatter: bool) -> String {
    let mut kept: Vec<&str> = Vec::new();
    let mut inside_frontmatter = false;

    for (index, line) in text.lines().enumerate() {
        let is_fence = line.trim() == "---";

        // Already inside a frontmatter block: keep the line and close on the fence.
        if preserve_frontmatter && inside_frontmatter {
            kept.push(line);
            if is_fence {
                inside_frontmatter = false;
            }
            continue;
        }

        // Frontmatter may only open on the first line of the document.
        if preserve_frontmatter && index == 0 && is_fence {
            inside_frontmatter = true;
            kept.push(line);
            continue;
        }

        if !should_skip_line(line) {
            kept.push(line);
        }
    }

    kept.join("\n")
}

/// Decide whether a line is boilerplate that should be dropped.
///
/// The line is trimmed first. Blank lines are always kept; otherwise the line
/// is skipped when it matches, in order, the page-number, header/footer, or
/// TOC-marker heuristics.
fn should_skip_line(line: &str) -> bool {
    let candidate = line.trim();

    // Blank lines carry paragraph structure and are never removed here.
    if candidate.is_empty() {
        return false;
    }

    is_page_number(candidate) || is_header_footer(candidate) || is_toc_marker(candidate)
}

/// Check if line is a page number.
///
/// `line` is expected to be already trimmed. It is treated as a page number
/// when it is one of:
/// - a marker (`Page `, `page `, `- `) followed by digits, e.g. `Page 12`, `- 7`;
/// - digits followed by a marker (without its trailing space), e.g. `7 -`;
/// - digits wrapped in dashes, e.g. `- 7 -`;
/// - a bare number of at most five digits, e.g. `15`.
///
/// The digit part must be non-empty, so a lone `Page` or `-` is not a page
/// number, and digit-only lines longer than five characters are kept because
/// they are more likely to be real content (identifiers, phone numbers, ...).
fn is_page_number(line: &str) -> bool {
    /// True for a non-empty run of ASCII digits. The emptiness check matters:
    /// `all()` on an empty iterator is vacuously true.
    fn is_digits(s: &str) -> bool {
        !s.is_empty() && s.chars().all(|c| c.is_ascii_digit())
    }

    const MARKERS: [&str; 3] = ["Page ", "page ", "- "];

    let has_marker = MARKERS.iter().any(|&marker| {
        line.strip_prefix(marker)
            .is_some_and(|rest| is_digits(rest.trim()))
            || line
                .strip_suffix(marker.trim())
                .is_some_and(|rest| is_digits(rest.trim()))
    });
    if has_marker {
        return true;
    }

    // Centered footer style: "- 7 -".
    let dash_wrapped = line
        .strip_prefix('-')
        .and_then(|rest| rest.strip_suffix('-'))
        .is_some_and(|inner| is_digits(inner.trim()));
    if dash_wrapped {
        return true;
    }

    // Just a short number alone (potential page number).
    line.len() <= 5 && is_digits(line)
}

/// Report whether a line contains a typical header/footer boilerplate phrase.
///
/// The line is lowercased and searched for each phrase as a substring, so the
/// match is case-insensitive and the phrase may appear anywhere in the line.
fn is_header_footer(line: &str) -> bool {
    // Phrases are stored lowercase because they are compared against the
    // lowercased line.
    const BOILERPLATE: [&str; 7] = [
        "all rights reserved",
        "confidential",
        "proprietary",
        "copyright ©",
        "copyright (c)",
        "© ",
        "(c) ",
    ];

    let lowered = line.to_lowercase();
    BOILERPLATE.iter().any(|&phrase| lowered.contains(phrase))
}

/// Check if line is a TOC marker.
///
/// A line counts as a table-of-contents marker when either:
/// - it contains a dot leader — three ASCII dots in a row (`...`) or a
///   horizontal ellipsis (U+2026) — and more than three leader dots overall,
///   where each ellipsis counts as three dots; or
/// - it is exactly `table of contents` or `contents`, ignoring case.
///
/// Requiring an actual leader run keeps ordinary prose with several scattered
/// periods (version numbers, abbreviations) from being mistaken for a TOC
/// entry.
fn is_toc_marker(line: &str) -> bool {
    const ELLIPSIS: char = '\u{2026}';

    // TOC patterns - lines with lots of dots (leader dots)
    if line.contains("...") || line.contains(ELLIPSIS) {
        let dot_count = line
            .chars()
            .map(|c| match c {
                '.' => 1,
                ELLIPSIS => 3,
                _ => 0,
            })
            .sum::<usize>();
        if dot_count > 3 {
            return true;
        }
    }

    // Explicit TOC headers
    let lower = line.to_lowercase();
    lower == "table of contents" || lower == "contents"
}

/// Squeeze every run of consecutive blank lines down to a single blank line.
///
/// A line is blank when it is empty or contains only whitespace. The first
/// blank line of a run is kept exactly as written; the rest of the run is
/// dropped. Per CommonMark, multiple blank lines render the same as one, so
/// the rendered document is unchanged. Lines are re-joined with `\n`.
pub(crate) fn collapse_blank_lines(text: &str) -> String {
    let mut last_was_blank = false;

    text.lines()
        .filter(|line| {
            let blank = line.trim().is_empty();
            let keep = !(blank && last_was_blank);
            // Only a kept line updates the state; a dropped line can only occur
            // while the state is already "blank".
            if keep {
                last_was_blank = blank;
            }
            keep
        })
        .collect::<Vec<_>>()
        .join("\n")
}

/// Remove orphaned structural markers, then collapse repeated blank lines.
///
/// A line is dropped when its trimmed content is exactly one of the single
/// characters `|`, `-`, `_`, `=`, `*`, `#`, or `~`. The remaining lines are
/// re-joined with `\n` and passed through `collapse_blank_lines`.
fn filter_structure(text: &str) -> String {
    // Single-character leftovers of tables, rules, emphasis and headings.
    const ORPHANS: [&str; 7] = ["|", "-", "_", "=", "*", "#", "~"];

    let kept: Vec<&str> = text
        .lines()
        .filter(|line| !ORPHANS.contains(&line.trim()))
        .collect();

    collapse_blank_lines(&kept.join("\n"))
}

/// Final whitespace normalization.
///
/// Within each line, every run of whitespace becomes a single space and
/// trailing whitespace is removed; leading whitespace therefore shrinks to one
/// space. Lines are re-joined with `\n`, and whitespace (including blank
/// lines) at the very start and end of the document is trimmed.
fn final_normalize(text: &str) -> String {
    let lines: Vec<String> = text
        .lines()
        .map(|line| {
            let words = line.split_whitespace().collect::<Vec<_>>().join(" ");
            // A leading whitespace run survives as exactly one space, but only
            // when the line has content; whitespace-only lines become empty.
            if !words.is_empty() && line.starts_with(char::is_whitespace) {
                format!(" {}", words)
            } else {
                words
            }
        })
        .collect();

    // Trim leading/trailing whitespace from the entire document.
    lines.join("\n").trim().to_string()
}

/// Detect potential mojibake patterns (for reporting, not fixing).
///
/// Scans `text` line by line for character sequences that typically appear
/// when UTF-8 bytes are decoded as Windows-1252. Returns one entry per
/// (line, pattern) hit as `(line_number, message)`, where `line_number` is
/// 1-based and `message` names the character that was probably intended.
/// A pattern is reported at most once per line. The text is never modified.
#[allow(dead_code)]
pub fn detect_mojibake(text: &str) -> Vec<(usize, String)> {
    // Each entry is the Windows-1252 rendering of a character's UTF-8 bytes,
    // paired with the name of the character those bytes actually encode.
    const PATTERNS: &[(&str, &str)] = &[
        // E2 80 93: byte 0x93 is U+201C in Windows-1252, so this is the en dash.
        ("\u{00E2}\u{20AC}\u{201C}", "en-dash"),
        // E2 80 94: byte 0x94 is U+201D in Windows-1252.
        ("\u{00E2}\u{20AC}\u{201D}", "em-dash"),
        ("\u{00E2}\u{20AC}\u{2122}", "apostrophe"),
        ("\u{00E2}\u{20AC}\u{0153}", "left quote"),
        ("\u{00C3}\u{00A9}", "e-acute"),
        ("\u{00C3}\u{00A8}", "e-grave"),
        ("\u{00C3}\u{00A0}", "a-grave"),
        ("\u{00C3}\u{00A2}", "a-circumflex"),
        ("\u{00C2}\u{00A0}", "non-breaking space"),
        ("\u{00C3}\u{00A7}", "c-cedilla"),
    ];

    let mut issues = Vec::new();

    for (index, line) in text.lines().enumerate() {
        for &(pattern, desc) in PATTERNS {
            if line.contains(pattern) {
                issues.push((
                    index + 1,
                    format!("Possible mojibake: {} ({})", pattern, desc),
                ));
            }
        }
    }

    issues
}

// Unit tests for the cleanup stages; each test drives one stage directly,
// except `test_clean_text_full`, which runs the whole pipeline.
#[cfg(test)]
mod tests {
    use super::*;

    // Dashes are folded to an ASCII hyphen.
    #[test]
    fn test_normalize_unicode() {
        // en-dash (\u{2013}) and em-dash (\u{2014})
        let input = "Hello \u{2013} World \u{2014} Test";
        let result = normalize_unicode(input);
        assert_eq!(result, "Hello - World - Test");
    }

    // Typographic quotes are folded to their ASCII counterparts.
    #[test]
    fn test_normalize_quotes() {
        // smart quotes: \u{201C} left, \u{201D} right double; \u{2018} left, \u{2019} right single
        let input = "\u{201C}Smart quotes\u{201D} and \u{2018}apostrophes\u{2019}";
        let result = normalize_unicode(input);
        assert_eq!(result, "\"Smart quotes\" and 'apostrophes'");
    }

    // BMP Private Use Area characters are removed; surrounding text is untouched.
    #[test]
    fn test_remove_pua() {
        let input = "Normal text\u{E001}hidden\u{F000}text";
        let result = remove_private_use_area(input);
        assert_eq!(result, "Normal texthiddentext");
    }

    // Both "Page N" lines and bare-number lines are dropped.
    #[test]
    fn test_clean_lines_page_numbers() {
        let input = "Content here\nPage 1\nMore content\n15";
        let result = clean_lines(input, false);
        assert!(!result.contains("Page 1"));
        assert!(!result.contains("\n15"));
    }

    // Frontmatter survives line cleaning while the body is still filtered.
    #[test]
    fn test_clean_lines_preserve_frontmatter() {
        let input = "---\ntitle: Test\n---\nContent\nPage 1";
        let result = clean_lines(input, true);
        assert!(result.contains("title: Test"));
        assert!(!result.contains("Page 1"));
    }

    // Structural filtering also collapses runs of blank lines.
    #[test]
    fn test_filter_structure() {
        let input = "Line 1\n\n\n\nLine 2";
        let result = filter_structure(input);
        assert!(!result.contains("\n\n\n")); // No triple blank lines
    }

    // A run of blank lines becomes exactly one blank line.
    #[test]
    fn test_collapse_blank_lines_basic() {
        let input = "A\n\n\n\nB";
        assert_eq!(collapse_blank_lines(input), "A\n\nB");
    }

    // Already-collapsed input comes back unchanged.
    #[test]
    fn test_collapse_blank_lines_idempotent() {
        let input = "A\n\nB\n\nC";
        assert_eq!(collapse_blank_lines(input), input);
    }

    #[test]
    fn test_collapse_blank_lines_whitespace_only() {
        // Lines containing only whitespace count as blank
        let input = "A\n\n   \n\t\nB";
        assert_eq!(collapse_blank_lines(input), "A\n\nB");
    }

    // A single blank line between paragraphs is kept.
    #[test]
    fn test_collapse_blank_lines_preserves_single_blank() {
        let input = "A\n\nB";
        assert_eq!(collapse_blank_lines(input), "A\n\nB");
    }

    // Input without blank lines is left as-is.
    #[test]
    fn test_collapse_blank_lines_no_blanks() {
        let input = "A\nB\nC";
        assert_eq!(collapse_blank_lines(input), "A\nB\nC");
    }

    // Runs of spaces inside a line shrink to one space.
    #[test]
    fn test_final_normalize() {
        let input = "Multiple   spaces   here";
        let result = final_normalize(input);
        assert_eq!(result, "Multiple spaces here");
    }

    // End-to-end run with every stage enabled except mojibake detection.
    #[test]
    fn test_clean_text_full() {
        let options = CleanupOptions {
            normalize_strings: true,
            clean_lines: true,
            filter_structure: true,
            final_normalize: true,
            remove_pua: true,
            detect_mojibake: false,
            preserve_frontmatter: true,
        };

        let input = "---\ntitle: Test\n---\n\nHello – World\n\n\n\nPage 1\nContent.";
        let result = clean_text(input, &options);

        assert!(result.contains("Hello - World")); // Normalized dash
        assert!(!result.contains("Page 1")); // Removed page number
        assert!(!result.contains("\n\n\n")); // No excess blank lines
    }

    // A known mis-decoded sequence produces at least one report.
    #[test]
    fn test_detect_mojibake() {
        // Mojibake pattern under test: \u{00E2}\u{20AC}\u{201C}
        let input = "This has \u{00E2}\u{20AC}\u{201C} some issues";
        let issues = detect_mojibake(input);
        assert!(!issues.is_empty());
    }
}