Skip to main content

orbok_extract/
normalize.rs

1//! Text normalization, version `norm-v1` (RFC-005 §9).
2//!
3//! norm-v1 is deliberately small and exactly specified, because its
4//! output feeds content hashes and indexes — any change must come with
5//! a version bump:
6//! 1. strip a UTF-8 BOM at the start of the document;
7//! 2. normalize CRLF and lone CR to LF;
8//! 3. remove control characters except `\n` and `\t`;
9//! 4. trim trailing whitespace on each line.
10//!
11//! Unicode NFC normalization is intentionally **not** part of norm-v1
12//! (deferred to a future norm-v2 with RFC-014 language work); Japanese
13//! text passes through byte-identical apart from the rules above.
14
15/// Version constant recorded with every extraction.
16pub use orbok_core::versions::NORMALIZATION_VERSION;
17
18/// Apply norm-v1 to a whole document.
19pub fn normalize_document(input: &str) -> String {
20    let input = input.strip_prefix('\u{FEFF}').unwrap_or(input);
21    let mut out = String::with_capacity(input.len());
22    let mut line = String::new();
23    let mut chars = input.chars().peekable();
24    while let Some(c) = chars.next() {
25        match c {
26            '\r' => {
27                if chars.peek() == Some(&'\n') {
28                    chars.next();
29                }
30                flush_line(&mut out, &mut line);
31            }
32            '\n' => flush_line(&mut out, &mut line),
33            '\t' => line.push('\t'),
34            c if c.is_control() => {} // rule 3
35            c => line.push(c),
36        }
37    }
38    if !line.is_empty() {
39        out.push_str(line.trim_end());
40    }
41    out
42}
43
/// Emit the pending line into `out` and reset the line buffer.
///
/// The contents of `line`, minus trailing whitespace, are appended to `out`
/// followed by a single `\n`; `line` is then emptied so it can be reused for
/// the next line.
fn flush_line(out: &mut String, line: &mut String) {
    let trimmed = line.trim_end();
    out.extend([trimmed, "\n"]);
    line.clear();
}