orbok_extract/normalize.rs
1//! Text normalization, version `norm-v1` (RFC-005 §9).
2//!
3//! norm-v1 is deliberately small and exactly specified, because its
4//! output feeds content hashes and indexes — any change must come with
5//! a version bump:
6//! 1. strip a UTF-8 BOM at the start of the document;
7//! 2. normalize CRLF and lone CR to LF;
8//! 3. remove control characters except `\n` and `\t`;
9//! 4. trim trailing whitespace on each line.
10//!
11//! Unicode NFC normalization is intentionally **not** part of norm-v1
12//! (deferred to a future norm-v2 with RFC-014 language work); Japanese
13//! text passes through byte-identical apart from the rules above.
14
15/// Version constant recorded with every extraction.
16pub use orbok_core::versions::NORMALIZATION_VERSION;
17
18/// Apply norm-v1 to a whole document.
19pub fn normalize_document(input: &str) -> String {
20 let input = input.strip_prefix('\u{FEFF}').unwrap_or(input);
21 let mut out = String::with_capacity(input.len());
22 let mut line = String::new();
23 let mut chars = input.chars().peekable();
24 while let Some(c) = chars.next() {
25 match c {
26 '\r' => {
27 if chars.peek() == Some(&'\n') {
28 chars.next();
29 }
30 flush_line(&mut out, &mut line);
31 }
32 '\n' => flush_line(&mut out, &mut line),
33 '\t' => line.push('\t'),
34 c if c.is_control() => {} // rule 3
35 c => line.push(c),
36 }
37 }
38 if !line.is_empty() {
39 out.push_str(line.trim_end());
40 }
41 out
42}
43
/// Finish the line held in `line`: append it to `out` without its trailing
/// whitespace, terminate it with a single `\n`, and leave `line` empty so
/// the caller can reuse the buffer for the next line.
fn flush_line(out: &mut String, line: &mut String) {
    // `trim_end` yields a prefix of `line`, so its length is a valid
    // char boundary to truncate at.
    let kept = line.trim_end().len();
    line.truncate(kept);
    out.push_str(line.as_str());
    out.push('\n');
    line.clear();
}