Skip to main content

ics_core/parser/
unfold.rs

1//! RFC 5545 §3.1 content line unfolding.
2//!
3//! Physical lines longer than 75 octets are folded by inserting a CRLF
4//! followed by a single SPACE or HTAB. The reader must reverse this
5//! transformation before per-line parsing — the leading folding-marker
6//! whitespace is part of the *fold*, not the value.
7//!
8//! Also handles a leading UTF-8 BOM (`U+FEFF`) tolerantly: many tools
9//! (notably Outlook) emit one, and the rest of the parser assumes the
10//! input starts at the first wire character.
11
/// Remove one leading byte-order mark (`U+FEFF`) from `content`.
///
/// Returns the slice that follows the BOM when `content` begins with one,
/// and `content` itself otherwise. Only a single BOM at the very start is
/// removed; a BOM anywhere else in the input is left in place.
pub fn strip_bom(content: &str) -> &str {
    const BOM: char = '\u{FEFF}';

    let Some(after_bom) = content.strip_prefix(BOM) else {
        return content;
    };
    after_bom
}
16
17/// Split `content` into RFC 5545 logical lines.
18///
19/// Steps:
20/// 1. Drop a leading UTF-8 BOM if present.
21/// 2. Accept either `CRLF` or `LF` as the physical line terminator.
22/// 3. A line whose first byte is `SPACE` (0x20) or `HTAB` (0x09) is a
23///    continuation of the previous logical line; the folding-marker
24///    whitespace is dropped, and the remainder appends to the previous
25///    logical line.
26///
27/// Returns owned `String`s because folded continuation joining means we
28/// can no longer borrow slices of the input.
29pub fn unfold(content: &str) -> Vec<String> {
30    let content = strip_bom(content);
31    let normalized = content.replace("\r\n", "\n");
32    let mut logical: Vec<String> = Vec::new();
33    let mut current: Option<String> = None;
34
35    for line in normalized.split('\n') {
36        if let Some(rest) = line.strip_prefix(' ').or_else(|| line.strip_prefix('\t')) {
37            match current.as_mut() {
38                Some(c) => c.push_str(rest),
39                None => current = Some(rest.to_string()),
40            }
41        } else {
42            if let Some(c) = current.take() {
43                logical.push(c);
44            }
45            current = Some(line.to_string());
46        }
47    }
48    if let Some(c) = current.take() {
49        logical.push(c);
50    }
51    // The final split element from a trailing newline is an empty string;
52    // drop trailing empties so consumers don't see phantom blank logical
53    // lines.
54    while matches!(logical.last(), Some(s) if s.is_empty()) {
55        logical.pop();
56    }
57    logical
58}
59
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn strip_bom_removes_leading_bom_only() {
        // Leading BOM is removed.
        assert_eq!(strip_bom("\u{FEFF}HELLO"), "HELLO");
        // Input without a BOM comes back unchanged.
        assert_eq!(strip_bom("HELLO"), "HELLO");
        // A BOM that is not at the start must be preserved verbatim.
        assert_eq!(strip_bom("HEL\u{FEFF}LO"), "HEL\u{FEFF}LO");
    }

    #[test]
    fn unfold_passes_through_single_lines() {
        // No folding present: one logical line per physical line.
        assert_eq!(
            unfold("BEGIN:VCALENDAR\r\nVERSION:2.0\r\nEND:VCALENDAR\r\n"),
            ["BEGIN:VCALENDAR", "VERSION:2.0", "END:VCALENDAR"]
        );
    }

    #[test]
    fn unfold_joins_space_continuation() {
        // "A:long-value-folded" CRLF SPACE "here" collapses to a single
        // logical line, with the SPACE fold marker removed.
        assert_eq!(
            unfold("A:long-value-folded\r\n here\r\n"),
            ["A:long-value-foldedhere"]
        );
    }

    #[test]
    fn unfold_joins_tab_continuation() {
        // HTAB is an equally valid fold marker.
        assert_eq!(
            unfold("A:long-value-folded\r\n\there\r\n"),
            ["A:long-value-foldedhere"]
        );
    }

    #[test]
    fn unfold_joins_multiple_continuations() {
        // Consecutive continuation lines all append to the same logical line.
        assert_eq!(
            unfold("A:part1\r\n part2\r\n part3\r\n"),
            ["A:part1part2part3"]
        );
    }

    #[test]
    fn unfold_accepts_lf_only_line_terminators() {
        // Bare LF is tolerated in place of CRLF.
        assert_eq!(unfold("A:foo\nB:bar\n"), ["A:foo", "B:bar"]);
    }

    #[test]
    fn unfold_strips_leading_bom() {
        // The BOM must not end up glued to the first property name.
        assert_eq!(
            unfold("\u{FEFF}BEGIN:VCALENDAR\r\nEND:VCALENDAR\r\n"),
            ["BEGIN:VCALENDAR", "END:VCALENDAR"]
        );
    }

    #[test]
    fn unfold_preserves_utf8_in_continuation() {
        // Multi-byte characters on either side of a fold stay intact.
        assert_eq!(
            unfold("SUMMARY:憲法\r\n 記念日\r\n"),
            ["SUMMARY:憲法記念日"]
        );
    }

    #[test]
    fn unfold_drops_trailing_empty_logical_line() {
        // Splitting on the final CRLF produces an empty last element,
        // which must not be reported as a logical line.
        assert_eq!(unfold("A:foo\r\n"), ["A:foo"]);
    }
}