aimcal_ical/value/
text.rs

1// SPDX-FileCopyrightText: 2025-2026 Zexin Yuan <aim@yzx9.xyz>
2//
3// SPDX-License-Identifier: Apache-2.0
4
//! Parser and data types for the TEXT property value defined in RFC 5545 Section 3.3.11.
6
7use std::borrow::Cow;
8use std::fmt;
9
10use chumsky::Parser;
11use chumsky::container::Container;
12use chumsky::extra::ParserExtra;
13use chumsky::prelude::*;
14
15use crate::string_storage::{SegmentedSpannedChars, Segments, Span, StringStorage};
16
17/// Text value type defined in RFC 5545 Section 3.3.11.
18#[derive(Default, Debug, Clone)]
19pub struct ValueText<S: StringStorage> {
20    tokens: Vec<(ValueTextToken<S>, S::Span)>,
21}
22
23impl<'a> ValueText<Segments<'a>> {
24    /// Resolve the text value into a single string, processing escapes.
25    ///
26    /// This version tries to avoid allocation when there's only a single string token.
27    #[must_use]
28    pub fn resolve(&self) -> Cow<'a, str> {
29        #[expect(clippy::indexing_slicing)]
30        if self.tokens.len() == 1
31            && let (ValueTextToken::Str(part), _) = &self.tokens[0]
32        {
33            part.resolve()
34        } else {
35            Cow::Owned(self.to_string())
36        }
37    }
38
39    /// Compare the text value with a string, ignoring ASCII case.
40    ///
41    /// This method iterates through tokens without allocating a new string,
42    /// using string slice comparison for efficiency.
43    #[must_use]
44    pub(crate) fn eq_str_ignore_ascii_case(&self, other: &str) -> bool {
45        let mut remaining = other;
46
47        for (token, _) in &self.tokens {
48            if remaining.is_empty() {
49                return false;
50            }
51
52            match token {
53                ValueTextToken::Str(part) => {
54                    if part.len() > remaining.len() {
55                        return false;
56                    }
57                    let Some((head, tail)) = remaining.split_at_checked(part.len()) else {
58                        return false;
59                    };
60                    if !part.eq_str_ignore_ascii_case(head) {
61                        return false;
62                    }
63                    remaining = tail;
64                }
65                ValueTextToken::Escape(escape_char) => {
66                    // Escape token is exactly 1 character
67                    let Some((first, rest)) = remaining.split_at_checked(1) else {
68                        return false;
69                    };
70                    // Compare first with expected escape character
71                    if !first.eq_ignore_ascii_case(escape_char.as_ref()) {
72                        return false;
73                    }
74                    remaining = rest;
75                }
76            }
77        }
78
79        // Check if we've consumed all characters from other
80        remaining.is_empty()
81    }
82
83    /// Convert borrowed type to owned type
84    #[must_use]
85    pub fn to_owned(&self) -> ValueText<String> {
86        ValueText {
87            tokens: self
88                .tokens
89                .iter()
90                .map(|(token, _)| match token {
91                    ValueTextToken::Str(s) => ValueTextToken::Str(s.to_owned()),
92                    ValueTextToken::Escape(c) => ValueTextToken::Escape(*c),
93                })
94                .map(|token| (token, ()))
95                .collect(),
96        }
97    }
98
99    /// Get the full span from the first to the last token.
100    ///
101    /// This method provides O(1) access to the span that covers all tokens
102    /// in the `ValueText`, from the first character to the last.
103    #[must_use]
104    pub fn span(&self) -> Span {
105        if self.tokens.is_empty() {
106            Span { start: 0, end: 0 }
107        } else {
108            #[expect(clippy::indexing_slicing)]
109            let first = &self.tokens[0].1;
110            #[expect(clippy::indexing_slicing)]
111            let last = &self.tokens[self.tokens.len() - 1].1;
112            Span {
113                start: first.start,
114                end: last.end,
115            }
116        }
117    }
118
119    /// Create an iterator over characters with their spans.
120    ///
121    /// This method provides a zero-copy iterator that yields each character
122    /// along with its source position, enabling accurate error reporting.
123    #[must_use]
124    pub fn into_spanned_chars(self) -> ValueTextSpannedChars<'a> {
125        ValueTextSpannedChars {
126            tokens: self.tokens.into_iter(),
127            current_segments: None,
128            current_escape: None,
129        }
130    }
131}
132
133/// Iterator over characters in a `ValueText` with their spans.
134///
135/// This struct is created by `ValueText::into_spanned_chars()` and yields
136/// characters along with their source positions.
137///
138/// # Lifetime
139///
140/// The lifetime parameter `'a` represents the lifetime of the underlying
141/// string data in the original `ValueText`.
142#[derive(Debug)]
143pub struct ValueTextSpannedChars<'a> {
144    /// Remaining tokens to process
145    tokens: std::vec::IntoIter<(ValueTextToken<Segments<'a>>, Span)>,
146    /// Current segment spanned chars iterator (if processing a Str token)
147    current_segments: Option<SegmentedSpannedChars<'a>>,
148    /// Current escape char (if processing an Escape token)
149    current_escape: Option<(char, Span)>,
150}
151
152impl Iterator for ValueTextSpannedChars<'_> {
153    type Item = (char, Span);
154
155    fn next(&mut self) -> Option<Self::Item> {
156        loop {
157            // Try to get next char from current segments iterator
158            if let Some(ref mut iter) = self.current_segments {
159                if let Some(item) = iter.next() {
160                    return Some(item);
161                }
162                self.current_segments = None;
163            }
164
165            // Try to get next char from current escape
166            if let Some(item) = self.current_escape.take() {
167                return Some(item);
168            }
169
170            // Get next token
171            let (token, span) = self.tokens.next()?;
172
173            match token {
174                ValueTextToken::Str(segments) => {
175                    self.current_segments = Some(segments.into_spanned_chars());
176                }
177                ValueTextToken::Escape(escape_char) => {
178                    let c = escape_char.as_ref().chars().next().unwrap();
179                    self.current_escape = Some((c, span));
180                }
181            }
182        }
183    }
184}
185
186impl<S: StringStorage> fmt::Display for ValueText<S> {
187    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
188        for (token, _) in &self.tokens {
189            match token {
190                ValueTextToken::Str(part) => write!(f, "{part}")?,
191                ValueTextToken::Escape(c) => write!(f, "{c}")?,
192            }
193        }
194        Ok(())
195    }
196}
197
198impl ValueText<String> {
199    /// Create a new `ValueText<String>` from a string.
200    ///
201    /// This constructor is provided for convenient construction of owned text values.
202    /// The input string is treated as a single unescaped text token.
203    #[must_use]
204    pub fn new(value: String) -> Self {
205        Self {
206            tokens: vec![(ValueTextToken::Str(value), ())],
207        }
208    }
209}
210
211#[derive(Debug, Clone)]
212enum ValueTextToken<S: StringStorage> {
213    Str(S),
214    Escape(ValueTextEscape),
215}
216
/// The characters that an escape sequence in a TEXT value can stand for.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ValueTextEscape {
    /// A literal backslash.
    Backslash,
    /// A literal semicolon.
    Semicolon,
    /// A literal comma.
    Comma,
    /// A line feed.
    Newline,
}

/// Gives the decoded, one-character text of the escape.
impl AsRef<str> for ValueTextEscape {
    fn as_ref(&self) -> &str {
        match *self {
            Self::Backslash => "\\",
            Self::Semicolon => ";",
            Self::Comma => ",",
            Self::Newline => "\n",
        }
    }
}

/// Formats the escape as its decoded text, honoring the formatter's flags.
impl fmt::Display for ValueTextEscape {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        <str as fmt::Display>::fmt(self.as_ref(), f)
    }
}
241
242#[derive(Debug)]
243pub struct RawValueText(Vec<Either<SpanCollector, (ValueTextEscape, SimpleSpan)>>);
244
245impl RawValueText {
246    pub fn build<'src>(self, src: &Segments<'src>) -> ValueText<Segments<'src>> {
247        let size = self.0.iter().fold(0, |acc, t| match t {
248            Either::Left(collector) => acc + collector.0.len(),
249            Either::Right(_) => acc + 1,
250        });
251
252        let mut tokens = Vec::with_capacity(size);
253        for t in self.0 {
254            match t {
255                Either::Left(collector) => tokens.extend(
256                    collector
257                        .build(src)
258                        .into_iter()
259                        .map(|(s, span)| (ValueTextToken::Str(s), span.into())),
260                ),
261                Either::Right((v, span)) => tokens.push((ValueTextToken::Escape(v), span.into())),
262            }
263        }
264
265        ValueText { tokens }
266    }
267}
268
269/// Format Definition:  This value type is defined by the following notation:
270///
271/// ```txt
272/// text       = *(TSAFE-CHAR / ":" / DQUOTE / ESCAPED-CHAR)
273/// ; Folded according to description above
274///
275/// ESCAPED-CHAR = ("\\" / "\;" / "\," / "\N" / "\n")
276/// ; \\ encodes \, \N or \n encodes newline
277/// ; \; encodes ;, \, encodes ,
278///
279/// TSAFE-CHAR = WSP / %x21 / %x23-2B / %x2D-39 / %x3C-5B / %x5D-7E / NON-US-
280/// ASCII
281/// ; Any character except CONTROLs not needed by the current
282/// ; character set, DQUOTE, ";", ":", "\", ","
283/// ```
284fn value_text<'src, I, E>() -> impl Parser<'src, I, RawValueText, E>
285where
286    I: Input<'src, Token = char, Span = SimpleSpan>,
287    E: ParserExtra<'src, I>,
288{
289    let s = select! { c if c != '\\' => c }
290        .ignored()
291        .repeated()
292        .at_least(1)
293        .map_with(|(), e| e.span())
294        .collect::<SpanCollector>()
295        .map(Either::Left);
296
297    let escape = just('\\')
298        .ignore_then(select! {
299            ';' => ValueTextEscape::Semicolon,
300            ',' => ValueTextEscape::Comma,
301            'N' | 'n' => ValueTextEscape::Newline,
302            '\\' => ValueTextEscape::Backslash,
303        })
304        .map_with(|v, e| (v, e.span()))
305        .map(Either::Right);
306
307    choice((s, escape)).repeated().collect().map(RawValueText)
308}
309
310/// Text multiple values parser.
311///
312/// If the property permits, multiple TEXT values are specified by a
313/// COMMA-separated list of values.
314pub fn values_text<'src, I, E>() -> impl Parser<'src, I, Vec<RawValueText>, E>
315where
316    I: Input<'src, Token = char, Span = SimpleSpan>,
317    E: ParserExtra<'src, I>,
318{
319    value_text().separated_by(just(',')).collect()
320}
321
322#[derive(Debug, Default)]
323struct SpanCollector(Vec<SimpleSpan>);
324
325impl SpanCollector {
326    fn build<'src>(self, src: &Segments<'src>) -> Vec<(Segments<'src>, SimpleSpan)> {
327        // assume src segments are non-overlapping and sorted
328        let mut iter = src.segments.iter();
329        let Some(mut item) = iter.next() else {
330            return Vec::new(); // no segments
331        };
332
333        let mut vec = Vec::with_capacity(self.0.len());
334        for span in self.0 {
335            let mut flag = true;
336            while flag {
337                if span.start > item.1.end {
338                    // need next segment, and skip this one
339                    match iter.next() {
340                        Some(a) => item = a,
341                        None => flag = false, // no more segments
342                    }
343                } else if span.end > item.1.end {
344                    // need next segment
345                    let i = span.start.saturating_sub(item.1.start);
346                    let s = item.0.get(i..).unwrap(); // SAFETY: since in range
347                    match iter.next() {
348                        Some(a) => item = a,
349                        None => flag = false, // no more segments
350                    }
351                    vec.push((Segments::new(vec![(s, span.into())]), span));
352                } else {
353                    // within this segment
354                    flag = false;
355                    let i = span.start.saturating_sub(item.1.start);
356                    let j = span.end.saturating_sub(item.1.start);
357                    let s = item.0.get(i..j).unwrap(); // SAFETY: since i,j are in range
358                    vec.push((Segments::new(vec![(s, span.into())]), span));
359                }
360            }
361        }
362        vec
363    }
364}
365
366impl Container<SimpleSpan> for SpanCollector {
367    fn with_capacity(n: usize) -> Self {
368        Self(Vec::with_capacity(n))
369    }
370
371    fn push(&mut self, span: SimpleSpan) {
372        match self.0.last_mut() {
373            Some(last) if last.end() == span.start() => {
374                *last = SimpleSpan::new(last.context(), last.start()..span.end());
375            }
376            _ => self.0.push(span),
377        }
378    }
379}
380
/// A value that is one of two alternatives.
#[derive(Debug)]
enum Either<L, R> {
    /// The first alternative.
    Left(L),
    /// The second alternative.
    Right(R),
}
386
#[cfg(test)]
mod tests {
    use chumsky::input::Stream;

    use crate::syntax::syntax_analysis;

    use super::*;

    /// Adapt `segs` into a parser input of characters with `SimpleSpan`s.
    fn make_input(segs: Segments<'_>) -> impl Input<'_, Token = char, Span = SimpleSpan> {
        // The end-of-input span covers everything from the first segment to the last.
        let (start, end) = match (segs.segments.first(), segs.segments.last()) {
            (Some(first), Some(last)) => (first.1.start, last.1.end),
            _ => (0, 0),
        };
        let eoi = SimpleSpan::new((), start..end);
        Stream::from_iter(segs.into_spanned_chars()).map(eoi, |(ch, span)| {
            // Convert our custom Span to SimpleSpan
            (ch, SimpleSpan::new((), span.start..span.end))
        })
    }

    /// Run `value_text` over `segs` and build the resulting value.
    fn parse_segments<'src>(segs: &Segments<'src>) -> ValueText<Segments<'src>> {
        let stream = make_input(segs.clone());
        value_text::<'_, _, extra::Err<Rich<_>>>()
            .parse(stream)
            .into_result()
            .map(|raw_text| raw_text.build(segs))
            .unwrap()
    }

    /// Parse the value of the only property of the only component in `src`.
    fn parse(src: &str) -> ValueText<Segments<'_>> {
        let comps = syntax_analysis(src).unwrap();
        assert_eq!(comps.len(), 1);
        let syntax_component = comps.first().unwrap();
        assert_eq!(syntax_component.properties.len(), 1);

        let segs = syntax_component.properties.first().unwrap().value.clone();
        parse_segments(&segs)
    }

    /// Wrap `src` as the value of a single property inside a component.
    fn with_component(src: &str) -> String {
        format!("BEGIN:VEVENT\r\nTEST_PROP:{src}\r\nEND:VEVENT")
    }

    #[test]
    fn parses_text() {
        #[rustfmt::skip]
        let success_cases = [
            // examples from RFC 5545 Section 3.3.11
            (r"Project XYZ Final Review\nConference Room - 3B\nCome Prepared.",
              "Project XYZ Final Review\nConference Room - 3B\nCome Prepared."),
            // extra tests
            (r"Hello\, World\; \N", "Hello, World; \n"),
            ( r#""Quoted Text" and more text"#, r#""Quoted Text" and more text"#,),
            ("Unicode 字符串 🎉", "Unicode 字符串 🎉"),
            ("123\r\n 456\r\n\t789", "123456789"),
        ];
        for (value, expected) in success_cases {
            let src = with_component(value);
            let text = parse(&src);
            assert_eq!(text.to_string(), expected);
        }
    }

    #[test]
    fn value_text_eq_str_ignore_ascii_case() {
        /// Parse `value` and check it against strings that must and must not compare equal.
        fn check(value: &str, equal: &[&str], different: &[&str]) {
            let src = with_component(value);
            let text = parse(&src);
            for candidate in equal {
                assert!(text.eq_str_ignore_ascii_case(candidate));
            }
            for candidate in different {
                assert!(!text.eq_str_ignore_ascii_case(candidate));
            }
        }

        // Basic case-insensitive matching
        check("ABC", &["abc", "ABC"], &["xyz"]);

        // With a space
        check("ABC DEF", &["abc def", "ABC DEF"], &[]);

        // Mixed case
        check(
            "Hello World",
            &["hello world", "HELLO WORLD", "HeLlO WoRlD", "Hello World"],
            &[],
        );

        // Escaped comma
        check(r"Hello\, World", &["hello, world", "HELLO, WORLD"], &[]);

        // Escaped semicolon
        check(r"Hello\; World", &["hello; world", "HELLO; WORLD"], &[]);

        // Escaped backslash
        check(r"C:\\Path", &["c:\\path", "C:\\PATH"], &[]);

        // Length difference
        check("abc", &[], &["abcd", "ab"]);

        // Escaped newline (using \N per RFC 5545)
        check(r"Hello\NWorld", &["hello\nworld", "HELLO\nWORLD"], &[]);

        // Multiple escape sequences
        check(
            r"Text\, with\; \\escapes\Nand more",
            &[
                "text, with; \\escapes\nand more",
                "TEXT, WITH; \\ESCAPES\nAND MORE",
            ],
            &[],
        );
    }

    #[test]
    fn value_text_eq_str_ignore_ascii_case_empty() {
        let segs = Segments::default();
        let text = parse_segments(&segs);

        assert!(text.eq_str_ignore_ascii_case(""));
        assert!(!text.eq_str_ignore_ascii_case("a"));
    }
}
aimcal_ical/value/text.rs

aimcal_ical/value/
text.rs