eml_codec/text/
utf8.rs

1#[cfg(feature = "tracing-recover")]
2use crate::utils::bytes_to_trace_string;
3use nom::{
4    character::complete::{space0, space1},
5    error::{Error, ErrorKind},
6    Err, IResult,
7};
8use std::borrow::Cow;
9#[cfg(feature = "tracing-recover")]
10use tracing::warn;
11
12/// Parses the input as a sequence of UTF-8 characters that satisfy the
13/// predicate `cond`. If invalid UTF-8 is encountered, it is replaced by
14/// [`char::REPLACEMENT_CHARACTER`] and parsing continues.
15///
16/// This function is zero-copy if the parsed input is valid UTF-8, otherwise a
17/// string gets allocated because of the need to insert replacement characters.
18/// This is similar to how [`String::from_utf8_lossy`] works.
19pub fn take_utf8_while1<F>(cond: F) -> impl Fn(&[u8]) -> IResult<&[u8], Cow<'_, str>>
20where
21    F: Fn(char) -> bool,
22{
23    move |i: &[u8]| {
24        let mut it = utf8_iter::ErrorReportingUtf8Chars::new(i);
25        let i_len = i.len();
26        let mut rest = i;
27        // read first chunk of valid UTF-8
28        loop {
29            match it.next() {
30                Some(Ok(c)) if cond(c) => {
31                    rest = it.as_slice();
32                }
33                Some(Err(_)) => {
34                    // encountered invalid UTF-8
35                    break;
36                }
37                _ => {
38                    // end of input or cond() returned false; stop reading.
39                    //
40                    // NOTE: we are careful of using `rest` and not
41                    // `it.as_slice()` to denote the rest of the input: if we
42                    // just read a character for which cond() is false, then
43                    // this character has already been returned by the iterator
44                    // and is not part of it.as_slice() (but it is part of
45                    // `rest`, which is only advanced in the `Some(Ok(c)) if
46                    // cond(c)` branch above).
47                    let end = i_len - rest.len();
48                    if end > 0 {
49                        // SAFETY: `0..end` represents a subslice in which the
50                        // `utf8_iter` iterator recognized strictly valid UTF-8
51                        // codepoints. (We use the `ErrorReportingUtf8Chars`
52                        // iterator and break out of the loop as soon as it
53                        // encounters bytes that are not valid UTF-8.)
54                        let sub = unsafe { str::from_utf8_unchecked(&i[0..end]) };
55                        return Ok((rest, Cow::Borrowed(sub)));
56                    } else {
57                        return Err(Err::Error(Error {
58                            input: i,
59                            code: ErrorKind::TakeWhile1,
60                        }));
61                    }
62                }
63            }
64        }
65
66        // we have encountered some invalid UTF-8.
67        #[cfg(feature = "tracing-recover")]
68        warn!(input = %bytes_to_trace_string(i), "input contains invalid UTF-8");
69
70        let mut s = String::new();
71        // SAFETY: `0..end` only contains bytes on which the iterator
72        // returned Ok (same as above).
73        s.push_str(unsafe { str::from_utf8_unchecked(&i[0..i_len - rest.len()]) });
74        // push a replacement for the invalid UTF-8
75        s.push(char::REPLACEMENT_CHARACTER);
76
77        // read remaining valid and invalid text, pushing it to `s`.
78        let mut start = i_len - it.as_slice().len();
79        let mut rest = it.as_slice();
80        loop {
81            match it.next() {
82                Some(Ok(c)) if cond(c) => {
83                    rest = it.as_slice();
84                }
85                res => {
86                    // invalid utf8, end of input, or cond() returned false
87
88                    // start by pushing the valid chunk read so far
89                    let end = i_len - rest.len();
90                    // SAFETY: `start..end` only contains bytes on which the iterator
91                    // return Ok()
92                    s.push_str(unsafe { str::from_utf8_unchecked(&i[start..end]) });
93
94                    if let Some(Err(_)) = res {
95                        // if we read invalid utf8, push a replacement and continue
96                        s.push(char::REPLACEMENT_CHARACTER);
97                        start = i_len - it.as_slice().len();
98                        rest = it.as_slice();
99                    } else {
100                        // otherwise, stop reading
101                        break;
102                    }
103                }
104            }
105        }
106
107        if !s.is_empty() {
108            Ok((rest, Cow::Owned(s)))
109        } else {
110            Err(Err::Error(Error {
111                input: i,
112                code: ErrorKind::TakeWhile1,
113            }))
114        }
115    }
116}
117
/// Builds a `char` predicate that accepts every non-ASCII character and
/// defers to `cond` for ASCII ones.
///
/// `cond` receives the ASCII character as its byte value and is never called
/// for non-ASCII characters.
pub fn is_nonascii_or<F>(cond: F) -> impl Fn(char) -> bool
where
    F: Fn(u8) -> bool,
{
    move |c: char| match u8::try_from(c) {
        // `u8::try_from` also succeeds for U+0080..=U+00FF, hence the
        // explicit ASCII guard.
        Ok(byte) if byte.is_ascii() => cond(byte),
        _ => true,
    }
}
131
/// Builds a `char` predicate that rejects every non-ASCII character and
/// defers to `cond` for ASCII ones.
///
/// `cond` receives the ASCII character as its byte value and is never called
/// for non-ASCII characters.
pub fn is_ascii_and<F>(cond: F) -> impl Fn(char) -> bool
where
    F: Fn(u8) -> bool,
{
    move |c: char| match u8::try_from(c) {
        // `u8::try_from` also succeeds for U+0080..=U+00FF, hence the
        // explicit ASCII guard.
        Ok(byte) if byte.is_ascii() => cond(byte),
        _ => false,
    }
}
145
146pub fn space0_str(input: &[u8]) -> nom::IResult<&[u8], &str> {
147    let (input, sp) = space0(input)?;
148    // SAFETY: the `space0` combinator recognizes sequences of ' ' and '\t',
149    // which are ASCII.
150    Ok((input, unsafe { str::from_utf8_unchecked(sp) }))
151}
152
153pub fn space1_str(input: &[u8]) -> nom::IResult<&[u8], &str> {
154    let (input, sp) = space1(input)?;
155    // SAFETY: the `space1` combinator recognizes sequences of ' ' and '\t',
156    // which are ASCII.
157    Ok((input, unsafe { str::from_utf8_unchecked(sp) }))
158}
eml_codec/text/utf8.rs

eml_codec/text/
utf8.rs