eml_codec/text/utf8.rs
1#[cfg(feature = "tracing-recover")]
2use crate::utils::bytes_to_trace_string;
3use nom::{
4 character::complete::{space0, space1},
5 error::{Error, ErrorKind},
6 Err, IResult,
7};
8use std::borrow::Cow;
9#[cfg(feature = "tracing-recover")]
10use tracing::warn;
11
12/// Parses the input as a sequence of UTF-8 characters that satisfy the
13/// predicate `cond`. If invalid UTF-8 is encountered, it is replaced by
14/// [`char::REPLACEMENT_CHARACTER`] and parsing continues.
15///
16/// This function is zero-copy if the parsed input is valid UTF-8, otherwise a
17/// string gets allocated because of the need to insert replacement characters.
18/// This is similar to how [`String::from_utf8_lossy`] works.
19pub fn take_utf8_while1<F>(cond: F) -> impl Fn(&[u8]) -> IResult<&[u8], Cow<'_, str>>
20where
21 F: Fn(char) -> bool,
22{
23 move |i: &[u8]| {
24 let mut it = utf8_iter::ErrorReportingUtf8Chars::new(i);
25 let i_len = i.len();
26 let mut rest = i;
27 // read first chunk of valid UTF-8
28 loop {
29 match it.next() {
30 Some(Ok(c)) if cond(c) => {
31 rest = it.as_slice();
32 }
33 Some(Err(_)) => {
34 // encountered invalid UTF-8
35 break;
36 }
37 _ => {
38 // end of input or cond() returned false; stop reading.
39 //
40 // NOTE: we are careful of using `rest` and not
41 // `it.as_slice()` to denote the rest of the input: if we
42 // just read a character for which cond() is false, then
43 // this character has already been returned by the iterator
44 // and is not part of it.as_slice() (but it is part of
45 // `rest`, which is only advanced in the `Some(Ok(c)) if
46 // cond(c)` branch above).
47 let end = i_len - rest.len();
48 if end > 0 {
49 // SAFETY: `0..end` represents a subslice in which the
50 // `utf8_iter` iterator recognized strictly valid UTF-8
51 // codepoints. (We use the `ErrorReportingUtf8Chars`
52 // iterator and break out of the loop as soon as it
53 // encounters bytes that are not valid UTF-8.)
54 let sub = unsafe { str::from_utf8_unchecked(&i[0..end]) };
55 return Ok((rest, Cow::Borrowed(sub)));
56 } else {
57 return Err(Err::Error(Error {
58 input: i,
59 code: ErrorKind::TakeWhile1,
60 }));
61 }
62 }
63 }
64 }
65
66 // we have encountered some invalid UTF-8.
67 #[cfg(feature = "tracing-recover")]
68 warn!(input = %bytes_to_trace_string(i), "input contains invalid UTF-8");
69
70 let mut s = String::new();
71 // SAFETY: `0..end` only contains bytes on which the iterator
72 // returned Ok (same as above).
73 s.push_str(unsafe { str::from_utf8_unchecked(&i[0..i_len - rest.len()]) });
74 // push a replacement for the invalid UTF-8
75 s.push(char::REPLACEMENT_CHARACTER);
76
77 // read remaining valid and invalid text, pushing it to `s`.
78 let mut start = i_len - it.as_slice().len();
79 let mut rest = it.as_slice();
80 loop {
81 match it.next() {
82 Some(Ok(c)) if cond(c) => {
83 rest = it.as_slice();
84 }
85 res => {
86 // invalid utf8, end of input, or cond() returned false
87
88 // start by pushing the valid chunk read so far
89 let end = i_len - rest.len();
90 // SAFETY: `start..end` only contains bytes on which the iterator
91 // return Ok()
92 s.push_str(unsafe { str::from_utf8_unchecked(&i[start..end]) });
93
94 if let Some(Err(_)) = res {
95 // if we read invalid utf8, push a replacement and continue
96 s.push(char::REPLACEMENT_CHARACTER);
97 start = i_len - it.as_slice().len();
98 rest = it.as_slice();
99 } else {
100 // otherwise, stop reading
101 break;
102 }
103 }
104 }
105 }
106
107 if !s.is_empty() {
108 Ok((rest, Cow::Owned(s)))
109 } else {
110 Err(Err::Error(Error {
111 input: i,
112 code: ErrorKind::TakeWhile1,
113 }))
114 }
115 }
116}
117
/// Lifts a byte predicate to a `char` predicate that accepts every non-ASCII
/// character.
///
/// The returned closure forwards ASCII characters to `cond` (as their byte
/// value) and returns `true` for anything else without calling `cond`.
pub fn is_nonascii_or<F>(cond: F) -> impl Fn(char) -> bool
where
    F: Fn(u8) -> bool,
{
    move |c: char| match u8::try_from(c) {
        // `try_from` also succeeds for U+0080..=U+00FF, hence the extra guard.
        Ok(byte) if byte.is_ascii() => cond(byte),
        _ => true,
    }
}
131
/// Lifts a byte predicate to a `char` predicate that rejects every non-ASCII
/// character.
///
/// The returned closure forwards ASCII characters to `cond` (as their byte
/// value) and returns `false` for anything else without calling `cond`.
pub fn is_ascii_and<F>(cond: F) -> impl Fn(char) -> bool
where
    F: Fn(u8) -> bool,
{
    move |c: char| match u8::try_from(c) {
        // `try_from` also succeeds for U+0080..=U+00FF, hence the extra guard.
        Ok(byte) if byte.is_ascii() => cond(byte),
        _ => false,
    }
}
145
146pub fn space0_str(input: &[u8]) -> nom::IResult<&[u8], &str> {
147 let (input, sp) = space0(input)?;
148 // SAFETY: the `space0` combinator recognizes sequences of ' ' and '\t',
149 // which are ASCII.
150 Ok((input, unsafe { str::from_utf8_unchecked(sp) }))
151}
152
153pub fn space1_str(input: &[u8]) -> nom::IResult<&[u8], &str> {
154 let (input, sp) = space1(input)?;
155 // SAFETY: the `space1` combinator recognizes sequences of ' ' and '\t',
156 // which are ASCII.
157 Ok((input, unsafe { str::from_utf8_unchecked(sp) }))
158}