// mail_parser/parsers/fields/content_type.rs

/*
 * SPDX-FileCopyrightText: 2020 Stalwart Labs LLC <hello@stalw.art>
 *
 * SPDX-License-Identifier: Apache-2.0 OR MIT
 */

use std::borrow::Cow;

use crate::{
    decoders::{charsets::map::charset_decoder, hex::decode_hex},
    parsers::MessageStream,
    Attribute, ContentType, HeaderValue,
};

/// Tokenizer state of the content-type parser.
///
/// The state decides where a finished token is stored and how the
/// structural bytes (`/`, `;`, `=`, `"`, `(`, `)`) are interpreted.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum ContentState {
    /// Reading the primary type (the token before `/`).
    Type,
    /// Reading the subtype (the token after `/`).
    SubType,
    /// Reading an attribute name (the token before `=`).
    AttributeName,
    /// Reading an unquoted attribute value.
    AttributeValue,
    /// Reading an attribute value enclosed in double quotes.
    AttributeQuotedValue,
    /// Inside a parenthesised comment.
    Comment,
}

/// One pending section of a continued attribute, stored as
/// `(attribute name, section number, section value)`.
///
/// Tuples order lexicographically, so sorting a list of these groups the
/// sections by name and orders them by section number.
type Continuation<'x> = (Cow<'x, str>, u32, Cow<'x, str>);

27struct ContentTypeParser<'x> {
28    state: ContentState,
29    state_stack: Vec<ContentState>,
30
31    c_type: Option<Cow<'x, str>>,
32    c_subtype: Option<Cow<'x, str>>,
33
34    attr_name: Option<Cow<'x, str>>,
35    attr_charset: Option<Cow<'x, str>>,
36    attr_position: u32,
37
38    values: Vec<Cow<'x, str>>,
39    attributes: Vec<Attribute<'x>>,
40    continuations: Option<Vec<Continuation<'x>>>,
41
42    token_start: usize,
43    token_end: usize,
44
45    is_continuation: bool,
46    is_encoded_attribute: bool,
47    is_escaped: bool,
48    remove_crlf: bool,
49    is_lower_case: bool,
50    is_token_start: bool,
51}
52
53impl<'x> ContentTypeParser<'x> {
54    #[inline(always)]
55    fn reset_parser(&mut self) {
56        self.token_start = 0;
57        self.is_token_start = true;
58    }
59
60    fn add_attribute(&mut self, stream: &MessageStream<'x>) -> bool {
61        if self.token_start > 0 {
62            let mut attr = Some(String::from_utf8_lossy(
63                &stream.data[self.token_start - 1..self.token_end],
64            ));
65
66            if !self.is_lower_case {
67                attr.as_mut().unwrap().to_mut().make_ascii_lowercase();
68                self.is_lower_case = true;
69            }
70
71            match self.state {
72                ContentState::AttributeName => self.attr_name = attr,
73                ContentState::Type => self.c_type = attr,
74                ContentState::SubType => self.c_subtype = attr,
75                _ => unreachable!(),
76            }
77
78            self.reset_parser();
79            true
80        } else {
81            false
82        }
83    }
84
85    fn add_attribute_parameter(&mut self, stream: &MessageStream<'x>) {
86        if self.token_start > 0 {
87            let attr_part =
88                String::from_utf8_lossy(&stream.data[self.token_start - 1..self.token_end]);
89
90            if self.attr_charset.is_none() {
91                self.attr_charset = attr_part.into();
92            } else {
93                let attr_name =
94                    self.attr_name.as_ref().unwrap_or(&"unknown".into()).clone() + "-language";
95
96                if !self.attributes.iter().any(|a| a.name == attr_name) {
97                    self.attributes.push(Attribute {
98                        name: attr_name,
99                        value: attr_part,
100                    });
101                } else {
102                    self.values.push("'".into());
103                    self.values.push(attr_part);
104                }
105            }
106
107            self.reset_parser();
108        }
109    }
110
111    fn add_partial_value(&mut self, stream: &MessageStream<'x>, to_cur_pos: bool) {
112        if self.token_start > 0 {
113            let in_quote = self.state == ContentState::AttributeQuotedValue;
114
115            self.values.push(String::from_utf8_lossy(
116                &stream.data[self.token_start - 1..if in_quote && to_cur_pos {
117                    stream.offset() - 1
118                } else {
119                    self.token_end
120                }],
121            ));
122            if !in_quote {
123                self.values.push(" ".into());
124            }
125
126            self.reset_parser();
127        }
128    }
129
130    fn add_value(&mut self, stream: &MessageStream<'x>) {
131        if self.attr_name.is_none() {
132            return;
133        }
134
135        let has_values = !self.values.is_empty();
136        let value = if self.token_start > 0 {
137            let value = &stream.data[self.token_start - 1..self.token_end];
138            Some(if !self.remove_crlf {
139                String::from_utf8_lossy(value)
140            } else {
141                self.remove_crlf = false;
142                match String::from_utf8(
143                    value
144                        .iter()
145                        .filter(|&&ch| ch != b'\r' && ch != b'\n')
146                        .copied()
147                        .collect::<Vec<_>>(),
148                ) {
149                    Ok(value) => value.into(),
150                    Err(err) => String::from_utf8_lossy(err.as_bytes()).into_owned().into(),
151                }
152            })
153        } else {
154            if !has_values {
155                return;
156            }
157            None
158        };
159
160        if !self.is_continuation {
161            self.attributes.push(Attribute {
162                name: self.attr_name.take().unwrap(),
163                value: if !has_values {
164                    value.unwrap()
165                } else {
166                    if let Some(value) = value {
167                        self.values.push(value);
168                    }
169                    self.values.concat().into()
170                },
171            });
172        } else {
173            let attr_name = self.attr_name.take().unwrap();
174            let mut value = if let Some(value) = value {
175                if has_values {
176                    Cow::from(self.values.concat()) + value
177                } else {
178                    value
179                }
180            } else {
181                self.values.concat().into()
182            };
183
184            if self.is_encoded_attribute {
185                if let (true, decoded_bytes) = decode_hex(value.as_bytes()) {
186                    value = if let Some(decoder) = self
187                        .attr_charset
188                        .as_ref()
189                        .and_then(|c| charset_decoder(c.as_bytes()))
190                    {
191                        decoder(&decoded_bytes).into()
192                    } else {
193                        String::from_utf8(decoded_bytes)
194                            .unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned())
195                            .into()
196                    }
197                }
198                self.is_encoded_attribute = false;
199            }
200
201            if self.attr_position > 0 {
202                let continuation = (attr_name, self.attr_position, value);
203                if let Some(continuations) = self.continuations.as_mut() {
204                    continuations.push(continuation);
205                } else {
206                    self.continuations = Some(vec![continuation]);
207                }
208
209                self.attr_position = 0;
210            } else {
211                self.attributes.push(Attribute {
212                    name: attr_name,
213                    value,
214                });
215            }
216            self.is_continuation = false;
217            self.attr_charset = None;
218        }
219
220        if has_values {
221            self.values.clear();
222        }
223
224        self.reset_parser();
225    }
226
227    fn add_attr_position(&mut self, stream: &MessageStream<'_>) -> bool {
228        if self.token_start > 0 {
229            self.attr_position =
230                String::from_utf8_lossy(&stream.data[self.token_start - 1..self.token_end])
231                    .parse()
232                    .unwrap_or(0);
233
234            self.reset_parser();
235            true
236        } else {
237            false
238        }
239    }
240
241    fn merge_continuations(&mut self) {
242        let continuations = self.continuations.as_mut().unwrap();
243        continuations.sort();
244        for (key, _, value) in continuations.drain(..) {
245            if let Some(old) = self.attributes.iter_mut().find(|a| a.name == key) {
246                old.value = format!("{}{value}", old.value).into();
247            } else {
248                self.attributes.push(Attribute { name: key, value });
249            }
250        }
251    }
252}
253
254impl<'x> MessageStream<'x> {
255    pub fn parse_content_type(&mut self) -> HeaderValue<'x> {
256        let mut parser = ContentTypeParser {
257            state: ContentState::Type,
258            state_stack: Vec::new(),
259
260            c_type: None,
261            c_subtype: None,
262
263            attr_name: None,
264            attr_charset: None,
265            attr_position: 0,
266
267            attributes: Vec::new(),
268            values: Vec::new(),
269            continuations: None,
270
271            is_continuation: false,
272            is_encoded_attribute: false,
273            is_lower_case: true,
274            is_token_start: true,
275            is_escaped: false,
276            remove_crlf: false,
277
278            token_start: 0,
279            token_end: 0,
280        };
281
282        while let Some(ch) = self.next() {
283            match ch {
284                b' ' | b'\t' => {
285                    if !parser.is_token_start {
286                        parser.is_token_start = true;
287                    }
288                    if let ContentState::AttributeQuotedValue = parser.state {
289                        if parser.token_start == 0 {
290                            parser.token_start = self.offset();
291                            parser.token_end = parser.token_start;
292                        } else {
293                            parser.token_end = self.offset();
294                        }
295                    }
296                    continue;
297                }
298                b'A'..=b'Z' => {
299                    if parser.is_lower_case {
300                        if let ContentState::Type
301                        | ContentState::SubType
302                        | ContentState::AttributeName = parser.state
303                        {
304                            parser.is_lower_case = false;
305                        }
306                    }
307                }
308                b'\n' => {
309                    let next_is_space = self.peek_next_is_space();
310                    match parser.state {
311                        ContentState::Type
312                        | ContentState::AttributeName
313                        | ContentState::SubType => {
314                            parser.add_attribute(self);
315                        }
316                        ContentState::AttributeValue => {
317                            parser.add_value(self);
318                        }
319                        ContentState::AttributeQuotedValue => {
320                            if next_is_space {
321                                self.next();
322                                parser.remove_crlf = true;
323                                continue;
324                            } else {
325                                parser.add_value(self);
326                            }
327                        }
328                        _ => (),
329                    }
330
331                    if next_is_space {
332                        if parser.state == ContentState::Type {
333                            continue;
334                        }
335                        parser.state = ContentState::AttributeName;
336                        self.next();
337
338                        if !parser.is_token_start {
339                            parser.is_token_start = true;
340                        }
341                        continue;
342                    } else {
343                        if parser.continuations.is_some() {
344                            parser.merge_continuations();
345                        }
346
347                        return if let Some(content_type) = parser.c_type {
348                            HeaderValue::ContentType(ContentType {
349                                c_type: content_type,
350                                c_subtype: parser.c_subtype.take(),
351                                attributes: if !parser.attributes.is_empty() {
352                                    Some(parser.attributes)
353                                } else {
354                                    None
355                                },
356                            })
357                        } else {
358                            HeaderValue::Empty
359                        };
360                    }
361                }
362                b'/' if parser.state == ContentState::Type => {
363                    parser.add_attribute(self);
364                    parser.state = ContentState::SubType;
365                    continue;
366                }
367                b';' => match parser.state {
368                    ContentState::Type | ContentState::SubType | ContentState::AttributeName => {
369                        parser.add_attribute(self);
370                        parser.state = ContentState::AttributeName;
371                        continue;
372                    }
373                    ContentState::AttributeValue => {
374                        if !parser.is_escaped {
375                            parser.add_value(self);
376                            parser.state = ContentState::AttributeName;
377                        } else {
378                            parser.is_escaped = false;
379                        }
380                        continue;
381                    }
382                    _ => (),
383                },
384                b'*' if parser.state == ContentState::AttributeName => {
385                    if !parser.is_continuation {
386                        parser.is_continuation = parser.add_attribute(self);
387                    } else if !parser.is_encoded_attribute {
388                        parser.add_attr_position(self);
389                        parser.is_encoded_attribute = true;
390                    } else {
391                        // Malformed data, reset parser.
392                        parser.reset_parser();
393                    }
394                    continue;
395                }
396                b'=' => match parser.state {
397                    ContentState::AttributeName => {
398                        if !parser.is_continuation {
399                            if !parser.add_attribute(self) {
400                                continue;
401                            }
402                        } else if !parser.is_encoded_attribute {
403                            /* If is_continuation=true && is_encoded_attribute=false,
404                            the last character was a '*' which means encoding */
405                            parser.is_encoded_attribute = !parser.add_attr_position(self);
406                        } else {
407                            parser.reset_parser();
408                        }
409                        parser.state = ContentState::AttributeValue;
410                        continue;
411                    }
412                    ContentState::AttributeValue | ContentState::AttributeQuotedValue
413                        if parser.is_token_start && self.peek_char(b'?') =>
414                    {
415                        self.checkpoint();
416                        if let Some(token) = self.decode_rfc2047() {
417                            parser.add_partial_value(self, false);
418                            parser.values.push(token.into());
419                            continue;
420                        }
421                        self.restore();
422                    }
423                    _ => (),
424                },
425                b'\"' => match parser.state {
426                    ContentState::AttributeValue => {
427                        if !parser.is_token_start {
428                            parser.is_token_start = true;
429                        }
430                        parser.state = ContentState::AttributeQuotedValue;
431                        continue;
432                    }
433                    ContentState::AttributeQuotedValue => {
434                        if !parser.is_escaped {
435                            parser.add_value(self);
436                            parser.state = ContentState::AttributeName;
437                            continue;
438                        } else {
439                            parser.is_escaped = false;
440                        }
441                    }
442                    _ => continue,
443                },
444                b'\\' => match parser.state {
445                    ContentState::AttributeQuotedValue | ContentState::AttributeValue => {
446                        if !parser.is_escaped {
447                            parser.add_partial_value(self, true);
448                            parser.is_escaped = true;
449                            continue;
450                        } else {
451                            parser.is_escaped = false;
452                        }
453                    }
454                    ContentState::Comment => parser.is_escaped = !parser.is_escaped,
455                    _ => continue,
456                },
457                b'\''
458                    if parser.is_encoded_attribute
459                        && !parser.is_escaped
460                        && (parser.state == ContentState::AttributeValue
461                            || parser.state == ContentState::AttributeQuotedValue) =>
462                {
463                    parser.add_attribute_parameter(self);
464                    continue;
465                }
466                b'(' if parser.state != ContentState::AttributeQuotedValue => {
467                    if !parser.is_escaped {
468                        match parser.state {
469                            ContentState::Type
470                            | ContentState::AttributeName
471                            | ContentState::SubType => {
472                                parser.add_attribute(self);
473                            }
474                            ContentState::AttributeValue => {
475                                parser.add_value(self);
476                            }
477                            _ => (),
478                        }
479
480                        parser.state_stack.push(parser.state);
481                        parser.state = ContentState::Comment;
482                    } else {
483                        parser.is_escaped = false;
484                    }
485                    continue;
486                }
487                b')' if parser.state == ContentState::Comment => {
488                    if !parser.is_escaped {
489                        parser.state = parser.state_stack.pop().unwrap();
490                        parser.reset_parser();
491                    } else {
492                        parser.is_escaped = false;
493                    }
494                    continue;
495                }
496                b'\r' => continue,
497                _ => (),
498            }
499
500            if parser.is_escaped {
501                parser.is_escaped = false;
502            }
503
504            if parser.is_token_start {
505                parser.is_token_start = false;
506            }
507
508            if parser.token_start == 0 {
509                parser.token_start = self.offset();
510                parser.token_end = parser.token_start;
511            } else {
512                parser.token_end = self.offset();
513            }
514        }
515
516        HeaderValue::Empty
517    }
518}
#[cfg(test)]
mod tests {
    use crate::parsers::{fields::load_tests, MessageStream};

    /// Parses every header listed in the `content_type.json` fixture and
    /// compares the result with the expected value stored next to it.
    #[test]
    fn parse_content_fields() {
        for test in load_tests("content_type.json") {
            assert_eq!(
                MessageStream::new(test.header.as_bytes())
                    .parse_content_type()
                    .into_content_type(),
                test.expected,
                "failed for {:?}",
                test.header
            );
        }

        // Disabled snippet, apparently meant for regenerating the fixture.
        // NOTE(review): `inputs` is not defined here and has to be supplied
        // before this can be re-enabled.
        /*let mut builder = crate::parsers::fields::TestBuilder::new("content_type.json");

        for input in inputs {
            println!("Testing: {:?}", input.0);
            let result = MessageStream::new(input.0.as_bytes())
                .parse_content_type()
                .into_content_type();

            builder.add(input.0.to_string(), result);
        }

        builder.write();*/
    }
}