mail_parser/parsers/fields/
content_type.rs1use std::borrow::Cow;
8
9use crate::{
10 decoders::{charsets::map::charset_decoder, hex::decode_hex},
11 parsers::MessageStream,
12 Attribute, ContentType, HeaderValue,
13};
14
/// Lexical state of the content-type header state machine.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum ContentState {
    /// Reading the primary type (the token before `/`).
    Type,
    /// Reading the subtype (the token after `/`).
    SubType,
    /// Reading a parameter name.
    AttributeName,
    /// Reading an unquoted parameter value (after `=`).
    AttributeValue,
    /// Reading a parameter value enclosed in double quotes.
    AttributeQuotedValue,
    /// Inside a parenthesised comment; the interrupted state is kept on the
    /// parser's state stack and restored at the matching `)`.
    Comment,
}
24
/// One fragment of a parameter that was split into numbered sections:
/// `(parameter name, section number, fragment value)`.
///
/// Tuples order by name first and section number second, so sorting a list of
/// fragments groups each parameter's sections in ascending order.
type Continuation<'x> = (Cow<'x, str>, u32, Cow<'x, str>);
26
27struct ContentTypeParser<'x> {
28 state: ContentState,
29 state_stack: Vec<ContentState>,
30
31 c_type: Option<Cow<'x, str>>,
32 c_subtype: Option<Cow<'x, str>>,
33
34 attr_name: Option<Cow<'x, str>>,
35 attr_charset: Option<Cow<'x, str>>,
36 attr_position: u32,
37
38 values: Vec<Cow<'x, str>>,
39 attributes: Vec<Attribute<'x>>,
40 continuations: Option<Vec<Continuation<'x>>>,
41
42 token_start: usize,
43 token_end: usize,
44
45 is_continuation: bool,
46 is_encoded_attribute: bool,
47 is_escaped: bool,
48 remove_crlf: bool,
49 is_lower_case: bool,
50 is_token_start: bool,
51}
52
53impl<'x> ContentTypeParser<'x> {
54 #[inline(always)]
55 fn reset_parser(&mut self) {
56 self.token_start = 0;
57 self.is_token_start = true;
58 }
59
60 fn add_attribute(&mut self, stream: &MessageStream<'x>) -> bool {
61 if self.token_start > 0 {
62 let mut attr = Some(String::from_utf8_lossy(
63 &stream.data[self.token_start - 1..self.token_end],
64 ));
65
66 if !self.is_lower_case {
67 attr.as_mut().unwrap().to_mut().make_ascii_lowercase();
68 self.is_lower_case = true;
69 }
70
71 match self.state {
72 ContentState::AttributeName => self.attr_name = attr,
73 ContentState::Type => self.c_type = attr,
74 ContentState::SubType => self.c_subtype = attr,
75 _ => unreachable!(),
76 }
77
78 self.reset_parser();
79 true
80 } else {
81 false
82 }
83 }
84
85 fn add_attribute_parameter(&mut self, stream: &MessageStream<'x>) {
86 if self.token_start > 0 {
87 let attr_part =
88 String::from_utf8_lossy(&stream.data[self.token_start - 1..self.token_end]);
89
90 if self.attr_charset.is_none() {
91 self.attr_charset = attr_part.into();
92 } else {
93 let attr_name =
94 self.attr_name.as_ref().unwrap_or(&"unknown".into()).clone() + "-language";
95
96 if !self.attributes.iter().any(|a| a.name == attr_name) {
97 self.attributes.push(Attribute {
98 name: attr_name,
99 value: attr_part,
100 });
101 } else {
102 self.values.push("'".into());
103 self.values.push(attr_part);
104 }
105 }
106
107 self.reset_parser();
108 }
109 }
110
111 fn add_partial_value(&mut self, stream: &MessageStream<'x>, to_cur_pos: bool) {
112 if self.token_start > 0 {
113 let in_quote = self.state == ContentState::AttributeQuotedValue;
114
115 self.values.push(String::from_utf8_lossy(
116 &stream.data[self.token_start - 1..if in_quote && to_cur_pos {
117 stream.offset() - 1
118 } else {
119 self.token_end
120 }],
121 ));
122 if !in_quote {
123 self.values.push(" ".into());
124 }
125
126 self.reset_parser();
127 }
128 }
129
130 fn add_value(&mut self, stream: &MessageStream<'x>) {
131 if self.attr_name.is_none() {
132 return;
133 }
134
135 let has_values = !self.values.is_empty();
136 let value = if self.token_start > 0 {
137 let value = &stream.data[self.token_start - 1..self.token_end];
138 Some(if !self.remove_crlf {
139 String::from_utf8_lossy(value)
140 } else {
141 self.remove_crlf = false;
142 match String::from_utf8(
143 value
144 .iter()
145 .filter(|&&ch| ch != b'\r' && ch != b'\n')
146 .copied()
147 .collect::<Vec<_>>(),
148 ) {
149 Ok(value) => value.into(),
150 Err(err) => String::from_utf8_lossy(err.as_bytes()).into_owned().into(),
151 }
152 })
153 } else {
154 if !has_values {
155 return;
156 }
157 None
158 };
159
160 if !self.is_continuation {
161 self.attributes.push(Attribute {
162 name: self.attr_name.take().unwrap(),
163 value: if !has_values {
164 value.unwrap()
165 } else {
166 if let Some(value) = value {
167 self.values.push(value);
168 }
169 self.values.concat().into()
170 },
171 });
172 } else {
173 let attr_name = self.attr_name.take().unwrap();
174 let mut value = if let Some(value) = value {
175 if has_values {
176 Cow::from(self.values.concat()) + value
177 } else {
178 value
179 }
180 } else {
181 self.values.concat().into()
182 };
183
184 if self.is_encoded_attribute {
185 if let (true, decoded_bytes) = decode_hex(value.as_bytes()) {
186 value = if let Some(decoder) = self
187 .attr_charset
188 .as_ref()
189 .and_then(|c| charset_decoder(c.as_bytes()))
190 {
191 decoder(&decoded_bytes).into()
192 } else {
193 String::from_utf8(decoded_bytes)
194 .unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned())
195 .into()
196 }
197 }
198 self.is_encoded_attribute = false;
199 }
200
201 if self.attr_position > 0 {
202 let continuation = (attr_name, self.attr_position, value);
203 if let Some(continuations) = self.continuations.as_mut() {
204 continuations.push(continuation);
205 } else {
206 self.continuations = Some(vec![continuation]);
207 }
208
209 self.attr_position = 0;
210 } else {
211 self.attributes.push(Attribute {
212 name: attr_name,
213 value,
214 });
215 }
216 self.is_continuation = false;
217 self.attr_charset = None;
218 }
219
220 if has_values {
221 self.values.clear();
222 }
223
224 self.reset_parser();
225 }
226
227 fn add_attr_position(&mut self, stream: &MessageStream<'_>) -> bool {
228 if self.token_start > 0 {
229 self.attr_position =
230 String::from_utf8_lossy(&stream.data[self.token_start - 1..self.token_end])
231 .parse()
232 .unwrap_or(0);
233
234 self.reset_parser();
235 true
236 } else {
237 false
238 }
239 }
240
241 fn merge_continuations(&mut self) {
242 let continuations = self.continuations.as_mut().unwrap();
243 continuations.sort();
244 for (key, _, value) in continuations.drain(..) {
245 if let Some(old) = self.attributes.iter_mut().find(|a| a.name == key) {
246 old.value = format!("{}{value}", old.value).into();
247 } else {
248 self.attributes.push(Attribute { name: key, value });
249 }
250 }
251 }
252}
253
254impl<'x> MessageStream<'x> {
255 pub fn parse_content_type(&mut self) -> HeaderValue<'x> {
256 let mut parser = ContentTypeParser {
257 state: ContentState::Type,
258 state_stack: Vec::new(),
259
260 c_type: None,
261 c_subtype: None,
262
263 attr_name: None,
264 attr_charset: None,
265 attr_position: 0,
266
267 attributes: Vec::new(),
268 values: Vec::new(),
269 continuations: None,
270
271 is_continuation: false,
272 is_encoded_attribute: false,
273 is_lower_case: true,
274 is_token_start: true,
275 is_escaped: false,
276 remove_crlf: false,
277
278 token_start: 0,
279 token_end: 0,
280 };
281
282 while let Some(ch) = self.next() {
283 match ch {
284 b' ' | b'\t' => {
285 if !parser.is_token_start {
286 parser.is_token_start = true;
287 }
288 if let ContentState::AttributeQuotedValue = parser.state {
289 if parser.token_start == 0 {
290 parser.token_start = self.offset();
291 parser.token_end = parser.token_start;
292 } else {
293 parser.token_end = self.offset();
294 }
295 }
296 continue;
297 }
298 b'A'..=b'Z' => {
299 if parser.is_lower_case {
300 if let ContentState::Type
301 | ContentState::SubType
302 | ContentState::AttributeName = parser.state
303 {
304 parser.is_lower_case = false;
305 }
306 }
307 }
308 b'\n' => {
309 let next_is_space = self.peek_next_is_space();
310 match parser.state {
311 ContentState::Type
312 | ContentState::AttributeName
313 | ContentState::SubType => {
314 parser.add_attribute(self);
315 }
316 ContentState::AttributeValue => {
317 parser.add_value(self);
318 }
319 ContentState::AttributeQuotedValue => {
320 if next_is_space {
321 self.next();
322 parser.remove_crlf = true;
323 continue;
324 } else {
325 parser.add_value(self);
326 }
327 }
328 _ => (),
329 }
330
331 if next_is_space {
332 if parser.state == ContentState::Type {
333 continue;
334 }
335 parser.state = ContentState::AttributeName;
336 self.next();
337
338 if !parser.is_token_start {
339 parser.is_token_start = true;
340 }
341 continue;
342 } else {
343 if parser.continuations.is_some() {
344 parser.merge_continuations();
345 }
346
347 return if let Some(content_type) = parser.c_type {
348 HeaderValue::ContentType(ContentType {
349 c_type: content_type,
350 c_subtype: parser.c_subtype.take(),
351 attributes: if !parser.attributes.is_empty() {
352 Some(parser.attributes)
353 } else {
354 None
355 },
356 })
357 } else {
358 HeaderValue::Empty
359 };
360 }
361 }
362 b'/' if parser.state == ContentState::Type => {
363 parser.add_attribute(self);
364 parser.state = ContentState::SubType;
365 continue;
366 }
367 b';' => match parser.state {
368 ContentState::Type | ContentState::SubType | ContentState::AttributeName => {
369 parser.add_attribute(self);
370 parser.state = ContentState::AttributeName;
371 continue;
372 }
373 ContentState::AttributeValue => {
374 if !parser.is_escaped {
375 parser.add_value(self);
376 parser.state = ContentState::AttributeName;
377 } else {
378 parser.is_escaped = false;
379 }
380 continue;
381 }
382 _ => (),
383 },
384 b'*' if parser.state == ContentState::AttributeName => {
385 if !parser.is_continuation {
386 parser.is_continuation = parser.add_attribute(self);
387 } else if !parser.is_encoded_attribute {
388 parser.add_attr_position(self);
389 parser.is_encoded_attribute = true;
390 } else {
391 parser.reset_parser();
393 }
394 continue;
395 }
396 b'=' => match parser.state {
397 ContentState::AttributeName => {
398 if !parser.is_continuation {
399 if !parser.add_attribute(self) {
400 continue;
401 }
402 } else if !parser.is_encoded_attribute {
403 parser.is_encoded_attribute = !parser.add_attr_position(self);
406 } else {
407 parser.reset_parser();
408 }
409 parser.state = ContentState::AttributeValue;
410 continue;
411 }
412 ContentState::AttributeValue | ContentState::AttributeQuotedValue
413 if parser.is_token_start && self.peek_char(b'?') =>
414 {
415 self.checkpoint();
416 if let Some(token) = self.decode_rfc2047() {
417 parser.add_partial_value(self, false);
418 parser.values.push(token.into());
419 continue;
420 }
421 self.restore();
422 }
423 _ => (),
424 },
425 b'\"' => match parser.state {
426 ContentState::AttributeValue => {
427 if !parser.is_token_start {
428 parser.is_token_start = true;
429 }
430 parser.state = ContentState::AttributeQuotedValue;
431 continue;
432 }
433 ContentState::AttributeQuotedValue => {
434 if !parser.is_escaped {
435 parser.add_value(self);
436 parser.state = ContentState::AttributeName;
437 continue;
438 } else {
439 parser.is_escaped = false;
440 }
441 }
442 _ => continue,
443 },
444 b'\\' => match parser.state {
445 ContentState::AttributeQuotedValue | ContentState::AttributeValue => {
446 if !parser.is_escaped {
447 parser.add_partial_value(self, true);
448 parser.is_escaped = true;
449 continue;
450 } else {
451 parser.is_escaped = false;
452 }
453 }
454 ContentState::Comment => parser.is_escaped = !parser.is_escaped,
455 _ => continue,
456 },
457 b'\''
458 if parser.is_encoded_attribute
459 && !parser.is_escaped
460 && (parser.state == ContentState::AttributeValue
461 || parser.state == ContentState::AttributeQuotedValue) =>
462 {
463 parser.add_attribute_parameter(self);
464 continue;
465 }
466 b'(' if parser.state != ContentState::AttributeQuotedValue => {
467 if !parser.is_escaped {
468 match parser.state {
469 ContentState::Type
470 | ContentState::AttributeName
471 | ContentState::SubType => {
472 parser.add_attribute(self);
473 }
474 ContentState::AttributeValue => {
475 parser.add_value(self);
476 }
477 _ => (),
478 }
479
480 parser.state_stack.push(parser.state);
481 parser.state = ContentState::Comment;
482 } else {
483 parser.is_escaped = false;
484 }
485 continue;
486 }
487 b')' if parser.state == ContentState::Comment => {
488 if !parser.is_escaped {
489 parser.state = parser.state_stack.pop().unwrap();
490 parser.reset_parser();
491 } else {
492 parser.is_escaped = false;
493 }
494 continue;
495 }
496 b'\r' => continue,
497 _ => (),
498 }
499
500 if parser.is_escaped {
501 parser.is_escaped = false;
502 }
503
504 if parser.is_token_start {
505 parser.is_token_start = false;
506 }
507
508 if parser.token_start == 0 {
509 parser.token_start = self.offset();
510 parser.token_end = parser.token_start;
511 } else {
512 parser.token_end = self.offset();
513 }
514 }
515
516 HeaderValue::Empty
517 }
518}
#[cfg(test)]
mod tests {
    use crate::parsers::{fields::load_tests, MessageStream};

    /// Runs every case from `content_type.json` through the content-type
    /// parser and compares the result with the expected value.
    #[test]
    fn parse_content_fields() {
        for case in load_tests("content_type.json") {
            let parsed = MessageStream::new(case.header.as_bytes())
                .parse_content_type()
                .into_content_type();

            assert_eq!(parsed, case.expected, "failed for {:?}", case.header);
        }
    }
}