Skip to main content

docspec_http/
mime_parser.rs

1//! HTTP `Accept` negotiation and `Content-Type` validation for the conversion API.
2
3use axum::http::HeaderValue;
4use docspec::{InputFormat, OutputFormat};
5
6use crate::error::HttpError;
7use crate::format::{
8    OUTPUT_MIME_ALIAS, OUTPUT_MIME_HTML_PRIMARY, OUTPUT_MIME_OXA_PRIMARY,
9    OUTPUT_MIME_PANDOC_NATIVE_PRIMARY, OUTPUT_MIME_PRIMARY,
10};
11
12/// Negotiates the `Accept` header for the `/conversion` endpoint.
13///
14/// Returns [`OutputFormat::Oxa`] for `Accept: application/vnd.oxa+json`,
15/// [`OutputFormat::PandocNative`] for `Accept: application/vnd.pandoc.native`,
16/// [`OutputFormat::Html`] for `Accept: text/html`, and [`OutputFormat::Blocknote`]
17/// for the `BlockNote` MIMEs, `application/*`, `*/*`, and missing `Accept`.
18/// Wildcards default to `BlockNote` for back-compat with pre-oxa clients. When
19/// `Accept` lists multiple types, the first whose bare MIME matches a supported
20/// value wins (case-insensitive); `q=...` is stripped.
21///
22/// # Errors
23///
24/// Returns [`HttpError::NotAcceptable`] if no acceptable MIME type found.
25#[inline]
26pub fn negotiate_accept(header_value: Option<&HeaderValue>) -> Result<OutputFormat, HttpError> {
27    // Missing Accept == */* per RFC 7231 §5.3.2
28    let Some(header_val) = header_value else {
29        return Ok(OutputFormat::Blocknote);
30    };
31    let header_str = header_val
32        .to_str()
33        .map_err(|_err| HttpError::NotAcceptable)?;
34
35    for part in header_str.split(',') {
36        let type_part = part.trim().split(';').next().map_or("", str::trim);
37        if type_part.eq_ignore_ascii_case(OUTPUT_MIME_OXA_PRIMARY) {
38            return Ok(OutputFormat::Oxa);
39        }
40        if type_part.eq_ignore_ascii_case(OUTPUT_MIME_PANDOC_NATIVE_PRIMARY) {
41            return Ok(OutputFormat::PandocNative);
42        }
43        if type_part.eq_ignore_ascii_case(OUTPUT_MIME_HTML_PRIMARY) {
44            return Ok(OutputFormat::Html);
45        }
46        if type_part.eq_ignore_ascii_case("*/*")
47            || type_part.eq_ignore_ascii_case("application/*")
48            || type_part.eq_ignore_ascii_case(OUTPUT_MIME_PRIMARY)
49            || type_part.eq_ignore_ascii_case(OUTPUT_MIME_ALIAS)
50        {
51            return Ok(OutputFormat::Blocknote);
52        }
53    }
54    Err(HttpError::NotAcceptable)
55}
56
57/// Validates the `Content-Type` header for the `/conversion` endpoint and
58/// resolves it to the matching reader format.
59///
60/// Accepts `text/markdown` ([`InputFormat::Markdown`]), `text/html`
61/// ([`InputFormat::Html`]), and
62/// `application/vnd.openxmlformats-officedocument.wordprocessingml.document`
63/// ([`InputFormat::Docx`]). Text types accept no charset, or `charset=utf-8`
64/// (case-insensitive). DOCX is a binary format — no parameters are allowed at
65/// all. Any other charset is rejected — the handler always decodes the body as
66/// UTF-8, so a non-UTF-8 charset is unsupportable. Returns `Err` if the header
67/// is missing, malformed, the MIME type is unsupported, the charset is anything
68/// other than `utf-8` for text types, or any parameter is present on DOCX.
69///
70/// # Errors
71///
72/// Returns [`HttpError::UnsupportedMediaType`] with the received value (or `None` if missing).
73#[inline]
74pub fn validate_content_type(header_value: Option<&HeaderValue>) -> Result<InputFormat, HttpError> {
75    let Some(header_val) = header_value else {
76        return Err(HttpError::UnsupportedMediaType { received: None });
77    };
78    let header_str = header_val
79        .to_str()
80        .ok()
81        .ok_or_else(|| HttpError::UnsupportedMediaType {
82            received: Some("<invalid header value>".to_owned()),
83        })?;
84    let parsed: mime::Mime =
85        header_str
86            .parse()
87            .ok()
88            .ok_or_else(|| HttpError::UnsupportedMediaType {
89                received: Some(header_str.to_owned()),
90            })?;
91    // DOCX: strict — no parameters allowed (binary format; charset is meaningless)
92    if parsed.type_() == mime::APPLICATION
93        && parsed.subtype().as_str()
94            == "vnd.openxmlformats-officedocument.wordprocessingml.document"
95    {
96        if parsed.params().next().is_some() {
97            return Err(HttpError::UnsupportedMediaType {
98                received: Some(header_str.to_owned()),
99            });
100        }
101        return Ok(InputFormat::Docx);
102    }
103    let format = match (parsed.type_(), parsed.subtype().as_str()) {
104        (mime::TEXT, "markdown") => InputFormat::Markdown,
105        (mime::TEXT, "html") => InputFormat::Html,
106        _ => {
107            return Err(HttpError::UnsupportedMediaType {
108                received: Some(header_str.to_owned()),
109            });
110        }
111    };
112    if let Some(charset) = parsed.get_param(mime::CHARSET) {
113        if !charset.as_str().eq_ignore_ascii_case("utf-8") {
114            return Err(HttpError::UnsupportedMediaType {
115                received: Some(header_str.to_owned()),
116            });
117        }
118    }
119    // Strict: only the optional charset parameter is allowed. Unknown params
120    // (e.g. `boundary`, `format`) cause 415 to prevent accidental acceptance
121    // of unrelated media types that happen to share the text/markdown prefix.
122    for (name, _) in parsed.params() {
123        if name != mime::CHARSET {
124            return Err(HttpError::UnsupportedMediaType {
125                received: Some(header_str.to_owned()),
126            });
127        }
128    }
129    Ok(format)
130}
131
132/// Returns the bounded `input_mime_type` label value for a `Content-Type` header.
133///
134/// This function is intentionally MORE permissive than [`validate_content_type`]:
135/// it returns the matching label for any `text/markdown`, `text/html`, or DOCX
136/// value regardless of charset or other parameters, because the label answers
137/// "what did the client try to send?" rather than "is it valid?".
138///
139/// # Label values
140///
141/// - [`crate::metrics::INPUT_MIME_NONE`] — header absent
142/// - [`crate::metrics::INPUT_MIME_MARKDOWN`] — `text/markdown` (any params)
143/// - [`crate::metrics::INPUT_MIME_HTML`] — `text/html` (any params)
144/// - [`crate::metrics::INPUT_MIME_DOCX`] — `application/vnd.openxmlformats-officedocument.wordprocessingml.document` (any params)
145/// - [`crate::metrics::INPUT_MIME_UNSUPPORTED`] — anything else
146#[must_use]
147#[inline]
148pub fn bucket_input_mime(header_value: Option<&HeaderValue>) -> &'static str {
149    let Some(header_val) = header_value else {
150        return crate::metrics::INPUT_MIME_NONE;
151    };
152    let Ok(header_str) = header_val.to_str() else {
153        return crate::metrics::INPUT_MIME_UNSUPPORTED;
154    };
155    let Ok(parsed) = header_str.parse::<mime::Mime>() else {
156        return crate::metrics::INPUT_MIME_UNSUPPORTED;
157    };
158    match (parsed.type_(), parsed.subtype().as_str()) {
159        (mime::TEXT, "markdown") => crate::metrics::INPUT_MIME_MARKDOWN,
160        (mime::TEXT, "html") => crate::metrics::INPUT_MIME_HTML,
161        (mime::APPLICATION, "vnd.openxmlformats-officedocument.wordprocessingml.document") => {
162            crate::metrics::INPUT_MIME_DOCX
163        }
164        _ => crate::metrics::INPUT_MIME_UNSUPPORTED,
165    }
166}
167
168/// Returns the bounded `output_mime_type` label value for a conversion outcome.
169///
170/// `chosen_format` is `None` for any error path, and `Some(format)` on success.
171#[inline]
172#[must_use]
173pub fn bucket_output_mime(chosen_format: Option<OutputFormat>) -> &'static str {
174    match chosen_format {
175        Some(OutputFormat::Blocknote) => crate::metrics::OUTPUT_MIME_BLOCKNOTE,
176        Some(OutputFormat::Html) => crate::metrics::OUTPUT_MIME_HTML,
177        Some(OutputFormat::Oxa) => crate::metrics::OUTPUT_MIME_OXA,
178        Some(OutputFormat::PandocNative) => crate::metrics::OUTPUT_MIME_PANDOC_NATIVE,
179        None | Some(_) => crate::metrics::OUTPUT_MIME_NONE,
180    }
181}
182
#[cfg(test)]
mod bucket_tests {
    // Test-only relaxations: `unwrap`/`expect` are fine here because a failure
    // should simply abort the test.
    #![allow(
        clippy::tests_outside_test_module,
        clippy::unwrap_used,
        clippy::expect_used
    )]

    use super::*;
    use axum::http::HeaderValue;

    /// Buckets a `Content-Type` supplied as a static string.
    fn input_label(content_type: &'static str) -> &'static str {
        bucket_input_mime(Some(&HeaderValue::from_static(content_type)))
    }

    // ─── bucket_input_mime tests ───────────────────────────────────────────

    #[test]
    fn bucket_input_mime_none_when_header_absent() {
        assert_eq!(bucket_input_mime(None), crate::metrics::INPUT_MIME_NONE);
    }

    #[test]
    fn bucket_input_mime_markdown_when_text_markdown() {
        assert_eq!(
            input_label("text/markdown"),
            crate::metrics::INPUT_MIME_MARKDOWN
        );
    }

    #[test]
    fn bucket_input_mime_markdown_when_text_markdown_with_charset() {
        assert_eq!(
            input_label("text/markdown; charset=utf-8"),
            crate::metrics::INPUT_MIME_MARKDOWN
        );
    }

    #[test]
    fn bucket_input_mime_markdown_case_insensitive() {
        assert_eq!(
            input_label("TEXT/MARKDOWN"),
            crate::metrics::INPUT_MIME_MARKDOWN
        );
    }

    // The bucket ignores parameters that `validate_content_type` would reject.
    #[test]
    fn bucket_input_mime_markdown_with_unknown_param_still_buckets_markdown() {
        assert_eq!(
            input_label("text/markdown; variant=gfm"),
            crate::metrics::INPUT_MIME_MARKDOWN
        );
    }

    #[test]
    fn bucket_input_mime_html_when_text_html() {
        assert_eq!(input_label("text/html"), crate::metrics::INPUT_MIME_HTML);
    }

    #[test]
    fn bucket_input_mime_html_when_text_html_with_charset() {
        assert_eq!(
            input_label("text/html; charset=utf-8"),
            crate::metrics::INPUT_MIME_HTML
        );
    }

    #[test]
    fn bucket_input_mime_html_case_insensitive() {
        assert_eq!(input_label("TEXT/HTML"), crate::metrics::INPUT_MIME_HTML);
    }

    #[test]
    fn bucket_input_mime_html_with_non_utf8_charset_still_buckets_html() {
        assert_eq!(
            input_label("text/html; charset=iso-8859-1"),
            crate::metrics::INPUT_MIME_HTML
        );
    }

    #[test]
    fn bucket_input_mime_docx_when_docx() {
        assert_eq!(
            input_label("application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
            crate::metrics::INPUT_MIME_DOCX
        );
    }

    // DOCX with parameters fails validation but must still be labelled DOCX.
    #[test]
    fn bucket_input_mime_docx_with_params_still_buckets_docx() {
        assert_eq!(
            input_label(
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document; charset=utf-8"
            ),
            crate::metrics::INPUT_MIME_DOCX
        );
    }

    #[test]
    fn bucket_input_mime_unsupported_when_other_format() {
        assert_eq!(
            input_label("application/pdf"),
            crate::metrics::INPUT_MIME_UNSUPPORTED
        );
    }

    #[test]
    fn bucket_input_mime_unsupported_when_malformed() {
        assert_eq!(
            input_label("not a mime type at all"),
            crate::metrics::INPUT_MIME_UNSUPPORTED
        );
    }

    #[test]
    fn bucket_input_mime_unsupported_when_non_ascii() {
        // Opaque bytes are a legal header value but cannot be read as text.
        let val = HeaderValue::from_bytes(&[0xFF, 0xFE]).unwrap();
        assert_eq!(
            bucket_input_mime(Some(&val)),
            crate::metrics::INPUT_MIME_UNSUPPORTED
        );
    }

    // ─── bucket_output_mime tests ──────────────────────────────────────────

    #[test]
    fn bucket_output_mime_blocknote_when_blocknote_succeeded() {
        assert_eq!(
            bucket_output_mime(Some(OutputFormat::Blocknote)),
            crate::metrics::OUTPUT_MIME_BLOCKNOTE
        );
    }

    #[test]
    fn bucket_output_mime_html_when_html_succeeded() {
        assert_eq!(
            bucket_output_mime(Some(OutputFormat::Html)),
            crate::metrics::OUTPUT_MIME_HTML
        );
    }

    #[test]
    fn bucket_output_mime_oxa_when_oxa_succeeded() {
        assert_eq!(
            bucket_output_mime(Some(OutputFormat::Oxa)),
            crate::metrics::OUTPUT_MIME_OXA
        );
    }

    #[test]
    fn bucket_output_mime_pandoc_native_when_pandoc_native_succeeded() {
        assert_eq!(
            bucket_output_mime(Some(OutputFormat::PandocNative)),
            crate::metrics::OUTPUT_MIME_PANDOC_NATIVE
        );
    }

    #[test]
    fn bucket_output_mime_none_when_no_format_chosen() {
        assert_eq!(bucket_output_mime(None), crate::metrics::OUTPUT_MIME_NONE);
    }
}