// xocomil 0.3.0
//
// A lightweight, zero-allocation HTTP/1.1 request parser and response writer.
// Documentation
//! Parsing of `Content-Type` (and other media-type) header values.
//!
//! Splits a media type into `type/subtype` plus a parameter list,
//! borrowing from the input. Parameter values may be quoted strings;
//! [`MediaType::param`] strips the surrounding quotes (but does **not**
//! decode `\`-escapes — extractors that care about exact byte fidelity
//! should fall back to [`params`](MediaType::params) and inspect the raw
//! value).
//!
//! # Example
//!
//! ```
//! use xocomil::media::MediaType;
//!
//! let mt = MediaType::parse(b"application/json; charset=utf-8").unwrap();
//! assert_eq!(mt.type_(), b"application");
//! assert_eq!(mt.subtype(), b"json");
//! assert_eq!(mt.param("charset"), Some(&b"utf-8"[..]));
//!
//! let mt = MediaType::parse(b"multipart/form-data; boundary=\"abc-123\"").unwrap();
//! assert_eq!(mt.param("boundary"), Some(&b"abc-123"[..]));
//! ```

use crate::error::MediaErrorKind;

/// A borrowed view of a `Content-Type`-style media type.
///
/// Nothing is copied: every field is a sub-slice of the buffer handed to
/// [`parse`](Self::parse). The type and subtype are split out up front,
/// whereas the parameter section is kept verbatim and only tokenised when
/// [`param`](Self::param) or [`params`](Self::params) is called.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub struct MediaType<'a> {
    /// Top-level type (the part before the `/`), without surrounding
    /// optional whitespace.
    type_: &'a [u8],
    /// Subtype (the part after the `/`), without surrounding optional
    /// whitespace.
    subtype: &'a [u8],
    /// Unparsed parameter section: starts at the first `;` of the input
    /// (the `;` itself is included) and runs to the end. An empty slice
    /// means the media type carried no parameters.
    params: &'a [u8],
}

impl<'a> MediaType<'a> {
    /// Parse a media-type header value.
    ///
    /// Surrounding optional whitespace (space / horizontal tab) is trimmed
    /// from the whole value and from around the `/`. The type and subtype
    /// must each be non-empty and free of ASCII whitespace; a `;` can never
    /// appear in them because everything from the first `;` onward is kept
    /// as the raw parameter section. Deeper RFC 7231 token validation is
    /// left to callers.
    ///
    /// # Errors
    ///
    /// Returns [`MediaErrorKind::Empty`] if the input is blank,
    /// [`MediaErrorKind::MissingSlash`] if there is no `/` between type
    /// and subtype, or [`MediaErrorKind::InvalidToken`] if either
    /// component is empty or contains whitespace (for example when the
    /// `;` before the first parameter is missing).
    pub fn parse(value: &'a [u8]) -> Result<Self, MediaErrorKind> {
        let trimmed = trim_ows(value);
        if trimmed.is_empty() {
            return Err(MediaErrorKind::Empty);
        }

        // Split off the parameter section at the first `;`. The `;` stays
        // with `params` so the parameter iterator sees every parameter
        // introduced by its separator.
        let (mt, params) = trimmed
            .iter()
            .position(|&b| b == b';')
            .map_or_else(|| (trimmed, &b""[..]), |i| (&trimmed[..i], &trimmed[i..]));
        let mt = trim_ows(mt);

        // Split type and subtype at the first `/`.
        let slash = mt
            .iter()
            .position(|&b| b == b'/')
            .ok_or(MediaErrorKind::MissingSlash)?;
        let type_ = trim_ows(&mt[..slash]);
        let subtype = trim_ows(&mt[slash + 1..]);

        // Whitespace around the `/` was tolerated above; whitespace *inside*
        // a component means the value is not token-shaped (typically a
        // missing `;`), so reject it instead of returning a bogus subtype.
        let token_shaped =
            |s: &[u8]| !s.is_empty() && !s.iter().any(u8::is_ascii_whitespace);
        if !token_shaped(type_) || !token_shaped(subtype) {
            return Err(MediaErrorKind::InvalidToken);
        }

        Ok(Self {
            type_,
            subtype,
            params,
        })
    }

    /// The top-level type, e.g. `application`.
    #[inline]
    #[must_use]
    pub const fn type_(&self) -> &'a [u8] {
        self.type_
    }

    /// The subtype, e.g. `json`.
    #[inline]
    #[must_use]
    pub const fn subtype(&self) -> &'a [u8] {
        self.subtype
    }

    /// Look up a parameter by name (ASCII case-insensitive).
    ///
    /// Returns the value of the first parameter whose name matches, with
    /// one pair of surrounding double-quotes stripped if present, or
    /// `None` when no parameter has that name. `\`-escapes inside a
    /// quoted string are **not** decoded.
    #[must_use]
    pub fn param(&self, name: &str) -> Option<&'a [u8]> {
        let name = name.as_bytes();
        self.params()
            .find(|(n, _)| n.eq_ignore_ascii_case(name))
            .map(|(_, v)| unquote(v))
    }

    /// Iterator over parameter `(name, value)` byte slices.
    ///
    /// Names are returned case-preserving but should be compared
    /// case-insensitively. Values are returned **including** any
    /// surrounding double-quotes; use [`param`](Self::param) for the
    /// quote-stripped form.
    #[inline]
    #[must_use]
    pub const fn params(&self) -> ParamsIter<'a> {
        ParamsIter { rest: self.params }
    }
}

/// Lazy walker over the `; name=value` parameters of a media type.
///
/// Each item is a `(name, value)` pair of byte slices borrowed from the
/// original input. Quoted values keep their surrounding double-quotes.
#[derive(Debug, Clone)]
pub struct ParamsIter<'a> {
    /// The part of the parameter section that has not been consumed yet.
    rest: &'a [u8],
}

impl<'a> Iterator for ParamsIter<'a> {
    type Item = (&'a [u8], &'a [u8]);

    /// Yield the next `(name, value)` pair, both trimmed of optional
    /// whitespace. A parameter without `=` yields an empty value. Empty
    /// segments (`;;`) are skipped; iteration stops at the end of input
    /// or as soon as the remainder does not start with `;`.
    fn next(&mut self) -> Option<Self::Item> {
        // Consume separators until `self.rest` sits on the first byte of
        // a non-empty parameter.
        loop {
            self.rest = trim_ows(self.rest);
            // `strip_prefix` is `None` both when nothing is left and when
            // the next byte is not a `;` (malformed input); stop either way.
            let after_separator = self.rest.strip_prefix(b";")?;
            self.rest = trim_ows(after_separator);
            match self.rest.first() {
                // Trailing `;` with nothing behind it.
                None => return None,
                // Empty parameter (`;;`): go round again.
                Some(b';') => continue,
                Some(_) => break,
            }
        }

        // Carve off everything up to the next `;` that is not inside a
        // quoted value; the separator itself stays in `self.rest`.
        let cut = scan_param_end(self.rest);
        let (segment, tail) = self.rest.split_at(cut);
        self.rest = tail;

        let (name, value) = match segment.iter().position(|&b| b == b'=') {
            Some(eq) => (&segment[..eq], &segment[eq + 1..]),
            None => (segment, &b""[..]),
        };
        Some((trim_ows(name), trim_ows(value)))
    }
}

/// Locate the end of the parameter that starts at `input[0]`.
///
/// Returns the index of the first `;` that lies outside a double-quoted
/// run, or `input.len()` when there is none. Inside quotes a backslash
/// shields the byte that follows it, so `\"` does not close the quoted
/// run. An unterminated quote therefore extends to the end of `input`.
fn scan_param_end(input: &[u8]) -> usize {
    let mut quoted = false;
    // Set when the previous byte was a backslash inside a quoted run.
    let mut escaped = false;

    for (idx, &byte) in input.iter().enumerate() {
        if escaped {
            // This byte is the target of a backslash escape: ignore it.
            escaped = false;
            continue;
        }
        match (quoted, byte) {
            (true, b'\\') => escaped = true,
            (true, b'"') => quoted = false,
            (false, b'"') => quoted = true,
            (false, b';') => return idx,
            _ => {}
        }
    }
    input.len()
}

/// Remove one enclosing pair of double-quotes, if `v` has one.
///
/// Anything that does not both start and end with `"` (including a lone
/// `"` and the empty slice) is returned untouched.
#[inline]
fn unquote(v: &[u8]) -> &[u8] {
    match v {
        // Needs at least two bytes: an opening and a closing quote.
        [b'"', inner @ .., b'"'] => inner,
        _ => v,
    }
}

/// Strip optional whitespace (OWS, RFC 7230: space and horizontal tab)
/// from both ends of `s`. Other bytes, including CR and LF, are kept.
#[inline]
const fn trim_ows(mut s: &[u8]) -> &[u8] {
    loop {
        match s {
            // Peel one OWS byte off the front ...
            [b' ' | b'\t', tail @ ..] => s = tail,
            // ... and, once the front is clean, off the back.
            [head @ .., b' ' | b'\t'] => s = head,
            _ => return s,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parse_simple() {
        let mt = MediaType::parse(b"application/json").unwrap();
        assert_eq!(mt.type_(), b"application");
        assert_eq!(mt.subtype(), b"json");
        assert!(mt.params().next().is_none());
    }

    #[test]
    fn parse_with_charset() {
        let mt = MediaType::parse(b"text/html; charset=utf-8").unwrap();
        assert_eq!(mt.type_(), b"text");
        assert_eq!(mt.subtype(), b"html");
        assert_eq!(mt.param("charset"), Some(&b"utf-8"[..]));
    }

    #[test]
    fn parse_multipart_with_boundary() {
        let mt = MediaType::parse(b"multipart/form-data; boundary=----abc123").unwrap();
        assert_eq!(mt.param("boundary"), Some(&b"----abc123"[..]));
    }

    #[test]
    fn parse_quoted_boundary() {
        let mt = MediaType::parse(b"multipart/form-data; boundary=\"abc; xyz\"").unwrap();
        // The `;` inside the quoted string must not split the params.
        assert_eq!(mt.param("boundary"), Some(&b"abc; xyz"[..]));
    }

    #[test]
    fn param_lookup_is_case_insensitive() {
        let mt = MediaType::parse(b"text/html; CharSet=utf-8").unwrap();
        assert_eq!(mt.param("charset"), Some(&b"utf-8"[..]));
        assert_eq!(mt.param("CHARSET"), Some(&b"utf-8"[..]));
    }

    #[test]
    fn parse_multiple_params() {
        let mt = MediaType::parse(b"text/html; charset=utf-8; foo=bar").unwrap();
        assert_eq!(mt.params().count(), 2);
        assert_eq!(mt.param("foo"), Some(&b"bar"[..]));
    }

    #[test]
    fn parse_trims_outer_whitespace() {
        let mt = MediaType::parse(b"  application/json  ").unwrap();
        assert_eq!(mt.type_(), b"application");
        assert_eq!(mt.subtype(), b"json");
    }

    #[test]
    fn parse_trims_around_slash() {
        // Spaces around the slash aren't strictly RFC-conformant but
        // some clients send them; we accept them rather than reject.
        let mt = MediaType::parse(b"text /  html").unwrap();
        assert_eq!(mt.type_(), b"text");
        assert_eq!(mt.subtype(), b"html");
    }

    #[test]
    fn missing_param_returns_none() {
        let mt = MediaType::parse(b"text/plain").unwrap();
        assert_eq!(mt.param("charset"), None);
    }

    #[test]
    fn empty_input_errors() {
        assert_eq!(MediaType::parse(b""), Err(MediaErrorKind::Empty));
        assert_eq!(MediaType::parse(b"   "), Err(MediaErrorKind::Empty));
    }

    #[test]
    fn missing_slash_errors() {
        assert_eq!(
            MediaType::parse(b"applicationjson"),
            Err(MediaErrorKind::MissingSlash)
        );
    }

    #[test]
    fn empty_type_or_subtype_errors() {
        assert_eq!(
            MediaType::parse(b"/json"),
            Err(MediaErrorKind::InvalidToken)
        );
        assert_eq!(
            MediaType::parse(b"application/"),
            Err(MediaErrorKind::InvalidToken)
        );
        assert_eq!(MediaType::parse(b"/"), Err(MediaErrorKind::InvalidToken));
    }

    #[test]
    fn empty_param_segment_is_skipped() {
        let mt = MediaType::parse(b"text/html;; charset=utf-8").unwrap();
        assert_eq!(mt.param("charset"), Some(&b"utf-8"[..]));
    }

    #[test]
    fn param_without_value() {
        let mt = MediaType::parse(b"text/html; flag").unwrap();
        let collected: Vec<_> = mt.params().collect();
        assert_eq!(collected, &[(&b"flag"[..], &b""[..])]);
    }

    #[test]
    fn unquote_helper() {
        assert_eq!(unquote(b"\"x\""), b"x");
        assert_eq!(unquote(b"x"), b"x");
        assert_eq!(unquote(b"\""), b"\""); // single quote, not stripped
        assert_eq!(unquote(b""), b"");
    }

    #[test]
    fn params_yields_raw_quoted_value() {
        // `params()` must hand back the quotes untouched; only `param()`
        // strips them.
        let mt = MediaType::parse(b"multipart/form-data; boundary=\"abc\"").unwrap();
        let collected: Vec<_> = mt.params().collect();
        assert_eq!(collected, &[(&b"boundary"[..], &b"\"abc\""[..])]);
        assert_eq!(mt.param("boundary"), Some(&b"abc"[..]));
    }

    #[test]
    fn escaped_quote_does_not_close_quoted_value() {
        // Header value: text/plain; a="x\";y"; b=c
        let mt = MediaType::parse(b"text/plain; a=\"x\\\";y\"; b=c").unwrap();
        // The escape is kept verbatim (not decoded) in the returned value.
        assert_eq!(mt.param("a"), Some(&b"x\\\";y"[..]));
        assert_eq!(mt.param("b"), Some(&b"c"[..]));
        assert_eq!(mt.params().count(), 2);
    }

    #[test]
    fn unterminated_quote_swallows_rest() {
        let mt = MediaType::parse(b"text/plain; a=\"x; b=c").unwrap();
        let collected: Vec<_> = mt.params().collect();
        assert_eq!(collected, &[(&b"a"[..], &b"\"x; b=c"[..])]);
    }

    #[test]
    fn whitespace_around_equals_is_trimmed() {
        let mt = MediaType::parse(b"text/html; charset = utf-8 ").unwrap();
        assert_eq!(mt.param("charset"), Some(&b"utf-8"[..]));
    }

    #[test]
    fn trailing_semicolon_yields_no_params() {
        let mt = MediaType::parse(b"text/html;").unwrap();
        assert_eq!(mt.subtype(), b"html");
        assert!(mt.params().next().is_none());
    }

    #[test]
    fn first_duplicate_param_wins() {
        let mt = MediaType::parse(b"text/html; a=1; A=2").unwrap();
        assert_eq!(mt.param("a"), Some(&b"1"[..]));
        assert_eq!(mt.params().count(), 2);
    }

    #[test]
    fn scan_param_end_helper() {
        assert_eq!(scan_param_end(b""), 0);
        assert_eq!(scan_param_end(b"a=b; c=d"), 3);
        assert_eq!(scan_param_end(b"a=\"x;y\"; c"), 7);
        assert_eq!(scan_param_end(b"a=\"x;y"), 6);
    }

    #[test]
    fn trim_ows_helper() {
        assert_eq!(trim_ows(b" \t x y\t "), b"x y");
        assert_eq!(trim_ows(b" \t"), b"");
        // Only space and tab count as OWS.
        assert_eq!(trim_ows(b"\nx\n"), b"\nx\n");
    }
}