// xocomil 0.3.0 - Docs.rs

use std::mem::MaybeUninit;

use crate::ascii::HttpChar;
use crate::bytes::ByteSearch;
use crate::error::ParseErrorKind;
use crate::headers::{Header, HttpVersion, Method, RequestHeader};
use crate::scan::scan_header_line;
use crate::validate::HttpValidate;

use super::Request;

use super::HeaderSlot;

use crate::ascii::parse_content_length;

impl<'buf, const MAX_HDRS: usize> Request<'buf, MAX_HDRS> {
    /// Parse raw HTTP header bytes into a `Request`.
    ///
    /// The input **must** contain the `\r\n\r\n` header terminator.
    /// No UTF-8 validation is performed — all fields are returned as byte
    /// slices. Use [`Request::path_str`] / [`Request::header_str`] to
    /// convert to `&str` on demand.
    ///
    /// # Errors
    ///
    /// Returns [`crate::error::Error`] if the request is malformed, the method is
    /// unsupported, or the `\r\n\r\n` terminator is missing.
    #[inline]
    pub fn parse(header_bytes: &'buf [u8]) -> Result<Self, crate::error::Error> {
        Self::parse_impl(header_bytes).map_err(crate::error::Error::from)
    }

    /// Internal parse returning the crate-internal `ParseErrorKind`.
    pub(super) fn parse_impl(header_bytes: &'buf [u8]) -> Result<Self, ParseErrorKind> {
        // Compile-time guarantee that header indices fit in u8.
        const { assert!(MAX_HDRS <= u8::MAX as usize, "MAX_HDRS must be <= 255") }

        let (method, path, version, request_line_len) = Self::parse_request_line(header_bytes)?;

        // Build the Request in place. `parse_headers` writes through
        // `&mut` references to the headers/known slots, avoiding a 1
        // KiB stack-to-stack copy that previously showed up at ~8% of
        // htmx parse cycles in perf annotation. RVO should place this
        // directly into the caller's result slot.
        let mut req = Request {
            method,
            version,
            path,
            headers: [const { MaybeUninit::uninit() }; MAX_HDRS],
            header_count: 0,
            known: [HeaderSlot::EMPTY; RequestHeader::COUNT],
            content_length: None,
        };

        Self::parse_headers(
            header_bytes,
            request_line_len,
            &mut req.headers,
            &mut req.known,
            &mut req.header_count,
        )?;

        // `headers_init` is the single point at which we reinterpret
        // initialized `MaybeUninit<Header>` slots as `Header`. Reuse it
        // so the safety argument lives in exactly one place.
        let init_headers = req.headers_init();
        req.content_length = Self::validate_semantics(method, version, init_headers, &req.known)?;

        Ok(req)
    }

    /// Parse the first line: `METHOD SP request-target SP HTTP/x.y CRLF`.
    ///
    /// Returns `(method, path, version, consumed)`, where `consumed` is the
    /// length of the request line including its trailing CRLF, i.e. the
    /// offset at which the header lines begin.
    ///
    /// # Errors
    ///
    /// - `NoRequestLine`: `header_bytes` contains no CRLF.
    /// - `MalformedRequestLine`: the line has fewer than two spaces.
    /// - `UnsupportedMethod`: `Method::from_bytes` does not recognise the
    ///   bytes before the first space.
    /// - `MalformedRequestTarget`: the target is empty or fails
    ///   `is_valid_request_target`.
    /// - `UnsupportedHttpVersion`: `HttpVersion::from_bytes` rejects
    ///   everything after the second space.
    fn parse_request_line(
        header_bytes: &'buf [u8],
    ) -> Result<(Method, &'buf [u8], HttpVersion, usize), ParseErrorKind> {
        let line_end = header_bytes
            .find_crlf(0)
            .ok_or(ParseErrorKind::NoRequestLine)?;

        let request_line = &header_bytes[..line_end];

        let first_space = request_line
            .find_byte(HttpChar::Space.as_u8())
            .ok_or(ParseErrorKind::MalformedRequestLine)?;

        let method_bytes = &request_line[..first_space];
        let rest = &request_line[first_space + 1..];

        // Both delimiters are located before the method is interpreted, so
        // a line with too few spaces is reported as malformed even when the
        // method token is also unknown.
        let second_space = rest
            .find_byte(HttpChar::Space.as_u8())
            .ok_or(ParseErrorKind::MalformedRequestLine)?;

        let method = Method::from_bytes(method_bytes).ok_or(ParseErrorKind::UnsupportedMethod)?;

        // `second_space` is the offset of the first space in `rest`, so the
        // target slice can never contain a space itself; emptiness is the
        // only space-related failure left to check here:
        //   - "GET  /foo HTTP/1.1" (two spaces after the method) yields
        //     `second_space == 0`, i.e. an empty target.
        //   - "GET /foo  HTTP/1.1" (two spaces before the version) leaves
        //     the extra space as the first byte of `version_bytes`, where
        //     it is up to `HttpVersion::from_bytes` to refuse it.
        // Tolerating either form would let parsers disagree on where the
        // target ends — a classic request-smuggling / routing-inconsistency
        // vector.
        let path = &rest[..second_space];
        if path.is_empty() {
            return Err(ParseErrorKind::MalformedRequestTarget);
        }

        // Reject control characters in the request target (RFC 7230 §3.1.1).
        if !path.is_valid_request_target() {
            return Err(ParseErrorKind::MalformedRequestTarget);
        }

        let version_bytes = &rest[second_space + 1..];
        let version =
            HttpVersion::from_bytes(version_bytes).ok_or(ParseErrorKind::UnsupportedHttpVersion)?;

        // `+ 2` skips the CRLF that ends the request line.
        Ok((method, path, version, line_end + 2))
    }

    /// Walks the header lines that follow the request line and records
    /// them in caller-owned storage.
    ///
    /// Starting at byte offset `start`, each line is scanned, stored in the
    /// next free entry of `headers` and, when its name is a recognised
    /// `RequestHeader`, indexed in `known` by its position. Parsing stops
    /// at the empty line that terminates the header block. On success the
    /// number of stored headers is written to `out_count`; on error
    /// `out_count` is left untouched.
    ///
    /// Results go out through `&mut` parameters rather than by value to
    /// spare a 1 KiB stack-to-stack memcpy: with the default
    /// `MAX_HDRS = 32` and a 32-byte `Header`, returning the arrays showed
    /// up as a byte-shuffling pattern worth ~8% of cycles on the htmx
    /// benchmark.
    ///
    /// # Errors
    ///
    /// - any error reported by `scan_header_line` for a malformed line
    /// - `TooManyHeaders`: more than `MAX_HDRS` header lines
    /// - `DuplicateHeader`: repeated Host, Content-Length or
    ///   Transfer-Encoding
    /// - `IncompleteHeaders`: input ended before the terminating empty line
    fn parse_headers(
        header_bytes: &'buf [u8],
        start: usize,
        headers: &mut [MaybeUninit<Header<'buf>>; MAX_HDRS],
        known: &mut [HeaderSlot; RequestHeader::COUNT],
        out_count: &mut usize,
    ) -> Result<(), ParseErrorKind> {
        let mut stored: usize = 0;
        let mut cursor = start;

        loop {
            // Running out of input before the blank line means the header
            // block was never terminated.
            if cursor >= header_bytes.len() {
                return Err(ParseErrorKind::IncompleteHeaders);
            }
            let tail = &header_bytes[cursor..];

            // A line consisting of just CRLF ends the header block.
            if matches!(
                tail,
                [cr, lf, ..] if *cr == HttpChar::CarriageReturn && *lf == HttpChar::LineFeed
            ) {
                break;
            }

            // One fused pass locates the colon and the closing CRLF while
            // validating the name (TCHAR) and the value.
            //
            // This is also where obs-fold (RFC 7230 §3.2.4, deprecated
            // continuation lines: CRLF followed by SP/HTAB) gets rejected.
            // `scan_header_line` stops at the first `\r\n`, so the folded
            // part shows up here as a new "line" beginning with SP/HTAB.
            // Neither is a valid TCHAR, so the scanner fails with
            // `MalformedHeader` before reaching a colon. Accepting
            // obs-fold would be a request-smuggling vector whenever
            // intermediaries canonicalize it differently.
            let line = scan_header_line(tail)?;

            let name = &tail[..line.colon];
            let value = tail[line.colon + 1..line.line_end].trim_ows();

            if stored >= MAX_HDRS {
                return Err(ParseErrorKind::TooManyHeaders);
            }
            headers[stored].write(Header::new(name, value));

            // Maintain the O(1) lookup table for well-known headers. The
            // first occurrence claims the slot. A repeat is fatal for Host,
            // Content-Length and Transfer-Encoding (RFC 7230 §5.4, §3.3.3;
            // duplicate TE is a smuggling vector per §3.3.3 item 3) and is
            // ignored by the table for every other known header.
            //
            // RFC 7230 §3.3.2 would allow repeated Content-Length headers
            // whose values are all identical. They are refused anyway:
            // any tolerated form of CL repetition has historically been a
            // smuggling vector when intermediaries disagree on the
            // canonicalization.
            if let Some(rh) = RequestHeader::from_bytes_ignore_case(name) {
                let slot = rh as usize;
                if known[slot].is_none() {
                    // `stored < MAX_HDRS` was checked above and the caller
                    // asserts `MAX_HDRS <= 255`, so the cast cannot
                    // truncate.
                    #[allow(clippy::cast_possible_truncation)]
                    let position = stored as u8;
                    known[slot] = HeaderSlot::new(position);
                } else if matches!(
                    rh,
                    RequestHeader::Host
                        | RequestHeader::ContentLength
                        | RequestHeader::TransferEncoding
                ) {
                    return Err(ParseErrorKind::DuplicateHeader);
                }
            }

            stored += 1;
            // Advance past this line and the CRLF that ends it.
            cursor += line.line_end + 2;
        }

        *out_count = stored;
        Ok(())
    }

    /// Checks the cross-header rules that cannot be decided line by line
    /// and returns the parsed `Content-Length`, if the header is present.
    ///
    /// `headers` is the initialized header list and `known` the lookup
    /// table mapping well-known headers to positions in that list.
    ///
    /// # Errors
    ///
    /// Checked in this order:
    /// - `InvalidContentLength`: Content-Length does not parse
    /// - `UnsupportedTransferEncoding`: Transfer-Encoding is not
    ///   `chunked`, or is present on an HTTP/1.0 request
    /// - `ConflictingHeaders`: Transfer-Encoding together with
    ///   Content-Length, or on a method that cannot have a body
    /// - `MissingHostHeader`: HTTP/1.1 request without Host
    fn validate_semantics(
        method: Method,
        version: HttpVersion,
        headers: &[Header<'buf>],
        known: &[HeaderSlot; RequestHeader::COUNT],
    ) -> Result<Option<u64>, ParseErrorKind> {
        let te = known[RequestHeader::TransferEncoding as usize];
        let cl = known[RequestHeader::ContentLength as usize];
        let host = known[RequestHeader::Host as usize];

        // Content-Length is parsed once, here, and cached. A value this
        // parser cannot read but a proxy might interpret differently is a
        // request smuggling vector, so it is refused outright instead of
        // being treated as "no body".
        let content_length = match cl.get() {
            Some(idx) => {
                let raw = headers[idx as usize].value();
                match parse_content_length(raw) {
                    Some(length) => Some(length),
                    None => return Err(ParseErrorKind::InvalidContentLength),
                }
            }
            None => None,
        };

        // The only transfer coding understood is "chunked"
        // (case-insensitive). RFC 7230 §3.3.3: an unrecognised TE MUST be
        // answered with 501 and a close; quietly treating it as chunked
        // would desync with proxies.
        if let Some(idx) = te.get() {
            let coding = headers[idx as usize].value();
            if !coding.eq_ignore_ascii_case(b"chunked") {
                return Err(ParseErrorKind::UnsupportedTransferEncoding);
            }
        }

        if te.is_some() {
            // RFC 7230 §3.3.1: Transfer-Encoding is HTTP/1.1+ and HTTP/1.0
            // clients MUST NOT send it. Accepting it on 1.0 is a smuggling
            // vector when intermediaries downgrade-rewrite TE -> CL for
            // 1.0 origins.
            if matches!(version, HttpVersion::Http10) {
                return Err(ParseErrorKind::UnsupportedTransferEncoding);
            }

            // RFC 7230 §3.3.3: Transfer-Encoding combined with
            // Content-Length is a classic request smuggling vector.
            if cl.is_some() {
                return Err(ParseErrorKind::ConflictingHeaders);
            }

            // Methods without body semantics should not carry
            // Transfer-Encoding: a proxy may forward the header while the
            // origin ignores the body, causing desync.
            if !method.can_have_body() {
                return Err(ParseErrorKind::ConflictingHeaders);
            }
        }

        // RFC 7230 §5.4: HTTP/1.1 requests MUST include exactly one Host
        // header.
        if matches!(version, HttpVersion::Http11) && host.is_none() {
            return Err(ParseErrorKind::MissingHostHeader);
        }

        Ok(content_length)
    }
}