use crate::{
header::{FieldBody, FieldName, HeaderFields},
signature::CanonicalizationAlgorithm,
};
use std::{borrow::Cow, collections::HashSet};
// Byte constants used throughout header/body canonicalization.
const SP: u8 = b' ';
const CR: u8 = b'\r';
const LF: u8 = b'\n';
// CRLF line terminator as required by RFC 5322 / RFC 6376.
const CRLF: [u8; 2] = [CR, LF];
/// Canonicalizes the selected headers with the given algorithm, returning
/// the bytes to be fed into the header hash.
///
/// Headers are selected from the last header upwards; an instance already
/// consumed by an earlier occurrence of the same name in `selected_headers`
/// is not selected again, and a name with no remaining instance contributes
/// nothing (RFC 6376, section 5.4.2).
pub fn canonicalize_headers(
    algorithm: CanonicalizationAlgorithm,
    headers: &HeaderFields,
    selected_headers: &[FieldName],
) -> Vec<u8> {
    let mut output = Vec::new();
    // Indexes (counted from the *last* header) already consumed.
    let mut used = HashSet::with_capacity(selected_headers.len());
    for wanted in selected_headers {
        // Scan bottom-up for the next not-yet-consumed instance of this name.
        let hit = headers
            .as_ref()
            .iter()
            .rev()
            .enumerate()
            .find(|(idx, (name, _))| name == wanted && !used.contains(idx));
        if let Some((idx, (name, value))) = hit {
            used.insert(idx);
            canonicalize_header(&mut output, algorithm, name, value);
            output.extend(CRLF);
        }
    }
    output
}
/// Appends one canonicalized header field (`name:value`, without a trailing
/// CRLF) to `result`, using the given canonicalization algorithm.
pub fn canonicalize_header(
    result: &mut Vec<u8>,
    algorithm: CanonicalizationAlgorithm,
    name: impl AsRef<str>,
    value: impl AsRef<[u8]>,
) {
    let name = name.as_ref();
    let value = value.as_ref();
    if let CanonicalizationAlgorithm::Relaxed = algorithm {
        // Relaxed: the field name is lowercased and the value's whitespace
        // is normalized. Lowercasing byte-by-byte only affects ASCII bytes,
        // identical to lowercasing the whole string.
        for b in name.bytes() {
            result.push(b.to_ascii_lowercase());
        }
        result.push(b':');
        canonicalize_header_relaxed(result, value);
    } else {
        // Simple: name and value are used verbatim.
        result.extend(name.bytes());
        result.push(b':');
        result.extend(value);
    }
}
/// Appends the "relaxed"-canonicalized form of a header field value to
/// `result`: whitespace at both ends is removed, and every interior run of
/// whitespace collapses to a single SP (RFC 6376, section 3.4.2).
fn canonicalize_header_relaxed(result: &mut Vec<u8>, value: &[u8]) {
    // The characters making up folding whitespace: WSP plus CR/LF.
    // (Deliberately narrower than `u8::is_ascii_whitespace`, which would
    // also match form feed.)
    fn is_space(b: u8) -> bool {
        matches!(b, b' ' | b'\t' | b'\r' | b'\n')
    }

    debug_assert!(FieldBody::new(value).is_ok());

    // Locate the trimmed extent of the value; an all-whitespace value
    // yields an empty range.
    let start = value
        .iter()
        .position(|&b| !is_space(b))
        .unwrap_or(value.len());
    let end = value
        .iter()
        .rposition(|&b| !is_space(b))
        .map_or(start, |i| i + 1);

    // Copy the trimmed value, emitting one SP per whitespace run.
    let mut in_run = false;
    for &b in &value[start..end] {
        if is_space(b) {
            if !in_run {
                result.push(SP);
                in_run = true;
            }
        } else {
            result.push(b);
            in_run = false;
        }
    }
}
// State of the incremental body canonicalization state machine, recording
// what the most recently consumed input bytes were.
#[derive(Clone, Copy)]
enum CState {
    // No input consumed yet.
    Init,
    // The bytes last consumed were CRLF (a completed line).
    CrLf,
    // The byte last consumed was CR (possibly the start of a CRLF).
    Cr,
    // Inside a run of WSP (relaxed canonicalization only).
    Wsp,
    // A CR following a WSP run (relaxed canonicalization only).
    WspCr,
    // The byte last consumed was an ordinary content byte.
    Byte,
}
// Incremental canonicalizer for message bodies: feed arbitrary chunks
// through `canonicalize_chunk` and terminate the stream with `finish`.
#[derive(Clone)]
pub struct BodyCanonicalizer {
    kind: CanonicalizationAlgorithm,
    state: CState,
    // True while the line currently being read has no content bytes yet, so
    // a following CRLF completes an *empty* line (buffered, not emitted).
    blank_line: bool,
    // Count of buffered empty lines: flushed only when further content
    // proves they are not trailing, dropped at the end of the body.
    empty_lines: usize,
}
impl BodyCanonicalizer {
    /// Creates a body canonicalizer for the given algorithm.
    pub fn new(kind: CanonicalizationAlgorithm) -> Self {
        // Relaxed starts on a "blank" line so leading empty lines are
        // buffered (and dropped if no content ever follows); simple starts
        // non-blank so an initial CRLF is emitted immediately (an empty
        // body canonicalizes to a single CRLF in simple).
        let blank_line = match kind {
            CanonicalizationAlgorithm::Simple => false,
            CanonicalizationAlgorithm::Relaxed => true,
        };
        Self {
            kind,
            state: CState::Init,
            blank_line,
            empty_lines: 0,
        }
    }

    /// Canonicalizes the next chunk of body input, returning the bytes to
    /// feed into the body hash. May return a borrow of `bytes` when no
    /// rewriting is needed. Call `finish` after the final chunk.
    pub fn canonicalize_chunk<'a>(&mut self, bytes: &'a [u8]) -> Cow<'a, [u8]> {
        match self.kind {
            CanonicalizationAlgorithm::Simple => self.canon_chunk_simple(bytes),
            CanonicalizationAlgorithm::Relaxed => self.canon_chunk_relaxed(bytes),
        }
    }

    // "Simple" body canonicalization: the body is kept as-is except that
    // trailing empty lines are removed. Empty lines are therefore buffered
    // in `empty_lines` and only flushed once later content shows they are
    // not trailing.
    fn canon_chunk_simple<'a>(&mut self, mut bytes: &'a [u8]) -> Cow<'a, [u8]> {
        // Fast path for the first chunk: when it ends in CRLF, strip and
        // buffer the trailing empty lines, and borrow the rest unchanged.
        if matches!(self.state, CState::Init) {
            if let Some(mut bnext) = bytes.strip_suffix(&CRLF) {
                while let Some(bx) = bnext.strip_suffix(&CRLF) {
                    (bytes, bnext) = (bnext, bx);
                    self.empty_lines += 1;
                }
                self.blank_line = true;
                self.state = CState::CrLf;
                return bytes.into();
            }
        }
        let mut result = Vec::with_capacity(bytes.len());
        for &b in bytes {
            match self.state {
                // At the start of a line.
                CState::Init | CState::CrLf => {
                    if b == CR {
                        self.state = CState::Cr;
                    } else {
                        // Content byte: buffered empty lines are for real.
                        self.flush_empty_lines(&mut result);
                        result.push(b);
                        self.state = CState::Byte;
                    }
                }
                CState::Cr => {
                    if b == LF {
                        // CRLF completes a line; an empty line is buffered
                        // rather than emitted, since trailing empty lines
                        // must be dropped at the end of the body.
                        if self.blank_line {
                            self.empty_lines += 1;
                        } else {
                            result.extend(CRLF);
                            self.blank_line = true;
                        }
                        self.state = CState::CrLf;
                        continue;
                    }
                    // A stray CR (not part of CRLF) is ordinary content.
                    self.flush_empty_lines(&mut result);
                    result.push(CR);
                    if b != CR {
                        result.push(b);
                        self.state = CState::Byte;
                    }
                    // (If b == CR, stay in state Cr for the new CR.)
                }
                CState::Byte => {
                    if b == CR {
                        self.state = CState::Cr;
                    } else {
                        result.push(b);
                    }
                }
                // Whitespace states only occur in relaxed canonicalization.
                CState::Wsp | CState::WspCr => unreachable!(),
            }
        }
        result.into()
    }

    // "Relaxed" body canonicalization: like simple, but whitespace at line
    // ends is removed and interior whitespace runs collapse to a single SP.
    fn canon_chunk_relaxed<'a>(&mut self, bytes: &'a [u8]) -> Cow<'a, [u8]> {
        fn is_wsp(b: u8) -> bool {
            matches!(b, b'\t' | b' ')
        }
        let mut result = Vec::with_capacity(bytes.len());
        for &b in bytes {
            match self.state {
                // At the start of a line.
                CState::Init | CState::CrLf => {
                    if is_wsp(b) {
                        self.state = CState::Wsp;
                    } else if b == CR {
                        self.state = CState::Cr;
                    } else {
                        self.flush_empty_lines(&mut result);
                        result.push(b);
                        self.state = CState::Byte;
                    }
                }
                // Inside a WSP run whose fate is not yet known: dropped if
                // the line ends here, otherwise collapsed to a single SP.
                CState::Wsp => {
                    if b == CR {
                        self.state = CState::WspCr;
                    } else if !is_wsp(b) {
                        self.flush_empty_lines(&mut result);
                        result.push(SP);
                        result.push(b);
                        self.state = CState::Byte;
                    }
                }
                CState::Cr => {
                    if b == LF {
                        // CRLF completes a line; empty lines are buffered
                        // so that trailing ones can be dropped at the end.
                        if self.blank_line {
                            self.empty_lines += 1;
                        } else {
                            result.extend(CRLF);
                            self.blank_line = true;
                        }
                        self.state = CState::CrLf;
                        continue;
                    }
                    // A stray CR is ordinary content.
                    self.flush_empty_lines(&mut result);
                    result.push(CR);
                    if is_wsp(b) {
                        self.state = CState::Wsp;
                    } else if b != CR {
                        result.push(b);
                        self.state = CState::Byte;
                    }
                }
                // A CR after a WSP run: if an LF follows, the whitespace
                // was at the line end and is discarded with the CRLF.
                CState::WspCr => {
                    if b == LF {
                        if self.blank_line {
                            self.empty_lines += 1;
                        } else {
                            result.extend(CRLF);
                            self.blank_line = true;
                        }
                        self.state = CState::CrLf;
                        continue;
                    }
                    // Not a line ending: the WSP run (as one SP) and the
                    // stray CR are both content after all.
                    self.flush_empty_lines(&mut result);
                    result.push(SP);
                    result.push(CR);
                    if b == CR {
                        self.state = CState::Cr;
                    } else if is_wsp(b) {
                        self.state = CState::Wsp;
                    } else {
                        result.push(b);
                        self.state = CState::Byte;
                    }
                }
                CState::Byte => {
                    if is_wsp(b) {
                        self.state = CState::Wsp;
                    } else if b == CR {
                        self.state = CState::Cr;
                    } else {
                        result.push(b);
                    }
                }
            }
        }
        result.into()
    }

    /// Finalizes the body: emits any bytes still pending in the state
    /// machine and ensures a non-empty canonicalized body ends with CRLF.
    /// Buffered trailing empty lines are dropped. Under simple, an empty
    /// body yields a lone CRLF; under relaxed it yields nothing.
    pub fn finish(mut self) -> Cow<'static, [u8]> {
        match self.kind {
            CanonicalizationAlgorithm::Simple => {
                match self.state {
                    // Terminate the final, unterminated line (for Init this
                    // produces the lone CRLF of an empty body).
                    CState::Init | CState::Byte => (&CRLF[..]).into(),
                    // Already ends with CRLF; trailing empty lines dropped.
                    CState::CrLf => (&[][..]).into(),
                    // A trailing stray CR is content; terminate its line.
                    CState::Cr => {
                        let mut result = vec![];
                        self.flush_empty_lines(&mut result);
                        result.push(CR);
                        result.extend(CRLF);
                        result.into()
                    }
                    CState::Wsp | CState::WspCr => unreachable!(),
                }
            }
            CanonicalizationAlgorithm::Relaxed => {
                match self.state {
                    // Empty body, or body ending in CRLF.
                    CState::Init | CState::CrLf => (&[][..]).into(),
                    // A trailing stray CR is content; terminate its line.
                    CState::Cr => {
                        let mut result = vec![];
                        self.flush_empty_lines(&mut result);
                        result.push(CR);
                        result.extend(CRLF);
                        result.into()
                    }
                    // Trailing WSP on the final line is discarded; the line
                    // still needs its CRLF unless it was entirely blank.
                    CState::Wsp => {
                        if self.blank_line {
                            (&[][..]).into()
                        } else {
                            (&CRLF[..]).into()
                        }
                    }
                    // Pending WSP followed by a stray CR: both are content.
                    CState::WspCr => {
                        let mut result = vec![];
                        self.flush_empty_lines(&mut result);
                        result.push(SP);
                        result.push(CR);
                        result.extend(CRLF);
                        result.into()
                    }
                    CState::Byte => (&CRLF[..]).into(),
                }
            }
        }
    }

    // Emits the buffered empty lines; called right before emitting a content
    // byte that proves they are not trailing. Also marks the current line as
    // no longer blank.
    fn flush_empty_lines(&mut self, result: &mut Vec<u8>) {
        for _ in 0..self.empty_lines {
            result.extend(CRLF);
        }
        self.empty_lines = 0;
        self.blank_line = false;
    }
}
#[cfg(test)]
mod tests {
use super::*;
use crate::signature::CanonicalizationAlgorithm::*;
use rand::{
distributions::{Distribution, Slice},
Rng,
};
use std::{ops::RangeInclusive, str};
#[test]
fn canonicalize_headers_relaxed_ok() {
let headers = HeaderFields::from_vec(vec![
("from".to_owned(), b" Good \t ".to_vec()),
("to".to_owned(), b" see me".to_vec()),
("Date".to_owned(), b" Fri 24\r\n\tfoo".to_vec()),
("To".to_owned(), b" another one".to_vec()),
])
.unwrap();
let selected_headers = vec![
FieldName::new("to").unwrap(),
FieldName::new("from").unwrap(),
FieldName::new("to").unwrap(),
];
assert_eq!(
canonicalize_headers(Relaxed, &headers, &selected_headers),
&b"to:another one\r\nfrom:Good\r\nto:see me\r\n"[..],
);
}
#[test]
fn canonicalize_header_relaxed_dkim_sig() {
let example = "v=1; a=rsa-sha256; d=example.net; s=brisbane;
c=simple; q=dns/txt; i=@eng.example.net;
h=from:to:subject:date;
bh=MTIzNDU2Nzg5MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTI=;
b=dzdV... ";
let example = example.replace('\n', "\r\n");
let mut result = vec![];
canonicalize_header(&mut result, Relaxed, "Dkim-Signature", &example);
assert_eq!(
result,
b"dkim-signature:v=1; a=rsa-sha256; d=example.net; \
s=brisbane; c=simple; q=dns/txt; i=@eng.example.net; h=from:to:subject:date; \
bh=MTIzNDU2Nzg5MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTI=; b=dzdV..."[..]
);
}
#[test]
fn body_canon_simple_ok() {
let bc = BodyCanonicalizer::new(Simple);
let body = canonicalize_chunks(
bc,
&[b"well hello \r\n", b"\r\n\r? what's up \r\n\r\n", b"\r\n"],
);
assert_eq!(body, b"well hello \r\n\r\n\r? what's up \r\n");
}
#[test]
fn body_canon_simple_cases() {
fn c(bs: &[u8]) -> Vec<u8> {
let bc = BodyCanonicalizer::new(Simple);
canonicalize_chunks(bc, &[bs])
}
assert_eq!(c(b""), b"\r\n");
assert_eq!(c(b"\r"), b"\r\r\n");
assert_eq!(c(b"\rx"), b"\rx\r\n");
assert_eq!(c(b"\r\r"), b"\r\r\r\n");
assert_eq!(c(b"x"), b"x\r\n");
assert_eq!(c(b"x\r"), b"x\r\r\n");
assert_eq!(c(b"x\r\n"), b"x\r\n");
assert_eq!(c(b"x\r\n\r\n"), b"x\r\n");
assert_eq!(c(b"x\r\n\r\n\rx"), b"x\r\n\r\n\rx\r\n");
assert_eq!(c(b"\n"), b"\n\r\n");
assert_eq!(c(b"\r\n"), b"\r\n");
assert_eq!(c(b"\r\n\r"), b"\r\n\r\r\n");
assert_eq!(c(b"\r\n\r\n"), b"\r\n");
assert_eq!(c(b"\r\n\r\nx"), b"\r\n\r\nx\r\n");
}
#[test]
fn body_canon_simple_optimized() {
fn c(chunk1: &[u8], chunk2: &[u8]) -> Vec<u8> {
let bc = BodyCanonicalizer::new(Simple);
canonicalize_chunks(bc, &[chunk1, chunk2])
}
assert_eq!(c(b"", b"x"), b"x\r\n");
assert_eq!(c(b"\r\n", b"\r\n"), b"\r\n");
assert_eq!(c(b"x\r\n", b"\r\nx"), b"x\r\n\r\nx\r\n");
assert_eq!(c(b"\r\n\r\n\r\n", b""), b"\r\n");
assert_eq!(c(b"\r\n\r\n\r\n", b"\r\n"), b"\r\n");
assert_eq!(c(b"\r\n\r\n\r\n", b"\r"), b"\r\n\r\n\r\n\r\r\n");
}
#[test]
fn body_canon_relaxed_basic() {
let bc = BodyCanonicalizer::new(Relaxed);
let body = canonicalize_chunks(
bc,
&[b"well hello \r\n", b"\r\n what's up \r\n\r\n", b"\r\n"],
);
assert_eq!(body, b"well hello\r\n\r\n what's up\r\n");
}
#[test]
fn body_canon_relaxed_small_chunks() {
let bc = BodyCanonicalizer::new(Relaxed);
let body = canonicalize_chunks(
bc,
&[
b"well ",
b" hello ",
b"\r",
b"\n\r",
b"\n what's up \r\n\r\n",
b"\r\n",
],
);
assert_eq!(body, b"well hello\r\n\r\n what's up\r\n");
}
#[test]
fn body_canon_relaxed_initial_empty_lines() {
let bc = BodyCanonicalizer::new(Relaxed);
let body = canonicalize_chunks(bc, &[b"\r\n\r\n", b"\ra \r", b"\nb ", b"c"]);
assert_eq!(body, b"\r\n\r\n\ra\r\nb c\r\n");
}
#[test]
fn body_canon_relaxed_cases() {
fn c(bs: &[u8]) -> Vec<u8> {
let bc = BodyCanonicalizer::new(Relaxed);
canonicalize_chunks(bc, &[bs])
}
assert_eq!(c(b""), b"");
assert_eq!(c(b" "), b"");
assert_eq!(c(b" "), b"");
assert_eq!(c(b" x"), b" x\r\n");
assert_eq!(c(b" x"), b" x\r\n");
assert_eq!(c(b" x "), b" x\r\n");
assert_eq!(c(b" \r"), b" \r\r\n");
assert_eq!(c(b" \r "), b" \r\r\n");
assert_eq!(c(b" \r\r"), b" \r\r\r\n");
assert_eq!(c(b" \rx"), b" \rx\r\n");
assert_eq!(c(b" \r\n"), b"");
assert_eq!(c(b" \r\n\r\n \r\n"), b"");
assert_eq!(c(b" \r\n\r\n "), b"");
assert_eq!(c(b"\r"), b"\r\r\n");
assert_eq!(c(b"\r\r"), b"\r\r\r\n");
assert_eq!(c(b"\rx"), b"\rx\r\n");
assert_eq!(c(b"\r "), b"\r\r\n");
assert_eq!(c(b"\r "), b"\r\r\n");
assert_eq!(c(b"x"), b"x\r\n");
assert_eq!(c(b"xy"), b"xy\r\n");
assert_eq!(c(b"x\r\n"), b"x\r\n");
assert_eq!(c(b"x\r\n\r\n"), b"x\r\n");
assert_eq!(c(b"x "), b"x\r\n");
assert_eq!(c(b"x\r\ny"), b"x\r\ny\r\n");
assert_eq!(c(b"x\r\n\ry"), b"x\r\n\ry\r\n");
assert_eq!(c(b"x\r\n\r\ny"), b"x\r\n\r\ny\r\n");
assert_eq!(c(b"x\r\n\r\n\ry"), b"x\r\n\r\n\ry\r\n");
assert_eq!(c(b"x\r\n \r\ny"), b"x\r\n\r\ny\r\n");
assert_eq!(c(b"x\r\n \r\ny"), b"x\r\n\r\ny\r\n");
assert_eq!(c(b"x\r\n z\r\ny"), b"x\r\n z\r\ny\r\n");
assert_eq!(c(b"x\r\n z \r\ny"), b"x\r\n z\r\ny\r\n");
assert_eq!(c(b"x\r\n z \r\ny"), b"x\r\n z\r\ny\r\n");
assert_eq!(c(b"x y z\r\n"), b"x y z\r\n");
assert_eq!(c(b"x y z\r\n"), b"x y z\r\n");
assert_eq!(c(b"x y z \r\n"), b"x y z\r\n");
assert_eq!(c(b"\r\n"), b"");
assert_eq!(c(b"\r\n\r"), b"\r\n\r\r\n");
assert_eq!(c(b"\r\n\r "), b"\r\n\r\r\n");
assert_eq!(c(b"\r\n\r\r"), b"\r\n\r\r\r\n");
assert_eq!(c(b"\r\n\r\n"), b"");
assert_eq!(c(b"\r\n \r\n"), b"");
assert_eq!(c(b"\r\n \r\n"), b"");
assert_eq!(c(b"\r\n\r\nx"), b"\r\n\r\nx\r\n");
assert_eq!(c(b"\r\n \r\nx"), b"\r\n\r\nx\r\n");
}
fn canonicalize_chunks(mut bc: BodyCanonicalizer, chunks: &[&[u8]]) -> Vec<u8> {
let mut result = vec![];
for c in chunks {
result.extend(bc.canonicalize_chunk(c).into_owned());
}
result.extend(bc.finish().into_owned());
result
}
#[test]
#[ignore = "randomly generated test inputs"]
fn fuzz_body_canonicalizer_simple() {
let alt_impl = |bytes: &_| {
str::from_utf8(bytes)
.unwrap()
.trim_end_matches("\r\n")
.bytes()
.chain(*b"\r\n")
.collect()
};
let s_elems = ["x", "\r", "\n", "\r\n"];
let s_len = 0..=9;
run_fuzz(300, Simple, &s_elems, s_len, alt_impl);
}
#[test]
#[ignore = "randomly generated test inputs"]
fn fuzz_body_canonicalizer_relaxed() {
let alt_impl = |bytes: &_| {
let s = str::from_utf8(bytes).unwrap();
let s = s.split("\r\n")
.map(|s| {
let mut sp = false;
s.trim_end_matches(' ')
.chars()
.filter(|&c| {
if c == ' ' {
let ret = !sp;
sp = true;
ret
} else {
sp = false;
true
}
})
.collect::<String>()
})
.collect::<Vec<_>>()
.join("\r\n");
let s = s.trim_end_matches("\r\n");
let mut s = s.to_owned();
if !s.is_empty() {
s.push_str("\r\n");
}
s.bytes().collect()
};
let s_elems = ["x", " ", "\r", "\n", "\r\n"];
let s_len = 0..=12;
run_fuzz(1000, Relaxed, &s_elems, s_len, alt_impl);
}
fn run_fuzz(
repetitions: usize,
alg: CanonicalizationAlgorithm,
s_elems: &[&str],
s_len: RangeInclusive<u8>,
alt_impl: impl Fn(&[u8]) -> Vec<u8>,
) {
let elems = Slice::new(s_elems).unwrap();
let mut rng = rand::thread_rng();
for _ in 0..repetitions {
let n = rng.gen_range(s_len.clone()).into();
let s: String = elems.sample_iter(&mut rng).copied().take(n).collect();
let bytes = s.as_bytes();
let r1 = canonicalize_chunks(BodyCanonicalizer::new(alg), &[bytes]);
let r2 = alt_impl(bytes);
assert_eq!(
r1,
r2,
"divergent results for input {s:?}: {:?} != {:?}",
str::from_utf8(&r1).unwrap(),
str::from_utf8(&r2).unwrap()
);
}
}
}