eolify 0.4.0

High-performance line ending normalization for Rust.
Documentation
//! The `formats` module contains the core traits and types for normalization. The actual
//! formats (like CRLF) are implemented in submodules.

use std::mem::MaybeUninit;

use crate::{helpers::vec_to_uninit_mut, Result};

pub(crate) mod crlf;
pub(crate) mod lf;

/// Outcome of a single `normalize_chunk` call.
///
/// Records how many bytes were written to the output buffer and an optional,
/// format-defined state value `S` produced while processing the chunk.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NormalizeChunkResult<S> {
    /// Number of bytes written into the output buffer.
    output_len: usize,
    /// Format-defined state produced by the chunk, if any.
    state: Option<S>,
}

impl<S> NormalizeChunkResult<S> {
    /// Construct a new `NormalizeChunkResult` from the number of bytes written
    /// and the optional state produced by the chunk.
    #[must_use]
    pub fn new(output_len: usize, state: Option<S>) -> Self {
        Self { output_len, state }
    }

    /// Returns the number of bytes written into the output buffer for the
    /// last processed chunk.
    #[must_use]
    pub fn output_len(&self) -> usize {
        self.output_len
    }

    /// Returns a reference to the state produced by the processed chunk, or
    /// `None` if the chunk produced no state.
    ///
    /// The returned `Option<&S>` has the same shape as the `state` argument of
    /// `normalize_chunk`, so it can be handed to the call for the following
    /// chunk. What the state means (for example, an unpaired trailing `\r`)
    /// is defined by the individual format.
    #[must_use]
    pub fn state(&self) -> Option<&S> {
        self.state.as_ref()
    }
}

/// This is the core trait that defines how to normalize a chunk of data to a specific format.
///
/// Consumers will typically not use this trait directly, but rather the higher-level
/// `Normalize` trait, which is implemented for every type that implements this one.
pub trait NormalizeChunk {
    /// Format-defined state carried from one chunk to the next.
    type State: Clone + Sized;

    /// Normalize a single chunk of input to the required format into the provided `output` buffer.
    ///
    /// Parameters:
    /// - `input`: bytes to normalize.
    /// - `output`: destination buffer; it may be uninitialized on entry.
    /// - `state`: state carried over from the previous chunk, i.e. the value reported by
    ///   `NormalizeChunkResult::state` for that chunk. `Normalize::normalize` passes `None`
    ///   because it has no previous chunk.
    /// - `is_last_chunk`: set to `true` if this is the final chunk of the stream.
    ///
    /// Returns a `NormalizeChunkResult` on success which tells how many bytes were
    /// written and carries the state (if any) to pass along with the next chunk.
    ///
    /// The blanket `Normalize` implementation passes the reported `output_len` to
    /// `Vec::set_len`, so the first `output_len` bytes of `output` must have been
    /// initialized and `output_len` must not exceed `output.len()`.
    ///
    /// # Errors
    ///
    /// Returns `Err(crate::Error::OutputBufferTooSmall { required })` if `output`
    /// is too small to hold the expansion of `input`. The implementation is expected
    /// (but not required) to calculate the worst-case size without actually processing
    /// the input.
    fn normalize_chunk(
        input: &[u8],
        output: &mut [MaybeUninit<u8>],
        state: Option<&Self::State>,
        is_last_chunk: bool,
    ) -> Result<NormalizeChunkResult<Self::State>>;

    /// Returns the worst-case required output buffer size for the given `chunk_size`.
    ///
    /// `state` and `is_last_chunk` take the same values that would be passed to the
    /// matching `normalize_chunk` call. `Normalize::normalize` sizes its output
    /// buffer with this value and treats a subsequent "buffer too small" error as
    /// unreachable.
    #[must_use]
    fn max_output_size_for_chunk(
        chunk_size: usize,
        state: Option<&Self::State>,
        is_last_chunk: bool,
    ) -> usize;
}

/// This is the trait that consumers will typically use to normalize byte slices or
/// string slices to a specific format.
///
/// It is implemented automatically for every type that implements `NormalizeChunk`;
/// that implementation processes the whole input as one final chunk.
pub trait Normalize {
    /// Normalize the entire input buffer and return a newly allocated `Vec<u8>` with the result.
    #[must_use]
    fn normalize(input: &[u8]) -> Vec<u8>;

    /// Normalize the entire input string and return a newly allocated `String` with the result.
    #[must_use]
    fn normalize_str(input: &str) -> String;
}

/// Blanket implementation: every `NormalizeChunk` format gets whole-buffer
/// normalization by treating the entire input as a single, final chunk.
impl<N: NormalizeChunk> Normalize for N {
    /// Normalize `input` in one pass into a freshly allocated `Vec<u8>`.
    ///
    /// The output buffer is allocated with the worst-case size reported by
    /// `max_output_size_for_chunk`, so no reallocation happens.
    ///
    /// # Panics
    ///
    /// Panics if the format implementation misbehaves: either it reports an
    /// error despite the worst-case sized buffer, or it claims to have written
    /// more bytes than the buffer can hold.
    fn normalize(input: &[u8]) -> Vec<u8> {
        let worst_case = Self::max_output_size_for_chunk(input.len(), None, true);
        let mut output = Vec::with_capacity(worst_case);
        let status = Self::normalize_chunk(input, vec_to_uninit_mut(&mut output), None, true)
            .unwrap_or_else(|err| unreachable!("{err} (should be impossible)"));

        // `NormalizeChunk` is a safe trait, so a buggy implementation could report
        // any length. Check the one `set_len` precondition that can be verified
        // here and turn a violation into a panic rather than undefined behaviour.
        let written = status.output_len();
        assert!(
            written <= output.capacity(),
            "normalize_chunk reported {} bytes written but the buffer holds only {}",
            written,
            output.capacity()
        );

        // SAFETY: `written <= output.capacity()` was checked above. We trust that the
        // implementation of `normalize_chunk` correctly initialized the first
        // `written` bytes of the buffer it was given, as its result claims.
        unsafe {
            output.set_len(written);
        }

        output
    }

    /// Normalize `input` and return the result as a `String`.
    ///
    /// Relies on the format only inserting or removing ASCII CR/LF bytes, which
    /// keeps valid UTF-8 valid. This is verified in debug builds only.
    fn normalize_str(input: &str) -> String {
        let bytes = Self::normalize(input.as_bytes());
        debug_assert!(
            std::str::from_utf8(&bytes).is_ok(),
            "normalization produced invalid UTF-8 from valid UTF-8 input"
        );

        // SAFETY: normalize returns valid UTF-8 when given valid UTF-8 input because the
        // format only inserts or removes ASCII CR/LF bytes, which never splits or
        // alters a multi-byte sequence.
        unsafe { String::from_utf8_unchecked(bytes) }
    }
}