pgp/
normalize_lines.rs

1//! # Line ending normalization module
2
3use std::sync::LazyLock;
4
5use bytes::{Buf, BytesMut};
6
7use crate::{line_writer::LineBreak, util::fill_buffer};
8
9static RE: LazyLock<regex::bytes::Regex> =
10    LazyLock::new(|| regex::bytes::Regex::new(r"(\r\n?|\n)").expect("valid regex"));
11
12/// This struct wraps a reader and normalize line endings.
13pub struct NormalizedReader<R>
14where
15    R: std::io::Read,
16{
17    line_break: LineBreak,
18    source: R,
19    in_buffer: [u8; BUF_SIZE / 2],
20    replaced: BytesMut,
21    is_done: bool,
22}
23
/// Initial capacity, in bytes, of the normalized output buffer (1 KiB).
///
/// Raw input is read in chunks of half this size.
const BUF_SIZE: usize = 1 << 10;
25impl<R: std::io::Read> NormalizedReader<R> {
26    pub fn new(source: R, line_break: LineBreak) -> Self {
27        Self {
28            source,
29            line_break,
30            in_buffer: [0u8; BUF_SIZE / 2],
31            replaced: BytesMut::with_capacity(BUF_SIZE),
32            is_done: false,
33        }
34    }
35
36    /// Fills the buffer, and then normalizes it
37    fn fill_buffer(&mut self) -> std::io::Result<()> {
38        // edge case, if the last byte of the previous buffer was `\r` and the first of the new is `\n`
39        // we need to make sure to correctly handle it.
40        let last_was_cr = self.in_buffer[self.in_buffer.len() - 1] == b'\r';
41        let read = fill_buffer(&mut self.source, &mut self.in_buffer, None)?;
42        if read == 0 {
43            self.is_done = true;
44        }
45
46        let first_is_lf = self.in_buffer[0] == b'\n';
47
48        self.cleanup_buffer(read, last_was_cr && first_is_lf);
49        Ok(())
50    }
51
52    /// Normalizes the line endings in the current buffer
53    fn cleanup_buffer(&mut self, read: usize, have_split_crlf: bool) {
54        let in_buffer = if have_split_crlf && read > 0 {
55            // skip the first byte of the buffer, which is a `\n` as it was already handled before
56            &self.in_buffer[1..read]
57        } else {
58            &self.in_buffer[..read]
59        };
60
61        let res = RE.replace_all(in_buffer, self.line_break.as_ref());
62        self.replaced.clear();
63        self.replaced.extend_from_slice(&res);
64    }
65}
66
67impl<R: std::io::Read> std::io::Read for NormalizedReader<R> {
68    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
69        if !self.replaced.has_remaining() {
70            if self.is_done {
71                return Ok(0);
72            }
73            self.fill_buffer()?;
74        }
75
76        let to_write = self.replaced.remaining().min(buf.len());
77        self.replaced.copy_to_slice(&mut buf[..to_write]);
78        Ok(to_write)
79    }
80}
81
82pub(crate) fn normalize_lines(s: &str, line_break: LineBreak) -> std::borrow::Cow<'_, str> {
83    let bytes = RE.replace_all(s.as_bytes(), line_break.as_ref());
84    match bytes {
85        std::borrow::Cow::Borrowed(bytes) => {
86            std::borrow::Cow::Borrowed(std::str::from_utf8(bytes).expect("valid bytes in"))
87        }
88        std::borrow::Cow::Owned(bytes) => {
89            std::borrow::Cow::Owned(std::string::String::from_utf8(bytes).expect("valid bytes in"))
90        }
91    }
92}
93
94// tests
#[cfg(test)]
mod tests {
    use std::io::Read;

    use rand::{Rng, SeedableRng};
    use rand_chacha::ChaCha8Rng;

    use super::*;
    use crate::util::test::{check_strings, random_string, ChaosReader};

    /// Fixture mixing lone `\n`, lone `\r`, `\r\n` and runs of adjacent line endings.
    const MIXED_INPUT: &str = "This is a string \n with \r some \n\r\n random newlines\r\r\n\n";

    /// Drains `source` through a `NormalizedReader` and returns everything it produced.
    fn read_normalized<S: Read>(source: S, line_break: LineBreak) -> String {
        let mut collected = String::new();
        NormalizedReader::new(source, line_break)
            .read_to_string(&mut collected)
            .unwrap();
        collected
    }

    #[test]
    fn reader_normalized_lf() {
        let out = read_normalized(MIXED_INPUT.as_bytes(), LineBreak::Lf);

        check_strings(
            out,
            "This is a string \n with \n some \n\n random newlines\n\n\n",
        );
    }

    #[test]
    fn reader_normalized_cr() {
        let out = read_normalized(MIXED_INPUT.as_bytes(), LineBreak::Cr);

        check_strings(
            out,
            "This is a string \r with \r some \r\r random newlines\r\r\r",
        );
    }

    #[test]
    fn reader_normalized_crlf_fixed() {
        let out = read_normalized(MIXED_INPUT.as_bytes(), LineBreak::Crlf);

        check_strings(
            "This is a string \r\n with \r\n some \r\n\r\n random newlines\r\n\r\n\r\n",
            out,
        );
    }

    /// The streaming reader must agree with the one-shot `normalize_lines` on
    /// random inputs delivered through a `ChaosReader`.
    #[test]
    fn reader_normalized_crlf_random() {
        let mut rng = ChaCha8Rng::seed_from_u64(1);

        for _ in 0..100 {
            // The order of RNG draws is kept fixed so the seeded run stays reproducible.
            let size = rng.gen_range(1..10000);
            let input = random_string(&mut rng, size);
            let reader = ChaosReader::new(&mut rng, input.clone());

            let out = read_normalized(reader, LineBreak::Crlf);

            let expected = normalize_lines(&input, LineBreak::Crlf);
            check_strings(&expected, out);
        }
    }
}