Skip to main content

pgp/
normalize_lines.rs

1//! # Line ending normalization module
2
3use std::sync::LazyLock;
4
5use bytes::{Buf, BufMut, BytesMut};
6
7use crate::{line_writer::LineBreak, util::fill_buffer};
8
9static RE: LazyLock<regex::bytes::Regex> =
10    LazyLock::new(|| regex::bytes::Regex::new(r"(\r\n|\n)").expect("valid regex"));
11
12/// This struct wraps a reader and normalize line endings.
13pub struct NormalizedReader<R>
14where
15    R: std::io::Read,
16{
17    line_break: LineBreak,
18    source: R,
19    in_buffer: [u8; BUF_SIZE / 2],
20    replaced: BytesMut,
21    is_done: bool,
22}
23
/// Size, in bytes, used to dimension the reader's buffers (1 KiB).
const BUF_SIZE: usize = 1 << 10;
25impl<R: std::io::Read> NormalizedReader<R> {
26    pub fn new(source: R, line_break: LineBreak) -> Self {
27        Self {
28            source,
29            line_break,
30            in_buffer: [0u8; BUF_SIZE / 2],
31            replaced: BytesMut::with_capacity(BUF_SIZE),
32            is_done: false,
33        }
34    }
35
36    /// Fills the buffer, and then normalizes it.
37    ///
38    /// This is only ever called when `self.replaced` has no more data left to consume.
39    fn fill_buffer(&mut self) -> std::io::Result<()> {
40        // edge case: if the last byte of the previous read was `\r` and the first of the new read
41        // is `\n` we need to make sure to handle it correctly.
42
43        // the previous read was guaranteed to have filled up the buffer (otherwise we would have
44        // switched to `self.is_done`, so this is the last byte of the previous read.
45        // If this is a CR, it wasn’t handled in the previous call.
46        let last_char = self.in_buffer[self.in_buffer.len() - 1];
47
48        let read = fill_buffer(&mut self.source, &mut self.in_buffer, None)?;
49        if read < self.in_buffer.len() {
50            // When `crate::util::fill_buffer` returns the buffer not fully filled,
51            // the underlying reader is guaranteed to be empty -> we're done.
52            self.is_done = true;
53        }
54
55        self.cleanup_buffer(read, last_char);
56        Ok(())
57    }
58
59    /// Normalizes the line endings in the current buffer
60    fn cleanup_buffer(&mut self, read: usize, last_char: u8) {
61        const CR: u8 = b'\r';
62        const LF: u8 = b'\n';
63
64        self.replaced.clear();
65        let mut start = 0;
66        let mut end = read;
67
68        // Did this read fill up `self.in_buffer` and end with a CR?
69        if read == self.in_buffer.len() && self.in_buffer[self.in_buffer.len() - 1] == CR {
70            // The next boundary could be an edge case, so we are excluding the last byte of this
71            // read in this round of processing.
72            end = read - 1;
73        }
74
75        // Handle edge case where the last byte of the previous buffer was `\r`.
76        let edge_case = [last_char, self.in_buffer[0]];
77        match (edge_case, read > 0) {
78            ([CR, LF], true) => {
79                // Edge case, we need to normalize this pair of bytes separately.
80                let res = RE.replace_all(&edge_case, self.line_break.as_ref());
81                self.replaced.extend_from_slice(&res);
82
83                // We already processed the leading LF in `self.in_buffer`,  so it’s omitted from the final normalization step.
84                start = 1;
85            }
86            ([CR, _], _) => {
87                // The last `\r` was not included and normalization is not needed.
88                self.replaced.put_u8(CR);
89            }
90            _ => {}
91        }
92
93        // Normalize the remaining part of the buffer ...
94        let res = RE.replace_all(&self.in_buffer[start..end], self.line_break.as_ref());
95        // ... and copy it into `self.replaced`.
96        self.replaced.extend_from_slice(&res);
97    }
98}
99
100impl<R: std::io::Read> std::io::Read for NormalizedReader<R> {
101    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
102        if !self.replaced.has_remaining() {
103            if self.is_done {
104                return Ok(0);
105            }
106            self.fill_buffer()?;
107        }
108
109        let to_write = self.replaced.remaining().min(buf.len());
110        self.replaced.copy_to_slice(&mut buf[..to_write]);
111        Ok(to_write)
112    }
113}
114
115pub(crate) fn normalize_lines(s: &str, line_break: LineBreak) -> std::borrow::Cow<'_, str> {
116    let bytes = RE.replace_all(s.as_bytes(), line_break.as_ref());
117    match bytes {
118        std::borrow::Cow::Borrowed(bytes) => {
119            std::borrow::Cow::Borrowed(std::str::from_utf8(bytes).expect("valid bytes in"))
120        }
121        std::borrow::Cow::Owned(bytes) => {
122            std::borrow::Cow::Owned(std::string::String::from_utf8(bytes).expect("valid bytes in"))
123        }
124    }
125}
126
127// tests
#[cfg(test)]
mod tests {
    use std::io::Read;

    use rand::{Rng, SeedableRng};
    use rand_chacha::ChaCha8Rng;

    use super::*;
    use crate::util::test::{check_strings, random_string, ChaosReader};

    /// Input mixing bare `\n`, bare `\r` and `\r\n` sequences.
    const MIXED: &str = "This is a string \n with \r some \n\r\n random newlines\r\r\n\n";

    /// Drains `source` through a [`NormalizedReader`] and returns everything it produced.
    fn normalize_all<S: Read>(source: S, line_break: LineBreak) -> String {
        let mut collected = String::new();
        NormalizedReader::new(source, line_break)
            .read_to_string(&mut collected)
            .unwrap();
        collected
    }

    /// `\r\n` collapses to `\n`, lone `\r` survives.
    #[test]
    fn reader_normalized_lf() {
        let out = normalize_all(MIXED.as_bytes(), LineBreak::Lf);

        check_strings(
            out,
            "This is a string \n with \r some \n\n random newlines\r\n\n",
        );
    }

    /// Both `\r\n` and `\n` become a single `\r`.
    #[test]
    fn reader_normalized_cr() {
        let out = normalize_all(MIXED.as_bytes(), LineBreak::Cr);

        check_strings(
            out,
            "This is a string \r with \r some \r\r random newlines\r\r\r",
        );
    }

    /// Bare `\n` is expanded to `\r\n`, existing `\r\n` pairs are kept.
    #[test]
    fn reader_normalized_crlf_fixed() {
        let out = normalize_all(MIXED.as_bytes(), LineBreak::Crlf);

        check_strings(
            "This is a string \r\n with \r some \r\n\r\n random newlines\r\r\n\r\n",
            out,
        );
    }

    /// The streaming reader must agree with the one-shot `normalize_lines` on random input,
    /// regardless of how the underlying reader chunks its data.
    #[test]
    fn reader_normalized_crlf_random() {
        let mut rng = ChaCha8Rng::seed_from_u64(1);

        for _ in 0..100 {
            let size = rng.gen_range(1..10000);
            let input = random_string(&mut rng, size);
            let out = normalize_all(ChaosReader::new(&mut rng, input.clone()), LineBreak::Crlf);

            let normalized_input = normalize_lines(&input, LineBreak::Crlf);
            check_strings(&normalized_input, out);
        }
    }

    /// Round trip LF -> CRLF -> LF over an input long enough that a `\r\n` pair ends up split
    /// across two internal reads.
    #[test]
    fn reader_normalized_crlf_then_lf_edge_case() {
        let input_string = "a \n ".repeat(512);

        let out_crlf = normalize_all(input_string.as_bytes(), LineBreak::Crlf);
        let reverted = normalize_all(out_crlf.as_bytes(), LineBreak::Lf);

        check_strings(input_string, reverted);
    }
}