1use std::sync::LazyLock;
4
5use bytes::{Buf, BufMut, BytesMut};
6
7use crate::{line_writer::LineBreak, util::fill_buffer};
8
9static RE: LazyLock<regex::bytes::Regex> =
10 LazyLock::new(|| regex::bytes::Regex::new(r"(\r\n|\n)").expect("valid regex"));
11
12pub struct NormalizedReader<R>
14where
15 R: std::io::Read,
16{
17 line_break: LineBreak,
18 source: R,
19 in_buffer: [u8; BUF_SIZE / 2],
20 replaced: BytesMut,
21 is_done: bool,
22}
23
/// Initial capacity (1 KiB) of the normalized output buffer; the raw input
/// chunk is half this size.
// NOTE(review): presumably the 2:1 ratio leaves room for `\n` -> `\r\n`
// expansion without reallocating -- confirm.
const BUF_SIZE: usize = 1 << 10;
25impl<R: std::io::Read> NormalizedReader<R> {
26 pub fn new(source: R, line_break: LineBreak) -> Self {
27 Self {
28 source,
29 line_break,
30 in_buffer: [0u8; BUF_SIZE / 2],
31 replaced: BytesMut::with_capacity(BUF_SIZE),
32 is_done: false,
33 }
34 }
35
36 fn fill_buffer(&mut self) -> std::io::Result<()> {
40 let last_char = self.in_buffer[self.in_buffer.len() - 1];
47
48 let read = fill_buffer(&mut self.source, &mut self.in_buffer, None)?;
49 if read < self.in_buffer.len() {
50 self.is_done = true;
53 }
54
55 self.cleanup_buffer(read, last_char);
56 Ok(())
57 }
58
59 fn cleanup_buffer(&mut self, read: usize, last_char: u8) {
61 const CR: u8 = b'\r';
62 const LF: u8 = b'\n';
63
64 self.replaced.clear();
65 let mut start = 0;
66 let mut end = read;
67
68 if read == self.in_buffer.len() && self.in_buffer[self.in_buffer.len() - 1] == CR {
70 end = read - 1;
73 }
74
75 let edge_case = [last_char, self.in_buffer[0]];
77 match (edge_case, read > 0) {
78 ([CR, LF], true) => {
79 let res = RE.replace_all(&edge_case, self.line_break.as_ref());
81 self.replaced.extend_from_slice(&res);
82
83 start = 1;
85 }
86 ([CR, _], _) => {
87 self.replaced.put_u8(CR);
89 }
90 _ => {}
91 }
92
93 let res = RE.replace_all(&self.in_buffer[start..end], self.line_break.as_ref());
95 self.replaced.extend_from_slice(&res);
97 }
98}
99
100impl<R: std::io::Read> std::io::Read for NormalizedReader<R> {
101 fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
102 if !self.replaced.has_remaining() {
103 if self.is_done {
104 return Ok(0);
105 }
106 self.fill_buffer()?;
107 }
108
109 let to_write = self.replaced.remaining().min(buf.len());
110 self.replaced.copy_to_slice(&mut buf[..to_write]);
111 Ok(to_write)
112 }
113}
114
115pub(crate) fn normalize_lines(s: &str, line_break: LineBreak) -> std::borrow::Cow<'_, str> {
116 let bytes = RE.replace_all(s.as_bytes(), line_break.as_ref());
117 match bytes {
118 std::borrow::Cow::Borrowed(bytes) => {
119 std::borrow::Cow::Borrowed(std::str::from_utf8(bytes).expect("valid bytes in"))
120 }
121 std::borrow::Cow::Owned(bytes) => {
122 std::borrow::Cow::Owned(std::string::String::from_utf8(bytes).expect("valid bytes in"))
123 }
124 }
125}
126
#[cfg(test)]
mod tests {
    use std::io::Read;

    use rand::{Rng, SeedableRng};
    use rand_chacha::ChaCha8Rng;

    use super::*;
    use crate::util::test::{check_strings, random_string, ChaosReader};

    /// Fixture mixing lone `\n`, lone `\r`, and `\r\n` sequences.
    const MIXED_INPUT: &str = "This is a string \n with \r some \n\r\n random newlines\r\r\n\n";

    /// Drains a `NormalizedReader` over `source` into a `String`.
    fn read_normalized<S: Read>(source: S, line_break: LineBreak) -> String {
        let mut collected = String::new();
        NormalizedReader::new(source, line_break)
            .read_to_string(&mut collected)
            .unwrap();
        collected
    }

    /// `\r\n` collapses to `\n`; lone `\r` is left alone.
    #[test]
    fn reader_normalized_lf() {
        let out = read_normalized(MIXED_INPUT.as_bytes(), LineBreak::Lf);

        check_strings(
            out,
            "This is a string \n with \r some \n\n random newlines\r\n\n",
        );
    }

    /// Both `\n` and `\r\n` become a single `\r`.
    #[test]
    fn reader_normalized_cr() {
        let out = read_normalized(MIXED_INPUT.as_bytes(), LineBreak::Cr);

        check_strings(
            out,
            "This is a string \r with \r some \r\r random newlines\r\r\r",
        );
    }

    /// Lone `\n` expands to `\r\n`; existing `\r\n` stays as is.
    #[test]
    fn reader_normalized_crlf_fixed() {
        let out = read_normalized(MIXED_INPUT.as_bytes(), LineBreak::Crlf);

        check_strings(
            "This is a string \r\n with \r some \r\n\r\n random newlines\r\r\n\r\n",
            out,
        );
    }

    /// The streaming reader must agree with `normalize_lines` on random
    /// input delivered through a `ChaosReader`.
    #[test]
    fn reader_normalized_crlf_random() {
        let mut rng = ChaCha8Rng::seed_from_u64(1);

        for _ in 0..100 {
            let size = rng.gen_range(1..10000);
            let input = random_string(&mut rng, size);
            let reader = ChaosReader::new(&mut rng, input.clone());

            let out = read_normalized(reader, LineBreak::Crlf);

            let normalized_input = normalize_lines(&input, LineBreak::Crlf);
            check_strings(&normalized_input, out);
        }
    }

    /// Round-trips LF -> CRLF -> LF. With 512-byte chunks the CRLF form puts
    /// a `\r` in the last slot of a chunk and its `\n` at the start of the
    /// next one, exercising the boundary handling.
    #[test]
    fn reader_normalized_crlf_then_lf_edge_case() {
        let input_string = "a \n ".repeat(512);

        let out_crlf = read_normalized(input_string.as_bytes(), LineBreak::Crlf);
        let reverted = read_normalized(out_crlf.as_bytes(), LineBreak::Lf);

        check_strings(input_string, reverted);
    }
}