1use std::sync::LazyLock;
4
5use bytes::{Buf, BytesMut};
6
7use crate::{line_writer::LineBreak, util::fill_buffer};
8
9static RE: LazyLock<regex::bytes::Regex> =
10 LazyLock::new(|| regex::bytes::Regex::new(r"(\r\n?|\n)").expect("valid regex"));
11
12pub struct NormalizedReader<R>
14where
15 R: std::io::Read,
16{
17 line_break: LineBreak,
18 source: R,
19 in_buffer: [u8; BUF_SIZE / 2],
20 replaced: BytesMut,
21 is_done: bool,
22}
23
24const BUF_SIZE: usize = 1024;
25impl<R: std::io::Read> NormalizedReader<R> {
26 pub fn new(source: R, line_break: LineBreak) -> Self {
27 Self {
28 source,
29 line_break,
30 in_buffer: [0u8; BUF_SIZE / 2],
31 replaced: BytesMut::with_capacity(BUF_SIZE),
32 is_done: false,
33 }
34 }
35
36 fn fill_buffer(&mut self) -> std::io::Result<()> {
38 let last_was_cr = self.in_buffer[self.in_buffer.len() - 1] == b'\r';
41 let read = fill_buffer(&mut self.source, &mut self.in_buffer, None)?;
42 if read == 0 {
43 self.is_done = true;
44 }
45
46 let first_is_lf = self.in_buffer[0] == b'\n';
47
48 self.cleanup_buffer(read, last_was_cr && first_is_lf);
49 Ok(())
50 }
51
52 fn cleanup_buffer(&mut self, read: usize, have_split_crlf: bool) {
54 let in_buffer = if have_split_crlf && read > 0 {
55 &self.in_buffer[1..read]
57 } else {
58 &self.in_buffer[..read]
59 };
60
61 let res = RE.replace_all(in_buffer, self.line_break.as_ref());
62 self.replaced.clear();
63 self.replaced.extend_from_slice(&res);
64 }
65}
66
67impl<R: std::io::Read> std::io::Read for NormalizedReader<R> {
68 fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
69 if !self.replaced.has_remaining() {
70 if self.is_done {
71 return Ok(0);
72 }
73 self.fill_buffer()?;
74 }
75
76 let to_write = self.replaced.remaining().min(buf.len());
77 self.replaced.copy_to_slice(&mut buf[..to_write]);
78 Ok(to_write)
79 }
80}
81
82pub(crate) fn normalize_lines(s: &str, line_break: LineBreak) -> std::borrow::Cow<'_, str> {
83 let bytes = RE.replace_all(s.as_bytes(), line_break.as_ref());
84 match bytes {
85 std::borrow::Cow::Borrowed(bytes) => {
86 std::borrow::Cow::Borrowed(std::str::from_utf8(bytes).expect("valid bytes in"))
87 }
88 std::borrow::Cow::Owned(bytes) => {
89 std::borrow::Cow::Owned(std::string::String::from_utf8(bytes).expect("valid bytes in"))
90 }
91 }
92}
93
#[cfg(test)]
mod tests {
    use std::io::Read;

    use rand::{Rng, SeedableRng};
    use rand_chacha::ChaCha8Rng;

    use super::*;
    use crate::util::test::{check_strings, random_string, ChaosReader};

    /// Fixture mixing `\n`, `\r` and `\r\n` line breaks, including adjacent ones.
    const MIXED_BREAKS: &str = "This is a string \n with \r some \n\r\n random newlines\r\r\n\n";

    /// Reads `source` to the end through a `NormalizedReader` configured with
    /// `line_break` and returns the collected text.
    fn read_normalized<S: Read>(source: S, line_break: LineBreak) -> String {
        let mut out = String::new();
        NormalizedReader::new(source, line_break)
            .read_to_string(&mut out)
            .unwrap();
        out
    }

    #[test]
    fn reader_normalized_lf() {
        let out = read_normalized(&mut MIXED_BREAKS.as_bytes(), LineBreak::Lf);

        check_strings(
            out,
            "This is a string \n with \n some \n\n random newlines\n\n\n",
        );
    }

    #[test]
    fn reader_normalized_cr() {
        let out = read_normalized(&mut MIXED_BREAKS.as_bytes(), LineBreak::Cr);

        check_strings(
            out,
            "This is a string \r with \r some \r\r random newlines\r\r\r",
        );
    }

    #[test]
    fn reader_normalized_crlf_fixed() {
        let out = read_normalized(&mut MIXED_BREAKS.as_bytes(), LineBreak::Crlf);

        check_strings(
            "This is a string \r\n with \r\n some \r\n\r\n random newlines\r\n\r\n\r\n",
            out,
        );
    }

    /// Streams random inputs through the reader and compares the result with
    /// `normalize_lines` applied to the whole input at once.
    #[test]
    fn reader_normalized_crlf_random() {
        let mut rng = ChaCha8Rng::seed_from_u64(1);

        for _ in 0..100 {
            let size = rng.gen_range(1..10000);
            let input = random_string(&mut rng, size);
            let reader = ChaosReader::new(&mut rng, input.clone());

            let out = read_normalized(reader, LineBreak::Crlf);

            let normalized_input = normalize_lines(&input, LineBreak::Crlf);
            check_strings(&normalized_input, out);
        }
    }
}