tpnote_lib/text_reader.rs
//! An iterator adapter that collapses CRLF (`\r\n`) sequences to LF (`\n`) in a
//! stream of bytes.
3//!
4//! # Overview
5//!
6//! This module provides [`CrlfSuppressor`], an iterator adapter to filter out
7//! CR (`\r`, 0x0D) when it is immediately followed by LF (`\n`, 0x0A), as
8//! commonly found in Windows line endings.
9//!
10//! It also provides an extension trait [`CrlfSuppressorExt`] so you can easily
11//! call `.crlf_suppressor()` on any iterator over bytes (e.g., from
12//! `BufReader::bytes()`).
13//!
14//! # Usage
15//!
16//! ## Basic example
17//!
18//! ```rust
19//! use std::io::{Cursor, Error, Read};
20//! use tpnote_lib::text_reader::CrlfSuppressorExt;
21//!
22//! let data = b"hello\r\nworld";
23//! let normalized: Result<Vec<u8>, Error> = Cursor::new(data)
24//! .bytes()
25//! .crlf_suppressor()
26//! .collect();
27//! let s = String::from_utf8(normalized.unwrap()).unwrap();
28//! assert_eq!(s, "hello\nworld");
29//! ```
30//!
31//! ## Reading from a file
32//!
33//! ```rust,no_run
34//! use std::fs::File;
35//! use tpnote_lib::text_reader::read_as_string_with_crlf_suppression;
36//!
37//! let normalized = read_as_string_with_crlf_suppression(File::open("file.txt")?)?;
38//! println!("{}", normalized);
39//! # Ok::<(), std::io::Error>(())
40//! ```
41//!
42//! # Implementation details
43//!
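//! In UTF-8, every byte of a multi-byte code point has its most significant bit
//! set: lead bytes are in the range `0xC2..=0xF4` and continuation bytes are in
//! the range `0x80..=0xBF`. Since `0x0D` and `0x0A` are both below `0x80`, they
//! can only occur as the ASCII characters CR and LF, so searching for CRLF as
//! byte values is safe.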
47//!
48//! # See also
49//!
50//! - [`BufReader::bytes`](https://doc.rust-lang.org/std/io/struct.BufReader.html#method.bytes)
51//! - [`String::from_utf8`](https://doc.rust-lang.org/std/string/struct.String.html#method.from_utf8)
52
53use std::io::{self, BufReader, Read};
54use std::iter::Peekable;
55
/// Carriage return (`\r`, 0x0D): dropped when directly followed by `LF`.
const CR: u8 = b'\r';
/// Line feed (`\n`, 0x0A): the byte that remains of a CRLF pair.
const LF: u8 = b'\n';
58
/// An iterator adapter that drops every CR (`\r`, 0x0D) byte that is
/// immediately followed by LF (`\n`, 0x0A). All other bytes, including a CR
/// that is not followed by LF, and all I/O errors are passed through
/// unchanged.
///
/// Filtering on raw bytes is safe for UTF-8 encoded input:
///
/// * Every byte of a multi-byte UTF-8 sequence has its most significant bit
///   set. Lead bytes look like `110xxxxx` (2 bytes), `1110xxxx` (3 bytes) or
///   `11110xxx` (4 bytes); continuation bytes always look like `10xxxxxx`
///   (`0x80..=0xBF`).
/// * 0x0D is `00001101` and 0x0A is `00001010`. Both are below 0x80, so they
///   never appear inside a multi-byte sequence and always denote the ASCII
///   characters CR and LF.
#[derive(Debug)]
#[must_use = "iterators are lazy and do nothing unless consumed"]
pub struct CrlfSuppressor<I: Iterator<Item = io::Result<u8>>> {
    /// The wrapped byte source. `Peekable` provides the one-item lookahead
    /// needed to decide whether a CR is followed by LF.
    iter: Peekable<I>,
}
77
78impl<I: Iterator<Item = io::Result<u8>>> CrlfSuppressor<I> {
79 /// Creates a new suppressor from an iterator over bytes.
80 /// (Preferred usage: see extension trait `CrlfSuppressorExt`).
81 ///
82 /// # Example
83 /// ```
84 /// use std::io::Cursor;
85 /// use std::io::Read;
86 /// use tpnote_lib::text_reader::CrlfSuppressor;
87 ///
88 /// let bytes = b"foo\r\nbar";
89 /// let suppressor = CrlfSuppressor::new(Cursor::new(bytes).bytes());
90 /// ```
91 /// Create a new suppressor from an iterator over bytes.
92 pub fn new(iter: I) -> Self {
93 Self {
94 iter: iter.peekable(),
95 }
96 }
97}
98
99impl<I: Iterator<Item = io::Result<u8>>> Iterator for CrlfSuppressor<I> {
100 type Item = io::Result<u8>;
101
102 fn next(&mut self) -> Option<Self::Item> {
103 match self.iter.next()? {
104 Ok(CR) => match self.iter.peek() {
105 Some(Ok(LF)) => {
106 self.iter.next(); // Consume.
107 Some(Ok(LF))
108 }
109 _ => Some(Ok(CR)),
110 },
111 Ok(byte) => Some(Ok(byte)),
112 Err(err) => Some(Err(err)),
113 }
114 }
115}
116/// Extension trait to add `.crlf_suppressor()` to any iterator over bytes.
117///
118/// # Example
119/// ```rust
120/// use std::io::{Cursor, Error, Read};
121/// use tpnote_lib::text_reader::CrlfSuppressorExt;
122///
123/// let data = b"hello\r\nworld";
124/// let normalized: Result<Vec<u8>, Error> = Cursor::new(data)
125/// .bytes()
126/// .crlf_suppressor()
127/// .collect();
128/// let s = String::from_utf8(normalized.unwrap()).unwrap();
129/// assert_eq!(s, "hello\nworld");
130/// ```
131pub trait CrlfSuppressorExt: Iterator<Item = io::Result<u8>> + Sized {
132 /// Returns an iterator that suppresses CRLF sequences.
133 fn crlf_suppressor(self) -> CrlfSuppressor<Self> {
134 CrlfSuppressor::new(self)
135 }
136}
137
138impl<T: Iterator<Item = io::Result<u8>>> CrlfSuppressorExt for T {}
139
140/// Reads all bytes from the given reader, suppressing CR (`\r`) bytes that are
141/// immediately followed by LF (`\n`).
142///
143/// This function is intended to normalize line endings by removing carriage
144/// return characters that precede line feeds (i.e., converting CRLF sequences
145/// to LF).
146///
147/// # Arguments
148///
149/// * `reader` - Any type that implements [`std::io::Read`], such as a file,
150/// buffer, or stream.
151///
152/// # Returns
153///
154/// A [`std::io::Result`] containing a `Vec<u8>` with the filtered bytes, or an
155/// error if one occurs while reading from the input.
156///
157/// # Example
158///
159/// ```rust
160/// use std::io::Cursor;
161/// use tpnote_lib::text_reader::read_with_crlf_suppression;
162///
163/// let data = b"foo\r\nbar\nbaz\r\n";
164/// let cursor = Cursor::new(data);
165/// let result = read_with_crlf_suppression(cursor).unwrap();
166/// assert_eq!(result, b"foo\nbar\nbaz\n");
167/// ```
168///
169/// # Errors
170///
171/// Returns any I/O error encountered while reading from the provided reader.
172///
173/// # See Also
174///
175/// [`std::io::Read`], [`std::fs::File`]
176pub fn read_with_crlf_suppression<R: Read>(reader: R) -> io::Result<Vec<u8>> {
177 let reader = BufReader::new(reader);
178 let filtered_bytes = reader.bytes().crlf_suppressor();
179 filtered_bytes.collect()
180}
181
182/// Reads all bytes from the given reader, suppressing CR (`\r`) bytes that are
183/// immediately followed by LF (`\n`), and returns the resulting data as a UTF-8
184/// string.
185///
186/// This function is useful for normalizing line endings (converting CRLF to LF)
187/// and reading textual data from any source that implements [`std::io::Read`].
188///
189/// # Arguments
190///
191/// * `reader` - Any type implementing [`std::io::Read`], such as a file,
192/// buffer, or stream.
193///
194/// # Returns
195///
196/// Returns an [`std::io::Result`] containing the resulting `String` if all
197/// bytes are valid UTF-8, or an error if reading fails or the data is not valid
198/// UTF-8.
199///
200/// # Errors
201///
202/// Returns an error if an I/O error occurs while reading, or if the data read
203/// is not valid UTF-8.
204///
205/// # Example
206///
207/// ```rust
208/// use std::io::Cursor;
209/// use tpnote_lib::text_reader::read_as_string_with_crlf_suppression;
210///
211/// let input = b"hello\r\nworld";
212/// let cursor = Cursor::new(input);
213/// let output = read_as_string_with_crlf_suppression(cursor).unwrap();
214/// assert_eq!(output, "hello\nworld");
215/// ```
216///
217/// # See Also
218///
219/// [`read_with_crlf_suppression`]
220pub fn read_as_string_with_crlf_suppression<R: Read>(reader: R) -> io::Result<String> {
221 let bytes = read_with_crlf_suppression(reader)?;
222 String::from_utf8(bytes).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
223}
224
/// Extension method for `String` that removes the `\r` of every `\r\n`
/// sequence. When the string contains no `\r\n`, it is handed back as is and
/// no memory allocation occurs.
///
/// ```rust
/// use tpnote_lib::text_reader::StringExt;
///
/// let s = "hello\r\nworld".to_string();
/// let res = s.crlf_suppressor_string();
/// assert_eq!("hello\nworld", res);
///
/// let s = "hello\nworld".to_string();
/// let res = s.crlf_suppressor_string();
/// assert_eq!("hello\nworld", res);
/// ```
pub trait StringExt {
    /// Consumes the string and returns it with every `\r\n` replaced by `\n`.
    fn crlf_suppressor_string(self) -> String;
}

impl StringExt for String {
    fn crlf_suppressor_string(self) -> String {
        // `\r` (0x0D) and `\n` (0x0A) are ASCII and can never be part of a
        // multi-byte UTF-8 sequence, whose bytes are all 0x80 or above, so a
        // match for "\r\n" is always a real CRLF.
        if self.contains("\r\n") {
            // Slow path: `replace` builds a new string.
            self.replace("\r\n", "\n")
        } else {
            // Fast path: return the input untouched, without allocating.
            self
        }
    }
}
259
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Feeds `input` through the suppressor and returns the result as a
    /// `String`. Panics on read errors or invalid UTF-8, which is fine for
    /// the in-memory fixtures used here.
    fn run(input: &[u8]) -> String {
        let cursor = Cursor::new(input);
        let bytes = cursor.bytes().crlf_suppressor();
        let vec: Vec<u8> = bytes.map(|b| b.unwrap()).collect();
        String::from_utf8(vec).unwrap()
    }

    #[test]
    fn test_crlf_sequence() {
        let input = b"foo\r\nbar\r\nbaz";
        let expected = "foo\nbar\nbaz";
        assert_eq!(run(input), expected);
    }

    #[test]
    fn test_lone_cr() {
        let input = b"foo\rbar";
        let expected = "foo\rbar";
        assert_eq!(run(input), expected);
    }

    #[test]
    fn test_lone_lf() {
        let input = b"foo\nbar";
        let expected = "foo\nbar";
        assert_eq!(run(input), expected);
    }

    #[test]
    fn test_mixed_endings() {
        let input = b"foo\r\nbar\rbaz\nqux";
        let expected = "foo\nbar\rbaz\nqux";
        assert_eq!(run(input), expected);
    }

    #[test]
    fn test_empty_input() {
        let input = b"";
        let expected = "";
        assert_eq!(run(input), expected);
    }

    #[test]
    fn test_only_crlf() {
        let input = b"\r\n";
        let expected = "\n";
        assert_eq!(run(input), expected);
    }

    #[test]
    fn test_only_cr() {
        let input = b"\r";
        let expected = "\r";
        assert_eq!(run(input), expected);
    }

    #[test]
    fn test_only_lf() {
        let input = b"\n";
        let expected = "\n";
        assert_eq!(run(input), expected);
    }

    #[test]
    fn test_trailing_cr() {
        let input = b"foo\r";
        let expected = "foo\r";
        assert_eq!(run(input), expected);
    }

    #[test]
    fn test_trailing_crlf() {
        let input = b"foo\r\n";
        let expected = "foo\n";
        assert_eq!(run(input), expected);
    }

    #[test]
    fn test_consecutive_cr_before_lf() {
        // Only the CR directly in front of the LF is dropped.
        let input = b"foo\r\r\nbar";
        let expected = "foo\r\nbar";
        assert_eq!(run(input), expected);
    }

    #[test]
    fn test_multibyte_utf8_is_untouched() {
        let input = "grüße\r\n€uro".as_bytes();
        let expected = "grüße\n€uro";
        assert_eq!(run(input), expected);
    }

    #[test]
    fn test_error_is_forwarded() {
        // An error right after a CR must neither swallow the CR nor the
        // error, and iteration continues behind it.
        let source: Vec<io::Result<u8>> = vec![
            Ok(b'a'),
            Ok(CR),
            Err(io::Error::from(io::ErrorKind::UnexpectedEof)),
            Ok(LF),
        ];
        let mut bytes = source.into_iter().crlf_suppressor();
        assert!(matches!(bytes.next(), Some(Ok(b'a'))));
        assert!(matches!(bytes.next(), Some(Ok(CR))));
        assert!(matches!(bytes.next(), Some(Err(_))));
        assert!(matches!(bytes.next(), Some(Ok(LF))));
        assert!(bytes.next().is_none());
    }

    #[test]
    fn test_read_as_string_rejects_invalid_utf8() {
        let input: &[u8] = &[0xFF, b'\r', b'\n'];
        let err = read_as_string_with_crlf_suppression(Cursor::new(input)).unwrap_err();
        assert_eq!(err.kind(), io::ErrorKind::InvalidData);
    }

    #[test]
    fn test_crlf_suppressor_string() {
        // Compare thin data pointers obtained with `as_ptr()`. Comparing
        // `*const str` wide pointers would also compare the lengths, so the
        // "not equal" check below would pass for strings of different length
        // even if the buffer had been reused.
        let s = "hello\r\nworld".to_string();
        let s_addr = s.as_ptr();
        let res = s.crlf_suppressor_string();
        assert_eq!("hello\nworld", res);
        // Memory allocation occurred.
        assert_ne!(s_addr, res.as_ptr());

        let s = "hello\nworld".to_string();
        let s_addr = s.as_ptr();
        let res = s.crlf_suppressor_string();
        assert_eq!("hello\nworld", res);
        // No memory allocation here:
        assert_eq!(s_addr, res.as_ptr());
    }
}
360}