tpnote_lib/
text_reader.rs

1//! An iterator adapter to suppress CRLF (`\r\n`) sequences in a stream of
2//! bytes.
3//!
4//! # Overview
5//!
6//! This module provides [`CrlfSuppressor`], an iterator adapter to filter out
7//! CR (`\r`, 0x0D) when it is immediately followed by LF (`\n`, 0x0A), as
8//! commonly found in Windows line endings.
9//!
10//! It also provides an extension trait [`CrlfSuppressorExt`] so you can easily
11//! call `.crlf_suppressor()` on any iterator over bytes (e.g., from
12//! `BufReader::bytes()`).
13//!
14//! # Usage
15//!
16//! ## Basic example
17//!
18//! ```rust
19//! use std::io::{Cursor, Error, Read};
20//! use tpnote_lib::text_reader::CrlfSuppressorExt;
21//!
22//! let data = b"hello\r\nworld";
23//! let normalized: Result<Vec<u8>, Error> = Cursor::new(data)
24//!     .bytes()
25//!     .crlf_suppressor()
26//!     .collect();
27//! let s = String::from_utf8(normalized.unwrap()).unwrap();
28//! assert_eq!(s, "hello\nworld");
29//! ```
30//!
31//! ## Reading from a file
32//!
33//! ```rust,no_run
34//! use std::fs::File;
35//! use tpnote_lib::text_reader::read_as_string_with_crlf_suppression;
36//!
37//! let normalized = read_as_string_with_crlf_suppression(File::open("file.txt")?)?;
38//! println!("{}", normalized);
39//! # Ok::<(), std::io::Error>(())
40//! ```
41//!
42//! # Implementation details
43//!
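//! In UTF-8, every byte of a multi-byte code point has its high bit set: lead
//! bytes are in the range `0xC2..=0xF4` and continuation bytes are in the range
//! `0x80..=0xBF`. Since `0x0D` and `0x0A` are below `0x80`, they never occur
//! inside a multi-byte sequence, so searching for CRLF as byte values is safe.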
47//!
48//! # See also
49//!
50//! - [`BufReader::bytes`](https://doc.rust-lang.org/std/io/struct.BufReader.html#method.bytes)
51//! - [`String::from_utf8`](https://doc.rust-lang.org/std/string/struct.String.html#method.from_utf8)
52
53use std::io::{self, BufReader, Read};
54use std::iter::Peekable;
55
const CR: u8 = 0x0D; // Carriage Return.
const LF: u8 = 0x0A; // Line Feed.

/// An iterator adapter that drops a CR (`\r`, 0x0D) whenever it is immediately
/// followed by LF (`\n`, 0x0A), so that every CRLF pair is emitted as a single
/// LF.
///
/// A CR that is not directly followed by LF (including a CR at the very end of
/// the stream) is passed through unchanged, and so is every `Err` item of the
/// wrapped iterator.
///
/// # Why scanning raw bytes is safe
///
/// In valid UTF-8 every byte of a multi-byte code point has its high bit set:
/// lead bytes are in the range `0xC2..=0xF4` and continuation bytes are in the
/// range `0x80..=0xBF`. `0x0D` and `0x0A` are both below `0x80`, so they can
/// never be part of a multi-byte sequence and can be searched for directly in
/// the byte stream.
#[must_use = "iterators are lazy and do nothing unless consumed"]
pub struct CrlfSuppressor<I: Iterator<Item = io::Result<u8>>> {
    /// The wrapped byte stream; `Peekable` provides the one byte of lookahead
    /// needed to decide whether a CR is followed by LF.
    iter: Peekable<I>,
}

impl<I: Iterator<Item = io::Result<u8>>> CrlfSuppressor<I> {
    /// Creates a new suppressor from an iterator over bytes.
    /// (Preferred usage: see extension trait `CrlfSuppressorExt`).
    ///
    /// # Example
    /// ```
    /// use std::io::Cursor;
    /// use std::io::Read;
    /// use tpnote_lib::text_reader::CrlfSuppressor;
    ///
    /// let bytes = b"foo\r\nbar";
    /// let suppressor = CrlfSuppressor::new(Cursor::new(bytes).bytes());
    /// let normalized: Vec<u8> = suppressor.map(|b| b.unwrap()).collect();
    /// assert_eq!(normalized, b"foo\nbar");
    /// ```
    pub fn new(iter: I) -> Self {
        Self {
            iter: iter.peekable(),
        }
    }
}

impl<I: Iterator<Item = io::Result<u8>>> Iterator for CrlfSuppressor<I> {
    type Item = io::Result<u8>;

    /// Returns the next byte of the stream, replacing a CR that is directly
    /// followed by LF with that LF. Errors are forwarded as they appear.
    fn next(&mut self) -> Option<Self::Item> {
        match self.iter.next()? {
            // Only swallow the CR when the very next item is a successfully
            // read LF; in every other case (other byte, error, end of stream)
            // the lookahead item stays in place and the CR is emitted as is.
            Ok(CR) => Some(
                self.iter
                    .next_if(|following| matches!(following, Ok(LF)))
                    .unwrap_or(Ok(CR)),
            ),
            other => Some(other),
        }
    }

    /// Derives the bounds from the wrapped iterator: each emitted item
    /// consumes at most two input items (one CRLF pair), so at least half of
    /// the remaining input (rounded up) is still emitted, and never more than
    /// the remaining input.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (lower, upper) = self.iter.size_hint();
        (lower / 2 + lower % 2, upper)
    }
}
116/// Extension trait to add `.crlf_suppressor()` to any iterator over bytes.
117///
118/// # Example
119/// ```rust
120/// use std::io::{Cursor, Error, Read};
121/// use tpnote_lib::text_reader::CrlfSuppressorExt;
122///
123/// let data = b"hello\r\nworld";
124/// let normalized: Result<Vec<u8>, Error> = Cursor::new(data)
125///     .bytes()
126///     .crlf_suppressor()
127///     .collect();
128/// let s = String::from_utf8(normalized.unwrap()).unwrap();
129/// assert_eq!(s, "hello\nworld");
130/// ```
131pub trait CrlfSuppressorExt: Iterator<Item = io::Result<u8>> + Sized {
132    /// Returns an iterator that suppresses CRLF sequences.
133    fn crlf_suppressor(self) -> CrlfSuppressor<Self> {
134        CrlfSuppressor::new(self)
135    }
136}
137
138impl<T: Iterator<Item = io::Result<u8>>> CrlfSuppressorExt for T {}
139
140/// Reads all bytes from the given reader, suppressing CR (`\r`) bytes that are
141/// immediately followed by LF (`\n`).
142///
143/// This function is intended to normalize line endings by removing carriage
144/// return characters that precede line feeds (i.e., converting CRLF sequences
145/// to LF).
146///
147/// # Arguments
148///
149/// * `reader` - Any type that implements [`std::io::Read`], such as a file,
150///   buffer, or stream.
151///
152/// # Returns
153///
154/// A [`std::io::Result`] containing a `Vec<u8>` with the filtered bytes, or an
155/// error if one occurs while reading from the input.
156///
157/// # Example
158///
159/// ```rust
160/// use std::io::Cursor;
161/// use tpnote_lib::text_reader::read_with_crlf_suppression;
162///
163/// let data = b"foo\r\nbar\nbaz\r\n";
164/// let cursor = Cursor::new(data);
165/// let result = read_with_crlf_suppression(cursor).unwrap();
166/// assert_eq!(result, b"foo\nbar\nbaz\n");
167/// ```
168///
169/// # Errors
170///
171/// Returns any I/O error encountered while reading from the provided reader.
172///
173/// # See Also
174///
175/// [`std::io::Read`], [`std::fs::File`]
176pub fn read_with_crlf_suppression<R: Read>(reader: R) -> io::Result<Vec<u8>> {
177    let reader = BufReader::new(reader);
178    let filtered_bytes = reader.bytes().crlf_suppressor();
179    filtered_bytes.collect()
180}
181
182/// Reads all bytes from the given reader, suppressing CR (`\r`) bytes that are
183/// immediately followed by LF (`\n`), and returns the resulting data as a UTF-8
184/// string.
185///
186/// This function is useful for normalizing line endings (converting CRLF to LF)
187/// and reading textual data from any source that implements [`std::io::Read`].
188///
189/// # Arguments
190///
191/// * `reader` - Any type implementing [`std::io::Read`], such as a file,
192///   buffer, or stream.
193///
194/// # Returns
195///
196/// Returns an [`std::io::Result`] containing the resulting `String` if all
197/// bytes are valid UTF-8, or an error if reading fails or the data is not valid
198/// UTF-8.
199///
200/// # Errors
201///
202/// Returns an error if an I/O error occurs while reading, or if the data read
203/// is not valid UTF-8.
204///
205/// # Example
206///
207/// ```rust
208/// use std::io::Cursor;
209/// use tpnote_lib::text_reader::read_as_string_with_crlf_suppression;
210///
211/// let input = b"hello\r\nworld";
212/// let cursor = Cursor::new(input);
213/// let output = read_as_string_with_crlf_suppression(cursor).unwrap();
214/// assert_eq!(output, "hello\nworld");
215/// ```
216///
217/// # See Also
218///
219/// [`read_with_crlf_suppression`]
220pub fn read_as_string_with_crlf_suppression<R: Read>(reader: R) -> io::Result<String> {
221    let bytes = read_with_crlf_suppression(reader)?;
222    String::from_utf8(bytes).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
223}
224
/// Extension method for `String` that replaces every `\r\n` sequence with
/// `\n`. A string without any `\r\n` is handed back as is, without allocating.
///
/// ```rust
/// use tpnote_lib::text_reader::StringExt;
///
/// let s = "hello\r\nworld".to_string();
/// let res = s.crlf_suppressor_string();
/// assert_eq!("hello\nworld", res);
///
/// let s = "hello\nworld".to_string();
/// let res = s.crlf_suppressor_string();
/// assert_eq!("hello\nworld", res);
/// ```
pub trait StringExt {
    /// Consumes the string and returns it with every `\r\n` replaced by `\n`.
    fn crlf_suppressor_string(self) -> String;
}

impl StringExt for String {
    fn crlf_suppressor_string(self) -> String {
        // `\r` (0x0D) and `\n` (0x0A) are ASCII, so they never appear inside a
        // multi-byte UTF-8 sequence; matching the two-character pattern cannot
        // split a code point.
        match self.find("\r\n") {
            // Nothing to suppress: return the original buffer untouched.
            None => self,
            // At least one CRLF: build a new string without the CRs.
            Some(_) => self.replace("\r\n", "\n"),
        }
    }
}
259
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::{self, Cursor};

    /// Sends `input` through the CRLF suppressor and returns the filtered
    /// bytes as a `String`. Panics on I/O errors or invalid UTF-8.
    fn run(input: &[u8]) -> String {
        let cursor = Cursor::new(input);
        let bytes = cursor.bytes().crlf_suppressor();
        let vec: Vec<u8> = bytes.map(|b| b.unwrap()).collect();
        String::from_utf8(vec).unwrap()
    }

    #[test]
    fn test_crlf_sequence() {
        let input = b"foo\r\nbar\r\nbaz";
        let expected = "foo\nbar\nbaz";
        assert_eq!(run(input), expected);
    }

    #[test]
    fn test_lone_cr() {
        let input = b"foo\rbar";
        let expected = "foo\rbar";
        assert_eq!(run(input), expected);
    }

    #[test]
    fn test_lone_lf() {
        let input = b"foo\nbar";
        let expected = "foo\nbar";
        assert_eq!(run(input), expected);
    }

    #[test]
    fn test_mixed_endings() {
        let input = b"foo\r\nbar\rbaz\nqux";
        let expected = "foo\nbar\rbaz\nqux";
        assert_eq!(run(input), expected);
    }

    #[test]
    fn test_empty_input() {
        let input = b"";
        let expected = "";
        assert_eq!(run(input), expected);
    }

    #[test]
    fn test_only_crlf() {
        let input = b"\r\n";
        let expected = "\n";
        assert_eq!(run(input), expected);
    }

    #[test]
    fn test_only_cr() {
        let input = b"\r";
        let expected = "\r";
        assert_eq!(run(input), expected);
    }

    #[test]
    fn test_only_lf() {
        let input = b"\n";
        let expected = "\n";
        assert_eq!(run(input), expected);
    }

    #[test]
    fn test_trailing_cr() {
        let input = b"foo\r";
        let expected = "foo\r";
        assert_eq!(run(input), expected);
    }

    #[test]
    fn test_trailing_crlf() {
        let input = b"foo\r\n";
        let expected = "foo\n";
        assert_eq!(run(input), expected);
    }

    #[test]
    fn test_cr_cr_lf() {
        // Only the CR directly in front of the LF is dropped.
        let input = b"foo\r\r\nbar";
        let expected = "foo\r\nbar";
        assert_eq!(run(input), expected);
    }

    #[test]
    fn test_io_error_is_forwarded() {
        let items: Vec<io::Result<u8>> = vec![
            Ok(b'a'),
            Ok(b'\r'),
            Err(io::Error::new(io::ErrorKind::UnexpectedEof, "boom")),
            Ok(b'\n'),
        ];
        let collected: io::Result<Vec<u8>> = items.into_iter().crlf_suppressor().collect();
        assert_eq!(
            collected.unwrap_err().kind(),
            io::ErrorKind::UnexpectedEof
        );
    }

    #[test]
    fn test_read_as_string_rejects_invalid_utf8() {
        let err = read_as_string_with_crlf_suppression(Cursor::new(&b"\xFF\r\n"[..])).unwrap_err();
        assert_eq!(err.kind(), io::ErrorKind::InvalidData);
    }

    #[test]
    fn test_crlf_suppressor_string() {
        // Compare thin data pointers (`*const u8`). Fat `*const str` pointers
        // also compare the length, which would make the inequality below hold
        // trivially because the input and output lengths differ.
        let s = "hello\r\nworld".to_string();
        let s_ptr = s.as_ptr();
        let res = s.crlf_suppressor_string();
        assert_eq!("hello\nworld", res);
        // Memory allocation occurred.
        assert_ne!(s_ptr, res.as_ptr());

        let s = "hello\nworld".to_string();
        let s_ptr = s.as_ptr();
        let res = s.crlf_suppressor_string();
        assert_eq!("hello\nworld", res);
        // No memory allocation here:
        assert_eq!(s_ptr, res.as_ptr());
    }
}