Skip to main content

qubit_text_io/adapters/
utf8_text_reader.rs

1/*******************************************************************************
2 *
3 *    Copyright (c) 2026 Haixing Hu.
4 *
5 *    SPDX-License-Identifier: Apache-2.0
6 *
7 *    Licensed under the Apache License, Version 2.0.
8 *
9 ******************************************************************************/
10use std::io::{
11    self,
12    BufRead,
13    BufReader,
14    Read,
15};
16
17use crate::{
18    TextLineRead,
19    TextRead,
20};
21
/// Text reader that pulls characters out of a UTF-8 encoded byte source.
///
/// The wrapper owns exactly one reader and keeps no additional state.
#[derive(Debug)]
pub struct Utf8TextReader<R> {
    inner: R,
}

impl<R: BufRead> Utf8TextReader<R> {
    /// Wraps an already-buffered byte reader.
    ///
    /// # Parameters
    /// - `inner`: Buffered reader expected to produce UTF-8 encoded bytes.
    ///
    /// # Returns
    /// A text reader that owns `inner`.
    #[must_use]
    pub const fn new(inner: R) -> Self {
        Utf8TextReader { inner }
    }

    /// Borrows the wrapped reader immutably.
    ///
    /// # Returns
    /// A shared reference to the reader passed at construction time.
    #[must_use]
    pub const fn get_ref(&self) -> &R {
        &self.inner
    }

    /// Borrows the wrapped reader mutably.
    ///
    /// # Returns
    /// An exclusive reference to the reader passed at construction time.
    pub fn get_mut(&mut self) -> &mut R {
        &mut self.inner
    }

    /// Consumes the text reader and hands back the wrapped reader.
    ///
    /// # Returns
    /// The buffered reader that was wrapped.
    #[must_use]
    pub fn into_inner(self) -> R {
        self.inner
    }
}

impl<R: Read> Utf8TextReader<BufReader<R>> {
    /// Wraps a plain byte reader, adding buffering on top of it.
    ///
    /// # Parameters
    /// - `reader`: Byte reader expected to produce UTF-8 encoded bytes.
    ///
    /// # Returns
    /// A text reader over `reader` wrapped in a [`BufReader`].
    #[must_use]
    pub fn from_read(reader: R) -> Self {
        Self::new(BufReader::new(reader))
    }
}
89
90impl<R> TextRead for Utf8TextReader<R>
91where
92    R: BufRead,
93{
94    type Error = io::Error;
95
96    fn read_char(&mut self) -> Result<Option<char>, Self::Error> {
97        read_utf8_char(&mut self.inner)
98    }
99
100    fn read_chars(&mut self, output: &mut Vec<char>, max: usize) -> Result<usize, Self::Error> {
101        let mut count = 0;
102        while count < max {
103            match self.read_char()? {
104                Some(ch) => {
105                    output.push(ch);
106                    count += 1;
107                }
108                None => break,
109            }
110        }
111        Ok(count)
112    }
113
114    fn read_to_string(&mut self, output: &mut String) -> Result<usize, Self::Error> {
115        let start = output.len();
116        self.inner.read_to_string(output)?;
117        Ok(output[start..].chars().count())
118    }
119}
120
121impl<R> TextLineRead for Utf8TextReader<R>
122where
123    R: BufRead,
124{
125    fn read_line(&mut self, output: &mut String) -> Result<bool, Self::Error> {
126        Ok(self.inner.read_line(output)? != 0)
127    }
128}
129
130/// Reads one UTF-8 character from a byte reader.
131///
132/// # Parameters
133/// - `reader`: Reader to consume bytes from.
134///
135/// # Returns
136/// The next character, or `None` at EOF.
137///
138/// # Errors
139/// Returns an I/O error when the underlying reader fails, when EOF appears in
140/// the middle of a character, or when the byte sequence is not valid UTF-8.
141fn read_utf8_char<R>(reader: &mut R) -> io::Result<Option<char>>
142where
143    R: Read + ?Sized,
144{
145    let mut first = [0_u8; 1];
146    let read = reader.read(&mut first)?;
147    if read == 0 {
148        return Ok(None);
149    }
150    let width = utf8_char_width(first[0])?;
151    let mut buffer = [0_u8; 4];
152    buffer[0] = first[0];
153    reader.read_exact(&mut buffer[1..width])?;
154    let text = std::str::from_utf8(&buffer[..width]).map_err(invalid_utf8_error)?;
155    Ok(text.chars().next())
156}
157
/// Determines how many bytes a UTF-8 sequence occupies from its leading byte.
///
/// # Parameters
/// - `byte`: First byte of a UTF-8 sequence.
///
/// # Returns
/// The total sequence length in bytes, between 1 and 4.
///
/// # Errors
/// Returns [`io::ErrorKind::InvalidData`] when `byte` cannot start a UTF-8
/// sequence.
fn utf8_char_width(byte: u8) -> io::Result<usize> {
    let width = if byte.is_ascii() {
        1
    } else if (0xC2..=0xDF).contains(&byte) {
        2
    } else if (0xE0..=0xEF).contains(&byte) {
        3
    } else if (0xF0..=0xF4).contains(&byte) {
        4
    } else {
        // Everything else (0x80..=0xC1 and 0xF5..=0xFF) is rejected as a
        // leading byte.
        let message = format!("invalid UTF-8 leading byte: 0x{byte:02X}");
        return Err(io::Error::new(io::ErrorKind::InvalidData, message));
    };
    Ok(width)
}
181
/// Wraps a UTF-8 validation failure in an I/O error.
///
/// # Parameters
/// - `error`: UTF-8 validation error to report.
///
/// # Returns
/// An [`io::Error`] of kind [`io::ErrorKind::InvalidData`] whose message
/// embeds the display text of `error`.
fn invalid_utf8_error(error: std::str::Utf8Error) -> io::Error {
    let message = format!("invalid UTF-8 text: {error}");
    io::Error::new(io::ErrorKind::InvalidData, message)
}