utf8_io/
utf8_reader.rs

1use crate::utf8_input::Utf8Input;
2use crate::ReadStr;
3#[cfg(windows)]
4use io_extras::os::windows::{
5    AsHandleOrSocket, AsRawHandleOrSocket, BorrowedHandleOrSocket, RawHandleOrSocket,
6};
7use std::io::{self, Read};
8use std::{fmt, str};
9#[cfg(feature = "terminal-io")]
10use terminal_io::{ReadTerminal, Terminal};
11#[cfg(feature = "layered-io")]
12use {
13    crate::ReadStrLayered,
14    layered_io::{Bufferable, ReadLayered, Status},
15};
16#[cfg(not(windows))]
17use {
18    io_extras::os::rustix::{AsRawFd, RawFd},
19    std::os::fd::{AsFd, BorrowedFd},
20};
21
22/// A [`Read`] implementation which translates from an input `Read` producing
23/// an arbitrary byte sequence into a valid UTF-8 sequence with invalid
24/// sequences replaced by [U+FFFD (REPLACEMENT CHARACTER)] in the manner of
25/// [`String::from_utf8_lossy`], where scalar value encodings never straddle
26/// `read` calls (callers can do [`str::from_utf8`] and it will always
27/// succeed).
28///
29/// [U+FFFD (REPLACEMENT CHARACTER)]: https://util.unicode.org/UnicodeJsps/character.jsp?a=FFFD
30pub struct Utf8Reader<Inner: Read> {
31    /// The wrapped byte stream.
32    pub(crate) inner: Inner,
33
34    /// UTF-8 translation state.
35    pub(crate) input: Utf8Input,
36}
37
38impl<Inner: Read> Utf8Reader<Inner> {
39    /// Construct a new instance of `Utf8Reader` wrapping `inner`.
40    #[inline]
41    pub fn new(inner: Inner) -> Self {
42        Self {
43            inner,
44            input: Utf8Input::new(),
45        }
46    }
47}
48
// `Terminal` is implemented with an empty body whenever the wrapped stream is
// both `Read` and `ReadTerminal`.
#[cfg(feature = "terminal-io")]
impl<Inner> Terminal for Utf8Reader<Inner> where Inner: Read + ReadTerminal {}
51
#[cfg(feature = "terminal-io")]
impl<Inner> ReadTerminal for Utf8Reader<Inner>
where
    Inner: Read + ReadTerminal,
{
    /// Forward the query to the wrapped stream's `is_line_by_line`.
    #[inline]
    fn is_line_by_line(&self) -> bool {
        self.inner.is_line_by_line()
    }

    /// Forward the query to the wrapped stream's `is_input_terminal`.
    #[inline]
    fn is_input_terminal(&self) -> bool {
        self.inner.is_input_terminal()
    }
}
64
#[cfg(feature = "layered-io")]
impl<Inner> ReadLayered for Utf8Reader<Inner>
where
    Inner: ReadLayered,
{
    /// Delegate to `Utf8Input::read_with_status`, handing it this reader and
    /// the destination buffer.
    #[inline]
    fn read_with_status(&mut self, buf: &mut [u8]) -> io::Result<(usize, Status)> {
        Utf8Input::read_with_status(self, buf)
    }

    /// Delegate to `Utf8Input::minimum_buffer_size` for this reader.
    #[inline]
    fn minimum_buffer_size(&self) -> usize {
        Utf8Input::minimum_buffer_size(self)
    }
}
77
#[cfg(feature = "layered-io")]
impl<Inner> Bufferable for Utf8Reader<Inner>
where
    Inner: ReadLayered,
{
    /// Delegate to `Utf8Input::abandon` for this reader.
    #[inline]
    fn abandon(&mut self) {
        Utf8Input::abandon(self)
    }

    /// Delegate to `Utf8Input::suggested_buffer_size` for this reader.
    #[inline]
    fn suggested_buffer_size(&self) -> usize {
        Utf8Input::suggested_buffer_size(self)
    }
}
90
91impl<Inner: Read> ReadStr for Utf8Reader<Inner> {
92    #[inline]
93    fn read_str(&mut self, buf: &mut str) -> io::Result<usize> {
94        Utf8Input::read_str(self, buf)
95    }
96
97    #[inline]
98    fn read_exact_str(&mut self, buf: &mut str) -> io::Result<()> {
99        Utf8Input::read_exact_str(self, buf)
100    }
101}
102
#[cfg(feature = "layered-io")]
impl<Inner> ReadStrLayered for Utf8Reader<Inner>
where
    Inner: ReadLayered,
{
    /// Delegate to `Utf8Input::read_str_with_status`, handing it this reader
    /// and the destination string slice.
    #[inline]
    fn read_str_with_status(&mut self, buf: &mut str) -> io::Result<(usize, Status)> {
        Utf8Input::read_str_with_status(self, buf)
    }

    /// Delegate to `Utf8Input::read_exact_str_using_status`, handing it this
    /// reader and the destination string slice.
    #[inline]
    fn read_exact_str_using_status(&mut self, buf: &mut str) -> io::Result<Status> {
        Utf8Input::read_exact_str_using_status(self, buf)
    }
}
115
116impl<Inner: Read> Read for Utf8Reader<Inner> {
117    #[inline]
118    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
119        Utf8Input::read(self, buf)
120    }
121
122    #[inline]
123    fn read_to_string(&mut self, buf: &mut String) -> io::Result<usize> {
124        Utf8Input::read_to_string(self, buf)
125    }
126}
127
128#[cfg(not(windows))]
129impl<Inner: Read + AsRawFd> AsRawFd for Utf8Reader<Inner> {
130    #[inline]
131    fn as_raw_fd(&self) -> RawFd {
132        self.inner.as_raw_fd()
133    }
134}
135
136#[cfg(not(windows))]
137impl<Inner: Read + AsFd> AsFd for Utf8Reader<Inner> {
138    #[inline]
139    fn as_fd(&self) -> BorrowedFd<'_> {
140        self.inner.as_fd()
141    }
142}
143
#[cfg(windows)]
impl<Inner> AsRawHandleOrSocket for Utf8Reader<Inner>
where
    Inner: Read + AsRawHandleOrSocket,
{
    /// Return the raw handle or socket reported by the wrapped stream.
    #[inline]
    fn as_raw_handle_or_socket(&self) -> RawHandleOrSocket {
        self.inner.as_raw_handle_or_socket()
    }
}
151
#[cfg(windows)]
impl<Inner> AsHandleOrSocket for Utf8Reader<Inner>
where
    Inner: Read + AsHandleOrSocket,
{
    /// Borrow the handle or socket exposed by the wrapped stream.
    #[inline]
    fn as_handle_or_socket(&self) -> BorrowedHandleOrSocket<'_> {
        self.inner.as_handle_or_socket()
    }
}
159
160impl<Inner: Read + fmt::Debug> fmt::Debug for Utf8Reader<Inner> {
161    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
162        let mut b = f.debug_struct("Utf8Reader");
163        b.field("inner", &self.inner);
164        b.finish()
165    }
166}
167
/// Test helper: translate `bytes` with a `Utf8Reader` wrapping the slice
/// directly and return everything `read_to_string` collected.
///
/// Panics if the read fails.
#[cfg(test)]
fn translate_via_reader(bytes: &[u8]) -> String {
    let mut translated = String::new();
    Utf8Reader::new(bytes)
        .read_to_string(&mut translated)
        .unwrap();
    translated
}
175
/// Test helper: translate `bytes` with a `Utf8Reader` wrapping a
/// `layered_io::LayeredReader` and return what `read_to_string` collected.
///
/// Gated on the `layered-io` feature, like every other use of `layered_io` in
/// this file, so that test builds without the feature do not reference it.
///
/// Panics if the read fails.
#[cfg(test)]
#[cfg(feature = "layered-io")]
fn translate_via_layered_reader(bytes: &[u8]) -> String {
    let mut reader = Utf8Reader::new(layered_io::LayeredReader::new(bytes));
    let mut s = String::new();
    reader.read_to_string(&mut s).unwrap();
    s
}

/// Test helper: translate `bytes` with a `Utf8Reader` wrapping a
/// `layered_io::SliceReader` and return what `read_to_string` collected.
///
/// Gated on the `layered-io` feature, like every other use of `layered_io` in
/// this file, so that test builds without the feature do not reference it.
///
/// Panics if the read fails.
#[cfg(test)]
#[cfg(feature = "layered-io")]
fn translate_via_slice_reader(bytes: &[u8]) -> String {
    let mut reader = Utf8Reader::new(layered_io::SliceReader::new(bytes));
    let mut s = String::new();
    reader.read_to_string(&mut s).unwrap();
    s
}

/// Test helper (`layered-io` variant): translate `bytes` through repeated
/// `read_with_status` calls into a four-byte buffer, stopping once the
/// returned status reports the end.
///
/// Panics if a read fails or if the collected output is not valid UTF-8.
#[cfg(test)]
#[cfg(feature = "layered-io")]
fn translate_with_small_buffer(bytes: &[u8]) -> String {
    let mut reader = Utf8Reader::new(layered_io::SliceReader::new(bytes));
    let mut v = Vec::new();
    let mut buf = [0; 4];
    loop {
        let (size, status) = reader.read_with_status(&mut buf).unwrap();
        v.extend_from_slice(&buf[..size]);
        if status.is_end() {
            break;
        }
    }
    String::from_utf8(v).unwrap()
}

/// Test helper (variant without `layered-io`): translate `bytes` through
/// repeated `read` calls into a four-byte buffer, stopping at the first
/// zero-length read.
///
/// Panics on any read error other than `Interrupted`, or if the collected
/// output is not valid UTF-8.
#[cfg(test)]
#[cfg(not(feature = "layered-io"))]
fn translate_with_small_buffer(bytes: &[u8]) -> String {
    let mut reader = Utf8Reader::new(bytes);
    let mut v = Vec::new();
    let mut buf = [0; 4];
    loop {
        match reader.read(&mut buf) {
            Ok(0) => break,
            Ok(size) => v.extend_from_slice(&buf[..size]),
            // An interrupted read produced nothing; just try again.
            Err(err) if err.kind() == io::ErrorKind::Interrupted => {}
            Err(err) => panic!("unexpected read error: {:?}", err),
        }
    }
    String::from_utf8(v).unwrap()
}

/// Assert that every translation path compiled into this build turns `bytes`
/// into `s`.
///
/// The input is checked as-is, and then again behind one, two, and three
/// leading zero bytes; in those runs the first `i` output bytes are skipped
/// before comparing against `s`. The `layered_io`-backed paths are only
/// exercised when the `layered-io` feature is enabled.
#[cfg(test)]
fn test(bytes: &[u8], s: &str) {
    assert_eq!(translate_via_reader(bytes), s);
    #[cfg(feature = "layered-io")]
    assert_eq!(translate_via_layered_reader(bytes), s);
    #[cfg(feature = "layered-io")]
    assert_eq!(translate_via_slice_reader(bytes), s);
    assert_eq!(translate_with_small_buffer(bytes), s);

    for i in 1..4 {
        // `v` is `i` zero bytes followed by a copy of `bytes`.
        let mut v = vec![0_u8; i + bytes.len()];
        v[i..].copy_from_slice(bytes);
        assert_eq!(
            str::from_utf8(&translate_via_reader(&v).as_bytes()[i..]).unwrap(),
            s
        );
        #[cfg(feature = "layered-io")]
        assert_eq!(
            str::from_utf8(&translate_via_layered_reader(&v).as_bytes()[i..]).unwrap(),
            s
        );
        #[cfg(feature = "layered-io")]
        assert_eq!(
            str::from_utf8(&translate_via_slice_reader(&v).as_bytes()[i..]).unwrap(),
            s
        );
        assert_eq!(
            str::from_utf8(&translate_with_small_buffer(&v).as_bytes()[i..]).unwrap(),
            s
        );
    }
}
254
/// Empty input yields empty output.
#[test]
fn test_empty_string() {
    let input = b"";
    let expected = "";
    test(input, expected);
}

/// Plain ASCII text comes out unchanged.
#[test]
fn test_hello_world() {
    let input = b"hello world";
    let expected = "hello world";
    test(input, expected);
}

/// A single `0xFF` byte between ASCII text becomes one replacement character.
#[test]
fn test_embedded_invalid_byte() {
    let input = b"hello\xffworld";
    let expected = "hello�world";
    test(input, expected);
}

/// Three consecutive `0xFF` bytes become three replacement characters.
#[test]
fn test_invalid_bytes() {
    let input = b"\xff\xff\xff";
    let expected = "���";
    test(input, expected);
}

/// A run of printable ASCII punctuation, digits and letters comes out
/// unchanged.
#[test]
fn test_some_ascii_printable() {
    let input = b"`1234567890-=qwertyuiop[]\\asdfghjkl;\"zxcvbnm,./";
    let expected = "`1234567890-=qwertyuiop[]\\asdfghjkl;\"zxcvbnm,./";
    test(input, expected);
}
282
283// Tests derived from the tests in https://hsivonen.fi/broken-utf-8/
284
285// Non-shortest forms for lowest single-byte (U+0000)
/// Two-byte non-shortest form of U+0000: one replacement per input byte.
#[test]
fn test_two_byte_sequence_lowest_single_byte() {
    let input = b"\xC0\x80";
    let expected = "��";
    test(input, expected);
}

/// Three-byte non-shortest form of U+0000: one replacement per input byte.
#[test]
fn test_three_byte_sequence_lowest_single_byte() {
    let input = b"\xE0\x80\x80";
    let expected = "���";
    test(input, expected);
}

/// Four-byte non-shortest form of U+0000: one replacement per input byte.
#[test]
fn test_four_byte_sequence_lowest_single_byte() {
    let input = b"\xF0\x80\x80\x80";
    let expected = "����";
    test(input, expected);
}

/// Five-byte non-shortest form of U+0000: one replacement per input byte.
#[test]
fn test_five_byte_sequence_lowest_single_byte() {
    let input = b"\xF8\x80\x80\x80\x80";
    let expected = "�����";
    test(input, expected);
}

/// Six-byte non-shortest form of U+0000: one replacement per input byte.
#[test]
fn test_six_byte_sequence_lowest_single_byte() {
    let input = b"\xFC\x80\x80\x80\x80\x80";
    let expected = "������";
    test(input, expected);
}
306
307// Non-shortest forms for highest single-byte (U+007F)
/// Two-byte non-shortest form of U+007F: one replacement per input byte.
#[test]
fn test_two_byte_sequence_highest_single_byte() {
    let input = b"\xC1\xBF";
    let expected = "��";
    test(input, expected);
}

/// Three-byte non-shortest form of U+007F: one replacement per input byte.
#[test]
fn test_three_byte_sequence_highest_single_byte() {
    let input = b"\xE0\x81\xBF";
    let expected = "���";
    test(input, expected);
}

/// Four-byte non-shortest form of U+007F: one replacement per input byte.
#[test]
fn test_four_byte_sequence_highest_single_byte() {
    let input = b"\xF0\x80\x81\xBF";
    let expected = "����";
    test(input, expected);
}

/// Five-byte non-shortest form of U+007F: one replacement per input byte.
#[test]
fn test_five_byte_sequence_highest_single_byte() {
    let input = b"\xF8\x80\x80\x81\xBF";
    let expected = "�����";
    test(input, expected);
}

/// Six-byte non-shortest form of U+007F: one replacement per input byte.
#[test]
fn test_six_byte_sequence_highest_single_byte() {
    let input = b"\xFC\x80\x80\x80\x81\xBF";
    let expected = "������";
    test(input, expected);
}
328
329// Non-shortest forms for lowest two-byte (U+0080)
/// Three-byte non-shortest form of U+0080: one replacement per input byte.
#[test]
fn test_three_byte_sequence_lowest_two_byte() {
    let input = b"\xE0\x82\x80";
    let expected = "���";
    test(input, expected);
}

/// Four-byte non-shortest form of U+0080: one replacement per input byte.
#[test]
fn test_four_byte_sequence_lowest_two_byte() {
    let input = b"\xF0\x80\x82\x80";
    let expected = "����";
    test(input, expected);
}

/// Five-byte non-shortest form of U+0080: one replacement per input byte.
#[test]
fn test_five_byte_sequence_lowest_two_byte() {
    let input = b"\xF8\x80\x80\x82\x80";
    let expected = "�����";
    test(input, expected);
}

/// Six-byte non-shortest form of U+0080: one replacement per input byte.
#[test]
fn test_six_byte_sequence_lowest_two_byte() {
    let input = b"\xFC\x80\x80\x80\x82\x80";
    let expected = "������";
    test(input, expected);
}
346
347// Non-shortest forms for highest two-byte (U+07FF)
/// Three-byte non-shortest form of U+07FF: one replacement per input byte.
#[test]
fn test_three_byte_sequence_highest_two_byte() {
    let input = b"\xE0\x9F\xBF";
    let expected = "���";
    test(input, expected);
}

/// Four-byte non-shortest form of U+07FF: one replacement per input byte.
#[test]
fn test_four_byte_sequence_highest_two_byte() {
    let input = b"\xF0\x80\x9F\xBF";
    let expected = "����";
    test(input, expected);
}

/// Five-byte non-shortest form of U+07FF: one replacement per input byte.
#[test]
fn test_five_byte_sequence_highest_two_byte() {
    let input = b"\xF8\x80\x80\x9F\xBF";
    let expected = "�����";
    test(input, expected);
}

/// Six-byte non-shortest form of U+07FF: one replacement per input byte.
#[test]
fn test_six_byte_sequence_highest_two_byte() {
    let input = b"\xFC\x80\x80\x80\x9F\xBF";
    let expected = "������";
    test(input, expected);
}
364
365// Non-shortest forms for lowest three-byte (U+0800)
/// Four-byte non-shortest form of U+0800: one replacement per input byte.
#[test]
fn test_four_byte_sequence_lowest_three_byte() {
    let input = b"\xF0\x80\xA0\x80";
    let expected = "����";
    test(input, expected);
}

/// Five-byte non-shortest form of U+0800: one replacement per input byte.
#[test]
fn test_five_byte_sequence_lowest_three_byte() {
    let input = b"\xF8\x80\x80\xA0\x80";
    let expected = "�����";
    test(input, expected);
}

/// Six-byte non-shortest form of U+0800: one replacement per input byte.
#[test]
fn test_six_byte_sequence_lowest_three_byte() {
    let input = b"\xFC\x80\x80\x80\xA0\x80";
    let expected = "������";
    test(input, expected);
}
378
379// Non-shortest forms for highest three-byte (U+FFFF)
/// Four-byte non-shortest form of U+FFFF: one replacement per input byte.
#[test]
fn test_four_byte_sequence_highest_three_byte() {
    let input = b"\xF0\x8F\xBF\xBF";
    let expected = "����";
    test(input, expected);
}

/// Five-byte non-shortest form of U+FFFF: one replacement per input byte.
#[test]
fn test_five_byte_sequence_highest_three_byte() {
    let input = b"\xF8\x80\x8F\xBF\xBF";
    let expected = "�����";
    test(input, expected);
}

/// Six-byte non-shortest form of U+FFFF: one replacement per input byte.
#[test]
fn test_six_byte_sequence_highest_three_byte() {
    let input = b"\xFC\x80\x80\x8F\xBF\xBF";
    let expected = "������";
    test(input, expected);
}
392
393// Non-shortest forms for lowest four-byte (U+10000)
/// Five-byte non-shortest form of U+10000: one replacement per input byte.
#[test]
fn test_five_byte_sequence_lowest_four_byte() {
    let input = b"\xF8\x80\x90\x80\x80";
    let expected = "�����";
    test(input, expected);
}

/// Six-byte non-shortest form of U+10000: one replacement per input byte.
#[test]
fn test_six_byte_sequence_lowest_four_byte() {
    let input = b"\xFC\x80\x80\x90\x80\x80";
    let expected = "������";
    test(input, expected);
}
402
403// Non-shortest forms for last Unicode (U+10FFFF)
/// Five-byte non-shortest form of U+10FFFF: one replacement per input byte.
#[test]
fn test_five_byte_sequence() {
    let input = b"\xF8\x84\x8F\xBF\xBF";
    let expected = "�����";
    test(input, expected);
}

/// Six-byte non-shortest form of U+10FFFF: one replacement per input byte.
#[test]
fn test_six_byte_sequence() {
    let input = b"\xFC\x80\x84\x8F\xBF\xBF";
    let expected = "������";
    test(input, expected);
}
412
413// Out of range
/// A four-byte sequence just past the Unicode range: four replacements.
#[test]
fn test_one_past_unicode() {
    let input = b"\xF4\x90\x80\x80";
    let expected = "����";
    test(input, expected);
}

/// The longest five-byte sequence: one replacement per input byte.
#[test]
fn test_longest_five_byte_sequence() {
    let input = b"\xFB\xBF\xBF\xBF\xBF";
    let expected = "�����";
    test(input, expected);
}

/// The longest six-byte sequence: one replacement per input byte.
#[test]
fn test_longest_six_byte_sequence() {
    let input = b"\xFD\xBF\xBF\xBF\xBF\xBF";
    let expected = "������";
    test(input, expected);
}

/// The first surrogate encoded as three bytes: three replacements.
#[test]
fn test_first_surrogate() {
    let input = b"\xED\xA0\x80";
    let expected = "���";
    test(input, expected);
}

/// The last surrogate encoded as three bytes: three replacements.
#[test]
fn test_last_surrogate() {
    let input = b"\xED\xBF\xBF";
    let expected = "���";
    test(input, expected);
}

/// A CESU-8 surrogate pair (six bytes): six replacements.
#[test]
fn test_cesu_8_surrogate_pair() {
    let input = b"\xED\xA0\xBD\xED\xB2\xA9";
    let expected = "������";
    test(input, expected);
}
438
439// Out of range and non-shortest
/// One past the Unicode range as a five-byte sequence: five replacements.
#[test]
fn test_one_past_unicode_as_five_byte_sequence() {
    let input = b"\xF8\x84\x90\x80\x80";
    let expected = "�����";
    test(input, expected);
}

/// One past the Unicode range as a six-byte sequence: six replacements.
#[test]
fn test_one_past_unicode_as_six_byte_sequence() {
    let input = b"\xFC\x80\x84\x90\x80\x80";
    let expected = "������";
    test(input, expected);
}

/// The first surrogate as a four-byte sequence: four replacements.
#[test]
fn test_first_surrogate_as_four_byte_sequence() {
    let input = b"\xF0\x8D\xA0\x80";
    let expected = "����";
    test(input, expected);
}

/// The last surrogate as a four-byte sequence: four replacements.
#[test]
fn test_last_surrogate_as_four_byte_sequence() {
    let input = b"\xF0\x8D\xBF\xBF";
    let expected = "����";
    test(input, expected);
}

/// A CESU-8 surrogate pair as two four-byte overlongs: eight replacements.
#[test]
fn test_cesu_8_surrogate_pair_as_two_four_byte_overlongs() {
    let input = b"\xF0\x8D\xA0\xBD\xF0\x8D\xB2\xA9";
    let expected = "��������";
    test(input, expected);
}
460
461// Lone trails
/// One lone trail byte: one replacement.
#[test]
fn test_one() {
    let input = b"\x80";
    let expected = "�";
    test(input, expected);
}

/// Two lone trail bytes: two replacements.
#[test]
fn test_two() {
    let input = b"\x80\x80";
    let expected = "��";
    test(input, expected);
}

/// Three lone trail bytes: three replacements.
#[test]
fn test_three() {
    let input = b"\x80\x80\x80";
    let expected = "���";
    test(input, expected);
}

/// Four lone trail bytes: four replacements.
#[test]
fn test_four() {
    let input = b"\x80\x80\x80\x80";
    let expected = "����";
    test(input, expected);
}

/// Five lone trail bytes: five replacements.
#[test]
fn test_five() {
    let input = b"\x80\x80\x80\x80\x80";
    let expected = "�����";
    test(input, expected);
}

/// Six lone trail bytes: six replacements.
#[test]
fn test_six() {
    let input = b"\x80\x80\x80\x80\x80\x80";
    let expected = "������";
    test(input, expected);
}

/// Seven lone trail bytes: seven replacements.
#[test]
fn test_seven() {
    let input = b"\x80\x80\x80\x80\x80\x80\x80";
    let expected = "�������";
    test(input, expected);
}

/// A trail byte after a valid two-byte character: the character is kept and
/// the trail byte is replaced.
#[test]
fn test_after_valid_two_byte() {
    let input = b"\xC2\xB6\x80";
    let expected = "¶�";
    test(input, expected);
}

/// A trail byte after a valid three-byte character: the character is kept
/// and the trail byte is replaced.
#[test]
fn test_after_valid_three_byte() {
    let input = b"\xE2\x98\x83\x80";
    let expected = "☃�";
    test(input, expected);
}

/// A trail byte after a valid four-byte character: the character is kept and
/// the trail byte is replaced.
#[test]
fn test_after_valid_four_byte() {
    let input = b"\xF0\x9F\x92\xA9\x80";
    let expected = "💩�";
    test(input, expected);
}

/// A trail byte after a five-byte sequence: six replacements in total.
#[test]
fn test_after_five_byte() {
    let input = b"\xFB\xBF\xBF\xBF\xBF\x80";
    let expected = "������";
    test(input, expected);
}

/// A trail byte after a six-byte sequence: seven replacements in total.
#[test]
fn test_after_six_byte() {
    let input = b"\xFD\xBF\xBF\xBF\xBF\xBF\x80";
    let expected = "�������";
    test(input, expected);
}
510
// Truncated sequences
/// A two-byte lead with nothing after it: a single replacement.
#[test]
fn test_two_byte_lead() {
    let input = b"\xC2";
    let expected = "�";
    test(input, expected);
}

/// A three-byte lead with nothing after it: a single replacement.
#[test]
fn test_three_byte_lead() {
    let input = b"\xE2";
    let expected = "�";
    test(input, expected);
}

/// A three-byte lead followed by one trail byte: a single replacement.
#[test]
fn test_three_byte_lead_and_one_trail() {
    let input = b"\xE2\x98";
    let expected = "�";
    test(input, expected);
}

/// A four-byte lead with nothing after it: a single replacement.
#[test]
fn test_four_byte_lead() {
    let input = b"\xF0";
    let expected = "�";
    test(input, expected);
}

/// A four-byte lead followed by one trail byte: a single replacement.
#[test]
fn test_four_byte_lead_and_one_trail() {
    let input = b"\xF0\x9F";
    let expected = "�";
    test(input, expected);
}

/// A four-byte lead followed by two trail bytes: a single replacement.
#[test]
fn test_four_byte_lead_and_two_trails() {
    let input = b"\xF0\x9F\x92";
    let expected = "�";
    test(input, expected);
}
536
537// Leftovers
/// A lone `0xFE` byte: one replacement.
#[test]
fn test_fe() {
    let input = b"\xFE";
    let expected = "�";
    test(input, expected);
}

/// `0xFE` followed by a trail byte: two replacements.
#[test]
fn test_fe_and_trail() {
    let input = b"\xFE\x80";
    let expected = "��";
    test(input, expected);
}

/// A lone `0xFF` byte: one replacement.
#[test]
fn test_ff() {
    let input = b"\xFF";
    let expected = "�";
    test(input, expected);
}

/// `0xFF` followed by a trail byte: two replacements.
#[test]
fn test_ff_and_trail() {
    let input = b"\xFF\x80";
    let expected = "��";
    test(input, expected);
}