1use crate::utf8_input::Utf8Input;
2use crate::ReadStr;
3#[cfg(windows)]
4use io_extras::os::windows::{
5 AsHandleOrSocket, AsRawHandleOrSocket, BorrowedHandleOrSocket, RawHandleOrSocket,
6};
7use std::io::{self, Read};
8use std::{fmt, str};
9#[cfg(feature = "terminal-io")]
10use terminal_io::{ReadTerminal, Terminal};
11#[cfg(feature = "layered-io")]
12use {
13 crate::ReadStrLayered,
14 layered_io::{Bufferable, ReadLayered, Status},
15};
16#[cfg(not(windows))]
17use {
18 io_extras::os::rustix::{AsRawFd, RawFd},
19 std::os::fd::{AsFd, BorrowedFd},
20};
21
22pub struct Utf8Reader<Inner: Read> {
31 pub(crate) inner: Inner,
33
34 pub(crate) input: Utf8Input,
36}
37
38impl<Inner: Read> Utf8Reader<Inner> {
39 #[inline]
41 pub fn new(inner: Inner) -> Self {
42 Self {
43 inner,
44 input: Utf8Input::new(),
45 }
46 }
47}
48
/// Marker impl: a `Utf8Reader` is a `Terminal` whenever its inner stream
/// implements `ReadTerminal`.
#[cfg(feature = "terminal-io")]
impl<Inner> Terminal for Utf8Reader<Inner> where Inner: Read + ReadTerminal {}
51
/// Terminal queries are answered by the wrapped stream.
#[cfg(feature = "terminal-io")]
impl<Inner> ReadTerminal for Utf8Reader<Inner>
where
    Inner: Read + ReadTerminal,
{
    /// Forwards to the inner stream's `is_line_by_line`.
    #[inline]
    fn is_line_by_line(&self) -> bool {
        Inner::is_line_by_line(&self.inner)
    }

    /// Forwards to the inner stream's `is_input_terminal`.
    #[inline]
    fn is_input_terminal(&self) -> bool {
        Inner::is_input_terminal(&self.inner)
    }
}
64
/// Layered reads are implemented by the shared `Utf8Input` logic.
#[cfg(feature = "layered-io")]
impl<Inner> ReadLayered for Utf8Reader<Inner>
where
    Inner: ReadLayered,
{
    /// Delegates to `Utf8Input::read_with_status`, returning the byte count
    /// written to `buf` together with the reported stream status.
    #[inline]
    fn read_with_status(&mut self, buf: &mut [u8]) -> io::Result<(usize, Status)> {
        Utf8Input::read_with_status(self, buf)
    }

    /// Delegates to `Utf8Input::minimum_buffer_size`.
    #[inline]
    fn minimum_buffer_size(&self) -> usize {
        Utf8Input::minimum_buffer_size(self)
    }
}
77
/// Buffer management is implemented by the shared `Utf8Input` logic.
#[cfg(feature = "layered-io")]
impl<Inner> Bufferable for Utf8Reader<Inner>
where
    Inner: ReadLayered,
{
    /// Delegates to `Utf8Input::abandon`.
    #[inline]
    fn abandon(&mut self) {
        Utf8Input::abandon(self)
    }

    /// Delegates to `Utf8Input::suggested_buffer_size`.
    #[inline]
    fn suggested_buffer_size(&self) -> usize {
        Utf8Input::suggested_buffer_size(self)
    }
}
90
91impl<Inner: Read> ReadStr for Utf8Reader<Inner> {
92 #[inline]
93 fn read_str(&mut self, buf: &mut str) -> io::Result<usize> {
94 Utf8Input::read_str(self, buf)
95 }
96
97 #[inline]
98 fn read_exact_str(&mut self, buf: &mut str) -> io::Result<()> {
99 Utf8Input::read_exact_str(self, buf)
100 }
101}
102
/// Status-reporting string reads are implemented by the shared `Utf8Input`
/// logic.
#[cfg(feature = "layered-io")]
impl<Inner> ReadStrLayered for Utf8Reader<Inner>
where
    Inner: ReadLayered,
{
    /// Delegates to `Utf8Input::read_str_with_status`.
    #[inline]
    fn read_str_with_status(&mut self, buf: &mut str) -> io::Result<(usize, Status)> {
        Utf8Input::read_str_with_status(self, buf)
    }

    /// Delegates to `Utf8Input::read_exact_str_using_status`.
    #[inline]
    fn read_exact_str_using_status(&mut self, buf: &mut str) -> io::Result<Status> {
        Utf8Input::read_exact_str_using_status(self, buf)
    }
}
115
116impl<Inner: Read> Read for Utf8Reader<Inner> {
117 #[inline]
118 fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
119 Utf8Input::read(self, buf)
120 }
121
122 #[inline]
123 fn read_to_string(&mut self, buf: &mut String) -> io::Result<usize> {
124 Utf8Input::read_to_string(self, buf)
125 }
126}
127
128#[cfg(not(windows))]
129impl<Inner: Read + AsRawFd> AsRawFd for Utf8Reader<Inner> {
130 #[inline]
131 fn as_raw_fd(&self) -> RawFd {
132 self.inner.as_raw_fd()
133 }
134}
135
136#[cfg(not(windows))]
137impl<Inner: Read + AsFd> AsFd for Utf8Reader<Inner> {
138 #[inline]
139 fn as_fd(&self) -> BorrowedFd<'_> {
140 self.inner.as_fd()
141 }
142}
143
/// Exposes the raw handle or socket of the wrapped stream.
#[cfg(windows)]
impl<Inner> AsRawHandleOrSocket for Utf8Reader<Inner>
where
    Inner: Read + AsRawHandleOrSocket,
{
    /// Forwards to the inner stream's `as_raw_handle_or_socket`.
    #[inline]
    fn as_raw_handle_or_socket(&self) -> RawHandleOrSocket {
        Inner::as_raw_handle_or_socket(&self.inner)
    }
}
151
/// Exposes a borrowed handle or socket of the wrapped stream.
#[cfg(windows)]
impl<Inner> AsHandleOrSocket for Utf8Reader<Inner>
where
    Inner: Read + AsHandleOrSocket,
{
    /// Forwards to the inner stream's `as_handle_or_socket`.
    #[inline]
    fn as_handle_or_socket(&self) -> BorrowedHandleOrSocket<'_> {
        Inner::as_handle_or_socket(&self.inner)
    }
}
159
160impl<Inner: Read + fmt::Debug> fmt::Debug for Utf8Reader<Inner> {
161 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
162 let mut b = f.debug_struct("Utf8Reader");
163 b.field("inner", &self.inner);
164 b.finish()
165 }
166}
167
/// Test helper: runs `bytes` through a `Utf8Reader` wrapped directly around
/// the slice and returns everything `read_to_string` produced.
#[cfg(test)]
fn translate_via_reader(bytes: &[u8]) -> String {
    let mut translated = String::new();
    let mut utf8 = Utf8Reader::new(bytes);
    Read::read_to_string(&mut utf8, &mut translated).unwrap();
    translated
}
175
/// Test helper: like `translate_via_reader`, but the slice is first wrapped
/// in a `layered_io::LayeredReader`.
#[cfg(test)]
fn translate_via_layered_reader(bytes: &[u8]) -> String {
    let mut translated = String::new();
    let layered = layered_io::LayeredReader::new(bytes);
    let mut utf8 = Utf8Reader::new(layered);
    Read::read_to_string(&mut utf8, &mut translated).unwrap();
    translated
}
183
/// Test helper: like `translate_via_reader`, but the slice is first wrapped
/// in a `layered_io::SliceReader`.
#[cfg(test)]
fn translate_via_slice_reader(bytes: &[u8]) -> String {
    let mut translated = String::new();
    let sliced = layered_io::SliceReader::new(bytes);
    let mut utf8 = Utf8Reader::new(sliced);
    Read::read_to_string(&mut utf8, &mut translated).unwrap();
    translated
}
191
/// Test helper (`layered-io` builds): drains a `Utf8Reader` over a
/// `SliceReader` through a 4-byte buffer using `read_with_status`, then
/// checks that the collected bytes form valid UTF-8.
#[cfg(test)]
#[cfg(feature = "layered-io")]
fn translate_with_small_buffer(bytes: &[u8]) -> String {
    let mut utf8 = Utf8Reader::new(layered_io::SliceReader::new(bytes));
    let mut chunk = [0_u8; 4];
    let mut collected = Vec::new();

    // Keep reading until a read reports the end-of-stream status; the bytes
    // returned alongside that status are still collected.
    let mut finished = false;
    while !finished {
        let (count, status) = utf8.read_with_status(&mut chunk).unwrap();
        collected.extend_from_slice(&chunk[..count]);
        finished = status.is_end();
    }

    String::from_utf8(collected).unwrap()
}
207
/// Test helper (builds without `layered-io`): drains a `Utf8Reader` over
/// `bytes` through a 4-byte buffer using plain `Read::read`, then checks that
/// the collected bytes form valid UTF-8.
///
/// # Panics
///
/// Panics on any read error other than `ErrorKind::Interrupted`, or if the
/// collected output is not valid UTF-8.
#[cfg(test)]
#[cfg(not(feature = "layered-io"))]
fn translate_with_small_buffer(bytes: &[u8]) -> String {
    let mut reader = Utf8Reader::new(bytes);
    let mut v = Vec::new();
    let mut buf = [0; 4];
    loop {
        match reader.read(&mut buf) {
            // A zero-length read marks the end of the stream.
            Ok(0) => break,
            Ok(size) => v.extend_from_slice(&buf[..size]),
            // Interrupted reads transferred nothing; just retry.
            Err(err) if err.kind() == io::ErrorKind::Interrupted => continue,
            // Fail loudly instead of building an `Err` only to unwrap it.
            Err(err) => panic!("unexpected read error: {err:?}"),
        }
    }
    String::from_utf8(v).unwrap()
}
225
/// Test driver: asserts that every translation helper turns `bytes` into `s`.
///
/// The check is then repeated with 1, 2 and 3 zero bytes prepended to the
/// input, comparing only the output that follows the padding (presumably to
/// move the interesting bytes to different offsets within the readers'
/// buffers).
#[cfg(test)]
fn test(bytes: &[u8], s: &str) {
    // All translation paths, in the order they are checked.
    let translators: [fn(&[u8]) -> String; 4] = [
        translate_via_reader,
        translate_via_layered_reader,
        translate_via_slice_reader,
        translate_with_small_buffer,
    ];

    for translate in &translators {
        assert_eq!(translate(bytes), s);
    }

    for pad in 1..4 {
        let mut padded = vec![0_u8; pad];
        padded.extend_from_slice(bytes);

        for translate in &translators {
            let output = translate(&padded);
            // Skip the `pad` leading bytes of output before comparing.
            assert_eq!(str::from_utf8(&output.as_bytes()[pad..]).unwrap(), s);
        }
    }
}
254
/// Empty input translates to an empty string.
#[test]
fn test_empty_string() {
    let input: &[u8] = b"";
    let expected = "";
    test(input, expected);
}
259
/// Plain ASCII passes through unchanged.
#[test]
fn test_hello_world() {
    let input: &[u8] = b"hello world";
    let expected = "hello world";
    test(input, expected);
}
264
/// A lone invalid byte between ASCII text becomes one U+FFFD.
#[test]
fn test_embedded_invalid_byte() {
    let input: &[u8] = b"hello\xffworld";
    let expected = "hello�world";
    test(input, expected);
}
269
/// Each of three consecutive invalid bytes becomes its own U+FFFD.
#[test]
fn test_invalid_bytes() {
    let input: &[u8] = b"\xff\xff\xff";
    let expected = "���";
    test(input, expected);
}
274
/// A run of printable ASCII punctuation, digits and letters is unchanged.
#[test]
fn test_some_ascii_printable() {
    let input: &[u8] = b"`1234567890-=qwertyuiop[]\\asdfghjkl;\"zxcvbnm,./";
    let expected = "`1234567890-=qwertyuiop[]\\asdfghjkl;\"zxcvbnm,./";
    test(input, expected);
}
282
// Overlong encodings of U+0000 (the lowest code point that fits in a single
// byte). Every input byte is expected to become one U+FFFD.

/// U+0000 in overlong two-byte form.
#[test]
fn test_two_byte_sequence_lowest_single_byte() {
    let input: &[u8] = b"\xC0\x80";
    test(input, "��");
}

/// U+0000 in overlong three-byte form.
#[test]
fn test_three_byte_sequence_lowest_single_byte() {
    let input: &[u8] = b"\xE0\x80\x80";
    test(input, "���");
}

/// U+0000 in overlong four-byte form.
#[test]
fn test_four_byte_sequence_lowest_single_byte() {
    let input: &[u8] = b"\xF0\x80\x80\x80";
    test(input, "����");
}

/// U+0000 in five-byte form.
#[test]
fn test_five_byte_sequence_lowest_single_byte() {
    let input: &[u8] = b"\xF8\x80\x80\x80\x80";
    test(input, "�����");
}

/// U+0000 in six-byte form.
#[test]
fn test_six_byte_sequence_lowest_single_byte() {
    let input: &[u8] = b"\xFC\x80\x80\x80\x80\x80";
    test(input, "������");
}
306
// Overlong encodings of U+007F (the highest code point that fits in a single
// byte). Every input byte is expected to become one U+FFFD.

/// U+007F in overlong two-byte form.
#[test]
fn test_two_byte_sequence_highest_single_byte() {
    let input: &[u8] = b"\xC1\xBF";
    test(input, "��");
}

/// U+007F in overlong three-byte form.
#[test]
fn test_three_byte_sequence_highest_single_byte() {
    let input: &[u8] = b"\xE0\x81\xBF";
    test(input, "���");
}

/// U+007F in overlong four-byte form.
#[test]
fn test_four_byte_sequence_highest_single_byte() {
    let input: &[u8] = b"\xF0\x80\x81\xBF";
    test(input, "����");
}

/// U+007F in five-byte form.
#[test]
fn test_five_byte_sequence_highest_single_byte() {
    let input: &[u8] = b"\xF8\x80\x80\x81\xBF";
    test(input, "�����");
}

/// U+007F in six-byte form.
#[test]
fn test_six_byte_sequence_highest_single_byte() {
    let input: &[u8] = b"\xFC\x80\x80\x80\x81\xBF";
    test(input, "������");
}
328
// Overlong encodings of U+0080 (the lowest code point that needs two bytes).
// Every input byte is expected to become one U+FFFD.

/// U+0080 in overlong three-byte form.
#[test]
fn test_three_byte_sequence_lowest_two_byte() {
    let input: &[u8] = b"\xE0\x82\x80";
    test(input, "���");
}

/// U+0080 in overlong four-byte form.
#[test]
fn test_four_byte_sequence_lowest_two_byte() {
    let input: &[u8] = b"\xF0\x80\x82\x80";
    test(input, "����");
}

/// U+0080 in five-byte form.
#[test]
fn test_five_byte_sequence_lowest_two_byte() {
    let input: &[u8] = b"\xF8\x80\x80\x82\x80";
    test(input, "�����");
}

/// U+0080 in six-byte form.
#[test]
fn test_six_byte_sequence_lowest_two_byte() {
    let input: &[u8] = b"\xFC\x80\x80\x80\x82\x80";
    test(input, "������");
}
346
// Overlong encodings of U+07FF (the highest code point that fits in two
// bytes). Every input byte is expected to become one U+FFFD.

/// U+07FF in overlong three-byte form.
#[test]
fn test_three_byte_sequence_highest_two_byte() {
    let input: &[u8] = b"\xE0\x9F\xBF";
    test(input, "���");
}

/// U+07FF in overlong four-byte form.
#[test]
fn test_four_byte_sequence_highest_two_byte() {
    let input: &[u8] = b"\xF0\x80\x9F\xBF";
    test(input, "����");
}

/// U+07FF in five-byte form.
#[test]
fn test_five_byte_sequence_highest_two_byte() {
    let input: &[u8] = b"\xF8\x80\x80\x9F\xBF";
    test(input, "�����");
}

/// U+07FF in six-byte form.
#[test]
fn test_six_byte_sequence_highest_two_byte() {
    let input: &[u8] = b"\xFC\x80\x80\x80\x9F\xBF";
    test(input, "������");
}
364
// Overlong encodings of U+0800 (the lowest code point that needs three
// bytes). Every input byte is expected to become one U+FFFD.

/// U+0800 in overlong four-byte form.
#[test]
fn test_four_byte_sequence_lowest_three_byte() {
    let input: &[u8] = b"\xF0\x80\xA0\x80";
    test(input, "����");
}

/// U+0800 in five-byte form.
#[test]
fn test_five_byte_sequence_lowest_three_byte() {
    let input: &[u8] = b"\xF8\x80\x80\xA0\x80";
    test(input, "�����");
}

/// U+0800 in six-byte form.
#[test]
fn test_six_byte_sequence_lowest_three_byte() {
    let input: &[u8] = b"\xFC\x80\x80\x80\xA0\x80";
    test(input, "������");
}
378
// Overlong encodings of U+FFFF (the highest code point that fits in three
// bytes). Every input byte is expected to become one U+FFFD.

/// U+FFFF in overlong four-byte form.
#[test]
fn test_four_byte_sequence_highest_three_byte() {
    let input: &[u8] = b"\xF0\x8F\xBF\xBF";
    test(input, "����");
}

/// U+FFFF in five-byte form.
#[test]
fn test_five_byte_sequence_highest_three_byte() {
    let input: &[u8] = b"\xF8\x80\x8F\xBF\xBF";
    test(input, "�����");
}

/// U+FFFF in six-byte form.
#[test]
fn test_six_byte_sequence_highest_three_byte() {
    let input: &[u8] = b"\xFC\x80\x80\x8F\xBF\xBF";
    test(input, "������");
}
392
// Encodings of U+10000 (the lowest code point that needs four bytes) in the
// five- and six-byte forms. Every input byte is expected to become one
// U+FFFD.

/// U+10000 in five-byte form.
#[test]
fn test_five_byte_sequence_lowest_four_byte() {
    let input: &[u8] = b"\xF8\x80\x90\x80\x80";
    test(input, "�����");
}

/// U+10000 in six-byte form.
#[test]
fn test_six_byte_sequence_lowest_four_byte() {
    let input: &[u8] = b"\xFC\x80\x80\x90\x80\x80";
    test(input, "������");
}
402
// Encodings of U+10FFFF in the five- and six-byte forms. Every input byte is
// expected to become one U+FFFD.

/// U+10FFFF in five-byte form.
#[test]
fn test_five_byte_sequence() {
    let input: &[u8] = b"\xF8\x84\x8F\xBF\xBF";
    test(input, "�����");
}

/// U+10FFFF in six-byte form.
#[test]
fn test_six_byte_sequence() {
    let input: &[u8] = b"\xFC\x80\x84\x8F\xBF\xBF";
    test(input, "������");
}
412
// Byte sequences that decode to values outside the set of Unicode scalar
// values. Every input byte is expected to become one U+FFFD.

/// Four-byte sequence for 0x110000, one past the last code point.
#[test]
fn test_one_past_unicode() {
    let input: &[u8] = b"\xF4\x90\x80\x80";
    test(input, "����");
}

/// Five-byte form with all payload bits set.
#[test]
fn test_longest_five_byte_sequence() {
    let input: &[u8] = b"\xFB\xBF\xBF\xBF\xBF";
    test(input, "�����");
}

/// Six-byte form with all payload bits set.
#[test]
fn test_longest_six_byte_sequence() {
    let input: &[u8] = b"\xFD\xBF\xBF\xBF\xBF\xBF";
    test(input, "������");
}

/// Three-byte encoding of the first surrogate, 0xD800.
#[test]
fn test_first_surrogate() {
    let input: &[u8] = b"\xED\xA0\x80";
    test(input, "���");
}

/// Three-byte encoding of the last surrogate, 0xDFFF.
#[test]
fn test_last_surrogate() {
    let input: &[u8] = b"\xED\xBF\xBF";
    test(input, "���");
}

/// A surrogate pair (0xD83D, 0xDCA9) with each half encoded separately, as
/// in CESU-8.
#[test]
fn test_cesu_8_surrogate_pair() {
    let input: &[u8] = b"\xED\xA0\xBD\xED\xB2\xA9";
    test(input, "������");
}
438
// Out-of-range values and surrogates written in longer-than-necessary forms.
// Every input byte is expected to become one U+FFFD.

/// 0x110000 in five-byte form.
#[test]
fn test_one_past_unicode_as_five_byte_sequence() {
    let input: &[u8] = b"\xF8\x84\x90\x80\x80";
    test(input, "�����");
}

/// 0x110000 in six-byte form.
#[test]
fn test_one_past_unicode_as_six_byte_sequence() {
    let input: &[u8] = b"\xFC\x80\x84\x90\x80\x80";
    test(input, "������");
}

/// The first surrogate, 0xD800, in overlong four-byte form.
#[test]
fn test_first_surrogate_as_four_byte_sequence() {
    let input: &[u8] = b"\xF0\x8D\xA0\x80";
    test(input, "����");
}

/// The last surrogate, 0xDFFF, in overlong four-byte form.
#[test]
fn test_last_surrogate_as_four_byte_sequence() {
    let input: &[u8] = b"\xF0\x8D\xBF\xBF";
    test(input, "����");
}

/// A surrogate pair with each half in overlong four-byte form.
#[test]
fn test_cesu_8_surrogate_pair_as_two_four_byte_overlongs() {
    let input: &[u8] = b"\xF0\x8D\xA0\xBD\xF0\x8D\xB2\xA9";
    test(input, "��������");
}
460
// Runs of one to seven continuation bytes (0x80) with no lead byte. Each
// byte is expected to become its own U+FFFD.

/// One stray continuation byte.
#[test]
fn test_one() {
    let input: &[u8] = b"\x80";
    test(input, "�");
}

/// Two stray continuation bytes.
#[test]
fn test_two() {
    let input: &[u8] = b"\x80\x80";
    test(input, "��");
}

/// Three stray continuation bytes.
#[test]
fn test_three() {
    let input: &[u8] = b"\x80\x80\x80";
    test(input, "���");
}

/// Four stray continuation bytes.
#[test]
fn test_four() {
    let input: &[u8] = b"\x80\x80\x80\x80";
    test(input, "����");
}

/// Five stray continuation bytes.
#[test]
fn test_five() {
    let input: &[u8] = b"\x80\x80\x80\x80\x80";
    test(input, "�����");
}

/// Six stray continuation bytes.
#[test]
fn test_six() {
    let input: &[u8] = b"\x80\x80\x80\x80\x80\x80";
    test(input, "������");
}

/// Seven stray continuation bytes.
#[test]
fn test_seven() {
    let input: &[u8] = b"\x80\x80\x80\x80\x80\x80\x80";
    test(input, "�������");
}
// A stray continuation byte (0x80) directly after a complete sequence.

/// Valid two-byte character followed by a stray continuation byte.
#[test]
fn test_after_valid_two_byte() {
    let input: &[u8] = b"\xC2\xB6\x80";
    test(input, "¶�");
}

/// Valid three-byte character followed by a stray continuation byte.
#[test]
fn test_after_valid_three_byte() {
    let input: &[u8] = b"\xE2\x98\x83\x80";
    test(input, "☃�");
}

/// Valid four-byte character followed by a stray continuation byte.
#[test]
fn test_after_valid_four_byte() {
    let input: &[u8] = b"\xF0\x9F\x92\xA9\x80";
    test(input, "💩�");
}

/// Five-byte form followed by a stray continuation byte; all six bytes are
/// expected to be replaced.
#[test]
fn test_after_five_byte() {
    let input: &[u8] = b"\xFB\xBF\xBF\xBF\xBF\x80";
    test(input, "������");
}

/// Six-byte form followed by a stray continuation byte; all seven bytes are
/// expected to be replaced.
#[test]
fn test_after_six_byte() {
    let input: &[u8] = b"\xFD\xBF\xBF\xBF\xBF\xBF\x80";
    test(input, "�������");
}
510
// Sequences that end before all of their continuation bytes arrive. Each
// truncated sequence is expected to become a single U+FFFD.

/// Two-byte lead with nothing after it.
#[test]
fn test_two_byte_lead() {
    let input: &[u8] = b"\xC2";
    test(input, "�");
}

/// Three-byte lead with nothing after it.
#[test]
fn test_three_byte_lead() {
    let input: &[u8] = b"\xE2";
    test(input, "�");
}

/// Three-byte lead followed by one of its two continuation bytes.
#[test]
fn test_three_byte_lead_and_one_trail() {
    let input: &[u8] = b"\xE2\x98";
    test(input, "�");
}

/// Four-byte lead with nothing after it.
#[test]
fn test_four_byte_lead() {
    let input: &[u8] = b"\xF0";
    test(input, "�");
}

/// Four-byte lead followed by one of its three continuation bytes.
#[test]
fn test_four_byte_lead_and_one_trail() {
    let input: &[u8] = b"\xF0\x9F";
    test(input, "�");
}

/// Four-byte lead followed by two of its three continuation bytes.
#[test]
fn test_four_byte_lead_and_two_trails() {
    let input: &[u8] = b"\xF0\x9F\x92";
    test(input, "�");
}
536
/// The byte 0xFE on its own becomes one U+FFFD.
#[test]
fn test_fe() {
    let input: &[u8] = b"\xFE";
    test(input, "�");
}
542
/// 0xFE followed by a continuation byte: each byte becomes its own U+FFFD.
#[test]
fn test_fe_and_trail() {
    let input: &[u8] = b"\xFE\x80";
    test(input, "��");
}
547
/// The byte 0xFF on its own becomes one U+FFFD.
#[test]
fn test_ff() {
    let input: &[u8] = b"\xFF";
    test(input, "�");
}

/// 0xFF followed by a continuation byte: each byte becomes its own U+FFFD.
#[test]
fn test_ff_and_trail() {
    let input: &[u8] = b"\xFF\x80";
    test(input, "��");
}