libyaml_safer/
reader.rs

1use std::io::BufRead;
2
3use alloc::collections::VecDeque;
4
5use crate::{Encoding, Error, Result, scanner::Scanner};
6
/// Byte order mark that selects UTF-8 when found at the start of the input.
const BOM_UTF8: [u8; 3] = *b"\xEF\xBB\xBF";
/// Byte order mark that selects little-endian UTF-16.
const BOM_UTF16LE: [u8; 2] = *b"\xFF\xFE";
/// Byte order mark that selects big-endian UTF-16.
const BOM_UTF16BE: [u8; 2] = *b"\xFE\xFF";
10
11fn yaml_parser_determine_encoding(reader: &mut dyn BufRead) -> Result<Option<Encoding>> {
12    let initial_bytes = reader.fill_buf()?;
13    if initial_bytes.is_empty() {
14        return Ok(None);
15    }
16
17    match initial_bytes[0] {
18        0xef => {
19            let mut bom = [0; 3];
20            reader.read_exact(&mut bom)?;
21            if bom == BOM_UTF8 {
22                Ok(Some(Encoding::Utf8))
23            } else {
24                Err(Error::reader(
25                    "invalid byte order marker",
26                    0,
27                    i32::from_be_bytes([bom[0], bom[1], bom[2], 0]),
28                ))
29            }
30        }
31        0xff | 0xfe => {
32            let mut bom = [0; 2];
33            reader.read_exact(&mut bom)?;
34            if bom == BOM_UTF16LE {
35                Ok(Some(Encoding::Utf16Le))
36            } else if bom == BOM_UTF16BE {
37                Ok(Some(Encoding::Utf16Be))
38            } else {
39                Err(Error::reader(
40                    "invalid byte order marker",
41                    0,
42                    i32::from_le_bytes([bom[0], bom[1], 0, 0]),
43                ))
44            }
45        }
46        _ => Ok(Some(Encoding::Utf8)),
47    }
48}
49
50// Allowing unsafe code because it is the only efficient way to partially decode
51// a string slice from a stream of UTF-8 bytes.
52#[allow(unsafe_code)]
53fn read_utf8_buffered(
54    reader: &mut dyn BufRead,
55    out: &mut VecDeque<char>,
56    offset: &mut usize,
57) -> Result<bool> {
58    let available = loop {
59        match reader.fill_buf() {
60            Ok([]) => return Ok(false),
61            Ok(available) => break available,
62            Err(err) if err.kind() == std::io::ErrorKind::Interrupted => (),
63            Err(err) => return Err(err.into()),
64        }
65    };
66
67    match core::str::from_utf8(available) {
68        Ok(valid) => {
69            let used = valid.len();
70            // The entire contents of the input buffer was valid UTF-8.
71            for ch in valid.chars() {
72                push_char(out, ch, *offset)?;
73                *offset += ch.len_utf8();
74            }
75            reader.consume(used);
76            Ok(true)
77        }
78        Err(err) => {
79            let valid_bytes = err.valid_up_to();
80
81            // If some of the buffer contents were valid, append that to the
82            // output.
83            let valid = unsafe {
84                // SAFETY: This is safe because of `valid_up_to()`.
85                core::str::from_utf8_unchecked(&available[..valid_bytes])
86            };
87            for ch in valid.chars() {
88                push_char(out, ch, *offset)?;
89                *offset += ch.len_utf8();
90            }
91
92            match err.error_len() {
93                Some(_invalid_len) => Err(Error::reader(
94                    "invalid UTF-8",
95                    *offset,
96                    available[valid_bytes] as _,
97                )),
98                None => {
99                    if valid_bytes != 0 {
100                        // Some valid UTF-8 characters were present, and the
101                        // tail end of the buffer was an incomplete sequence.
102                        // Leave the incomplete sequence in the buffer.
103                        reader.consume(valid_bytes);
104                        Ok(true)
105                    } else {
106                        // The beginning of the buffer was an incomplete UTF-8
107                        // sequence. Read the whole character unbuffered.
108                        //
109                        // This will return `UnexpectedEof` if the sequence
110                        // cannot be completed. Note that `read_exact()` handles
111                        // interrupt automatically.
112                        let initial = available[0];
113                        read_utf8_char_unbuffered(reader, out, initial, offset)?;
114                        Ok(true)
115                    }
116                }
117            }
118        }
119    }
120}
121
122fn read_utf8_char_unbuffered(
123    reader: &mut dyn BufRead,
124    out: &mut VecDeque<char>,
125    initial: u8,
126    offset: &mut usize,
127) -> Result<()> {
128    let width = utf8_char_width(initial);
129    let mut buffer = [0; 4];
130    reader.read_exact(&mut buffer[..width])?;
131    if let Ok(valid) = core::str::from_utf8(&buffer[..width]) {
132        // We read a whole, valid character.
133        let ch = match valid.chars().next() {
134            Some(ch) => ch,
135            None => unreachable!(),
136        };
137        push_char(out, ch, *offset)?;
138        *offset += width;
139        Ok(())
140    } else {
141        // Since we read the exact character width, the only
142        // possible error here is invalid Unicode.
143        Err(Error::reader("invalid UTF-8", *offset, buffer[0] as _))
144    }
145}
146
147fn read_utf16_buffered<const BIG_ENDIAN: bool>(
148    reader: &mut dyn BufRead,
149    out: &mut VecDeque<char>,
150    offset: &mut usize,
151) -> Result<bool> {
152    let available = loop {
153        match reader.fill_buf() {
154            Ok([]) => return Ok(false),
155            Ok(available) => break available,
156            Err(err) if err.kind() == std::io::ErrorKind::Interrupted => (),
157            Err(err) => return Err(err.into()),
158        }
159    };
160
161    let chunks = available.chunks_exact(2).map(|chunk| {
162        let (a, b) = match chunk {
163            [a, b] => (a, b),
164            _ => unreachable!(),
165        };
166        if BIG_ENDIAN {
167            u16::from_be_bytes([*a, *b])
168        } else {
169            u16::from_le_bytes([*a, *b])
170        }
171    });
172
173    let mut used = 0;
174    for ch in core::char::decode_utf16(chunks) {
175        match ch {
176            Ok(ch) => {
177                push_char(out, ch, *offset)?;
178                let n = ch.len_utf16();
179                *offset += n;
180                used += n;
181            }
182            Err(_) => {
183                // An unpaired surrogate may either be a corrupt stream, but it
184                // can also be that the buffer just happens to contain the first
185                // half of a surrogate pair. Consume all of the valid bytes in
186                // the buffer first, and then handle the unpaired surrogate in
187                // the "slow" path (`read_utf16_char_unbuffered`) the next time
188                // we are called.
189                break;
190            }
191        }
192    }
193
194    if used != 0 {
195        reader.consume(used);
196        *offset += used;
197        Ok(true)
198    } else {
199        debug_assert!(!available.is_empty() && available.len() < 2);
200        read_utf16_char_unbuffered::<BIG_ENDIAN>(reader, out, offset)?;
201        Ok(true)
202    }
203}
204
205fn read_utf16_char_unbuffered<const BIG_ENDIAN: bool>(
206    reader: &mut dyn BufRead,
207    out: &mut VecDeque<char>,
208    offset: &mut usize,
209) -> Result<()> {
210    let mut buffer = [0; 2];
211    reader.read_exact(&mut buffer)?;
212    let first = if BIG_ENDIAN {
213        u16::from_be_bytes(buffer)
214    } else {
215        u16::from_le_bytes(buffer)
216    };
217
218    if is_utf16_surrogate(first) {
219        reader.read_exact(&mut buffer)?;
220        let second = if BIG_ENDIAN {
221            u16::from_be_bytes(buffer)
222        } else {
223            u16::from_le_bytes(buffer)
224        };
225
226        match core::char::decode_utf16([first, second]).next() {
227            Some(Ok(ch)) => {
228                push_char(out, ch, *offset)?;
229                *offset += 4;
230                Ok(())
231            }
232            Some(Err(err)) => Err(Error::reader(
233                "invalid UTF-16",
234                *offset,
235                err.unpaired_surrogate() as _,
236            )),
237            None => unreachable!(),
238        }
239    } else {
240        match core::char::decode_utf16([first]).next() {
241            Some(Ok(ch)) => {
242                push_char(out, ch, *offset)?;
243                *offset += 2;
244                Ok(())
245            }
246            Some(Err(_)) | None => unreachable!(),
247        }
248    }
249}
250
/// Returns the total byte length of the UTF-8 sequence introduced by
/// `initial`, judged only by its leading bit pattern.
///
/// Returns `0` when `initial` does not have the bit pattern of a one- to
/// four-byte lead (for example a continuation byte).
fn utf8_char_width(initial: u8) -> usize {
    match initial {
        // 0xxxxxxx
        0x00..=0x7F => 1,
        // 110xxxxx
        0xC0..=0xDF => 2,
        // 1110xxxx
        0xE0..=0xEF => 3,
        // 11110xxx
        0xF0..=0xF7 => 4,
        // 10xxxxxx and 11111xxx
        _ => 0,
    }
}
264
/// Reports whether `value` lies in the UTF-16 surrogate range
/// `0xD800..=0xDFFF` (high and low surrogates alike).
fn is_utf16_surrogate(value: u16) -> bool {
    (0xD800..=0xDFFF).contains(&value)
}
268
269fn push_char(out: &mut VecDeque<char>, ch: char, offset: usize) -> Result<()> {
270    if !(ch == '\x09'
271        || ch == '\x0A'
272        || ch == '\x0D'
273        || ch >= '\x20' && ch <= '\x7E'
274        || ch == '\u{0085}'
275        || ch >= '\u{00A0}' && ch <= '\u{D7FF}'
276        || ch >= '\u{E000}' && ch <= '\u{FFFD}'
277        || ch >= '\u{10000}' && ch <= '\u{10FFFF}')
278    {
279        return Err(Error::reader(
280            "control characters are not allowed",
281            offset,
282            ch as _,
283        ));
284    }
285    out.push_back(ch);
286    Ok(())
287}
288
289pub(crate) fn yaml_parser_update_buffer<R: BufRead>(
290    parser: &mut Scanner<R>,
291    length: usize,
292) -> Result<()> {
293    let reader = parser.read_handler.as_mut().expect("no read handler");
294    if parser.buffer.len() >= length {
295        return Ok(());
296    }
297    if parser.encoding == Encoding::Any {
298        if let Some(encoding) = yaml_parser_determine_encoding(reader)? {
299            parser.encoding = encoding;
300        } else {
301            parser.eof = true;
302            return Ok(());
303        }
304    }
305
306    while parser.buffer.len() < length {
307        if parser.eof {
308            return Ok(());
309        }
310
311        let not_eof = match parser.encoding {
312            Encoding::Any => unreachable!(),
313            Encoding::Utf8 => read_utf8_buffered(reader, &mut parser.buffer, &mut parser.offset)?,
314            Encoding::Utf16Le => {
315                read_utf16_buffered::<false>(reader, &mut parser.buffer, &mut parser.offset)?
316            }
317            Encoding::Utf16Be => {
318                read_utf16_buffered::<true>(reader, &mut parser.buffer, &mut parser.offset)?
319            }
320        };
321        if !not_eof {
322            parser.eof = true;
323            return Ok(());
324        }
325    }
326
327    if parser.offset >= (!0_usize).wrapping_div(2_usize) {
328        return Err(Error::reader("input is too long", parser.offset, -1));
329    }
330    Ok(())
331}