1use std::io::BufRead;
2
3use alloc::collections::VecDeque;
4
5use crate::{Encoding, Error, Result, scanner::Scanner};
6
/// UTF-8 byte order mark (U+FEFF encoded as UTF-8).
const BOM_UTF8: [u8; 3] = [0xEF, 0xBB, 0xBF];
/// UTF-16 little-endian byte order mark (U+FEFF, low byte first).
const BOM_UTF16LE: [u8; 2] = [0xFF, 0xFE];
/// UTF-16 big-endian byte order mark (U+FEFF, high byte first).
const BOM_UTF16BE: [u8; 2] = [0xFE, 0xFF];
10
11fn yaml_parser_determine_encoding(reader: &mut dyn BufRead) -> Result<Option<Encoding>> {
12 let initial_bytes = reader.fill_buf()?;
13 if initial_bytes.is_empty() {
14 return Ok(None);
15 }
16
17 match initial_bytes[0] {
18 0xef => {
19 let mut bom = [0; 3];
20 reader.read_exact(&mut bom)?;
21 if bom == BOM_UTF8 {
22 Ok(Some(Encoding::Utf8))
23 } else {
24 Err(Error::reader(
25 "invalid byte order marker",
26 0,
27 i32::from_be_bytes([bom[0], bom[1], bom[2], 0]),
28 ))
29 }
30 }
31 0xff | 0xfe => {
32 let mut bom = [0; 2];
33 reader.read_exact(&mut bom)?;
34 if bom == BOM_UTF16LE {
35 Ok(Some(Encoding::Utf16Le))
36 } else if bom == BOM_UTF16BE {
37 Ok(Some(Encoding::Utf16Be))
38 } else {
39 Err(Error::reader(
40 "invalid byte order marker",
41 0,
42 i32::from_le_bytes([bom[0], bom[1], 0, 0]),
43 ))
44 }
45 }
46 _ => Ok(Some(Encoding::Utf8)),
47 }
48}
49
50#[allow(unsafe_code)]
53fn read_utf8_buffered(
54 reader: &mut dyn BufRead,
55 out: &mut VecDeque<char>,
56 offset: &mut usize,
57) -> Result<bool> {
58 let available = loop {
59 match reader.fill_buf() {
60 Ok([]) => return Ok(false),
61 Ok(available) => break available,
62 Err(err) if err.kind() == std::io::ErrorKind::Interrupted => (),
63 Err(err) => return Err(err.into()),
64 }
65 };
66
67 match core::str::from_utf8(available) {
68 Ok(valid) => {
69 let used = valid.len();
70 for ch in valid.chars() {
72 push_char(out, ch, *offset)?;
73 *offset += ch.len_utf8();
74 }
75 reader.consume(used);
76 Ok(true)
77 }
78 Err(err) => {
79 let valid_bytes = err.valid_up_to();
80
81 let valid = unsafe {
84 core::str::from_utf8_unchecked(&available[..valid_bytes])
86 };
87 for ch in valid.chars() {
88 push_char(out, ch, *offset)?;
89 *offset += ch.len_utf8();
90 }
91
92 match err.error_len() {
93 Some(_invalid_len) => Err(Error::reader(
94 "invalid UTF-8",
95 *offset,
96 available[valid_bytes] as _,
97 )),
98 None => {
99 if valid_bytes != 0 {
100 reader.consume(valid_bytes);
104 Ok(true)
105 } else {
106 let initial = available[0];
113 read_utf8_char_unbuffered(reader, out, initial, offset)?;
114 Ok(true)
115 }
116 }
117 }
118 }
119 }
120}
121
122fn read_utf8_char_unbuffered(
123 reader: &mut dyn BufRead,
124 out: &mut VecDeque<char>,
125 initial: u8,
126 offset: &mut usize,
127) -> Result<()> {
128 let width = utf8_char_width(initial);
129 let mut buffer = [0; 4];
130 reader.read_exact(&mut buffer[..width])?;
131 if let Ok(valid) = core::str::from_utf8(&buffer[..width]) {
132 let ch = match valid.chars().next() {
134 Some(ch) => ch,
135 None => unreachable!(),
136 };
137 push_char(out, ch, *offset)?;
138 *offset += width;
139 Ok(())
140 } else {
141 Err(Error::reader("invalid UTF-8", *offset, buffer[0] as _))
144 }
145}
146
147fn read_utf16_buffered<const BIG_ENDIAN: bool>(
148 reader: &mut dyn BufRead,
149 out: &mut VecDeque<char>,
150 offset: &mut usize,
151) -> Result<bool> {
152 let available = loop {
153 match reader.fill_buf() {
154 Ok([]) => return Ok(false),
155 Ok(available) => break available,
156 Err(err) if err.kind() == std::io::ErrorKind::Interrupted => (),
157 Err(err) => return Err(err.into()),
158 }
159 };
160
161 let chunks = available.chunks_exact(2).map(|chunk| {
162 let (a, b) = match chunk {
163 [a, b] => (a, b),
164 _ => unreachable!(),
165 };
166 if BIG_ENDIAN {
167 u16::from_be_bytes([*a, *b])
168 } else {
169 u16::from_le_bytes([*a, *b])
170 }
171 });
172
173 let mut used = 0;
174 for ch in core::char::decode_utf16(chunks) {
175 match ch {
176 Ok(ch) => {
177 push_char(out, ch, *offset)?;
178 let n = ch.len_utf16();
179 *offset += n;
180 used += n;
181 }
182 Err(_) => {
183 break;
190 }
191 }
192 }
193
194 if used != 0 {
195 reader.consume(used);
196 *offset += used;
197 Ok(true)
198 } else {
199 debug_assert!(!available.is_empty() && available.len() < 2);
200 read_utf16_char_unbuffered::<BIG_ENDIAN>(reader, out, offset)?;
201 Ok(true)
202 }
203}
204
205fn read_utf16_char_unbuffered<const BIG_ENDIAN: bool>(
206 reader: &mut dyn BufRead,
207 out: &mut VecDeque<char>,
208 offset: &mut usize,
209) -> Result<()> {
210 let mut buffer = [0; 2];
211 reader.read_exact(&mut buffer)?;
212 let first = if BIG_ENDIAN {
213 u16::from_be_bytes(buffer)
214 } else {
215 u16::from_le_bytes(buffer)
216 };
217
218 if is_utf16_surrogate(first) {
219 reader.read_exact(&mut buffer)?;
220 let second = if BIG_ENDIAN {
221 u16::from_be_bytes(buffer)
222 } else {
223 u16::from_le_bytes(buffer)
224 };
225
226 match core::char::decode_utf16([first, second]).next() {
227 Some(Ok(ch)) => {
228 push_char(out, ch, *offset)?;
229 *offset += 4;
230 Ok(())
231 }
232 Some(Err(err)) => Err(Error::reader(
233 "invalid UTF-16",
234 *offset,
235 err.unpaired_surrogate() as _,
236 )),
237 None => unreachable!(),
238 }
239 } else {
240 match core::char::decode_utf16([first]).next() {
241 Some(Ok(ch)) => {
242 push_char(out, ch, *offset)?;
243 *offset += 2;
244 Ok(())
245 }
246 Some(Err(_)) | None => unreachable!(),
247 }
248 }
249}
250
/// Returns the total length in bytes of a UTF-8 sequence that starts with
/// `initial`, judged only by the lead byte's high bits.
///
/// Returns 0 for bytes that cannot start a sequence under this scheme:
/// continuation bytes (`0x80..=0xBF`) and `0xF8..=0xFF`.
fn utf8_char_width(initial: u8) -> usize {
    match initial {
        0x00..=0x7F => 1, // 0xxxxxxx
        0xC0..=0xDF => 2, // 110xxxxx
        0xE0..=0xEF => 3, // 1110xxxx
        0xF0..=0xF7 => 4, // 11110xxx
        _ => 0,
    }
}
264
/// Returns `true` if `value` lies in the UTF-16 surrogate range
/// `0xD800..=0xDFFF` (high or low surrogate).
fn is_utf16_surrogate(value: u16) -> bool {
    (0xD800..=0xDFFF).contains(&value)
}
268
269fn push_char(out: &mut VecDeque<char>, ch: char, offset: usize) -> Result<()> {
270 if !(ch == '\x09'
271 || ch == '\x0A'
272 || ch == '\x0D'
273 || ch >= '\x20' && ch <= '\x7E'
274 || ch == '\u{0085}'
275 || ch >= '\u{00A0}' && ch <= '\u{D7FF}'
276 || ch >= '\u{E000}' && ch <= '\u{FFFD}'
277 || ch >= '\u{10000}' && ch <= '\u{10FFFF}')
278 {
279 return Err(Error::reader(
280 "control characters are not allowed",
281 offset,
282 ch as _,
283 ));
284 }
285 out.push_back(ch);
286 Ok(())
287}
288
289pub(crate) fn yaml_parser_update_buffer<R: BufRead>(
290 parser: &mut Scanner<R>,
291 length: usize,
292) -> Result<()> {
293 let reader = parser.read_handler.as_mut().expect("no read handler");
294 if parser.buffer.len() >= length {
295 return Ok(());
296 }
297 if parser.encoding == Encoding::Any {
298 if let Some(encoding) = yaml_parser_determine_encoding(reader)? {
299 parser.encoding = encoding;
300 } else {
301 parser.eof = true;
302 return Ok(());
303 }
304 }
305
306 while parser.buffer.len() < length {
307 if parser.eof {
308 return Ok(());
309 }
310
311 let not_eof = match parser.encoding {
312 Encoding::Any => unreachable!(),
313 Encoding::Utf8 => read_utf8_buffered(reader, &mut parser.buffer, &mut parser.offset)?,
314 Encoding::Utf16Le => {
315 read_utf16_buffered::<false>(reader, &mut parser.buffer, &mut parser.offset)?
316 }
317 Encoding::Utf16Be => {
318 read_utf16_buffered::<true>(reader, &mut parser.buffer, &mut parser.offset)?
319 }
320 };
321 if !not_eof {
322 parser.eof = true;
323 return Ok(());
324 }
325 }
326
327 if parser.offset >= (!0_usize).wrapping_div(2_usize) {
328 return Err(Error::reader("input is too long", parser.offset, -1));
329 }
330 Ok(())
331}