lexpr/parse/
read.rs

1use std::ops::Deref;
2use std::{char, cmp, io, str, u32};
3
4use super::error::{Error, ErrorCode, Result};
5use super::iter::LineColIterator;
6
7/// Trait used by the parser for iterating over input.
8///
9/// This trait is manually "specialized" for iterating over
10/// `&[u8]`. Once feature(specialization) is stable we can use actual
11/// specialization.
12///
13/// This trait is sealed and cannot be implemented for types outside of
14/// `lexpr`.
15pub trait Read<'de>: private::Sealed {
16    #[doc(hidden)]
17    fn next(&mut self) -> Result<Option<u8>>;
18    #[doc(hidden)]
19    fn peek(&mut self) -> Result<Option<u8>>;
20
21    /// Only valid after a call to peek(). Discards the peeked byte.
22    #[doc(hidden)]
23    fn discard(&mut self);
24
25    /// Position of the most recent call to next().
26    ///
27    /// The most recent call was probably next() and not peek(), but this method
28    /// should try to return a sensible result if the most recent call was
29    /// actually peek() because we don't always know.
30    ///
31    /// Only called in case of an error, so performance is not important.
32    #[doc(hidden)]
33    fn position(&self) -> Position;
34
35    /// Position of the most recent call to peek().
36    ///
37    /// The most recent call was probably peek() and not next(), but this method
38    /// should try to return a sensible result if the most recent call was
39    /// actually next() because we don't always know.
40    ///
41    /// Only called in case of an error, so performance is not important.
42    #[doc(hidden)]
43    fn peek_position(&self) -> Position;
44
45    /// Offset from the beginning of the input to the next byte that would be
46    /// returned by next() or peek().
47    #[doc(hidden)]
48    fn byte_offset(&self) -> usize;
49
50    /// Assumes the previous byte was a quotation mark. Parses a string with
51    /// R6RS escapes until the next quotation mark using the given scratch space
52    /// if necessary. The scratch space is initially empty.
53    #[doc(hidden)]
54    fn parse_r6rs_str<'s>(
55        &'s mut self,
56        scratch: &'s mut Vec<u8>,
57    ) -> Result<Reference<'de, 's, str>>;
58
59    /// Assumes the previous byte was a quotation mark. Parses a string with
60    /// Emacs Lisp escapes until the next quotation mark using the given scratch
61    /// space if necessary. The scratch space is initially empty.
62    #[doc(hidden)]
63    fn parse_elisp_str<'s>(
64        &'s mut self,
65        scratch: &'s mut Vec<u8>,
66    ) -> Result<ElispStrReference<'de, 's>>;
67
68    /// Parses an unescaped string until the next whitespace or non-symbol character.
69    #[doc(hidden)]
70    fn parse_symbol<'s>(&'s mut self, scratch: &'s mut Vec<u8>) -> Result<Reference<'de, 's, str>>;
71
72    /// Parses a R6RS character constant.
73    #[doc(hidden)]
74    fn parse_r6rs_char(&mut self, scratch: &mut Vec<u8>) -> Result<char> {
75        // TODO: This could maybe benefit, performance-wise, from a seperate
76        // implementation on `SliceRead`.
77        parse_r6rs_char(self, scratch)
78    }
79
80    /// Parses an Emacs Lisp character constant.
81    fn parse_elisp_char(&mut self, scratch: &mut Vec<u8>) -> Result<char> {
82        // TODO: This could maybe benefit, performance-wise, from a seperate
83        // implementation on `SliceRead`.
84        parse_elisp_char(self, scratch)
85    }
86}
87
88/// A location in the parsed text.
89#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, PartialOrd, Ord)]
90pub struct Position {
91    line: usize,
92    column: usize,
93}
94
95impl Position {
96    pub(crate) fn new(line: usize, column: usize) -> Self {
97        Position { line, column }
98    }
99
100    /// Returns the 1-based line number.
101    pub fn line(&self) -> usize {
102        self.line
103    }
104    /// Returns the column.
105    ///
106    /// Column numbers are 0-based byte offsets from the last line terminator
107    /// (`\n`).
108    pub fn column(&self) -> usize {
109        self.column
110    }
111}
112
113pub enum Reference<'b, 'c, T: ?Sized + 'static> {
114    Borrowed(&'b T),
115    Copied(&'c T),
116}
117
118impl<'b, 'c, T: ?Sized + 'static> Deref for Reference<'b, 'c, T> {
119    type Target = T;
120
121    fn deref(&self) -> &Self::Target {
122        match *self {
123            Reference::Borrowed(b) => b,
124            Reference::Copied(c) => c,
125        }
126    }
127}
128
129#[derive(Debug)]
130pub enum ElispStr<U, M> {
131    Unibyte(U),
132    Multibyte(M),
133}
134
135pub type ElispStrReference<'b, 'c> = ElispStr<Reference<'b, 'c, [u8]>, Reference<'b, 'c, str>>;
136
137#[derive(Debug)]
138enum ElispEscape {
139    Unibyte,
140    Multibyte,
141    Indeterminate,
142}
143
144/// S-expression input source that reads from a std::io input stream.
145pub struct IoRead<R>
146where
147    R: io::Read,
148{
149    iter: LineColIterator<io::Bytes<R>>,
150    /// Temporary storage of peeked byte.
151    ch: Option<u8>,
152}
153
154/// S-expression input source that reads from a slice of bytes.
155//
156// This is more efficient than other iterators because peek() can be read-only
157// and we can compute line/col position only if an error happens.
158pub struct SliceRead<'a> {
159    slice: &'a [u8],
160    /// Index of the *next* byte that will be returned by next() or peek().
161    index: usize,
162}
163
164/// S-expression input source that reads from a UTF-8 string.
165//
166// Able to elide UTF-8 checks by assuming that the input is valid UTF-8.
167pub struct StrRead<'a> {
168    delegate: SliceRead<'a>,
169}
170
171// Prevent users from implementing the Read trait.
172mod private {
173    pub trait Sealed {}
174}
175
176//////////////////////////////////////////////////////////////////////////////
177
178impl<R> IoRead<R>
179where
180    R: io::Read,
181{
182    /// Create a S-expression input source to read from a std::io input stream.
183    pub fn new(reader: R) -> Self {
184        IoRead {
185            iter: LineColIterator::new(reader.bytes()),
186            ch: None,
187        }
188    }
189}
190
191impl<R> private::Sealed for IoRead<R> where R: io::Read {}
192
193impl<R> IoRead<R>
194where
195    R: io::Read,
196{
197    fn parse_r6rs_str_bytes<'s, T, F>(
198        &'s mut self,
199        scratch: &'s mut Vec<u8>,
200        result: F,
201    ) -> Result<T>
202    where
203        T: 's,
204        F: FnOnce(&'s Self, &'s [u8]) -> Result<T>,
205    {
206        loop {
207            let ch = next_or_eof(self)?;
208            match ch {
209                b'"' => {
210                    return result(self, scratch);
211                }
212                b'\\' => {
213                    parse_r6rs_escape(self, scratch)?;
214                }
215                _ => {
216                    scratch.push(ch);
217                }
218            }
219        }
220    }
221
222    fn parse_symbol_bytes<'s, T, F>(&'s mut self, scratch: &'s mut Vec<u8>, result: F) -> Result<T>
223    where
224        T: 's,
225        F: FnOnce(&'s Self, &'s [u8]) -> Result<T>,
226    {
227        loop {
228            match self.peek()? {
229                Some(b' ') | Some(b'\n') | Some(b'\t') | Some(b'\r') | Some(b')') | Some(b']')
230                | Some(b'(') | Some(b'[') | Some(b';') | None => {
231                    if scratch == b"." {
232                        return error(self, ErrorCode::InvalidSymbol);
233                    }
234                    return result(self, scratch);
235                }
236                Some(ch) => {
237                    self.discard();
238                    scratch.push(ch);
239                }
240            }
241        }
242    }
243}
244
245impl<'de, R> Read<'de> for IoRead<R>
246where
247    R: io::Read,
248{
249    #[inline]
250    fn next(&mut self) -> Result<Option<u8>> {
251        match self.ch.take() {
252            Some(ch) => Ok(Some(ch)),
253            None => match self.iter.next() {
254                Some(Err(err)) => Err(Error::io(err)),
255                Some(Ok(ch)) => Ok(Some(ch)),
256                None => Ok(None),
257            },
258        }
259    }
260
261    #[inline]
262    fn peek(&mut self) -> Result<Option<u8>> {
263        match self.ch {
264            Some(ch) => Ok(Some(ch)),
265            None => match self.iter.next() {
266                Some(Err(err)) => Err(Error::io(err)),
267                Some(Ok(ch)) => {
268                    self.ch = Some(ch);
269                    Ok(self.ch)
270                }
271                None => Ok(None),
272            },
273        }
274    }
275
276    #[inline]
277    fn discard(&mut self) {
278        self.ch = None;
279    }
280
281    fn position(&self) -> Position {
282        Position {
283            line: self.iter.line(),
284            column: self.iter.col(),
285        }
286    }
287
288    fn peek_position(&self) -> Position {
289        // The LineColIterator updates its position during peek() so it has the
290        // right one here.
291        self.position()
292    }
293
294    fn byte_offset(&self) -> usize {
295        match self.ch {
296            Some(_) => self.iter.byte_offset() - 1,
297            None => self.iter.byte_offset(),
298        }
299    }
300
301    fn parse_r6rs_str<'s>(
302        &'s mut self,
303        scratch: &'s mut Vec<u8>,
304    ) -> Result<Reference<'de, 's, str>> {
305        self.parse_r6rs_str_bytes(scratch, as_str)
306            .map(Reference::Copied)
307    }
308
309    fn parse_elisp_str<'s>(
310        &'s mut self,
311        scratch: &'s mut Vec<u8>,
312    ) -> Result<ElispStrReference<'de, 's>> {
313        let mut seen_ub_escape = false;
314        let mut seen_mb_escape = false;
315        let mut seen_non_ascii = false;
316        loop {
317            let ch = next_or_eof(self)?;
318            match ch {
319                b'"' => {
320                    let s = if seen_ub_escape && !(seen_mb_escape || seen_non_ascii) {
321                        ElispStr::Unibyte(Reference::Copied(scratch.as_slice()))
322                    } else {
323                        ElispStr::Multibyte(Reference::Copied(as_str(self, scratch)?))
324                    };
325                    return Ok(s);
326                }
327                b'\\' => match parse_elisp_escape(self, scratch)? {
328                    ElispEscape::Unibyte => seen_ub_escape = true,
329                    ElispEscape::Multibyte => seen_mb_escape = true,
330                    ElispEscape::Indeterminate => {}
331                },
332                _ => {
333                    if ch > 127 {
334                        seen_non_ascii = true;
335                    }
336                    scratch.push(ch);
337                }
338            }
339        }
340    }
341
342    fn parse_symbol<'s>(&'s mut self, scratch: &'s mut Vec<u8>) -> Result<Reference<'de, 's, str>> {
343        self.parse_symbol_bytes(scratch, as_str)
344            .map(Reference::Copied)
345    }
346}
347
348//////////////////////////////////////////////////////////////////////////////
349
350impl<'a> SliceRead<'a> {
351    /// Create a S-expression input source to read from a slice of bytes.
352    pub fn new(slice: &'a [u8]) -> Self {
353        SliceRead { slice, index: 0 }
354    }
355
356    fn position_of_index(&self, i: usize) -> Position {
357        let mut position = Position { line: 1, column: 0 };
358        for ch in &self.slice[..i] {
359            match *ch {
360                b'\n' => {
361                    position.line += 1;
362                    position.column = 0;
363                }
364                _ => {
365                    position.column += 1;
366                }
367            }
368        }
369        position
370    }
371
372    fn parse_symbol_bytes<'s, T: ?Sized, F>(
373        &'s mut self,
374        scratch: &'s mut Vec<u8>,
375        result: F,
376    ) -> Result<Reference<'a, 's, T>>
377    where
378        T: 's,
379        F: for<'f> FnOnce(&'s Self, &'f [u8]) -> Result<&'f T>,
380    {
381        // Index of the first byte not yet copied into the scratch space.
382        let start = self.index;
383
384        loop {
385            match self.peek_byte() {
386                None | Some(b' ') | Some(b'\n') | Some(b'\t') | Some(b'\r') | Some(b')')
387                | Some(b']') | Some(b'(') | Some(b'[') | Some(b';') => {
388                    if scratch.is_empty() {
389                        // Fast path: return a slice of the raw S-expression without any
390                        // copying.
391                        let borrowed = &self.slice[start..self.index];
392                        if borrowed == b"." {
393                            return error(self, ErrorCode::InvalidSymbol);
394                        }
395                        return result(self, borrowed).map(Reference::Borrowed);
396                    } else {
397                        scratch.extend_from_slice(&self.slice[start..self.index]);
398                        if scratch == b"." {
399                            return error(self, ErrorCode::InvalidSymbol);
400                        }
401                        // "as &[u8]" is required for rustc 1.8.0
402                        let copied = scratch as &[u8];
403                        return result(self, copied).map(Reference::Copied);
404                    }
405                }
406                _ => self.index += 1,
407            }
408        }
409    }
410
411    fn peek_byte(&self) -> Option<u8> {
412        if self.index < self.slice.len() {
413            Some(self.slice[self.index])
414        } else {
415            None
416        }
417    }
418
419    fn parse_elisp_str_bytes<'s, F>(
420        &'s mut self,
421        scratch: &'s mut Vec<u8>,
422        result: F,
423    ) -> Result<ElispStrReference<'a, 's>>
424    where
425        F: for<'f> FnOnce(&'s Self, &'f [u8]) -> Result<&'f str>,
426    {
427        // Index of the first byte not yet copied into the scratch space.
428        let mut start = self.index;
429        let mut seen_ub_escape = false;
430        let mut seen_mb_escape = false;
431        let mut seen_non_ascii = false;
432
433        loop {
434            while self.index < self.slice.len() {
435                let ch = self.slice[self.index];
436                if ch > 127 {
437                    seen_non_ascii = true;
438                }
439                if needs_escape(ch) {
440                    break;
441                }
442                self.index += 1;
443            }
444            if self.index == self.slice.len() {
445                return error(self, ErrorCode::EofWhileParsingString);
446            }
447            match self.slice[self.index] {
448                b'"' => {
449                    if scratch.is_empty() {
450                        // Fast path: return a slice of the raw S-expression without any
451                        // copying.
452                        let borrowed = &self.slice[start..self.index];
453                        self.index += 1;
454                        if seen_ub_escape && !(seen_mb_escape || seen_non_ascii) {
455                            return Ok(ElispStr::Unibyte(Reference::Borrowed(borrowed)));
456                        } else {
457                            return result(self, borrowed)
458                                .map(|s| ElispStr::Multibyte(Reference::Borrowed(s)));
459                        }
460                    } else {
461                        scratch.extend_from_slice(&self.slice[start..self.index]);
462                        self.index += 1;
463                        if seen_ub_escape && !(seen_mb_escape || seen_non_ascii) {
464                            return Ok(ElispStr::Unibyte(Reference::Copied(scratch)));
465                        } else {
466                            return result(self, scratch)
467                                .map(|s| ElispStr::Multibyte(Reference::Copied(s)));
468                        }
469                    }
470                }
471                b'\\' => {
472                    scratch.extend_from_slice(&self.slice[start..self.index]);
473                    self.index += 1;
474                    match parse_elisp_escape(self, scratch)? {
475                        ElispEscape::Multibyte => seen_mb_escape = true,
476                        ElispEscape::Unibyte => seen_ub_escape = true,
477                        ElispEscape::Indeterminate => {}
478                    }
479                    start = self.index;
480                }
481                _ => unreachable!(),
482            }
483        }
484    }
485
486    /// The big optimization here over IoRead is that if the string contains no
487    /// backslash escape sequences, the returned &str is a slice of the raw
488    /// S-expression data so we avoid copying into the scratch space.
489    fn parse_r6rs_str_bytes<'s, T: ?Sized, F>(
490        &'s mut self,
491        scratch: &'s mut Vec<u8>,
492        result: F,
493    ) -> Result<Reference<'a, 's, T>>
494    where
495        T: 's,
496        F: for<'f> FnOnce(&'s Self, &'f [u8]) -> Result<&'f T>,
497    {
498        // Index of the first byte not yet copied into the scratch space.
499        let mut start = self.index;
500
501        loop {
502            while self.index < self.slice.len() && !needs_escape(self.slice[self.index]) {
503                self.index += 1;
504            }
505            if self.index == self.slice.len() {
506                return error(self, ErrorCode::EofWhileParsingString);
507            }
508            match self.slice[self.index] {
509                b'"' => {
510                    if scratch.is_empty() {
511                        // Fast path: return a slice of the raw S-expression without any
512                        // copying.
513                        let borrowed = &self.slice[start..self.index];
514                        self.index += 1;
515                        return result(self, borrowed).map(Reference::Borrowed);
516                    } else {
517                        scratch.extend_from_slice(&self.slice[start..self.index]);
518                        self.index += 1;
519                        return result(self, scratch).map(Reference::Copied);
520                    }
521                }
522                b'\\' => {
523                    scratch.extend_from_slice(&self.slice[start..self.index]);
524                    self.index += 1;
525                    parse_r6rs_escape(self, scratch)?;
526                    start = self.index;
527                }
528                _ => unreachable!(),
529            }
530        }
531    }
532}
533
534impl<'a> private::Sealed for SliceRead<'a> {}
535
536impl<'a> Read<'a> for SliceRead<'a> {
537    #[inline]
538    fn next(&mut self) -> Result<Option<u8>> {
539        // `Ok(self.slice.get(self.index).map(|ch| { self.index += 1; *ch }))`
540        // is about 10% slower.
541        Ok(if self.index < self.slice.len() {
542            let ch = self.slice[self.index];
543            self.index += 1;
544            Some(ch)
545        } else {
546            None
547        })
548    }
549
550    #[inline]
551    fn peek(&mut self) -> Result<Option<u8>> {
552        // `Ok(self.slice.get(self.index).map(|ch| *ch))` is about 10% slower
553        // for some reason.
554        Ok(if self.index < self.slice.len() {
555            Some(self.slice[self.index])
556        } else {
557            None
558        })
559    }
560
561    #[inline]
562    fn discard(&mut self) {
563        self.index += 1;
564    }
565
566    fn position(&self) -> Position {
567        self.position_of_index(self.index)
568    }
569
570    fn peek_position(&self) -> Position {
571        // Cap it at slice.len() just in case the most recent call was next()
572        // and it returned the last byte.
573        self.position_of_index(cmp::min(self.slice.len(), self.index + 1))
574    }
575
576    fn byte_offset(&self) -> usize {
577        self.index
578    }
579
580    fn parse_r6rs_str<'s>(
581        &'s mut self,
582        scratch: &'s mut Vec<u8>,
583    ) -> Result<Reference<'a, 's, str>> {
584        self.parse_r6rs_str_bytes(scratch, as_str)
585    }
586
587    fn parse_elisp_str<'s>(
588        &'s mut self,
589        scratch: &'s mut Vec<u8>,
590    ) -> Result<ElispStrReference<'a, 's>> {
591        self.parse_elisp_str_bytes(scratch, as_str)
592    }
593
594    fn parse_symbol<'s>(&'s mut self, scratch: &'s mut Vec<u8>) -> Result<Reference<'a, 's, str>> {
595        self.parse_symbol_bytes(scratch, as_str)
596    }
597}
598
599//////////////////////////////////////////////////////////////////////////////
600
601impl<'a> StrRead<'a> {
602    /// Create a S-expression input source to read from a UTF-8 string.
603    pub fn new(s: &'a str) -> Self {
604        StrRead {
605            delegate: SliceRead::new(s.as_bytes()),
606        }
607    }
608}
609
610impl<'a> private::Sealed for StrRead<'a> {}
611
612impl<'a> Read<'a> for StrRead<'a> {
613    #[inline]
614    fn next(&mut self) -> Result<Option<u8>> {
615        self.delegate.next()
616    }
617
618    #[inline]
619    fn peek(&mut self) -> Result<Option<u8>> {
620        self.delegate.peek()
621    }
622
623    #[inline]
624    fn discard(&mut self) {
625        self.delegate.discard();
626    }
627
628    fn position(&self) -> Position {
629        self.delegate.position()
630    }
631
632    fn peek_position(&self) -> Position {
633        self.delegate.peek_position()
634    }
635
636    fn byte_offset(&self) -> usize {
637        self.delegate.byte_offset()
638    }
639
640    fn parse_r6rs_str<'s>(
641        &'s mut self,
642        scratch: &'s mut Vec<u8>,
643    ) -> Result<Reference<'a, 's, str>> {
644        self.delegate.parse_r6rs_str_bytes(scratch, |_, bytes| {
645            // The input is assumed to be valid UTF-8 and the \x-escapes are
646            // checked along the way, so don't need to check here.
647            Ok(unsafe { str::from_utf8_unchecked(bytes) })
648        })
649    }
650
651    fn parse_elisp_str<'s>(
652        &'s mut self,
653        scratch: &'s mut Vec<u8>,
654    ) -> Result<ElispStrReference<'a, 's>> {
655        // We cannot apply the same optimization as in `parse_r6rs_str`, because
656        // Emacs Lisp allows embedding arbitrary bytes using hex and octal
657        // escapes.
658        self.delegate.parse_elisp_str_bytes(scratch, as_str)
659    }
660
661    fn parse_symbol<'s>(&'s mut self, scratch: &'s mut Vec<u8>) -> Result<Reference<'a, 's, str>> {
662        self.delegate.parse_symbol_bytes(scratch, |_, bytes| {
663            // The input is assumed to be valid UTF-8 and the \u-escapes are
664            // checked along the way, so don't need to check here.
665            Ok(unsafe { str::from_utf8_unchecked(bytes) })
666        })
667    }
668}
669
670fn next_or_eof<'de, R: ?Sized + Read<'de>>(read: &mut R) -> Result<u8> {
671    match read.next()? {
672        Some(b) => Ok(b),
673        None => error(read, ErrorCode::EofWhileParsingString),
674    }
675}
676
677fn next_or_eof_char<'de, R: ?Sized + Read<'de>>(read: &mut R) -> Result<u8> {
678    match read.next()? {
679        Some(b) => Ok(b),
680        None => error(read, ErrorCode::EofWhileParsingCharacterConstant),
681    }
682}
683
684fn error<'de, R: ?Sized + Read<'de>, T>(read: &R, reason: ErrorCode) -> Result<T> {
685    let position = read.position();
686    Err(Error::syntax(reason, position.line, position.column))
687}
688
689fn as_str<'de, 's, R: Read<'de>>(read: &R, slice: &'s [u8]) -> Result<&'s str> {
690    str::from_utf8(slice).or_else(|_| error(read, ErrorCode::InvalidUnicodeCodePoint))
691}
692
693fn as_char<'de, 's, R: Read<'de> + ?Sized>(read: &R, value: u32) -> Result<char> {
694    match char::from_u32(value) {
695        None => error(read, ErrorCode::InvalidUnicodeCodePoint),
696        Some(c) => Ok(c),
697    }
698}
699
700fn needs_escape(c: u8) -> bool {
701    c == b'\\' || c == b'"'
702}
703
704/// Assumes the previous byte was a hex escape sequnce ('\x') in a string.
705/// Parses the hexadecimal sequence, including the terminating semicolon.
706fn decode_r6rs_hex_escape<'de, R: Read<'de>>(read: &mut R) -> Result<u32> {
707    let mut n = 0;
708    loop {
709        let next = next_or_eof(read)?;
710        if next == b';' {
711            return Ok(n);
712        }
713        match decode_hex_val(next) {
714            None => return error(read, ErrorCode::EofWhileParsingString),
715            Some(val) => {
716                if n >= (1 << 24) {
717                    // A codepoint never has more than 24 bits
718                    return error(read, ErrorCode::InvalidUnicodeCodePoint);
719                }
720                n = (n << 4) + u32::from(val);
721            }
722        }
723    }
724}
725
726/// Parses an R6RS escape sequence and appends it into the scratch
727/// space. Assumes the previous byte read was a backslash.
728fn parse_r6rs_escape<'de, R: Read<'de>>(read: &mut R, scratch: &mut Vec<u8>) -> Result<()> {
729    let ch = next_or_eof(read)?;
730
731    match ch {
732        b'"' => scratch.push(b'"'),
733        b'\\' => scratch.push(b'\\'),
734        b'a' => scratch.push(0x07),
735        b'b' => scratch.push(0x08),
736        b'f' => scratch.push(0x0C),
737        b'n' => scratch.push(b'\n'),
738        b'r' => scratch.push(b'\r'),
739        b't' => scratch.push(b'\t'),
740        b'v' => scratch.push(0x0B),
741        b'|' => scratch.push(b'|'),
742        // TODO: trailing backspace (i.e., a continuation line)
743        b'x' => {
744            let c = {
745                let n = decode_r6rs_hex_escape(read)?;
746                match char::from_u32(n) {
747                    Some(c) => c,
748                    None => {
749                        return error(read, ErrorCode::InvalidUnicodeCodePoint);
750                    }
751                }
752            };
753            scratch.extend_from_slice(c.encode_utf8(&mut [0_u8; 4]).as_bytes());
754        }
755        _ => {
756            return error(read, ErrorCode::InvalidEscape);
757        }
758    }
759
760    Ok(())
761}
762
763/// Assumes the previous byte was a hex escape sequence ('\x') in a string.
764/// Parses the hexadecimal sequence, not including the terminating character
765/// (i.e, first non hex-digit).
766fn decode_elisp_hex_escape<'de, R: Read<'de> + ?Sized>(read: &mut R) -> Result<u32> {
767    let mut n = 0;
768    while let Some(c) = read.peek()? {
769        match decode_hex_val(c) {
770            None => break,
771            Some(val) => {
772                read.discard();
773                if n >= (1 << 24) {
774                    // A codepoint never has more than 24 bits
775                    return error(read, ErrorCode::InvalidUnicodeCodePoint);
776                }
777                n = (n << 4) + u32::from(val);
778            }
779        }
780    }
781    Ok(n)
782}
783
784fn decode_elisp_uni_escape<'de, R: Read<'de> + ?Sized>(read: &mut R, count: u8) -> Result<u32> {
785    let mut n = 0;
786    for _ in 0..count {
787        let c = next_or_eof(read)?;
788        let val = match decode_hex_val(c) {
789            None => return error(read, ErrorCode::InvalidEscape),
790            Some(val) => val,
791        };
792        if n >= (1 << 24) {
793            // A codepoint never has more than 24 bits
794            return error(read, ErrorCode::InvalidUnicodeCodePoint);
795        }
796        n = (n << 4) + u32::from(val);
797    }
798    Ok(n)
799}
800
801/// Assumes the previous byte was a backslash, followed by a character is in the
802/// octal range ('0'..'7'). This character is passed in as `initial`.  Parses
803/// the octal sequence, not including the terminating character (i.e, first non
804/// octal digit).
805fn decode_elisp_octal_escape<'de, R: Read<'de> + ?Sized>(read: &mut R, initial: u8) -> Result<u32> {
806    let mut n = u32::from(initial - b'0');
807    while let Some(c) = read.peek()? {
808        match decode_octal_val(c) {
809            None => break,
810            Some(val) => {
811                read.discard();
812                if n >= (1 << 24) {
813                    // A codepoint never has more than 24 bits
814                    return error(read, ErrorCode::InvalidUnicodeCodePoint);
815                }
816                n = (n << 3) + u32::from(val);
817            }
818        }
819    }
820    Ok(n)
821}
822
823fn parse_elisp_char_escape<'de, R, F>(
824    read: &mut R,
825    scratch: &mut Vec<u8>,
826    decode: F,
827) -> Result<ElispEscape>
828where
829    R: Read<'de>,
830    F: FnOnce(&mut R) -> Result<u32>,
831{
832    let n = decode(read)?;
833    match char::from_u32(n) {
834        Some(c) => {
835            if n > 255 {
836                scratch.extend_from_slice(c.encode_utf8(&mut [0_u8; 4]).as_bytes());
837                Ok(ElispEscape::Multibyte)
838            } else {
839                scratch.push(n as u8);
840                Ok(ElispEscape::Unibyte)
841            }
842        }
843        None => error(read, ErrorCode::InvalidUnicodeCodePoint),
844    }
845}
846
847fn parse_elisp_uni_char_escape<'de, R, F>(
848    read: &mut R,
849    scratch: &mut Vec<u8>,
850    decode: F,
851) -> Result<ElispEscape>
852where
853    R: Read<'de>,
854    F: FnOnce(&mut R) -> Result<u32>,
855{
856    let n = decode(read)?;
857    match char::from_u32(n) {
858        Some(c) => {
859            scratch.extend_from_slice(c.encode_utf8(&mut [0_u8; 4]).as_bytes());
860            Ok(ElispEscape::Multibyte)
861        }
862        None => error(read, ErrorCode::InvalidUnicodeCodePoint),
863    }
864}
865
866/// Parses an Emacs Lisp escape sequence and appends it into the scratch
867/// space. Assumes the previous byte read was a backslash.
868///
869/// Note that we explictly do not support meta-character syntax in
870/// strings. Including meta-escaped characters in strings is recommended against
871/// in the Emacs Lisp manual, Section 21.7.15 "Putting Keyboard Events in
872/// Strings". The same goes for control characters written in the
873/// "keyboard-oriented" syntax (see Section, 2.3.3.3 "Control-Character
874/// Syntax").
875fn parse_elisp_escape<'de, R: Read<'de>>(
876    read: &mut R,
877    scratch: &mut Vec<u8>,
878) -> Result<ElispEscape> {
879    let ch = next_or_eof(read)?;
880
881    match ch {
882        b'"' => scratch.push(b'"'),
883        b'\\' => scratch.push(b'\\'),
884        b' ' => {} // Escaped blank is ignored
885        b'a' => scratch.push(0x07),
886        b'b' => scratch.push(0x08),
887        b't' => scratch.push(b'\t'),
888        b'n' => scratch.push(b'\n'),
889        b'v' => scratch.push(0x0B),
890        b'f' => scratch.push(0x0C),
891        b'r' => scratch.push(b'\r'),
892        b'e' => scratch.push(0x1B),
893        b's' => scratch.push(b' '),
894        b'd' => scratch.push(0x7F),
895        b'^' => {
896            // Control character syntax
897            let ch = next_or_eof(read)?.to_ascii_lowercase();
898            if ch.is_ascii_lowercase() {
899                scratch.push(ch - b'a');
900            } else {
901                return error(read, ErrorCode::InvalidEscape);
902            }
903        }
904        b'N' => {
905            // Name based unicode escape, `\N{NAME}` or `\N{U+X}`. These imply a
906            // multibyte string.
907            if next_or_eof(read)? != b'{' {
908                return error(read, ErrorCode::InvalidEscape);
909            }
910            if next_or_eof(read)? != b'U' || next_or_eof(read)? != b'+' {
911                return error(read, ErrorCode::InvalidEscape);
912            }
913            let escape = parse_elisp_uni_char_escape(read, scratch, decode_elisp_hex_escape)?;
914            if next_or_eof(read)? != b'}' {
915                return error(read, ErrorCode::InvalidEscape);
916            }
917            return Ok(escape);
918        }
919        b'u' => {
920            // Codepoint unicode escape, `\uXXXX` (exactly four hex digits). These imply a multibyte
921            // string.
922            return parse_elisp_uni_char_escape(read, scratch, |read| {
923                decode_elisp_uni_escape(read, 4)
924            });
925        }
926        b'U' => {
927            // Codepoint unicode escape, `\UXXXXXXXX` (exactly eight hex
928            // digits). These imply a multibyte string.
929            return parse_elisp_uni_char_escape(read, scratch, |read| {
930                decode_elisp_uni_escape(read, 8)
931            });
932        }
933        b'x' => {
934            // Hexadecimal escape, allows arbitrary number of hex digits. If in
935            // byte range, these imply a unibyte string, if the other conditions
936            // are met.
937            return parse_elisp_char_escape(read, scratch, decode_elisp_hex_escape);
938        }
939        b'0' | b'1' | b'2' | b'3' | b'4' | b'5' | b'6' | b'7' => {
940            // Octal escape, allows arbitrary number of octale digits. If in
941            // byte range, these imply a unibyte string, if the other conditions
942            // are met.
943            return parse_elisp_char_escape(read, scratch, |read| {
944                decode_elisp_octal_escape(read, ch)
945            });
946        }
947        _ => scratch.push(ch),
948    }
949    Ok(ElispEscape::Indeterminate)
950}
951
952/// Expects either `<any character>, `<character name>` or `x<hex scalar
953/// value>`. Clears the scratch space if necessary.
954fn parse_r6rs_char<'de, R: Read<'de> + ?Sized>(
955    read: &mut R,
956    scratch: &mut Vec<u8>,
957) -> Result<char> {
958    let initial = next_or_eof_char(read)?;
959    if initial == b'x' {
960        match decode_r6rs_char_hex_escape(read)? {
961            Some(n) => match char::from_u32(n) {
962                Some(c) => Ok(c),
963                None => error(read, ErrorCode::InvalidUnicodeCodePoint),
964            },
965            None => Ok('x'),
966        }
967    } else if initial > 0x7F {
968        decode_utf8_sequence(read, scratch, initial)
969    } else {
970        // ASCII character, may be standalone or part of a `<character name>`.
971        let next = match read.peek()? {
972            Some(next) => {
973                if is_delimiter(next) {
974                    return Ok(char::from(initial));
975                } else {
976                    next
977                }
978            }
979            None => return Ok(char::from(initial)),
980        };
981        // Accumulate `<character name>` in scratch space
982        scratch.clear();
983        scratch.push(initial);
984        scratch.push(next);
985        read.discard();
986        while let Some(next) = read.peek()? {
987            if is_delimiter(next) {
988                break;
989            }
990            scratch.push(next);
991            read.discard();
992        }
993        match scratch.as_slice() {
994            b"nul" => Ok('\x00'),
995            b"alarm" => Ok('\x07'),
996            b"backspace" => Ok('\x08'),
997            b"tab" => Ok('\t'),
998            b"linefeed" => Ok('\n'),
999            b"newline" => Ok('\n'),
1000            b"vtab" => Ok('\x0B'),
1001            b"page" => Ok('\x0C'),
1002            b"return" => Ok('\r'),
1003            b"esc" => Ok('\x1B'),
1004            b"space" => Ok(' '),
1005            b"delete" => Ok('\x7F'),
1006            _ => error(read, ErrorCode::InvalidCharacterConstant),
1007        }
1008    }
1009}
1010
1011/// Expects a `#\x` sequence has just been consumed; returns the value of the
1012/// subsequent hex digits, or `None`, if the sequence was empty.
1013fn decode_r6rs_char_hex_escape<'de, R: Read<'de> + ?Sized>(read: &mut R) -> Result<Option<u32>> {
1014    let mut n = 0;
1015    let mut first = true;
1016    loop {
1017        let next = match read.peek()? {
1018            Some(c) => {
1019                if is_delimiter(c) {
1020                    return Ok(if first { None } else { Some(n) });
1021                } else {
1022                    c
1023                }
1024            }
1025            None => return Ok(if first { None } else { Some(n) }),
1026        };
1027        read.discard();
1028        first = false;
1029        match decode_hex_val(next) {
1030            None => return error(read, ErrorCode::EofWhileParsingCharacterConstant),
1031            Some(val) => {
1032                if n >= (1 << 24) {
1033                    // A codepoint never has more than 24 bits
1034                    return error(read, ErrorCode::InvalidUnicodeCodePoint);
1035                }
1036                n = (n << 4) + u32::from(val);
1037            }
1038        }
1039    }
1040}
1041
1042/// Expects either `\<memnonic escape>`, `\<normal escape>`, `\x<hex scalar
1043/// value>` or `<non-delimiter>` where `<non delimiter>` is any character but
1044/// `()[]\;"`. Clears the scratch space if necessary.
1045fn parse_elisp_char<'de, R: Read<'de> + ?Sized>(
1046    read: &mut R,
1047    scratch: &mut Vec<u8>,
1048) -> Result<char> {
1049    let initial = match read.next()? {
1050        None => return error(read, ErrorCode::EofWhileParsingCharacterConstant),
1051        Some(c) => c,
1052    };
1053    if initial > 0x7F {
1054        decode_utf8_sequence(read, scratch, initial)
1055    } else {
1056        // ASCII character
1057        match initial {
1058            b'(' | b')' | b'[' | b']' | b';' => error(read, ErrorCode::InvalidCharacterConstant),
1059            b'\\' => decode_elisp_char_escape(read, scratch),
1060            _ => Ok(char::from(initial)),
1061        }
1062    }
1063}
1064
1065fn decode_elisp_char_escape<'de, R: Read<'de> + ?Sized>(
1066    read: &mut R,
1067    scratch: &mut Vec<u8>,
1068) -> Result<char> {
1069    let ch = next_or_eof_char(read)?;
1070    match ch {
1071        b'a' => Ok('\x07'),
1072        b'b' => Ok('\x08'),
1073        b't' => Ok('\t'),
1074        b'n' => Ok('\n'),
1075        b'v' => Ok('\x0B'),
1076        b'f' => Ok('\x0C'),
1077        b'r' => Ok('\r'),
1078        b'e' => Ok('\x1B'),
1079        b's' => Ok(' '),
1080        b'\\' => Ok('\\'),
1081        b'd' => Ok('\x7F'),
1082        b'^' => {
1083            // Control character syntax
1084            let ch = next_or_eof_char(read)?.to_ascii_lowercase();
1085            if ch.is_ascii_lowercase() {
1086                Ok(char::from(ch - b'a'))
1087            } else {
1088                error(read, ErrorCode::InvalidEscape)
1089            }
1090        }
1091        b'N' => {
1092            // Name based unicode escape, `\N{NAME}` or `\N{U+X}`. These imply a
1093            // multibyte string.
1094            if next_or_eof_char(read)? != b'{' {
1095                return error(read, ErrorCode::InvalidEscape);
1096            }
1097            if next_or_eof_char(read)? != b'U' || next_or_eof_char(read)? != b'+' {
1098                return error(read, ErrorCode::InvalidEscape);
1099            }
1100            let n = decode_elisp_hex_escape(read)?;
1101            if next_or_eof(read)? != b'}' {
1102                return error(read, ErrorCode::InvalidEscape);
1103            }
1104            match char::from_u32(n) {
1105                Some(c) => Ok(c),
1106                None => error(read, ErrorCode::InvalidEscape),
1107            }
1108        }
1109        b'u' => {
1110            // Codepoint unicode escape, `\uXXXX` (exactly four hex digits).
1111            decode_elisp_uni_escape(read, 4).and_then(|n| as_char(read, n))
1112        }
1113        b'U' => {
1114            // Codepoint unicode escape, `\UXXXXXXXX` (exactly eight hex
1115            // digits).
1116            decode_elisp_uni_escape(read, 8).and_then(|n| as_char(read, n))
1117        }
1118        b'x' => {
1119            // Hexadecimal escape, allows arbitrary number of hex digits.
1120            decode_elisp_hex_escape(read).and_then(|n| as_char(read, n))
1121        }
1122        b'0' | b'1' | b'2' | b'3' | b'4' | b'5' | b'6' | b'7' => {
1123            // Octal escape, allows arbitrary number of octale digits.
1124            decode_elisp_octal_escape(read, ch).and_then(|n| as_char(read, n))
1125        }
1126        next => {
1127            if next > 0x7F {
1128                decode_utf8_sequence(read, scratch, next)
1129            } else {
1130                Ok(char::from(next))
1131            }
1132        }
1133    }
1134}
1135
1136#[allow(clippy::zero_prefixed_literal)]
1137static HEX: [u8; 256] = {
1138    const __: u8 = 255; // not a hex digit
1139    [
1140        //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
1141        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 0
1142        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 1
1143        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2
1144        00, 01, 02, 03, 04, 05, 06, 07, 08, 09, __, __, __, __, __, __, // 3
1145        __, 10, 11, 12, 13, 14, 15, __, __, __, __, __, __, __, __, __, // 4
1146        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 5
1147        __, 10, 11, 12, 13, 14, 15, __, __, __, __, __, __, __, __, __, // 6
1148        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7
1149        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8
1150        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9
1151        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A
1152        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B
1153        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C
1154        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D
1155        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E
1156        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F
1157    ]
1158};
1159
1160fn decode_hex_val(val: u8) -> Option<u8> {
1161    let n = HEX[val as usize];
1162    if n == 255 {
1163        None
1164    } else {
1165        Some(n)
1166    }
1167}
1168
1169fn decode_octal_val(val: u8) -> Option<u8> {
1170    if (b'0'..=b'7').contains(&val) {
1171        Some(val - b'0')
1172    } else {
1173        None
1174    }
1175}
1176
1177fn is_delimiter(c: u8) -> bool {
1178    DELIMITER.contains(&c)
1179}
1180
1181// This could probably profit from being a `u8 -> bool` LUT instead.
1182static DELIMITER: [u8; 12] = [
1183    b'(', b')', b'[', b']', b'"', b';', b'#', b' ', b'\n', b'\t', b'\r', 0x0C,
1184];
1185
1186/// Decode a UTF8 multibyte sequence starting with `initial` and return the
1187/// decoded codepoint.
1188fn decode_utf8_sequence<'de, R: Read<'de> + ?Sized>(
1189    read: &mut R,
1190    scratch: &mut Vec<u8>,
1191    initial: u8,
1192) -> Result<char> {
1193    // Start of a UTF-8 sequence. It seems there is no simple way to
1194    // consume a single UTF-8 codepoint from a source of bytes, so
1195    // we implement just enough of an UTF-8 decoder to consume the
1196    // correct number of bytes, and then use the standard library to
1197    // turn these into a codepoint. If one would like to optimize
1198    // this, doing the complete decoding here could eliminate the
1199    // use of `scratch` and the calls into the standard library.
1200    if !(0xC2..=0xF4).contains(&initial) {
1201        return error(read, ErrorCode::InvalidUnicodeCodePoint);
1202    }
1203    scratch.clear();
1204    scratch.push(initial);
1205    let len = (initial - 0xC0) >> 4;
1206    for _ in 0..len {
1207        let b = match read.next()? {
1208            Some(c) => c,
1209            None => return error(read, ErrorCode::InvalidUnicodeCodePoint),
1210        };
1211        scratch.push(b);
1212    }
1213    match str::from_utf8(scratch) {
1214        Err(_) => error(read, ErrorCode::InvalidUnicodeCodePoint),
1215        Ok(s) => Ok(s.chars().next().unwrap()),
1216    }
1217}
lexpr/parse/read.rs

lexpr/parse/
read.rs