quoted_string/
spec.rs

//! This module contains types for specifying what kind of quoted string is used.
2use std::fmt::Debug;
3use error::CoreError;
4
5/// type to specify the quoting classifier and parsing implementation
6///
7/// This is normally a zero-sized type.
8pub trait GeneralQSSpec: Clone+Debug {
9    type Quoting: QuotingClassifier;
10    type Parsing: ParsingImpl;
11}
12
13/// Type to provide a quoting classification method.
14///
15/// This is normally a zero-sized type.
16pub trait QuotingClassifier {
17
18    /// Returns the "class" the partial code point belongs too
19    ///
20    /// This is either `QTest`, `NeedsQuoting` or `Invalid`.
21    /// As this function accepts `PartialCodePoint` it can not
22    /// differ between different utf-8 characters, which happens
23    /// to be fine for all supported quoting use cases.
24    fn classify_for_quoting(pcp: PartialCodePoint) -> QuotingClass;
25}
26
/// Represents whether a char can be contained in a quoted string and whether
/// it needs escaping.
///
/// This is a field-less enum, so it is `Copy` and can be passed around and
/// compared by value.
#[derive(Debug, Eq, PartialEq, Hash, Clone, Copy)]
pub enum QuotingClass {
    /// The char can be represented in a quoted string
    QText,
    /// The char can be represented but needs quoting (e.g. the " in `r#"bla\"blo"#`)
    NeedsQuoting,
    /// The char is invalid (e.g. a CTL char)
    Invalid,
}
37
38/// Used to validate if a string is valid without beeing quoted.
39///
40/// Depending on the complexity of the underlying grammar this types
41/// implementing this trait might have an internal state, through
42/// they have to be careful wrt. the semantics of `next` when mutating
43/// it.
44///
45/// Types impl this trait are normally not expected to be reused between
46/// multiple calls to `quoted_if_needed` (or whoever uses it). And might
47/// keep track of some meta-information as they are passed normally to the
48/// consuming function as `&mut`.
49pub trait WithoutQuotingValidator {
50    /// if next returns false, it's (self) state should NOT be modified
51    /// i.e. calling .end() after next(..) and returning false corresponds
52    /// to the input sequence _until_ next(..) was false, not including
53    /// the `pcp` from the last `next` call
54    fn next(&mut self, pcp: PartialCodePoint) -> bool;
55
56    /// this is called once the validation through next ended
57    ///
58    /// - the validation might end because there is no more input
59    /// - but it also might end because next returned false, due to the
60    ///   definition of next to not change the state if it returns false
61    ///   this can and is done
62    /// - it _does not_ need to validate that the length is at last 1, this
63    ///   is done by the algorithm using it
64    /// - so for many cases this is just true (the default impl)
65    fn end(&self) -> bool { true }
66}
67
/// The state of the automaton used when parsing a quoted string.
///
/// `T` is the payload of the `Custom` variant, for quoted-string kinds
/// which need more than the built-in states.
#[derive(Copy, Clone, Eq, PartialEq, Debug, Hash)]
pub enum State<T>
    where T: Copy + Eq + Debug
{
    /// The initial state, before anything was consumed.
    Start,
    /// The normal state.
    Normal,
    /// Parsing failed, e.g. because an invalid char was hit.
    Failed,
    /// The start of a quoted-pair, e.g. the \ of \" was seen.
    QPStart,
    /// A custom state needed for more complex quoted strings.
    Custom(T),
    /// The end of the quoted string was found
    /// (this is not necessarily the end of the input).
    End,
}
85
86/// This normally zero sized type provides functions for parsing a quoted string
87pub trait ParsingImpl: Copy+Eq+Debug {
88    fn can_be_quoted(bch: PartialCodePoint) -> bool;
89    fn handle_normal_state(bch: PartialCodePoint) -> Result<(State<Self>, bool), CoreError>;
90    fn advance(&self, _pcp: PartialCodePoint) -> Result<(State<Self>, bool), CoreError> {
91        unreachable!("[BUG] custom state is not used, so advance is unreachable")
92    }
93}
94
95#[derive(Debug, Eq, PartialEq, Hash, Clone)]
96pub struct ScanAutomaton<T: ParsingImpl> {
97    state: State<T>,
98    last_was_emit: bool
99}
100
101impl<Impl> ScanAutomaton<Impl>
102    where Impl: ParsingImpl
103{
104
105    pub fn new() -> Self {
106        ScanAutomaton { state: State::Start, last_was_emit: false }
107    }
108
109    pub fn did_end(&self) -> bool {
110        self.state == State::End
111    }
112
113    pub fn end(&mut self) -> Result<(), CoreError> {
114        if self.did_end() {
115            Ok(())
116        } else {
117            Err(CoreError::DoesNotEndWithDQuotes.into())
118        }
119    }
120
121    pub fn advance(&mut self, pcp: PartialCodePoint) -> Result<bool, CoreError> {
122        match _advance_scan_automaton(self.state, pcp) {
123            Ok((state, emit)) => {
124                self.state = state;
125                self.last_was_emit = emit;
126                Ok(emit)
127            },
128            Err(err) => {
129                self.state = State::Failed;
130                Err(err)
131            }
132        }
133    }
134}
135
136fn _advance_scan_automaton<Impl: ParsingImpl>(state: State<Impl>, pcp: PartialCodePoint)
137    -> Result<(State<Impl>, bool), CoreError>
138{
139    use self::State::*;
140    let pcp_val = pcp.as_u8();
141    match state {
142        Start => {
143            if pcp_val == b'"' {
144                Ok((Normal, false))
145            } else {
146                Err(CoreError::DoesNotStartWithDQuotes)
147            }
148        }
149        Normal => {
150            match pcp_val {
151                b'"' => Ok((End, false)),
152                b'\\' => Ok((QPStart, false)),
153                _ => Impl::handle_normal_state(pcp)
154            }
155        }
156        QPStart => {
157            if Impl::can_be_quoted(pcp) {
158                Ok((Normal, true))
159            } else {
160                Err(CoreError::UnquoteableCharQuoted.into())
161            }
162        }
163        Custom(inner) => {
164            inner.advance(pcp)
165        }
166        End => {
167            Err(CoreError::QuotedStringAlreadyEnded.into())
168        },
169        Failed => Err(CoreError::AdvancedFailedAutomaton.into())
170    }
171}
172
/// Represents a part of a utf-8 code point.
///
/// The value does not know which part of a code point it stands for
/// (e.g. the first or a later utf-8 byte of a code point > 0x7f).
///
/// When used in an iteration-like context it is also not guaranteed that
/// all utf-8 bytes are visited: it might be every byte, or it might be
/// just one value per code point, with everything > 0x7f replaced by 0xFF.
///
/// This allows efficiently abstracting over char sequences and utf-8
/// byte sequences for tasks which are mainly focused on us-ascii and
/// treat all non-ascii utf-8 code points the same.
#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash, Debug)]
pub struct PartialCodePoint(u8);

impl PartialCodePoint {
    /// Returns the wrapped byte value.
    #[inline(always)]
    pub fn as_u8(self) -> u8 {
        let PartialCodePoint(byte) = self;
        byte
    }

    /// Creates a partial code point from a single utf-8 byte.
    ///
    /// The inner value will be the byte passed in. It should not be
    /// 0xff, as 0xff doesn't appear in a utf-8 byte sequence.
    ///
    /// # Debug Assertions
    ///
    /// If debug assertions are enabled and 0xff is passed in, this will
    /// panic, as such a value can not come from a utf-8 byte sequence.
    #[inline(always)]
    pub fn from_utf8_byte(u8b: u8) -> PartialCodePoint {
        debug_assert!(u8b != 0xFF, "utf8 bytes can not be 0xFF");
        PartialCodePoint(u8b)
    }

    /// Creates a `PartialCodePoint` from a whole code point.
    ///
    /// The inner value will be:
    ///
    /// - the char itself if the code point is us-ascii
    /// - 0xFF if it is larger than 0x7f, i.e. non us-ascii
    #[inline]
    pub fn from_code_point(code_point: u32) -> PartialCodePoint {
        // the cast can not truncate, it is only reached for values <= 0x7f
        let byte = if code_point <= 0x7f { code_point as u8 } else { 0xFF };
        PartialCodePoint(byte)
    }
}
225
226
227/// Allows unquoted text containing only `_ | a..z | A..Z | 0..9`
228#[derive(Copy, Clone, Debug)]
229pub struct AsciiWordValidator;
230
231impl WithoutQuotingValidator for AsciiWordValidator {
232    fn next(&mut self, pcp: PartialCodePoint) -> bool {
233        let u8val = pcp.as_u8();
234        u8val.is_ascii_alphanumeric() || u8val == b'_'
235    }
236}