quoted_string/spec.rs
1//! This module contains types for specifying what kind of quoted string is used
2use std::fmt::Debug;
3use error::CoreError;
4
5/// type to specify the quoting classifier and parsing implementation
6///
7/// This is normally a zero-sized type.
8pub trait GeneralQSSpec: Clone+Debug {
9 type Quoting: QuotingClassifier;
10 type Parsing: ParsingImpl;
11}
12
13/// Type to provide a quoting classification method.
14///
15/// This is normally a zero-sized type.
16pub trait QuotingClassifier {
17
18 /// Returns the "class" the partial code point belongs too
19 ///
20 /// This is either `QTest`, `NeedsQuoting` or `Invalid`.
21 /// As this function accepts `PartialCodePoint` it can not
22 /// differ between different utf-8 characters, which happens
23 /// to be fine for all supported quoting use cases.
24 fn classify_for_quoting(pcp: PartialCodePoint) -> QuotingClass;
25}
26
/// Tells whether a char may appear in a quoted string and, if so,
/// whether it has to be escaped there.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum QuotingClass {
    /// The char can be placed in a quoted string as it is.
    QText,
    /// The char can be represented, but only in quoted (escaped) form
    /// (e.g. the " in `r#"bla\"blo"#`).
    NeedsQuoting,
    /// The char can not be represented at all (e.g. a CTL char).
    Invalid,
}
37
38/// Used to validate if a string is valid without beeing quoted.
39///
40/// Depending on the complexity of the underlying grammar this types
41/// implementing this trait might have an internal state, through
42/// they have to be careful wrt. the semantics of `next` when mutating
43/// it.
44///
45/// Types impl this trait are normally not expected to be reused between
46/// multiple calls to `quoted_if_needed` (or whoever uses it). And might
47/// keep track of some meta-information as they are passed normally to the
48/// consuming function as `&mut`.
49pub trait WithoutQuotingValidator {
50 /// if next returns false, it's (self) state should NOT be modified
51 /// i.e. calling .end() after next(..) and returning false corresponds
52 /// to the input sequence _until_ next(..) was false, not including
53 /// the `pcp` from the last `next` call
54 fn next(&mut self, pcp: PartialCodePoint) -> bool;
55
56 /// this is called once the validation through next ended
57 ///
58 /// - the validation might end because there is no more input
59 /// - but it also might end because next returned false, due to the
60 /// definition of next to not change the state if it returns false
61 /// this can and is done
62 /// - it _does not_ need to validate that the length is at last 1, this
63 /// is done by the algorithm using it
64 /// - so for many cases this is just true (the default impl)
65 fn end(&self) -> bool { true }
66}
67
/// The states a quoted string parser can be in.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum State<T>
where
    T: Copy + Eq + Debug,
{
    /// Nothing has been consumed yet.
    Start,
    /// The normal state.
    Normal,
    /// Parsing failed, e.g. because an invalid char was hit.
    Failed,
    /// The start of a quoted-pair was seen, e.g. the \ of \".
    QPStart,
    /// A custom state, needed for more complex quoted strings.
    Custom(T),
    /// The end of the quoted string was found
    /// (which is not necessarily the end of the input).
    End,
}
85
86/// This normally zero sized type provides functions for parsing a quoted string
87pub trait ParsingImpl: Copy+Eq+Debug {
88 fn can_be_quoted(bch: PartialCodePoint) -> bool;
89 fn handle_normal_state(bch: PartialCodePoint) -> Result<(State<Self>, bool), CoreError>;
90 fn advance(&self, _pcp: PartialCodePoint) -> Result<(State<Self>, bool), CoreError> {
91 unreachable!("[BUG] custom state is not used, so advance is unreachable")
92 }
93}
94
95#[derive(Debug, Eq, PartialEq, Hash, Clone)]
96pub struct ScanAutomaton<T: ParsingImpl> {
97 state: State<T>,
98 last_was_emit: bool
99}
100
101impl<Impl> ScanAutomaton<Impl>
102 where Impl: ParsingImpl
103{
104
105 pub fn new() -> Self {
106 ScanAutomaton { state: State::Start, last_was_emit: false }
107 }
108
109 pub fn did_end(&self) -> bool {
110 self.state == State::End
111 }
112
113 pub fn end(&mut self) -> Result<(), CoreError> {
114 if self.did_end() {
115 Ok(())
116 } else {
117 Err(CoreError::DoesNotEndWithDQuotes.into())
118 }
119 }
120
121 pub fn advance(&mut self, pcp: PartialCodePoint) -> Result<bool, CoreError> {
122 match _advance_scan_automaton(self.state, pcp) {
123 Ok((state, emit)) => {
124 self.state = state;
125 self.last_was_emit = emit;
126 Ok(emit)
127 },
128 Err(err) => {
129 self.state = State::Failed;
130 Err(err)
131 }
132 }
133 }
134}
135
136fn _advance_scan_automaton<Impl: ParsingImpl>(state: State<Impl>, pcp: PartialCodePoint)
137 -> Result<(State<Impl>, bool), CoreError>
138{
139 use self::State::*;
140 let pcp_val = pcp.as_u8();
141 match state {
142 Start => {
143 if pcp_val == b'"' {
144 Ok((Normal, false))
145 } else {
146 Err(CoreError::DoesNotStartWithDQuotes)
147 }
148 }
149 Normal => {
150 match pcp_val {
151 b'"' => Ok((End, false)),
152 b'\\' => Ok((QPStart, false)),
153 _ => Impl::handle_normal_state(pcp)
154 }
155 }
156 QPStart => {
157 if Impl::can_be_quoted(pcp) {
158 Ok((Normal, true))
159 } else {
160 Err(CoreError::UnquoteableCharQuoted.into())
161 }
162 }
163 Custom(inner) => {
164 inner.advance(pcp)
165 }
166 End => {
167 Err(CoreError::QuotedStringAlreadyEnded.into())
168 },
169 Failed => Err(CoreError::AdvancedFailedAutomaton.into())
170 }
171}
172
/// Represents a part of a utf-8 code point.
///
/// The value itself does not know which part of a code point it
/// stands for (e.g. the first or a later utf-8 byte of a code
/// point > 0x7f).
///
/// When used in an iteration-like context there is also no guarantee
/// that all utf-8 bytes are visited: this might be the case, or only
/// the first byte might be represented, with all bytes > 0x7f being
/// replaced by 0xFF.
///
/// This makes it possible to efficiently abstract over char sequences
/// and utf-8 byte sequences for tasks which mainly care about us-ascii
/// and treat all non-ascii utf-8 code points the same.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct PartialCodePoint(u8);

impl PartialCodePoint {
    /// Returns the wrapped byte value.
    #[inline(always)]
    pub fn as_u8(self) -> u8 {
        let PartialCodePoint(byte) = self;
        byte
    }

    /// Creates a partial code point from a utf-8 byte.
    ///
    /// The inner value is the byte which was passed in. That byte
    /// should not be 0xff, as 0xff does not appear in a utf-8 byte
    /// sequence.
    ///
    /// # Debug Assertions
    ///
    /// With debug assertions enabled passing in 0xff panics, as such
    /// a value can not come from a utf-8 byte sequence.
    #[inline(always)]
    pub fn from_utf8_byte(u8b: u8) -> PartialCodePoint {
        debug_assert!(u8b != 0xFF, "utf8 bytes can not be 0xFF");
        PartialCodePoint(u8b)
    }

    /// Creates a `PartialCodePoint` from a utf-8 code point.
    ///
    /// The inner value is:
    ///
    /// - the code point itself if it is us-ascii
    /// - 0xFF if it is larger than 0x7f, i.e. not us-ascii
    #[inline]
    pub fn from_code_point(code_point: u32) -> PartialCodePoint {
        let byte = if code_point <= 0x7f {
            // us-ascii always fits into a single byte
            code_point as u8
        } else {
            0xFF
        };
        PartialCodePoint(byte)
    }
}
225
226
227/// Allows unquoted text containing only `_ | a..z | A..Z | 0..9`
228#[derive(Copy, Clone, Debug)]
229pub struct AsciiWordValidator;
230
231impl WithoutQuotingValidator for AsciiWordValidator {
232 fn next(&mut self, pcp: PartialCodePoint) -> bool {
233 let u8val = pcp.as_u8();
234 u8val.is_ascii_alphanumeric() || u8val == b'_'
235 }
236}