scan_rules/
input.rs

1/*
2Copyright ⓒ 2016 Daniel Keep.
3
4Licensed under the MIT license (see LICENSE or <http://opensource.org
5/licenses/MIT>) or the Apache License, Version 2.0 (see LICENSE of
6<http://www.apache.org/licenses/LICENSE-2.0>), at your option. All
7files in the project carrying such notice may not be copied, modified,
8or distributed except according to those terms.
9*/
10/*!
11This module contains items related to input handling.
12
13The short version is this:
14
15* Values provided as input to the user-facing scanning macros must implement `IntoScanCursor`, which converts them into something that implements `ScanCursor`.
16
17* The input provided to actual type scanners will be something that implements the `ScanInput` trait.
18
19`IntoScanCursor` will be of interest if you are implementing a type which you want to be scannable.  `StrCursor` will be of interest if you want to construct a specialised cursor.  `ScanCursor` will be of interest if you are using a `^..cursor` pattern to capture a cursor.
20*/
21use std::borrow::Cow;
22use std::marker::PhantomData;
23use ::ScanError;
24
/**
Conversion into a `ScanCursor`.

This is a helper trait used to convert different values into a scannable cursor type.  Implement this if you want your type to be usable as input to one of the scanning macros.

This module implements it for every type that is already a `ScanCursor` (as the identity conversion), and for `&str`, `&String` and `&Cow<str>` (each of which produces a `StrCursor`).
*/
pub trait IntoScanCursor<'a>: Sized {
    /**
    The corresponding scannable cursor type.
    */
    type Output: 'a + ScanCursor<'a>;

    /**
    Convert this into a scannable cursor.

    The value is taken by value, so it is consumed by the conversion.
    */
    fn into_scan_cursor(self) -> Self::Output;
}
41
42impl<'a, T> IntoScanCursor<'a> for T where T: 'a + ScanCursor<'a> {
43    type Output = Self;
44    fn into_scan_cursor(self) -> Self::Output {
45        self
46    }
47}
48
49impl<'a> IntoScanCursor<'a> for &'a str {
50    type Output = StrCursor<'a>;
51    fn into_scan_cursor(self) -> Self::Output {
52        StrCursor::new(self)
53    }
54}
55
56impl<'a> IntoScanCursor<'a> for &'a String {
57    type Output = StrCursor<'a>;
58    fn into_scan_cursor(self) -> Self::Output {
59        StrCursor::new(self)
60    }
61}
62
63impl<'a> IntoScanCursor<'a> for &'a Cow<'a, str> {
64    type Output = StrCursor<'a>;
65    fn into_scan_cursor(self) -> Self::Output {
66        StrCursor::new(self)
67    }
68}
69
/**
This trait defines the interface to input values that can be scanned.

Every fallible method takes the cursor by value.  On failure, the error is returned together with a cursor, so that the caller still has a position to work from.
*/
pub trait ScanCursor<'a>: 'a + Sized + Clone {
    /**
    Corresponding scan input type.

    This is the type handed to the closures given to `try_scan` and `try_scan_raw`.
    */
    type ScanInput: ScanInput<'a>;

    /**
    Assert that the input has been exhausted, or that the current position is a valid place to "stop".
    */
    fn try_end(self) -> Result<(), (ScanError, Self)>;

    /**
    Scan a value from the current position.  The closure will be called with all available input, and is expected to return *either* the scanned value, and the number of bytes of input consumed, *or* a reason why scanning failed.

    The input will have all leading whitespace removed, if applicable.

    On success, the result carries the scanned value and the cursor to continue scanning from.
    */
    fn try_scan<F, Out>(self, f: F) -> Result<(Out, Self), (ScanError, Self)>
    where F: FnOnce(Self::ScanInput) -> Result<(Out, usize), ScanError>;

    /**
    Performs the same task as [`try_scan`](#tymethod.try_scan), except that it *does not* perform whitespace stripping.
    */
    fn try_scan_raw<F, Out>(self, f: F) -> Result<(Out, Self), (ScanError, Self)>
    where F: FnOnce(Self::ScanInput) -> Result<(Out, usize), ScanError>;

    /**
    Match the provided literal term against the input.

    Implementations are free to interpret "match" as they please.

    On success, the result is the cursor to continue scanning from.
    */
    fn try_match_literal(self, lit: &str) -> Result<Self, (ScanError, Self)>;

    /**
    Returns the remaining input as a string slice.

    This consumes the cursor.
    */
    fn as_str(self) -> &'a str;

    /**
    Returns the number of bytes consumed by this cursor since its creation.
    */
    fn offset(&self) -> usize;
}
115
/**
This trait is the interface scanners use to access the input being scanned.

This module implements it for `StrCursor` and for plain `&str`.
*/
pub trait ScanInput<'a>: 'a + Sized + Clone {
    /**
    Corresponding cursor type.

    This is the type produced by `to_cursor`.
    */
    type ScanCursor: ScanCursor<'a>;

    /**
    Marker type used to do string comparisons.
    */
    type StrCompare: StrCompare;

    /**
    Get the contents of the input as a string slice.
    */
    fn as_str(&self) -> &'a str;

    /**
    Create a new input from a subslice of *this* input's contents.

    This should be used to ensure that additional state and settings (such as the string comparison marker) are preserved.
    */
    fn from_subslice(&self, subslice: &'a str) -> Self;

    /**
    Turn the input into an independent cursor, suitable for feeding back into a user-facing scanning macro.
    */
    fn to_cursor(&self) -> Self::ScanCursor;
}
147
/**
Basic cursor implementation wrapping a string slice.

The `Cmp` parameter can be used to control the string comparison logic used.

The `Space` parameter controls how whitespace is skipped and matched, and the `Word` parameter controls how input and literal text are cut into words when matching literals.
*/
#[derive(Debug)]
pub struct StrCursor<'a, Cmp=ExactCompare, Space=IgnoreSpace, Word=Wordish>
where
    Cmp: StrCompare,
    Space: SkipSpace,
    Word: SliceWord,
{
    // Number of bytes already consumed; these bytes are *not* part of `slice`.
    offset: usize,
    // The input that has not been consumed yet.
    slice: &'a str,
    // Ties the strategy parameters to the type without storing any value of them.
    _marker: PhantomData<(Cmp, Space, Word)>,
}
164
/*
These have to be spelled out to avoid erroneous constraints on the type parameters.

Deriving `Copy` and `Clone` would add `Copy`/`Clone` bounds to `Cmp`, `Space` and `Word`, even though the cursor only holds them inside a `PhantomData`.  The marker types defined in this module do not implement either trait.
*/
impl<'a, Cmp, Space, Word>
Copy for StrCursor<'a, Cmp, Space, Word>
where
    Cmp: StrCompare,
    Space: SkipSpace,
    Word: SliceWord,
{}

impl<'a, Cmp, Space, Word>
Clone for StrCursor<'a, Cmp, Space, Word>
where
    Cmp: StrCompare,
    Space: SkipSpace,
    Word: SliceWord,
{
    // The cursor is `Copy`, so cloning is just a copy of `self`.
    fn clone(&self) -> Self {
        *self
    }
}
187
188impl<'a, Cmp, Space, Word>
189StrCursor<'a, Cmp, Space, Word>
190where
191    Cmp: StrCompare,
192    Space: SkipSpace,
193    Word: SliceWord,
194{
195    /**
196    Construct a new `StrCursor` with a specific `offset`.
197
198    The `offset` is logically the number of bytes which have already been consumed from the original input; these already-consumed bytes *must not* be included in `slice`.
199    */
200    pub fn new(slice: &'a str) -> Self {
201        StrCursor {
202            offset: 0,
203            slice: slice,
204            _marker: PhantomData,
205        }
206    }
207
208    /**
209    Advance the cursor by the given number of bytes.
210    */
211    fn advance_by(self, bytes: usize) -> Self {
212        StrCursor {
213            offset: self.offset + bytes,
214            slice: &self.slice[bytes..],
215            _marker: PhantomData,
216        }
217    }
218
219    /**
220    Returns the number of bytes of input that have been consumed by this `StrCursor`.
221    */
222    fn offset(self) -> usize {
223        self.offset
224    }
225}
226
227impl<'a, Cmp, Space, Word>
228ScanCursor<'a> for StrCursor<'a, Cmp, Space, Word>
229where
230    Cmp: StrCompare,
231    Space: SkipSpace,
232    Word: SliceWord,
233{
234    type ScanInput = Self;
235
236    fn try_end(self) -> Result<(), (ScanError, Self)> {
237        if Space::skip_space(self.slice) == self.slice.len() {
238            Ok(())
239        } else {
240            Err((ScanError::expected_end().add_offset(self.offset()), self))
241        }
242    }
243
244    fn try_scan<F, Out>(self, f: F) -> Result<(Out, Self), (ScanError, Self)>
245    where F: FnOnce(Self::ScanInput) -> Result<(Out, usize), ScanError> {
246        let tmp_off = Space::skip_space(self.slice);
247        let tmp = self.advance_by(tmp_off);
248        match f(tmp) {
249            Ok((out, off)) => Ok((out, tmp.advance_by(off))),
250            Err(err) => Err((err.add_offset(tmp.offset()), self)),
251        }
252    }
253
254    fn try_scan_raw<F, Out>(self, f: F) -> Result<(Out, Self), (ScanError, Self)>
255    where F: FnOnce(Self::ScanInput) -> Result<(Out, usize), ScanError> {
256        match f(self) {
257            Ok((out, off)) => Ok((out, self.advance_by(off))),
258            Err(err) => Err((err.add_offset(self.offset()), self)),
259        }
260    }
261
262    fn try_match_literal(self, lit: &str) -> Result<Self, (ScanError, Self)> {
263        let mut tmp_off = Space::skip_space(self.slice);
264        let mut tmp = &self.slice[tmp_off..];
265        let mut lit = lit;
266
267        while lit.len() > 0 {
268            // Match leading spaces.
269            match Space::match_spaces(tmp, lit) {
270                Ok((a, b)) => {
271                    tmp = &tmp[a..];
272                    tmp_off += a;
273                    lit = &lit[b..];
274                },
275                Err(off) => {
276                    return Err((
277                        ScanError::literal_mismatch()
278                            .add_offset(self.offset() + tmp_off + off),
279                        self
280                    ));
281                },
282            }
283
284            if lit.len() == 0 { break; }
285
286            // Pull out the leading wordish things.
287            let lit_word = match Word::slice_word(lit) {
288                Some(0) | None => panic!("literal {:?} begins with a non-space, non-word", lit),
289                Some(b) => &lit[..b],
290            };
291            let tmp_word = match Word::slice_word(tmp) {
292                Some(b) => &tmp[..b],
293                None => return Err((
294                    ScanError::literal_mismatch()
295                        .add_offset(self.offset() + tmp_off),
296                    self
297                )),
298            };
299
300            if !Cmp::compare(tmp_word, lit_word) {
301                return Err((
302                    ScanError::literal_mismatch()
303                        .add_offset(self.offset() + tmp_off),
304                    self
305                ));
306            }
307
308            tmp = &tmp[tmp_word.len()..];
309            tmp_off += tmp_word.len();
310            lit = &lit[lit_word.len()..];
311        }
312
313        Ok(self.advance_by(tmp_off))
314    }
315
316    fn as_str(self) -> &'a str {
317        self.slice
318    }
319
320    fn offset(&self) -> usize {
321        self.offset
322    }
323}
324
325impl<'a, Cmp, Space, Word>
326ScanInput<'a> for StrCursor<'a, Cmp, Space, Word>
327where
328    Cmp: StrCompare,
329    Space: SkipSpace,
330    Word: SliceWord,
331{
332    type ScanCursor = Self;
333    type StrCompare = Cmp;
334
335    fn as_str(&self) -> &'a str {
336        self.slice
337    }
338
339    fn from_subslice(&self, subslice: &'a str) -> Self {
340        use ::util::StrUtil;
341        let offset = self.as_str().subslice_offset_stable(subslice)
342            .expect("called `StrCursor::from_subslice` with disjoint subslice");
343
344        StrCursor {
345            offset: self.offset + offset,
346            slice: subslice,
347            _marker: PhantomData,
348        }
349    }
350
351    fn to_cursor(&self) -> Self::ScanCursor {
352        /*
353        Note that we strip the offset information here, essentially making this a *new* cursor, not just a copy of the existing one.
354        */
355        StrCursor::new(self.slice)
356    }
357}
358
359/**
360This implementation is provided to allow scanners to be used manually with a minimum of fuss.
361
362It *only* supports direct, exact equality comparison.
363*/
364impl<'a> ScanInput<'a> for &'a str {
365    type ScanCursor = StrCursor<'a>;
366    type StrCompare = ExactCompare;
367
368    fn as_str(&self) -> &'a str {
369        *self
370    }
371
372    fn from_subslice(&self, subslice: &'a str) -> Self {
373        subslice
374    }
375
376    fn to_cursor(&self) -> Self::ScanCursor {
377        self.into_scan_cursor()
378    }
379}
380
/**
Strip the leading whitespace from `s`.

Returns the remainder of the string, together with the number of bytes that were stripped.
*/
fn skip_space(s: &str) -> (&str, usize) {
    // The first non-whitespace character marks the end of the run; if there is none, the whole string is whitespace.
    let skipped = s.find(|c: char| !c.is_whitespace()).unwrap_or(s.len());
    (&s[skipped..], skipped)
}
392
/**
Defines an interface for skipping whitespace.
*/
pub trait SkipSpace: 'static {
    /**
    Given two strings, does the leading whitespace match?

    If so, how many leading bytes from each should be dropped?

    If not, after how many bytes into `a` do they disagree?

    The answer is `Ok((a_bytes, b_bytes))` for a match, and `Err(a_offset)` for a mismatch.  `StrCursor` calls this with the input as `a` and the literal being matched as `b`.
    */
    fn match_spaces(a: &str, b: &str) -> Result<(usize, usize), usize>;

    /**
    Return the number of bytes of leading whitespace in `a` that should be skipped.
    */
    fn skip_space(a: &str) -> usize;
}
411
412/**
413Matches all whitespace *exactly*, and does not skip any.
414*/
415#[derive(Debug)]
416pub enum ExactSpace {}
417
418impl SkipSpace for ExactSpace {
419    fn match_spaces(a: &str, b: &str) -> Result<(usize, usize), usize> {
420        let mut acs = a.char_indices();
421        let mut bcs = b.char_indices();
422        let (mut last_ai, mut last_bi) = (0, 0);
423        while let (Some((ai, ac)), Some((bi, bc))) = (acs.next(), bcs.next()) {
424            if !ac.is_whitespace() {
425                return Ok((ai, bi));
426            } else if ac != bc {
427                return Err(ai);
428            } else {
429                last_ai = ai + ac.len_utf8();
430                last_bi = bi + ac.len_utf8();
431            }
432        }
433        Ok((last_ai, last_bi))
434    }
435
436    fn skip_space(_: &str) -> usize {
437        0
438    }
439}
440
441#[cfg(test)]
442#[test]
443fn test_exact_space() {
444    use self::ExactSpace as ES;
445
446    assert_eq!(ES::match_spaces("", ""), Ok((0, 0)));
447    assert_eq!(ES::match_spaces(" ", " "), Ok((1, 1)));
448    assert_eq!(ES::match_spaces(" x", " x"), Ok((1, 1)));
449    assert_eq!(ES::match_spaces(" ", " x"), Ok((1, 1)));
450    assert_eq!(ES::match_spaces(" x", " "), Ok((1, 1)));
451    assert_eq!(ES::match_spaces(" \t ", "   "), Err(1));
452}
453
454/**
455Requires that whitespace in the pattern exists in the input, but the exact *kind* of space doesn't matter.
456*/
457#[derive(Debug)]
458pub enum FuzzySpace {}
459
460impl SkipSpace for FuzzySpace {
461    fn match_spaces(inp: &str, pat: &str) -> Result<(usize, usize), usize> {
462        let (_, a_off) = skip_space(inp);
463        let (_, b_off) = skip_space(pat);
464
465        match (a_off, b_off) {
466            (0, 0) => Ok((0, 0)),
467            (a, b) if a != 0 && b != 0 => Ok((a, b)),
468            (_, _) => Err(0),
469        }
470    }
471
472    fn skip_space(_: &str) -> usize {
473        0
474    }
475}
476
#[cfg(test)]
#[test]
fn test_fuzzy_space() {
    use self::FuzzySpace as FS;

    // (input, pattern, expected result)
    let cases: [(&str, &str, Result<(usize, usize), usize>); 8] = [
        ("x", "x", Ok((0, 0))),
        (" x", " x", Ok((1, 1))),
        ("  x", " x", Ok((2, 1))),
        (" x", "  x", Ok((1, 2))),
        ("\tx", " x", Ok((1, 1))),
        (" x", "\tx", Ok((1, 1))),
        ("x", " x", Err(0)),
        (" x", "x", Err(0)),
    ];

    for &(inp, pat, expected) in cases.iter() {
        assert_eq!(FS::match_spaces(inp, pat), expected);
    }
}
491
492/**
493Ignores all whitespace *other* than line breaks.
494*/
495#[derive(Debug)]
496pub enum IgnoreNonLine {}
497
498impl SkipSpace for IgnoreNonLine {
499    fn match_spaces(a: &str, b: &str) -> Result<(usize, usize), usize> {
500        let a_off = skip_space_non_line(a);
501        let b_off = skip_space_non_line(b);
502        Ok((a_off, b_off))
503    }
504
505    fn skip_space(s: &str) -> usize {
506        skip_space_non_line(s)
507    }
508}
509
/**
Count the bytes of leading whitespace in `s`, stopping at the first `'\r'` or `'\n'`.
*/
fn skip_space_non_line(s: &str) -> usize {
    // The run ends at the first character that is not whitespace, or that is a line break.
    s.find(|c: char| !c.is_whitespace() || c == '\r' || c == '\n')
        .unwrap_or(s.len())
}
518
519/**
520Ignores all whitespace entirely.
521*/
522#[derive(Debug)]
523pub enum IgnoreSpace {}
524
525impl SkipSpace for IgnoreSpace {
526    fn match_spaces(a: &str, b: &str) -> Result<(usize, usize), usize> {
527        let (_, a_off) = skip_space(a);
528        let (_, b_off) = skip_space(b);
529        Ok((a_off, b_off))
530    }
531
532    fn skip_space(s: &str) -> usize {
533        s.char_indices()
534            .take_while(|&(_, c)| c.is_whitespace())
535            .map(|(i, c)| i + c.len_utf8())
536            .last()
537            .unwrap_or(0)
538    }
539}
540
/**
Defines an interface for slicing words out of input and literal text.
*/
pub trait SliceWord: 'static {
    /**
    If `s` starts with a word, how long is it?

    The length is in bytes.  `None` means `s` does not start with a word.
    */
    fn slice_word(s: &str) -> Option<usize>;
}
550
/**
Treat any contiguous sequence of non-space characters (according to Unicode's definition of the `\s` regular expression class) as a word.
*/
#[derive(Debug)]
pub enum NonSpace {}

impl SliceWord for NonSpace {
    // Delegates to `slice_non_space`, which yields `None` when `s` is empty or begins with a space character.
    fn slice_word(s: &str) -> Option<usize> {
        slice_non_space(s)
    }
}
562
/**
Treat any contiguous sequence of "word" characters (according to Unicode's definition of the `\w` regular expression class) *or* any other single character as a word.
*/
#[derive(Debug)]
pub enum Wordish {}

impl SliceWord for Wordish {
    // Delegates to `slice_wordish`, which yields `None` only when `s` is empty.
    fn slice_word(s: &str) -> Option<usize> {
        slice_wordish(s)
    }
}
574
/**
Defines an interface for comparing two strings for equality.

This is used to allow `StrCursor` to be parametrised on different kinds of string comparisons: case-sensitive, case-insensitive, canonicalising, *etc.*

Implementations are used purely at the type level: `compare` takes no `self`.
*/
pub trait StrCompare: 'static {
    /**
    Compare two strings and return `true` if they should be considered "equal".
    */
    fn compare(a: &str, b: &str) -> bool;
}
586
587/**
588Marker type used to do exact, byte-for-byte string comparisons.
589
590This is likely the fastest kind of string comparison, and matches the default behaviour of the `==` operator on strings.
591*/
592#[derive(Debug)]
593pub enum ExactCompare {}
594
595impl StrCompare for ExactCompare {
596    fn compare(a: &str, b: &str) -> bool {
597        a == b
598    }
599}
600
601/**
602Marker type used to do case-insensitive string comparisons.
603
604Note that this *does not* take any locale information into account.  It is only as correct as a call to `char::to_lowercase`.
605*/
606#[derive(Debug)]
607pub enum IgnoreCase {}
608
609impl StrCompare for IgnoreCase {
610    fn compare(a: &str, b: &str) -> bool {
611        let mut acs = a.chars().flat_map(char::to_lowercase);
612        let mut bcs = b.chars().flat_map(char::to_lowercase);
613        loop {
614            match (acs.next(), bcs.next()) {
615                (Some(a), Some(b)) if a == b => (),
616                (None, None) => return true,
617                _ => return false
618            }
619        }
620    }
621}
622
#[cfg(test)]
#[test]
fn test_ignore_case() {
    use self::IgnoreCase as IC;

    // Every pair must compare equal.
    let equal_pairs: [(&str, &str); 5] = [
        ("hi", "hi"),
        ("Hi", "hI"),
        ("hI", "Hi"),
        ("ẞß", "ßẞ"),
        ("ßẞ", "ẞß"),
    ];

    for &(a, b) in equal_pairs.iter() {
        assert_eq!(IC::compare(a, b), true);
    }
}
634
/**
Marker type used to do case-insensitive, normalized string comparisons.

Specifically, this type will compare strings based on the result of a NFD transform, followed by conversion to lower-case.

Note that this *does not* take any locale information into account.  It is only as correct as a call to `char::to_lowercase`.
*/
#[cfg(feature="unicode-normalization")]
#[derive(Debug)]
pub enum IgnoreCaseNormalized {}

#[cfg(feature="unicode-normalization")]
impl StrCompare for IgnoreCaseNormalized {
    fn compare(a: &str, b: &str) -> bool {
        use unicode_normalization::UnicodeNormalization;

        // Decompose, then lower-case, then compare the two character streams.
        let folded_a = a.nfd().flat_map(char::to_lowercase);
        let folded_b = b.nfd().flat_map(char::to_lowercase);
        Iterator::eq(folded_a, folded_b)
    }
}
662
#[cfg(feature="unicode-normalization")]
#[cfg(test)]
#[test]
fn test_ignore_case_normalized() {
    use self::IgnoreCaseNormalized as ICN;

    // Every pair must compare equal.
    let equal_pairs: [(&str, &str); 7] = [
        ("hi", "hi"),
        ("Hi", "hI"),
        ("hI", "Hi"),
        ("café", "cafe\u{301}"),
        ("cafe\u{301}", "café"),
        ("CafÉ", "CafE\u{301}"),
        ("CAFÉ", "cafe\u{301}"),
    ];

    for &(a, b) in equal_pairs.iter() {
        assert_eq!(ICN::compare(a, b), true);
    }
}
677
678/**
679Marker type used to do ASCII case-insensitive string comparisons.
680
681Note that this is *only correct* for pure, ASCII-only strings.  To get less incorrect case-insensitive comparisons, you will need to use a Unicode-aware comparison.
682
683This exists because ASCII-only case conversions are easily understood and relatively fast.
684*/
685#[derive(Debug)]
686pub enum IgnoreAsciiCase {}
687
688impl StrCompare for IgnoreAsciiCase {
689    fn compare(a: &str, b: &str) -> bool {
690        use std::ascii::AsciiExt;
691        a.eq_ignore_ascii_case(b)
692    }
693}
694
/**
Marker type used to do normalized string comparisons.

Specifically, this type will compare strings based on the result of a NFD transform.
*/
#[cfg(feature="unicode-normalization")]
#[derive(Debug)]
pub enum Normalized {}

#[cfg(feature="unicode-normalization")]
impl StrCompare for Normalized {
    fn compare(a: &str, b: &str) -> bool {
        use unicode_normalization::UnicodeNormalization;

        // Compare the two decomposed character streams element by element.
        Iterator::eq(a.nfd(), b.nfd())
    }
}
720
#[cfg(feature="unicode-normalization")]
#[cfg(test)]
#[test]
fn test_normalized() {
    use self::Normalized as N;

    // Every pair must compare equal.
    let equal_pairs: [(&str, &str); 3] = [
        ("hi", "hi"),
        ("café", "cafe\u{301}"),
        ("cafe\u{301}", "café"),
    ];

    for &(a, b) in equal_pairs.iter() {
        assert_eq!(N::compare(a, b), true);
    }
}
731
732fn slice_non_space(s: &str) -> Option<usize> {
733    use ::util::TableUtil;
734    use ::unicode::property::White_Space_table as WS;
735
736    s.char_indices()
737        .take_while(|&(_, c)| !WS.span_table_contains(&c))
738        .map(|(i, c)| i + c.len_utf8())
739        .last()
740}
741
742fn slice_wordish(s: &str) -> Option<usize> {
743    use ::util::TableUtil;
744    use ::unicode::regex::PERLW;
745
746    let word_len = s.char_indices()
747        .take_while(|&(_, c)| PERLW.span_table_contains(&c))
748        .map(|(i, c)| i + c.len_utf8())
749        .last();
750
751    match word_len {
752        Some(n) => Some(n),
753        None => s.chars().next().map(|c| c.len_utf8()),
754    }
755}