chumsky/
text.rs

1//! Text-specific parsers and utilities.
2//!
//! *“Ford!” he said, “there's an infinite number of monkeys outside who want to talk to us about this script for
//! Hamlet they've worked out.”*
5//!
6//! The parsers in this module are generic over both Unicode ([`char`]) and ASCII ([`u8`]) characters. Most parsers take
7//! a type parameter, `C`, that can be either [`u8`] or [`char`] in order to handle either case.
8
9use crate::prelude::*;
10use alloc::string::ToString;
11
12use super::*;
13
/// A trait implemented by textual character types (currently [`u8`], [`char`], and
/// [`&Grapheme`](unicode::Grapheme)).
///
/// This trait is currently sealed to minimize the impact of breaking changes. If you find a type that you think should
/// implement this trait, please [open an issue/PR](https://github.com/zesterer/chumsky/issues/new).
pub trait Char: Copy + PartialEq + Sealed {
    /// Returns true if the character is canonically considered to be inline whitespace (i.e: not part of a newline).
    fn is_inline_whitespace(&self) -> bool;

    /// Returns true if the character is canonically considered to be whitespace.
    fn is_whitespace(&self) -> bool;

    /// Returns true if the character is canonically considered to be a newline.
    fn is_newline(&self) -> bool;

    /// Returns the '0' digit of this character type.
    fn digit_zero() -> Self;

    /// Returns true if the character is canonically considered to be a numeric digit in the given `radix`.
    fn is_digit(&self, radix: u32) -> bool;

    /// Returns true if the character is canonically considered to be valid for starting an identifier.
    fn is_ident_start(&self) -> bool;

    /// Returns true if the character is canonically considered to be valid within an identifier.
    fn is_ident_continue(&self) -> bool;

    /// Returns this character as a single byte (a [`u8`]), or `None` if the character cannot be represented as one.
    fn to_ascii(&self) -> Option<u8>;
}
43
44impl Sealed for &Grapheme {}
45impl Char for &Grapheme {
46    fn is_inline_whitespace(&self) -> bool {
47        self.as_str() == " " || self.as_str() == "\t"
48    }
49
50    fn is_whitespace(&self) -> bool {
51        let mut iter = self.as_str().chars();
52        iter.all(unicode::is_whitespace)
53    }
54
55    fn is_newline(&self) -> bool {
56        [
57            "\r\n",     // CR LF
58            "\n",       // Newline
59            "\r",       // Carriage return
60            "\x0B",     // Vertical tab
61            "\x0C",     // Form feed
62            "\u{0085}", // Next line
63            "\u{2028}", // Line separator
64            "\u{2029}", // Paragraph separator
65        ]
66        .as_slice()
67        .contains(&self.as_str())
68    }
69
70    fn digit_zero() -> Self {
71        Grapheme::digit_zero()
72    }
73
74    fn is_digit(&self, radix: u32) -> bool {
75        let mut iter = self.as_str().chars();
76        match (iter.next(), iter.next()) {
77            (Some(i), None) => i.is_digit(radix),
78            _ => false,
79        }
80    }
81
82    fn to_ascii(&self) -> Option<u8> {
83        let mut iter = self.as_bytes().iter();
84        match (iter.next(), iter.next()) {
85            (Some(i), None) if i.is_ascii() => Some(*i),
86            _ => None,
87        }
88    }
89
90    fn is_ident_start(&self) -> bool {
91        let (first, rest) = self.split();
92        let is_start = unicode_ident::is_xid_start(first) || first == '_';
93        is_start && rest.chars().all(unicode_ident::is_xid_continue)
94    }
95
96    fn is_ident_continue(&self) -> bool {
97        let mut iter = self.as_str().chars();
98        iter.all(unicode_ident::is_xid_continue)
99    }
100}
101
102impl Sealed for char {}
103impl Char for char {
104    fn is_inline_whitespace(&self) -> bool {
105        *self == ' ' || *self == '\t'
106    }
107
108    fn is_whitespace(&self) -> bool {
109        unicode::is_whitespace(*self)
110    }
111
112    fn is_newline(&self) -> bool {
113        [
114            '\n',       // Newline
115            '\r',       // Carriage return
116            '\x0B',     // Vertical tab
117            '\x0C',     // Form feed
118            '\u{0085}', // Next line
119            '\u{2028}', // Line separator
120            '\u{2029}', // Paragraph separator
121        ]
122        .as_slice()
123        .contains(self)
124    }
125
126    fn digit_zero() -> Self {
127        '0'
128    }
129
130    fn is_digit(&self, radix: u32) -> bool {
131        char::is_digit(*self, radix)
132    }
133
134    fn to_ascii(&self) -> Option<u8> {
135        self.is_ascii().then_some(*self as u8)
136    }
137
138    fn is_ident_start(&self) -> bool {
139        unicode_ident::is_xid_start(*self) || *self == '_'
140    }
141
142    fn is_ident_continue(&self) -> bool {
143        unicode_ident::is_xid_continue(*self)
144    }
145}
146
147impl Sealed for u8 {}
148impl Char for u8 {
149    fn is_inline_whitespace(&self) -> bool {
150        *self == b' ' || *self == b'\t'
151    }
152
153    fn is_whitespace(&self) -> bool {
154        self.is_ascii_whitespace()
155    }
156
157    fn is_newline(&self) -> bool {
158        [
159            b'\n',   // Newline
160            b'\r',   // Carriage return
161            b'\x0B', // Vertical tab
162            b'\x0C', // Form feed
163        ]
164        .as_slice()
165        .contains(self)
166    }
167
168    fn digit_zero() -> Self {
169        b'0'
170    }
171
172    fn is_digit(&self, radix: u32) -> bool {
173        (*self as char).is_digit(radix)
174    }
175
176    fn to_ascii(&self) -> Option<u8> {
177        Some(*self)
178    }
179
180    fn is_ident_start(&self) -> bool {
181        (*self as char).is_ident_start()
182    }
183
184    fn is_ident_continue(&self) -> bool {
185        (*self as char).is_ident_continue()
186    }
187}
188
/// A parser that accepts (and ignores) any number of whitespace characters before or after another pattern.
#[derive(Copy, Clone)]
pub struct Padded<A> {
    /// The inner parser that is run between the two whitespace-skipping passes.
    pub(crate) parser: A,
}
194
195impl<'src, I, O, E, A> Parser<'src, I, O, E> for Padded<A>
196where
197    I: ValueInput<'src>,
198    E: ParserExtra<'src, I>,
199    I::Token: Char,
200    A: Parser<'src, I, O, E>,
201{
202    fn go<M: Mode>(&self, inp: &mut InputRef<'src, '_, I, E>) -> PResult<M, O> {
203        inp.skip_while(|c| c.is_whitespace());
204        let out = self.parser.go::<M>(inp)?;
205        inp.skip_while(|c| c.is_whitespace());
206        Ok(out)
207    }
208
209    go_extra!(O);
210}
211
/// Labels denoting a variety of text-related patterns.
///
/// The parsers in this module attach these labels to errors (via `LabelError`) to describe what kind of text was
/// expected at the point of failure.
#[non_exhaustive]
pub enum TextExpected<'src, I: StrInput<'src>>
where
    I::Token: Char,
{
    /// Whitespace (for example: spaces, tabs, or newlines).
    Whitespace,
    /// Inline whitespace (for example: spaces or tabs).
    InlineWhitespace,
    /// A newline character or sequence.
    Newline,
    /// A numeric digit within the given radix range.
    ///
    /// For example:
    ///
    /// - `Digit(0..10)` implies any base-10 digit
    /// - `Digit(1..16)` implies any non-zero hexadecimal digit
    Digit(Range<u32>),
    /// Part of an identifier, either ASCII or unicode.
    IdentifierPart,
    /// A specific identifier, carried as a slice of the input type.
    Identifier(I::Slice),
}
236
237/// A parser that accepts (and ignores) any number of whitespace characters.
238///
239/// This parser is a `Parser::Repeated` and so methods such as `at_least()` can be called on it.
240///
241/// The output type of this parser is `()`.
242///
243/// # Examples
244///
245/// ```
246/// # use chumsky::prelude::*;
247/// let whitespace = text::whitespace::<_, extra::Err<Simple<char>>>();
248///
249/// // Any amount of whitespace is parsed...
250/// assert_eq!(whitespace.parse("\t \n  \r ").into_result(), Ok(()));
251/// // ...including none at all!
252/// assert_eq!(whitespace.parse("").into_result(), Ok(()));
253/// ```
254pub fn whitespace<'src, I, E>() -> Repeated<impl Parser<'src, I, (), E> + Copy, (), I, E>
255where
256    I: StrInput<'src>,
257    I::Token: Char + 'src,
258    E: ParserExtra<'src, I>,
259    E::Error: LabelError<'src, I, TextExpected<'src, I>>,
260{
261    any()
262        .filter(|c: &I::Token| c.is_whitespace())
263        .map_err(|mut err: E::Error| {
264            err.label_with(TextExpected::Whitespace);
265            err
266        })
267        .ignored()
268        .repeated()
269}
270
271/// A parser that accepts (and ignores) any number of inline whitespace characters.
272///
273/// This parser is a `Parser::Repeated` and so methods such as `at_least()` can be called on it.
274///
275/// The output type of this parser is `()`.
276///
277/// # Examples
278///
279/// ```
280/// # use chumsky::prelude::*;
281/// let inline_whitespace = text::inline_whitespace::<_, extra::Err<Simple<char>>>();
282///
283/// // Any amount of inline whitespace is parsed...
284/// assert_eq!(inline_whitespace.parse("\t  ").into_result(), Ok(()));
285/// // ...including none at all!
286/// assert_eq!(inline_whitespace.parse("").into_result(), Ok(()));
287/// // ... but not newlines
288/// assert!(inline_whitespace.at_least(1).parse("\n\r").has_errors());
289/// ```
290pub fn inline_whitespace<'src, I, E>() -> Repeated<impl Parser<'src, I, (), E> + Copy, (), I, E>
291where
292    I: StrInput<'src>,
293    I::Token: Char + 'src,
294    E: ParserExtra<'src, I>,
295    E::Error: LabelError<'src, I, TextExpected<'src, I>>,
296{
297    any()
298        .filter(|c: &I::Token| c.is_inline_whitespace())
299        .map_err(|mut err: E::Error| {
300            err.label_with(TextExpected::InlineWhitespace);
301            err
302        })
303        .ignored()
304        .repeated()
305}
306
307/// A parser that accepts (and ignores) any newline characters or character sequences.
308///
309/// The output type of this parser is `()`.
310///
311/// This parser is quite extensive, recognizing:
312///
313/// - Line feed (`\n`)
314/// - Carriage return (`\r`)
315/// - Carriage return + line feed (`\r\n`)
316/// - Vertical tab (`\x0B`)
317/// - Form feed (`\x0C`)
318/// - Next line (`\u{0085}`)
319/// - Line separator (`\u{2028}`)
320/// - Paragraph separator (`\u{2029}`)
321///
322/// # Examples
323///
324/// ```
325/// # use chumsky::prelude::*;
326/// let newline = text::newline::<_, extra::Err<Simple<char>>>();
327///
328/// assert_eq!(newline.parse("\n").into_result(), Ok(()));
329/// assert_eq!(newline.parse("\r").into_result(), Ok(()));
330/// assert_eq!(newline.parse("\r\n").into_result(), Ok(()));
331/// assert_eq!(newline.parse("\x0B").into_result(), Ok(()));
332/// assert_eq!(newline.parse("\x0C").into_result(), Ok(()));
333/// assert_eq!(newline.parse("\u{0085}").into_result(), Ok(()));
334/// assert_eq!(newline.parse("\u{2028}").into_result(), Ok(()));
335/// assert_eq!(newline.parse("\u{2029}").into_result(), Ok(()));
336/// ```
337#[must_use]
338pub fn newline<'src, I, E>() -> impl Parser<'src, I, (), E> + Copy
339where
340    I: StrInput<'src>,
341    I::Token: Char + 'src,
342    E: ParserExtra<'src, I>,
343    &'src str: OrderedSeq<'src, I::Token>,
344    E::Error: LabelError<'src, I, TextExpected<'src, I>>,
345{
346    custom(|inp| {
347        let before = inp.cursor();
348
349        if inp
350            .peek()
351            .map_or(false, |c: I::Token| c.to_ascii() == Some(b'\r'))
352        {
353            inp.skip();
354            if inp
355                .peek()
356                .map_or(false, |c: I::Token| c.to_ascii() == Some(b'\n'))
357            {
358                inp.skip();
359            }
360            Ok(())
361        } else {
362            let c = inp.next();
363            if c.map_or(false, |c: I::Token| c.is_newline()) {
364                Ok(())
365            } else {
366                let span = inp.span_since(&before);
367                Err(LabelError::expected_found(
368                    [TextExpected::Newline],
369                    c.map(MaybeRef::Val),
370                    span,
371                ))
372            }
373        }
374    })
375}
376
377/// A parser that accepts one or more ASCII digits.
378///
379/// The output type of this parser is `I::Slice` (i.e: [`&str`] when `I` is [`&str`], and [`&[u8]`]
380/// when `I::Slice` is [`&[u8]`]).
381///
382/// The `radix` parameter functions identically to [`char::is_digit`]. If in doubt, choose `10`.
383///
384/// # Examples
385///
386/// ```
387/// # use chumsky::prelude::*;
388/// let digits = text::digits::<_, extra::Err<Simple<char>>>(10).to_slice();
389///
390/// assert_eq!(digits.parse("0").into_result(), Ok("0"));
391/// assert_eq!(digits.parse("1").into_result(), Ok("1"));
392/// assert_eq!(digits.parse("01234").into_result(), Ok("01234"));
393/// assert_eq!(digits.parse("98345").into_result(), Ok("98345"));
394/// // A string of zeroes is still valid. Use `int` if this is not desirable.
395/// assert_eq!(digits.parse("0000").into_result(), Ok("0000"));
396/// assert!(digits.parse("").has_errors());
397/// ```
398#[must_use]
399pub fn digits<'src, I, E>(
400    radix: u32,
401) -> Repeated<impl Parser<'src, I, <I as Input<'src>>::Token, E> + Copy, I::Token, I, E>
402where
403    I: StrInput<'src>,
404    I::Token: Char + 'src,
405    E: ParserExtra<'src, I>,
406    E::Error: LabelError<'src, I, TextExpected<'src, I>>,
407{
408    any()
409        .filter(move |c: &I::Token| c.is_digit(radix))
410        .map_err(move |mut err: E::Error| {
411            err.label_with(TextExpected::Digit(0..radix));
412            err
413        })
414        .repeated()
415        .at_least(1)
416}
417
418/// A parser that accepts a non-negative integer.
419///
420/// An integer is defined as a non-empty sequence of ASCII digits, where the first digit is non-zero or the sequence
421/// has length one.
422///
423/// The output type of this parser is `I::Slice` (i.e: [`&str`] when `I` is [`&str`], and [`&[u8]`]
424/// when `I::Slice` is [`&[u8]`]).
425///
426/// The `radix` parameter functions identically to [`char::is_digit`]. If in doubt, choose `10`.
427///
428/// # Examples
429///
430/// ```
431/// # use chumsky::prelude::*;
432/// let dec = text::int::<_, extra::Err<Simple<char>>>(10);
433///
434/// assert_eq!(dec.parse("0").into_result(), Ok("0"));
435/// assert_eq!(dec.parse("1").into_result(), Ok("1"));
436/// assert_eq!(dec.parse("1452").into_result(), Ok("1452"));
437/// // No leading zeroes are permitted!
438/// assert!(dec.parse("04").has_errors());
439///
440/// let hex = text::int::<_, extra::Err<Simple<char>>>(16);
441///
442/// assert_eq!(hex.parse("2A").into_result(), Ok("2A"));
443/// assert_eq!(hex.parse("d").into_result(), Ok("d"));
444/// assert_eq!(hex.parse("b4").into_result(), Ok("b4"));
445/// assert!(hex.parse("0B").has_errors());
446/// ```
447///
448#[must_use]
449pub fn int<'src, I, E>(radix: u32) -> impl Parser<'src, I, <I as SliceInput<'src>>::Slice, E> + Copy
450where
451    I: StrInput<'src>,
452    I::Token: Char + 'src,
453    E: ParserExtra<'src, I>,
454    E::Error:
455        LabelError<'src, I, TextExpected<'src, I>> + LabelError<'src, I, MaybeRef<'src, I::Token>>,
456{
457    any()
458        .filter(move |c: &I::Token| c.is_digit(radix) && c != &I::Token::digit_zero())
459        .map_err(move |mut err: E::Error| {
460            err.label_with(TextExpected::Digit(1..radix));
461            err
462        })
463        .then(
464            any()
465                .filter(move |c: &I::Token| c.is_digit(radix))
466                .map_err(move |mut err: E::Error| {
467                    err.label_with(TextExpected::Digit(0..radix));
468                    err
469                })
470                .repeated(),
471        )
472        .ignored()
473        .or(just(I::Token::digit_zero()).ignored())
474        .to_slice()
475}
476
477/// Parsers and utilities for working with ASCII inputs.
478pub mod ascii {
479    use super::*;
480
481    /// A parser that accepts a C-style identifier.
482    ///
483    /// The output type of this parser is [`SliceInput::Slice`] (i.e: [`&str`] when `I` is [`&str`], and [`&[u8]`] when `I` is
484    /// [`&[u8]`]).
485    ///
486    /// An identifier is defined as an ASCII alphabetic character or an underscore followed by any number of alphanumeric
487    /// characters or underscores. The regex pattern for it is `[a-zA-Z_][a-zA-Z0-9_]*`.
488    #[must_use]
489    pub fn ident<'src, I, E>() -> impl Parser<'src, I, <I as SliceInput<'src>>::Slice, E> + Copy
490    where
491        I: StrInput<'src>,
492        I::Token: Char + 'src,
493        E: ParserExtra<'src, I>,
494        E::Error: LabelError<'src, I, TextExpected<'src, I>>,
495    {
496        any()
497            .filter(|c: &I::Token| {
498                c.to_ascii()
499                    .map_or(false, |i| i.is_ascii_alphabetic() || i == b'_')
500            })
501            .map_err(|mut err: E::Error| {
502                err.label_with(TextExpected::IdentifierPart);
503                err
504            })
505            .then(
506                any()
507                    .filter(|c: &I::Token| {
508                        c.to_ascii()
509                            .map_or(false, |i| i.is_ascii_alphanumeric() || i == b'_')
510                    })
511                    .map_err(|mut err: E::Error| {
512                        err.label_with(TextExpected::IdentifierPart);
513                        err
514                    })
515                    .repeated(),
516            )
517            .to_slice()
518    }
519
520    /// Like [`ident`], but only accepts a specific identifier while rejecting trailing identifier characters.
521    ///
522    /// The output type of this parser is `I::Slice` (i.e: [`&str`] when `I` is [`&str`], and [`&[u8]`]
523    /// when `I::Slice` is [`&[u8]`]).
524    ///
525    /// # Examples
526    ///
527    /// ```
528    /// # use chumsky::prelude::*;
529    /// let def = text::ascii::keyword::<_, _, extra::Err<Simple<char>>>("def");
530    ///
531    /// // Exactly 'def' was found
532    /// assert_eq!(def.parse("def").into_result(), Ok("def"));
533    /// // Exactly 'def' was found, with non-identifier trailing characters
534    /// // This works because we made the parser lazy: it parses 'def' and ignores the rest
535    /// assert_eq!(def.clone().lazy().parse("def(foo, bar)").into_result(), Ok("def"));
536    /// // 'def' was found, but only as part of a larger identifier, so this fails to parse
537    /// assert!(def.lazy().parse("define").has_errors());
538    /// ```
539    #[track_caller]
540    pub fn keyword<'src, I, S, E>(
541        keyword: S,
542    ) -> impl Parser<'src, I, <I as SliceInput<'src>>::Slice, E> + Clone + 'src
543    where
544        I: StrInput<'src>,
545        I::Slice: PartialEq,
546        I::Token: Char + fmt::Debug + 'src,
547        S: PartialEq<I::Slice> + Clone + 'src,
548        E: ParserExtra<'src, I> + 'src,
549        E::Error: LabelError<'src, I, TextExpected<'src, I>> + LabelError<'src, I, S>,
550    {
551        /*
552        #[cfg(debug_assertions)]
553        {
554            let mut cs = keyword.seq_iter();
555            if let Some(c) = cs.next() {
556                let c = c.borrow().to_char();
557                assert!(c.is_ascii_alphabetic() || c == '_', "The first character of a keyword must be ASCII alphabetic or an underscore, not {:?}", c);
558            } else {
559                panic!("Keyword must have at least one character");
560            }
561            for c in cs {
562                let c = c.borrow().to_char();
563                assert!(c.is_ascii_alphanumeric() || c == '_', "Trailing characters of a keyword must be ASCII alphanumeric or an underscore, not {:?}", c);
564            }
565        }
566        */
567        ident()
568            .try_map(move |s: I::Slice, span| {
569                if keyword == s {
570                    Ok(())
571                } else {
572                    Err(LabelError::expected_found([keyword.clone()], None, span))
573                }
574            })
575            .to_slice()
576    }
577}
578
579// Unicode is the default
580pub use unicode::*;
581
582/// Parsers and utilities for working with unicode inputs.
583pub mod unicode {
584    use super::*;
585
586    use core::str::{Bytes, Chars};
587    use unicode_segmentation::UnicodeSegmentation;
588
    /// A type containing one extended Unicode grapheme cluster.
    ///
    /// This is an unsized, `#[repr(transparent)]` wrapper around [`str`], so it is only ever handled behind a
    /// pointer such as `&Grapheme` or `Box<Grapheme>`.
    #[derive(PartialEq, Eq)]
    #[repr(transparent)]
    pub struct Grapheme {
        // The text of the grapheme cluster.
        inner: str,
    }
595
596    impl Grapheme {
597        fn new(inner: &str) -> &Self {
598            // SAFETY: This is ok because Grapheme is #[repr(transparent)]
599            unsafe { &*(inner as *const str as *const Self) }
600        }
601
602        /// Creates a new grapheme with the character `'0'` inside it.
603        pub fn digit_zero() -> &'static Self {
604            Self::new("0")
605        }
606
607        /// Gets an iterator over code points.
608        pub fn code_points(&self) -> Chars<'_> {
609            self.inner.chars()
610        }
611
612        /// Gets an iterator over bytes.
613        pub fn bytes(&self) -> Bytes<'_> {
614            self.inner.bytes()
615        }
616
617        /// Gets the slice of code points that are contained in the grapheme cluster.
618        pub fn as_str(&self) -> &str {
619            &self.inner
620        }
621
622        /// Gets the slice of bytes that are contained in the grapheme cluster.
623        pub fn as_bytes(&self) -> &[u8] {
624            self.inner.as_bytes()
625        }
626
627        /// Splits the grapheme into the first code point and the remaining code points.
628        pub fn split(&self) -> (char, &str) {
629            let mut iter = self.inner.chars();
630            // The operation never falls because the grapheme always contains at least one code point.
631            let first = iter.next().unwrap();
632            (first, iter.as_str())
633        }
634    }
635
636    impl fmt::Debug for Grapheme {
637        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
638            f.write_str("g'")?;
639            for i in self.as_str().chars() {
640                write!(f, "{}", i.escape_debug())?;
641            }
642            f.write_str("'")?;
643            Ok(())
644        }
645    }
646
647    impl fmt::Display for Grapheme {
648        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
649            fmt::Display::fmt(&self.inner, f)
650        }
651    }
652
653    impl AsRef<str> for Grapheme {
654        fn as_ref(&self) -> &str {
655            self.as_str()
656        }
657    }
658
659    impl AsRef<[u8]> for Grapheme {
660        fn as_ref(&self) -> &[u8] {
661            self.as_bytes()
662        }
663    }
664
665    impl AsRef<Grapheme> for Grapheme {
666        fn as_ref(&self) -> &Grapheme {
667            self
668        }
669    }
670
671    impl Borrow<str> for Grapheme {
672        fn borrow(&self) -> &str {
673            self.as_str()
674        }
675    }
676
677    impl Borrow<[u8]> for Grapheme {
678        fn borrow(&self) -> &[u8] {
679            self.as_bytes()
680        }
681    }
682
683    impl<'src> From<&'src Grapheme> for Box<Grapheme> {
684        fn from(value: &'src Grapheme) -> Self {
685            let value: Box<str> = Box::from(value.as_str());
686            // SAFETY: This is ok because Grapheme is #[repr(transparent)]
687            unsafe { Box::from_raw(Box::into_raw(value) as *mut Grapheme) }
688        }
689    }
690
691    impl From<Box<Grapheme>> for Box<str> {
692        fn from(value: Box<Grapheme>) -> Self {
693            // SAFETY: This is ok because Grapheme is #[repr(transparent)]
694            unsafe { Box::from_raw(Box::into_raw(value) as *mut str) }
695        }
696    }
697
698    impl From<Box<Grapheme>> for Box<[u8]> {
699        fn from(value: Box<Grapheme>) -> Self {
700            Box::<str>::from(value).into()
701        }
702    }
703
    /// A type containing any number of extended Unicode grapheme clusters.
    ///
    /// This is an unsized, `#[repr(transparent)]` wrapper around [`str`], so it is only ever handled behind a
    /// pointer such as `&Graphemes` or `Box<Graphemes>`.
    #[derive(PartialEq, Eq)]
    #[repr(transparent)]
    pub struct Graphemes {
        // The wrapped text.
        inner: str,
    }
710
711    impl Graphemes {
712        /// Create a new graphemes.
713        pub fn new(inner: &str) -> &Self {
714            // SAFETY: This is ok because Graphemes is #[repr(transparent)]
715            unsafe { &*(inner as *const str as *const Self) }
716        }
717
718        /// Gets an iterator over graphemes.
719        pub fn iter(&self) -> GraphemesIter<'_> {
720            self.into_iter()
721        }
722
723        /// Gets an iterator over code points.
724        pub fn code_points(&self) -> Chars<'_> {
725            self.inner.chars()
726        }
727
728        /// Gets an iterator over bytes.
729        pub fn bytes(&self) -> Bytes<'_> {
730            self.inner.bytes()
731        }
732
733        /// Gets the slice of code points that are contained in the string.
734        pub fn as_str(&self) -> &str {
735            &self.inner
736        }
737
738        /// Gets the slice of bytes that are contained in the string.
739        pub fn as_bytes(&self) -> &[u8] {
740            self.inner.as_bytes()
741        }
742    }
743
744    impl fmt::Debug for Graphemes {
745        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
746            f.write_str("g")?;
747            fmt::Debug::fmt(&self.inner, f)
748        }
749    }
750
751    impl fmt::Display for Graphemes {
752        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
753            fmt::Display::fmt(&self.inner, f)
754        }
755    }
756
757    impl AsRef<str> for Graphemes {
758        fn as_ref(&self) -> &str {
759            self.as_str()
760        }
761    }
762
763    impl AsRef<[u8]> for Graphemes {
764        fn as_ref(&self) -> &[u8] {
765            self.as_bytes()
766        }
767    }
768
769    impl AsRef<Graphemes> for Graphemes {
770        fn as_ref(&self) -> &Graphemes {
771            self
772        }
773    }
774
775    impl Borrow<str> for Graphemes {
776        fn borrow(&self) -> &str {
777            self.as_str()
778        }
779    }
780
781    impl Borrow<[u8]> for Graphemes {
782        fn borrow(&self) -> &[u8] {
783            self.as_bytes()
784        }
785    }
786
787    impl<'src> From<&'src str> for &'src Graphemes {
788        fn from(value: &'src str) -> Self {
789            Graphemes::new(value)
790        }
791    }
792
793    impl<'src> From<&'src Graphemes> for &'src str {
794        fn from(value: &'src Graphemes) -> Self {
795            value.as_str()
796        }
797    }
798
799    impl<'src> From<&'src Graphemes> for Box<Graphemes> {
800        fn from(value: &'src Graphemes) -> Self {
801            value.as_str().into()
802        }
803    }
804
805    impl<'src> From<&'src str> for Box<Graphemes> {
806        fn from(value: &'src str) -> Self {
807            Box::<str>::from(value).into()
808        }
809    }
810
811    impl From<Box<str>> for Box<Graphemes> {
812        fn from(value: Box<str>) -> Self {
813            // SAFETY: This is ok because Grapheme is #[repr(transparent)]
814            unsafe { Box::from_raw(Box::into_raw(value) as *mut Graphemes) }
815        }
816    }
817
818    impl From<Box<Graphemes>> for Box<str> {
819        fn from(value: Box<Graphemes>) -> Self {
820            // SAFETY: This is ok because Grapheme is #[repr(transparent)]
821            unsafe { Box::from_raw(Box::into_raw(value) as *mut str) }
822        }
823    }
824
825    impl From<Box<Graphemes>> for Box<[u8]> {
826        fn from(value: Box<Graphemes>) -> Self {
827            Box::<str>::from(value).into()
828        }
829    }
830
831    impl<'src> IntoIterator for &'src Graphemes {
832        type Item = &'src Grapheme;
833
834        type IntoIter = GraphemesIter<'src>;
835
836        fn into_iter(self) -> Self::IntoIter {
837            GraphemesIter::new(self)
838        }
839    }
840
841    impl Sealed for &'_ Graphemes {}
842    impl<'src> StrInput<'src> for &'src Graphemes {
843        #[doc(hidden)]
844        fn stringify(slice: Self::Slice) -> String {
845            slice.to_string()
846        }
847    }
848
    /// Input implementation that yields one grapheme cluster per token.
    ///
    /// The cursor is a byte offset into the underlying string; it starts at zero and is advanced by the byte
    /// length of each grapheme that is produced.
    impl<'src> Input<'src> for &'src Graphemes {
        type Cursor = usize;
        type Span = SimpleSpan<usize>;

        type Token = &'src Grapheme;
        type MaybeToken = &'src Grapheme;

        type Cache = Self;

        /// Starts at byte offset zero, using the input itself as the cache.
        #[inline]
        fn begin(self) -> (Self::Cursor, Self::Cache) {
            (0, self)
        }

        /// The cursor already is the location, so it is returned unchanged.
        #[inline]
        fn cursor_location(cursor: &Self::Cursor) -> usize {
            *cursor
        }

        /// Returns the grapheme starting at `cursor` and advances `cursor` past it, or returns `None` once the
        /// cursor has reached the end of the string.
        #[inline(always)]
        unsafe fn next_maybe(
            this: &mut Self::Cache,
            cursor: &mut Self::Cursor,
        ) -> Option<Self::MaybeToken> {
            if *cursor < this.as_str().len() {
                // SAFETY: `cursor < this.as_str().len()` above guarantees cursor is in-bounds
                //         We only ever return cursors that are at a code point boundary.
                //         The `next()` implementation returns `None`, only in the
                //         situation of zero length of the remaining part of the string.
                //         And the Unicode standard guarantees that any sequence of code
                //         points is a valid sequence of grapheme clusters, so the
                //         behaviour of the `next()` function should not change.
                let c = this
                    .as_str()
                    .get_unchecked(*cursor..)
                    .graphemes(true)
                    .next()
                    .unwrap_unchecked();
                // Advance by the byte length of the grapheme that was just read.
                *cursor += c.len();
                Some(Grapheme::new(c))
            } else {
                None
            }
        }

        /// Builds a span directly from the two cursor values.
        #[inline(always)]
        unsafe fn span(_this: &mut Self::Cache, range: Range<&Self::Cursor>) -> Self::Span {
            (*range.start..*range.end).into()
        }
    }
899
900    impl<'src> ExactSizeInput<'src> for &'src Graphemes {
901        #[inline(always)]
902        unsafe fn span_from(this: &mut Self::Cache, range: RangeFrom<&Self::Cursor>) -> Self::Span {
903            (*range.start..this.as_str().len()).into()
904        }
905    }
906
907    impl<'src> ValueInput<'src> for &'src Graphemes {
908        #[inline(always)]
909        unsafe fn next(this: &mut Self::Cache, cursor: &mut Self::Cursor) -> Option<Self::Token> {
910            Self::next_maybe(this, cursor)
911        }
912    }
913
914    impl<'src> SliceInput<'src> for &'src Graphemes {
915        type Slice = Self;
916
917        #[inline(always)]
918        fn full_slice(this: &mut Self::Cache) -> Self::Slice {
919            *this
920        }
921
922        #[inline(always)]
923        unsafe fn slice(this: &mut Self::Cache, range: Range<&Self::Cursor>) -> Self::Slice {
924            Graphemes::new(&this.as_str()[*range.start..*range.end])
925        }
926
927        #[inline(always)]
928        unsafe fn slice_from(
929            this: &mut Self::Cache,
930            from: RangeFrom<&Self::Cursor>,
931        ) -> Self::Slice {
932            Graphemes::new(&this.as_str()[*from.start..])
933        }
934    }
935
    /// Grapheme iterator type.
    ///
    /// Wraps the grapheme iterator from the `unicode_segmentation` crate and yields its items as [`Grapheme`]s.
    #[derive(Debug, Clone)]
    pub struct GraphemesIter<'src> {
        // The underlying `unicode_segmentation` iterator.
        iter: unicode_segmentation::Graphemes<'src>,
    }
941
942    impl<'src> GraphemesIter<'src> {
943        /// Create a new grapheme iterator.
944        pub fn new(graphemes: &'src Graphemes) -> Self {
945            Self {
946                iter: graphemes.as_str().graphemes(true),
947            }
948        }
949
950        /// Gets the slice of code points that are contained in the grapheme cluster.
951        pub fn as_str(self) -> &'src str {
952            self.iter.as_str()
953        }
954    }
955
956    impl<'src> Iterator for GraphemesIter<'src> {
957        type Item = &'src Grapheme;
958
959        #[inline]
960        fn size_hint(&self) -> (usize, Option<usize>) {
961            self.iter.size_hint()
962        }
963
964        #[inline]
965        fn next(&mut self) -> Option<Self::Item> {
966            self.iter.next().map(Grapheme::new)
967        }
968    }
969
970    impl DoubleEndedIterator for GraphemesIter<'_> {
971        #[inline]
972        fn next_back(&mut self) -> Option<Self::Item> {
973            self.iter.next_back().map(Grapheme::new)
974        }
975    }
976
977    /// A parser that accepts an identifier.
978    ///
979    /// The output type of this parser is [`SliceInput::Slice`] (i.e: [`&str`] when `I` is [`&str`], and [`&[u8]`] when `I` is
980    /// [`&[u8]`]).
981    ///
982    /// An identifier is defined as per "Default Identifiers" in [Unicode Standard Annex #31](https://www.unicode.org/reports/tr31/).
983    #[must_use]
984    pub fn ident<'src, I, E>() -> impl Parser<'src, I, <I as SliceInput<'src>>::Slice, E> + Copy
985    where
986        I: StrInput<'src>,
987        I::Token: Char + 'src,
988        E: ParserExtra<'src, I>,
989        E::Error: LabelError<'src, I, TextExpected<'src, I>>,
990    {
991        any()
992            .filter(|c: &I::Token| c.is_ident_start())
993            .map_err(|mut err: E::Error| {
994                err.label_with(TextExpected::IdentifierPart);
995                err
996            })
997            .then(
998                any()
999                    .filter(|c: &I::Token| c.is_ident_continue())
1000                    .map_err(|mut err: E::Error| {
1001                        err.label_with(TextExpected::IdentifierPart);
1002                        err
1003                    })
1004                    .repeated(),
1005            )
1006            .to_slice()
1007    }
1008
1009    /// Like [`ident`], but only accepts a specific identifier while rejecting trailing identifier characters.
1010    ///
1011    /// The output type of this parser is `I::Slice` (i.e: [`&str`] when `I` is [`&str`], and [`&[u8]`]
1012    /// when `I::Slice` is [`&[u8]`]).
1013    ///
1014    /// # Examples
1015    ///
1016    /// ```
1017    /// # use chumsky::prelude::*;
1018    /// let def = text::ascii::keyword::<_, _, extra::Err<Simple<char>>>("def");
1019    ///
1020    /// // Exactly 'def' was found
1021    /// assert_eq!(def.parse("def").into_result(), Ok("def"));
1022    /// // Exactly 'def' was found, with non-identifier trailing characters
1023    /// // This works because we made the parser lazy: it parses 'def' and ignores the rest
1024    /// assert_eq!(def.clone().lazy().parse("def(foo, bar)").into_result(), Ok("def"));
1025    /// // 'def' was found, but only as part of a larger identifier, so this fails to parse
1026    /// assert!(def.lazy().parse("define").has_errors());
1027    /// ```
1028    #[track_caller]
1029    pub fn keyword<'src, I, S, E>(
1030        keyword: S,
1031    ) -> impl Parser<'src, I, <I as SliceInput<'src>>::Slice, E> + Clone + 'src
1032    where
1033        I: StrInput<'src>,
1034        I::Slice: PartialEq,
1035        I::Token: Char + fmt::Debug + 'src,
1036        S: PartialEq<I::Slice> + Clone + 'src,
1037        E: ParserExtra<'src, I> + 'src,
1038        E::Error: LabelError<'src, I, TextExpected<'src, I>> + LabelError<'src, I, S>,
1039    {
1040        /*
1041        #[cfg(debug_assertions)]
1042        {
1043            let mut cs = keyword.seq_iter();
1044            if let Some(c) = cs.next() {
1045                let c = c.borrow();
1046                assert!(
1047                    c.is_ident_start(),
1048                    "The first character of a keyword must be a valid unicode XID_START, not {:?}",
1049                    c
1050                );
1051            } else {
1052                panic!("Keyword must have at least one character");
1053            }
1054            for c in cs {
1055                let c = c.borrow();
1056                assert!(c.is_ident_continue(), "Trailing characters of a keyword must be valid as unicode XID_CONTINUE, not {:?}", c);
1057            }
1058        }
1059        */
1060        ident()
1061            .try_map(move |s: I::Slice, span| {
1062                if keyword.borrow() == &s {
1063                    Ok(())
1064                } else {
1065                    Err(LabelError::expected_found([keyword.clone()], None, span))
1066                }
1067            })
1068            .to_slice()
1069    }
1070
1071    /// Like [`char::is_whitespace`], but rejects the characters U+202A, U+202B, U+202C, U+202D, U+202E, U+2066, U+2067, U+2068, U+2069
1072    /// to mitigate against [CVE-2021-42574](https://nvd.nist.gov/vuln/detail/CVE-2021-42574)
1073    pub fn is_whitespace(c: char) -> bool {
1074        c.is_whitespace()
1075            && !matches!(
1076                c,
1077                '\u{202A}'
1078                    | '\u{202B}'
1079                    | '\u{202C}'
1080                    | '\u{202D}'
1081                    | '\u{202E}'
1082                    | '\u{2066}'
1083                    | '\u{2067}'
1084                    | '\u{2068}'
1085                    | '\u{2069}'
1086            )
1087    }
1088}
1089
#[cfg(test)]
mod tests {
    use crate::prelude::*;
    use std::fmt;

    /// Builds an ASCII keyword parser for `s` whose output is discarded.
    fn make_ascii_kw_parser<'src, I>(s: I::Slice) -> impl Parser<'src, I, ()>
    where
        I: crate::StrInput<'src>,
        I::Slice: PartialEq + Clone,
        I::Token: crate::Char + fmt::Debug + 'src,
    {
        let kw = text::ascii::keyword(s);
        kw.ignored()
    }

    /// Builds a Unicode keyword parser for `s` whose output is discarded.
    fn make_unicode_kw_parser<'src, I>(s: I::Slice) -> impl Parser<'src, I, ()>
    where
        I: crate::StrInput<'src>,
        I::Slice: PartialEq + Clone,
        I::Token: crate::Char + fmt::Debug + 'src,
    {
        let kw = text::unicode::keyword(s);
        kw.ignored()
    }

    /// Asserts that `parser` consumes all of `input` without errors and outputs `input` itself.
    fn test_ok<'src, P: Parser<'src, &'src str, &'src str>>(parser: P, input: &'src str) {
        let expected = ParseResult {
            output: Some(input),
            errs: vec![],
        };
        assert_eq!(parser.parse(input), expected);
    }

    /// Asserts that `parser` fails on `input`, producing no output and exactly one default error.
    fn test_err<'src, P: Parser<'src, &'src str, &'src str>>(parser: P, input: &'src str) {
        let expected = ParseResult {
            output: None,
            errs: vec![EmptyErr::default()],
        };
        assert_eq!(parser.parse(input), expected);
    }

    // Constructing a keyword parser from a valid keyword must not panic.
    #[test]
    fn keyword_good() {
        for kw in ["hello", "_42", "_42"] {
            let _ = make_ascii_kw_parser::<&str>(kw);
        }

        for kw in ["שלום", "привет", "你好"] {
            let _ = make_unicode_kw_parser::<&str>(kw);
        }
    }

    #[test]
    fn ident() {
        let ident = text::ident::<&str, extra::Default>();

        let accepted = ["foo", "foo_bar", "foo_", "_foo", "_", "__", "__init__"];
        for input in accepted {
            test_ok(ident, input);
        }

        let rejected = ["", ".", "123"];
        for input in rejected {
            test_err(ident, input);
        }
    }

    #[test]
    fn whitespace() {
        use crate::{whitespace, LabelError, TextExpected};

        let parser = whitespace::<&str, extra::Err<Rich<_>>>().exactly(1);

        // Empty input: whitespace was expected at offset 0, but the end of input was found.
        let expected_err = LabelError::<&str, _>::expected_found(
            vec![TextExpected::<&str>::Whitespace],
            None,
            SimpleSpan::new((), 0..0),
        );

        assert_eq!(
            parser.parse("").into_output_errors(),
            (None, vec![expected_err])
        );
    }

    // These tests are disabled; they expect keyword construction to panic on invalid keywords.
    /*
    #[test]
    #[should_panic]
    fn keyword_numeric() {
        make_ascii_kw_parser::<&str>("42");
    }

    #[test]
    #[should_panic]
    fn keyword_empty() {
        make_ascii_kw_parser::<&str>("");
    }

    #[test]
    #[should_panic]
    fn keyword_not_alphanum() {
        make_ascii_kw_parser::<&str>("hi\n");
    }

    #[test]
    #[should_panic]
    fn keyword_unicode_in_ascii() {
        make_ascii_kw_parser::<&str>("שלום");
    }
    */
}
chumsky/text.rs

chumsky/
text.rs