// simple_tokenizer/lib.rs

1#![no_std]
2#![doc = include_str!(concat!(env!("CARGO_MANIFEST_DIR"), "/README.md"))]
3#![warn(missing_docs)]
4
5use core::fmt::{self, Display};
6
7#[cfg(feature = "yap")]
8pub use yap;
9
10/// Support for `yap` crate.
11#[cfg(feature = "yap")]
12pub mod yap_support;
13
/// Byte range in the source input.
///
/// A half-open `start..end` range of byte indices into the original input,
/// as produced by [`Tokens::span()`] for the most recently consumed token.
pub type Span = core::ops::Range<usize>;
16
/// Position (line & column) in the source input.
///
/// Both fields are 1-based: the very first character of the input
/// is at line 1, column 1.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Position {
    /// Line count
    pub line: u32,
    /// Column count
    pub column: u32,
}

impl Position {
    /// Starting position (line = 1, column = 1).
    #[inline]
    pub const fn starting() -> Self {
        Position { line: 1, column: 1 }
    }

    /// Updates the position.
    /// If ch == '\n', increases line count and resets column count,
    /// otherwise just increases column count.
    ///
    /// # Example
    ///
    /// ```rust
    /// use simple_tokenizer::Position;
    ///
    /// let mut pos = Position::starting();
    ///
    /// assert_eq!(pos.update_from_char(' '), Position { line: 1, column: 2 });
    /// assert_eq!(pos.update_from_char('\n'), Position { line: 2, column: 1 });
    ///
    /// ```
    pub fn update_from_char(&mut self, ch: char) -> Self {
        match ch {
            '\n' => {
                self.line += 1;
                self.column = 1;
            }
            _ => self.column += 1,
        }

        *self
    }

    /// Updates the position.
    /// Identical to calling `update_from_char()` for every character of the string.
    ///
    /// # Example
    ///
    /// ```rust
    /// use simple_tokenizer::Position;
    ///
    /// let mut pos = Position::starting();
    ///
    /// assert_eq!(pos.update_from_str("line 1\nline 2\nlong line 3"), Position { line: 3, column: 12 });
    /// assert_eq!(pos.update_from_str(""), Position { line: 3, column: 12 });
    /// assert_eq!(pos.update_from_str("continuation"), Position { line: 3, column: 24 });
    ///
    /// ```
    pub fn update_from_str(&mut self, s: &str) -> Self {
        // '\n' is a single byte in UTF-8, so a plain byte scan counts
        // line breaks exactly.
        let newlines = s.bytes().filter(|&b| b == b'\n').count() as u32;
        self.line += newlines;

        match s.rfind('\n') {
            // The characters after the final '\n' form the (new) current
            // line; columns are 1-based, hence the leading 1.
            Some(border) => {
                self.column = 1 + s[border + 1..].chars().count() as u32;
            }
            // No line break: the whole string extends the current line.
            None => {
                self.column += s.chars().count() as u32;
            }
        }

        *self
    }
}

impl Display for Position {
    #[inline]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "[line {}, col {}]", self.line, self.column)
    }
}
110
/// Byte offset in the source input.
///
/// A newtype over `usize` so offsets handed out by [`Tokens::offset()`]
/// and accepted by [`Tokens::set_offset()`] are not confused with
/// arbitrary lengths or counts.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Offset(pub usize);
114
/// Tokens instance.
///
/// Bundles the source string with the current cursor state: the
/// unconsumed remainder, the byte span of the last consumed token,
/// the line/column position, and the absolute byte offset.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Tokens<'s> {
    // The complete original input; never modified after construction.
    full_input: &'s str,
    // Not-yet-consumed portion of `full_input`. Normally the suffix
    // starting at `offset`; the `limit_*` methods may additionally
    // shrink it from the right.
    remaining_input: &'s str,
    // Byte range (into `full_input`) of the most recently consumed token.
    span: Span,
    // Line/column position just after the last consumed token.
    pos: Position,
    // Byte offset into `full_input` just after the last consumed token.
    offset: usize,
}
124
125impl<'s> Tokens<'s> {
126    /// Construct a new instance from a string slice.
127    #[inline]
128    pub fn new(input: &'s str) -> Self {
129        Self {
130            full_input: input,
131            remaining_input: input,
132            span: 0..0,
133            pos: Position::starting(),
134            offset: 0,
135        }
136    }
137
138    /// Returns the original full input.
139    #[inline]
140    pub fn input(&self) -> &str {
141        self.full_input
142    }
143
144    /// Part of the input string that hasn't been consumed yet.
145    #[inline]
146    pub fn remainder(&self) -> &str {
147        self.remaining_input
148    }
149
150    /// Byte span of the last token.
151    #[inline]
152    pub fn span(&self) -> Span {
153        self.span.clone()
154    }
155
156    /// Current position (just after the last token).
157    #[inline]
158    pub fn position(&self) -> Position {
159        self.pos
160    }
161
162    /// Current byte offset in the source.
163    #[inline]
164    pub fn offset(&self) -> Offset {
165        Offset(self.offset)
166    }
167
168    /// Sets the offset if it is valid, updating position and span.
169    /// Returns `true` if the offset is valid, `false` otherwise.
170    pub fn set_offset(&mut self, offset: Offset) -> bool {
171        let offset = offset.0;
172
173        if self.full_input.is_char_boundary(offset) {
174            self.remaining_input = &self.full_input[offset..];
175
176            self.span = offset..offset;
177            self.pos = Position::starting().update_from_str(&self.full_input[..offset]);
178            self.offset = offset;
179            true
180        } else {
181            false
182        }
183    }
184
185    /// Returns `true` if the current position is the start of input.
186    #[inline]
187    pub fn is_at_start(&self) -> bool {
188        self.offset == 0
189    }
190
191    /// Returns `true` if the input has been exhausted.
192    #[inline]
193    pub fn is_at_end(&self) -> bool {
194        self.remaining_input.is_empty()
195    }
196
197    /// Peeks at the next character of the input.
198    #[inline]
199    pub fn peek(&self) -> Option<char> {
200        self.remaining_input.chars().next()
201    }
202
203    /// Consumes the rest of input.
204    ///
205    /// # Example
206    ///
207    /// ```rust
208    /// use simple_tokenizer::*;
209    ///
210    /// let mut tokens = "tokens".as_tokens();
211    ///
212    /// assert_eq!(tokens.consume_all(), "tokens");
213    /// assert!(tokens.remainder().is_empty());
214    ///
215    /// ```
216    #[inline]
217    pub fn consume_all(&mut self) -> &str {
218        self.split(self.remaining_input.len())
219    }
220
221    /// Consume the next substring equal to `token` or nothing.
222    /// Basically a shortcut for `bytes_if(token.len(), |s| s == token).is_some()`.
223    ///
224    /// # Example
225    ///
226    /// ```rust
227    /// use simple_tokenizer::*;
228    ///
229    /// let mut tokens = "tok1 tok2".as_tokens();
230    ///
231    /// assert!(tokens.token("tok1"));
232    /// assert_eq!(tokens.remainder(), " tok2");
233    ///
234    /// assert!(!tokens.token(" tok3"));
235    /// assert_eq!(tokens.remainder(), " tok2");
236    ///
237    /// ```
238    pub fn token(&mut self, token: impl AsRef<str>) -> bool {
239        let token = token.as_ref();
240
241        self.remaining_input
242            .get(..token.len())
243            .filter(|s| *s == token)
244            .map(|s| self.split(s.len()))
245            .is_some()
246    }
247
248    /// Try to consume a substring equal to one of the provided tokens.
249    /// Returns the first successful substring.
250    ///
251    /// # Example
252    ///
253    /// ```rust
254    /// use simple_tokenizer::*;
255    ///
256    /// let mut tokens = "tok1 tok2".as_tokens();
257    ///
258    /// assert_eq!(tokens.tokens(&["tok", "tok1"]), Some("tok"));
259    /// assert_eq!(tokens.remainder(), "1 tok2");
260    ///
261    /// assert_eq!(tokens.tokens(&["1 tok3", "2 tok2"]), None);
262    /// assert_eq!(tokens.remainder(), "1 tok2");
263    ///
264    /// ```
265    pub fn tokens(&mut self, tokens: impl IntoIterator<Item = impl AsRef<str>>) -> Option<&str> {
266        for token in tokens.into_iter() {
267            if self.token(token) {
268                return Some(&self.full_input[self.span.clone()]);
269            }
270        }
271
272        None
273    }
274
275    /// Consume the next character.
276    ///
277    /// # Example
278    ///
279    /// ```rust
280    /// use simple_tokenizer::*;
281    ///
282    /// let mut tokens = "tokens".as_tokens();
283    ///
284    /// assert_eq!(tokens.char(), Some('t'));
285    /// assert_eq!(tokens.remainder(), "okens");
286    ///
287    /// ```
288    pub fn char(&mut self) -> Option<char> {
289        (!self.remaining_input.is_empty()).then(|| self.split_next_char())
290    }
291
292    /// Consume the next character if it matches a predicate.
293    ///
294    /// # Example
295    ///
296    /// ```rust
297    /// use simple_tokenizer::*;
298    ///
299    /// let mut tokens = "tokens".as_tokens();
300    ///
301    /// assert_eq!(tokens.char_if(char::is_alphabetic), Some('t'));
302    /// assert_eq!(tokens.remainder(), "okens");
303    ///
304    /// assert_eq!(tokens.char_if(char::is_numeric), None);
305    /// assert_eq!(tokens.remainder(), "okens");
306    ///
307    /// ```
308    pub fn char_if(&mut self, f: impl FnOnce(char) -> bool) -> Option<char> {
309        self.remaining_input
310            .chars()
311            .next()
312            .filter(|ch| f(*ch))
313            .map(|_| self.split_next_char())
314    }
315
316    /// Consume the next `n` bytes.
317    ///
318    /// # Example
319    ///
320    /// ```rust
321    /// use simple_tokenizer::*;
322    ///
323    /// let mut tokens = "tokens123".as_tokens();
324    ///
325    /// assert_eq!(tokens.bytes(6), Some("tokens"));
326    /// assert_eq!(tokens.remainder(), "123");
327    ///
328    /// assert_eq!(tokens.bytes(5), None);
329    /// assert_eq!(tokens.remainder(), "123");
330    ///
331    /// ```
332    pub fn bytes(&mut self, n: usize) -> Option<&str> {
333        self.remaining_input
334            .is_char_boundary(n)
335            .then(|| self.split(n))
336    }
337
338    /// Consume the next `n` bytes if they match a predicate.
339    ///
340    /// # Example
341    ///
342    /// ```rust
343    /// use simple_tokenizer::*;
344    ///
345    /// let mut tokens = "1231234".as_tokens();
346    ///
347    /// assert_eq!(tokens.bytes_if(3, |s| s.chars().all(char::is_numeric)), Some("123"));
348    /// assert_eq!(tokens.remainder(), "1234");
349    ///
350    /// assert_eq!(tokens.bytes_if(5, |s| s.chars().all(char::is_numeric)), None);
351    /// assert_eq!(tokens.remainder(), "1234");
352    ///
353    /// ```
354    pub fn bytes_if(&mut self, n: usize, f: impl FnOnce(&str) -> bool) -> Option<&str> {
355        self.remaining_input
356            .get(..n)
357            .filter(|s| f(s))
358            .map(|s| self.split(s.len()))
359    }
360
361    /// Limit the input to the next `n` bytes.
362    /// Returns `true` if successful (`n` lands on a char boundary).
363    ///
364    /// # Example
365    ///
366    /// ```rust
367    /// use simple_tokenizer::*;
368    ///
369    /// let mut tokens = "123456".as_tokens();
370    ///
371    /// assert!(tokens.limit_bytes(4));
372    /// assert_eq!(tokens.remainder(), "1234");
373    ///
374    /// ```
375    pub fn limit_bytes(&mut self, n: usize) -> bool {
376        if self.remaining_input.is_char_boundary(n) {
377            self.remaining_input = &self.remaining_input[..n];
378            true
379        } else {
380            false
381        }
382    }
383
384    /// Attempts to split the `Tokens` into two.
385    /// Similar to [`str::split_at()`](https://doc.rust-lang.org/std/primitive.str.html#method.split_at).
386    ///
387    /// # Example
388    ///
389    /// ```rust
390    /// use simple_tokenizer::*;
391    ///
392    /// let mut tokens = "1231234".as_tokens();
393    ///
394    /// let (first, second) = tokens.split_bytes(3).unwrap();
395    ///
396    /// assert_eq!(first.remainder(), "123");
397    /// assert_eq!(second.remainder(), "1234");
398    /// assert_eq!(second.offset(), Offset(3));
399    ///
400    /// ```
401    pub fn split_bytes(self, n: usize) -> Option<(Tokens<'s>, Tokens<'s>)> {
402        let mut first = self.clone();
403        let mut second = self;
404
405        if second.bytes(n).is_some() {
406            first.limit_bytes(n);
407
408            Some((first, second))
409        } else {
410            None
411        }
412    }
413
414    /// Consume the next `n` characters.
415    /// Doesn't advance if there aren't enough characters left.
416    ///
417    /// # Example
418    ///
419    /// ```rust
420    /// use simple_tokenizer::*;
421    ///
422    /// let mut tokens = "tokens123".as_tokens();
423    ///
424    /// assert_eq!(tokens.chars(6), Some("tokens"));
425    /// assert_eq!(tokens.remainder(), "123");
426    ///
427    /// assert_eq!(tokens.chars(5), None);
428    /// assert_eq!(tokens.remainder(), "123");
429    ///
430    /// ```
431    pub fn chars(&mut self, n: usize) -> Option<&str> {
432        self.remaining_input
433            .char_indices()
434            .nth(n.checked_sub(1)?)
435            .map(|(i, ch)| self.split(i + ch.len_utf8()))
436    }
437
438    /// Consume the next `n` characters if they match a predicate.
439    /// Doesn't advance if there aren't enough characters left.
440    ///
441    /// # Example
442    ///
443    /// ```rust
444    /// use simple_tokenizer::*;
445    ///
446    /// let mut tokens = "1231234".as_tokens();
447    ///
448    /// assert_eq!(tokens.chars_if(3, |s| s.chars().all(char::is_numeric)), Some("123"));
449    /// assert_eq!(tokens.remainder(), "1234");
450    ///
451    /// assert_eq!(tokens.chars_if(5, |s| s.chars().all(char::is_numeric)), None);
452    /// assert_eq!(tokens.remainder(), "1234");
453    ///
454    /// ```
455    pub fn chars_if(&mut self, n: usize, f: impl FnOnce(&str) -> bool) -> Option<&str> {
456        self.remaining_input
457            .char_indices()
458            .nth(n.checked_sub(1)?)
459            .map(|(i, ch)| &self.remaining_input[..i + ch.len_utf8()])
460            .filter(|s| f(s))
461            .map(|s| self.split(s.len()))
462    }
463
464    /// Limits the input to the next `n` characters.
465    /// Returns `true` if successful (>=n characters left in the input).
466    ///
467    /// # Example
468    ///
469    /// ```rust
470    /// use simple_tokenizer::*;
471    ///
472    /// let mut tokens = "123456".as_tokens();
473    ///
474    /// assert!(tokens.limit_chars(4));
475    /// assert_eq!(tokens.remainder(), "1234");
476    ///
477    /// ```
478    pub fn limit_chars(&mut self, n: usize) -> bool {
479        if let Some((i, _)) = self.remaining_input.char_indices().nth(n) {
480            self.remaining_input = &self.remaining_input[..i];
481            true
482        } else {
483            false
484        }
485    }
486
487    /// Attempts to split the `Tokens` into two.
488    /// Similar to [`str::split_at()`](https://doc.rust-lang.org/std/primitive.str.html#method.split_at), but `n` is in characters.
489    ///
490    /// # Example
491    ///
492    /// ```rust
493    /// use simple_tokenizer::*;
494    ///
495    /// let mut tokens = "1231234".as_tokens();
496    ///
497    /// let (first, second) = tokens.split_chars(3).unwrap();
498    ///
499    /// assert_eq!(first.remainder(), "123");
500    /// assert_eq!(second.remainder(), "1234");
501    /// assert_eq!(second.offset(), Offset(3));
502    ///
503    /// ```
504    pub fn split_chars(self, n: usize) -> Option<(Tokens<'s>, Tokens<'s>)> {
505        let mut first = self.clone();
506        let mut second = self;
507
508        if second.chars(n).is_some() {
509            first.limit_chars(n);
510
511            Some((first, second))
512        } else {
513            None
514        }
515    }
516
517    /// Consume characters while `f` returns true.
518    /// Returns the consumed substring.
519    ///
520    /// # Example
521    ///
522    /// ```rust
523    /// use simple_tokenizer::*;
524    ///
525    /// let mut tokens = "12345word".as_tokens();
526    ///
527    /// assert_eq!(tokens.take_while(char::is_numeric), "12345");
528    /// assert_eq!(tokens.remainder(), "word");
529    ///
530    /// ```
531    pub fn take_while(&mut self, mut f: impl FnMut(char) -> bool) -> &str {
532        self.remaining_input
533            .char_indices()
534            .take_while(|(_, ch)| f(*ch))
535            .last()
536            .map(|(i, ch)| self.split(i + ch.len_utf8()))
537            .unwrap_or("")
538    }
539
540    /// Limit the input to the next amount of characters, for which `f` returns `true`.
541    ///
542    /// # Example
543    ///
544    /// ```rust
545    /// use simple_tokenizer::*;
546    ///
547    /// let mut tokens = "line 1\nline 2".as_tokens();
548    ///
549    /// tokens.limit_while(|ch| ch != '\n');
550    /// assert_eq!(tokens.remainder(), "line 1");
551    ///
552    /// ```
553    pub fn limit_while(&mut self, mut f: impl FnMut(char) -> bool) {
554        if let Some((i, ch)) = self
555            .remaining_input
556            .char_indices()
557            .take_while(|(_, ch)| f(*ch))
558            .last()
559        {
560            self.remaining_input = &self.remaining_input[..i + ch.len_utf8()];
561        }
562    }
563
564    /// Attempts to split the `Tokens` into two.
565    /// Similar to [`str::split_at()`](https://doc.rust-lang.org/std/primitive.str.html#method.split_at).
566    /// The split point is determined by `f`.
567    ///
568    /// # Example
569    ///
570    /// ```rust
571    /// use simple_tokenizer::*;
572    ///
573    /// let mut tokens = "12345abcdef".as_tokens();
574    ///
575    /// let (first, second) = tokens.split_while(char::is_numeric);
576    ///
577    /// assert_eq!(first.remainder(), "12345");
578    /// assert_eq!(second.remainder(), "abcdef");
579    /// assert_eq!(second.offset(), Offset(5));
580    ///
581    /// ```
582    pub fn split_while(self, f: impl FnMut(char) -> bool) -> (Tokens<'s>, Tokens<'s>) {
583        let mut first = self.clone();
584        let mut second = self;
585
586        let n = second.take_while(f).len();
587        first.limit_bytes(n);
588
589        (first, second)
590    }
591
592    fn split(&mut self, i: usize) -> &str {
593        let (result, remainder) = self.remaining_input.split_at(i);
594
595        self.remaining_input = remainder;
596
597        self.pos.update_from_str(result);
598
599        self.offset += i;
600        self.span = self.span.end..self.offset;
601
602        result
603    }
604
605    fn split_next_char(&mut self) -> char {
606        let ch = self.remaining_input.chars().next().unwrap();
607
608        self.remaining_input = &self.remaining_input[ch.len_utf8()..];
609
610        self.offset += ch.len_utf8();
611        self.span = self.span.end..self.offset;
612
613        self.pos.update_from_char(ch);
614
615        ch
616    }
617}
618
/// Convenience trait implemented for every `T: AsRef<str>`.
pub trait AsTokens {
    /// Convenient converting to tokens instance.
    ///
    /// Returns a [`Tokens`] borrowing from `self`.
    fn as_tokens(&self) -> Tokens<'_>;
}
624
// Blanket impl: any string-like value (`&str`, `String`, ...) can be turned
// into a fresh `Tokens` that starts at the beginning of the string.
impl<T> AsTokens for T
where
    T: AsRef<str>,
{
    #[inline]
    fn as_tokens(&self) -> Tokens {
        Tokens::new(self.as_ref())
    }
}
634
// `Tokens` itself needs a dedicated impl (it is not `AsRef<str>`). Unlike
// the blanket impl above, this preserves the current cursor state instead
// of restarting from the beginning of the input.
impl<'s> AsTokens for Tokens<'s> {
    #[inline]
    fn as_tokens(&self) -> Tokens<'_> {
        // it is very cheap anyway: two `&str`s plus a few `Copy` fields
        self.clone()
    }
}