// simple_tokenizer/lib.rs
1#![no_std]
2#![doc = include_str!(concat!(env!("CARGO_MANIFEST_DIR"), "/README.md"))]
3#![warn(missing_docs)]
4
5use core::fmt::{self, Display};
6
7#[cfg(feature = "yap")]
8pub use yap;
9
10/// Support for `yap` crate.
11#[cfg(feature = "yap")]
12pub mod yap_support;
13
14/// Byte range in the source input.
15pub type Span = core::ops::Range<usize>;
16
17/// Position (line & column) in the source input.
18#[derive(Clone, Copy, Debug, PartialEq, Eq)]
19pub struct Position {
20 /// Line count
21 pub line: u32,
22 /// Column count
23 pub column: u32,
24}
25
26impl Position {
27 /// Starting position (line = 1, column = 1).
28 #[inline]
29 pub const fn starting() -> Self {
30 Position { line: 1, column: 1 }
31 }
32
33 /// Updates the position.
34 /// If ch == '\n', increases line count and resets column count,
35 /// otherwise just increases column count.
36 ///
37 /// # Example
38 ///
39 /// ```rust
40 /// use simple_tokenizer::Position;
41 ///
42 /// let mut pos = Position::starting();
43 ///
44 /// assert_eq!(pos.update_from_char(' '), Position { line: 1, column: 2 });
45 /// assert_eq!(pos.update_from_char('\n'), Position { line: 2, column: 1 });
46 ///
47 /// ```
48 pub fn update_from_char(&mut self, ch: char) -> Self {
49 if ch == '\n' {
50 self.line += 1;
51 self.column = 1;
52 } else {
53 self.column += 1;
54 }
55
56 *self
57 }
58
59 /// Updates the position.
60 /// Identical to calling `update_from_char()` for every character of the string.
61 ///
62 /// # Example
63 ///
64 /// ```rust
65 /// use simple_tokenizer::Position;
66 ///
67 /// let mut pos = Position::starting();
68 ///
69 /// assert_eq!(pos.update_from_str("line 1\nline 2\nlong line 3"), Position { line: 3, column: 12 });
70 /// assert_eq!(pos.update_from_str(""), Position { line: 3, column: 12 });
71 /// assert_eq!(pos.update_from_str("continuation"), Position { line: 3, column: 24 });
72 ///
73 /// ```
74 pub fn update_from_str(&mut self, s: &str) -> Self {
75 let mut last_line_border = None;
76
77 let added_lines = s.bytes().enumerate().filter(|(i, b)| {
78 if *b == b'\n' {
79 last_line_border = Some(*i);
80 true
81 } else {
82 false
83 }
84 }).count() as u32;
85 self.line += added_lines;
86
87 match last_line_border {
88 Some(i) => {
89 // we had a '\n'
90
91 // i is the position of '\n' so we take i+1
92 // and we take 1+count because columns start with 1
93 self.column = 1 + s[i + 1..].chars().count() as u32;
94 }
95 None => {
96 self.column += s.chars().count() as u32;
97 }
98 }
99
100 *self
101 }
102}
103
104impl Display for Position {
105 #[inline]
106 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
107 write!(f, "[line {}, col {}]", self.line, self.column)
108 }
109}
110
111/// Byte offset in the source input.
112#[derive(Clone, Copy, Debug, PartialEq, Eq)]
113pub struct Offset(pub usize);
114
115/// Tokens instance.
116#[derive(Clone, Debug, PartialEq, Eq)]
117pub struct Tokens<'s> {
118 full_input: &'s str,
119 remaining_input: &'s str,
120 span: Span,
121 pos: Position,
122 offset: usize,
123}
124
125impl<'s> Tokens<'s> {
126 /// Construct a new instance from a string slice.
127 #[inline]
128 pub fn new(input: &'s str) -> Self {
129 Self {
130 full_input: input,
131 remaining_input: input,
132 span: 0..0,
133 pos: Position::starting(),
134 offset: 0,
135 }
136 }
137
138 /// Returns the original full input.
139 #[inline]
140 pub fn input(&self) -> &str {
141 self.full_input
142 }
143
144 /// Part of the input string that hasn't been consumed yet.
145 #[inline]
146 pub fn remainder(&self) -> &str {
147 self.remaining_input
148 }
149
150 /// Byte span of the last token.
151 #[inline]
152 pub fn span(&self) -> Span {
153 self.span.clone()
154 }
155
156 /// Current position (just after the last token).
157 #[inline]
158 pub fn position(&self) -> Position {
159 self.pos
160 }
161
162 /// Current byte offset in the source.
163 #[inline]
164 pub fn offset(&self) -> Offset {
165 Offset(self.offset)
166 }
167
168 /// Sets the offset if it is valid, updating position and span.
169 /// Returns `true` if the offset is valid, `false` otherwise.
170 pub fn set_offset(&mut self, offset: Offset) -> bool {
171 let offset = offset.0;
172
173 if self.full_input.is_char_boundary(offset) {
174 self.remaining_input = &self.full_input[offset..];
175
176 self.span = offset..offset;
177 self.pos = Position::starting().update_from_str(&self.full_input[..offset]);
178 self.offset = offset;
179 true
180 } else {
181 false
182 }
183 }
184
185 /// Returns `true` if the current position is the start of input.
186 #[inline]
187 pub fn is_at_start(&self) -> bool {
188 self.offset == 0
189 }
190
191 /// Returns `true` if the input has been exhausted.
192 #[inline]
193 pub fn is_at_end(&self) -> bool {
194 self.remaining_input.is_empty()
195 }
196
197 /// Peeks at the next character of the input.
198 #[inline]
199 pub fn peek(&self) -> Option<char> {
200 self.remaining_input.chars().next()
201 }
202
203 /// Consumes the rest of input.
204 ///
205 /// # Example
206 ///
207 /// ```rust
208 /// use simple_tokenizer::*;
209 ///
210 /// let mut tokens = "tokens".as_tokens();
211 ///
212 /// assert_eq!(tokens.consume_all(), "tokens");
213 /// assert!(tokens.remainder().is_empty());
214 ///
215 /// ```
216 #[inline]
217 pub fn consume_all(&mut self) -> &str {
218 self.split(self.remaining_input.len())
219 }
220
221 /// Consume the next substring equal to `token` or nothing.
222 /// Basically a shortcut for `bytes_if(token.len(), |s| s == token).is_some()`.
223 ///
224 /// # Example
225 ///
226 /// ```rust
227 /// use simple_tokenizer::*;
228 ///
229 /// let mut tokens = "tok1 tok2".as_tokens();
230 ///
231 /// assert!(tokens.token("tok1"));
232 /// assert_eq!(tokens.remainder(), " tok2");
233 ///
234 /// assert!(!tokens.token(" tok3"));
235 /// assert_eq!(tokens.remainder(), " tok2");
236 ///
237 /// ```
238 pub fn token(&mut self, token: impl AsRef<str>) -> bool {
239 let token = token.as_ref();
240
241 self.remaining_input
242 .get(..token.len())
243 .filter(|s| *s == token)
244 .map(|s| self.split(s.len()))
245 .is_some()
246 }
247
248 /// Try to consume a substring equal to one of the provided tokens.
249 /// Returns the first successful substring.
250 ///
251 /// # Example
252 ///
253 /// ```rust
254 /// use simple_tokenizer::*;
255 ///
256 /// let mut tokens = "tok1 tok2".as_tokens();
257 ///
258 /// assert_eq!(tokens.tokens(&["tok", "tok1"]), Some("tok"));
259 /// assert_eq!(tokens.remainder(), "1 tok2");
260 ///
261 /// assert_eq!(tokens.tokens(&["1 tok3", "2 tok2"]), None);
262 /// assert_eq!(tokens.remainder(), "1 tok2");
263 ///
264 /// ```
265 pub fn tokens(&mut self, tokens: impl IntoIterator<Item = impl AsRef<str>>) -> Option<&str> {
266 for token in tokens.into_iter() {
267 if self.token(token) {
268 return Some(&self.full_input[self.span.clone()]);
269 }
270 }
271
272 None
273 }
274
275 /// Consume the next character.
276 ///
277 /// # Example
278 ///
279 /// ```rust
280 /// use simple_tokenizer::*;
281 ///
282 /// let mut tokens = "tokens".as_tokens();
283 ///
284 /// assert_eq!(tokens.char(), Some('t'));
285 /// assert_eq!(tokens.remainder(), "okens");
286 ///
287 /// ```
288 pub fn char(&mut self) -> Option<char> {
289 (!self.remaining_input.is_empty()).then(|| self.split_next_char())
290 }
291
292 /// Consume the next character if it matches a predicate.
293 ///
294 /// # Example
295 ///
296 /// ```rust
297 /// use simple_tokenizer::*;
298 ///
299 /// let mut tokens = "tokens".as_tokens();
300 ///
301 /// assert_eq!(tokens.char_if(char::is_alphabetic), Some('t'));
302 /// assert_eq!(tokens.remainder(), "okens");
303 ///
304 /// assert_eq!(tokens.char_if(char::is_numeric), None);
305 /// assert_eq!(tokens.remainder(), "okens");
306 ///
307 /// ```
308 pub fn char_if(&mut self, f: impl FnOnce(char) -> bool) -> Option<char> {
309 self.remaining_input
310 .chars()
311 .next()
312 .filter(|ch| f(*ch))
313 .map(|_| self.split_next_char())
314 }
315
316 /// Consume the next `n` bytes.
317 ///
318 /// # Example
319 ///
320 /// ```rust
321 /// use simple_tokenizer::*;
322 ///
323 /// let mut tokens = "tokens123".as_tokens();
324 ///
325 /// assert_eq!(tokens.bytes(6), Some("tokens"));
326 /// assert_eq!(tokens.remainder(), "123");
327 ///
328 /// assert_eq!(tokens.bytes(5), None);
329 /// assert_eq!(tokens.remainder(), "123");
330 ///
331 /// ```
332 pub fn bytes(&mut self, n: usize) -> Option<&str> {
333 self.remaining_input
334 .is_char_boundary(n)
335 .then(|| self.split(n))
336 }
337
338 /// Consume the next `n` bytes if they match a predicate.
339 ///
340 /// # Example
341 ///
342 /// ```rust
343 /// use simple_tokenizer::*;
344 ///
345 /// let mut tokens = "1231234".as_tokens();
346 ///
347 /// assert_eq!(tokens.bytes_if(3, |s| s.chars().all(char::is_numeric)), Some("123"));
348 /// assert_eq!(tokens.remainder(), "1234");
349 ///
350 /// assert_eq!(tokens.bytes_if(5, |s| s.chars().all(char::is_numeric)), None);
351 /// assert_eq!(tokens.remainder(), "1234");
352 ///
353 /// ```
354 pub fn bytes_if(&mut self, n: usize, f: impl FnOnce(&str) -> bool) -> Option<&str> {
355 self.remaining_input
356 .get(..n)
357 .filter(|s| f(s))
358 .map(|s| self.split(s.len()))
359 }
360
361 /// Limit the input to the next `n` bytes.
362 /// Returns `true` if successful (`n` lands on a char boundary).
363 ///
364 /// # Example
365 ///
366 /// ```rust
367 /// use simple_tokenizer::*;
368 ///
369 /// let mut tokens = "123456".as_tokens();
370 ///
371 /// assert!(tokens.limit_bytes(4));
372 /// assert_eq!(tokens.remainder(), "1234");
373 ///
374 /// ```
375 pub fn limit_bytes(&mut self, n: usize) -> bool {
376 if self.remaining_input.is_char_boundary(n) {
377 self.remaining_input = &self.remaining_input[..n];
378 true
379 } else {
380 false
381 }
382 }
383
384 /// Attempts to split the `Tokens` into two.
385 /// Similar to [`str::split_at()`](https://doc.rust-lang.org/std/primitive.str.html#method.split_at).
386 ///
387 /// # Example
388 ///
389 /// ```rust
390 /// use simple_tokenizer::*;
391 ///
392 /// let mut tokens = "1231234".as_tokens();
393 ///
394 /// let (first, second) = tokens.split_bytes(3).unwrap();
395 ///
396 /// assert_eq!(first.remainder(), "123");
397 /// assert_eq!(second.remainder(), "1234");
398 /// assert_eq!(second.offset(), Offset(3));
399 ///
400 /// ```
401 pub fn split_bytes(self, n: usize) -> Option<(Tokens<'s>, Tokens<'s>)> {
402 let mut first = self.clone();
403 let mut second = self;
404
405 if second.bytes(n).is_some() {
406 first.limit_bytes(n);
407
408 Some((first, second))
409 } else {
410 None
411 }
412 }
413
414 /// Consume the next `n` characters.
415 /// Doesn't advance if there aren't enough characters left.
416 ///
417 /// # Example
418 ///
419 /// ```rust
420 /// use simple_tokenizer::*;
421 ///
422 /// let mut tokens = "tokens123".as_tokens();
423 ///
424 /// assert_eq!(tokens.chars(6), Some("tokens"));
425 /// assert_eq!(tokens.remainder(), "123");
426 ///
427 /// assert_eq!(tokens.chars(5), None);
428 /// assert_eq!(tokens.remainder(), "123");
429 ///
430 /// ```
431 pub fn chars(&mut self, n: usize) -> Option<&str> {
432 self.remaining_input
433 .char_indices()
434 .nth(n.checked_sub(1)?)
435 .map(|(i, ch)| self.split(i + ch.len_utf8()))
436 }
437
438 /// Consume the next `n` characters if they match a predicate.
439 /// Doesn't advance if there aren't enough characters left.
440 ///
441 /// # Example
442 ///
443 /// ```rust
444 /// use simple_tokenizer::*;
445 ///
446 /// let mut tokens = "1231234".as_tokens();
447 ///
448 /// assert_eq!(tokens.chars_if(3, |s| s.chars().all(char::is_numeric)), Some("123"));
449 /// assert_eq!(tokens.remainder(), "1234");
450 ///
451 /// assert_eq!(tokens.chars_if(5, |s| s.chars().all(char::is_numeric)), None);
452 /// assert_eq!(tokens.remainder(), "1234");
453 ///
454 /// ```
455 pub fn chars_if(&mut self, n: usize, f: impl FnOnce(&str) -> bool) -> Option<&str> {
456 self.remaining_input
457 .char_indices()
458 .nth(n.checked_sub(1)?)
459 .map(|(i, ch)| &self.remaining_input[..i + ch.len_utf8()])
460 .filter(|s| f(s))
461 .map(|s| self.split(s.len()))
462 }
463
464 /// Limits the input to the next `n` characters.
465 /// Returns `true` if successful (>=n characters left in the input).
466 ///
467 /// # Example
468 ///
469 /// ```rust
470 /// use simple_tokenizer::*;
471 ///
472 /// let mut tokens = "123456".as_tokens();
473 ///
474 /// assert!(tokens.limit_chars(4));
475 /// assert_eq!(tokens.remainder(), "1234");
476 ///
477 /// ```
478 pub fn limit_chars(&mut self, n: usize) -> bool {
479 if let Some((i, _)) = self.remaining_input.char_indices().nth(n) {
480 self.remaining_input = &self.remaining_input[..i];
481 true
482 } else {
483 false
484 }
485 }
486
487 /// Attempts to split the `Tokens` into two.
488 /// Similar to [`str::split_at()`](https://doc.rust-lang.org/std/primitive.str.html#method.split_at), but `n` is in characters.
489 ///
490 /// # Example
491 ///
492 /// ```rust
493 /// use simple_tokenizer::*;
494 ///
495 /// let mut tokens = "1231234".as_tokens();
496 ///
497 /// let (first, second) = tokens.split_chars(3).unwrap();
498 ///
499 /// assert_eq!(first.remainder(), "123");
500 /// assert_eq!(second.remainder(), "1234");
501 /// assert_eq!(second.offset(), Offset(3));
502 ///
503 /// ```
504 pub fn split_chars(self, n: usize) -> Option<(Tokens<'s>, Tokens<'s>)> {
505 let mut first = self.clone();
506 let mut second = self;
507
508 if second.chars(n).is_some() {
509 first.limit_chars(n);
510
511 Some((first, second))
512 } else {
513 None
514 }
515 }
516
517 /// Consume characters while `f` returns true.
518 /// Returns the consumed substring.
519 ///
520 /// # Example
521 ///
522 /// ```rust
523 /// use simple_tokenizer::*;
524 ///
525 /// let mut tokens = "12345word".as_tokens();
526 ///
527 /// assert_eq!(tokens.take_while(char::is_numeric), "12345");
528 /// assert_eq!(tokens.remainder(), "word");
529 ///
530 /// ```
531 pub fn take_while(&mut self, mut f: impl FnMut(char) -> bool) -> &str {
532 self.remaining_input
533 .char_indices()
534 .take_while(|(_, ch)| f(*ch))
535 .last()
536 .map(|(i, ch)| self.split(i + ch.len_utf8()))
537 .unwrap_or("")
538 }
539
540 /// Limit the input to the next amount of characters, for which `f` returns `true`.
541 ///
542 /// # Example
543 ///
544 /// ```rust
545 /// use simple_tokenizer::*;
546 ///
547 /// let mut tokens = "line 1\nline 2".as_tokens();
548 ///
549 /// tokens.limit_while(|ch| ch != '\n');
550 /// assert_eq!(tokens.remainder(), "line 1");
551 ///
552 /// ```
553 pub fn limit_while(&mut self, mut f: impl FnMut(char) -> bool) {
554 if let Some((i, ch)) = self
555 .remaining_input
556 .char_indices()
557 .take_while(|(_, ch)| f(*ch))
558 .last()
559 {
560 self.remaining_input = &self.remaining_input[..i + ch.len_utf8()];
561 }
562 }
563
564 /// Attempts to split the `Tokens` into two.
565 /// Similar to [`str::split_at()`](https://doc.rust-lang.org/std/primitive.str.html#method.split_at).
566 /// The split point is determined by `f`.
567 ///
568 /// # Example
569 ///
570 /// ```rust
571 /// use simple_tokenizer::*;
572 ///
573 /// let mut tokens = "12345abcdef".as_tokens();
574 ///
575 /// let (first, second) = tokens.split_while(char::is_numeric);
576 ///
577 /// assert_eq!(first.remainder(), "12345");
578 /// assert_eq!(second.remainder(), "abcdef");
579 /// assert_eq!(second.offset(), Offset(5));
580 ///
581 /// ```
582 pub fn split_while(self, f: impl FnMut(char) -> bool) -> (Tokens<'s>, Tokens<'s>) {
583 let mut first = self.clone();
584 let mut second = self;
585
586 let n = second.take_while(f).len();
587 first.limit_bytes(n);
588
589 (first, second)
590 }
591
592 fn split(&mut self, i: usize) -> &str {
593 let (result, remainder) = self.remaining_input.split_at(i);
594
595 self.remaining_input = remainder;
596
597 self.pos.update_from_str(result);
598
599 self.offset += i;
600 self.span = self.span.end..self.offset;
601
602 result
603 }
604
605 fn split_next_char(&mut self) -> char {
606 let ch = self.remaining_input.chars().next().unwrap();
607
608 self.remaining_input = &self.remaining_input[ch.len_utf8()..];
609
610 self.offset += ch.len_utf8();
611 self.span = self.span.end..self.offset;
612
613 self.pos.update_from_char(ch);
614
615 ch
616 }
617}
618
619/// Convenience trait implemented for every `T: AsRef<str>`.
620pub trait AsTokens {
621 /// Convenient converting to tokens instance.
622 fn as_tokens(&self) -> Tokens<'_>;
623}
624
625impl<T> AsTokens for T
626where
627 T: AsRef<str>,
628{
629 #[inline]
630 fn as_tokens(&self) -> Tokens {
631 Tokens::new(self.as_ref())
632 }
633}
634
635impl<'s> AsTokens for Tokens<'s> {
636 #[inline]
637 fn as_tokens(&self) -> Tokens<'_> {
638 // it is very cheap anyway
639 self.clone()
640 }
641}