simple_cursor/lib.rs
1//! A super simple `#[no_std]`-compatible character cursor implementation geared towards
2//! lexers/tokenizers. The implementation is inspired by the one used in `rustc` and should be
3//! performant enough to handle pretty much anything you could throw at it.
4//!
5//! # Basic use
//! The following example showcases the basic features of `simple_cursor`. Please refer to
7//! [`Cursor`] for more info.
8//!
9//! ```rust
10//! use simple_cursor::Cursor;
11//!
12//! // Create the input string and the cursor.
13//! let input = "123 foobar竜<!>";
14//! let mut cursor = Cursor::new(input);
15//!
16//! // "123"
17//! let number_start = cursor.byte_pos();
18//! cursor.skip_while(|c| c.is_ascii_digit());
19//! let number_end = cursor.byte_pos();
20//!
21//! // Some(' ')
22//! let whitespace = cursor.bump();
23//!
24//! // "foobar"
25//! let ident_start = cursor.byte_pos();
26//! cursor.skip_while(|c| c.is_ascii_alphabetic());
27//! let ident_end = cursor.byte_pos();
28//!
29//! // "竜<!>"
30//! let rest_start = ident_end;
31//! let rest_end = input.len();
32//!
33//! assert_eq!("123", &input[number_start..number_end]);
34//! assert_eq!(Some(' '), whitespace);
35//! assert_eq!("foobar", &input[ident_start..ident_end]);
36//! assert_eq!("竜<!>", &input[rest_start..rest_end]);
37//! ```
38
39#![no_std]
40
41use core::str::Chars;
42
/// Abstraction over a character iterator.
///
/// A `Cursor` walks a string slice one `char` at a time while counting how many bytes of the
/// input it has consumed, so callers can slice tokens out of the original input using
/// [`Cursor::byte_pos`].
#[derive(Debug, Clone)]
pub struct Cursor<'a> {
    /// Raw character iterator over the not-yet-consumed part of the input.
    chars: Chars<'a>,
    /// Current byte position of the cursor.
    byte_pos: usize,
}

impl<'a> Cursor<'a> {
    /// Creates a new [`Cursor`] positioned at the start (byte 0) of `input`.
    pub fn new(input: &'a str) -> Self {
        Self {
            chars: input.chars(),
            byte_pos: 0,
        }
    }

    /// Immutable reference to the internal character iterator.
    ///
    /// The iterator only yields the characters that have not been consumed yet.
    pub fn chars(&self) -> &Chars<'a> {
        &self.chars
    }

    /// The current byte position of the cursor into the input string.
    pub fn byte_pos(&self) -> usize {
        self.byte_pos
    }

    /// Peeks the next character without advancing the cursor.
    ///
    /// Returns `None` when the input is exhausted.
    pub fn peek(&self) -> Option<char> {
        // Cloning a [`Chars`] iterator is cheap.
        self.chars.clone().next()
    }

    /// Peeks the next two characters without advancing the cursor.
    ///
    /// Each element is `None` if the input ends before that character.
    pub fn peek_two(&self) -> (Option<char>, Option<char>) {
        // Cloning a [`Chars`] iterator is cheap.
        let mut lookahead = self.chars.clone();
        (lookahead.next(), lookahead.next())
    }

    /// Bumps the cursor and returns the next character.
    ///
    /// The byte position grows by the UTF-8 length of the returned character. At the end of
    /// the input this returns `None` and leaves the byte position untouched.
    pub fn bump(&mut self) -> Option<char> {
        let c = self.chars.next();
        // `None` contributes zero bytes, so the position stays put at end of input.
        self.byte_pos += c.map_or(0, char::len_utf8);
        c
    }

    /// Bumps the cursor and returns the next two characters.
    ///
    /// Equivalent to calling [`Cursor::bump`] twice; each element is `None` if the input ends
    /// before that character.
    pub fn bump_two(&mut self) -> (Option<char>, Option<char>) {
        // Tuple elements are evaluated left to right, so the characters come out in order.
        (self.bump(), self.bump())
    }

    /// Bumps the cursor while `predicate` is true for the current character.
    ///
    /// Notably, this method will **not** consume the first non-matching character. This is in
    /// contrast with methods like [`Iterator::take_while`]. This behavior is achieved by peeking
    /// the next character to see if it matches before consuming it.
    ///
    /// `predicate` may be a plain function, a function pointer, or a closure, including one
    /// that captures or mutates its environment. It is called once per character inspected,
    /// in input order.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use simple_cursor::Cursor;
    ///
    /// let delimiter = ';';
    /// let mut cursor = Cursor::new("key;value");
    /// cursor.skip_while(|c| c != delimiter);
    ///
    /// assert_eq!(cursor.byte_pos(), 3);
    /// assert_eq!(cursor.peek(), Some(';'));
    /// ```
    pub fn skip_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
        // Record the remaining input bytes before skipping.
        let start_length = self.chars.as_str().len();
        // Skip while the predicate matches (without taking the first non-matching character).
        while let Some(c) = self.peek() {
            if !predicate(c) {
                break;
            }
            // Notice how this doesn't call [`Cursor::bump`] directly.
            // This way we can batch the byte_pos update.
            self.chars.next();
        }
        // Record the remaining input bytes after skipping.
        let final_length = self.chars.as_str().len();
        // Bump the byte_pos by how many bytes were skipped.
        self.byte_pos += start_length - final_length;
    }
}
124
#[cfg(test)]
mod tests {
    //! Unit tests for [`Cursor`]. Every test checks both the tracked byte position and the
    //! remaining, unconsumed input so the two can never silently drift apart.

    use super::Cursor;

    #[test]
    fn peek() {
        let input = "s";
        let cursor = Cursor::new(input);
        assert_eq!(cursor.peek(), Some('s'));
        assert_eq!(cursor.byte_pos, 0);
        assert_eq!(cursor.chars.as_str(), input);

        // Peeking a multi-byte character must not move the cursor either.
        let input = "竜";
        let cursor = Cursor::new(input);
        assert_eq!(cursor.peek(), Some('竜'));
        assert_eq!(cursor.byte_pos, 0);
        assert_eq!(cursor.chars.as_str(), input);

        let input = "";
        let cursor = Cursor::new(input);
        assert_eq!(cursor.peek(), None);
        assert_eq!(cursor.byte_pos, 0);
        assert_eq!(cursor.chars.as_str(), input);
    }

    #[test]
    fn peek_two() {
        let input = "ab";
        let cursor = Cursor::new(input);
        assert_eq!(cursor.peek_two(), (Some('a'), Some('b')));
        assert_eq!(cursor.byte_pos, 0);
        assert_eq!(cursor.chars.as_str(), input);

        let input = "a";
        let cursor = Cursor::new(input);
        assert_eq!(cursor.peek_two(), (Some('a'), None));
        assert_eq!(cursor.byte_pos, 0);
        assert_eq!(cursor.chars.as_str(), input);

        let input = "";
        let cursor = Cursor::new(input);
        assert_eq!(cursor.peek_two(), (None, None));
        assert_eq!(cursor.byte_pos, 0);
        assert_eq!(cursor.chars.as_str(), input);
    }

    #[test]
    fn bump() {
        let input = "a";
        let mut cursor = Cursor::new(input);
        assert_eq!(cursor.bump(), Some('a'));
        assert_eq!(cursor.byte_pos, 1);
        assert_eq!(cursor.chars.as_str(), "");
    }

    #[test]
    fn bump_empty() {
        // Bumping past the end yields `None` and leaves the position untouched.
        let mut cursor = Cursor::new("");
        assert_eq!(cursor.bump(), None);
        assert_eq!(cursor.byte_pos, 0);
        assert_eq!(cursor.chars.as_str(), "");
    }

    #[test]
    fn bump_multibyte() {
        // '竜' occupies three bytes in UTF-8, 'a' occupies one.
        let mut cursor = Cursor::new("竜a");
        assert_eq!(cursor.bump(), Some('竜'));
        assert_eq!(cursor.byte_pos, 3);
        assert_eq!(cursor.chars.as_str(), "a");

        assert_eq!(cursor.bump(), Some('a'));
        assert_eq!(cursor.byte_pos, 4);
        assert_eq!(cursor.chars.as_str(), "");

        assert_eq!(cursor.bump(), None);
        assert_eq!(cursor.byte_pos, 4);
        assert_eq!(cursor.chars.as_str(), "");
    }

    #[test]
    fn bump_two() {
        let input = "abc";
        let mut cursor = Cursor::new(input);
        assert_eq!(cursor.bump_two(), (Some('a'), Some('b')));
        assert_eq!(cursor.byte_pos, 2);
        assert_eq!(cursor.chars.as_str(), "c");

        assert_eq!(cursor.bump_two(), (Some('c'), None));
        assert_eq!(cursor.byte_pos, 3);
        assert_eq!(cursor.chars.as_str(), "");

        assert_eq!(cursor.bump_two(), (None, None));
        assert_eq!(cursor.byte_pos, 3);
        assert_eq!(cursor.chars.as_str(), "");
    }

    #[test]
    fn bump_two_multibyte() {
        // 'é' occupies two bytes in UTF-8, '🦀' occupies four.
        let mut cursor = Cursor::new("é🦀");
        assert_eq!(cursor.bump_two(), (Some('é'), Some('🦀')));
        assert_eq!(cursor.byte_pos, 6);
        assert_eq!(cursor.chars.as_str(), "");
    }

    #[test]
    fn skip_while() {
        let input = "aaaab";
        let mut cursor = Cursor::new(input);
        cursor.skip_while(|c| c == 'a');
        assert_eq!(cursor.byte_pos, 4);
        assert_eq!(cursor.chars.as_str(), "b");
    }

    #[test]
    fn skip_while_multibyte() {
        // Two 2-byte characters are skipped, so the position advances by four bytes.
        let mut cursor = Cursor::new("ééx");
        cursor.skip_while(|c| c == 'é');
        assert_eq!(cursor.byte_pos, 4);
        assert_eq!(cursor.chars.as_str(), "x");
    }

    #[test]
    fn skip_while_no_match() {
        let input = "baaa";
        let mut cursor = Cursor::new(input);
        cursor.skip_while(|c| c == 'a');
        assert_eq!(cursor.byte_pos, 0);
        assert_eq!(cursor.chars.as_str(), input);
    }

    #[test]
    fn skip_while_until_end() {
        let mut cursor = Cursor::new("aaa");
        cursor.skip_while(|c| c == 'a');
        assert_eq!(cursor.byte_pos, 3);
        assert_eq!(cursor.chars.as_str(), "");

        // Skipping on exhausted input is a no-op.
        cursor.skip_while(|c| c == 'a');
        assert_eq!(cursor.byte_pos, 3);
        assert_eq!(cursor.chars.as_str(), "");
    }

    #[test]
    fn skip_while_accumulates() {
        // Consecutive skips add to the byte position rather than resetting it.
        let mut cursor = Cursor::new("aabbc");
        cursor.skip_while(|c| c == 'a');
        assert_eq!(cursor.byte_pos, 2);
        cursor.skip_while(|c| c == 'b');
        assert_eq!(cursor.byte_pos, 4);
        assert_eq!(cursor.chars.as_str(), "c");
    }
}