tokenise/lib.rs
1//! # Tokenise
2//!
3//! A flexible lexical analyser (tokeniser) for parsing text into configurable token types.
4//!
5//! `tokenise` allows you to split text into tokens based on customisable rules for special characters,
6//! delimiters, and comments. It's designed to be flexible enough to handle various syntax styles
7//! while remaining simple to configure.
8//!
9//! ## Basic Usage
10//!
11//! The following example demonstrates how to configure a tokeniser with common syntax elements
12//! and process a simple code snippet:
13//!
14//! ```
15//! use tokenise::Tokeniser;
16//!
17//! fn main() {
18//! // Create a new tokeniser
19//! let mut tokeniser = Tokeniser::new();
20//!
21//! // Configure tokeniser with rules
22//! tokeniser.add_specials(".,;:!?");
23//! tokeniser.add_delimiter_pairs(&vec!["()", "[]", "{}"]).unwrap();
24//! tokeniser.add_balanced_delimiter("\"").unwrap();
25//! tokeniser.set_sl_comment("//").unwrap();
26//! tokeniser.set_ml_comment("/*", "*/").unwrap();
27//!
28//! // Tokenise some source text
29//! let source = "let x = 42; // The answer\nprint(\"Hello world!\");";
30//! let tokens = tokeniser.tokenise(source).unwrap();
31//!
32//! // Work with the resulting tokens
33//! for token in tokens {
34//! println!("{:?}: '{}'", token.get_state(), token.value());
35//! }
36//! }
37//! ```
38//!
39//! ## Features
40//!
41//! - Unicode support (using grapheme clusters)
42//! - Configurable special characters and delimiters
43//! - Support for paired delimiters (e.g., parentheses, brackets)
44//! - Support for balanced delimiters (e.g., quotation marks)
45//! - Single-line and multi-line comment handling
46//! - Whitespace and newline preservation
47//!
48//! ## Token Types
49//!
50//! The tokeniser recognises several token types represented by the `TokenState` enum:
51//!
52//! - `Word`: Non-special character sequences (anything not identified as a special character or whitespace)
53//! - `LDelimiter`/`RDelimiter`: Left/right delimiters of a pair (e.g., '(', ')')
54//! - `BDelimiter`: Balanced delimiters (e.g., quotation marks)
55//! - `SymbolString`: Special characters
56//! - `NewLine`: Line breaks
57//! - `WhiteSpace`: Spaces, tabs, etc.
58//! - `SLComment`: Single-line comments
59//! - `MLComment`: Multi-line comments
60//!
61//! More precise definitions can be found in the documentation for each specific type.
62
63use unicode_segmentation::UnicodeSegmentation;
64
// TODO: add multi-character Parenthesis
/// Represents the type of a token in the tokenisation process.
///
/// Each token in the parsed text is classified as one of these types,
/// which determines how it is interpreted and processed.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum TokenState {
    /// A sequence of non-special characters (excluding whitespace).
    Word,

    /// A left delimiter of a pair (e.g., opening bracket).
    LDelimiter,

    /// A right delimiter of a pair (e.g., closing bracket).
    RDelimiter,

    /// A balanced delimiter that can open or close (e.g., quotation mark).
    BDelimiter,

    /// A sequence of special characters not recognised as delimiters or comments.
    SymbolString,

    /// A newline character sequence (\n, \r, or \r\n).
    NewLine,

    /// A sequence of whitespace characters (excluding newlines).
    WhiteSpace,

    /// A single-line comment.
    SLComment,

    /// A multi-line comment.
    MLComment
}
99use TokenState::*;
100
/// Represents the categorisation of delimiters into left, right, or balanced types.
///
/// This is used to classify delimiters when tokenising text:
/// - Left delimiters open a section (e.g., opening brackets)
/// - Right delimiters close a section (e.g., closing brackets)
/// - Balanced delimiters can serve as both opening and closing (e.g., quotation marks)
///
/// Derives `PartialEq`/`Eq` so sides can be compared directly when validating
/// delimiter registration.
#[derive(Debug, PartialEq, Eq)]
pub enum Side {
    /// A right (closing) delimiter such as ')', ']', or '}'.
    Right,

    /// A left (opening) delimiter such as '(', '[', or '{'.
    Left,

    /// A balanced delimiter that can both open and close, such as '"'.
    Bal
}
118
119/// Checks if a string is exactly one grapheme cluster (user-perceived character).
120///
121/// # Examples
122/// ```
123/// assert!(tokenise::is_grapheme("a"));
124/// assert!(tokenise::is_grapheme("π¨βπ»"));
125/// assert!(tokenise::is_grapheme("\r\n"));
126/// assert!(!tokenise::is_grapheme("ab"));
127/// ```
128pub fn is_grapheme(s: &str) -> bool {
129 s.graphemes(true).collect::<Vec<_>>().len() == 1
130}
131
/// Checks if a string consists entirely of whitespace.
///
/// The empty string counts as whitespace (it contains no non-whitespace).
///
/// # Examples
/// ```
/// assert!(tokenise::is_whitespace(" \t"));
/// assert!(!tokenise::is_whitespace("a "));
/// ```
pub fn is_whitespace(c: &str) -> bool {
    // `char::is_whitespace` is the same predicate `str::trim` uses, so this
    // is equivalent to `c.trim().is_empty()`.
    c.chars().all(char::is_whitespace)
}
142
/// Represents a token extracted from the source text during tokenisation.
///
/// Each token has a state (type), a string value borrowed from the original
/// text, and a position in the original text.
#[derive(Debug, PartialEq, Clone, Copy)]
pub struct Token<'a> {
    /// The type of this token.
    state: TokenState,

    /// The string content of this token (a slice of the original text).
    val: &'a str,

    /// The starting position of this token in the original text, as a byte
    /// offset (positions come from `grapheme_indices`, which yields byte
    /// offsets usable for slicing, not character counts).
    start_pos: usize
}
157
impl<'a> Token<'a> {
    /// Returns the starting position of this token in the original text
    /// (a byte offset into the source string).
    pub fn start(&self) -> usize {
        self.start_pos
    }

    /// Returns the string content of this token (a slice of the source text,
    /// living as long as the source).
    pub fn value(&self) -> &'a str {
        self.val
    }

    /// Returns the state (type) of this token.
    pub fn get_state(&self) -> TokenState {
        self.state
    }
}
174
/// A configurable tokeniser for parsing text into meaningful tokens.
///
/// The `Tokeniser` can be customised with special characters, delimiter pairs,
/// balanced delimiters, and comment markers to suit different syntax requirements.
/// Once configured, it can parse text into tokens according to those rules.
///
/// Note that delimiters and the characters in comment markers are automatically
/// treated as special characters, but with additional distinctions in how they're
/// processed during tokenisation.
///
/// # Examples
///
/// ```
/// use tokenise::Tokeniser;
///
/// // Create and configure a tokeniser for a C-like language
/// let mut tokeniser = Tokeniser::new();
/// tokeniser.add_specials("+-*/=<>!&|^~%");
/// tokeniser.add_delimiter_pairs(&vec!["()", "[]", "{}"]).unwrap();
/// tokeniser.set_sl_comment("//").unwrap();
/// tokeniser.set_ml_comment("/*", "*/").unwrap();
///
/// // Tokenise some code
/// let code = "int main() { // Entry point\n    return 0;\n}";
/// let tokens = tokeniser.tokenise(code).unwrap();
/// ```
pub struct Tokeniser {
    /// Single graphemes treated as special; includes every delimiter and
    /// every character appearing in a comment marker.
    special_characters: Vec<String>,
    /// Registered (left, right) delimiter pairs, e.g. ("(", ")").
    delimiter_pairs: Vec<(String, String)>,
    /// Delimiters that both open and close, e.g. the double quote.
    balanced_delimiters: Vec<String>,
    /// Marker that starts a single-line comment, if configured.
    single_line_comment: Option<String>,
    /// (start, end) markers for multi-line comments, if configured.
    multi_line_comment: Option<(String, String)>
}
208
209impl Tokeniser {
210 /// Creates a new, unconfigured `Tokeniser`.
211 ///
212 /// This constructor creates a tokeniser with no special characters, delimiters, or comment markers.
213 /// You'll need to configure it with the appropriate methods before it's ready for use.
214 ///
215 /// # Examples
216 ///
217 /// ```
218 /// use tokenise::Tokeniser;
219 ///
220 /// let mut tokeniser = Tokeniser::new();
221 /// // Configure the tokeniser...
222 /// ```
223 pub fn new() -> Self {
224 Self {
225 special_characters: Vec::new(),
226 delimiter_pairs: Vec::new(),
227 balanced_delimiters: Vec::new(),
228 single_line_comment: None,
229 multi_line_comment: None,
230 }
231 }
232
233 /// Adds a single special character to the tokeniser.
234 ///
235 /// Special characters are treated differently from regular text during tokenisation.
236 /// They form `SymbolString` tokens unless they're also configured as delimiters or
237 /// used in comment markers.
238 ///
239 /// # Arguments
240 ///
241 /// * `special` - The special character to add, which must be a single grapheme
242 ///
243 /// # Returns
244 ///
245 /// * `Ok(())` if the character was added successfully
246 /// * `Err(String)` if the input is not a single grapheme
247 ///
248 /// # Examples
249 ///
250 /// ```
251 /// use tokenise::Tokeniser;
252 ///
253 /// let mut tokeniser = Tokeniser::new();
254 /// tokeniser.add_special("@").unwrap();
255 /// tokeniser.add_special("+").unwrap();
256 ///
257 /// // Unicode graphemes are supported
258 /// tokeniser.add_special("π¨βπ»").unwrap();
259 ///
260 /// // This would fail as it's not a single grapheme
261 /// assert!(tokeniser.add_special("abc").is_err());
262 /// ```
263 pub fn add_special(&mut self, special: &str) -> Result<(),String> {
264 if !is_grapheme(special) {
265 Err(format!("string {:?} is not a single grapheme",special))
266 } else {
267 if !self.special(special) {
268 self.special_characters.push(special.to_string());
269 }
270 Ok(())
271 }
272 }
273
274 /// Adds multiple special characters to the tokeniser.
275 ///
276 /// This is a convenience method that adds each grapheme in the input string
277 /// as a special character.
278 ///
279 /// # Arguments
280 ///
281 /// * `specials` - A string containing the special characters to add
282 ///
283 /// # Examples
284 ///
285 /// ```
286 /// use tokenise::Tokeniser;
287 ///
288 /// let mut tokeniser = Tokeniser::new();
289 /// tokeniser.add_specials("+-*/=<>!&|^~%");
290 /// ```
291 pub fn add_specials(&mut self, specials: &str) {
292 for c in specials.graphemes(true) {
293 self.add_special(c).unwrap();
294 }
295 }
296
297 /// Adds a pair of left and right delimiters to the tokeniser.
298 ///
299 /// Delimiter pairs are used to mark the beginning and end of sections in text,
300 /// such as parentheses, brackets, and braces. During tokenisation, they are
301 /// classified as `LDelimiter` and `RDelimiter` respectively.
302 ///
303 /// Both characters are automatically added as special characters if they aren't already.
304 ///
305 /// # Arguments
306 ///
307 /// * `left` - The left (opening) delimiter, which must be a single grapheme
308 /// * `right` - The right (closing) delimiter, which must be a single grapheme
309 ///
310 /// # Returns
311 ///
312 /// * `Ok(())` if the delimiter pair was added successfully
313 /// * `Err(String)` if either character is not a single grapheme, or if either
314 /// character is already used as a different type of delimiter
315 ///
316 /// # Examples
317 ///
318 /// ```
319 /// use tokenise::Tokeniser;
320 ///
321 /// let mut tokeniser = Tokeniser::new();
322 /// tokeniser.add_delimiter_pair("(", ")").unwrap();
323 /// tokeniser.add_delimiter_pair("[", "]").unwrap();
324 /// tokeniser.add_delimiter_pair("{", "}").unwrap();
325 ///
326 /// // Unicode delimiters are supported
327 /// tokeniser.add_delimiter_pair("γ", "γ").unwrap();
328 /// ```
329 pub fn add_delimiter_pair(&mut self, left: &str, right: &str) -> Result<(),String> {
330 if !is_grapheme(left) {
331 Err(format!("string {:?} is not a single grapheme",left))
332 } else if !is_grapheme(right) {
333 Err(format!("string {:?} is not a single grapheme",right))
334 } else {
335 match (self.delimiter(left),self.delimiter(right)) {
336 (None, None) => {
337 self.add_special(left).unwrap();
338 self.add_special(right).unwrap();
339 self.delimiter_pairs.push((left.to_string(),right.to_string()));
340 },
341 (None, Some(_)) => {
342 return Err(format!("right delimiter {right:?} is already a delimiter of type {:?} with other pair", self.delimiter(right).unwrap()));
343 },
344 (Some(_), None) => {
345 return Err(format!("left delimiter {left:?} is already a delimiter of type {:?} with other pair", self.delimiter(left).unwrap()));
346 },
347 (Some(l), Some(r)) => {
348 match l {
349 Side::Right => {
350 return Err(format!("left delimiter {left:?} is already a delimiter of type {:?} with other pair", Side::Right));
351 },
352 Side::Left => {},
353 Side::Bal => {
354 return Err(format!("left delimiter {left:?} is already a delimiter of type {:?} with other pair", Side::Bal));
355 },
356 }
357 match r {
358 Side::Right => {},
359 Side::Left => {
360 return Err(format!("right delimiter {right:?} is already a delimiter of type {:?} with other pair", Side::Left));
361 },
362 Side::Bal => {
363 return Err(format!("right delimiter {right:?} is already a delimiter of type {:?} with other pair", Side::Bal));
364 },
365 }
366 },
367 }
368 Ok(())
369 }
370 }
371
372 /// Adds multiple delimiter pairs to the tokeniser.
373 ///
374 /// Each pair should be represented as a string containing exactly two graphemes,
375 /// where the first is the left delimiter and the second is the right delimiter.
376 ///
377 /// Each character is automatically added as a special character if it isn't already.
378 ///
379 /// # Arguments
380 ///
381 /// * `delimiter_pairs` - A vector of strings, each containing exactly two graphemes
382 ///
383 /// # Returns
384 ///
385 /// * `Ok(())` if all delimiter pairs were added successfully
386 /// * `Err(String)` if any string doesn't contain exactly two graphemes, or if any
387 /// character is already used as a different type of delimiter
388 ///
389 /// # Examples
390 ///
391 /// ```
392 /// use tokenise::Tokeniser;
393 ///
394 /// let mut tokeniser = Tokeniser::new();
395 /// tokeniser.add_delimiter_pairs(&vec!["()", "[]", "{}"]).unwrap();
396 /// ```
397 pub fn add_delimiter_pairs(&mut self, delimiter_pairs: &Vec<&str>) -> Result<(),String> {
398 for &s in delimiter_pairs {
399 let v = s.graphemes(true).collect::<Vec<_>>();
400 if v.len() != 2 {
401 return Err(format!("delimiter pair {s:?} must be made up of 2 graphemes"));
402 }
403 let [left,right] = v.try_into().unwrap();
404 match self.add_delimiter_pair(left, right) {
405 Ok(_) => {},
406 Err(x) => {
407 return Err(x);
408 },
409 }
410 }
411 Ok(())
412 }
413
414 /// Adds a balanced delimiter to the tokeniser.
415 ///
416 /// Balanced delimiters are characters that serve as both opening and closing markers,
417 /// such as quotation marks. During tokenisation, they are classified as `BDelimiter`.
418 ///
419 /// The character is automatically added as a special character if it isn't already.
420 ///
421 /// # Arguments
422 ///
423 /// * `delim` - The balanced delimiter, which must be a single grapheme
424 ///
425 /// # Returns
426 ///
427 /// * `Ok(())` if the delimiter was added successfully
428 /// * `Err(String)` if the character is not a single grapheme, or if it is already used
429 /// as a different type of delimiter
430 ///
431 /// # Examples
432 ///
433 /// ```
434 /// use tokenise::Tokeniser;
435 ///
436 /// let mut tokeniser = Tokeniser::new();
437 /// tokeniser.add_balanced_delimiter("\"").unwrap(); // Double quote
438 /// tokeniser.add_balanced_delimiter("'").unwrap(); // Single quote
439 /// tokeniser.add_balanced_delimiter("`").unwrap(); // Backtick
440 /// ```
441 pub fn add_balanced_delimiter(&mut self, delim: &str) -> Result<(),String> {
442 if !is_grapheme(delim) {
443 Err(format!("string {:?} is not a single grapheme",delim))
444 } else {
445 match self.delimiter(delim) {
446 Some(side) => {
447 match side {
448 Side::Right => {
449 return Err(format!("balanced delimiter {delim:?} is already a delimiter of type {:?} with other pair", Side::Right));
450 },
451 Side::Left => {
452 return Err(format!("balanced delimiter {delim:?} is already a delimiter of type {:?} with other pair", Side::Left));
453 },
454 Side::Bal => {},
455 }
456 },
457 None => {
458 self.add_special(delim).unwrap();
459 self.balanced_delimiters.push(delim.to_string());
460 },
461 }
462 Ok(())
463 }
464 }
465
466 /// Adds multiple balanced delimiters to the tokeniser.
467 ///
468 /// Each character in the input string is added as a balanced delimiter.
469 /// The characters are automatically added as special characters if they aren't already.
470 ///
471 /// # Arguments
472 ///
473 /// * `delims` - A string containing the balanced delimiters to add
474 ///
475 /// # Returns
476 ///
477 /// * `Ok(())` if all delimiters were added successfully
478 /// * `Err(String)` if any character is already used as a different type of delimiter
479 ///
480 /// # Examples
481 ///
482 /// ```
483 /// use tokenise::Tokeniser;
484 ///
485 /// let mut tokeniser = Tokeniser::new();
486 /// tokeniser.add_balanced_delimiters("\"'`").unwrap(); // Adds ", ', and ` as balanced delimiters
487 /// ```
488 pub fn add_balanced_delimiters(&mut self, delims: &str) -> Result<(),String> {
489 for delim in delims.graphemes(true) {
490 match self.add_balanced_delimiter(delim) {
491 Ok(_) => {},
492 Err(x) => {
493 return Err(x);
494 }
495 }
496 }
497 Ok(())
498 }
499
500 /// Sets the marker for single-line comments.
501 ///
502 /// Single-line comments run from the marker to the end of the line.
503 /// During tokenisation, they are classified as `SLComment`.
504 ///
505 /// All characters in the comment marker are automatically added as special characters.
506 ///
507 /// # Arguments
508 ///
509 /// * `comm` - The single-line comment marker (e.g., "//")
510 ///
511 /// # Returns
512 ///
513 /// * `Ok(())` if the marker was set successfully
514 /// * `Err(String)` if the marker is an empty string
515 ///
516 /// # Examples
517 ///
518 /// ```
519 /// use tokenise::Tokeniser;
520 ///
521 /// let mut tokeniser = Tokeniser::new();
522 /// tokeniser.set_sl_comment("//").unwrap(); // C/C++/Rust style
523 ///
524 /// // Could also use other styles
525 /// // tokeniser.set_sl_comment("#").unwrap(); // Python/Ruby style
526 /// // tokeniser.set_sl_comment("--").unwrap(); // SQL/Lua style
527 /// ```
528 pub fn set_sl_comment(&mut self, comm: &str) -> Result<(),String> {
529 if comm.len() == 0 {
530 Err(format!("Empty string cannot be the start of a single line comment"))
531 } else {
532 self.add_specials(comm);
533 self.single_line_comment = Some(comm.to_string());
534 Ok(())
535 }
536 }
537
538 /// Sets the markers for multi-line comments.
539 ///
540 /// Multi-line comments run from the start marker to the end marker,
541 /// potentially spanning multiple lines. During tokenisation, they
542 /// are classified as `MLComment`.
543 ///
544 /// All characters in both comment markers are automatically added as special characters.
545 ///
546 /// # Arguments
547 ///
548 /// * `left` - The start marker for multi-line comments (e.g., "/*")
549 /// * `right` - The end marker for multi-line comments (e.g., "*/")
550 ///
551 /// # Returns
552 ///
553 /// * `Ok(())` if the markers were set successfully
554 /// * `Err(String)` if either marker is an empty string
555 ///
556 /// # Examples
557 ///
558 /// ```
559 /// use tokenise::Tokeniser;
560 ///
561 /// let mut tokeniser = Tokeniser::new();
562 /// tokeniser.set_ml_comment("/*", "*/").unwrap(); // C/C++/Rust style
563 ///
564 /// // Could also use other styles
565 /// // tokeniser.set_ml_comment("<!--", "-->").unwrap(); // HTML/XML style
566 /// // tokeniser.set_ml_comment("{-", "-}").unwrap(); // Haskell style
567 /// ```
568 ///
569 /// # Warning
570 ///
571 /// Be cautious with comment markers that contain alphanumeric characters (like words).
572 /// Since all characters in comment markers are added as special characters, using
573 /// word-based markers may cause unexpected tokenisation of normal text:
574 ///
575 /// ```
576 /// use tokenise::Tokeniser;
577 ///
578 /// // Not recommended - would treat the letters in "begin" and "end" as special characters
579 /// let mut tokeniser = Tokeniser::new();
580 /// tokeniser.set_ml_comment("=begin", "=end").unwrap(); // Ruby style
581 /// ```
582 pub fn set_ml_comment(&mut self, left: &str, right: &str) -> Result<(),String> {
583 if left.len() == 0 {
584 Err(format!("Empty string cannot be the start of a multi-line comment"))
585 } else if right.len() == 0 {
586 Err(format!("Empty string cannot be the end of a multi-line comment"))
587 } else {
588 self.add_specials(left);
589 self.add_specials(right);
590 self.multi_line_comment = Some((left.to_string(),right.to_string()));
591 Ok(())
592 }
593 }
594
595 /// Returns a vector of all registered special characters.
596 ///
597 /// # Returns
598 ///
599 /// A vector of string slices, each containing one special character.
600 ///
601 /// # Examples
602 ///
603 /// ```
604 /// use tokenise::Tokeniser;
605 ///
606 /// let mut tokeniser = Tokeniser::new();
607 /// tokeniser.add_specials("+-*/");
608 ///
609 /// let specials = tokeniser.specials();
610 /// assert!(specials.contains(&"+"));
611 /// assert!(specials.contains(&"-"));
612 /// ```
613 pub fn specials<'a>(&'a self) -> Vec<&'a str> {
614 self.special_characters
615 .iter()
616 .map(|x|x.as_str())
617 .collect()
618 }
619
620 /// Returns a vector of all registered left-right delimiter pairs.
621 ///
622 /// # Returns
623 ///
624 /// A vector of tuples, each containing a left delimiter and its corresponding right delimiter.
625 ///
626 /// # Examples
627 ///
628 /// ```
629 /// use tokenise::Tokeniser;
630 ///
631 /// let mut tokeniser = Tokeniser::new();
632 /// tokeniser.add_delimiter_pairs(&vec!["()", "[]"]).unwrap();
633 ///
634 /// let delimiters = tokeniser.lr_delimiters();
635 /// assert!(delimiters.contains(&("(", ")")));
636 /// ```
637 pub fn lr_delimiters<'a>(&'a self) -> Vec<(&'a str, &'a str)> {
638 self.delimiter_pairs
639 .iter()
640 .map(|(x,y)|(x.as_str(),y.as_str()))
641 .collect()
642 }
643
644 /// Returns a vector of all registered balanced delimiters.
645 ///
646 /// # Returns
647 ///
648 /// A vector of string slices, each containing one balanced delimiter.
649 ///
650 /// # Examples
651 ///
652 /// ```
653 /// use tokenise::Tokeniser;
654 ///
655 /// let mut tokeniser = Tokeniser::new();
656 /// tokeniser.add_balanced_delimiters("\"'").unwrap();
657 ///
658 /// let delimiters = tokeniser.bal_delimiters();
659 /// assert!(delimiters.contains(&"\""));
660 /// assert!(delimiters.contains(&"'"));
661 /// ```
662 pub fn bal_delimiters<'a>(&'a self) -> Vec<&'a str> {
663 self.balanced_delimiters
664 .iter()
665 .map(|x|x.as_str())
666 .collect()
667 }
668
669 /// Returns the configured single-line comment marker, if any.
670 ///
671 /// # Returns
672 ///
673 /// An `Option` containing the single-line comment marker, or `None` if not configured.
674 ///
675 /// # Examples
676 ///
677 /// ```
678 /// use tokenise::Tokeniser;
679 ///
680 /// let mut tokeniser = Tokeniser::new();
681 /// assert_eq!(tokeniser.sl_comment(), None);
682 ///
683 /// tokeniser.set_sl_comment("//").unwrap();
684 /// assert_eq!(tokeniser.sl_comment(), Some("//"));
685 /// ```
686 pub fn sl_comment<'a>(&'a self) -> Option<&'a str> {
687 self.single_line_comment
688 .iter()
689 .map(|x| x.as_str())
690 .next()
691 }
692
693 /// Returns the configured multi-line comment markers, if any.
694 ///
695 /// # Returns
696 ///
697 /// An `Option` containing a tuple of the start and end markers for multi-line comments,
698 /// or `None` if not configured.
699 ///
700 /// # Examples
701 ///
702 /// ```
703 /// use tokenise::Tokeniser;
704 ///
705 /// let mut tokeniser = Tokeniser::new();
706 /// assert_eq!(tokeniser.ml_comment(), None);
707 ///
708 /// tokeniser.set_ml_comment("/*", "*/").unwrap();
709 /// assert_eq!(tokeniser.ml_comment(), Some(("/*", "*/")));
710 /// ```
711 pub fn ml_comment<'a>(&'a self) -> Option<(&'a str, &'a str)> {
712 self.multi_line_comment
713 .iter()
714 .map(|(x,y)|(x.as_str(),y.as_str()))
715 .next()
716 }
717
718 /// Checks if a character is registered as a special character.
719 ///
720 /// Special characters include those explicitly added via `add_special`/`add_specials`,
721 /// as well as any characters used in delimiters or comment markers.
722 ///
723 /// # Arguments
724 ///
725 /// * `c` - The character to check
726 ///
727 /// # Returns
728 ///
729 /// `true` if the character is registered as a special character, `false` otherwise.
730 ///
731 /// # Examples
732 ///
733 /// ```
734 /// use tokenise::Tokeniser;
735 ///
736 /// let mut tokeniser = Tokeniser::new();
737 /// tokeniser.add_special("+").unwrap();
738 /// tokeniser.add_delimiter_pair("(", ")").unwrap();
739 ///
740 /// assert!(tokeniser.special("+")); // Explicitly added special
741 /// assert!(tokeniser.special("(")); // Special because it's a delimiter
742 /// assert!(!tokeniser.special("-")); // Not registered as special
743 /// ```
744 pub fn special(&self, c: &str) -> bool {
745 for x in self.specials() {
746 if x == c {
747 return true;
748 }
749 }
750 false
751 }
752
753 /// Checks if a character is registered as a delimiter and returns its type.
754 ///
755 /// # Arguments
756 ///
757 /// * `c` - The character to check
758 ///
759 /// # Returns
760 ///
761 /// * `Some(Side::Left)` if the character is a left delimiter
762 /// * `Some(Side::Right)` if the character is a right delimiter
763 /// * `Some(Side::Bal)` if the character is a balanced delimiter
764 /// * `None` if the character is not a delimiter
765 ///
766 /// # Examples
767 ///
768 /// ```
769 /// use tokenise::{Tokeniser, Side};
770 ///
771 /// let mut tokeniser = Tokeniser::new();
772 /// tokeniser.add_delimiter_pair("(", ")").unwrap();
773 /// tokeniser.add_balanced_delimiter("\"").unwrap();
774 ///
775 /// assert_eq!(tokeniser.delimiter("("), Some(Side::Left));
776 /// assert_eq!(tokeniser.delimiter(")"), Some(Side::Right));
777 /// assert_eq!(tokeniser.delimiter("\""), Some(Side::Bal));
778 /// assert_eq!(tokeniser.delimiter("a"), None);
779 /// ```
780 pub fn delimiter<'g>(&self, c: &'g str) -> Option<Side> {
781 for (x,y) in self.lr_delimiters() {
782 if x == c {
783 return Some(Side::Left);
784 }
785 if y == c {
786 return Some(Side::Right);
787 }
788 }
789 for x in self.bal_delimiters() {
790 if x == c {
791 return Some(Side::Bal);
792 }
793 }
794 None
795 }
796
797 /// Checks if a string is the configured single-line comment marker.
798 ///
799 /// # Arguments
800 ///
801 /// * `s` - The string to check
802 ///
803 /// # Returns
804 ///
805 /// `true` if the string exactly matches the configured single-line comment marker,
806 /// `false` otherwise or if no single-line comment marker is configured.
807 ///
808 /// # Examples
809 ///
810 /// ```
811 /// use tokenise::Tokeniser;
812 ///
813 /// let mut tokeniser = Tokeniser::new();
814 /// tokeniser.set_sl_comment("//").unwrap();
815 ///
816 /// assert!(tokeniser.is_sl_comment_start("//"));
817 /// assert!(!tokeniser.is_sl_comment_start("/"));
818 /// ```
819 pub fn is_sl_comment_start(&self, s: &str) -> bool {
820 match self.sl_comment() {
821 None => false,
822 Some(sl_comment) => s == sl_comment
823 }
824 }
825
826 /// Checks if a string ends with the configured single-line comment marker.
827 ///
828 /// This is used during tokenisation to detect when a series of special characters
829 /// transitions into a comment.
830 ///
831 /// # Arguments
832 ///
833 /// * `s` - The string to check
834 ///
835 /// # Returns
836 ///
837 /// `true` if the string ends with the configured single-line comment marker,
838 /// `false` otherwise or if no single-line comment marker is configured.
839 ///
840 /// # Examples
841 ///
842 /// ```
843 /// use tokenise::Tokeniser;
844 ///
845 /// let mut tokeniser = Tokeniser::new();
846 /// tokeniser.set_sl_comment("//").unwrap();
847 ///
848 /// assert!(tokeniser.ends_with_sl_comment_start("abc//"));
849 /// assert!(!tokeniser.ends_with_sl_comment_start("abc/"));
850 /// ```
851 pub fn ends_with_sl_comment_start(&self, s: &str) -> bool {
852 match self.sl_comment() {
853 None => false,
854 Some(sl_comment) => s.ends_with(sl_comment)
855 }
856 }
857
858 /// Checks if a string is the configured multi-line comment start marker.
859 ///
860 /// # Arguments
861 ///
862 /// * `s` - The string to check
863 ///
864 /// # Returns
865 ///
866 /// `true` if the string exactly matches the configured multi-line comment start marker,
867 /// `false` otherwise or if no multi-line comment marker is configured.
868 ///
869 /// # Examples
870 ///
871 /// ```
872 /// use tokenise::Tokeniser;
873 ///
874 /// let mut tokeniser = Tokeniser::new();
875 /// tokeniser.set_ml_comment("/*", "*/").unwrap();
876 ///
877 /// assert!(tokeniser.is_ml_comment_start("/*"));
878 /// assert!(!tokeniser.is_ml_comment_start("*/"));
879 /// ```
880 pub fn is_ml_comment_start(&self, s: &str) -> bool {
881 match self.ml_comment() {
882 None => false,
883 Some((start,_)) => s == start
884 }
885 }
886
887 /// Checks if a string ends with the configured multi-line comment start marker.
888 ///
889 /// This is used during tokenisation to detect when a series of special characters
890 /// transitions into a comment.
891 ///
892 /// # Arguments
893 ///
894 /// * `s` - The string to check
895 ///
896 /// # Returns
897 ///
898 /// `true` if the string ends with the configured multi-line comment start marker,
899 /// `false` otherwise or if no multi-line comment marker is configured.
900 ///
901 /// # Examples
902 ///
903 /// ```
904 /// use tokenise::Tokeniser;
905 ///
906 /// let mut tokeniser = Tokeniser::new();
907 /// tokeniser.set_ml_comment("/*", "*/").unwrap();
908 ///
909 /// assert!(tokeniser.ends_with_ml_comment_start("abc/*"));
910 /// assert!(!tokeniser.ends_with_ml_comment_start("abc/"));
911 /// ```
912 pub fn ends_with_ml_comment_start(&self, s: &str) -> bool {
913 match self.ml_comment() {
914 None => false,
915 Some((start,_)) => s.ends_with(start)
916 }
917 }
918
919 /// Checks if a string is the configured multi-line comment end marker.
920 ///
921 /// # Arguments
922 ///
923 /// * `s` - The string to check
924 ///
925 /// # Returns
926 ///
927 /// `true` if the string exactly matches the configured multi-line comment end marker,
928 /// `false` otherwise or if no multi-line comment marker is configured.
929 ///
930 /// # Examples
931 ///
932 /// ```
933 /// use tokenise::Tokeniser;
934 ///
935 /// let mut tokeniser = Tokeniser::new();
936 /// tokeniser.set_ml_comment("/*", "*/").unwrap();
937 ///
938 /// assert!(tokeniser.is_ml_comment_end("*/"));
939 /// assert!(!tokeniser.is_ml_comment_end("/*"));
940 /// ```
941 pub fn is_ml_comment_end(&self, s: &str) -> bool {
942 match self.ml_comment() {
943 None => false,
944 Some((_, end)) => s == end
945 }
946 }
947
948 /// Tokenises a string according to the configured rules.
949 ///
950 /// This is the main method of the library, converting a string into a sequence of tokens
951 /// based on the special characters, delimiters, and comment markers that have been configured.
952 ///
953 /// # Arguments
954 ///
955 /// * `text` - The string to tokenise
956 ///
957 /// # Returns
958 ///
959 /// * `Ok(Vec<Token>)` - A vector of tokens if tokenisation was successful
960 /// * `Err(String)` - An error message if tokenisation failed
961 ///
962 /// # Examples
963 ///
964 /// ```
965 /// use tokenise::{Tokeniser, TokenState};
966 ///
967 /// let mut tokeniser = Tokeniser::new();
968 /// tokeniser.add_specials("+-*/=");
969 /// tokeniser.add_delimiter_pairs(&vec!["()", "[]"]).unwrap();
970 /// tokeniser.set_sl_comment("//").unwrap();
971 ///
972 /// let source = "x = 42; // The answer";
973 /// let tokens = tokeniser.tokenise(source).unwrap();
974 ///
975 /// // We can now work with the tokens
976 /// for token in &tokens {
977 /// match token.get_state() {
978 /// TokenState::Word => println!("Word: {}", token.value()),
979 /// TokenState::SymbolString => println!("Symbol: {}", token.value()),
980 /// TokenState::SLComment => println!("Comment: {}", token.value()),
981 /// _ => println!("Other token: {}", token.value()),
982 /// }
983 /// }
984 /// ```
985 pub fn tokenise<'g>(&self, text: &'g str) -> Result<Vec<Token<'g>>,String> {
986 let mut out: Vec<Token<'g>> = Vec::new();
987 let mut curr_start: usize = 0;
988 let mut curr_state : Option<TokenState> = None;
989 for (curr_pos,c) in text.grapheme_indices(true) {
990 match curr_state {
991 None => {
992 if self.special(c) {
993 if self.is_sl_comment_start(c) {
994 curr_state = Some(SLComment);
995 curr_start = curr_pos;
996 } else if self.is_ml_comment_start(c) {
997 curr_state = Some(MLComment);
998 curr_start = curr_pos;
999 } else {
1000 curr_state = Some(SymbolString);
1001 curr_start = curr_pos;
1002 match self.delimiter(c) {
1003 Some(Side::Left) => {
1004 out.push(Token { state: LDelimiter, val: c, start_pos: curr_pos });
1005 curr_state = None;
1006 },
1007 Some(Side::Right) => {
1008 out.push(Token { state: RDelimiter, val: c, start_pos: curr_pos });
1009 curr_state = None;
1010 },
1011 Some(Side::Bal) => {
1012 out.push(Token { state: BDelimiter, val: c, start_pos: curr_pos });
1013 curr_state = None;
1014 }
1015 None => {}
1016 }
1017 }
1018 } else {
1019 if c == "\n" || c == "\r" || c == "\r\n" {
1020 out.push(Token {
1021 state: NewLine,
1022 val: c,
1023 start_pos: curr_pos
1024 });
1025 } else if is_whitespace(c) {
1026 curr_state = Some(WhiteSpace);
1027 } else {
1028 curr_state = Some(Word);
1029 }
1030 curr_start = curr_pos;
1031 }
1032 },
1033 Some(Word) => {
1034 if self.special(c) {
1035 out.push(Token{
1036 state: Word,
1037 val: &text[curr_start..curr_pos],
1038 start_pos: curr_start
1039 });
1040
1041 match self.delimiter(c) {
1042 Some(Side::Left) => {
1043 out.push(Token { state: LDelimiter, val: c, start_pos: curr_pos });
1044 curr_state = None;
1045 },
1046 Some(Side::Right) => {
1047 out.push(Token { state: RDelimiter, val: c, start_pos: curr_pos });
1048 curr_state = None;
1049 },
1050 Some(Side::Bal) => {
1051 out.push(Token { state: BDelimiter, val: c, start_pos: curr_pos });
1052 curr_state = None;
1053 },
1054 None => {
1055 curr_start = curr_pos;
1056 curr_state = Some(SymbolString);
1057 }
1058 }
1059 } else {
1060 if is_whitespace(c) {
1061 out.push(Token{
1062 state: Word,
1063 val: &text[curr_start..curr_pos],
1064 start_pos: curr_start
1065 });
1066 if c == "\n" || c == "\r" || c == "\r\n" {
1067 out.push(Token {
1068 state: NewLine,
1069 val: c,
1070 start_pos: curr_pos
1071 });
1072 curr_state = None;
1073 } else {
1074 curr_state = Some(WhiteSpace);
1075 curr_start = curr_pos;
1076 }
1077 } else {
1078 }
1079 }
1080 },
1081 Some(SymbolString) => {
1082 if !self.special(c) {
1083 out.push(Token { state: SymbolString, val: &text[curr_start..curr_pos], start_pos: curr_start });
1084 curr_start = curr_pos;
1085 if is_whitespace(c) {
1086 if c == "\n" || c == "\r" || c == "\r\n" {
1087 out.push(Token {
1088 state: NewLine,
1089 val: c,
1090 start_pos: curr_pos
1091 });
1092 curr_state = None;
1093 } else {
1094 curr_state = Some(WhiteSpace);
1095 }
1096 } else {
1097 curr_state = Some(Word);
1098 }
1099 } else {
1100 let curr_str = &text[curr_start..curr_pos+c.len()];
1101 if self.ends_with_sl_comment_start(curr_str) {
1102 if self.is_sl_comment_start(curr_str) {
1103 curr_state = Some(SLComment);
1104 } else {
1105 let new_start = curr_pos + c.len() - self.sl_comment().unwrap().len();
1106 out.push(Token {
1107 state: SymbolString,
1108 val: &text[curr_start..new_start],
1109 start_pos: curr_start
1110 });
1111 curr_state = Some(SLComment);
1112 curr_start = new_start;
1113 }
1114 } else if self.ends_with_ml_comment_start(curr_str) {
1115 if self.is_ml_comment_start(curr_str) {
1116 curr_state = Some(MLComment);
1117 } else {
1118 let new_start = curr_pos + c.len() - self.ml_comment().unwrap().0.len();
1119 out.push(Token {
1120 state: SymbolString,
1121 val: &text[curr_start..new_start],
1122 start_pos: curr_start
1123 });
1124 curr_state = Some(MLComment);
1125 curr_start = new_start;
1126 }
1127 }
1128 }
1129 },
1130 Some(WhiteSpace) => {
1131 if self.special(c) {
1132 out.push(Token {
1133 state: WhiteSpace,
1134 val: &text[curr_start..curr_pos],
1135 start_pos: curr_start
1136 });
1137 match self.delimiter(c) {
1138 None => {
1139 if self.is_sl_comment_start(c) {
1140 curr_state = Some(SLComment);
1141 } else if self.is_ml_comment_start(c) {
1142 curr_state = Some(MLComment);
1143 } else {
1144 curr_state = Some(SymbolString);
1145 }
1146 curr_start = curr_pos;
1147 },
1148 Some(Side::Left) => {
1149 out.push(Token { state: LDelimiter, val: c, start_pos: curr_pos });
1150 curr_state = None;
1151 },
1152 Some(Side::Right) => {
1153 out.push(Token { state: RDelimiter, val: c, start_pos: curr_pos });
1154 curr_state = None;
1155 },
1156 Some(Side::Bal) => {
1157 out.push(Token { state: BDelimiter, val: c, start_pos: curr_pos });
1158 curr_state = None;
1159 }
1160 }
1161 } else {
1162 if c == "\n" || c == "\r" || c == "\r\n" {
1163 out.push(Token {
1164 state: WhiteSpace,
1165 val: &text[curr_start..curr_pos],
1166 start_pos: curr_start
1167 });
1168 out.push(Token {
1169 state: NewLine,
1170 val: c,
1171 start_pos: curr_pos
1172 });
1173 curr_state = None;
1174 } else if !is_whitespace(c) {
1175 out.push(Token {
1176 state: WhiteSpace,
1177 val: &text[curr_start..curr_pos],
1178 start_pos: curr_start
1179 });
1180 curr_start = curr_pos;
1181 curr_state = Some(Word);
1182 }
1183 }
1184 },
1185 Some(SLComment) => {
1186 if c == "\n" || c == "\r" || c == "\r\n" {
1187 out.push(Token {
1188 state: SLComment,
1189 val: &text[curr_start..curr_pos],
1190 start_pos: curr_start
1191 });
1192 out.push(Token {
1193 state: NewLine,
1194 val: c,
1195 start_pos: curr_pos
1196 });
1197 curr_state = None;
1198 }
1199 },
1200 Some(MLComment) => {
1201 let curr_str = &text[curr_start..curr_pos+(c.len())];
1202 let end = match self.ml_comment() {
1203 Some((_, e)) => Ok(e),
1204 _ => Err("This should never happen".to_string())
1205 }.unwrap();
1206 if curr_str.ends_with(end) {
1207 out.push(Token {
1208 state: MLComment,
1209 val: &text[curr_start..curr_pos+(c.len())],
1210 start_pos: curr_start
1211 });
1212 curr_state = None;
1213 }
1214 },
1215 other => {return Err(format!("curr_state should never reach {:?}",other))}
1216 }
1217 }
1218 if let Some(token) = out.last() {
1219 if token.value().len() + token.start() != text.len() {
1220 out.push(Token {
1221 state: curr_state.unwrap(),
1222 val: &text[curr_start..],
1223 start_pos: curr_start
1224 });
1225 }
1226 }
1227 Ok(out)
1228 }
1229}
1230
1231
#[cfg(test)]
mod tests {
    use super::*;

    // Accessor methods should echo back the values the token was built with.
    #[test]
    fn token_gets_work() {
        let token = Token{ state: Word, val:"hi", start_pos:4 };
        assert_eq!(token.start(), 4);
        assert_eq!(token.value(), "hi");
    }

    // Configuration should be reflected by the getters. Multi-byte graphemes
    // (emoji / flag sequences) exercise Unicode handling; note that delimiter
    // and comment-marker graphemes are folded into the specials list.
    #[test]
    fn tokeniser_building_works() {
        let mut tokeniser = Tokeniser::new();
        tokeniser.add_specials("!@%π¨βπ»*");
        tokeniser.add_delimiter_pairs(&vec!["<>", "()", "{}", "πΊπΈππ½"]).unwrap();
        tokeniser.add_balanced_delimiters("\"").unwrap();
        tokeniser.set_sl_comment("//").unwrap();
        tokeniser.set_ml_comment("/*","*/").unwrap();
        assert_eq!(tokeniser.specials(),vec!["!", "@", "%", "π¨βπ»", "*", "<", ">", "(", ")", "{", "}", "πΊπΈ", "ππ½", "\"", "/"]);
        assert_eq!(tokeniser.lr_delimiters(),vec![("<",">"),("(",")"),("{","}"),("πΊπΈ","ππ½")]);
        assert_eq!(tokeniser.bal_delimiters(),vec!["\""]);
        assert_eq!(tokeniser.sl_comment(),Some("//"));
        assert_eq!(tokeniser.ml_comment(),Some(("/*","*/")));
    }

    // End-to-end tokenisation over a source mixing words, symbol runs,
    // whitespace, comments, delimiters and multi-byte graphemes.
    // `start_pos` values are BYTE offsets, so they jump by more than one
    // after multi-byte graphemes (e.g. the multi-line comment token below
    // spans 27 bytes). Trailing `\` in the string literal consumes the
    // newline and the next line's leading whitespace.
    #[test]
    fn tokeniser_tokenise_works() {
        let source = " hi, skdjfs;; 842\t 39fsl == 3\n \
        what's going on? idk... \n\
        fire___sldfksfl // what's going on? \n\
        idk what I'm πΊπΈdoing \n\
        \n\
        nowππ½ hi Β£*$*@ \n\
        help!\n\
        \"hello\"hi";
        let mut tokeniser = Tokeniser::new();
        tokeniser.add_delimiter_pairs(&vec!["()","[]"]).unwrap();
        tokeniser.add_balanced_delimiter("\"").unwrap();
        tokeniser.set_sl_comment("//").unwrap();
        tokeniser.set_ml_comment("πΊπΈ","ππ½").unwrap();
        tokeniser.add_specials(",;=?.'Β£<>@*");
        assert_eq!(tokeniser.tokenise(source).unwrap(),vec![
            Token { state: WhiteSpace, val: " ", start_pos: 0 },
            Token { state: Word, val: "hi", start_pos: 1 },
            Token { state: SymbolString, val: ",", start_pos: 3 },
            Token { state: WhiteSpace, val: " ", start_pos: 4 },
            Token { state: Word, val: "skdjfs", start_pos: 5 },
            Token { state: SymbolString, val: ";;", start_pos: 11 },
            Token { state: WhiteSpace, val: " ", start_pos: 13 },
            Token { state: Word, val: "842", start_pos: 17 },
            Token { state: WhiteSpace, val: "\t ", start_pos: 20 },
            Token { state: Word, val: "39fsl", start_pos: 22 },
            Token { state: WhiteSpace, val: " ", start_pos: 27 },
            Token { state: SymbolString, val: "==", start_pos: 28 },
            Token { state: WhiteSpace, val: " ", start_pos: 30 },
            Token { state: Word, val: "3", start_pos: 31 },
            Token { state: NewLine, val: "\n", start_pos: 32 },
            Token { state: WhiteSpace, val: " ", start_pos: 33 },
            Token { state: Word, val: "what", start_pos: 34 },
            Token { state: SymbolString, val: "'", start_pos: 38 },
            Token { state: Word, val: "s", start_pos: 39 },
            Token { state: WhiteSpace, val: " ", start_pos: 40 },
            Token { state: Word, val: "going", start_pos: 41 },
            Token { state: WhiteSpace, val: " ", start_pos: 46 },
            Token { state: Word, val: "on", start_pos: 47 },
            Token { state: SymbolString, val: "?", start_pos: 49 },
            Token { state: WhiteSpace, val: " ", start_pos: 50 },
            Token { state: Word, val: "idk", start_pos: 51 },
            Token { state: SymbolString, val: "...", start_pos: 54 },
            Token { state: WhiteSpace, val: " ", start_pos: 57 },
            Token { state: NewLine, val: "\n", start_pos: 58 },
            Token { state: Word, val: "fire___sldfksfl", start_pos: 59 },
            Token { state: WhiteSpace, val: " ", start_pos: 74 },
            Token { state: SLComment, val: "// what's going on? ", start_pos: 75 },
            Token { state: NewLine, val: "\n", start_pos: 95 },
            Token { state: Word, val: "idk", start_pos: 96 },
            Token { state: WhiteSpace, val: " ", start_pos: 99 },
            Token { state: Word, val: "what", start_pos: 100 },
            Token { state: WhiteSpace, val: " ", start_pos: 104 },
            Token { state: Word, val: "I", start_pos: 105 },
            Token { state: SymbolString, val: "'", start_pos: 106 },
            Token { state: Word, val: "m", start_pos: 107 },
            Token { state: WhiteSpace, val: " ", start_pos: 108 },
            Token { state: MLComment, val: "πΊπΈdoing \n\nnowππ½", start_pos: 109 },
            Token { state: WhiteSpace, val: " ", start_pos: 136 },
            Token { state: Word, val: "hi", start_pos: 137 },
            Token { state: WhiteSpace, val: " ", start_pos: 139 },
            Token { state: SymbolString, val: "Β£*", start_pos: 140 },
            Token { state: Word, val: "$", start_pos: 143 },
            Token { state: SymbolString, val: "*@", start_pos: 144 },
            Token { state: WhiteSpace, val: " ", start_pos: 146 },
            Token { state: NewLine, val: "\n", start_pos: 147 },
            Token { state: Word, val: "help!", start_pos: 148 },
            Token { state: NewLine, val: "\n", start_pos: 153 },
            Token { state: BDelimiter, val: "\"", start_pos: 154 },
            Token { state: Word, val: "hello", start_pos: 155 },
            Token { state: BDelimiter, val: "\"", start_pos: 160 },
            Token { state: Word, val: "hi", start_pos: 161 }
        ]);
    }

    // CRLF, LF and CR should each come out as a single NewLine token; the
    // extended grapheme segmenter keeps "\r\n" together as one cluster.
    #[test]
    fn test_tokeniser_tokenise_newline_handling() {
        let source = "hi \r\n\n\rhiagain";
        let tokens = vec![
            Token { state: Word, val: "hi", start_pos: 0 },
            Token { state: WhiteSpace, val: " ", start_pos: 2 },
            Token { state: NewLine, val: "\r\n", start_pos: 5 },
            Token { state: NewLine, val: "\n", start_pos: 7 },
            Token { state: NewLine, val: "\r", start_pos: 8},
            Token { state: Word, val: "hiagain", start_pos: 9}
        ];
        let tokeniser = Tokeniser::new();
        assert_eq!(tokeniser.tokenise(source).unwrap(), tokens);
    }
}