tokenise/
lib.rs

1//! # Tokenise
2//! 
3//! A flexible lexical analyser (tokeniser) for parsing text into configurable token types.
4//! 
5//! `tokenise` allows you to split text into tokens based on customisable rules for special characters,
6//! delimiters, and comments. It's designed to be flexible enough to handle various syntax styles
7//! while remaining simple to configure.
8//! 
9//! ## Basic Usage
10//! 
11//! The following example demonstrates how to configure a tokeniser with common syntax elements
12//! and process a simple code snippet:
13//! 
14//! ```
15//! use tokenise::Tokeniser;
16//! 
17//! fn main() {
18//!     // Create a new tokeniser
19//!     let mut tokeniser = Tokeniser::new();
20//!     
21//!     // Configure tokeniser with rules
22//!     tokeniser.add_specials(".,;:!?");
23//!     tokeniser.add_delimiter_pairs(&vec!["()", "[]", "{}"]).unwrap();
24//!     tokeniser.add_balanced_delimiter("\"").unwrap();
25//!     tokeniser.set_sl_comment("//").unwrap();
26//!     tokeniser.set_ml_comment("/*", "*/").unwrap();
27//!     
28//!     // Tokenise some source text
29//!     let source = "let x = 42; // The answer\nprint(\"Hello world!\");";
30//!     let tokens = tokeniser.tokenise(source).unwrap();
31//!     
32//!     // Work with the resulting tokens
33//!     for token in tokens {
34//!         println!("{:?}: '{}'", token.get_state(), token.value());
35//!     }
36//! }
37//! ```
38//! 
39//! ## Features
40//! 
41//! - Unicode support (using grapheme clusters)
42//! - Configurable special characters and delimiters
43//! - Support for paired delimiters (e.g., parentheses, brackets)
44//! - Support for balanced delimiters (e.g., quotation marks)
45//! - Single-line and multi-line comment handling
46//! - Whitespace and newline preservation
47//! 
48//! ## Token Types
49//! 
50//! The tokeniser recognises several token types represented by the `TokenState` enum:
51//! 
52//! - `Word`: Non-special character sequences (anything not identified as a special character or whitespace)
53//! - `LDelimiter`/`RDelimiter`: Left/right delimiters of a pair (e.g., '(', ')')
54//! - `BDelimiter`: Balanced delimiters (e.g., quotation marks)
55//! - `SymbolString`: Special characters
56//! - `NewLine`: Line breaks
57//! - `WhiteSpace`: Spaces, tabs, etc.
58//! - `SLComment`: Single-line comments
59//! - `MLComment`: Multi-line comments
60//! 
61//! More precise definitions can be found in the documentation for each specific type.
62
63use unicode_segmentation::UnicodeSegmentation;
64
// TODO: add multi-character Parenthesis
/// Represents the type of a token in the tokenisation process.
///
/// Each token in the parsed text is classified as one of these types,
/// which determines how it is interpreted and processed.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum TokenState {
    /// A sequence of non-special characters (excluding whitespace).
    Word,
    
    /// A left delimiter of a pair (e.g., opening bracket).
    LDelimiter,
    
    /// A right delimiter of a pair (e.g., closing bracket).
    RDelimiter,
    
    /// A balanced delimiter that can open or close (e.g., quotation mark).
    BDelimiter,
    
    /// A sequence of special characters not recognised as delimiters or comments.
    SymbolString,
    
    /// A newline character sequence (\n, \r, or \r\n).
    NewLine,
    
    /// A sequence of whitespace characters (excluding newlines).
    WhiteSpace,
    
    /// A single-line comment (from the configured marker to end of line).
    SLComment,
    
    /// A multi-line comment (from the configured start marker to the end marker).
    MLComment
}
99use TokenState::*;
100
/// Represents the categorisation of delimiters into left, right, or balanced types.
///
/// This is used to classify delimiters when tokenising text:
/// - Left delimiters open a section (e.g., opening brackets)
/// - Right delimiters close a section (e.g., closing brackets)
/// - Balanced delimiters can serve as both opening and closing (e.g., quotation marks)
///
/// The `Debug` representation of the variants (`Left`, `Right`, `Bal`) appears
/// verbatim in the error messages produced by the delimiter-registration methods.
#[derive(Debug, PartialEq, Eq)]
pub enum Side {
    /// A right (closing) delimiter such as ')', ']', or '}'.
    Right,
    
    /// A left (opening) delimiter such as '(', '[', or '{'.
    Left,
    
    /// A balanced delimiter that can both open and close, such as '"'.
    Bal
}
118
119/// Checks if a string is exactly one grapheme cluster (user-perceived character).
120///
121/// # Examples
122/// ```
123/// assert!(tokenise::is_grapheme("a"));
124/// assert!(tokenise::is_grapheme("πŸ‘¨β€πŸ’»"));
125/// assert!(tokenise::is_grapheme("\r\n"));
126/// assert!(!tokenise::is_grapheme("ab"));
127/// ```
128pub fn is_grapheme(s: &str) -> bool {
129    s.graphemes(true).collect::<Vec<_>>().len() == 1
130}
131
/// Checks if a string consists entirely of whitespace.
///
/// The empty string counts as whitespace.
///
/// # Examples
/// ```
/// assert!(tokenise::is_whitespace(" \t"));
/// assert!(!tokenise::is_whitespace("a "));
/// ```
pub fn is_whitespace(c: &str) -> bool {
    // `str::trim` strips characters matching `char::is_whitespace`, so this
    // per-char test is equivalent to `c.trim().is_empty()`.
    c.chars().all(char::is_whitespace)
}
142
/// Represents a token extracted from the source text during tokenisation.
///
/// Each token has a state (type), a string value borrowed from the original
/// text, and a starting position in that text.
#[derive(Debug, PartialEq, Clone, Copy)]
pub struct Token<'a> {
    /// The type of this token.
    state: TokenState,
    
    /// The string content of this token (a slice of the original input).
    val: &'a str,
    
    /// The starting position of this token in the original text.
    // NOTE(review): documented as a character offset, but the tokenise loop is
    // not visible here — confirm whether this counts graphemes, chars, or bytes.
    start_pos: usize
}
157
impl<'a> Token<'a> {
    /// Returns the starting position of this token in the original text.
    pub fn start(&self) -> usize {
        self.start_pos
    }

    /// Returns the string content of this token.
    ///
    /// The returned slice borrows from the original source text (lifetime
    /// `'a`), not from the token itself.
    pub fn value(&self) -> &'a str {
        self.val
    }

    /// Returns the state (type) of this token.
    pub fn get_state(&self) -> TokenState {
        self.state
    }
}
174
/// A configurable tokeniser for parsing text into meaningful tokens.
///
/// The `Tokeniser` can be customised with special characters, delimiter pairs,
/// balanced delimiters, and comment markers to suit different syntax requirements.
/// Once configured, it can parse text into tokens according to those rules.
///
/// Note that delimiters and the characters in comment markers are automatically
/// treated as special characters, but with additional distinctions in how they're
/// processed during tokenisation.
///
/// # Examples
///
/// ```
/// use tokenise::Tokeniser;
///
/// // Create and configure a tokeniser for a C-like language
/// let mut tokeniser = Tokeniser::new();
/// tokeniser.add_specials("+-*/=<>!&|^~%");
/// tokeniser.add_delimiter_pairs(&vec!["()", "[]", "{}"]).unwrap();
/// tokeniser.set_sl_comment("//").unwrap();
/// tokeniser.set_ml_comment("/*", "*/").unwrap();
///
/// // Tokenise some code
/// let code = "int main() { // Entry point\n    return 0;\n}";
/// let tokens = tokeniser.tokenise(code).unwrap();
/// ```
pub struct Tokeniser {
    // Single-grapheme strings treated as special during tokenisation.
    special_characters: Vec<String>,
    // (left, right) delimiter pairs, e.g. ("(", ")").
    delimiter_pairs: Vec<(String, String)>,
    // Delimiters that both open and close, e.g. "\"".
    balanced_delimiters: Vec<String>,
    // Marker that starts a single-line comment, if configured.
    single_line_comment: Option<String>,
    // (start, end) markers for multi-line comments, if configured.
    multi_line_comment: Option<(String, String)>
}
208
209impl Tokeniser {
210    /// Creates a new, unconfigured `Tokeniser`.
211    ///
212    /// This constructor creates a tokeniser with no special characters, delimiters, or comment markers.
213    /// You'll need to configure it with the appropriate methods before it's ready for use.
214    ///
215    /// # Examples
216    ///
217    /// ```
218    /// use tokenise::Tokeniser;
219    ///
220    /// let mut tokeniser = Tokeniser::new();
221    /// // Configure the tokeniser...
222    /// ```
223    pub fn new() -> Self {
224        Self {
225            special_characters: Vec::new(),
226            delimiter_pairs: Vec::new(),
227            balanced_delimiters: Vec::new(),
228            single_line_comment: None,
229            multi_line_comment: None,
230        }
231    }
232
233    /// Adds a single special character to the tokeniser.
234    ///
235    /// Special characters are treated differently from regular text during tokenisation.
236    /// They form `SymbolString` tokens unless they're also configured as delimiters or
237    /// used in comment markers.
238    ///
239    /// # Arguments
240    ///
241    /// * `special` - The special character to add, which must be a single grapheme
242    ///
243    /// # Returns
244    ///
245    /// * `Ok(())` if the character was added successfully
246    /// * `Err(String)` if the input is not a single grapheme
247    ///
248    /// # Examples
249    ///
250    /// ```
251    /// use tokenise::Tokeniser;
252    ///
253    /// let mut tokeniser = Tokeniser::new();
254    /// tokeniser.add_special("@").unwrap();
255    /// tokeniser.add_special("+").unwrap();
256    ///
257    /// // Unicode graphemes are supported
258    /// tokeniser.add_special("πŸ‘¨β€πŸ’»").unwrap();
259    ///
260    /// // This would fail as it's not a single grapheme
261    /// assert!(tokeniser.add_special("abc").is_err());
262    /// ```
263    pub fn add_special(&mut self, special: &str) -> Result<(),String> {
264        if !is_grapheme(special) {
265            Err(format!("string {:?} is not a single grapheme",special))
266        } else {
267            if !self.special(special) {
268                self.special_characters.push(special.to_string());
269            }
270            Ok(())
271        }
272    }
273
274    /// Adds multiple special characters to the tokeniser.
275    ///
276    /// This is a convenience method that adds each grapheme in the input string
277    /// as a special character.
278    ///
279    /// # Arguments
280    ///
281    /// * `specials` - A string containing the special characters to add
282    ///
283    /// # Examples
284    ///
285    /// ```
286    /// use tokenise::Tokeniser;
287    ///
288    /// let mut tokeniser = Tokeniser::new();
289    /// tokeniser.add_specials("+-*/=<>!&|^~%");
290    /// ```
291    pub fn add_specials(&mut self, specials: &str) {
292        for c in specials.graphemes(true) {
293            self.add_special(c).unwrap();
294        }
295    }
296
297    /// Adds a pair of left and right delimiters to the tokeniser.
298    ///
299    /// Delimiter pairs are used to mark the beginning and end of sections in text,
300    /// such as parentheses, brackets, and braces. During tokenisation, they are
301    /// classified as `LDelimiter` and `RDelimiter` respectively.
302    ///
303    /// Both characters are automatically added as special characters if they aren't already.
304    ///
305    /// # Arguments
306    ///
307    /// * `left` - The left (opening) delimiter, which must be a single grapheme
308    /// * `right` - The right (closing) delimiter, which must be a single grapheme
309    ///
310    /// # Returns
311    ///
312    /// * `Ok(())` if the delimiter pair was added successfully
313    /// * `Err(String)` if either character is not a single grapheme, or if either
314    ///   character is already used as a different type of delimiter
315    ///
316    /// # Examples
317    ///
318    /// ```
319    /// use tokenise::Tokeniser;
320    ///
321    /// let mut tokeniser = Tokeniser::new();
322    /// tokeniser.add_delimiter_pair("(", ")").unwrap();
323    /// tokeniser.add_delimiter_pair("[", "]").unwrap();
324    /// tokeniser.add_delimiter_pair("{", "}").unwrap();
325    ///
326    /// // Unicode delimiters are supported
327    /// tokeniser.add_delimiter_pair("γ€Œ", "」").unwrap();
328    /// ```
329    pub fn add_delimiter_pair(&mut self, left: &str, right: &str) -> Result<(),String> {
330        if !is_grapheme(left) {
331            Err(format!("string {:?} is not a single grapheme",left))
332        } else if !is_grapheme(right) {
333            Err(format!("string {:?} is not a single grapheme",right))
334        } else {
335            match (self.delimiter(left),self.delimiter(right)) {
336                (None, None) => {
337                    self.add_special(left).unwrap();
338                    self.add_special(right).unwrap();
339                    self.delimiter_pairs.push((left.to_string(),right.to_string()));
340                },
341                (None, Some(_)) => {
342                    return Err(format!("right delimiter {right:?} is already a delimiter of type {:?} with other pair", self.delimiter(right).unwrap()));
343                },
344                (Some(_), None) => {
345                    return Err(format!("left delimiter {left:?} is already a delimiter of type {:?} with other pair", self.delimiter(left).unwrap()));
346                },
347                (Some(l), Some(r)) => {
348                    match l {
349                        Side::Right => {
350                            return Err(format!("left delimiter {left:?} is already a delimiter of type {:?} with other pair", Side::Right));
351                        },
352                        Side::Left => {},
353                        Side::Bal => {
354                            return Err(format!("left delimiter {left:?} is already a delimiter of type {:?} with other pair", Side::Bal));
355                        },
356                    }
357                    match r {
358                        Side::Right => {},
359                        Side::Left => {
360                            return Err(format!("right delimiter {right:?} is already a delimiter of type {:?} with other pair", Side::Left));
361                        },
362                        Side::Bal => {
363                            return Err(format!("right delimiter {right:?} is already a delimiter of type {:?} with other pair", Side::Bal));
364                        },
365                    }
366                },
367            }
368            Ok(())
369        }
370    }
371
372    /// Adds multiple delimiter pairs to the tokeniser.
373    ///
374    /// Each pair should be represented as a string containing exactly two graphemes,
375    /// where the first is the left delimiter and the second is the right delimiter.
376    /// 
377    /// Each character is automatically added as a special character if it isn't already.
378    ///
379    /// # Arguments
380    ///
381    /// * `delimiter_pairs` - A vector of strings, each containing exactly two graphemes
382    ///
383    /// # Returns
384    ///
385    /// * `Ok(())` if all delimiter pairs were added successfully
386    /// * `Err(String)` if any string doesn't contain exactly two graphemes, or if any
387    ///   character is already used as a different type of delimiter
388    ///
389    /// # Examples
390    ///
391    /// ```
392    /// use tokenise::Tokeniser;
393    ///
394    /// let mut tokeniser = Tokeniser::new();
395    /// tokeniser.add_delimiter_pairs(&vec!["()", "[]", "{}"]).unwrap();
396    /// ```
397    pub fn add_delimiter_pairs(&mut self, delimiter_pairs: &Vec<&str>) -> Result<(),String> {
398        for &s in delimiter_pairs {
399            let v = s.graphemes(true).collect::<Vec<_>>();
400            if v.len() != 2 {
401                return Err(format!("delimiter pair {s:?} must be made up of 2 graphemes"));
402            }
403            let [left,right] = v.try_into().unwrap();
404            match self.add_delimiter_pair(left, right) {
405                Ok(_) => {},
406                Err(x) => {
407                    return Err(x);
408                },
409            }
410        }
411        Ok(())
412    }
413
414    /// Adds a balanced delimiter to the tokeniser.
415    ///
416    /// Balanced delimiters are characters that serve as both opening and closing markers,
417    /// such as quotation marks. During tokenisation, they are classified as `BDelimiter`.
418    ///
419    /// The character is automatically added as a special character if it isn't already.
420    ///
421    /// # Arguments
422    ///
423    /// * `delim` - The balanced delimiter, which must be a single grapheme
424    ///
425    /// # Returns
426    ///
427    /// * `Ok(())` if the delimiter was added successfully
428    /// * `Err(String)` if the character is not a single grapheme, or if it is already used
429    ///   as a different type of delimiter
430    ///
431    /// # Examples
432    ///
433    /// ```
434    /// use tokenise::Tokeniser;
435    ///
436    /// let mut tokeniser = Tokeniser::new();
437    /// tokeniser.add_balanced_delimiter("\"").unwrap(); // Double quote
438    /// tokeniser.add_balanced_delimiter("'").unwrap();  // Single quote
439    /// tokeniser.add_balanced_delimiter("`").unwrap();  // Backtick
440    /// ```
441    pub fn add_balanced_delimiter(&mut self, delim: &str) -> Result<(),String> {
442        if !is_grapheme(delim) {
443            Err(format!("string {:?} is not a single grapheme",delim))
444        } else {
445            match self.delimiter(delim) {
446                Some(side) => {
447                    match side {
448                        Side::Right => {
449                        return Err(format!("balanced delimiter {delim:?} is already a delimiter of type {:?} with other pair", Side::Right));
450                    },
451                        Side::Left => {
452                            return Err(format!("balanced delimiter {delim:?} is already a delimiter of type {:?} with other pair", Side::Left));
453                        },
454                        Side::Bal => {},
455                    }
456                },
457                None => {
458                    self.add_special(delim).unwrap();
459                    self.balanced_delimiters.push(delim.to_string());
460                },
461            }
462            Ok(())
463        }
464    }
465
466    /// Adds multiple balanced delimiters to the tokeniser.
467    ///
468    /// Each character in the input string is added as a balanced delimiter.
469    /// The characters are automatically added as special characters if they aren't already.
470    ///
471    /// # Arguments
472    ///
473    /// * `delims` - A string containing the balanced delimiters to add
474    ///
475    /// # Returns
476    ///
477    /// * `Ok(())` if all delimiters were added successfully
478    /// * `Err(String)` if any character is already used as a different type of delimiter
479    ///
480    /// # Examples
481    ///
482    /// ```
483    /// use tokenise::Tokeniser;
484    ///
485    /// let mut tokeniser = Tokeniser::new();
486    /// tokeniser.add_balanced_delimiters("\"'`").unwrap(); // Adds ", ', and ` as balanced delimiters
487    /// ```
488    pub fn add_balanced_delimiters(&mut self, delims: &str) -> Result<(),String> {
489        for delim in delims.graphemes(true) {
490            match self.add_balanced_delimiter(delim) {
491                Ok(_) => {},
492                Err(x) => {
493                    return Err(x);
494                }
495            }
496        }
497        Ok(())
498    }
499
500    /// Sets the marker for single-line comments.
501    ///
502    /// Single-line comments run from the marker to the end of the line.
503    /// During tokenisation, they are classified as `SLComment`.
504    ///
505    /// All characters in the comment marker are automatically added as special characters.
506    ///
507    /// # Arguments
508    ///
509    /// * `comm` - The single-line comment marker (e.g., "//")
510    ///
511    /// # Returns
512    ///
513    /// * `Ok(())` if the marker was set successfully
514    /// * `Err(String)` if the marker is an empty string
515    ///
516    /// # Examples
517    ///
518    /// ```
519    /// use tokenise::Tokeniser;
520    ///
521    /// let mut tokeniser = Tokeniser::new();
522    /// tokeniser.set_sl_comment("//").unwrap();  // C/C++/Rust style
523    ///
524    /// // Could also use other styles
525    /// // tokeniser.set_sl_comment("#").unwrap();   // Python/Ruby style
526    /// // tokeniser.set_sl_comment("--").unwrap();  // SQL/Lua style
527    /// ```
528    pub fn set_sl_comment(&mut self, comm: &str) -> Result<(),String> {
529        if comm.len() == 0 {
530            Err(format!("Empty string cannot be the start of a single line comment"))
531        } else {
532            self.add_specials(comm);
533            self.single_line_comment = Some(comm.to_string());
534            Ok(())
535        }
536    }
537
538    /// Sets the markers for multi-line comments.
539    ///
540    /// Multi-line comments run from the start marker to the end marker,
541    /// potentially spanning multiple lines. During tokenisation, they
542    /// are classified as `MLComment`.
543    ///
544    /// All characters in both comment markers are automatically added as special characters.
545    ///
546    /// # Arguments
547    ///
548    /// * `left` - The start marker for multi-line comments (e.g., "/*")
549    /// * `right` - The end marker for multi-line comments (e.g., "*/")
550    ///
551    /// # Returns
552    ///
553    /// * `Ok(())` if the markers were set successfully
554    /// * `Err(String)` if either marker is an empty string
555    ///
556    /// # Examples
557    ///
558    /// ```
559    /// use tokenise::Tokeniser;
560    ///
561    /// let mut tokeniser = Tokeniser::new();
562    /// tokeniser.set_ml_comment("/*", "*/").unwrap();  // C/C++/Rust style
563    ///
564    /// // Could also use other styles
565    /// // tokeniser.set_ml_comment("<!--", "-->").unwrap();  // HTML/XML style
566    /// // tokeniser.set_ml_comment("{-", "-}").unwrap();    // Haskell style
567    /// ```
568    /// 
569    /// # Warning
570    /// 
571    /// Be cautious with comment markers that contain alphanumeric characters (like words). 
572    /// Since all characters in comment markers are added as special characters, using 
573    /// word-based markers may cause unexpected tokenisation of normal text:
574    ///
575    /// ```
576    /// use tokenise::Tokeniser;
577    /// 
578    /// // Not recommended - would treat the letters in "begin" and "end" as special characters
579    /// let mut tokeniser = Tokeniser::new();
580    /// tokeniser.set_ml_comment("=begin", "=end").unwrap(); // Ruby style
581    /// ```
582    pub fn set_ml_comment(&mut self, left: &str, right: &str) -> Result<(),String> {
583        if left.len() == 0 {
584            Err(format!("Empty string cannot be the start of a multi-line comment"))
585        } else if right.len() == 0 {
586            Err(format!("Empty string cannot be the end of a multi-line comment"))
587        } else {
588            self.add_specials(left);
589            self.add_specials(right);
590            self.multi_line_comment = Some((left.to_string(),right.to_string()));
591            Ok(())
592        }
593    }
594
595    /// Returns a vector of all registered special characters.
596    ///
597    /// # Returns
598    ///
599    /// A vector of string slices, each containing one special character.
600    ///
601    /// # Examples
602    ///
603    /// ```
604    /// use tokenise::Tokeniser;
605    ///
606    /// let mut tokeniser = Tokeniser::new();
607    /// tokeniser.add_specials("+-*/");
608    /// 
609    /// let specials = tokeniser.specials();
610    /// assert!(specials.contains(&"+"));
611    /// assert!(specials.contains(&"-"));
612    /// ```
613    pub fn specials<'a>(&'a self) -> Vec<&'a str> {
614        self.special_characters
615            .iter()
616            .map(|x|x.as_str())
617            .collect()
618    }
619
620    /// Returns a vector of all registered left-right delimiter pairs.
621    ///
622    /// # Returns
623    ///
624    /// A vector of tuples, each containing a left delimiter and its corresponding right delimiter.
625    ///
626    /// # Examples
627    ///
628    /// ```
629    /// use tokenise::Tokeniser;
630    ///
631    /// let mut tokeniser = Tokeniser::new();
632    /// tokeniser.add_delimiter_pairs(&vec!["()", "[]"]).unwrap();
633    /// 
634    /// let delimiters = tokeniser.lr_delimiters();
635    /// assert!(delimiters.contains(&("(", ")")));
636    /// ```
637    pub fn lr_delimiters<'a>(&'a self) -> Vec<(&'a str, &'a str)> {
638        self.delimiter_pairs
639            .iter()
640            .map(|(x,y)|(x.as_str(),y.as_str()))
641            .collect()
642    }
643
644    /// Returns a vector of all registered balanced delimiters.
645    ///
646    /// # Returns
647    ///
648    /// A vector of string slices, each containing one balanced delimiter.
649    ///
650    /// # Examples
651    ///
652    /// ```
653    /// use tokenise::Tokeniser;
654    ///
655    /// let mut tokeniser = Tokeniser::new();
656    /// tokeniser.add_balanced_delimiters("\"'").unwrap();
657    /// 
658    /// let delimiters = tokeniser.bal_delimiters();
659    /// assert!(delimiters.contains(&"\""));
660    /// assert!(delimiters.contains(&"'"));
661    /// ```
662    pub fn bal_delimiters<'a>(&'a self) -> Vec<&'a str> {
663        self.balanced_delimiters
664            .iter()
665            .map(|x|x.as_str())
666            .collect()
667    }
668
669    /// Returns the configured single-line comment marker, if any.
670    ///
671    /// # Returns
672    ///
673    /// An `Option` containing the single-line comment marker, or `None` if not configured.
674    ///
675    /// # Examples
676    ///
677    /// ```
678    /// use tokenise::Tokeniser;
679    ///
680    /// let mut tokeniser = Tokeniser::new();
681    /// assert_eq!(tokeniser.sl_comment(), None);
682    ///
683    /// tokeniser.set_sl_comment("//").unwrap();
684    /// assert_eq!(tokeniser.sl_comment(), Some("//"));
685    /// ```
686    pub fn sl_comment<'a>(&'a self) -> Option<&'a str> {
687        self.single_line_comment
688            .iter()
689            .map(|x| x.as_str())
690            .next()
691    }
692
693    /// Returns the configured multi-line comment markers, if any.
694    ///
695    /// # Returns
696    ///
697    /// An `Option` containing a tuple of the start and end markers for multi-line comments,
698    /// or `None` if not configured.
699    ///
700    /// # Examples
701    ///
702    /// ```
703    /// use tokenise::Tokeniser;
704    ///
705    /// let mut tokeniser = Tokeniser::new();
706    /// assert_eq!(tokeniser.ml_comment(), None);
707    ///
708    /// tokeniser.set_ml_comment("/*", "*/").unwrap();
709    /// assert_eq!(tokeniser.ml_comment(), Some(("/*", "*/")));
710    /// ```
711    pub fn ml_comment<'a>(&'a self) -> Option<(&'a str, &'a str)> {
712        self.multi_line_comment
713            .iter()
714            .map(|(x,y)|(x.as_str(),y.as_str()))
715            .next()
716    }
717
718    /// Checks if a character is registered as a special character.
719    ///
720    /// Special characters include those explicitly added via `add_special`/`add_specials`,
721    /// as well as any characters used in delimiters or comment markers.
722    ///
723    /// # Arguments
724    ///
725    /// * `c` - The character to check
726    ///
727    /// # Returns
728    ///
729    /// `true` if the character is registered as a special character, `false` otherwise.
730    ///
731    /// # Examples
732    ///
733    /// ```
734    /// use tokenise::Tokeniser;
735    ///
736    /// let mut tokeniser = Tokeniser::new();
737    /// tokeniser.add_special("+").unwrap();
738    /// tokeniser.add_delimiter_pair("(", ")").unwrap();
739    ///
740    /// assert!(tokeniser.special("+")); // Explicitly added special
741    /// assert!(tokeniser.special("(")); // Special because it's a delimiter
742    /// assert!(!tokeniser.special("-")); // Not registered as special
743    /// ```
744    pub fn special(&self, c: &str) -> bool {
745        for x in self.specials() {
746            if x == c {
747                return true;
748            }
749        }
750        false
751    }
752    
753    /// Checks if a character is registered as a delimiter and returns its type.
754    ///
755    /// # Arguments
756    ///
757    /// * `c` - The character to check
758    ///
759    /// # Returns
760    ///
761    /// * `Some(Side::Left)` if the character is a left delimiter
762    /// * `Some(Side::Right)` if the character is a right delimiter
763    /// * `Some(Side::Bal)` if the character is a balanced delimiter
764    /// * `None` if the character is not a delimiter
765    ///
766    /// # Examples
767    ///
768    /// ```
769    /// use tokenise::{Tokeniser, Side};
770    ///
771    /// let mut tokeniser = Tokeniser::new();
772    /// tokeniser.add_delimiter_pair("(", ")").unwrap();
773    /// tokeniser.add_balanced_delimiter("\"").unwrap();
774    ///
775    /// assert_eq!(tokeniser.delimiter("("), Some(Side::Left));
776    /// assert_eq!(tokeniser.delimiter(")"), Some(Side::Right));
777    /// assert_eq!(tokeniser.delimiter("\""), Some(Side::Bal));
778    /// assert_eq!(tokeniser.delimiter("a"), None);
779    /// ```
780    pub fn delimiter<'g>(&self, c: &'g str) -> Option<Side> {
781        for (x,y) in self.lr_delimiters() {
782            if x == c {
783                return Some(Side::Left);
784            } 
785            if y == c {
786                return Some(Side::Right);
787            }
788        }
789        for x in self.bal_delimiters() {
790            if x == c {
791                return Some(Side::Bal);
792            }
793        }
794        None
795    }
796
797    /// Checks if a string is the configured single-line comment marker.
798    ///
799    /// # Arguments
800    ///
801    /// * `s` - The string to check
802    ///
803    /// # Returns
804    ///
805    /// `true` if the string exactly matches the configured single-line comment marker,
806    /// `false` otherwise or if no single-line comment marker is configured.
807    ///
808    /// # Examples
809    ///
810    /// ```
811    /// use tokenise::Tokeniser;
812    ///
813    /// let mut tokeniser = Tokeniser::new();
814    /// tokeniser.set_sl_comment("//").unwrap();
815    ///
816    /// assert!(tokeniser.is_sl_comment_start("//"));
817    /// assert!(!tokeniser.is_sl_comment_start("/"));
818    /// ```
819    pub fn is_sl_comment_start(&self, s: &str) -> bool {
820        match self.sl_comment() {
821            None => false,
822            Some(sl_comment) => s == sl_comment
823        }
824    }
825
826    /// Checks if a string ends with the configured single-line comment marker.
827    ///
828    /// This is used during tokenisation to detect when a series of special characters
829    /// transitions into a comment.
830    ///
831    /// # Arguments
832    ///
833    /// * `s` - The string to check
834    ///
835    /// # Returns
836    ///
837    /// `true` if the string ends with the configured single-line comment marker,
838    /// `false` otherwise or if no single-line comment marker is configured.
839    ///
840    /// # Examples
841    ///
842    /// ```
843    /// use tokenise::Tokeniser;
844    ///
845    /// let mut tokeniser = Tokeniser::new();
846    /// tokeniser.set_sl_comment("//").unwrap();
847    ///
848    /// assert!(tokeniser.ends_with_sl_comment_start("abc//"));
849    /// assert!(!tokeniser.ends_with_sl_comment_start("abc/"));
850    /// ```
851    pub fn ends_with_sl_comment_start(&self, s: &str) -> bool {
852        match self.sl_comment() {
853            None => false,
854            Some(sl_comment) => s.ends_with(sl_comment)
855        }
856    }
857
858    /// Checks if a string is the configured multi-line comment start marker.
859    ///
860    /// # Arguments
861    ///
862    /// * `s` - The string to check
863    ///
864    /// # Returns
865    ///
866    /// `true` if the string exactly matches the configured multi-line comment start marker,
867    /// `false` otherwise or if no multi-line comment marker is configured.
868    ///
869    /// # Examples
870    ///
871    /// ```
872    /// use tokenise::Tokeniser;
873    ///
874    /// let mut tokeniser = Tokeniser::new();
875    /// tokeniser.set_ml_comment("/*", "*/").unwrap();
876    ///
877    /// assert!(tokeniser.is_ml_comment_start("/*"));
878    /// assert!(!tokeniser.is_ml_comment_start("*/"));
879    /// ```
880    pub fn is_ml_comment_start(&self, s: &str) -> bool {
881        match self.ml_comment() {
882            None => false,
883            Some((start,_)) => s == start
884        }
885    }
886
887    /// Checks if a string ends with the configured multi-line comment start marker.
888    ///
889    /// This is used during tokenisation to detect when a series of special characters
890    /// transitions into a comment.
891    ///
892    /// # Arguments
893    ///
894    /// * `s` - The string to check
895    ///
896    /// # Returns
897    ///
898    /// `true` if the string ends with the configured multi-line comment start marker,
899    /// `false` otherwise or if no multi-line comment marker is configured.
900    ///
901    /// # Examples
902    ///
903    /// ```
904    /// use tokenise::Tokeniser;
905    ///
906    /// let mut tokeniser = Tokeniser::new();
907    /// tokeniser.set_ml_comment("/*", "*/").unwrap();
908    ///
909    /// assert!(tokeniser.ends_with_ml_comment_start("abc/*"));
910    /// assert!(!tokeniser.ends_with_ml_comment_start("abc/"));
911    /// ```
912    pub fn ends_with_ml_comment_start(&self, s: &str) -> bool {
913        match self.ml_comment() {
914            None => false,
915            Some((start,_)) => s.ends_with(start)
916        }
917    }
918
919    /// Checks if a string is the configured multi-line comment end marker.
920    ///
921    /// # Arguments
922    ///
923    /// * `s` - The string to check
924    ///
925    /// # Returns
926    ///
927    /// `true` if the string exactly matches the configured multi-line comment end marker,
928    /// `false` otherwise or if no multi-line comment marker is configured.
929    ///
930    /// # Examples
931    ///
932    /// ```
933    /// use tokenise::Tokeniser;
934    ///
935    /// let mut tokeniser = Tokeniser::new();
936    /// tokeniser.set_ml_comment("/*", "*/").unwrap();
937    ///
938    /// assert!(tokeniser.is_ml_comment_end("*/"));
939    /// assert!(!tokeniser.is_ml_comment_end("/*"));
940    /// ```
941    pub fn is_ml_comment_end(&self, s: &str) -> bool {
942        match self.ml_comment() {
943            None => false,
944            Some((_, end)) => s == end
945        }
946    }
947    
948    /// Tokenises a string according to the configured rules.
949    ///
950    /// This is the main method of the library, converting a string into a sequence of tokens
951    /// based on the special characters, delimiters, and comment markers that have been configured.
952    ///
953    /// # Arguments
954    ///
955    /// * `text` - The string to tokenise
956    ///
957    /// # Returns
958    ///
959    /// * `Ok(Vec<Token>)` - A vector of tokens if tokenisation was successful
960    /// * `Err(String)` - An error message if tokenisation failed
961    ///
962    /// # Examples
963    ///
964    /// ```
965    /// use tokenise::{Tokeniser, TokenState};
966    ///
967    /// let mut tokeniser = Tokeniser::new();
968    /// tokeniser.add_specials("+-*/=");
969    /// tokeniser.add_delimiter_pairs(&vec!["()", "[]"]).unwrap();
970    /// tokeniser.set_sl_comment("//").unwrap();
971    ///
972    /// let source = "x = 42; // The answer";
973    /// let tokens = tokeniser.tokenise(source).unwrap();
974    ///
975    /// // We can now work with the tokens
976    /// for token in &tokens {
977    ///     match token.get_state() {
978    ///         TokenState::Word => println!("Word: {}", token.value()),
979    ///         TokenState::SymbolString => println!("Symbol: {}", token.value()),
980    ///         TokenState::SLComment => println!("Comment: {}", token.value()),
981    ///         _ => println!("Other token: {}", token.value()),
982    ///     }
983    /// }
984    /// ```
985    pub fn tokenise<'g>(&self, text: &'g str) -> Result<Vec<Token<'g>>,String> {
986        let mut out: Vec<Token<'g>> = Vec::new();
987        let mut curr_start: usize = 0;
988        let mut curr_state : Option<TokenState> = None;
989        for (curr_pos,c) in text.grapheme_indices(true) {
990            match curr_state {
991                None => {
992                    if self.special(c) {
993                        if self.is_sl_comment_start(c) {
994                            curr_state = Some(SLComment);
995                            curr_start = curr_pos;
996                        } else if self.is_ml_comment_start(c) {
997                            curr_state = Some(MLComment);
998                            curr_start = curr_pos;
999                        } else {
1000                            curr_state = Some(SymbolString);
1001                            curr_start = curr_pos;
1002                            match self.delimiter(c) {
1003                                Some(Side::Left) => {
1004                                    out.push(Token { state: LDelimiter, val: c, start_pos: curr_pos });
1005                                    curr_state = None;
1006                                },
1007                                Some(Side::Right) => {
1008                                    out.push(Token { state: RDelimiter, val: c, start_pos: curr_pos });
1009                                    curr_state = None;
1010                                },
1011                                Some(Side::Bal) => {
1012                                    out.push(Token { state: BDelimiter, val: c, start_pos: curr_pos });
1013                                    curr_state = None;
1014                                }
1015                                None => {}
1016                            }
1017                        }
1018                    } else {
1019                        if c == "\n" || c == "\r" || c == "\r\n" {
1020                            out.push(Token {
1021                                state: NewLine,
1022                                val: c,
1023                                start_pos: curr_pos
1024                            });
1025                        } else if is_whitespace(c) {
1026                            curr_state = Some(WhiteSpace);
1027                        } else {
1028                            curr_state = Some(Word);
1029                        }
1030                        curr_start = curr_pos;
1031                    }
1032                },
1033                Some(Word) => {
1034                    if self.special(c) {
1035                        out.push(Token{
1036                            state: Word,
1037                            val: &text[curr_start..curr_pos],
1038                            start_pos: curr_start
1039                        });
1040                        
1041                        match self.delimiter(c) {
1042                            Some(Side::Left) => {
1043                                out.push(Token { state: LDelimiter, val: c, start_pos: curr_pos });
1044                                curr_state = None;
1045                            },
1046                            Some(Side::Right) => {
1047                                out.push(Token { state: RDelimiter, val: c, start_pos: curr_pos });
1048                                curr_state = None;
1049                            },
1050                            Some(Side::Bal) => {
1051                                out.push(Token { state: BDelimiter, val: c, start_pos: curr_pos });
1052                                curr_state = None;
1053                            },
1054                            None => {
1055                                curr_start = curr_pos;
1056                                curr_state = Some(SymbolString);
1057                            }
1058                        }
1059                    } else {
1060                        if is_whitespace(c) {
1061                            out.push(Token{
1062                                state: Word,
1063                                val: &text[curr_start..curr_pos],
1064                                start_pos: curr_start
1065                            });
1066                            if c == "\n" || c == "\r" || c == "\r\n" {
1067                                out.push(Token {
1068                                    state: NewLine,
1069                                    val: c,
1070                                    start_pos: curr_pos
1071                                });
1072                                curr_state = None;
1073                            } else {
1074                                curr_state = Some(WhiteSpace);
1075                                curr_start = curr_pos;
1076                            }
1077                        } else {
1078                        }
1079                    }
1080                },
1081                Some(SymbolString) => {
1082                    if !self.special(c) {
1083                        out.push(Token { state: SymbolString, val: &text[curr_start..curr_pos], start_pos: curr_start });
1084                        curr_start = curr_pos;
1085                        if is_whitespace(c) {
1086                            if c == "\n" || c == "\r" || c == "\r\n" {
1087                                out.push(Token {
1088                                    state: NewLine,
1089                                    val: c,
1090                                    start_pos: curr_pos
1091                                });
1092                                curr_state = None;
1093                            } else {
1094                                curr_state = Some(WhiteSpace);
1095                            }
1096                        } else {
1097                            curr_state = Some(Word);
1098                        }
1099                    } else {
1100                        let curr_str = &text[curr_start..curr_pos+c.len()];
1101                        if self.ends_with_sl_comment_start(curr_str) {
1102                            if self.is_sl_comment_start(curr_str) {
1103                                curr_state = Some(SLComment);
1104                            } else {
1105                                let new_start = curr_pos + c.len() - self.sl_comment().unwrap().len();
1106                                out.push(Token {
1107                                    state: SymbolString,
1108                                    val: &text[curr_start..new_start],
1109                                    start_pos: curr_start
1110                                });
1111                                curr_state = Some(SLComment);
1112                                curr_start = new_start;
1113                            }
1114                        } else if self.ends_with_ml_comment_start(curr_str) {
1115                            if self.is_ml_comment_start(curr_str) {
1116                                curr_state = Some(MLComment);
1117                            } else {
1118                                let new_start = curr_pos + c.len() - self.ml_comment().unwrap().0.len();
1119                                out.push(Token {
1120                                    state: SymbolString,
1121                                    val: &text[curr_start..new_start],
1122                                    start_pos: curr_start
1123                                });
1124                                curr_state = Some(MLComment);
1125                                curr_start = new_start;
1126                            }
1127                        }
1128                    }
1129                },
1130                Some(WhiteSpace) => {
1131                    if self.special(c) {
1132                        out.push(Token {
1133                            state: WhiteSpace,
1134                            val: &text[curr_start..curr_pos],
1135                            start_pos: curr_start
1136                        });
1137                        match self.delimiter(c) {
1138                            None => {
1139                                if self.is_sl_comment_start(c) {
1140                                    curr_state = Some(SLComment);
1141                                } else if self.is_ml_comment_start(c) {
1142                                    curr_state = Some(MLComment);
1143                                } else {
1144                                    curr_state = Some(SymbolString);
1145                                }
1146                                curr_start = curr_pos;
1147                            },
1148                            Some(Side::Left) => {
1149                                out.push(Token { state: LDelimiter, val: c, start_pos: curr_pos });
1150                                curr_state = None;
1151                            },
1152                            Some(Side::Right) => {
1153                                out.push(Token { state: RDelimiter, val: c, start_pos: curr_pos });
1154                                curr_state = None;
1155                            },
1156                            Some(Side::Bal) => {
1157                                out.push(Token { state: BDelimiter, val: c, start_pos: curr_pos });
1158                                curr_state = None;
1159                            }
1160                        }
1161                    } else {
1162                        if c == "\n" || c == "\r" || c == "\r\n" {
1163                            out.push(Token {
1164                                state: WhiteSpace,
1165                                val: &text[curr_start..curr_pos],
1166                                start_pos: curr_start
1167                            });
1168                            out.push(Token {
1169                                state: NewLine,
1170                                val: c,
1171                                start_pos: curr_pos
1172                            });
1173                            curr_state = None;
1174                        } else if !is_whitespace(c) {
1175                            out.push(Token {
1176                                state: WhiteSpace,
1177                                val: &text[curr_start..curr_pos],
1178                                start_pos: curr_start
1179                            });
1180                            curr_start = curr_pos;
1181                            curr_state = Some(Word);
1182                        }
1183                    }
1184                },
1185                Some(SLComment) => {
1186                    if c == "\n" || c == "\r" || c == "\r\n" {
1187                        out.push(Token {
1188                            state: SLComment,
1189                            val: &text[curr_start..curr_pos],
1190                            start_pos: curr_start
1191                        });
1192                        out.push(Token {
1193                            state: NewLine,
1194                            val: c,
1195                            start_pos: curr_pos
1196                        });
1197                        curr_state = None;
1198                    }
1199                },
1200                Some(MLComment) => {
1201                    let curr_str = &text[curr_start..curr_pos+(c.len())];
1202                    let end = match self.ml_comment() {
1203                        Some((_, e)) => Ok(e),
1204                        _ => Err("This should never happen".to_string())
1205                    }.unwrap();
1206                    if curr_str.ends_with(end) {
1207                        out.push(Token {
1208                            state: MLComment,
1209                            val: &text[curr_start..curr_pos+(c.len())],
1210                            start_pos: curr_start
1211                        });
1212                        curr_state = None;
1213                    }
1214                },
1215                other => {return Err(format!("curr_state should never reach {:?}",other))}
1216            }
1217        }
1218        if let Some(token) = out.last() {
1219            if token.value().len() + token.start() != text.len() {
1220                out.push(Token {
1221                    state: curr_state.unwrap(),
1222                    val: &text[curr_start..],
1223                    start_pos: curr_start
1224                });
1225            }
1226        }
1227        Ok(out)
1228    }
1229}
1230
1231
#[cfg(test)]
mod tests {
    use super::*;

    // Token accessors should echo back the fields they wrap.
    #[test]
    fn token_gets_work() {
        let token = Token{ state: Word, val:"hi", start_pos:4 };
        assert_eq!(token.start(), 4);
        assert_eq!(token.value(), "hi");
    }

    // Configuration methods should register specials, delimiter pairs,
    // balanced delimiters, and comment markers — including multi-byte
    // grapheme clusters (emoji, flags) used as markers.
    #[test]
    fn tokeniser_building_works() {
        let mut tokeniser = Tokeniser::new();
        tokeniser.add_specials("!@%πŸ‘¨β€πŸ’»*");
        tokeniser.add_delimiter_pairs(&vec!["<>", "()", "{}", "πŸ‡ΊπŸ‡ΈπŸ‘‹πŸ½"]).unwrap();
        // NOTE(review): this uses `add_balanced_delimiters` (plural) while the
        // crate docs and the test below use `add_balanced_delimiter` — confirm
        // both names exist and agree.
        tokeniser.add_balanced_delimiters("\"").unwrap();
        tokeniser.set_sl_comment("//").unwrap();
        tokeniser.set_ml_comment("/*","*/").unwrap();
        // Delimiter and comment-marker graphemes are folded into specials;
        // "//" and "/*" contribute a single "/" entry.
        assert_eq!(tokeniser.specials(),vec!["!", "@", "%", "πŸ‘¨β€πŸ’»", "*", "<", ">", "(", ")", "{", "}", "πŸ‡ΊπŸ‡Έ", "πŸ‘‹πŸ½", "\"", "/"]);
        assert_eq!(tokeniser.lr_delimiters(),vec![("<",">"),("(",")"),("{","}"),("πŸ‡ΊπŸ‡Έ","πŸ‘‹πŸ½")]);
        assert_eq!(tokeniser.bal_delimiters(),vec!["\""]);
        assert_eq!(tokeniser.sl_comment(),Some("//"));
        assert_eq!(tokeniser.ml_comment(),Some(("/*","*/")));
    }

    // End-to-end tokenisation over mixed ASCII/Unicode input.
    // All `start_pos` values below are BYTE offsets, which is why positions
    // jump by more than one around multi-byte graphemes (e.g. "Β£*" is 3 bytes,
    // the πŸ‡ΊπŸ‡Έ…πŸ‘‹πŸ½ comment spans 27 bytes).
    #[test]
    fn tokeniser_tokenise_works() {
        let source = " hi, skdjfs;;    842\t 39fsl == 3\n \
        what's going on? idk... \n\
        fire___sldfksfl // what's going on? \n\
        idk what I'm πŸ‡ΊπŸ‡Έdoing \n\
        \n\
         nowπŸ‘‹πŸ½ hi Β£*$*@ \n\
        help!\n\
        \"hello\"hi";
        let mut tokeniser = Tokeniser::new();
        tokeniser.add_delimiter_pairs(&vec!["()","[]"]).unwrap();
        tokeniser.add_balanced_delimiter("\"").unwrap();
        tokeniser.set_sl_comment("//").unwrap();
        // Multi-line comments delimited by flag/wave emoji to exercise
        // multi-byte grapheme markers.
        tokeniser.set_ml_comment("πŸ‡ΊπŸ‡Έ","πŸ‘‹πŸ½").unwrap();
        tokeniser.add_specials(",;=?.'Β£<>@*");
        assert_eq!(tokeniser.tokenise(source).unwrap(),vec![
            Token { state: WhiteSpace, val: " ", start_pos: 0 },
            Token { state: Word, val: "hi", start_pos: 1 },
            Token { state: SymbolString, val: ",", start_pos: 3 },
            Token { state: WhiteSpace, val: " ", start_pos: 4 },
            Token { state: Word, val: "skdjfs", start_pos: 5 },
            Token { state: SymbolString, val: ";;", start_pos: 11 },
            Token { state: WhiteSpace, val: "    ", start_pos: 13 },
            Token { state: Word, val: "842", start_pos: 17 },
            Token { state: WhiteSpace, val: "\t ", start_pos: 20 },
            Token { state: Word, val: "39fsl", start_pos: 22 },
            Token { state: WhiteSpace, val: " ", start_pos: 27 },
            Token { state: SymbolString, val: "==", start_pos: 28 },
            Token { state: WhiteSpace, val: " ", start_pos: 30 },
            Token { state: Word, val: "3", start_pos: 31 },
            Token { state: NewLine, val: "\n", start_pos: 32 },
            Token { state: WhiteSpace, val: " ", start_pos: 33 },
            Token { state: Word, val: "what", start_pos: 34 },
            Token { state: SymbolString, val: "'", start_pos: 38 },
            Token { state: Word, val: "s", start_pos: 39 },
            Token { state: WhiteSpace, val: " ", start_pos: 40 },
            Token { state: Word, val: "going", start_pos: 41 },
            Token { state: WhiteSpace, val: " ", start_pos: 46 },
            Token { state: Word, val: "on", start_pos: 47 },
            Token { state: SymbolString, val: "?", start_pos: 49 },
            Token { state: WhiteSpace, val: " ", start_pos: 50 },
            Token { state: Word, val: "idk", start_pos: 51 },
            Token { state: SymbolString, val: "...", start_pos: 54 },
            Token { state: WhiteSpace, val: " ", start_pos: 57 },
            Token { state: NewLine, val: "\n", start_pos: 58 },
            Token { state: Word, val: "fire___sldfksfl", start_pos: 59 },
            Token { state: WhiteSpace, val: " ", start_pos: 74 },
            // Single-line comment runs to (but not including) the newline.
            Token { state: SLComment, val: "// what's going on? ", start_pos: 75 },
            Token { state: NewLine, val: "\n", start_pos: 95 },
            Token { state: Word, val: "idk", start_pos: 96 },
            Token { state: WhiteSpace, val: " ", start_pos: 99 },
            Token { state: Word, val: "what", start_pos: 100 },
            Token { state: WhiteSpace, val: " ", start_pos: 104 },
            Token { state: Word, val: "I", start_pos: 105 },
            Token { state: SymbolString, val: "'", start_pos: 106 },
            Token { state: Word, val: "m", start_pos: 107 },
            Token { state: WhiteSpace, val: " ", start_pos: 108 },
            // Multi-line comment swallows newlines until the end marker.
            Token { state: MLComment, val: "πŸ‡ΊπŸ‡Έdoing \n\nnowπŸ‘‹πŸ½", start_pos: 109 },
            Token { state: WhiteSpace, val: " ", start_pos: 136 },
            Token { state: Word, val: "hi", start_pos: 137 },
            Token { state: WhiteSpace, val: " ", start_pos: 139 },
            Token { state: SymbolString, val: "Β£*", start_pos: 140 },
            // '$' is not in the specials list, so it tokenises as a Word.
            Token { state: Word, val: "$", start_pos: 143 },
            Token { state: SymbolString, val: "*@", start_pos: 144 },
            Token { state: WhiteSpace, val: " ", start_pos: 146 },
            Token { state: NewLine, val: "\n", start_pos: 147 },
            // '!' is likewise not special here, so it stays in the word.
            Token { state: Word, val: "help!", start_pos: 148 },
            Token { state: NewLine, val: "\n", start_pos: 153 },
            Token { state: BDelimiter, val: "\"", start_pos: 154 },
            Token { state: Word, val: "hello", start_pos: 155 },
            Token { state: BDelimiter, val: "\"", start_pos: 160 },
            Token { state: Word, val: "hi", start_pos: 161 }
        ]);
    }

    // "\r\n" is a single grapheme cluster and must produce ONE NewLine token;
    // bare "\n" and bare "\r" each produce their own.
    #[test]
    fn test_tokeniser_tokenise_newline_handling() {
        let source = "hi   \r\n\n\rhiagain";
        let tokens = vec![
            Token { state: Word, val: "hi", start_pos: 0 },
            Token { state: WhiteSpace, val: "   ", start_pos: 2 },
            Token { state: NewLine, val: "\r\n", start_pos: 5 },
            Token { state: NewLine, val: "\n", start_pos: 7 },
            Token { state: NewLine, val: "\r", start_pos: 8},
            Token { state: Word, val: "hiagain", start_pos: 9}
        ];
        let tokeniser = Tokeniser::new();
        assert_eq!(tokeniser.tokenise(source).unwrap(), tokens);
    }
}