Skip to main content

melodium_lang/text/
word.rs

1//! Module in charge of textual words parsing and analysis.
2//!
3//! This module contains low-level functions doing parsing and analysis of text, as well as [word elements](Word), the smallest unit of text that can be parsed.
//! All functions here are Unicode-aware.
5
6use core::fmt::{Display, Formatter};
7use melodium_engine::designer::Reference;
8use regex::Regex;
9use std::str;
10use std::sync::Arc;
11
12/// Word, smallest unit of parsed text.
13///
14/// This structure embeds informations about a word, that can be anything like a name `MyFashionName`, value `12.345`, or any symbol like parenthesis, bracket, comma, etc.
15#[derive(Debug, Clone, Hash, PartialEq, Eq)]
16pub struct Word {
17    /// Literal text of the word.
18    pub text: String,
19    /// Kind of the word, may be None if the word is of an unknown kind.
20    pub kind: Option<Kind>,
21    /// Position of the word in the file.
22    pub position: Position,
23}
24
25impl Default for Word {
26    fn default() -> Self {
27        Word {
28            text: String::new(),
29            kind: None,
30            position: Position::default(),
31        }
32    }
33}
34
/// Location of a word or element inside a text.
///
/// # Note
/// Both `absolute_position` and `line_position` are byte indexes, not char counts.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash)]
pub struct Position {
    /// Byte index of the word, counted from the beginning of the text script.
    pub absolute_position: usize,
    /// Number of the line holding the word (the first line is line 1).
    pub line_number: usize,
    /// Byte index of the word within its line, zero being the first char after `'\n'`.
    pub line_position: usize,
}
48
49#[derive(Default, Debug, Clone, Hash, PartialEq, Eq)]
50pub struct PositionnedString {
51    pub string: String,
52    pub position: Position,
53}
54
55impl Reference for PositionnedString {}
56
57impl PositionnedString {
58    pub fn remove_indent(&mut self) {
59        let mut prefix = None;
60        for line in self.string.lines() {
61            let trimmed_line = line.trim_start();
62            if !trimmed_line.is_empty() {
63                let whitespaces = line.split_at(line.find(trimmed_line).unwrap()).0;
64                prefix = Some(whitespaces.to_string());
65                break;
66            }
67        }
68
69        if let Some(prefix) = prefix {
70            let mut less_indented_string = String::new();
71            for line in self.string.lines() {
72                less_indented_string.push_str(line.strip_prefix(&prefix).unwrap_or(line));
73                less_indented_string.push_str("\n");
74            }
75            self.string = less_indented_string;
76        }
77    }
78
79    pub fn into_ref(&self) -> Arc<dyn Reference> {
80        Arc::new(self.clone())
81    }
82}
83
84impl From<&Word> for PositionnedString {
85    fn from(word: &Word) -> Self {
86        Self {
87            string: word.text.clone(),
88            position: word.position.clone(),
89        }
90    }
91}
92
/// Kind of word.
///
/// The "kind" tells what a word fundamentally is: a `Name` is some text naming something
/// (keywords included), `Opening*` and `Closing*` are the matching delimiters, `Equal`, `Colon`,
/// `Comma`, etc. are the corresponding symbols.
///
/// A few "special" kinds, like `Comment`, `Annotation`, or `RightArrow`, exist because they
/// designate very specific patterns of text that are easy and cheap to identify, and are
/// considered as single elements by all other parsing steps.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum Kind {
    /// A comment, either `//…` or `/* … */`.
    Comment,
    /// An annotation, anything like `#…`.
    Annotation,
    /// The `(` symbol.
    OpeningParenthesis,
    /// The `)` symbol.
    ClosingParenthesis,
    /// The `{` symbol.
    OpeningBrace,
    /// The `}` symbol.
    ClosingBrace,
    /// The `[` symbol.
    OpeningBracket,
    /// The `]` symbol.
    ClosingBracket,
    /// The `<` symbol.
    OpeningChevron,
    /// The `>` symbol.
    ClosingChevron,
    /// The `=` symbol.
    Equal,
    /// The `:` symbol.
    Colon,
    /// The `,` symbol.
    Comma,
    /// The `.` symbol.
    Dot,
    /// The `/` symbol.
    Slash,
    /// The `_` symbol.
    Underscore,
    /// The `+` symbol.
    Plus,
    /// An arrow made of one or more `-` terminated by `>`, like `--->`.
    RightArrow,
    /// Anything corresponding to a name, meaning anything composed of letters (Unicode
    /// definition) or numbers, but not starting with a number.
    Name,
    /// Same thing as `Name`, but with `@` in first place.
    Context,
    /// Same thing as `Name`, but with `|` in first place.
    Function,
    /// Anything matching a number, optionally starting with `-`, made of an arbitrary number of
    /// digits with at most one point `.` inside.
    Number,
    /// Any string starting and ending with `"` (preserving `\"` and `\\`), or delimited by
    /// incremental braces with `${` and `}`.
    String,
    /// A character enclosed by `'`.
    Character,
    /// A byte written as `0xFF`, `FF` being any two hexadecimal digits from `[0-9A-F]`.
    Byte,
}
151
152impl Display for Kind {
153    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
154        let str = match self {
155            Kind::Comment => "// Comment",
156            Kind::Annotation => "# Annotation",
157            Kind::OpeningParenthesis => "(",
158            Kind::ClosingParenthesis => ")",
159            Kind::OpeningBrace => "{",
160            Kind::ClosingBrace => "}",
161            Kind::OpeningBracket => "[",
162            Kind::ClosingBracket => "]",
163            Kind::OpeningChevron => "<",
164            Kind::ClosingChevron => ">",
165            Kind::Equal => "=",
166            Kind::Colon => ":",
167            Kind::Comma => ",",
168            Kind::Dot => ".",
169            Kind::Slash => "/",
170            Kind::Underscore => "_",
171            Kind::Plus => "+",
172            Kind::RightArrow => "->",
173            Kind::Name => "name",
174            Kind::Context => "context (@Context)",
175            Kind::Function => "function (|function)",
176            Kind::Number => "number",
177            Kind::String => r#"string ("string")"#,
178            Kind::Character => "character ('c')",
179            Kind::Byte => "byte (0x2A)",
180        };
181        write!(f, "{}", str)
182    }
183}
184
/// Convenience structure for internal treatments.
///
/// Result of checking whether a text starts with a word of a given kind, as a named structure
/// instead of a tuple.
///
/// The [`Default`] value (`false`, `0`, `false`) means "not that kind".
#[derive(Debug, Default)]
struct KindCheck {
    /// Whether the text starts with a word of the checked kind.
    pub is_that_kind: bool,
    /// Byte index at which the word ends (exclusive), relative to the start of the checked text.
    pub end_at: usize,
    /// Whether the word is complete and valid; a word can be of the checked kind yet malformed
    /// (an unterminated string, for instance).
    pub is_well_formed: bool,
}
204
205/// Make primary parsing of text, and return words inside it.
206///
207/// Returns a list of [words](Word) contained inside the text, as `Ok` if parsing went without error (implying every word has an associated kind), or as `Err` if something hasn't been recognized (the last word will be the erroneous one, and may be without kind).
208///
209/// See [expect_word] and [expect_word_kind] for example of usage.
210pub fn get_words(script: &str) -> Result<Vec<Word>, Vec<Word>> {
211    let mut words = Vec::new();
212    let mut remaining_script = script.trim_start();
213    let mut actual_position = script.len() - remaining_script.len();
214    let mut kind_check: KindCheck;
215
216    while !remaining_script.is_empty() {
217        let kind: Option<Kind>;
218
219        // Check if word is Comment.
220        if {
221            kind_check = manage_comment(remaining_script);
222            kind_check.is_that_kind
223        } {
224            kind = Some(Kind::Comment);
225        }
226        // Check if word is Annotation
227        else if {
228            kind_check = manage_annotation(remaining_script);
229            kind_check.is_that_kind
230        } {
231            kind = Some(Kind::Annotation);
232        }
233        // Check if word is OpeningParenthesis
234        else if {
235            kind_check = manage_single_char('(', remaining_script);
236            kind_check.is_that_kind
237        } {
238            kind = Some(Kind::OpeningParenthesis);
239        }
240        // Check if word is ClosingParenthesis
241        else if {
242            kind_check = manage_single_char(')', remaining_script);
243            kind_check.is_that_kind
244        } {
245            kind = Some(Kind::ClosingParenthesis);
246        }
247        // Check if word is OpeningBrace
248        else if {
249            kind_check = manage_single_char('{', remaining_script);
250            kind_check.is_that_kind
251        } {
252            kind = Some(Kind::OpeningBrace);
253        }
254        // Check if word is ClosingBrace
255        else if {
256            kind_check = manage_single_char('}', remaining_script);
257            kind_check.is_that_kind
258        } {
259            kind = Some(Kind::ClosingBrace);
260        }
261        // Check if word is OpeningBracket
262        else if {
263            kind_check = manage_single_char('[', remaining_script);
264            kind_check.is_that_kind
265        } {
266            kind = Some(Kind::OpeningBracket);
267        }
268        // Check if word is ClosingBracket
269        else if {
270            kind_check = manage_single_char(']', remaining_script);
271            kind_check.is_that_kind
272        } {
273            kind = Some(Kind::ClosingBracket);
274        }
275        // Check if word is OpeningChevron
276        else if {
277            kind_check = manage_single_char('<', remaining_script);
278            kind_check.is_that_kind
279        } {
280            kind = Some(Kind::OpeningChevron);
281        }
282        // Check if word is ClosingChevron
283        else if {
284            kind_check = manage_single_char('>', remaining_script);
285            kind_check.is_that_kind
286        } {
287            kind = Some(Kind::ClosingChevron);
288        }
289        // Check if word is Equal
290        else if {
291            kind_check = manage_single_char('=', remaining_script);
292            kind_check.is_that_kind
293        } {
294            kind = Some(Kind::Equal);
295        }
296        // Check if word is Colon
297        else if {
298            kind_check = manage_single_char(':', remaining_script);
299            kind_check.is_that_kind
300        } {
301            kind = Some(Kind::Colon);
302        }
303        // Check if word is Comma
304        else if {
305            kind_check = manage_single_char(',', remaining_script);
306            kind_check.is_that_kind
307        } {
308            kind = Some(Kind::Comma);
309        }
310        // Check if word is Dot
311        else if {
312            kind_check = manage_single_char('.', remaining_script);
313            kind_check.is_that_kind
314        } {
315            kind = Some(Kind::Dot);
316        }
317        // Check if word is Slash
318        else if {
319            kind_check = manage_single_char('/', remaining_script);
320            kind_check.is_that_kind
321        } {
322            kind = Some(Kind::Slash);
323        }
324        // Check if word is Underscore
325        else if {
326            kind_check = manage_single_char('_', remaining_script);
327            kind_check.is_that_kind
328        } {
329            kind = Some(Kind::Underscore);
330        }
331        // Check if word is Plus
332        else if {
333            kind_check = manage_single_char('+', remaining_script);
334            kind_check.is_that_kind
335        } {
336            kind = Some(Kind::Plus);
337        }
338        // Check if word is RightArrow
339        else if {
340            kind_check = manage_right_arrow(remaining_script);
341            kind_check.is_that_kind
342        } {
343            kind = Some(Kind::RightArrow);
344        }
345        // Check if word is Name
346        else if {
347            kind_check = manage_name(remaining_script);
348            kind_check.is_that_kind
349        } {
350            kind = Some(Kind::Name);
351        }
352        // Check if word is Context
353        else if {
354            kind_check = manage_context(remaining_script);
355            kind_check.is_that_kind
356        } {
357            kind = Some(Kind::Context);
358        }
359        // Check if word is Function
360        else if {
361            kind_check = manage_function(remaining_script);
362            kind_check.is_that_kind
363        } {
364            kind = Some(Kind::Function);
365        }
366        // Check if word is Byte
367        else if {
368            kind_check = manage_byte(remaining_script);
369            kind_check.is_that_kind
370        } {
371            kind = Some(Kind::Byte);
372        }
373        // Check if word is Number
374        else if {
375            kind_check = manage_number(remaining_script);
376            kind_check.is_that_kind
377        } {
378            kind = Some(Kind::Number);
379        }
380        // Check if word is String
381        else if {
382            kind_check = manage_string(remaining_script);
383            kind_check.is_that_kind
384        } {
385            kind = Some(Kind::String);
386        }
387        // Check if word is Char
388        else if {
389            kind_check = manage_char(remaining_script);
390            kind_check.is_that_kind
391        } {
392            kind = Some(Kind::Character);
393        }
394        // The word is unknown
395        else {
396            kind_check = KindCheck {
397                is_that_kind: false,
398                end_at: 1,
399                is_well_formed: false,
400            };
401            kind = None;
402        }
403
404        if let Some(splitted_script) = remaining_script.split_at_checked(kind_check.end_at) {
405            let (line, pos_in_line) = get_line_pos(script, actual_position);
406            let word = Word {
407                text: splitted_script.0.to_string(),
408                position: Position {
409                    absolute_position: actual_position,
410                    line_position: pos_in_line,
411                    line_number: line,
412                },
413                kind: kind,
414            };
415
416            words.push(word);
417
418            if !kind_check.is_well_formed {
419                return Err(words);
420            } else {
421                let after_word = splitted_script.1.trim_start();
422                actual_position += remaining_script.len() - after_word.len();
423                remaining_script = after_word;
424            }
425        } else {
426            return Err(words);
427        }
428    }
429
430    Ok(words)
431}
432
/// Computes the line number (starting at 1) and the byte offset within that line of the byte
/// index `pos` in `text`.
///
/// Panics if `pos` is past the end of `text` or does not lie on a char boundary.
fn get_line_pos(text: &str, pos: usize) -> (usize, usize) {
    let before = &text[..pos];

    // Every newline before `pos` starts a new line.
    let line_number = before.matches('\n').count() + 1;
    // The current line begins right after the last newline, or at the start of the text.
    let line_start = before.rfind('\n').map_or(0, |newline| newline + 1);

    (line_number, pos - line_start)
}
451
452fn manage_comment(text: &str) -> KindCheck {
453    if text.starts_with("//") {
454        let end_of_comment = text.find('\n');
455        KindCheck {
456            is_that_kind: true,
457            end_at: end_of_comment.unwrap_or_else(|| text.len()),
458            is_well_formed: true,
459        }
460    } else if text.starts_with("/*") {
461        let end_of_comment = text.find("*/");
462        KindCheck {
463            is_that_kind: true,
464            end_at: end_of_comment.unwrap_or_else(|| text.len()) + 2,
465            is_well_formed: end_of_comment.is_some(),
466        }
467    } else {
468        KindCheck::default()
469    }
470}
471
472fn manage_annotation(text: &str) -> KindCheck {
473    if text.starts_with('#') {
474        let end_of_annotation = text.find('\n');
475        KindCheck {
476            is_that_kind: true,
477            end_at: end_of_annotation.unwrap_or_else(|| text.len()),
478            is_well_formed: true,
479        }
480    } else {
481        KindCheck::default()
482    }
483}
484
485fn manage_single_char(c: char, text: &str) -> KindCheck {
486    if text.starts_with(c) {
487        KindCheck {
488            is_that_kind: true,
489            end_at: 1,
490            is_well_formed: true,
491        }
492    } else {
493        KindCheck::default()
494    }
495}
496
497fn manage_right_arrow(text: &str) -> KindCheck {
498    lazy_static! {
499        static ref REGEX_RIGHT_ARROW: Regex = Regex::new(r"^-+>").unwrap();
500    }
501    let mat = REGEX_RIGHT_ARROW.find(text);
502    if mat.is_some() {
503        KindCheck {
504            is_that_kind: true,
505            end_at: mat.unwrap().end(),
506            is_well_formed: true,
507        }
508    } else {
509        KindCheck::default()
510    }
511}
512
513fn manage_name(text: &str) -> KindCheck {
514    lazy_static! {
515        static ref REGEX_NAME: Regex =
516            Regex::new(r"^[\p{Alphabetic}\p{M}\p{Pc}\p{Join_Control}]\w*").unwrap();
517    }
518    let mat = REGEX_NAME.find(text);
519    if mat.is_some() {
520        KindCheck {
521            is_that_kind: true,
522            end_at: mat.unwrap().end(),
523            is_well_formed: true,
524        }
525    } else {
526        KindCheck::default()
527    }
528}
529
530fn manage_context(text: &str) -> KindCheck {
531    lazy_static! {
532        static ref REGEX_CONTEXT: Regex =
533            Regex::new(r"^@[\p{Alphabetic}\p{M}\p{Pc}\p{Join_Control}]\w*").unwrap();
534    }
535    let mat = REGEX_CONTEXT.find(text);
536    if mat.is_some() {
537        KindCheck {
538            is_that_kind: true,
539            end_at: mat.unwrap().end(),
540            is_well_formed: true,
541        }
542    } else {
543        KindCheck::default()
544    }
545}
546
547fn manage_function(text: &str) -> KindCheck {
548    lazy_static! {
549        static ref REGEX_CONTEXT: Regex =
550            Regex::new(r"^\|[\p{Alphabetic}\p{M}\p{Pc}\p{Join_Control}]\w*").unwrap();
551    }
552    let mat = REGEX_CONTEXT.find(text);
553    if mat.is_some() {
554        KindCheck {
555            is_that_kind: true,
556            end_at: mat.unwrap().end(),
557            is_well_formed: true,
558        }
559    } else {
560        KindCheck::default()
561    }
562}
563
564fn manage_number(text: &str) -> KindCheck {
565    lazy_static! {
566        static ref REGEX_NUMBER: Regex = Regex::new(r"^-?[0-9]*\.?[0-9]+").unwrap();
567    }
568    let mat = REGEX_NUMBER.find(text);
569    if mat.is_some() {
570        KindCheck {
571            is_that_kind: true,
572            end_at: mat.unwrap().end(),
573            is_well_formed: true,
574        }
575    } else {
576        KindCheck::default()
577    }
578}
579
580fn manage_string(text: &str) -> KindCheck {
581    lazy_static! {
582        static ref REGEX_STRING: Regex = Regex::new(r##"^"(?:[^"\\]|\\.)*""##).unwrap();
583    }
584    if text.starts_with('"') {
585        let mat = REGEX_STRING.find(text);
586        if mat.is_some() {
587            KindCheck {
588                is_that_kind: true,
589                end_at: mat.unwrap().end(),
590                is_well_formed: true,
591            }
592        } else {
593            KindCheck {
594                is_that_kind: true,
595                end_at: text.len(),
596                is_well_formed: false,
597            }
598        }
599    } else if text.starts_with("${") {
600        let num_braces = text.chars().skip(1).take_while(|c| *c == '{').count();
601        let mut end_braces: String = "}".into();
602        for _ in 1..num_braces {
603            end_braces.push('}');
604        }
605        if let Some(end_string_position) = text.find(&end_braces) {
606            KindCheck {
607                is_that_kind: true,
608                end_at: end_string_position + num_braces,
609                is_well_formed: true,
610            }
611        } else {
612            KindCheck {
613                is_that_kind: true,
614                end_at: text.len(),
615                is_well_formed: false,
616            }
617        }
618    } else {
619        KindCheck::default()
620    }
621}
622
623fn manage_char(text: &str) -> KindCheck {
624    lazy_static! {
625        static ref REGEX_CHAR: Regex = Regex::new(r##"^'(?:[^'\\]|\.)+'"##).unwrap();
626    }
627    if text.starts_with('\'') {
628        let mat = REGEX_CHAR.find(text);
629        if mat.is_some() {
630            KindCheck {
631                is_that_kind: true,
632                end_at: mat.unwrap().end(),
633                is_well_formed: true,
634            }
635        } else {
636            KindCheck {
637                is_that_kind: true,
638                end_at: text.len(),
639                is_well_formed: false,
640            }
641        }
642    } else {
643        KindCheck::default()
644    }
645}
646
647fn manage_byte(text: &str) -> KindCheck {
648    lazy_static! {
649        static ref REGEX_BYTE: Regex = Regex::new(r##"^(?:0x[0-9A-F]{2})"##).unwrap();
650    }
651    if text.starts_with("0x") {
652        let mat = REGEX_BYTE.find(text);
653        if mat.is_some() {
654            KindCheck {
655                is_that_kind: true,
656                end_at: mat.unwrap().end(),
657                is_well_formed: true,
658            }
659        } else {
660            KindCheck {
661                is_that_kind: true,
662                end_at: text.len(),
663                is_well_formed: false,
664            }
665        }
666    } else {
667        KindCheck::default()
668    }
669}
670
#[cfg(test)]
mod tests {

    use super::*;

    /// Parses `script`, which must be entirely well-formed, and tells for each word found
    /// whether it is of the `expected` kind.
    fn kind_flags(script: &str, expected: Kind) -> Vec<bool> {
        get_words(script)
            .unwrap()
            .into_iter()
            .map(|word| word.kind == Some(expected))
            .collect()
    }

    #[test]
    fn test_well_formated_comments() {
        let comments = "// A comment
        //Anoter comment
        Not_a_comment
        /*A continuous comment*/
        /* A
         * quite
         * long
         * comment
         */
        /* A shorter comment */";

        let expected = vec![true, true, false, true, true, true];

        assert_eq!(expected, kind_flags(comments, Kind::Comment));
    }

    #[test]
    fn test_well_formated_numbers() {
        let numbers = "0
        -12
        1.234
        Not_a_number
        -1.234
        -0
        00000000000000000000000000000";

        let expected = vec![true, true, true, false, true, true, true];

        assert_eq!(expected, kind_flags(numbers, Kind::Number));
    }
}
713}