slj 0.6.6

Programski jezik v slovenščini | A programming language in Slovenian
Documentation
use std::{fmt::Debug, hash::Hash};
use regex::Regex;

#[derive(Debug, Clone, Copy, PartialEq)]
pub enum Token<'a> {
    Ločilo      (&'a str, usize, usize),
    Operator    (&'a str, usize, usize),
    Rezerviranka(&'a str, usize, usize),
    Ime         (&'a str, usize, usize),
    Tip         (&'a str, usize, usize),
    Literal     (L<'a>),
    Neznano     (&'a str, usize, usize),
}

#[derive(Debug, Clone, Copy, PartialEq)]
pub enum L<'a> {
    Bool(&'a str, usize, usize),
    Celo(&'a str, usize, usize),
    Real(&'a str, usize, usize),
    Znak(&'a str, usize, usize),
    Niz (&'a str, usize, usize),
}

fn bool<'a>(val: &'a str, v: usize, z: usize) -> Token {
    use Token::*;
    use L::*;
    Literal(Bool(val, v, z))
}

fn celo<'a>(val: &'a str, v: usize, z: usize) -> Token {
    use Token::*;
    use L::*;
    Literal(Celo(val, v, z))
}

fn real<'a>(val: &'a str, v: usize, z: usize) -> Token {
    use Token::*;
    use L::*;
    Literal(Real(val, v, z))
}

fn znak<'a>(val: &'a str, v: usize, z: usize) -> Token {
    use Token::*;
    use L::*;
    Literal(Znak(val, v, z))
}

fn niz<'a>(val: &'a str, v: usize, z: usize) -> Token {
    use Token::*;
    use L::*;
    Literal(Niz(val, v, z))
}

impl<'a> Token<'a> {
    pub fn lokacija(&self) -> (usize, usize) {
        use Token::*;
        use L::*;
        match self {
            Operator(_, v, z) | Ločilo(_, v, z) | Rezerviranka(_, v, z) | Ime(_, v, z) | Tip(_, v, z) | Neznano(_, v, z) => (*v, *z),
            Token::Literal(literal) => match literal {
                Bool(_, v, z) | Celo(_, v, z) | Real(_, v, z) | Znak(_, v, z) | Niz(_, v, z) => (*v, *z),
            }
        }
    }

    pub fn lokacija_str(&self) -> String {
        use Token::*;
        use L::*;
        match self {
            Operator(.., v, z) | Ločilo(.., v, z) | Rezerviranka(.., v, z) | Ime(.., v, z) | Tip(.., v, z) | Neznano(.., v, z) => format!("[{v}, {z})"),
            Token::Literal(literal) => match literal {
                Bool(.., v, z) | Celo(.., v, z) | Real(.., v, z) | Znak(.., v, z) | Niz(.., v, z) => format!("[{v}, {z})"),
            }
        }
    }

    pub fn len(&self) -> usize {
        use Token::*;
        use L::*;
        match self {
            Operator(val, ..) | Ločilo(val, ..) | Rezerviranka(val, ..) | Ime(val, ..) | Tip(val, ..) | Neznano(val, ..) => val.len(),
            Token::Literal(literal) => match literal {
                Bool(val, ..) | Celo(val, ..) | Real(val, ..) | Znak(val, ..) | Niz(val, ..) => val.len(),
            }
        }
    }

    pub fn as_str(&self) -> &'a str {
        use Token::*;
        use L::*;
        match self {
            Operator(val, ..) | Ločilo(val, ..) | Rezerviranka(val, ..) | Ime(val, ..) | Tip(val, ..) | Neznano(val, ..) => val,
            Token::Literal(literal) => match literal {
                Bool(val, ..) | Celo(val, ..) | Real(val, ..) | Znak(val, ..) | Niz(val, ..) => val,
            }
        }
    }
}

impl ToString for Token<'_> {
    fn to_string(&self) -> String {
        use Token::*;
        use L::*;
        match self {
            Operator(val, ..) | Ločilo(val, ..) | Rezerviranka(val, ..) | Ime(val, ..) | Tip(val, ..) | Neznano(val, ..) => val.to_string(),
            Token::Literal(literal) => match literal {
                Bool(val, ..) | Celo(val, ..) | Real(val, ..) | Znak(val, ..) | Niz(val, ..) => val.to_string(),
            }
        }
    }
}

impl Hash for Token<'_> {
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        use Token::*;
        use L::*;
        match self {
            Operator(val, ..) | Ločilo(val, ..) | Rezerviranka(val, ..) | Ime(val, ..) | Tip(val, ..) | Neznano(val, ..) => val.hash(state),
            Token::Literal(literal) => match literal {
                Bool(val, ..) | Celo(val, ..) | Real(val, ..) | Znak(val, ..) | Niz(val, ..) => val.hash(state),
            }
        }
    }
}

pub struct Tokenizer;

pub trait Tokenize<'a> {
    fn tokenize(&'a self) -> Vec<Token<'a>>;
}

impl<'a> Tokenize<'a> for &str {
    fn tokenize(&'a self) -> Vec<Token<'a>> {
        Tokenizer::tokenize(self)
    }
}


impl<'a> Tokenizer {

    pub fn tokenize(tekst: &'a str) -> Vec<Token> {
        use Token::*;

        const ZADNJA_MEJA: &str = r#"(?:["=<>|&!+\-*/%^@,;:#\n(){}\[\]]|\b|$)"#;
        const PRESLEDEK: &str = r"([^\S\n]*)";

        let regexi: Vec<(Regex, fn(&'a str, usize, usize) -> Token<'a>)> = vec![
            (Regex::new(&format!(r"^{PRESLEDEK}(naj|čene|če|dokler|za|funkcija|vrni|prekini){ZADNJA_MEJA}")).unwrap(), Rezerviranka),
            (Regex::new(&format!(r"^{PRESLEDEK}(brez|bool|celo|real|znak){ZADNJA_MEJA}")).unwrap(), Tip),
            (Regex::new(&format!(r"^{PRESLEDEK}(resnica|laž){ZADNJA_MEJA}")).unwrap(), bool),
            (Regex::new(&format!(r"^{PRESLEDEK}('(.|\\[\\nrt'])')")).unwrap(), znak),
            (Regex::new(&format!( "^{PRESLEDEK}(\"[^\n\"]*\")")).unwrap(), niz),
            (Regex::new(&format!(r"^{PRESLEDEK}(\d+\.\d+|\d{{1,3}}(_\d{{3}})+\.(\d{{3}}_)+\d{{1,3}}){ZADNJA_MEJA}")).unwrap(), real),
            (Regex::new(&format!(r"^{PRESLEDEK}(\d+|\d{{1,3}}(_\d{{3}})+){ZADNJA_MEJA}")).unwrap(), celo),
            (Regex::new(&format!(r"^{PRESLEDEK}([.,;:#\n(){{}}\[\]]|->)")).unwrap(), Ločilo),
            (Regex::new(&format!(r"^{PRESLEDEK}(?x)(
                        # pretvorba
                            kot |
                        # zamik
                            <<= | >>= | << | >> |
                        # primerjava
                            == | != | <= | >= | < | > |
                        # spreminjanje vrednosti
                            \*\*= | \|\|= | &&= | [+\-*/%|&^]= |
                        # Boolovi operatorji
                            \|\| | && | ! |
                        # aritmetika
                            \*\* | [+\-*/%] |
                        # binarna aritmetika
                            [|&^] |
                        # prirejanje
                            = |
                        # referenca
                            @
                        )")).unwrap(), Operator),
            (Regex::new(&format!(r"^{PRESLEDEK}([_\p{{Letter}}][\w\d]*){ZADNJA_MEJA}")).unwrap(), Ime),
            (Regex::new(&format!(r"^{PRESLEDEK}(\S*){ZADNJA_MEJA}")).unwrap(), Neznano),
        ];

        let mut tokeni: Vec<Token> = Vec::new();
        let mut vrstica = 1;
        let mut znak = 1;

        let mut i: usize = 0;

        while i < tekst.len() {
            match najdi_token(&regexi, &tekst[i..], vrstica, znak) {
                Some((token, dolžina)) => {
                    match token {
                        Neznano("", ..) => (),
                        _ => tokeni.push(token),
                    }
                    if let Ločilo("\n", ..) = token {
                        vrstica += 1;
                        znak = 1;
                    }
                    else {
                        // vzamemo chars().count() namesto len(),
                        // saj je važno število znakov in ne bajtov
                        znak += tekst[i..i+dolžina].chars().count();
                    }
                    i += dolžina;
                },
                None => (),
            };
        }

        tokeni
    }

}

fn najdi_token<'a>(regexi: &[(Regex, fn(&'a str, usize, usize) -> Token<'a>)], beseda: &'a str, vrstica: usize, znak: usize) -> Option<(Token<'a>, usize)> {
    let (regex, token) = &regexi[0];

    match regex.captures(beseda) {
        Some(skupine) => {
            // 1. skupina je zadetek
            let presledek = skupine.get(1).unwrap();
            let zadetek = skupine.get(2).unwrap();
            let velikost_presledka = presledek.as_str().chars().count();
            Some((token(zadetek.as_str(), vrstica, znak + velikost_presledka), zadetek.end()))
        },
        None =>
            if regexi.len() > 1 {
                najdi_token(&regexi[1..], beseda, vrstica, znak)
            }
            else {
                None
            },
    }
}

#[cfg(test)]
mod testi {
    use super::{Token::*, L::*, Tokenize, *};

    #[test]
    fn rezervirane_besede() {
        assert_eq!("naj".tokenize(), [Rezerviranka("naj", 1, 1)]);
        assert_eq!("čene".tokenize(), [Rezerviranka("čene", 1, 1)]);
        assert_eq!("če".tokenize(), [Rezerviranka("če", 1, 1)]);
        assert_eq!("dokler".tokenize(), [Rezerviranka("dokler", 1, 1)]);
        assert_eq!("za".tokenize(), [Rezerviranka("za", 1, 1)]);
        assert_eq!("funkcija".tokenize(), [Rezerviranka("funkcija", 1, 1)]);
        assert_eq!("vrni".tokenize(), [Rezerviranka("vrni", 1, 1)]);
        assert_eq!("prekini".tokenize(), [Rezerviranka("prekini", 1, 1)]);
    }

    #[test]
    fn literali() {
        assert_eq!("resnica".tokenize(), [Literal(Bool("resnica", 1, 1))]);
        assert_eq!("laž".tokenize(), [Literal(Bool("laž", 1, 1))]);

        assert_eq!("'a'".tokenize(), [Literal(Znak("'a'", 1, 1))]);
        assert_eq!("'đ'".tokenize(), [Literal(Znak("'đ'", 1, 1))]);
        assert_eq!(r"'\n'".tokenize(), [Literal(Znak(r"'\n'", 1, 1))]);
        assert_eq!(r"'\\'".tokenize(), [Literal(Znak(r"'\\'", 1, 1))]);
        assert_eq!(r"'\r'".tokenize(), [Literal(Znak(r"'\r'", 1, 1))]);
        assert_eq!(r"'\f'".tokenize(), [Neznano(r"'\f'", 1, 1)]);

        assert_eq!("\"\"".tokenize(), [Literal(Niz("\"\"", 1, 1))]);
        assert_eq!("\"niz\"".tokenize(), [Literal(Niz("\"niz\"", 1, 1))]);
        assert_eq!("\"3.14\"".tokenize(), [Literal(Niz("\"3.14\"", 1, 1))]);
        assert_eq!("\"{}\\n\"".tokenize(), [Literal(Niz("\"{}\\n\"", 1, 1))]);
        assert_eq!("\"{}\\n\" \"smola\"".tokenize(), [Literal(Niz("\"{}\\n\"", 1, 1)), Literal(Niz("\"smola\"", 1, 8))]);

        assert_eq!("0".tokenize(), [Literal(Celo("0", 1, 1))]);
        assert_eq!("13".tokenize(), [Literal(Celo("13", 1, 1))]);
        assert_eq!("1_000_000".tokenize(), [Literal(Celo("1_000_000", 1, 1))]);

        assert_eq!("0.5".tokenize(), [Literal(Real("0.5", 1, 1))]);
        assert_eq!("3.14".tokenize(), [Literal(Real("3.14", 1, 1))]);
    }

    #[test]
    fn ime() {
        assert_eq!("a".tokenize(), [Ime("a", 1, 1)]);
        assert_eq!("švajs".tokenize(), [Ime("švajs", 1, 1)]);
        assert_eq!("švajs  mašina".tokenize(), [Ime("švajs", 1, 1), Ime("mašina", 1, 8)]);
        assert_eq!("__groot__".tokenize(), [Ime("__groot__", 1, 1)]);
        assert_eq!("kamelskaTelewizje".tokenize(), [Ime("kamelskaTelewizje", 1, 1)]);
        assert_eq!("RabeljskoJezero123".tokenize(), [Ime("RabeljskoJezero123", 1, 1)]);
        assert_eq!("0cyka".tokenize(), [Neznano("0cyka", 1, 1)]);
    }

    #[test]
    fn operatorji() {
        assert_eq!("a kot real".tokenize(), [Ime("a", 1, 1), Operator("kot", 1, 3), Tip("real", 1, 7)]);

        assert_eq!("a<<=b".tokenize(), [Ime("a", 1, 1), Operator("<<=", 1, 2), Ime("b", 1, 5)]);
        assert_eq!("a<< b".tokenize(), [Ime("a", 1, 1), Operator("<<", 1, 2), Ime("b", 1, 5)]);
        assert_eq!("a>>=b".tokenize(), [Ime("a", 1, 1), Operator(">>=", 1, 2), Ime("b", 1, 5)]);
        assert_eq!("a >>b".tokenize(), [Ime("a", 1, 1), Operator(">>", 1, 3), Ime("b", 1, 5)]);

        assert_eq!("a==b".tokenize(), [Ime("a", 1, 1), Operator("==", 1, 2), Ime("b", 1, 4)]);
        assert_eq!("a!=b".tokenize(), [Ime("a", 1, 1), Operator("!=", 1, 2), Ime("b", 1, 4)]);
        assert_eq!("a<=b".tokenize(), [Ime("a", 1, 1), Operator("<=", 1, 2), Ime("b", 1, 4)]);
        assert_eq!("a>=b".tokenize(), [Ime("a", 1, 1), Operator(">=", 1, 2), Ime("b", 1, 4)]);
        assert_eq!("a<b".tokenize(), [Ime("a", 1, 1), Operator("<", 1, 2), Ime("b", 1, 3)]);
        assert_eq!("a>b".tokenize(), [Ime("a", 1, 1), Operator(">", 1, 2), Ime("b", 1, 3)]);

        assert_eq!("a+b".tokenize(), [Ime("a", 1, 1), Operator("+", 1, 2), Ime("b", 1, 3)]);
        assert_eq!("a-b".tokenize(), [Ime("a", 1, 1), Operator("-", 1, 2), Ime("b", 1, 3)]);
        assert_eq!("a*b".tokenize(), [Ime("a", 1, 1), Operator("*", 1, 2), Ime("b", 1, 3)]);
        assert_eq!("a/b".tokenize(), [Ime("a", 1, 1), Operator("/", 1, 2), Ime("b", 1, 3)]);
        assert_eq!("a%b".tokenize(), [Ime("a", 1, 1), Operator("%", 1, 2), Ime("b", 1, 3)]);

        assert_eq!("3+2".tokenize(), [Literal(Celo("3", 1, 1)), Operator("+", 1, 2), Literal(Celo("2", 1, 3))]);
        assert_eq!("3-2".tokenize(), [Literal(Celo("3", 1, 1)), Operator("-", 1, 2), Literal(Celo("2", 1, 3))]);
        assert_eq!("3*2".tokenize(), [Literal(Celo("3", 1, 1)), Operator("*", 1, 2), Literal(Celo("2", 1, 3))]);
        assert_eq!("3/2".tokenize(), [Literal(Celo("3", 1, 1)), Operator("/", 1, 2), Literal(Celo("2", 1, 3))]);
        assert_eq!("3%2".tokenize(), [Literal(Celo("3", 1, 1)), Operator("%", 1, 2), Literal(Celo("2", 1, 3))]);
    }

    #[test]
    fn ločila() {
        assert_eq!("a,b".tokenize(),  [Ime("a", 1, 1), Ločilo(",",  1, 2), Ime("b", 1, 3)]);
        assert_eq!("a;b".tokenize(),  [Ime("a", 1, 1), Ločilo(";",  1, 2), Ime("b", 1, 3)]);
        assert_eq!("a:b".tokenize(),  [Ime("a", 1, 1), Ločilo(":",  1, 2), Ime("b", 1, 3)]);
        assert_eq!("a#b".tokenize(),  [Ime("a", 1, 1), Ločilo("#",  1, 2), Ime("b", 1, 3)]);
        assert_eq!("a(b".tokenize(),  [Ime("a", 1, 1), Ločilo("(",  1, 2), Ime("b", 1, 3)]);
        assert_eq!("a)b".tokenize(),  [Ime("a", 1, 1), Ločilo(")",  1, 2), Ime("b", 1, 3)]);
        assert_eq!("a)b".tokenize(),  [Ime("a", 1, 1), Ločilo(")",  1, 2), Ime("b", 1, 3)]);
        assert_eq!("a{b".tokenize(),  [Ime("a", 1, 1), Ločilo("{",  1, 2), Ime("b", 1, 3)]);
        assert_eq!("a}b".tokenize(),  [Ime("a", 1, 1), Ločilo("}",  1, 2), Ime("b", 1, 3)]);
        assert_eq!("a[b".tokenize(),  [Ime("a", 1, 1), Ločilo("[",  1, 2), Ime("b", 1, 3)]);
        assert_eq!("a]b".tokenize(),  [Ime("a", 1, 1), Ločilo("]",  1, 2), Ime("b", 1, 3)]);
        assert_eq!("a\nb".tokenize(), [Ime("a", 1, 1), Ločilo("\n", 1, 2), Ime("b", 2, 1)]);
    }

    #[test]
    fn preprost() {
        assert_eq!("če čene".tokenize(), [Rezerviranka("če", 1, 1), Rezerviranka("čene", 1, 4)]);
    }

    #[test]
    fn napreden() {
        assert_eq!(
            "če\nresnica{dokler laž{natisni(\"nemogoče\")}}".tokenize(),
            [ Rezerviranka("če", 1, 1), Ločilo("\n", 1, 3), 
              bool("resnica", 2, 1), Ločilo("{", 2, 8), Rezerviranka("dokler", 2, 9), bool("laž", 2, 16),
              Ločilo("{", 2, 19), Ime("natisni", 2, 20), Ločilo("(", 2, 27), niz("\"nemogoče\"", 2, 28), Ločilo(")", 2, 38),
              Ločilo("}", 2, 39), Ločilo("}", 2, 40) ]
        );
    }

}