// rs_args/lexing/mod.rs

1use self::errors::LexicalError;
2
3pub mod errors;
4
5/// A token is a single unit of a command, such as a word, number or symbol.
6/// This is used to convert single characters in a more machine-readable format.
7/// For example, the string "ls -l" would be converted to a list of tokens like
8/// so: ["ls", "-l"].
9///
10/// The token kind is a simple enum, which is used to distinguish between the
11/// different types of tokens. The token value is a string, which contains the
12/// actual value of the token. The token position is a simple integer, which
13/// contains the position of the token in the input string. This is used for
14/// error reporting.
15///
16#[derive(Debug)]
17#[allow(dead_code)] // TODO: Remove one the parser is implemented
18pub struct Token {
19    kind: TokenKind,
20    value: String,
21}
22
23impl Token {
24    pub fn get_kind(&self) -> &TokenKind {
25        &self.kind
26    }
27    pub fn get_value(&self) -> &String {
28        &self.value
29    }
30    pub fn unknown(c: &char) -> Token {
31        Token { kind: TokenKind::Unknown, value: c.to_string() }
32    }
33}
34
/// The category of a [`Token`].
///
/// As we are only implementing a very simple shell, we only need a few token
/// kinds. These are the token kinds we are going to use:
/// 1. Identifier: A word, such as "ls" or "echo".
/// 2. String: A string, such as "Hello, world!".
/// 3. Number: A number, such as "123".
/// 4. Equals: The equals sign, used for key-value pairs.
/// 5. Dash: The dash sign, used for flags.
/// 6. Unknown: A character that does not start any of the kinds above.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
    /// A word, such as `ls` or `echo`.
    Identifier,
    /// A double-quoted string, such as `"Hello, world!"`.
    String,
    /// A number, such as `123`.
    Number,
    /// The equals sign `=`, used for key-value pairs.
    Equals,
    /// The dash sign `-`, used for flags.
    Dash,
    /// A character that does not start any other kind of token.
    Unknown,
}
51
52pub fn tokenise(input: String) -> Result<Vec<Token>, LexicalError> {
53    let mut tokens: Vec<Token> = Vec::new();
54    let mut chars = input.chars().collect::<Vec<char>>();
55
56    while let Some(c) = chars.first() {
57        let result = match c {
58            '-' => Ok(Token { kind: TokenKind::Dash, value: chars.remove(0).to_string() }),
59            '=' => Ok(Token { kind: TokenKind::Equals, value: chars.remove(0).to_string() }),
60            ' ' => {
61                chars.remove(0);
62                continue;
63            }
64            _ => {
65                if c.is_numeric() {
66                    tokenise_number(&mut chars)
67                } else if c.is_alphabetic() {
68                    Ok(tokenise_identifier(&mut chars))
69                } else if *c == '"' {
70                    Ok(tokenise_string(&mut chars))
71                } else {
72                    Ok(Token::unknown(c))
73                }
74            }
75        };
76
77        if let Ok(value) = &result {
78            if let TokenKind::Unknown = value.get_kind() {
79                return Err(LexicalError::UnknownToken(String::from(value.get_value())));
80            }
81        } else {
82            return Err(result.err().unwrap());
83        }
84
85        tokens.push(result.unwrap());
86    }
87    Ok(tokens)
88}
89
90fn tokenise_identifier(chars: &mut Vec<char>) -> Token {
91    let mut value = String::new();
92
93    while let Some(c) = chars.first() {
94        if c.is_alphanumeric() {
95            value.push(chars.remove(0));
96        } else {
97            break;
98        }
99    }
100
101    Token { kind: TokenKind::Identifier, value }
102}
103fn tokenise_number(chars: &mut Vec<char>) -> Result<Token, LexicalError> {
104    let mut value = String::new();
105
106    let mut decimals = 0;
107    while let Some(c) = chars.first() {
108        if c.is_numeric() || *c == '.' {
109            if *c == '.' {
110                decimals += 1;
111            }
112            value.push(chars.remove(0));
113        } else {
114            break;
115        }
116    }
117
118    if decimals > 1 {
119        return Err(LexicalError::InvalidDecimalPoint(decimals));
120    }
121    Ok(Token { kind: TokenKind::Number, value })
122}
123
124fn tokenise_string(chars: &mut Vec<char>) -> Token {
125    chars.remove(0);
126    let mut value = String::new();
127
128    while let Some(c) = chars.first() {
129        if *c == '"' {
130            chars.remove(0);
131            break;
132        } else {
133            value.push(chars.remove(0));
134        }
135    }
136
137    Token { kind: TokenKind::String, value }
138}
139
140#[cfg(test)]
141pub mod tests;