seqc/parser/
token.rs

1//! Token type and low-level tokenization/escape/float helpers.
2
/// A token together with the source position where it starts.
///
/// Both `line` and `column` are stored 0-indexed; code that shows a
/// position to the user is responsible for converting to 1-based numbering.
#[derive(Debug, Clone)]
pub struct Token {
    /// The token's text exactly as it was cut from the source.
    pub text: String,
    /// Line number (0-indexed for LSP compatibility)
    pub line: usize,
    /// Column number (0-indexed)
    pub column: usize,
}
12
13impl Token {
14    fn new(text: String, line: usize, column: usize) -> Self {
15        Token { text, line, column }
16    }
17}
18
19impl PartialEq<&str> for Token {
20    fn eq(&self, other: &&str) -> bool {
21        self.text == *other
22    }
23}
24
25impl PartialEq<str> for Token {
26    fn eq(&self, other: &str) -> bool {
27        self.text == other
28    }
29}
30
31pub(super) fn annotate_error_with_line(msg: String, tok: Option<&Token>) -> String {
32    if msg.starts_with("at line ") {
33        return msg;
34    }
35    let line = tok.map(|t| t.line).unwrap_or(0);
36    format!("at line {}: {}", line + 1, msg)
37}
38
39/// Check if a token looks like a float literal
40///
41/// Float literals contain either:
42/// - A decimal point: `3.14`, `.5`, `5.`
43/// - Scientific notation: `1e10`, `1E-5`, `1.5e3`
44///
45/// This check must happen BEFORE integer parsing to avoid
46/// parsing "5" in "5.0" as an integer.
47pub(super) fn is_float_literal(token: &str) -> bool {
48    // Skip leading minus sign for negative numbers
49    let s = token.strip_prefix('-').unwrap_or(token);
50
51    // Must have at least one digit
52    if s.is_empty() {
53        return false;
54    }
55
56    // Check for decimal point or scientific notation
57    s.contains('.') || s.contains('e') || s.contains('E')
58}
59
60/// Process escape sequences in a string literal
61///
62/// Supported escape sequences:
63/// - `\"` -> `"`  (quote)
64/// - `\\` -> `\`  (backslash)
65/// - `\n` -> newline
66/// - `\r` -> carriage return
67/// - `\t` -> tab
68/// - `\xNN` -> Unicode code point U+00NN (hex value 00-FF)
69///
70/// # Note on `\xNN` encoding
71///
72/// The `\xNN` escape creates a Unicode code point U+00NN, not a raw byte.
73/// For values 0x00-0x7F (ASCII), this maps directly to the byte value.
74/// For values 0x80-0xFF (Latin-1 Supplement), the character is stored as
75/// a multi-byte UTF-8 sequence. For example:
76/// - `\x41` -> 'A' (1 byte in UTF-8)
77/// - `\x1b` -> ESC (1 byte in UTF-8, used for ANSI terminal codes)
78/// - `\xFF` -> 'ÿ' (U+00FF, 2 bytes in UTF-8: 0xC3 0xBF)
79///
80/// This matches Python 3 and Rust string behavior. For terminal ANSI codes,
81/// which are the primary use case, all values are in the ASCII range.
82///
83/// # Errors
84/// Returns error if an unknown escape sequence is encountered
85pub(super) fn unescape_string(s: &str) -> Result<String, String> {
86    let mut result = String::new();
87    let mut chars = s.chars();
88
89    while let Some(ch) = chars.next() {
90        if ch == '\\' {
91            match chars.next() {
92                Some('"') => result.push('"'),
93                Some('\\') => result.push('\\'),
94                Some('n') => result.push('\n'),
95                Some('r') => result.push('\r'),
96                Some('t') => result.push('\t'),
97                Some('x') => {
98                    // Hex escape: \xNN
99                    let hex1 = chars.next().ok_or_else(|| {
100                        "Incomplete hex escape sequence '\\x' - expected 2 hex digits".to_string()
101                    })?;
102                    let hex2 = chars.next().ok_or_else(|| {
103                        format!(
104                            "Incomplete hex escape sequence '\\x{}' - expected 2 hex digits",
105                            hex1
106                        )
107                    })?;
108
109                    let hex_str: String = [hex1, hex2].iter().collect();
110                    let byte_val = u8::from_str_radix(&hex_str, 16).map_err(|_| {
111                        format!(
112                            "Invalid hex escape sequence '\\x{}' - expected 2 hex digits (00-FF)",
113                            hex_str
114                        )
115                    })?;
116
117                    result.push(byte_val as char);
118                }
119                Some(c) => {
120                    return Err(format!(
121                        "Unknown escape sequence '\\{}' in string literal. \
122                         Supported: \\\" \\\\ \\n \\r \\t \\xNN",
123                        c
124                    ));
125                }
126                None => {
127                    return Err("String ends with incomplete escape sequence '\\'".to_string());
128                }
129            }
130        } else {
131            result.push(ch);
132        }
133    }
134
135    Ok(result)
136}
137
138pub(super) fn tokenize(source: &str) -> Vec<Token> {
139    let mut tokens = Vec::new();
140    let mut current = String::new();
141    let mut current_start_line = 0;
142    let mut current_start_col = 0;
143    let mut in_string = false;
144    let mut prev_was_backslash = false;
145
146    // Track current position (0-indexed)
147    let mut line = 0;
148    let mut col = 0;
149
150    for ch in source.chars() {
151        if in_string {
152            current.push(ch);
153            if ch == '"' && !prev_was_backslash {
154                // Unescaped quote ends the string
155                in_string = false;
156                tokens.push(Token::new(
157                    current.clone(),
158                    current_start_line,
159                    current_start_col,
160                ));
161                current.clear();
162                prev_was_backslash = false;
163            } else if ch == '\\' && !prev_was_backslash {
164                // Start of escape sequence
165                prev_was_backslash = true;
166            } else {
167                // Regular character or escaped character
168                prev_was_backslash = false;
169            }
170            // Track newlines inside strings
171            if ch == '\n' {
172                line += 1;
173                col = 0;
174            } else {
175                col += 1;
176            }
177        } else if ch == '"' {
178            if !current.is_empty() {
179                tokens.push(Token::new(
180                    current.clone(),
181                    current_start_line,
182                    current_start_col,
183                ));
184                current.clear();
185            }
186            in_string = true;
187            current_start_line = line;
188            current_start_col = col;
189            current.push(ch);
190            prev_was_backslash = false;
191            col += 1;
192        } else if ch.is_whitespace() {
193            if !current.is_empty() {
194                tokens.push(Token::new(
195                    current.clone(),
196                    current_start_line,
197                    current_start_col,
198                ));
199                current.clear();
200            }
201            // Preserve newlines for comment handling
202            if ch == '\n' {
203                tokens.push(Token::new("\n".to_string(), line, col));
204                line += 1;
205                col = 0;
206            } else {
207                col += 1;
208            }
209        } else if "():;[]{},#".contains(ch) {
210            // `#` is split out so that `#comment` (no space) tokenizes as
211            // `#` + `comment` and the parser's `skip_comments` consumes
212            // it as a line comment, matching Python/Bash/Ruby behaviour.
213            // Without this split, `#comment` would accumulate into a
214            // single identifier-shaped token and reach the parser as an
215            // undefined word call.
216            if !current.is_empty() {
217                tokens.push(Token::new(
218                    current.clone(),
219                    current_start_line,
220                    current_start_col,
221                ));
222                current.clear();
223            }
224            tokens.push(Token::new(ch.to_string(), line, col));
225            col += 1;
226        } else {
227            if current.is_empty() {
228                current_start_line = line;
229                current_start_col = col;
230            }
231            current.push(ch);
232            col += 1;
233        }
234    }
235
236    // Check for unclosed string literal
237    if in_string {
238        // Return error by adding a special error token
239        // The parser will handle this as a parse error
240        tokens.push(Token::new(
241            "<<<UNCLOSED_STRING>>>".to_string(),
242            current_start_line,
243            current_start_col,
244        ));
245    } else if !current.is_empty() {
246        tokens.push(Token::new(current, current_start_line, current_start_col));
247    }
248
249    tokens
250}
seqc/parser/token.rs

seqc/parser/
token.rs