Skip to main content

seqc/parser/
token.rs

1//! Token type and low-level tokenization/escape/float helpers.
2
/// A lexical token together with the source position where it starts.
///
/// Both coordinates are stored 0-indexed; user-facing messages add 1 to the
/// line when displaying it (see `annotate_error_with_line`).
#[derive(Debug, Clone)]
pub struct Token {
    /// The token's text exactly as it appeared in the source. String
    /// literals keep their surrounding quotes and raw escape sequences.
    pub text: String,
    /// Line number (0-indexed for LSP compatibility)
    pub line: usize,
    /// Column number (0-indexed)
    pub column: usize,
}

impl Token {
    /// Build a token from its text and 0-indexed start position.
    fn new(text: String, line: usize, column: usize) -> Self {
        Self { text, line, column }
    }
}
18
19impl PartialEq<&str> for Token {
20    fn eq(&self, other: &&str) -> bool {
21        self.text == *other
22    }
23}
24
25impl PartialEq<str> for Token {
26    fn eq(&self, other: &str) -> bool {
27        self.text == other
28    }
29}
30
31pub(super) fn annotate_error_with_line(msg: String, tok: Option<&Token>) -> String {
32    if msg.starts_with("at line ") {
33        return msg;
34    }
35    let line = tok.map(|t| t.line).unwrap_or(0);
36    format!("at line {}: {}", line + 1, msg)
37}
38
39/// Check if a token looks like a float literal
40///
41/// Float literals contain either:
42/// - A decimal point: `3.14`, `.5`, `5.`
43/// - Scientific notation: `1e10`, `1E-5`, `1.5e3`
44///
45/// This check must happen BEFORE integer parsing to avoid
46/// parsing "5" in "5.0" as an integer.
47pub(super) fn is_float_literal(token: &str) -> bool {
48    // Skip leading minus sign for negative numbers
49    let s = token.strip_prefix('-').unwrap_or(token);
50
51    // Must have at least one digit
52    if s.is_empty() {
53        return false;
54    }
55
56    // Check for decimal point or scientific notation
57    s.contains('.') || s.contains('e') || s.contains('E')
58}
59
60/// Process escape sequences in a string literal, returning the raw byte
61/// payload. Seq strings are byte-clean — `\xNN` produces the literal byte
62/// `0xNN`, not the UTF-8 encoding of the codepoint U+00NN.
63///
64/// Supported escape sequences:
65/// - `\"` -> `"`  (quote)
66/// - `\\` -> `\`  (backslash)
67/// - `\n` -> newline
68/// - `\r` -> carriage return
69/// - `\t` -> tab
70/// - `\xNN` -> the single byte `0xNN` (00-FF)
71///
72/// # `\xNN` byte semantics
73///
74/// `\xNN` is a *byte*, not a codepoint:
75/// - `\x41` -> `0x41` ('A')
76/// - `\x1b` -> `0x1B` (ESC, for ANSI terminal codes)
77/// - `\xDC` -> `0xDC` (one byte; not the 2-byte UTF-8 of U+00DC)
78/// - `\x00` -> `0x00` (one NUL byte; embedded NULs are legal)
79///
80/// Non-escape characters in the source are copied to the output as their
81/// UTF-8 byte sequence — so `"héllo"` is still 6 UTF-8 bytes. The change
82/// is only that `\xNN` no longer round-trips through `char` (which it
83/// did before, silently producing 2-byte UTF-8 for high-byte escapes).
84///
85/// This makes byte-clean binary protocol literals (OSC alignment NULs,
86/// raw IEEE-754 byte patterns, magic-number headers) expressible in
87/// Seq source.
88///
89/// # Errors
90/// Returns error if an unknown escape sequence is encountered.
91pub(super) fn unescape_string(s: &str) -> Result<Vec<u8>, String> {
92    let mut result: Vec<u8> = Vec::with_capacity(s.len());
93    let mut chars = s.chars();
94
95    while let Some(ch) = chars.next() {
96        if ch == '\\' {
97            match chars.next() {
98                Some('"') => result.push(b'"'),
99                Some('\\') => result.push(b'\\'),
100                Some('n') => result.push(b'\n'),
101                Some('r') => result.push(b'\r'),
102                Some('t') => result.push(b'\t'),
103                Some('x') => {
104                    // Hex escape: \xNN — emit the literal byte 0xNN.
105                    let hex1 = chars.next().ok_or_else(|| {
106                        "Incomplete hex escape sequence '\\x' - expected 2 hex digits".to_string()
107                    })?;
108                    let hex2 = chars.next().ok_or_else(|| {
109                        format!(
110                            "Incomplete hex escape sequence '\\x{}' - expected 2 hex digits",
111                            hex1
112                        )
113                    })?;
114
115                    let hex_str: String = [hex1, hex2].iter().collect();
116                    let byte_val = u8::from_str_radix(&hex_str, 16).map_err(|_| {
117                        format!(
118                            "Invalid hex escape sequence '\\x{}' - expected 2 hex digits (00-FF)",
119                            hex_str
120                        )
121                    })?;
122
123                    result.push(byte_val);
124                }
125                Some(c) => {
126                    return Err(format!(
127                        "Unknown escape sequence '\\{}' in string literal. \
128                         Supported: \\\" \\\\ \\n \\r \\t \\xNN",
129                        c
130                    ));
131                }
132                None => {
133                    return Err("String ends with incomplete escape sequence '\\'".to_string());
134                }
135            }
136        } else {
137            // Source-level char: emit its UTF-8 bytes verbatim.
138            let mut buf = [0u8; 4];
139            result.extend_from_slice(ch.encode_utf8(&mut buf).as_bytes());
140        }
141    }
142
143    Ok(result)
144}
145
146pub(super) fn tokenize(source: &str) -> Vec<Token> {
147    let mut tokens = Vec::new();
148    let mut current = String::new();
149    let mut current_start_line = 0;
150    let mut current_start_col = 0;
151    let mut in_string = false;
152    let mut prev_was_backslash = false;
153
154    // Track current position (0-indexed)
155    let mut line = 0;
156    let mut col = 0;
157
158    for ch in source.chars() {
159        if in_string {
160            current.push(ch);
161            if ch == '"' && !prev_was_backslash {
162                // Unescaped quote ends the string
163                in_string = false;
164                tokens.push(Token::new(
165                    current.clone(),
166                    current_start_line,
167                    current_start_col,
168                ));
169                current.clear();
170                prev_was_backslash = false;
171            } else if ch == '\\' && !prev_was_backslash {
172                // Start of escape sequence
173                prev_was_backslash = true;
174            } else {
175                // Regular character or escaped character
176                prev_was_backslash = false;
177            }
178            // Track newlines inside strings
179            if ch == '\n' {
180                line += 1;
181                col = 0;
182            } else {
183                col += 1;
184            }
185        } else if ch == '"' {
186            if !current.is_empty() {
187                tokens.push(Token::new(
188                    current.clone(),
189                    current_start_line,
190                    current_start_col,
191                ));
192                current.clear();
193            }
194            in_string = true;
195            current_start_line = line;
196            current_start_col = col;
197            current.push(ch);
198            prev_was_backslash = false;
199            col += 1;
200        } else if ch.is_whitespace() {
201            if !current.is_empty() {
202                tokens.push(Token::new(
203                    current.clone(),
204                    current_start_line,
205                    current_start_col,
206                ));
207                current.clear();
208            }
209            // Preserve newlines for comment handling
210            if ch == '\n' {
211                tokens.push(Token::new("\n".to_string(), line, col));
212                line += 1;
213                col = 0;
214            } else {
215                col += 1;
216            }
217        } else if "():;[]{},#".contains(ch) {
218            // `#` is split out so that `#comment` (no space) tokenizes as
219            // `#` + `comment` and the parser's `skip_comments` consumes
220            // it as a line comment, matching Python/Bash/Ruby behaviour.
221            // Without this split, `#comment` would accumulate into a
222            // single identifier-shaped token and reach the parser as an
223            // undefined word call.
224            if !current.is_empty() {
225                tokens.push(Token::new(
226                    current.clone(),
227                    current_start_line,
228                    current_start_col,
229                ));
230                current.clear();
231            }
232            tokens.push(Token::new(ch.to_string(), line, col));
233            col += 1;
234        } else {
235            if current.is_empty() {
236                current_start_line = line;
237                current_start_col = col;
238            }
239            current.push(ch);
240            col += 1;
241        }
242    }
243
244    // Check for unclosed string literal
245    if in_string {
246        // Return error by adding a special error token
247        // The parser will handle this as a parse error
248        tokens.push(Token::new(
249            "<<<UNCLOSED_STRING>>>".to_string(),
250            current_start_line,
251            current_start_col,
252        ));
253    } else if !current.is_empty() {
254        tokens.push(Token::new(current, current_start_line, current_start_col));
255    }
256
257    tokens
258}