seqc/parser/token.rs
1//! Token type and low-level tokenization/escape/float helpers.
2
/// A lexical token together with the source position where it starts.
///
/// Both `line` and `column` are 0-indexed, which is how `tokenize` counts
/// them. User-facing messages convert to 1-indexed line numbers at the point
/// of display (see `annotate_error_with_line`).
#[derive(Debug, Clone)]
pub struct Token {
    /// Token text exactly as it was collected by the tokenizer; string
    /// literals keep their surrounding quotes and raw escape sequences.
    pub text: String,
    /// Line number (0-indexed for LSP compatibility)
    pub line: usize,
    /// Column number (0-indexed, counted in `char`s from the start of the line)
    pub column: usize,
}
12
13impl Token {
14 fn new(text: String, line: usize, column: usize) -> Self {
15 Token { text, line, column }
16 }
17}
18
19impl PartialEq<&str> for Token {
20 fn eq(&self, other: &&str) -> bool {
21 self.text == *other
22 }
23}
24
25impl PartialEq<str> for Token {
26 fn eq(&self, other: &str) -> bool {
27 self.text == other
28 }
29}
30
31pub(super) fn annotate_error_with_line(msg: String, tok: Option<&Token>) -> String {
32 if msg.starts_with("at line ") {
33 return msg;
34 }
35 let line = tok.map(|t| t.line).unwrap_or(0);
36 format!("at line {}: {}", line + 1, msg)
37}
38
39/// Check if a token looks like a float literal
40///
41/// Float literals contain either:
42/// - A decimal point: `3.14`, `.5`, `5.`
43/// - Scientific notation: `1e10`, `1E-5`, `1.5e3`
44///
45/// This check must happen BEFORE integer parsing to avoid
46/// parsing "5" in "5.0" as an integer.
47pub(super) fn is_float_literal(token: &str) -> bool {
48 // Skip leading minus sign for negative numbers
49 let s = token.strip_prefix('-').unwrap_or(token);
50
51 // Must have at least one digit
52 if s.is_empty() {
53 return false;
54 }
55
56 // Check for decimal point or scientific notation
57 s.contains('.') || s.contains('e') || s.contains('E')
58}
59
60/// Process escape sequences in a string literal
61///
62/// Supported escape sequences:
63/// - `\"` -> `"` (quote)
64/// - `\\` -> `\` (backslash)
65/// - `\n` -> newline
66/// - `\r` -> carriage return
67/// - `\t` -> tab
68/// - `\xNN` -> Unicode code point U+00NN (hex value 00-FF)
69///
70/// # Note on `\xNN` encoding
71///
72/// The `\xNN` escape creates a Unicode code point U+00NN, not a raw byte.
73/// For values 0x00-0x7F (ASCII), this maps directly to the byte value.
74/// For values 0x80-0xFF (Latin-1 Supplement), the character is stored as
75/// a multi-byte UTF-8 sequence. For example:
76/// - `\x41` -> 'A' (1 byte in UTF-8)
77/// - `\x1b` -> ESC (1 byte in UTF-8, used for ANSI terminal codes)
78/// - `\xFF` -> 'ΓΏ' (U+00FF, 2 bytes in UTF-8: 0xC3 0xBF)
79///
80/// This matches Python 3 and Rust string behavior. For terminal ANSI codes,
81/// which are the primary use case, all values are in the ASCII range.
82///
83/// # Errors
84/// Returns error if an unknown escape sequence is encountered
85pub(super) fn unescape_string(s: &str) -> Result<String, String> {
86 let mut result = String::new();
87 let mut chars = s.chars();
88
89 while let Some(ch) = chars.next() {
90 if ch == '\\' {
91 match chars.next() {
92 Some('"') => result.push('"'),
93 Some('\\') => result.push('\\'),
94 Some('n') => result.push('\n'),
95 Some('r') => result.push('\r'),
96 Some('t') => result.push('\t'),
97 Some('x') => {
98 // Hex escape: \xNN
99 let hex1 = chars.next().ok_or_else(|| {
100 "Incomplete hex escape sequence '\\x' - expected 2 hex digits".to_string()
101 })?;
102 let hex2 = chars.next().ok_or_else(|| {
103 format!(
104 "Incomplete hex escape sequence '\\x{}' - expected 2 hex digits",
105 hex1
106 )
107 })?;
108
109 let hex_str: String = [hex1, hex2].iter().collect();
110 let byte_val = u8::from_str_radix(&hex_str, 16).map_err(|_| {
111 format!(
112 "Invalid hex escape sequence '\\x{}' - expected 2 hex digits (00-FF)",
113 hex_str
114 )
115 })?;
116
117 result.push(byte_val as char);
118 }
119 Some(c) => {
120 return Err(format!(
121 "Unknown escape sequence '\\{}' in string literal. \
122 Supported: \\\" \\\\ \\n \\r \\t \\xNN",
123 c
124 ));
125 }
126 None => {
127 return Err("String ends with incomplete escape sequence '\\'".to_string());
128 }
129 }
130 } else {
131 result.push(ch);
132 }
133 }
134
135 Ok(result)
136}
137
138pub(super) fn tokenize(source: &str) -> Vec<Token> {
139 let mut tokens = Vec::new();
140 let mut current = String::new();
141 let mut current_start_line = 0;
142 let mut current_start_col = 0;
143 let mut in_string = false;
144 let mut prev_was_backslash = false;
145
146 // Track current position (0-indexed)
147 let mut line = 0;
148 let mut col = 0;
149
150 for ch in source.chars() {
151 if in_string {
152 current.push(ch);
153 if ch == '"' && !prev_was_backslash {
154 // Unescaped quote ends the string
155 in_string = false;
156 tokens.push(Token::new(
157 current.clone(),
158 current_start_line,
159 current_start_col,
160 ));
161 current.clear();
162 prev_was_backslash = false;
163 } else if ch == '\\' && !prev_was_backslash {
164 // Start of escape sequence
165 prev_was_backslash = true;
166 } else {
167 // Regular character or escaped character
168 prev_was_backslash = false;
169 }
170 // Track newlines inside strings
171 if ch == '\n' {
172 line += 1;
173 col = 0;
174 } else {
175 col += 1;
176 }
177 } else if ch == '"' {
178 if !current.is_empty() {
179 tokens.push(Token::new(
180 current.clone(),
181 current_start_line,
182 current_start_col,
183 ));
184 current.clear();
185 }
186 in_string = true;
187 current_start_line = line;
188 current_start_col = col;
189 current.push(ch);
190 prev_was_backslash = false;
191 col += 1;
192 } else if ch.is_whitespace() {
193 if !current.is_empty() {
194 tokens.push(Token::new(
195 current.clone(),
196 current_start_line,
197 current_start_col,
198 ));
199 current.clear();
200 }
201 // Preserve newlines for comment handling
202 if ch == '\n' {
203 tokens.push(Token::new("\n".to_string(), line, col));
204 line += 1;
205 col = 0;
206 } else {
207 col += 1;
208 }
209 } else if "():;[]{},#".contains(ch) {
210 // `#` is split out so that `#comment` (no space) tokenizes as
211 // `#` + `comment` and the parser's `skip_comments` consumes
212 // it as a line comment, matching Python/Bash/Ruby behaviour.
213 // Without this split, `#comment` would accumulate into a
214 // single identifier-shaped token and reach the parser as an
215 // undefined word call.
216 if !current.is_empty() {
217 tokens.push(Token::new(
218 current.clone(),
219 current_start_line,
220 current_start_col,
221 ));
222 current.clear();
223 }
224 tokens.push(Token::new(ch.to_string(), line, col));
225 col += 1;
226 } else {
227 if current.is_empty() {
228 current_start_line = line;
229 current_start_col = col;
230 }
231 current.push(ch);
232 col += 1;
233 }
234 }
235
236 // Check for unclosed string literal
237 if in_string {
238 // Return error by adding a special error token
239 // The parser will handle this as a parse error
240 tokens.push(Token::new(
241 "<<<UNCLOSED_STRING>>>".to_string(),
242 current_start_line,
243 current_start_col,
244 ));
245 } else if !current.is_empty() {
246 tokens.push(Token::new(current, current_start_line, current_start_col));
247 }
248
249 tokens
250}