seqc/parser/token.rs
1//! Token type and low-level tokenization/escape/float helpers.
2
/// A lexical token together with the source position where it starts.
///
/// Both `line` and `column` are 0-indexed (the tokenizer starts counting at
/// zero); code that renders a position for humans adds 1 to the line number.
#[derive(Debug, Clone)]
pub struct Token {
    /// The token's text exactly as it was collected by the tokenizer.
    pub text: String,
    /// Line number (0-indexed for LSP compatibility)
    pub line: usize,
    /// Column number (0-indexed)
    pub column: usize,
}
12
13impl Token {
14 fn new(text: String, line: usize, column: usize) -> Self {
15 Token { text, line, column }
16 }
17}
18
19impl PartialEq<&str> for Token {
20 fn eq(&self, other: &&str) -> bool {
21 self.text == *other
22 }
23}
24
25impl PartialEq<str> for Token {
26 fn eq(&self, other: &str) -> bool {
27 self.text == other
28 }
29}
30
31pub(super) fn annotate_error_with_line(msg: String, tok: Option<&Token>) -> String {
32 if msg.starts_with("at line ") {
33 return msg;
34 }
35 let line = tok.map(|t| t.line).unwrap_or(0);
36 format!("at line {}: {}", line + 1, msg)
37}
38
39/// Check if a token looks like a float literal
40///
41/// Float literals contain either:
42/// - A decimal point: `3.14`, `.5`, `5.`
43/// - Scientific notation: `1e10`, `1E-5`, `1.5e3`
44///
45/// This check must happen BEFORE integer parsing to avoid
46/// parsing "5" in "5.0" as an integer.
47pub(super) fn is_float_literal(token: &str) -> bool {
48 // Skip leading minus sign for negative numbers
49 let s = token.strip_prefix('-').unwrap_or(token);
50
51 // Must have at least one digit
52 if s.is_empty() {
53 return false;
54 }
55
56 // Check for decimal point or scientific notation
57 s.contains('.') || s.contains('e') || s.contains('E')
58}
59
60/// Process escape sequences in a string literal, returning the raw byte
61/// payload. Seq strings are byte-clean — `\xNN` produces the literal byte
62/// `0xNN`, not the UTF-8 encoding of the codepoint U+00NN.
63///
64/// Supported escape sequences:
65/// - `\"` -> `"` (quote)
66/// - `\\` -> `\` (backslash)
67/// - `\n` -> newline
68/// - `\r` -> carriage return
69/// - `\t` -> tab
70/// - `\xNN` -> the single byte `0xNN` (00-FF)
71///
72/// # `\xNN` byte semantics
73///
74/// `\xNN` is a *byte*, not a codepoint:
75/// - `\x41` -> `0x41` ('A')
76/// - `\x1b` -> `0x1B` (ESC, for ANSI terminal codes)
77/// - `\xDC` -> `0xDC` (one byte; not the 2-byte UTF-8 of U+00DC)
78/// - `\x00` -> `0x00` (one NUL byte; embedded NULs are legal)
79///
80/// Non-escape characters in the source are copied to the output as their
81/// UTF-8 byte sequence — so `"héllo"` is still 6 UTF-8 bytes. The change
82/// is only that `\xNN` no longer round-trips through `char` (which it
83/// did before, silently producing 2-byte UTF-8 for high-byte escapes).
84///
85/// This makes byte-clean binary protocol literals (OSC alignment NULs,
86/// raw IEEE-754 byte patterns, magic-number headers) expressible in
87/// Seq source.
88///
89/// # Errors
90/// Returns error if an unknown escape sequence is encountered.
91pub(super) fn unescape_string(s: &str) -> Result<Vec<u8>, String> {
92 let mut result: Vec<u8> = Vec::with_capacity(s.len());
93 let mut chars = s.chars();
94
95 while let Some(ch) = chars.next() {
96 if ch == '\\' {
97 match chars.next() {
98 Some('"') => result.push(b'"'),
99 Some('\\') => result.push(b'\\'),
100 Some('n') => result.push(b'\n'),
101 Some('r') => result.push(b'\r'),
102 Some('t') => result.push(b'\t'),
103 Some('x') => {
104 // Hex escape: \xNN — emit the literal byte 0xNN.
105 let hex1 = chars.next().ok_or_else(|| {
106 "Incomplete hex escape sequence '\\x' - expected 2 hex digits".to_string()
107 })?;
108 let hex2 = chars.next().ok_or_else(|| {
109 format!(
110 "Incomplete hex escape sequence '\\x{}' - expected 2 hex digits",
111 hex1
112 )
113 })?;
114
115 let hex_str: String = [hex1, hex2].iter().collect();
116 let byte_val = u8::from_str_radix(&hex_str, 16).map_err(|_| {
117 format!(
118 "Invalid hex escape sequence '\\x{}' - expected 2 hex digits (00-FF)",
119 hex_str
120 )
121 })?;
122
123 result.push(byte_val);
124 }
125 Some(c) => {
126 return Err(format!(
127 "Unknown escape sequence '\\{}' in string literal. \
128 Supported: \\\" \\\\ \\n \\r \\t \\xNN",
129 c
130 ));
131 }
132 None => {
133 return Err("String ends with incomplete escape sequence '\\'".to_string());
134 }
135 }
136 } else {
137 // Source-level char: emit its UTF-8 bytes verbatim.
138 let mut buf = [0u8; 4];
139 result.extend_from_slice(ch.encode_utf8(&mut buf).as_bytes());
140 }
141 }
142
143 Ok(result)
144}
145
146pub(super) fn tokenize(source: &str) -> Vec<Token> {
147 let mut tokens = Vec::new();
148 let mut current = String::new();
149 let mut current_start_line = 0;
150 let mut current_start_col = 0;
151 let mut in_string = false;
152 let mut prev_was_backslash = false;
153
154 // Track current position (0-indexed)
155 let mut line = 0;
156 let mut col = 0;
157
158 for ch in source.chars() {
159 if in_string {
160 current.push(ch);
161 if ch == '"' && !prev_was_backslash {
162 // Unescaped quote ends the string
163 in_string = false;
164 tokens.push(Token::new(
165 current.clone(),
166 current_start_line,
167 current_start_col,
168 ));
169 current.clear();
170 prev_was_backslash = false;
171 } else if ch == '\\' && !prev_was_backslash {
172 // Start of escape sequence
173 prev_was_backslash = true;
174 } else {
175 // Regular character or escaped character
176 prev_was_backslash = false;
177 }
178 // Track newlines inside strings
179 if ch == '\n' {
180 line += 1;
181 col = 0;
182 } else {
183 col += 1;
184 }
185 } else if ch == '"' {
186 if !current.is_empty() {
187 tokens.push(Token::new(
188 current.clone(),
189 current_start_line,
190 current_start_col,
191 ));
192 current.clear();
193 }
194 in_string = true;
195 current_start_line = line;
196 current_start_col = col;
197 current.push(ch);
198 prev_was_backslash = false;
199 col += 1;
200 } else if ch.is_whitespace() {
201 if !current.is_empty() {
202 tokens.push(Token::new(
203 current.clone(),
204 current_start_line,
205 current_start_col,
206 ));
207 current.clear();
208 }
209 // Preserve newlines for comment handling
210 if ch == '\n' {
211 tokens.push(Token::new("\n".to_string(), line, col));
212 line += 1;
213 col = 0;
214 } else {
215 col += 1;
216 }
217 } else if "():;[]{},#".contains(ch) {
218 // `#` is split out so that `#comment` (no space) tokenizes as
219 // `#` + `comment` and the parser's `skip_comments` consumes
220 // it as a line comment, matching Python/Bash/Ruby behaviour.
221 // Without this split, `#comment` would accumulate into a
222 // single identifier-shaped token and reach the parser as an
223 // undefined word call.
224 if !current.is_empty() {
225 tokens.push(Token::new(
226 current.clone(),
227 current_start_line,
228 current_start_col,
229 ));
230 current.clear();
231 }
232 tokens.push(Token::new(ch.to_string(), line, col));
233 col += 1;
234 } else {
235 if current.is_empty() {
236 current_start_line = line;
237 current_start_col = col;
238 }
239 current.push(ch);
240 col += 1;
241 }
242 }
243
244 // Check for unclosed string literal
245 if in_string {
246 // Return error by adding a special error token
247 // The parser will handle this as a parse error
248 tokens.push(Token::new(
249 "<<<UNCLOSED_STRING>>>".to_string(),
250 current_start_line,
251 current_start_col,
252 ));
253 } else if !current.is_empty() {
254 tokens.push(Token::new(current, current_start_line, current_start_col));
255 }
256
257 tokens
258}