1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
//! Source-level desugaring for `rust { ... }` FFI blocks.
//!
//! Why pre-lex and not a new `StmtKind`? The parity roadmap pins a rule: "Do not add new
//! `ExprKind`/`StmtKind` variants for new behavior." So `rust { ... }` is surfaced as a
//! BEGIN-wrapped builtin call — syntactic sugar only. At the source level we replace the
//! block with:
//!
//! ```text
//! BEGIN { __stryke_rust_compile(q‹SOURCE›, $LINE); }
//! ```
//!
//! so later phases (lexer / parser / interpreter) see normal Perl. The runtime builtin
//! [`crate::rust_ffi::compile_and_register`] handles rustc invocation, dlopen, and sub
//! registration.
//!
//! The scanner is intentionally simple: it treats `rust` as a keyword only when it sits
//! at a Perl statement boundary (start-of-file, or after `;`, `{`, `}`, or a newline not
//! inside a string). Inside the block body we use Rust's brace/string/comment rules.
//! False positives on exotic constructs fall through unchanged — the existing lexer then
//! reports the normal Perl error.
/// Rewrite `code` so every top-level `rust { ... }` statement becomes a BEGIN-wrapped call
/// into the FFI runtime. Returns the modified source, or the original if no block exists.
///
/// This is called by [`crate::parse_with_file`] before the lexer runs. It is also a no-op
/// safe function: any source not containing a `rust {` sequence is returned unchanged with
/// minimal overhead (one `memmem`-style scan).
pub fn desugar_rust_blocks(code: &str) -> String {
// Fast path: no candidate substring → no work. Avoids full tokenization on every parse.
if !code.contains("rust") {
return code.to_string();
}
let bytes = code.as_bytes();
let mut out = String::with_capacity(code.len());
let mut i = 0;
let mut can_start_stmt = true; // start-of-file is a statement boundary
let mut line = 1usize;
while i < bytes.len() {
let c = bytes[i];
// Perl-side skipping: stay out of strings, regex literals, and comments so we never
// match `rust {` that is really inside `"...rust { ..."` or `# rust { ...`.
// Note: newlines DO NOT imply a statement boundary in Perl — expressions routinely
// span lines. Only `;`, `{`, `}` reset `can_start_stmt`.
match c {
b'\n' => {
out.push('\n');
i += 1;
line += 1;
continue;
}
b' ' | b'\t' | b'\r' => {
out.push(c as char);
i += 1;
continue;
}
b'#' => {
// Perl line comment to end of line.
while i < bytes.len() && bytes[i] != b'\n' {
out.push(bytes[i] as char);
i += 1;
}
continue;
}
b'"' | b'\'' | b'`' => {
// Double- / single- / backtick-quoted string. Treat `\X` as an escape.
let quote = c;
out.push(c as char);
i += 1;
while i < bytes.len() {
let b = bytes[i];
if b == b'\\' && i + 1 < bytes.len() {
out.push(b as char);
out.push(bytes[i + 1] as char);
if bytes[i + 1] == b'\n' {
line += 1;
}
i += 2;
continue;
}
out.push(b as char);
i += 1;
if b == b'\n' {
line += 1;
}
if b == quote {
break;
}
}
can_start_stmt = false;
continue;
}
b';' | b'{' | b'}' => {
out.push(c as char);
i += 1;
can_start_stmt = true;
continue;
}
_ => {}
}
// Candidate: identifier start?
let is_ident_start = c.is_ascii_alphabetic() || c == b'_';
if !is_ident_start {
out.push(c as char);
i += 1;
can_start_stmt = false;
continue;
}
// Read the identifier.
let start = i;
while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
i += 1;
}
let ident = &code[start..i];
// Qualify: only `rust` at a statement boundary, with an opening `{` following after
// optional whitespace. Anything else stays untouched.
if ident == "rust" && can_start_stmt {
// Peek past whitespace (newlines allowed) for `{`.
let mut j = i;
let mut inline_newlines = 0usize;
while j < bytes.len() && matches!(bytes[j], b' ' | b'\t' | b'\r' | b'\n') {
if bytes[j] == b'\n' {
inline_newlines += 1;
}
j += 1;
}
if j < bytes.len() && bytes[j] == b'{' {
// Found `rust { ... }` — scan the Rust body with Rust rules.
if let Some((body, end)) = scan_rust_block(bytes, j) {
let block_line = line;
// Total newlines consumed across `rust`→`{` whitespace + Rust body +
// trailing `}`. We pad the replacement with the same number of `\n`
// characters so later parser diagnostics keep their source line number.
let body_newlines = body.bytes().filter(|&b| b == b'\n').count();
let total_newlines = inline_newlines + body_newlines;
out.push_str(&emit_begin_call(body, block_line));
for _ in 0..total_newlines {
out.push('\n');
}
line += total_newlines;
i = end;
can_start_stmt = true;
continue;
}
// Unbalanced — let the normal lexer report the error.
}
}
out.push_str(ident);
can_start_stmt = false;
}
out
}
/// Scan a Rust `{ ... }` block starting at the `{` byte at `open`. Returns `(body, end)`
/// where `body` excludes the outer braces and `end` is the byte index immediately after
/// the closing `}`. Handles string literals, raw strings (`r"..."` / `r#"..."#`),
/// character literals, line comments, and nested block comments per Rust's lexer.
fn scan_rust_block(bytes: &[u8], open: usize) -> Option<(&str, usize)> {
debug_assert_eq!(bytes[open], b'{');
let mut i = open + 1;
let body_start = i;
let mut depth: i32 = 1;
while i < bytes.len() {
let c = bytes[i];
match c {
b'{' => {
depth += 1;
i += 1;
}
b'}' => {
depth -= 1;
if depth == 0 {
let body = std::str::from_utf8(&bytes[body_start..i]).ok()?;
return Some((body, i + 1));
}
i += 1;
}
b'/' if i + 1 < bytes.len() && bytes[i + 1] == b'/' => {
// Line comment.
i += 2;
while i < bytes.len() && bytes[i] != b'\n' {
i += 1;
}
}
b'/' if i + 1 < bytes.len() && bytes[i + 1] == b'*' => {
// Nested block comment.
i += 2;
let mut cdepth: i32 = 1;
while i < bytes.len() && cdepth > 0 {
if i + 1 < bytes.len() && bytes[i] == b'/' && bytes[i + 1] == b'*' {
cdepth += 1;
i += 2;
} else if i + 1 < bytes.len() && bytes[i] == b'*' && bytes[i + 1] == b'/' {
cdepth -= 1;
i += 2;
} else {
i += 1;
}
}
}
b'"' => {
// Rust string literal; `\"` escapes, `\\` escapes.
i += 1;
while i < bytes.len() {
match bytes[i] {
b'\\' if i + 1 < bytes.len() => i += 2,
b'"' => {
i += 1;
break;
}
_ => i += 1,
}
}
}
b'r' if i + 1 < bytes.len() && (bytes[i + 1] == b'"' || bytes[i + 1] == b'#') => {
// Raw string: `r"..."` or `r#"..."#` with any number of hashes.
let mut j = i + 1;
let mut hashes = 0usize;
while j < bytes.len() && bytes[j] == b'#' {
hashes += 1;
j += 1;
}
if j >= bytes.len() || bytes[j] != b'"' {
// Not a raw string start — treat as identifier.
i += 1;
continue;
}
j += 1;
// Find closing `"` followed by exactly `hashes` hashes.
while j < bytes.len() {
if bytes[j] == b'"' {
let mut k = j + 1;
let mut matched = 0;
while matched < hashes && k < bytes.len() && bytes[k] == b'#' {
matched += 1;
k += 1;
}
if matched == hashes {
j = k;
break;
}
j += 1;
} else {
j += 1;
}
}
i = j;
}
b'\'' => {
// Char literal or lifetime. If followed by a valid char-literal body + `'`,
// skip it. Otherwise treat as lifetime (no closing quote). Cheap heuristic:
// advance past `\x` escapes and one ascii char, then check for `'`.
let mut j = i + 1;
if j < bytes.len() && bytes[j] == b'\\' && j + 1 < bytes.len() {
j += 2;
// `\u{...}`
if j < bytes.len() && bytes[j] == b'{' {
while j < bytes.len() && bytes[j] != b'}' {
j += 1;
}
if j < bytes.len() {
j += 1;
}
}
} else if j < bytes.len() {
j += 1;
}
if j < bytes.len() && bytes[j] == b'\'' {
i = j + 1;
} else {
// Lifetime — skip the single quote and continue.
i += 1;
}
}
_ => i += 1,
}
}
None
}
/// Build the replacement Perl source for a `rust { BODY }` block. Uses `q<>` brackets with
/// an escape-proof `q«»` style delimiter to avoid colliding with the body content.
fn emit_begin_call(body: &str, line: usize) -> String {
// Use an unusual 4-char delimiter very unlikely to appear in Rust source; the Perl lexer
// accepts `q{…}` with nested braces, but we cannot guarantee Rust has balanced braces at
// the q{} level, so base64-encode the body instead and pass as a plain double-quoted
// string. That sidesteps all delimiter collision worries.
use base64::Engine as _;
let encoded = base64::engine::general_purpose::STANDARD.encode(body.as_bytes());
format!(
"BEGIN {{ __stryke_rust_compile(\"{}\", {}); }}",
encoded, line
)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn no_rust_keyword_pass_through() {
let src = "print 'hello';\n";
assert_eq!(desugar_rust_blocks(src), src);
}
#[test]
fn rust_keyword_in_string_is_not_expanded() {
let src = "print \"rust { not a block }\";\n";
assert_eq!(desugar_rust_blocks(src), src);
}
#[test]
fn rust_keyword_in_comment_is_not_expanded() {
let src = "# rust { not a block }\nprint 1;\n";
assert_eq!(desugar_rust_blocks(src), src);
}
#[test]
fn simple_rust_block_is_replaced_with_begin_call() {
let src =
"rust { pub extern \"C\" fn add(a: i64, b: i64) -> i64 { a + b } }\nprint add(1, 2);\n";
let out = desugar_rust_blocks(src);
assert!(out.contains("BEGIN"), "no BEGIN: {out}");
assert!(out.contains("__stryke_rust_compile"), "no builtin call");
assert!(!out.contains("pub extern"), "Rust body leaked: {out}");
assert!(out.contains("print add(1, 2);"));
}
#[test]
fn rust_block_with_nested_braces_is_balanced() {
let src = "rust { pub extern \"C\" fn f() -> i64 { let v = vec![1,2,3]; v.iter().sum::<i64>() } }";
let out = desugar_rust_blocks(src);
assert!(out.starts_with("BEGIN"));
assert!(!out.contains("pub extern"), "Rust body leaked");
}
#[test]
fn rust_block_with_string_containing_brace() {
let src = "rust { pub extern \"C\" fn g() -> i64 { let s = \"}\"; s.len() as i64 } }";
let out = desugar_rust_blocks(src);
assert!(out.starts_with("BEGIN"), "desugar failed: {out}");
assert!(!out.contains("pub extern"));
}
#[test]
fn rust_block_with_raw_string() {
let src = "rust { pub extern \"C\" fn h() -> i64 { let s = r#\"}\"#; s.len() as i64 } }";
let out = desugar_rust_blocks(src);
assert!(out.starts_with("BEGIN"), "desugar failed");
}
#[test]
fn rust_block_with_line_comment() {
let src = "rust { // } closing brace in comment\n pub extern \"C\" fn j() {} }";
let out = desugar_rust_blocks(src);
assert!(out.starts_with("BEGIN"), "desugar failed");
}
#[test]
fn rust_block_with_block_comment() {
let src = "rust { /* } closing brace */ pub extern \"C\" fn k() {} }";
let out = desugar_rust_blocks(src);
assert!(out.starts_with("BEGIN"), "desugar failed");
}
#[test]
fn identifier_starting_with_rust_is_not_matched() {
let src = "my $rusty = 1; sub rusty { 1 }\n";
assert_eq!(desugar_rust_blocks(src), src);
}
#[test]
fn rust_keyword_mid_expression_is_not_matched() {
// Here `rust` is not at a statement boundary (after `=`), so it is left alone.
let src = "my $x = rust { 1 }\n";
assert_eq!(desugar_rust_blocks(src), src);
}
}