tree-sitter-perl-c 0.14.0

Tree-sitter Perl grammar binding (C FFI). Conventional C/tree-sitter reference implementation, kept alongside the native v3 parser for compatibility and comparison.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
//! Tree-sitter Perl grammar binding (C FFI).
//!
//! This crate is the conventional C/tree-sitter reference implementation for
//! Perl, maintained alongside the native v3 Rust parser ([`perl-parser`]) for
//! compatibility testing and comparison. It vendors a snapshot of the upstream
//! [tree-sitter-perl] C grammar (`parser.c` + `scanner.c`) under `c-src/` and
//! exposes it via a hand-written FFI declaration — no `bindgen` or `libclang`
//! dependency is required to build.
//!
//! ## Relation to `tree-sitter-perl-rs`
//!
//! [`tree-sitter-perl-rs`] is a thin facade over the native v3 Rust parser and
//! is the recommended choice for new Rust projects. This crate (`tree-sitter-perl-c`)
//! should be preferred when:
//!
//! - **Compatibility testing** — comparing parse output against the upstream
//!   C reference grammar.
//! - **Non-Rust tree-sitter tooling** — the C grammar snapshot can be used as
//!   a build dependency for language bindings in other languages.
//! - **Baseline benchmarking** — measuring parse throughput of the C grammar
//!   against the native v3 parser.
//!
//! ## Build requirements
//!
//! Only a C compiler is required (e.g., `cc`/`gcc`/`clang` on Linux/macOS,
//! MSVC or MinGW on Windows). No `libclang` or `bindgen` toolchain is needed.
//!
//! ## Quick start
//!
//! ```rust
//! use tree_sitter_perl_c::parse_perl_code;
//!
//! let tree = parse_perl_code("my $x = 42;").unwrap();
//! assert!(!tree.root_node().has_error());
//! println!("{}", tree.root_node().to_sexp());
//! ```
//!
//! [tree-sitter-perl]: https://github.com/tree-sitter-perl/tree-sitter-perl
//! [`perl-parser`]: https://docs.rs/perl-parser
//! [`tree-sitter-perl-rs`]: https://docs.rs/tree-sitter-perl-rs

use std::{fmt, path::Path};
use tree_sitter::{Language, Parser};

/// Reusable Perl parser for hot parse loops.
///
/// Construct once with [`PerlParser::new`] and call [`PerlParser::parse_bytes`]
/// or [`PerlParser::parse_code`] repeatedly to avoid parser setup overhead.
#[non_exhaustive]
pub struct PerlParser {
    // Underlying tree-sitter parser. `PerlParser::new` obtains it from
    // `try_create_parser`, so it is already configured with the Perl grammar.
    parser: Parser,
}

/// Typed errors produced by Perl parse helpers in this crate.
///
/// The enum is `#[non_exhaustive]`, so downstream `match` expressions need a
/// wildcard arm.
#[non_exhaustive]
#[derive(Debug)]
pub enum ParsePerlError {
    /// Configuring the parser with the Perl language failed.
    ///
    /// Wraps the [`tree_sitter::LanguageError`] reported while setting the
    /// language on a parser.
    LanguageSetup(tree_sitter::LanguageError),
    /// Tree-sitter returned `None` instead of a parse tree.
    ParseReturnedNone,
    /// Reading source bytes from disk failed.
    ///
    /// Wraps the underlying [`std::io::Error`].
    Io(std::io::Error),
}

impl fmt::Display for ParsePerlError {
    /// Writes a short, human-readable description of the failure.
    ///
    /// Wrapped errors are rendered through their own `Display` implementation
    /// so the message stays user-facing; the `Debug` form (e.g. `Version(..)`)
    /// is reserved for diagnostics.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::LanguageSetup(error) => {
                // Use `Display`, not `Debug`, so this arm is consistent with
                // the `Io` arm below.
                write!(f, "failed to configure parser language: {error}")
            }
            Self::ParseReturnedNone => f.write_str("tree-sitter returned no parse tree"),
            Self::Io(error) => write!(f, "failed to read Perl source file: {error}"),
        }
    }
}

impl std::error::Error for ParsePerlError {
    /// Exposes the wrapped error, if any, as the underlying cause.
    ///
    /// `LanguageSetup` and `Io` forward their payload; `ParseReturnedNone`
    /// carries no inner error and therefore has no source.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        let cause: &(dyn std::error::Error + 'static) = match self {
            Self::ParseReturnedNone => return None,
            Self::LanguageSetup(inner) => inner,
            Self::Io(inner) => inner,
        };
        Some(cause)
    }
}

impl From<tree_sitter::LanguageError> for ParsePerlError {
    fn from(value: tree_sitter::LanguageError) -> Self {
        Self::LanguageSetup(value)
    }
}

impl From<std::io::Error> for ParsePerlError {
    fn from(value: std::io::Error) -> Self {
        Self::Io(value)
    }
}

/// Returns the tree-sitter [`Language`] for Perl (C grammar).
///
/// Use this to configure a [`tree_sitter::Parser`] or to create query objects
/// that target the Perl grammar. The value is obtained by calling the
/// `tree_sitter_perl` symbol declared in this file's `extern "C"` block.
///
/// # Example
///
/// ```rust
/// use tree_sitter_perl_c::language;
/// use tree_sitter::Parser;
///
/// let lang = language();
/// let mut parser = Parser::new();
/// parser.set_language(&lang).unwrap();
/// ```
pub fn language() -> Language {
    // SAFETY: `tree_sitter_perl` is generated by tree-sitter-cli and linked via `build.rs`.
    // It returns a raw pointer to a valid static TSLanguage struct for the Perl grammar. The function
    // has no preconditions on the calling thread, takes no arguments, holds no borrows, and
    // cannot cause aliasing or memory-safety issues. Soundness depends on the build script
    // linking the correct, ABI-compatible parser object, which cc::Build in build.rs guarantees.
    // NOTE(review): `build.rs` is not visible from this file — confirm the linking claim there.
    unsafe { tree_sitter_perl() }
}

/// Builds a fresh [`tree_sitter::Parser`] with the Perl grammar installed.
///
/// This is the fallible counterpart of [`create_parser`] and is the preferred
/// entry point for new code, because a failure to set the language is
/// reported to the caller.
///
/// # Errors
///
/// Returns the [`tree_sitter::LanguageError`] produced when the grammar's
/// language version is incompatible with the linked tree-sitter runtime (this
/// should not happen in practice).
///
/// # Example
///
/// ```rust
/// use tree_sitter_perl_c::try_create_parser;
///
/// let mut parser = try_create_parser().unwrap();
/// let tree = parser.parse("my $x = 1;", None).unwrap();
/// assert!(!tree.root_node().has_error());
/// ```
pub fn try_create_parser() -> Result<Parser, tree_sitter::LanguageError> {
    let perl = language();
    let mut configured = Parser::new();
    // Hand the parser back only if the language was accepted.
    configured.set_language(&perl).map(|()| configured)
}

impl PerlParser {
    /// Builds a reusable parser that is already configured for Perl.
    ///
    /// # Errors
    ///
    /// Propagates the [`tree_sitter::LanguageError`] from
    /// [`try_create_parser`] when the language cannot be set.
    pub fn new() -> Result<Self, tree_sitter::LanguageError> {
        try_create_parser().map(|parser| Self { parser })
    }

    /// Parses raw Perl source bytes with the parser held by this instance.
    ///
    /// # Errors
    ///
    /// Returns [`ParsePerlError::ParseReturnedNone`] when tree-sitter yields
    /// no tree.
    pub fn parse_bytes(&mut self, code: &[u8]) -> Result<tree_sitter::Tree, ParsePerlError> {
        let Self { parser } = self;
        try_parse_with_parser(parser, code)
    }

    /// Parses Perl source text with the parser held by this instance.
    ///
    /// The string is forwarded to [`PerlParser::parse_bytes`] as its UTF-8
    /// bytes.
    ///
    /// # Errors
    ///
    /// Same as [`PerlParser::parse_bytes`].
    pub fn parse_code(&mut self, code: &str) -> Result<tree_sitter::Tree, ParsePerlError> {
        let bytes = code.as_bytes();
        self.parse_bytes(bytes)
    }
}

/// Runs `parser` over `code` (with no previous tree) and turns a missing tree
/// into [`ParsePerlError::ParseReturnedNone`].
fn try_parse_with_parser(
    parser: &mut Parser,
    code: &[u8],
) -> Result<tree_sitter::Tree, ParsePerlError> {
    parser.parse(code, None).ok_or(ParsePerlError::ParseReturnedNone)
}

/// Builds a [`tree_sitter::Parser`] for Perl, discarding any error raised
/// while setting the language.
///
/// This exists as a compatibility shim. New code should call
/// [`try_create_parser`] so that version mismatches are surfaced instead of
/// swallowed.
///
/// # Example
///
/// ```rust
/// use tree_sitter_perl_c::create_parser;
///
/// let parser = create_parser();
/// assert!(parser.language().is_some());
/// ```
pub fn create_parser() -> Parser {
    let perl = language();
    let mut parser = Parser::new();
    // Deliberately best-effort: the parser is returned whether or not the
    // language was accepted.
    match parser.set_language(&perl) {
        Ok(()) | Err(_) => parser,
    }
}

/// Parses raw Perl source bytes into a [`tree_sitter::Tree`].
///
/// Arbitrary bytes are accepted, so callers can parse non-UTF-8 source files,
/// for example Perl scripts with Latin-1 encoded strings or binary data
/// embedded in `__DATA__` sections.
///
/// This is the boxed-error flavour of [`try_parse_perl_bytes`].
///
/// # Notes
///
/// The tree-sitter C grammar receives the raw bytes as-is. A UTF-8 BOM
/// (`\xEF\xBB\xBF`) at the start of the file is not stripped automatically
/// and may produce an error node in the resulting tree. Strip it before
/// calling this function if strict grammar compliance is required.
///
/// # Errors
///
/// Returns an error if the parser cannot be initialised (version mismatch) or
/// if tree-sitter returns `None` from `parse` (cancelled or timed out).
pub fn parse_perl_bytes(code: &[u8]) -> Result<tree_sitter::Tree, Box<dyn std::error::Error>> {
    // `?` boxes the typed `ParsePerlError` into `Box<dyn Error>`.
    let tree = try_parse_perl_bytes(code)?;
    Ok(tree)
}

/// Parses raw Perl source bytes into a [`tree_sitter::Tree`], reporting
/// failures as a typed [`ParsePerlError`].
///
/// A new parser is created for every call.
///
/// # Errors
///
/// - [`ParsePerlError::LanguageSetup`] if the parser cannot be configured.
/// - [`ParsePerlError::ParseReturnedNone`] if tree-sitter returns `None`
///   (parse cancellation/timeouts).
pub fn try_parse_perl_bytes(code: &[u8]) -> Result<tree_sitter::Tree, ParsePerlError> {
    // `?` converts the `LanguageError` into `ParsePerlError::LanguageSetup`
    // through the `From` impl in this file.
    let mut fresh_parser = try_create_parser()?;
    try_parse_with_parser(&mut fresh_parser, code)
}

/// Parses raw Perl source bytes with a parser supplied by the caller.
///
/// Meant for performance-sensitive code paths that reuse one
/// [`tree_sitter::Parser`] across many snippets. The parser must already be
/// configured with [`language`].
///
/// This is the boxed-error flavour of [`try_parse_perl_bytes_with_parser`].
///
/// # Errors
///
/// Returns an error if tree-sitter returns `None` from `parse` (cancelled or
/// timed out).
pub fn parse_perl_bytes_with_parser(
    parser: &mut Parser,
    code: &[u8],
) -> Result<tree_sitter::Tree, Box<dyn std::error::Error>> {
    // `?` boxes the typed `ParsePerlError` into `Box<dyn Error>`.
    Ok(try_parse_perl_bytes_with_parser(parser, code)?)
}

/// Parses Perl source bytes using a caller-provided configured [`tree_sitter::Parser`].
///
/// This typed variant allows callers to explicitly handle parse cancellation/timeouts
/// (`None` from tree-sitter) as [`ParsePerlError::ParseReturnedNone`].
pub fn try_parse_perl_bytes_with_parser(
    parser: &mut Parser,
    code: &[u8],
) -> Result<tree_sitter::Tree, ParsePerlError> {
    try_parse_with_parser(parser, code)
}

/// Parses Perl source text into a [`tree_sitter::Tree`].
///
/// This is the boxed-error flavour of [`try_parse_perl_code`].
///
/// # Errors
///
/// Returns an error if the parser cannot be initialised (version mismatch) or
/// if tree-sitter returns `None` from `parse` (cancelled or timed out).
///
/// # Example
///
/// ```rust
/// use tree_sitter_perl_c::parse_perl_code;
///
/// let tree = parse_perl_code("my $x = 42;").unwrap();
/// assert!(!tree.root_node().has_error());
/// ```
pub fn parse_perl_code(code: &str) -> Result<tree_sitter::Tree, Box<dyn std::error::Error>> {
    // `?` boxes the typed `ParsePerlError` into `Box<dyn Error>`.
    let tree = try_parse_perl_code(code)?;
    Ok(tree)
}

/// Parses Perl source text into a [`tree_sitter::Tree`], reporting failures
/// as a typed [`ParsePerlError`].
///
/// The text is handed to [`try_parse_perl_bytes`] as its UTF-8 bytes.
///
/// # Errors
///
/// Lets callers tell apart a parser setup failure
/// ([`ParsePerlError::LanguageSetup`]) from tree-sitter returning no parse
/// tree ([`ParsePerlError::ParseReturnedNone`]).
pub fn try_parse_perl_code(code: &str) -> Result<tree_sitter::Tree, ParsePerlError> {
    let source_bytes = code.as_bytes();
    try_parse_perl_bytes(source_bytes)
}

/// Parses Perl source text with a parser supplied by the caller.
///
/// Reusing one parser avoids creating and configuring a new one for each
/// parse call. The parser must already be configured with [`language`].
///
/// This is the boxed-error flavour of [`try_parse_perl_code_with_parser`].
///
/// # Errors
///
/// Returns an error if tree-sitter returns `None` from `parse` (cancelled or
/// timed out).
pub fn parse_perl_code_with_parser(
    parser: &mut Parser,
    code: &str,
) -> Result<tree_sitter::Tree, Box<dyn std::error::Error>> {
    // `?` boxes the typed `ParsePerlError` into `Box<dyn Error>`.
    Ok(try_parse_perl_code_with_parser(parser, code)?)
}

/// Parses Perl source text with a parser supplied by the caller, reporting
/// failures as a typed [`ParsePerlError`].
///
/// The parser is expected to be configured with [`language`] already; the
/// text is parsed as its UTF-8 bytes.
///
/// # Errors
///
/// Returns [`ParsePerlError::ParseReturnedNone`] when tree-sitter yields
/// `None` (parse cancellation/timeouts).
pub fn try_parse_perl_code_with_parser(
    parser: &mut Parser,
    code: &str,
) -> Result<tree_sitter::Tree, ParsePerlError> {
    let source_bytes = code.as_bytes();
    try_parse_perl_bytes_with_parser(parser, source_bytes)
}

/// Loads the file at `path` and parses its contents as Perl source.
///
/// This is the boxed-error flavour of [`try_parse_perl_file`].
///
/// # Errors
///
/// Returns an error if the file cannot be read or if parsing fails (see
/// [`parse_perl_code`]).
pub fn parse_perl_file<P: AsRef<Path>>(
    path: P,
) -> Result<tree_sitter::Tree, Box<dyn std::error::Error>> {
    // `?` boxes the typed `ParsePerlError` into `Box<dyn Error>`.
    let tree = try_parse_perl_file(path)?;
    Ok(tree)
}

/// Loads the file at `path` and parses its contents as Perl source, reporting
/// failures as a typed [`ParsePerlError`].
///
/// The file is read as raw bytes, so it does not have to be valid UTF-8.
///
/// # Errors
///
/// - [`ParsePerlError::Io`] if the file cannot be read.
/// - [`ParsePerlError::LanguageSetup`] or
///   [`ParsePerlError::ParseReturnedNone`] if parsing fails (see
///   [`try_parse_perl_bytes`]).
pub fn try_parse_perl_file<P: AsRef<Path>>(path: P) -> Result<tree_sitter::Tree, ParsePerlError> {
    // `?` converts the `io::Error` into `ParsePerlError::Io` through the
    // `From` impl in this file.
    let contents = std::fs::read(path)?;
    try_parse_perl_bytes(contents.as_slice())
}

/// Reports which scanner backend this crate uses.
///
/// The result is always `"c-scanner"`. It is handy when code has to tell this
/// crate apart from the Rust-native [`tree-sitter-perl-rs`] backend.
///
/// [`tree-sitter-perl-rs`]: https://docs.rs/tree-sitter-perl-rs
pub fn get_scanner_config() -> &'static str {
    const BACKEND_ID: &str = "c-scanner";
    BACKEND_ID
}

#[cfg(test)]
mod tests {
    use super::*;
    use tree_sitter::{Query, QueryCursor, StreamingIterator};

    // Injection query source, embedded at compile time from the sibling
    // `tree-sitter-perl` directory (path is relative to this source file).
    const INJECTIONS_QUERY: &str = include_str!("../../../tree-sitter-perl/queries/injections.scm");

    /// Resolves a query capture to `(capture name, captured source text)`.
    ///
    /// Returns `None` when the capture index has no name in `query` or when
    /// the node's text is not valid UTF-8.
    fn capture_text<'a>(
        query: &'a Query,
        code: &'a str,
        capture: tree_sitter::QueryCapture<'a>,
    ) -> Option<(&'a str, &'a str)> {
        let name = query.capture_names().get(capture.index as usize)?;
        let text = capture.node.utf8_text(code.as_bytes()).ok()?;
        Some((*name, text))
    }

    /// The linked grammar must report at least one node kind.
    #[test]
    fn test_language_loading() {
        let lang = language();
        let count = lang.node_kind_count();
        println!("C implementation node kind count: {}", count);
        // Language is valid if we can get its node kind count
        assert!(count > 0);
    }

    /// A simple declaration parses from a `&str` without error nodes.
    #[test]
    fn test_basic_parsing() -> Result<(), Box<dyn std::error::Error>> {
        let code = "my $var = 'hello';";
        let tree = parse_perl_code(code)?;
        assert!(!tree.root_node().has_error());
        Ok(())
    }

    /// A simple declaration parses from raw bytes without error nodes.
    #[test]
    fn test_parse_bytes() -> Result<(), Box<dyn std::error::Error>> {
        let code = b"my $var = 'hello';";
        let tree = parse_perl_bytes(code)?;
        assert!(!tree.root_node().has_error());
        Ok(())
    }

    /// One caller-owned parser can parse several byte inputs in a row.
    #[test]
    fn test_parse_bytes_with_reused_parser() -> Result<(), Box<dyn std::error::Error>> {
        let mut parser = try_create_parser()?;

        let first = parse_perl_bytes_with_parser(&mut parser, b"my $x = 1;")?;
        assert!(!first.root_node().has_error());

        let second = parse_perl_bytes_with_parser(&mut parser, b"my $y = 2;")?;
        assert!(!second.root_node().has_error());

        Ok(())
    }

    /// One caller-owned parser can parse several string inputs in a row.
    #[test]
    fn test_parse_code_with_reused_parser() -> Result<(), Box<dyn std::error::Error>> {
        let mut parser = try_create_parser()?;

        let first = parse_perl_code_with_parser(&mut parser, "my $name = 'Perl';")?;
        assert!(!first.root_node().has_error());

        let second = parse_perl_code_with_parser(&mut parser, "print $name;")?;
        assert!(!second.root_node().has_error());

        Ok(())
    }

    /// A parser that never had a language set yields `ParseReturnedNone`.
    #[test]
    fn test_typed_parse_none_error_variant_is_emitted() {
        // Deliberately skip `set_language` so `parse` has nothing to parse with.
        let mut parser = Parser::new();
        let result = try_parse_with_parser(&mut parser, b"my $var = 'hello';");
        assert!(matches!(result, Err(ParsePerlError::ParseReturnedNone)));
    }

    /// `From<LanguageError>` maps onto the `LanguageSetup` variant.
    #[test]
    fn test_typed_language_setup_error_variant_mapping() {
        let error = ParsePerlError::from(tree_sitter::LanguageError::Version(0));
        assert!(matches!(error, ParsePerlError::LanguageSetup(_)));
    }

    /// The infallible shim still returns a parser with a language set.
    #[test]
    fn test_parser_creation() {
        let parser = create_parser();
        assert!(parser.language().is_some());
    }

    /// `PerlParser` can be reused for consecutive valid inputs.
    #[test]
    fn test_reusable_parser_parses_multiple_inputs() -> Result<(), Box<dyn std::error::Error>> {
        let mut parser = PerlParser::new()?;
        let first = parser.parse_code("my $x = 1;")?;
        let second = parser.parse_code("my $y = 2;")?;
        assert!(!first.root_node().has_error());
        assert!(!second.root_node().has_error());
        Ok(())
    }

    /// Verify that error state from one parse does not bleed into the next.
    /// A parser reused after parsing invalid Perl must produce a clean tree
    /// for the subsequent valid input.
    #[test]
    fn test_reusable_parser_error_state_does_not_bleed() -> Result<(), Box<dyn std::error::Error>> {
        let mut parser = PerlParser::new()?;
        // First parse: syntactically invalid Perl — tree must exist but have error nodes.
        let bad_tree = parser.parse_code("my $x = @@@@@@;")?;
        assert!(bad_tree.root_node().has_error(), "invalid Perl should produce error nodes");
        // Second parse: valid Perl — must produce a clean tree despite the previous error.
        let good_tree = parser.parse_code("my $y = 42;")?;
        assert!(!good_tree.root_node().has_error(), "valid Perl after error parse must be clean");
        Ok(())
    }

    /// The injection query must produce a match in which the package is
    /// `Inline`, the language is `CPP`, and the injected content is the
    /// heredoc body.
    #[test]
    fn test_inline_cpp_injection_query_matches_heredoc_body()
    -> Result<(), Box<dyn std::error::Error>> {
        let code = "use Inline CPP => <<'END_CPP';\n#include <string>\nclass Greet {};\nEND_CPP\n";
        let tree = parse_perl_code(code)?;
        let query = Query::new(&language(), INJECTIONS_QUERY)?;
        let mut cursor = QueryCursor::new();

        let mut matched = false;
        let mut matches = cursor.matches(&query, tree.root_node(), code.as_bytes());
        // `matches` is a streaming iterator, hence `while let` instead of `for`.
        while let Some(m) = matches.next() {
            // Flags are tracked per match: all three must hold within one match.
            let mut saw_inline_package = false;
            let mut saw_inline_language = false;
            let mut saw_injection_content = false;

            for capture in m.captures {
                if let Some((name, text)) = capture_text(&query, code, *capture) {
                    match name {
                        "inline.package" => saw_inline_package = text == "Inline",
                        "inline.language" => saw_inline_language = text == "CPP",
                        "injection.content" => {
                            saw_injection_content = capture.node.kind() == "heredoc_content"
                                && text.contains("#include <string>");
                        }
                        _ => {}
                    }
                }
            }

            if saw_inline_package && saw_inline_language && saw_injection_content {
                matched = true;
                break;
            }
        }

        assert!(matched, "expected Inline::CPP heredoc to match the injection query");
        Ok(())
    }

    /// Verify that `parse_perl_bytes` returns a tree (possibly with error nodes) for
    /// input prefixed with a UTF-8 BOM.  The BOM is NOT stripped; callers are responsible
    /// for removing it if the grammar produces undesired error nodes.
    #[test]
    fn test_parse_bytes_with_utf8_bom_returns_tree() -> Result<(), Box<dyn std::error::Error>> {
        // UTF-8 BOM (\xEF\xBB\xBF) followed by valid Perl
        let bom_source = b"\xEF\xBB\xBFmy $x = 1;";
        let tree = parse_perl_bytes(bom_source)?;
        // The tree must be returned even if the BOM causes an error node
        assert_eq!(tree.root_node().kind(), "source_file");
        Ok(())
    }

    /// Verify that `parse_perl_bytes` handles a completely empty input.
    #[test]
    fn test_parse_bytes_empty_source() -> Result<(), Box<dyn std::error::Error>> {
        let tree = parse_perl_bytes(b"")?;
        assert_eq!(tree.root_node().kind(), "source_file");
        Ok(())
    }
}

// SAFETY: See the SAFETY comment on the `language()` function above.
// This is the only unsafe code in the crate — the single FFI symbol we need
// from the compiled C grammar. No bindgen is used; the declaration is
// hand-written to avoid a libclang build dependency.
// NOTE(review): the symbol is declared as returning `Language` by value, which
// presumably relies on `Language` being ABI-compatible with the pointer the C
// function returns — confirm against the `tree_sitter` crate version in use.
unsafe extern "C" {
    // Entry point exported by the compiled Perl grammar; called only from `language()`.
    fn tree_sitter_perl() -> Language;
}