Skip to main content

tree_sitter_perl_c/
lib.rs

1//! Tree-sitter Perl grammar binding (C FFI).
2//!
3//! This crate is the conventional C/tree-sitter reference implementation for
4//! Perl, maintained alongside the native v3 Rust parser ([`perl-parser`]) for
5//! compatibility testing and comparison. It vendors a snapshot of the upstream
6//! [tree-sitter-perl] C grammar (`parser.c` + `scanner.c`) under `c-src/` and
7//! exposes it via a hand-written FFI declaration — no `bindgen` or `libclang`
8//! dependency is required to build.
9//!
10//! ## Relation to `tree-sitter-perl-rs`
11//!
12//! [`tree-sitter-perl-rs`] is a thin facade over the native v3 Rust parser and
13//! is the recommended choice for new Rust projects. This crate (`tree-sitter-perl-c`)
//! should be preferred for:
15//!
16//! - **Compatibility testing** — comparing parse output against the upstream
17//!   C reference grammar.
18//! - **Non-Rust tree-sitter tooling** — the C grammar snapshot can be used as
19//!   a build dependency for language bindings in other languages.
20//! - **Baseline benchmarking** — measuring parse throughput of the C grammar
21//!   against the native v3 parser.
22//!
23//! ## Build requirements
24//!
25//! Only a C compiler is required (e.g., `cc`/`gcc`/`clang` on Linux/macOS,
26//! MSVC or MinGW on Windows). No `libclang` or `bindgen` toolchain is needed.
27//!
28//! ## Quick start
29//!
30//! ```rust
31//! use tree_sitter_perl_c::parse_perl_code;
32//!
33//! let tree = parse_perl_code("my $x = 42;").unwrap();
34//! assert!(!tree.root_node().has_error());
35//! println!("{}", tree.root_node().to_sexp());
36//! ```
37//!
38//! [tree-sitter-perl]: https://github.com/tree-sitter-perl/tree-sitter-perl
39//! [`perl-parser`]: https://docs.rs/perl-parser
40//! [`tree-sitter-perl-rs`]: https://docs.rs/tree-sitter-perl-rs
41
42use std::{fmt, path::Path};
43use tree_sitter::{Language, Parser};
44
/// Reusable Perl parser for hot parse loops.
///
/// Construct once and call [`PerlParser::parse_bytes`] or
/// [`PerlParser::parse_code`] repeatedly to avoid parser setup overhead.
///
/// The wrapped [`tree_sitter::Parser`] is private; instances are obtained
/// through [`PerlParser::new`], which configures it with the Perl grammar.
#[non_exhaustive]
pub struct PerlParser {
    /// Underlying tree-sitter parser, reused for every parse call made
    /// through this wrapper.
    parser: Parser,
}
53
/// Typed errors produced by Perl parse helpers in this crate.
///
/// The `try_*` helpers return this type directly; the untyped helpers box it
/// into a `Box<dyn std::error::Error>`.
#[non_exhaustive]
#[derive(Debug)]
pub enum ParsePerlError {
    /// Configuring the parser with the Perl language failed.
    ///
    /// Carries the [`tree_sitter::LanguageError`] reported while setting the
    /// language on a freshly created parser.
    LanguageSetup(tree_sitter::LanguageError),
    /// Tree-sitter returned `None` instead of a parse tree.
    ///
    /// This crate's unit tests show this outcome for a parser that has no
    /// language configured.
    ParseReturnedNone,
    /// Reading source bytes from disk failed.
    ///
    /// Carries the [`std::io::Error`] returned by the file read.
    Io(std::io::Error),
}
65
66impl fmt::Display for ParsePerlError {
67    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
68        match self {
69            Self::LanguageSetup(error) => {
70                write!(f, "failed to configure parser language: {error:?}")
71            }
72            Self::ParseReturnedNone => write!(f, "tree-sitter returned no parse tree"),
73            Self::Io(error) => write!(f, "failed to read Perl source file: {error}"),
74        }
75    }
76}
77
78impl std::error::Error for ParsePerlError {
79    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
80        match self {
81            Self::LanguageSetup(error) => Some(error),
82            Self::ParseReturnedNone => None,
83            Self::Io(error) => Some(error),
84        }
85    }
86}
87
88impl From<tree_sitter::LanguageError> for ParsePerlError {
89    fn from(value: tree_sitter::LanguageError) -> Self {
90        Self::LanguageSetup(value)
91    }
92}
93
94impl From<std::io::Error> for ParsePerlError {
95    fn from(value: std::io::Error) -> Self {
96        Self::Io(value)
97    }
98}
99
/// Returns the tree-sitter [`Language`] for Perl (C grammar).
///
/// Use this to configure a [`tree_sitter::Parser`] or to create query objects
/// that target the Perl grammar. Every call goes through the
/// `tree_sitter_perl` FFI entry point declared at the bottom of this file.
///
/// # Example
///
/// ```rust
/// use tree_sitter_perl_c::language;
/// use tree_sitter::Parser;
///
/// let lang = language();
/// let mut parser = Parser::new();
/// parser.set_language(&lang).unwrap();
/// ```
pub fn language() -> Language {
    // SAFETY: `tree_sitter_perl` is generated by tree-sitter-cli and linked via `build.rs`.
    // It returns a raw pointer to a valid static TSLanguage struct for the Perl grammar. The function
    // has no preconditions on the calling thread, takes no arguments, holds no borrows, and
    // cannot cause aliasing or memory-safety issues. Soundness depends on the build script
    // linking the correct, ABI-compatible parser object, which cc::Build in build.rs guarantees.
    unsafe { tree_sitter_perl() }
}
123
124/// Creates a [`tree_sitter::Parser`] configured for Perl.
125///
126/// Returns an error if the language version is incompatible with the linked
127/// tree-sitter runtime (this should not happen in practice).
128///
129/// Prefer this over [`create_parser`] in new code — it surfaces errors
130/// explicitly.
131///
132/// # Example
133///
134/// ```rust
135/// use tree_sitter_perl_c::try_create_parser;
136///
137/// let mut parser = try_create_parser().unwrap();
138/// let tree = parser.parse("my $x = 1;", None).unwrap();
139/// assert!(!tree.root_node().has_error());
140/// ```
141pub fn try_create_parser() -> Result<Parser, tree_sitter::LanguageError> {
142    let mut parser = Parser::new();
143    parser.set_language(&language())?;
144    Ok(parser)
145}
146
147impl PerlParser {
148    /// Creates a reusable Perl parser instance.
149    pub fn new() -> Result<Self, tree_sitter::LanguageError> {
150        Ok(Self { parser: try_create_parser()? })
151    }
152
153    /// Parses Perl source bytes using this parser instance.
154    pub fn parse_bytes(&mut self, code: &[u8]) -> Result<tree_sitter::Tree, ParsePerlError> {
155        try_parse_with_parser(&mut self.parser, code)
156    }
157
158    /// Parses Perl source text using this parser instance.
159    pub fn parse_code(&mut self, code: &str) -> Result<tree_sitter::Tree, ParsePerlError> {
160        self.parse_bytes(code.as_bytes())
161    }
162}
163
164fn try_parse_with_parser(
165    parser: &mut Parser,
166    code: &[u8],
167) -> Result<tree_sitter::Tree, ParsePerlError> {
168    match parser.parse(code, None) {
169        Some(tree) => Ok(tree),
170        None => Err(ParsePerlError::ParseReturnedNone),
171    }
172}
173
174/// Creates a [`tree_sitter::Parser`] configured for Perl, silently ignoring
175/// language-set errors.
176///
177/// This is a compatibility shim. Prefer [`try_create_parser`] in new code so
178/// that version mismatches are not swallowed.
179///
180/// # Example
181///
182/// ```rust
183/// use tree_sitter_perl_c::create_parser;
184///
185/// let parser = create_parser();
186/// assert!(parser.language().is_some());
187/// ```
188pub fn create_parser() -> Parser {
189    let mut parser = Parser::new();
190    let _ = parser.set_language(&language());
191    parser
192}
193
194/// Parses Perl source bytes and returns the resulting [`tree_sitter::Tree`].
195///
196/// This accepts arbitrary bytes so callers can parse non-UTF-8 source files,
197/// for example Perl scripts with Latin-1 encoded strings or binary data
198/// embedded in `__DATA__` sections.
199///
200/// # Notes
201///
202/// The tree-sitter C grammar receives the raw bytes as-is. A UTF-8 BOM
203/// (`\xEF\xBB\xBF`) at the start of the file is not stripped automatically
204/// and may produce an error node in the resulting tree.  Strip it before
205/// calling this function if strict grammar compliance is required.
206///
207/// # Errors
208///
209/// Returns an error if the parser cannot be initialised (version mismatch) or
210/// if tree-sitter returns `None` from `parse` (cancelled or timed out).
211pub fn parse_perl_bytes(code: &[u8]) -> Result<tree_sitter::Tree, Box<dyn std::error::Error>> {
212    try_parse_perl_bytes(code).map_err(Into::into)
213}
214
215/// Parses Perl source bytes and returns the resulting [`tree_sitter::Tree`].
216///
217/// This typed variant allows callers to distinguish parser setup failures from
218/// parse cancellation/timeouts (`None` from tree-sitter).
219pub fn try_parse_perl_bytes(code: &[u8]) -> Result<tree_sitter::Tree, ParsePerlError> {
220    let mut parser = try_create_parser().map_err(ParsePerlError::LanguageSetup)?;
221    try_parse_with_parser(&mut parser, code)
222}
223
224/// Parses Perl source bytes using a caller-provided configured [`tree_sitter::Parser`].
225///
226/// This helper is intended for performance-sensitive code paths where a single
227/// parser is reused across many snippets. The parser must already be configured
228/// with [`language`].
229///
230/// # Errors
231///
232/// Returns an error if tree-sitter returns `None` from `parse` (cancelled or
233/// timed out).
234pub fn parse_perl_bytes_with_parser(
235    parser: &mut Parser,
236    code: &[u8],
237) -> Result<tree_sitter::Tree, Box<dyn std::error::Error>> {
238    try_parse_perl_bytes_with_parser(parser, code).map_err(Into::into)
239}
240
/// Parses Perl source bytes using a caller-provided configured [`tree_sitter::Parser`].
///
/// This typed variant allows callers to explicitly handle parse cancellation/timeouts
/// (`None` from tree-sitter) as [`ParsePerlError::ParseReturnedNone`].
///
/// The parser is used as given; this function does not set a language on it.
///
/// # Errors
///
/// Returns [`ParsePerlError::ParseReturnedNone`] when tree-sitter yields no
/// tree. This crate's unit tests show that outcome for a parser without a
/// configured language.
pub fn try_parse_perl_bytes_with_parser(
    parser: &mut Parser,
    code: &[u8],
) -> Result<tree_sitter::Tree, ParsePerlError> {
    try_parse_with_parser(parser, code)
}
251
252/// Parses a Perl source string and returns the resulting [`tree_sitter::Tree`].
253///
254/// # Errors
255///
256/// Returns an error if the parser cannot be initialised (version mismatch) or
257/// if tree-sitter returns `None` from `parse` (cancelled or timed out).
258///
259/// # Example
260///
261/// ```rust
262/// use tree_sitter_perl_c::parse_perl_code;
263///
264/// let tree = parse_perl_code("my $x = 42;").unwrap();
265/// assert!(!tree.root_node().has_error());
266/// ```
267pub fn parse_perl_code(code: &str) -> Result<tree_sitter::Tree, Box<dyn std::error::Error>> {
268    try_parse_perl_code(code).map_err(Into::into)
269}
270
271/// Parses a Perl source string and returns the resulting [`tree_sitter::Tree`].
272///
273/// This typed variant allows callers to inspect whether parser setup failed or
274/// tree-sitter returned no parse tree.
275pub fn try_parse_perl_code(code: &str) -> Result<tree_sitter::Tree, ParsePerlError> {
276    try_parse_perl_bytes(code.as_bytes())
277}
278
279/// Parses a Perl source string using a caller-provided configured [`tree_sitter::Parser`].
280///
281/// This helper avoids creating and configuring a new parser for each parse call.
282/// The parser must already be configured with [`language`].
283///
284/// # Errors
285///
286/// Returns an error if tree-sitter returns `None` from `parse` (cancelled or
287/// timed out).
288pub fn parse_perl_code_with_parser(
289    parser: &mut Parser,
290    code: &str,
291) -> Result<tree_sitter::Tree, Box<dyn std::error::Error>> {
292    try_parse_perl_code_with_parser(parser, code).map_err(Into::into)
293}
294
295/// Parses a Perl source string using a caller-provided configured [`tree_sitter::Parser`].
296///
297/// This typed variant allows callers to explicitly handle parse cancellation/timeouts
298/// (`None` from tree-sitter) as [`ParsePerlError::ParseReturnedNone`].
299pub fn try_parse_perl_code_with_parser(
300    parser: &mut Parser,
301    code: &str,
302) -> Result<tree_sitter::Tree, ParsePerlError> {
303    try_parse_perl_bytes_with_parser(parser, code.as_bytes())
304}
305
306/// Reads a file from `path` and parses it as Perl source.
307///
308/// # Errors
309///
310/// Returns an error if the file cannot be read or if parsing fails (see
311/// [`parse_perl_code`]).
312pub fn parse_perl_file<P: AsRef<Path>>(
313    path: P,
314) -> Result<tree_sitter::Tree, Box<dyn std::error::Error>> {
315    try_parse_perl_file(path).map_err(Into::into)
316}
317
318/// Reads a file from `path` and parses it as Perl source.
319///
320/// This typed variant allows callers to distinguish IO failures from parser
321/// setup and parse-`None` outcomes.
322pub fn try_parse_perl_file<P: AsRef<Path>>(path: P) -> Result<tree_sitter::Tree, ParsePerlError> {
323    let code = std::fs::read(path).map_err(ParsePerlError::Io)?;
324    try_parse_perl_bytes(&code)
325}
326
/// Reports which scanner backend this crate provides.
///
/// The result is always `"c-scanner"`. Useful when code needs to distinguish
/// between this crate and the Rust-native [`tree-sitter-perl-rs`] backend.
///
/// [`tree-sitter-perl-rs`]: https://docs.rs/tree-sitter-perl-rs
pub fn get_scanner_config() -> &'static str {
    const BACKEND_ID: &str = "c-scanner";
    BACKEND_ID
}
336
#[cfg(test)]
mod tests {
    use super::*;
    use tree_sitter::{Query, QueryCursor, StreamingIterator};

    const INJECTIONS_QUERY: &str = include_str!("../../../tree-sitter-perl/queries/injections.scm");

    /// Resolves a capture to its `(capture name, captured source text)` pair.
    ///
    /// Yields `None` when the capture index has no entry in the query's
    /// capture-name table or when `utf8_text` fails for the captured node.
    fn capture_text<'a>(
        query: &'a Query,
        code: &'a str,
        capture: tree_sitter::QueryCapture<'a>,
    ) -> Option<(&'a str, &'a str)> {
        let label = query.capture_names().get(capture.index as usize).copied()?;
        let snippet = capture.node.utf8_text(code.as_bytes()).ok()?;
        Some((label, snippet))
    }

    /// The linked grammar is considered loaded when it reports at least one
    /// node kind.
    #[test]
    fn test_language_loading() {
        let kinds = language().node_kind_count();
        println!("C implementation node kind count: {}", kinds);
        assert!(kinds > 0);
    }

    /// A simple declaration parses from text without error nodes.
    #[test]
    fn test_basic_parsing() -> Result<(), Box<dyn std::error::Error>> {
        let tree = parse_perl_code("my $var = 'hello';")?;
        assert!(!tree.root_node().has_error());
        Ok(())
    }

    /// A simple declaration parses from bytes without error nodes.
    #[test]
    fn test_parse_bytes() -> Result<(), Box<dyn std::error::Error>> {
        let tree = parse_perl_bytes(b"my $var = 'hello';")?;
        assert!(!tree.root_node().has_error());
        Ok(())
    }

    /// One caller-owned parser handles several byte inputs in sequence.
    #[test]
    fn test_parse_bytes_with_reused_parser() -> Result<(), Box<dyn std::error::Error>> {
        let mut shared = try_create_parser()?;

        for source in [&b"my $x = 1;"[..], &b"my $y = 2;"[..]] {
            let tree = parse_perl_bytes_with_parser(&mut shared, source)?;
            assert!(!tree.root_node().has_error());
        }

        Ok(())
    }

    /// One caller-owned parser handles several text inputs in sequence.
    #[test]
    fn test_parse_code_with_reused_parser() -> Result<(), Box<dyn std::error::Error>> {
        let mut shared = try_create_parser()?;

        for source in ["my $name = 'Perl';", "print $name;"] {
            let tree = parse_perl_code_with_parser(&mut shared, source)?;
            assert!(!tree.root_node().has_error());
        }

        Ok(())
    }

    /// A parser with no language set yields the `ParseReturnedNone` variant.
    #[test]
    fn test_typed_parse_none_error_variant_is_emitted() {
        let mut unconfigured = Parser::new();
        let outcome = try_parse_with_parser(&mut unconfigured, b"my $var = 'hello';");
        assert!(matches!(outcome, Err(ParsePerlError::ParseReturnedNone)));
    }

    /// `From<LanguageError>` maps onto the `LanguageSetup` variant.
    #[test]
    fn test_typed_language_setup_error_variant_mapping() {
        let converted = ParsePerlError::from(tree_sitter::LanguageError::Version(0));
        assert!(matches!(converted, ParsePerlError::LanguageSetup(_)));
    }

    /// The lenient constructor still ends up with a language installed.
    #[test]
    fn test_parser_creation() {
        assert!(create_parser().language().is_some());
    }

    /// `PerlParser` can be reused; both trees are checked after both parses.
    #[test]
    fn test_reusable_parser_parses_multiple_inputs() -> Result<(), Box<dyn std::error::Error>> {
        let mut reusable = PerlParser::new()?;
        let trees = [reusable.parse_code("my $x = 1;")?, reusable.parse_code("my $y = 2;")?];
        for tree in &trees {
            assert!(!tree.root_node().has_error());
        }
        Ok(())
    }

    /// Error state from one parse must not leak into the next: after parsing
    /// invalid Perl, the same parser has to produce a clean tree for valid
    /// input.
    #[test]
    fn test_reusable_parser_error_state_does_not_bleed() -> Result<(), Box<dyn std::error::Error>> {
        let mut reusable = PerlParser::new()?;

        // Invalid input: a tree is still returned, but it carries error nodes.
        let broken = reusable.parse_code("my $x = @@@@@@;")?;
        assert!(broken.root_node().has_error(), "invalid Perl should produce error nodes");

        // Valid input afterwards: the tree must be free of errors.
        let clean = reusable.parse_code("my $y = 42;")?;
        assert!(!clean.root_node().has_error(), "valid Perl after error parse must be clean");

        Ok(())
    }

    /// The injections query must produce a match whose captures identify the
    /// `Inline` package, the `CPP` language, and the heredoc body.
    #[test]
    fn test_inline_cpp_injection_query_matches_heredoc_body()
    -> Result<(), Box<dyn std::error::Error>> {
        let code = "use Inline CPP => <<'END_CPP';\n#include <string>\nclass Greet {};\nEND_CPP\n";
        let tree = parse_perl_code(code)?;
        let query = Query::new(&language(), INJECTIONS_QUERY)?;
        let mut cursor = QueryCursor::new();

        let mut found = false;
        // `matches` is a streaming iterator, so it is advanced manually.
        let mut hits = cursor.matches(&query, tree.root_node(), code.as_bytes());
        while let Some(hit) = hits.next() {
            let mut package_ok = false;
            let mut language_ok = false;
            let mut content_ok = false;

            for capture in hit.captures {
                let Some((label, snippet)) = capture_text(&query, code, *capture) else {
                    continue;
                };
                match label {
                    "inline.package" => package_ok = snippet == "Inline",
                    "inline.language" => language_ok = snippet == "CPP",
                    "injection.content" => {
                        content_ok = capture.node.kind() == "heredoc_content"
                            && snippet.contains("#include <string>");
                    }
                    _ => {}
                }
            }

            found = package_ok && language_ok && content_ok;
            if found {
                break;
            }
        }

        assert!(found, "expected Inline::CPP heredoc to match the injection query");
        Ok(())
    }

    /// `parse_perl_bytes` returns a tree (possibly with error nodes) for input
    /// that starts with a UTF-8 BOM. The BOM is NOT stripped; callers are
    /// responsible for removing it if the grammar produces undesired error
    /// nodes.
    #[test]
    fn test_parse_bytes_with_utf8_bom_returns_tree() -> Result<(), Box<dyn std::error::Error>> {
        // UTF-8 BOM (\xEF\xBB\xBF) followed by valid Perl.
        let tree = parse_perl_bytes(b"\xEF\xBB\xBFmy $x = 1;")?;
        // Only the root kind is checked; error nodes from the BOM are tolerated.
        assert_eq!(tree.root_node().kind(), "source_file");
        Ok(())
    }

    /// `parse_perl_bytes` handles completely empty input.
    #[test]
    fn test_parse_bytes_empty_source() -> Result<(), Box<dyn std::error::Error>> {
        let tree = parse_perl_bytes(b"")?;
        assert_eq!(tree.root_node().kind(), "source_file");
        Ok(())
    }
}
509
// SAFETY: See the SAFETY comment on the `language()` function above.
// This block declares the single FFI symbol needed from the compiled C
// grammar; within this file it is called only from the `unsafe` block in
// `language()`. No bindgen is used; the declaration is hand-written to avoid
// a libclang build dependency.
unsafe extern "C" {
    fn tree_sitter_perl() -> Language;
}