tree_sitter_perl_c/lib.rs
1//! Tree-sitter Perl grammar binding (C FFI).
2//!
3//! This crate is the conventional C/tree-sitter reference implementation for
4//! Perl, maintained alongside the native v3 Rust parser ([`perl-parser`]) for
5//! compatibility testing and comparison. It vendors a snapshot of the upstream
6//! [tree-sitter-perl] C grammar (`parser.c` + `scanner.c`) under `c-src/` and
7//! exposes it via a hand-written FFI declaration — no `bindgen` or `libclang`
8//! dependency is required to build.
9//!
10//! ## Relation to `tree-sitter-perl-rs`
11//!
12//! [`tree-sitter-perl-rs`] is a thin facade over the native v3 Rust parser and
13//! is the recommended choice for new Rust projects. This crate (`tree-sitter-perl-c`)
14//! should be preferred when:
15//!
16//! - **Compatibility testing** — comparing parse output against the upstream
17//! C reference grammar.
18//! - **Non-Rust tree-sitter tooling** — the C grammar snapshot can be used as
19//! a build dependency for language bindings in other languages.
20//! - **Baseline benchmarking** — measuring parse throughput of the C grammar
21//! against the native v3 parser.
22//!
23//! ## Build requirements
24//!
25//! Only a C compiler is required (e.g., `cc`/`gcc`/`clang` on Linux/macOS,
26//! MSVC or MinGW on Windows). No `libclang` or `bindgen` toolchain is needed.
27//!
28//! ## Quick start
29//!
30//! ```rust
31//! use tree_sitter_perl_c::parse_perl_code;
32//!
33//! let tree = parse_perl_code("my $x = 42;").unwrap();
34//! assert!(!tree.root_node().has_error());
35//! println!("{}", tree.root_node().to_sexp());
36//! ```
37//!
38//! [tree-sitter-perl]: https://github.com/tree-sitter-perl/tree-sitter-perl
39//! [`perl-parser`]: https://docs.rs/perl-parser
40//! [`tree-sitter-perl-rs`]: https://docs.rs/tree-sitter-perl-rs
41
42use std::{fmt, path::Path};
43use tree_sitter::{Language, Parser};
44
45/// Reusable Perl parser for hot parse loops.
46///
47/// Construct once and call [`PerlParser::parse_bytes`] or
48/// [`PerlParser::parse_code`] repeatedly to avoid parser setup overhead.
49#[non_exhaustive]
50pub struct PerlParser {
51 parser: Parser,
52}
53
54/// Typed errors produced by Perl parse helpers in this crate.
55#[non_exhaustive]
56#[derive(Debug)]
57pub enum ParsePerlError {
58 /// Configuring the parser with the Perl language failed.
59 LanguageSetup(tree_sitter::LanguageError),
60 /// Tree-sitter returned `None` instead of a parse tree.
61 ParseReturnedNone,
62 /// Reading source bytes from disk failed.
63 Io(std::io::Error),
64}
65
66impl fmt::Display for ParsePerlError {
67 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
68 match self {
69 Self::LanguageSetup(error) => {
70 write!(f, "failed to configure parser language: {error:?}")
71 }
72 Self::ParseReturnedNone => write!(f, "tree-sitter returned no parse tree"),
73 Self::Io(error) => write!(f, "failed to read Perl source file: {error}"),
74 }
75 }
76}
77
78impl std::error::Error for ParsePerlError {
79 fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
80 match self {
81 Self::LanguageSetup(error) => Some(error),
82 Self::ParseReturnedNone => None,
83 Self::Io(error) => Some(error),
84 }
85 }
86}
87
88impl From<tree_sitter::LanguageError> for ParsePerlError {
89 fn from(value: tree_sitter::LanguageError) -> Self {
90 Self::LanguageSetup(value)
91 }
92}
93
94impl From<std::io::Error> for ParsePerlError {
95 fn from(value: std::io::Error) -> Self {
96 Self::Io(value)
97 }
98}
99
/// Returns the tree-sitter [`Language`] for Perl (C grammar).
///
/// Use this to configure a [`tree_sitter::Parser`] or to create query objects
/// that target the Perl grammar. The value comes straight from the
/// `tree_sitter_perl` symbol declared in this crate's `extern "C"` block.
///
/// # Example
///
/// ```rust
/// use tree_sitter_perl_c::language;
/// use tree_sitter::Parser;
///
/// let lang = language();
/// let mut parser = Parser::new();
/// parser.set_language(&lang).unwrap();
/// ```
pub fn language() -> Language {
    // SAFETY: `tree_sitter_perl` is generated by tree-sitter-cli and linked via `build.rs`.
    // It returns a raw pointer to a valid static TSLanguage struct for the Perl grammar. The function
    // has no preconditions on the calling thread, takes no arguments, holds no borrows, and
    // cannot cause aliasing or memory-safety issues. Soundness depends on the build script
    // linking the correct, ABI-compatible parser object, which cc::Build in build.rs guarantees.
    unsafe { tree_sitter_perl() }
}
123
124/// Creates a [`tree_sitter::Parser`] configured for Perl.
125///
126/// Returns an error if the language version is incompatible with the linked
127/// tree-sitter runtime (this should not happen in practice).
128///
129/// Prefer this over [`create_parser`] in new code — it surfaces errors
130/// explicitly.
131///
132/// # Example
133///
134/// ```rust
135/// use tree_sitter_perl_c::try_create_parser;
136///
137/// let mut parser = try_create_parser().unwrap();
138/// let tree = parser.parse("my $x = 1;", None).unwrap();
139/// assert!(!tree.root_node().has_error());
140/// ```
141pub fn try_create_parser() -> Result<Parser, tree_sitter::LanguageError> {
142 let mut parser = Parser::new();
143 parser.set_language(&language())?;
144 Ok(parser)
145}
146
147impl PerlParser {
148 /// Creates a reusable Perl parser instance.
149 pub fn new() -> Result<Self, tree_sitter::LanguageError> {
150 Ok(Self { parser: try_create_parser()? })
151 }
152
153 /// Parses Perl source bytes using this parser instance.
154 pub fn parse_bytes(&mut self, code: &[u8]) -> Result<tree_sitter::Tree, ParsePerlError> {
155 try_parse_with_parser(&mut self.parser, code)
156 }
157
158 /// Parses Perl source text using this parser instance.
159 pub fn parse_code(&mut self, code: &str) -> Result<tree_sitter::Tree, ParsePerlError> {
160 self.parse_bytes(code.as_bytes())
161 }
162}
163
164fn try_parse_with_parser(
165 parser: &mut Parser,
166 code: &[u8],
167) -> Result<tree_sitter::Tree, ParsePerlError> {
168 match parser.parse(code, None) {
169 Some(tree) => Ok(tree),
170 None => Err(ParsePerlError::ParseReturnedNone),
171 }
172}
173
174/// Creates a [`tree_sitter::Parser`] configured for Perl, silently ignoring
175/// language-set errors.
176///
177/// This is a compatibility shim. Prefer [`try_create_parser`] in new code so
178/// that version mismatches are not swallowed.
179///
180/// # Example
181///
182/// ```rust
183/// use tree_sitter_perl_c::create_parser;
184///
185/// let parser = create_parser();
186/// assert!(parser.language().is_some());
187/// ```
188pub fn create_parser() -> Parser {
189 let mut parser = Parser::new();
190 let _ = parser.set_language(&language());
191 parser
192}
193
194/// Parses Perl source bytes and returns the resulting [`tree_sitter::Tree`].
195///
196/// This accepts arbitrary bytes so callers can parse non-UTF-8 source files,
197/// for example Perl scripts with Latin-1 encoded strings or binary data
198/// embedded in `__DATA__` sections.
199///
200/// # Notes
201///
202/// The tree-sitter C grammar receives the raw bytes as-is. A UTF-8 BOM
203/// (`\xEF\xBB\xBF`) at the start of the file is not stripped automatically
204/// and may produce an error node in the resulting tree. Strip it before
205/// calling this function if strict grammar compliance is required.
206///
207/// # Errors
208///
209/// Returns an error if the parser cannot be initialised (version mismatch) or
210/// if tree-sitter returns `None` from `parse` (cancelled or timed out).
211pub fn parse_perl_bytes(code: &[u8]) -> Result<tree_sitter::Tree, Box<dyn std::error::Error>> {
212 try_parse_perl_bytes(code).map_err(Into::into)
213}
214
215/// Parses Perl source bytes and returns the resulting [`tree_sitter::Tree`].
216///
217/// This typed variant allows callers to distinguish parser setup failures from
218/// parse cancellation/timeouts (`None` from tree-sitter).
219pub fn try_parse_perl_bytes(code: &[u8]) -> Result<tree_sitter::Tree, ParsePerlError> {
220 let mut parser = try_create_parser().map_err(ParsePerlError::LanguageSetup)?;
221 try_parse_with_parser(&mut parser, code)
222}
223
224/// Parses Perl source bytes using a caller-provided configured [`tree_sitter::Parser`].
225///
226/// This helper is intended for performance-sensitive code paths where a single
227/// parser is reused across many snippets. The parser must already be configured
228/// with [`language`].
229///
230/// # Errors
231///
232/// Returns an error if tree-sitter returns `None` from `parse` (cancelled or
233/// timed out).
234pub fn parse_perl_bytes_with_parser(
235 parser: &mut Parser,
236 code: &[u8],
237) -> Result<tree_sitter::Tree, Box<dyn std::error::Error>> {
238 try_parse_perl_bytes_with_parser(parser, code).map_err(Into::into)
239}
240
241/// Parses Perl source bytes using a caller-provided configured [`tree_sitter::Parser`].
242///
243/// This typed variant allows callers to explicitly handle parse cancellation/timeouts
244/// (`None` from tree-sitter) as [`ParsePerlError::ParseReturnedNone`].
245pub fn try_parse_perl_bytes_with_parser(
246 parser: &mut Parser,
247 code: &[u8],
248) -> Result<tree_sitter::Tree, ParsePerlError> {
249 try_parse_with_parser(parser, code)
250}
251
252/// Parses a Perl source string and returns the resulting [`tree_sitter::Tree`].
253///
254/// # Errors
255///
256/// Returns an error if the parser cannot be initialised (version mismatch) or
257/// if tree-sitter returns `None` from `parse` (cancelled or timed out).
258///
259/// # Example
260///
261/// ```rust
262/// use tree_sitter_perl_c::parse_perl_code;
263///
264/// let tree = parse_perl_code("my $x = 42;").unwrap();
265/// assert!(!tree.root_node().has_error());
266/// ```
267pub fn parse_perl_code(code: &str) -> Result<tree_sitter::Tree, Box<dyn std::error::Error>> {
268 try_parse_perl_code(code).map_err(Into::into)
269}
270
271/// Parses a Perl source string and returns the resulting [`tree_sitter::Tree`].
272///
273/// This typed variant allows callers to inspect whether parser setup failed or
274/// tree-sitter returned no parse tree.
275pub fn try_parse_perl_code(code: &str) -> Result<tree_sitter::Tree, ParsePerlError> {
276 try_parse_perl_bytes(code.as_bytes())
277}
278
279/// Parses a Perl source string using a caller-provided configured [`tree_sitter::Parser`].
280///
281/// This helper avoids creating and configuring a new parser for each parse call.
282/// The parser must already be configured with [`language`].
283///
284/// # Errors
285///
286/// Returns an error if tree-sitter returns `None` from `parse` (cancelled or
287/// timed out).
288pub fn parse_perl_code_with_parser(
289 parser: &mut Parser,
290 code: &str,
291) -> Result<tree_sitter::Tree, Box<dyn std::error::Error>> {
292 try_parse_perl_code_with_parser(parser, code).map_err(Into::into)
293}
294
295/// Parses a Perl source string using a caller-provided configured [`tree_sitter::Parser`].
296///
297/// This typed variant allows callers to explicitly handle parse cancellation/timeouts
298/// (`None` from tree-sitter) as [`ParsePerlError::ParseReturnedNone`].
299pub fn try_parse_perl_code_with_parser(
300 parser: &mut Parser,
301 code: &str,
302) -> Result<tree_sitter::Tree, ParsePerlError> {
303 try_parse_perl_bytes_with_parser(parser, code.as_bytes())
304}
305
306/// Reads a file from `path` and parses it as Perl source.
307///
308/// # Errors
309///
310/// Returns an error if the file cannot be read or if parsing fails (see
311/// [`parse_perl_code`]).
312pub fn parse_perl_file<P: AsRef<Path>>(
313 path: P,
314) -> Result<tree_sitter::Tree, Box<dyn std::error::Error>> {
315 try_parse_perl_file(path).map_err(Into::into)
316}
317
318/// Reads a file from `path` and parses it as Perl source.
319///
320/// This typed variant allows callers to distinguish IO failures from parser
321/// setup and parse-`None` outcomes.
322pub fn try_parse_perl_file<P: AsRef<Path>>(path: P) -> Result<tree_sitter::Tree, ParsePerlError> {
323 let code = std::fs::read(path).map_err(ParsePerlError::Io)?;
324 try_parse_perl_bytes(&code)
325}
326
/// Reports which scanner backend this crate provides.
///
/// The result is always `"c-scanner"`. Useful when code needs to distinguish
/// between this crate and the Rust-native [`tree-sitter-perl-rs`] backend.
///
/// [`tree-sitter-perl-rs`]: https://docs.rs/tree-sitter-perl-rs
pub fn get_scanner_config() -> &'static str {
    const SCANNER_BACKEND: &str = "c-scanner";
    SCANNER_BACKEND
}
336
#[cfg(test)]
mod tests {
    use super::*;
    use tree_sitter::{Query, QueryCursor, StreamingIterator};

    /// Injection query text, embedded at compile time from the path below.
    const INJECTIONS_QUERY: &str = include_str!("../../../tree-sitter-perl/queries/injections.scm");

    /// Return type shared by the fallible tests in this module.
    type TestResult = Result<(), Box<dyn std::error::Error>>;

    /// Resolves a query capture to its `(capture name, captured source text)` pair.
    ///
    /// Yields `None` when the capture index has no name in `query` or when the
    /// captured bytes are not valid UTF-8.
    fn capture_text<'a>(
        query: &'a Query,
        code: &'a str,
        capture: tree_sitter::QueryCapture<'a>,
    ) -> Option<(&'a str, &'a str)> {
        let capture_name = *query.capture_names().get(capture.index as usize)?;
        let captured_source = capture.node.utf8_text(code.as_bytes()).ok()?;
        Some((capture_name, captured_source))
    }

    /// The linked grammar must expose at least one node kind.
    #[test]
    fn test_language_loading() {
        let node_kinds = language().node_kind_count();
        println!("C implementation node kind count: {node_kinds}");
        // A usable language reports a non-zero number of node kinds.
        assert!(node_kinds > 0);
    }

    /// A simple declaration parses from text without error nodes.
    #[test]
    fn test_basic_parsing() -> TestResult {
        let tree = parse_perl_code("my $var = 'hello';")?;
        assert!(!tree.root_node().has_error());
        Ok(())
    }

    /// A simple declaration parses from bytes without error nodes.
    #[test]
    fn test_parse_bytes() -> TestResult {
        let tree = parse_perl_bytes(b"my $var = 'hello';")?;
        assert!(!tree.root_node().has_error());
        Ok(())
    }

    /// One caller-owned parser can serve several byte inputs in a row.
    #[test]
    fn test_parse_bytes_with_reused_parser() -> TestResult {
        let mut shared = try_create_parser()?;

        let tree_one = parse_perl_bytes_with_parser(&mut shared, b"my $x = 1;")?;
        assert!(!tree_one.root_node().has_error());

        let tree_two = parse_perl_bytes_with_parser(&mut shared, b"my $y = 2;")?;
        assert!(!tree_two.root_node().has_error());

        Ok(())
    }

    /// One caller-owned parser can serve several text inputs in a row.
    #[test]
    fn test_parse_code_with_reused_parser() -> TestResult {
        let mut shared = try_create_parser()?;

        let tree_one = parse_perl_code_with_parser(&mut shared, "my $name = 'Perl';")?;
        assert!(!tree_one.root_node().has_error());

        let tree_two = parse_perl_code_with_parser(&mut shared, "print $name;")?;
        assert!(!tree_two.root_node().has_error());

        Ok(())
    }

    /// A parser with no language set yields `ParseReturnedNone`.
    #[test]
    fn test_typed_parse_none_error_variant_is_emitted() {
        let mut unconfigured = Parser::new();
        let outcome = try_parse_with_parser(&mut unconfigured, b"my $var = 'hello';");
        assert!(matches!(outcome, Err(ParsePerlError::ParseReturnedNone)));
    }

    /// A `LanguageError` converts into the `LanguageSetup` variant.
    #[test]
    fn test_typed_language_setup_error_variant_mapping() {
        let mapped: ParsePerlError = tree_sitter::LanguageError::Version(0).into();
        assert!(matches!(mapped, ParsePerlError::LanguageSetup(_)));
    }

    /// The compatibility constructor still ends up with a language set.
    #[test]
    fn test_parser_creation() {
        assert!(create_parser().language().is_some());
    }

    /// `PerlParser` can be reused for consecutive parses.
    #[test]
    fn test_reusable_parser_parses_multiple_inputs() -> TestResult {
        let mut reusable = PerlParser::new()?;
        let tree_one = reusable.parse_code("my $x = 1;")?;
        let tree_two = reusable.parse_code("my $y = 2;")?;
        assert!(!tree_one.root_node().has_error());
        assert!(!tree_two.root_node().has_error());
        Ok(())
    }

    /// Verify that error state from one parse does not bleed into the next.
    /// A parser reused after parsing invalid Perl must produce a clean tree
    /// for the subsequent valid input.
    #[test]
    fn test_reusable_parser_error_state_does_not_bleed() -> TestResult {
        let mut reusable = PerlParser::new()?;
        // Invalid Perl first: a tree is still produced, but it carries error nodes.
        let broken = reusable.parse_code("my $x = @@@@@@;")?;
        assert!(broken.root_node().has_error(), "invalid Perl should produce error nodes");
        // Valid Perl next: the earlier failure must leave no trace.
        let clean = reusable.parse_code("my $y = 42;")?;
        assert!(!clean.root_node().has_error(), "valid Perl after error parse must be clean");
        Ok(())
    }

    /// The injection query must match an `Inline CPP` heredoc and capture its body.
    #[test]
    fn test_inline_cpp_injection_query_matches_heredoc_body() -> TestResult {
        let code = "use Inline CPP => <<'END_CPP';\n#include <string>\nclass Greet {};\nEND_CPP\n";
        let tree = parse_perl_code(code)?;
        let query = Query::new(&language(), INJECTIONS_QUERY)?;
        let mut cursor = QueryCursor::new();

        let mut found_inline_cpp = false;
        let mut query_matches = cursor.matches(&query, tree.root_node(), code.as_bytes());
        while let Some(query_match) = query_matches.next() {
            // Per-match flags: all three captures must agree within one match.
            let mut package_ok = false;
            let mut language_ok = false;
            let mut content_ok = false;

            for capture in query_match.captures {
                let Some((name, text)) = capture_text(&query, code, *capture) else {
                    continue;
                };
                match name {
                    "inline.package" => package_ok = text == "Inline",
                    "inline.language" => language_ok = text == "CPP",
                    "injection.content" => {
                        content_ok = capture.node.kind() == "heredoc_content"
                            && text.contains("#include <string>");
                    }
                    _ => {}
                }
            }

            if package_ok && language_ok && content_ok {
                found_inline_cpp = true;
                break;
            }
        }

        assert!(found_inline_cpp, "expected Inline::CPP heredoc to match the injection query");
        Ok(())
    }

    /// Verify that `parse_perl_bytes` returns a tree (possibly with error nodes) for
    /// input prefixed with a UTF-8 BOM. The BOM is NOT stripped; callers are responsible
    /// for removing it if the grammar produces undesired error nodes.
    #[test]
    fn test_parse_bytes_with_utf8_bom_returns_tree() -> TestResult {
        // UTF-8 BOM (\xEF\xBB\xBF) immediately followed by valid Perl.
        let tree = parse_perl_bytes(b"\xEF\xBB\xBFmy $x = 1;")?;
        // Only the root kind is checked: the BOM may legitimately add an error node.
        assert_eq!(tree.root_node().kind(), "source_file");
        Ok(())
    }

    /// Verify that `parse_perl_bytes` handles a completely empty input.
    #[test]
    fn test_parse_bytes_empty_source() -> TestResult {
        let tree = parse_perl_bytes(b"")?;
        assert_eq!(tree.root_node().kind(), "source_file");
        Ok(())
    }
}
509
// SAFETY: See the SAFETY comment on the `language()` function above.
// This is the only unsafe code in the crate — the single FFI symbol we need
// from the compiled C grammar. No bindgen is used; the declaration is
// hand-written to avoid a libclang build dependency.
unsafe extern "C" {
    /// Entry point of the compiled C grammar; returns the Perl language handle
    /// that `language()` passes on to callers.
    fn tree_sitter_perl() -> Language;
}