perl-corpus 0.13.1

Test corpus management and generators for Perl parsers
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
//! Comprehensive Perl test corpus and property-based testing infrastructure
//!
//! This crate provides a curated collection of Perl code samples for testing parser correctness,
//! edge case coverage, and LSP feature validation. It includes both manually curated test cases
//! and property-based test generators for comprehensive coverage.
//!
//! # Architecture
//!
//! The corpus is organized into several layers:
//!
//! - **Curated Test Cases**: Hand-written examples covering Perl syntax edge cases
//! - **Property-Based Generators**: Randomized code generation for fuzz testing
//! - **Real-World Samples**: Code from CPAN and production Perl projects
//! - **Metadata System**: Tag-based organization with section markers and test IDs
//!
//! # Corpus Organization
//!
//! Test cases are stored in text files with section markers and metadata:
//!
//! ```text
//! ==========================================
//! Basic Variable Declaration
//! ==========================================
//! # @id: vars.basic.my
//! # @tags: variables, declaration
//! my $x = 42;
//! ---
//! (expected AST representation)
//! ```
//!
//! Each section includes:
//! - **Title**: Human-readable test case name
//! - **Metadata**: ID, tags, Perl version requirements, flags
//! - **Body**: Perl code to parse
//! - **Expected Output**: Optional AST or error expectations (after `---`)
//!
//! # Usage
//!
//! ## Loading Corpus Files
//!
//! ```rust,ignore
//! use perl_corpus::{CorpusPaths, get_corpus_files};
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! let files = get_corpus_files();
//!
//! for file in files {
//!     println!("Found corpus file: {:?}", file.path);
//! }
//! # Ok(())
//! # }
//! ```
//!
//! ## Parsing Corpus Sections
//!
//! ```rust
//! use perl_corpus::parse_file;
//! use std::path::Path;
//!
//! # fn example() -> anyhow::Result<()> {
//! # let path = Path::new("test_corpus/variables.txt");
//! # if !path.exists() { return Ok(()); }
//! let sections = parse_file(path)?;
//!
//! for section in sections {
//!     println!("Section: {} (id: {})", section.title, section.id);
//!     println!("Tags: {:?}", section.tags);
//!     println!("Code:\n{}", section.body);
//! }
//! # Ok(())
//! # }
//! ```
//!
//! ## Finding Cases by Tag
//!
//! ```rust
//! use perl_corpus::{parse_dir, find_by_tag};
//! use std::path::Path;
//!
//! # fn example() -> anyhow::Result<()> {
//! # let corpus_dir = Path::new("test_corpus");
//! # if !corpus_dir.exists() { return Ok(()); }
//! let all_sections = parse_dir(corpus_dir)?;
//! let regex_tests = find_by_tag(&all_sections, "regex");
//!
//! println!("Found {} regex test cases", regex_tests.len());
//! # Ok(())
//! # }
//! ```
//!
//! ## Using Property-Based Generators
//!
//! ```rust,ignore
//! use perl_corpus::{generate_perl_code_with_seed, CodegenOptions};
//!
//! // Generate random valid Perl code
//! let code = generate_perl_code_with_seed(10, 42);
//! println!("Generated:\n{}", code);
//!
//! // Generate with specific options
//! let options = CodegenOptions::default();
//! let modern_code = generate_perl_code(&options);
//! ```
//!
//! ## Specialized Test Case Modules
//!
//! The corpus includes focused generators for specific Perl features:
//!
//! ### Complex Data Structures
//!
//! ```rust,ignore
//! use perl_corpus::{complex_data_structure_cases, find_complex_case};
//!
//! let cases = complex_data_structure_cases();
//! if let Some(nested) = find_complex_case("nested-arrays") {
//!     println!("Test: {}", nested.description);
//!     println!("Code:\n{}", nested.code);
//! }
//! ```
//!
//! ### Continue/Redo Blocks
//!
//! ```rust
//! use perl_corpus::{continue_redo_cases, valid_continue_redo_cases};
//!
//! let all_cases = continue_redo_cases();
//! let valid_only = valid_continue_redo_cases();
//! ```
//!
//! ### Format Statements
//!
//! ```rust,ignore
//! use perl_corpus::{format_statement_cases, FormatStatementGenerator};
//!
//! let cases = format_statement_cases();
//! let generator = FormatStatementGenerator::new(42);
//! ```
//!
//! ### Glob Expressions
//!
//! ```rust,ignore
//! use perl_corpus::{glob_expression_cases, GlobExpressionGenerator};
//!
//! let cases = glob_expression_cases();
//! let generator = GlobExpressionGenerator::new(42);
//! ```
//!
//! ### Tie Interface
//!
//! ```rust
//! use perl_corpus::{tie_interface_cases, tie_cases_by_tag};
//!
//! let all_tie = tie_interface_cases();
//! let scalar_tie = tie_cases_by_tag("scalar");
//! ```
//!
//! # Corpus Layers
//!
//! The corpus is organized into three layers accessible via [`CorpusLayer`]:
//!
//! - **`CorpusLayer::Main`**: Core test cases in `test_corpus/`
//! - **`CorpusLayer::TreeSitter`**: Tree-sitter grammar tests in `tree-sitter-perl/test/corpus/`
//! - **`CorpusLayer::Fuzz`**: Fuzzing inputs and edge cases in `crates/perl-corpus/fuzz/`
//!
//! ## Environment Configuration
//!
//! Override the corpus root with the `CORPUS_ROOT` environment variable:
//!
//! ```bash
//! export CORPUS_ROOT=/path/to/custom/corpus
//! cargo test
//! ```
//!
//! # Integration with Parser Testing
//!
//! The corpus integrates with `perl-parser` test suites:
//!
//! ```rust,ignore
//! use perl_parser::Parser;
//! use perl_corpus::{parse_dir, find_by_tag};
//!
//! # fn test_parser_with_corpus() -> anyhow::Result<()> {
//! # let corpus_dir = std::path::Path::new("test_corpus");
//! let sections = parse_dir(corpus_dir)?;
//! let regex_cases = find_by_tag(&sections, "regex");
//!
//! for case in regex_cases {
//!     let mut parser = Parser::new(&case.body);
//!     let result = parser.parse();
//!     assert!(result.is_ok(), "Failed to parse: {}", case.title);
//! }
//! # Ok(())
//! # }
//! ```
//!
//! # Test Case Validation
//!
//! Corpus files can include validation flags:
//!
//! - **`parser-sensitive`**: Requires specific parser version
//! - **`perl-version:5.26`**: Requires Perl 5.26+ features
//! - **`expected-error`**: Test case should produce parse error
//! - **`wip`**: Work in progress, may not parse correctly yet
//!
//! # Contributing Test Cases
//!
//! To add new test cases:
//!
//! 1. Create or edit a corpus file in `test_corpus/`
//! 2. Use section markers (`====`) to separate cases
//! 3. Add metadata tags for categorization
//! 4. Include expected output after `---` separator
//! 5. Run `cargo test` to validate
//!
//! See existing corpus files for examples and conventions.
#![allow(clippy::pedantic)]
// Corpus crate - focus on core clippy lints only
// Lint enforcement: library code must use tracing, not direct stderr/stdout prints.
#![deny(clippy::print_stderr, clippy::print_stdout)]
#![cfg_attr(test, allow(clippy::print_stderr, clippy::print_stdout))]

pub mod cases;
pub mod codegen;
pub mod concepts;
pub mod continue_redo;
pub mod files;
pub mod fixture_expectations;
pub mod format_statements;
pub mod r#gen;
pub mod glob_expressions;
pub mod gold;
pub mod index;
pub mod inventory;
pub mod lint;
pub mod meta;
pub mod sidecar;
pub mod tie_interface;

use anyhow::{Context, Result};
pub use cases::{
    ComplexDataStructureCase, EdgeCase, EdgeCaseGenerator, complex_data_structure_cases,
    edge_cases, find_complex_case, get_complex_data_structure_tests, sample_complex_case,
};
pub use codegen::{
    CodegenOptions, StatementKind, generate_perl_code, generate_perl_code_with_options,
    generate_perl_code_with_seed, generate_perl_code_with_statements,
};
pub use concepts::{ConceptRow, LoadedConcept, load_concept_registry};
pub use continue_redo::{
    ContinueRedoCase, cases_by_tag as continue_redo_cases_by_tag, continue_redo_cases,
    find_case as find_continue_redo_case, invalid_cases as invalid_continue_redo_cases,
    valid_cases as valid_continue_redo_cases,
};
pub use files::{
    CORPUS_ROOT_ENV, CorpusFile, CorpusLayer, CorpusPaths, get_all_test_files, get_corpus_files,
    get_corpus_files_from, get_fuzz_files, get_test_files,
};
pub use format_statements::{
    FormatStatementCase, FormatStatementGenerator, find_format_case, format_statement_cases,
};
pub use glob_expressions::{
    GlobExpressionCase, GlobExpressionGenerator, find_glob_case, glob_expression_cases,
};
pub use gold::{
    CompletionAssertion, CompletionAssertionKind, CompletionGoldExpected, CompletionGoldFixture,
    DocumentSymbolAssertion, DocumentSymbolAssertionKind, DocumentSymbolGoldExpected,
    DocumentSymbolGoldFixture, GoldAssertion, GoldExpected, GoldFixture, GotoAssertion,
    GotoAssertionKind, GotoGoldExpected, GotoGoldFixture, HoverAssertion, HoverAssertionKind,
    HoverGoldExpected, HoverGoldFixture, load_completion_gold_fixtures,
    load_document_symbol_gold_fixtures, load_gold_fixture, load_gold_fixtures,
    load_gold_fixtures_from, load_goto_gold_fixtures, load_hover_gold_fixtures,
};
pub use inventory::{
    CorpusInventory, InventoryIds, InventoryMarkers, InventoryTags, build_inventory,
    build_inventory_from_paths, generator_families, inventory_from_sections,
};
use meta::Section;
use regex::Regex;
pub use sidecar::{
    ConceptRegistry, ExpectationMode, FixtureExpectationSidecar, SidecarConcept, SidecarExpect,
    SidecarMetrics, SidecarSnapshots, SidecarValidation, discover_sidecars, expected_fixture_path,
    load_and_validate_sidecar, parse_sidecar, validate_sidecar,
};
use std::collections::HashMap;
use std::{fs, path::Path};
pub use tie_interface::{
    TieInterfaceCase, find_tie_case, tie_cases_by_tag, tie_cases_by_tags_all,
    tie_cases_by_tags_any, tie_interface_cases,
};

// Regex patterns - use Option for graceful degradation if compilation fails
/// Matches a section delimiter line: one or more `=` characters followed only by
/// optional whitespace. Holds `None` if the pattern failed to compile.
static SEC_RE: once_cell::sync::Lazy<Option<Regex>> =
    once_cell::sync::Lazy::new(|| Regex::new(r"(?m)^=+\s*$").ok());
/// Matches a metadata comment line of the form `# @<key>: <value>` where `<key>` is
/// one of `id`, `tags`, `perl`, or `flags`. The key is captured as `k` and the rest of
/// the line as `v`. Holds `None` if the pattern failed to compile.
static META_RE: once_cell::sync::Lazy<Option<Regex>> = once_cell::sync::Lazy::new(|| {
    Regex::new(r"(?m)^#\s*@(?P<k>id|tags|perl|flags):\s*(?P<v>.*)$").ok()
});

/// Turn a human-readable title into a lowercase ASCII slug.
///
/// Runs of ASCII letters and digits are kept (lowercased); each run of any other
/// characters between them collapses into a single `-`. Leading and trailing
/// separators are dropped, so a title with no ASCII alphanumerics yields an empty
/// string.
fn slugify_title(title: &str) -> String {
    title
        // Every non-alphanumeric character is a separator...
        .split(|c: char| !c.is_ascii_alphanumeric())
        // ...and consecutive/leading/trailing separators produce empty pieces.
        .filter(|word| !word.is_empty())
        .map(str::to_ascii_lowercase)
        .collect::<Vec<_>>()
        .join("-")
}

/// Parse a corpus file into sections.
///
/// The file is read as UTF-8 text and split at section delimiter lines (lines made
/// only of `=`). For each section:
///
/// - the line after the opening delimiter is the title; an optional closing
///   delimiter directly after the title is skipped,
/// - following `# @id:`, `# @tags:`, `# @perl:` and `# @flags:` lines are metadata
///   (tags are lowercased; tags and flags are split on commas and whitespace),
/// - the body is everything after the metadata up to, but excluding, the first line
///   that trims to `---` (or the end of the section), with surrounding whitespace
///   trimmed.
///
/// Sections without an `@id` receive a generated ID `<file-stem-slug>.<title-slug>`;
/// repeats of the same generated ID within one file get a `-2`, `-3`, ... suffix.
/// An empty title slug falls back to `section-<n>` and an empty file-stem slug to
/// `corpus`. Text before the first delimiter is ignored.
///
/// # Errors
///
/// Returns an error if `path` cannot be read as a string. If the section-delimiter
/// regex is unavailable, an empty vector is returned instead of an error.
pub fn parse_file(path: &Path) -> Result<Vec<Section>> {
    let text = fs::read_to_string(path).with_context(|| format!("reading {}", path.display()))?;
    let mut sections = Vec::new();
    // Slugified file stem used as the prefix of auto-generated section IDs.
    let file_stem = path
        .file_stem()
        .and_then(|stem| {
            let slug = slugify_title(&stem.to_string_lossy());
            if slug.is_empty() { None } else { Some(slug) }
        })
        .unwrap_or_else(|| "corpus".to_string());
    // How many times each auto-generated base ID has been handed out in this file.
    let mut auto_ids: HashMap<String, usize> = HashMap::new();
    // 1-based counter of delimiter-led sections, used for the `section-<n>` fallback.
    let mut section_index = 0usize;

    // If regex compilation failed, return empty sections (graceful degradation)
    let Some(sec_re) = SEC_RE.as_ref() else {
        return Ok(sections);
    };

    // Find all section delimiters
    let raw_delims: Vec<usize> = sec_re.find_iter(&text).map(|m| m.start()).collect();

    // Filter out closing delimiters in paired-delimiter format.
    // A closing delimiter has only 2 lines (delimiter + title) between
    // it and the preceding opening delimiter.
    let mut opening_delims: Vec<usize> = Vec::new();
    let mut i = 0;
    while i < raw_delims.len() {
        opening_delims.push(raw_delims[i]);
        if i + 1 < raw_delims.len() {
            let between = &text[raw_delims[i]..raw_delims[i + 1]];
            if between.lines().count() == 2 {
                i += 2; // skip closing delimiter
                continue;
            }
        }
        i += 1;
    }

    // Build offset array: prelude sentinel + opening delimiters + EOF sentinel
    let mut offs = vec![0usize];
    offs.extend(&opening_delims);
    offs.dedup(); // remove duplicate 0 when first delimiter is at start
    offs.push(text.len());

    // Each adjacent pair of offsets bounds one candidate section.
    for w in offs.windows(2) {
        let start = w[0];
        let end = w[1];

        // Skip prelude (text before first section delimiter)
        let first_line = text[start..end].lines().next().unwrap_or("");
        if !sec_re.is_match(first_line) {
            continue;
        }

        // Counted before the malformed-section check, so skipped sections still
        // consume an index.
        section_index += 1;

        // Extract section text
        let section_text = &text[start..end];
        let lines: Vec<&str> = section_text.lines().collect();

        if lines.len() < 2 {
            continue;
        } // malformed section

        // Title is the line after "===="
        let title = lines[1].trim().to_string();

        // Skip closing delimiter in paired-delimiter format (===\nTitle\n===)
        let after_title_idx = if lines.len() > 2 && sec_re.is_match(lines[2]) { 3 } else { 2 };

        // Gather metadata lines following title
        let mut meta = HashMap::<String, String>::new();
        let mut body_start_idx = after_title_idx;

        // Use META_RE if available, otherwise skip metadata parsing
        if let Some(meta_re) = META_RE.as_ref() {
            for (i, line) in lines.iter().enumerate().skip(after_title_idx) {
                if let Some(cap) = meta_re.captures(line) {
                    // A later occurrence of the same key overwrites the earlier one.
                    meta.insert(cap["k"].to_string(), cap["v"].trim().to_string());
                    body_start_idx = i + 1;
                } else if !line.starts_with('#') || line.trim().is_empty() {
                    // First line that is not a `#` comment: the body starts here.
                    body_start_idx = i;
                    break;
                }
                // A `#` line that is not metadata neither moves `body_start_idx`
                // nor ends the scan.
            }
        }

        // Extract metadata fields
        let mut id = meta.get("id").cloned().unwrap_or_default();
        let tags = meta
            .get("tags")
            .map(|s| {
                s.replace(',', " ").split_whitespace().map(|t| t.to_lowercase()).collect::<Vec<_>>()
            })
            .unwrap_or_default();
        // An empty `@perl:` value is treated the same as a missing one.
        let perl = meta.get("perl").cloned().filter(|s| !s.is_empty());
        // Unlike tags, flags keep their original case.
        let flags = meta
            .get("flags")
            .map(|s| {
                s.replace(',', " ").split_whitespace().map(|t| t.to_string()).collect::<Vec<_>>()
            })
            .unwrap_or_default();

        // Extract body (code after metadata)
        let body_lines = if body_start_idx < lines.len() { &lines[body_start_idx..] } else { &[] };
        // Everything from the first `---` line onward (the expected output) is dropped.
        let body_end =
            body_lines.iter().position(|line| line.trim() == "---").unwrap_or(body_lines.len());
        let body = body_lines[..body_end].join("\n").trim().to_string();

        // No explicit `@id`: derive one from the file stem and the title.
        if id.is_empty() {
            let title_slug = slugify_title(&title);
            let base = if title_slug.is_empty() {
                format!("section-{}", section_index)
            } else {
                title_slug
            };
            let base_id = format!("{}.{}", file_stem, base);
            let count = auto_ids.entry(base_id.clone()).or_insert(0);
            // First use keeps the bare ID; the n-th use (n >= 2) gets a `-n` suffix.
            id = if *count == 0 { base_id } else { format!("{}-{}", base_id, *count + 1) };
            *count += 1;
        }

        // Calculate line number (for error reporting)
        // This is the 1-based line of the section's opening delimiter.
        let line_num = text[..start].lines().count() + 1;

        // Get file name, use empty OsStr if path has no file name component
        let file_name = path.file_name().unwrap_or_default();

        sections.push(Section {
            id,
            title,
            file: file_name.to_string_lossy().into(),
            tags,
            perl,
            flags,
            body,
            line: Some(line_num),
        });
    }

    Ok(sections)
}

/// Parse every corpus file found under `dir` into sections.
///
/// Recursively collects all `*.txt` files below `dir`, skips files whose name starts
/// with `_` or `.` (index/tag files), parses each remaining file with [`parse_file`],
/// and returns the combined sections sorted by file name and then by section ID so
/// the output order is stable.
///
/// # Errors
///
/// Returns an error if the glob pattern is invalid, if a directory entry cannot be
/// read while walking, or if [`parse_file`] fails for any file.
pub fn parse_dir(dir: &Path) -> Result<Vec<Section>> {
    let mut all = Vec::new();

    // Escape glob metacharacters (`*`, `?`, `[`, `]`) that may occur in the directory
    // path itself, so only the `**/*.txt` suffix is interpreted as a pattern.
    let escaped_dir = glob::Pattern::escape(&dir.to_string_lossy());
    let pattern = format!("{}/**/*.txt", escaped_dir);

    for entry in glob::glob(&pattern)? {
        let p = entry?;

        // Skip index/tag files
        let filename = p.file_name().unwrap_or_default().to_string_lossy();
        if filename.starts_with('_') || filename.starts_with('.') {
            continue;
        }

        all.extend(parse_file(&p)?);
    }

    // Sort by file and ID for stable output
    all.sort_by(|a, b| a.file.cmp(&b.file).then_with(|| a.id.cmp(&b.id)));

    Ok(all)
}

/// Collect references to every section for which `has_tag(tag)` is true.
///
/// The returned references borrow from `sections` and keep the input order.
pub fn find_by_tag<'a>(sections: &'a [Section], tag: &str) -> Vec<&'a Section> {
    let mut tagged = Vec::new();
    for section in sections {
        if section.has_tag(tag) {
            tagged.push(section);
        }
    }
    tagged
}

/// Collect references to every section for which `has_flag(flag)` is true.
///
/// The returned references borrow from `sections` and keep the input order.
pub fn find_by_flag<'a>(sections: &'a [Section], flag: &str) -> Vec<&'a Section> {
    let mut flagged = Vec::new();
    for section in sections {
        if section.has_flag(flag) {
            flagged.push(section);
        }
    }
    flagged
}

#[cfg(test)]
mod tests {
    use super::*;
    use perl_tdd_support::{must, must_some};
    use std::fs;
    use std::path::PathBuf;
    use std::time::{SystemTime, UNIX_EPOCH};

    /// Build a path in the system temp directory named `<prefix>_<nanos>.txt`, where
    /// `<nanos>` is the current time since the Unix epoch in nanoseconds.
    fn temp_file(prefix: &str) -> PathBuf {
        let mut path = std::env::temp_dir();
        let nanos = SystemTime::now().duration_since(UNIX_EPOCH).unwrap_or_default().as_nanos();
        path.push(format!("{}_{}.txt", prefix, nanos));
        path
    }

    /// A paired-delimiter corpus file yields one section per title, the expected-AST
    /// part after `---` is stripped from the body, and `@id`/`@tags`/`@flags`
    /// metadata is honored.
    #[test]
    fn parse_file_strips_ast_and_generates_id() {
        let path = temp_file("perl_corpus_parse");
        let contents = r#"==========================================
Sample Section
==========================================

my $x = 1;

---
(source_file
  (expression_statement
    (assignment_expression
      (variable_declaration
        (scalar
          (varname)))
      (number))))

==========================================
Tagged Section
==========================================
# @id: custom.id
# @tags: alpha, Beta
# @flags: parser-sensitive
my $y = 2;
"#;

        must(fs::write(&path, contents));
        // Remove the temp file before unwrapping the parse result so a parse failure
        // does not leave the file behind.
        let parsed = parse_file(&path);
        must(fs::remove_file(&path));
        let sections = must(parsed);

        // Closing delimiters of the `===\nTitle\n===` form are not section starts, so
        // the two titled sections are the only ones reported.
        assert_eq!(sections.len(), 2);

        // Find the sections by checking their content/ids
        let sample_section = must_some(sections.iter().find(|s| s.body.contains("my $x = 1;")));
        let tagged_section = must_some(sections.iter().find(|s| s.id == "custom.id"));

        assert_eq!(sample_section.body, "my $x = 1;");
        assert!(!sample_section.body.contains("---"));
        assert_eq!(tagged_section.id, "custom.id");
        assert_eq!(tagged_section.tags, vec!["alpha".to_string(), "beta".to_string()]);
        assert_eq!(tagged_section.flags, vec!["parser-sensitive".to_string()]);
        assert_eq!(tagged_section.body, "my $y = 2;");
    }
}