perl-corpus 0.12.1

//! Comprehensive Perl test corpus and property-based testing infrastructure
//!
//! This crate provides a curated collection of Perl code samples for testing parser correctness,
//! edge case coverage, and LSP feature validation. It includes both manually curated test cases
//! and property-based test generators for comprehensive coverage.
//!
//! # Architecture
//!
//! The corpus is organized into several layers:
//!
//! - **Curated Test Cases**: Hand-written examples covering Perl syntax edge cases
//! - **Property-Based Generators**: Randomized code generation for fuzz testing
//! - **Real-World Samples**: Code from CPAN and production Perl projects
//! - **Metadata System**: Tag-based organization with section markers and test IDs
//!
//! # Corpus Organization
//!
//! Test cases are stored in text files with section markers and metadata:
//!
//! ```text
//! ==========================================
//! Basic Variable Declaration
//! ==========================================
//! # @id: vars.basic.my
//! # @tags: variables, declaration
//! my $x = 42;
//! ---
//! (expected AST representation)
//! ```
//!
//! Each section includes:
//! - **Title**: Human-readable test case name
//! - **Metadata**: ID, tags, Perl version requirements, flags
//! - **Body**: Perl code to parse
//! - **Expected Output**: Optional AST or error expectations (after `---`)
//!
//! # Usage
//!
//! ## Loading Corpus Files
//!
//! ```rust,ignore
//! use perl_corpus::{CorpusPaths, get_corpus_files};
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! let files = get_corpus_files();
//!
//! for file in files {
//!     println!("Found corpus file: {:?}", file.path);
//! }
//! # Ok(())
//! # }
//! ```
//!
//! ## Parsing Corpus Sections
//!
//! ```rust
//! use perl_corpus::parse_file;
//! use std::path::Path;
//!
//! # fn example() -> anyhow::Result<()> {
//! # let path = Path::new("test_corpus/variables.txt");
//! # if !path.exists() { return Ok(()); }
//! let sections = parse_file(path)?;
//!
//! for section in sections {
//!     println!("Section: {} (id: {})", section.title, section.id);
//!     println!("Tags: {:?}", section.tags);
//!     println!("Code:\n{}", section.body);
//! }
//! # Ok(())
//! # }
//! ```
//!
//! ## Finding Cases by Tag
//!
//! ```rust
//! use perl_corpus::{parse_dir, find_by_tag};
//! use std::path::Path;
//!
//! # fn example() -> anyhow::Result<()> {
//! # let corpus_dir = Path::new("test_corpus");
//! # if !corpus_dir.exists() { return Ok(()); }
//! let all_sections = parse_dir(corpus_dir)?;
//! let regex_tests = find_by_tag(&all_sections, "regex");
//!
//! println!("Found {} regex test cases", regex_tests.len());
//! # Ok(())
//! # }
//! ```
//!
//! ## Using Property-Based Generators
//!
//! ```rust,ignore
//! use perl_corpus::{generate_perl_code_with_seed, CodegenOptions};
//!
//! // Generate random valid Perl code
//! let code = generate_perl_code_with_seed(10, 42);
//! println!("Generated:\n{}", code);
//!
//! // Generate with specific options
//! let options = CodegenOptions::default();
//! let modern_code = generate_perl_code(&options);
//! ```
//!
//! ## Specialized Test Case Modules
//!
//! The corpus includes focused generators for specific Perl features:
//!
//! ### Complex Data Structures
//!
//! ```rust,ignore
//! use perl_corpus::{complex_data_structure_cases, find_complex_case};
//!
//! let cases = complex_data_structure_cases();
//! if let Some(nested) = find_complex_case("nested-arrays") {
//!     println!("Test: {}", nested.description);
//!     println!("Code:\n{}", nested.code);
//! }
//! ```
//!
//! ### Continue/Redo Blocks
//!
//! ```rust
//! use perl_corpus::{continue_redo_cases, valid_continue_redo_cases};
//!
//! let all_cases = continue_redo_cases();
//! let valid_only = valid_continue_redo_cases();
//! ```
//!
//! ### Format Statements
//!
//! ```rust,ignore
//! use perl_corpus::{format_statement_cases, FormatStatementGenerator};
//!
//! let cases = format_statement_cases();
//! let generator = FormatStatementGenerator::new(42);
//! ```
//!
//! ### Glob Expressions
//!
//! ```rust,ignore
//! use perl_corpus::{glob_expression_cases, GlobExpressionGenerator};
//!
//! let cases = glob_expression_cases();
//! let generator = GlobExpressionGenerator::new(42);
//! ```
//!
//! ### Tie Interface
//!
//! ```rust
//! use perl_corpus::{tie_interface_cases, tie_cases_by_tag};
//!
//! let all_tie = tie_interface_cases();
//! let scalar_tie = tie_cases_by_tag("scalar");
//! ```
//!
//! # Corpus Layers
//!
//! The corpus is organized into three layers accessible via [`CorpusLayer`]:
//!
//! - **`CorpusLayer::Main`**: Core test cases in `test_corpus/`
//! - **`CorpusLayer::TreeSitter`**: Tree-sitter grammar tests in `tree-sitter-perl/test/corpus/`
//! - **`CorpusLayer::Fuzz`**: Fuzzing inputs and edge cases in `crates/perl-corpus/fuzz/`
//!
//! ## Environment Configuration
//!
//! Override the corpus root with the `CORPUS_ROOT` environment variable:
//!
//! ```bash
//! export CORPUS_ROOT=/path/to/custom/corpus
//! cargo test
//! ```
//!
//! # Integration with Parser Testing
//!
//! The corpus integrates with `perl-parser` test suites:
//!
//! ```rust,ignore
//! use perl_parser::Parser;
//! use perl_corpus::{parse_dir, find_by_tag};
//!
//! # fn test_parser_with_corpus() -> anyhow::Result<()> {
//! # let corpus_dir = std::path::Path::new("test_corpus");
//! let sections = parse_dir(corpus_dir)?;
//! let regex_cases = find_by_tag(&sections, "regex");
//!
//! for case in regex_cases {
//!     let mut parser = Parser::new(&case.body);
//!     let result = parser.parse();
//!     assert!(result.is_ok(), "Failed to parse: {}", case.title);
//! }
//! # Ok(())
//! # }
//! ```
//!
//! # Test Case Validation
//!
//! Corpus files can include validation flags:
//!
//! - **`parser-sensitive`**: Requires specific parser version
//! - **`perl-version:5.26`**: Requires Perl 5.26+ features
//! - **`expected-error`**: Test case should produce parse error
//! - **`wip`**: Work in progress, may not parse correctly yet
//!
//! # Contributing Test Cases
//!
//! To add new test cases:
//!
//! 1. Create or edit a corpus file in `test_corpus/`
//! 2. Use section markers (`====`) to separate cases
//! 3. Add metadata tags for categorization
//! 4. Include expected output after `---` separator
//! 5. Run `cargo test` to validate
//!
//! See existing corpus files for examples and conventions.
#![allow(clippy::pedantic)] // Corpus crate - focus on core clippy lints only

pub mod cases;
pub mod codegen;
pub mod continue_redo;
pub mod files;
pub mod format_statements;
pub mod r#gen;
pub mod glob_expressions;
pub mod index;
pub mod lint;
pub mod meta;
pub mod tie_interface;

use anyhow::{Context, Result};
pub use cases::{
    ComplexDataStructureCase, EdgeCase, EdgeCaseGenerator, complex_data_structure_cases,
    edge_cases, find_complex_case, get_complex_data_structure_tests, sample_complex_case,
};
pub use codegen::{
    CodegenOptions, StatementKind, generate_perl_code, generate_perl_code_with_options,
    generate_perl_code_with_seed, generate_perl_code_with_statements,
};
pub use continue_redo::{
    ContinueRedoCase, cases_by_tag as continue_redo_cases_by_tag, continue_redo_cases,
    find_case as find_continue_redo_case, invalid_cases as invalid_continue_redo_cases,
    valid_cases as valid_continue_redo_cases,
};
pub use files::{
    CORPUS_ROOT_ENV, CorpusFile, CorpusLayer, CorpusPaths, get_all_test_files, get_corpus_files,
    get_corpus_files_from, get_fuzz_files, get_test_files,
};
pub use format_statements::{
    FormatStatementCase, FormatStatementGenerator, find_format_case, format_statement_cases,
};
pub use glob_expressions::{
    GlobExpressionCase, GlobExpressionGenerator, find_glob_case, glob_expression_cases,
};
use meta::Section;
use regex::Regex;
use std::collections::HashMap;
use std::{fs, path::Path};
pub use tie_interface::{
    TieInterfaceCase, find_tie_case, tie_cases_by_tag, tie_cases_by_tags_all,
    tie_cases_by_tags_any, tie_interface_cases,
};

// Regex patterns - use Option for graceful degradation if compilation fails
static SEC_RE: once_cell::sync::Lazy<Option<Regex>> =
    once_cell::sync::Lazy::new(|| Regex::new(r"(?m)^=+\s*$").ok());
static META_RE: once_cell::sync::Lazy<Option<Regex>> = once_cell::sync::Lazy::new(|| {
    Regex::new(r"(?m)^#\s*@(?P<k>id|tags|perl|flags):\s*(?P<v>.*)$").ok()
});

fn slugify_title(title: &str) -> String {
    let mut slug = String::new();
    let mut last_dash = false;

    for ch in title.chars() {
        let ch = ch.to_ascii_lowercase();
        if ch.is_ascii_alphanumeric() {
            slug.push(ch);
            last_dash = false;
        } else if !last_dash && !slug.is_empty() {
            slug.push('-');
            last_dash = true;
        }
    }

    while slug.ends_with('-') {
        slug.pop();
    }

    slug
}

/// Parse a corpus file into sections.
pub fn parse_file(path: &Path) -> Result<Vec<Section>> {
    let text = fs::read_to_string(path).with_context(|| format!("reading {}", path.display()))?;
    let mut sections = Vec::new();
    let file_stem = path
        .file_stem()
        .and_then(|stem| {
            let slug = slugify_title(&stem.to_string_lossy());
            if slug.is_empty() { None } else { Some(slug) }
        })
        .unwrap_or_else(|| "corpus".to_string());
    let mut auto_ids: HashMap<String, usize> = HashMap::new();
    let mut section_index = 0usize;

    // If regex compilation failed, return empty sections (graceful degradation)
    let Some(sec_re) = SEC_RE.as_ref() else {
        return Ok(sections);
    };

    // Find all section delimiters
    let raw_delims: Vec<usize> = sec_re.find_iter(&text).map(|m| m.start()).collect();

    // Filter out closing delimiters in paired-delimiter format.
    // A closing delimiter has only 2 lines (delimiter + title) between
    // it and the preceding opening delimiter.
    let mut opening_delims: Vec<usize> = Vec::new();
    let mut i = 0;
    while i < raw_delims.len() {
        opening_delims.push(raw_delims[i]);
        if i + 1 < raw_delims.len() {
            let between = &text[raw_delims[i]..raw_delims[i + 1]];
            if between.lines().count() == 2 {
                i += 2; // skip closing delimiter
                continue;
            }
        }
        i += 1;
    }

    // Build offset array: prelude sentinel + opening delimiters + EOF sentinel
    let mut offs = vec![0usize];
    offs.extend(&opening_delims);
    offs.dedup(); // remove duplicate 0 when first delimiter is at start
    offs.push(text.len());

    for w in offs.windows(2) {
        let start = w[0];
        let end = w[1];

        // Skip prelude (text before first section delimiter)
        let first_line = text[start..end].lines().next().unwrap_or("");
        if !sec_re.is_match(first_line) {
            continue;
        }

        section_index += 1;

        // Extract section text
        let section_text = &text[start..end];
        let lines: Vec<&str> = section_text.lines().collect();

        if lines.len() < 2 {
            continue;
        } // malformed section

        // Title is the line after "===="
        let title = lines[1].trim().to_string();

        // Skip closing delimiter in paired-delimiter format (===\nTitle\n===)
        let after_title_idx = if lines.len() > 2 && sec_re.is_match(lines[2]) { 3 } else { 2 };

        // Gather metadata lines following title
        let mut meta = HashMap::<String, String>::new();
        let mut body_start_idx = after_title_idx;

        // Use META_RE if available, otherwise skip metadata parsing
        if let Some(meta_re) = META_RE.as_ref() {
            for (i, line) in lines.iter().enumerate().skip(after_title_idx) {
                if let Some(cap) = meta_re.captures(line) {
                    meta.insert(cap["k"].to_string(), cap["v"].trim().to_string());
                    body_start_idx = i + 1;
                } else if !line.starts_with('#') || line.trim().is_empty() {
                    body_start_idx = i;
                    break;
                }
            }
        }

        // Extract metadata fields
        let mut id = meta.get("id").cloned().unwrap_or_default();
        let tags = meta
            .get("tags")
            .map(|s| {
                s.replace(',', " ").split_whitespace().map(|t| t.to_lowercase()).collect::<Vec<_>>()
            })
            .unwrap_or_default();
        let perl = meta.get("perl").cloned().filter(|s| !s.is_empty());
        let flags = meta
            .get("flags")
            .map(|s| {
                s.replace(',', " ").split_whitespace().map(|t| t.to_string()).collect::<Vec<_>>()
            })
            .unwrap_or_default();

        // Extract body (code after metadata)
        let body_lines = if body_start_idx < lines.len() { &lines[body_start_idx..] } else { &[] };
        let body_end =
            body_lines.iter().position(|line| line.trim() == "---").unwrap_or(body_lines.len());
        let body = body_lines[..body_end].join("\n").trim().to_string();

        if id.is_empty() {
            let title_slug = slugify_title(&title);
            let base = if title_slug.is_empty() {
                format!("section-{}", section_index)
            } else {
                title_slug
            };
            let base_id = format!("{}.{}", file_stem, base);
            let count = auto_ids.entry(base_id.clone()).or_insert(0);
            id = if *count == 0 { base_id } else { format!("{}-{}", base_id, *count + 1) };
            *count += 1;
        }

        // Calculate line number (for error reporting)
        let line_num = text[..start].lines().count() + 1;

        // Get file name, use empty OsStr if path has no file name component
        let file_name = path.file_name().unwrap_or_default();

        sections.push(Section {
            id,
            title,
            file: file_name.to_string_lossy().into(),
            tags,
            perl,
            flags,
            body,
            line: Some(line_num),
        });
    }

    Ok(sections)
}

/// Scan the `test_corpus/` directory.
pub fn parse_dir(dir: &Path) -> Result<Vec<Section>> {
    let mut all = Vec::new();

    // Build glob pattern
    let pattern = format!("{}/**/*.txt", dir.display());

    for entry in glob::glob(&pattern)? {
        let p = entry?;

        // Skip index/tag files
        let filename = p.file_name().unwrap_or_default().to_string_lossy();
        if filename.starts_with('_') || filename.starts_with('.') {
            continue;
        }

        all.extend(parse_file(&p)?);
    }

    // Sort by file and ID for stable output
    all.sort_by(|a, b| a.file.cmp(&b.file).then_with(|| a.id.cmp(&b.id)));

    Ok(all)
}

/// Find sections by tag
pub fn find_by_tag<'a>(sections: &'a [Section], tag: &str) -> Vec<&'a Section> {
    sections.iter().filter(|s| s.has_tag(tag)).collect()
}

/// Find sections by flag
pub fn find_by_flag<'a>(sections: &'a [Section], flag: &str) -> Vec<&'a Section> {
    sections.iter().filter(|s| s.has_flag(flag)).collect()
}

#[cfg(test)]
mod tests {
    use super::*;
    use perl_tdd_support::{must, must_some};
    use std::fs;
    use std::path::PathBuf;
    use std::time::{SystemTime, UNIX_EPOCH};

    fn temp_file(prefix: &str) -> PathBuf {
        let mut path = std::env::temp_dir();
        let nanos = SystemTime::now().duration_since(UNIX_EPOCH).unwrap_or_default().as_nanos();
        path.push(format!("{}_{}.txt", prefix, nanos));
        path
    }

    #[test]
    fn parse_file_strips_ast_and_generates_id() {
        let path = temp_file("perl_corpus_parse");
        let contents = r#"==========================================
Sample Section
==========================================

my $x = 1;

---
(source_file
  (expression_statement
    (assignment_expression
      (variable_declaration
        (scalar
          (varname)))
      (number))))

==========================================
Tagged Section
==========================================
# @id: custom.id
# @tags: alpha, Beta
# @flags: parser-sensitive
my $y = 2;
"#;

        must(fs::write(&path, contents));
        let sections = must(parse_file(&path));
        must(fs::remove_file(&path));

        // Note: The parser currently finds 3 sections due to the way === delimiters work
        // This is expected behavior with the current parsing logic
        assert!(sections.len() >= 2);

        // Find the sections by checking their content/ids
        let sample_section = must_some(sections.iter().find(|s| s.body.contains("my $x = 1;")));
        let tagged_section = must_some(sections.iter().find(|s| s.id == "custom.id"));

        assert_eq!(sample_section.body, "my $x = 1;");
        assert!(!sample_section.body.contains("---"));
        assert_eq!(tagged_section.id, "custom.id");
        assert_eq!(tagged_section.tags, vec!["alpha".to_string(), "beta".to_string()]);
        assert_eq!(tagged_section.flags, vec!["parser-sensitive".to_string()]);
        assert_eq!(tagged_section.body, "my $y = 2;");
    }
}