perl-corpus 0.13.3

Test corpus management and generators for Perl parsers
Documentation
use anyhow::{Context, Result, bail};
use serde::Deserialize;
use std::collections::BTreeSet;
use std::fs;
use std::path::{Path, PathBuf};

/// File names of the concept registry TOML files.
///
/// `load_concept_registry_from` joins each name onto the concept directory and
/// reads the files in exactly this order; every listed file must be readable,
/// because a read failure aborts the load.
const CONCEPT_FILES: &[&str] = &[
    "lexer.toml",
    "parser.toml",
    "recovery.toml",
    "positions.toml",
    "incremental.toml",
    "tree_sitter.toml",
];

/// Values accepted for a concept's `status` field; `validate_row` rejects any
/// other string.
const KNOWN_STATUS: &[&str] = &["seed", "active", "planned", "deprecated"];

/// Deserialized contents of a single concept registry file.
///
/// Unknown top-level keys are rejected (`deny_unknown_fields`).
#[derive(Debug, Clone, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct ConceptRegistryFile {
    /// The `[[concept]]` entries of the file; an absent key yields an empty list.
    #[serde(default)]
    pub concept: Vec<ConceptRow>,
}

/// One `[[concept]]` entry of a registry file.
///
/// Every field is required and unknown keys are rejected
/// (`deny_unknown_fields`).
#[derive(Debug, Clone, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct ConceptRow {
    /// Concept identifier; the loader fails if two rows share the same id.
    pub id: String,
    /// Lifecycle status; must be one of `KNOWN_STATUS` to pass validation.
    pub status: String,
    /// Crates and risk tags associated with the concept.
    pub scope: ConceptScope,
    /// Fixture paths attached to the concept; each must exist on disk.
    pub fixtures: ConceptFixtures,
    /// Expectation flags and mode for the concept.
    pub expect: ConceptExpect,
    /// Per-artifact snapshot switches.
    pub snapshots: ConceptSnapshots,
    /// Per-pipeline run switches.
    pub run: ConceptRun,
}

/// The `scope` table of a concept row. Both lists are required; unknown keys
/// are rejected.
#[derive(Debug, Clone, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct ConceptScope {
    /// Presumably names of the workspace crates the concept touches — the
    /// values are not validated in this file.
    pub crates: Vec<String>,
    /// Free-form risk labels; not validated in this file.
    pub risk_tags: Vec<String>,
}

/// The `fixtures` table of a concept row.
///
/// Both lists default to empty when omitted. `validate_row` requires every
/// entry of either list to exist on disk, resolving relative entries against
/// the workspace root.
#[derive(Debug, Clone, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct ConceptFixtures {
    /// Fixture paths listed under `floors`.
    #[serde(default)]
    pub floors: Vec<String>,
    /// Fixture paths listed under `variants`.
    #[serde(default)]
    pub variants: Vec<String>,
}

/// The `expect` table of a concept row. All keys are required; unknown keys
/// are rejected.
///
/// NOTE(review): this file only deserializes these values; how consumers
/// interpret them is not visible here.
#[derive(Debug, Clone, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct ConceptExpect {
    /// Value of the `panic` key.
    pub panic: bool,
    /// Value of the `timeout` key.
    pub timeout: bool,
    /// Value of the `mode` key (the tests in this file use `"parse"` and
    /// `"lex"`); the string is not checked against a fixed set here.
    pub mode: String,
}

/// The `snapshots` table of a concept row: one required boolean per artifact
/// kind. Unknown keys are rejected.
#[derive(Debug, Clone, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct ConceptSnapshots {
    /// Switch for token snapshots.
    pub tokens: bool,
    /// Switch for AST snapshots.
    pub ast: bool,
    /// Switch for span snapshots.
    pub spans: bool,
}

/// The `run` table of a concept row: one required boolean per pipeline.
/// Unknown keys are rejected.
#[derive(Debug, Clone, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct ConceptRun {
    /// Switch for the `pr` pipeline.
    pub pr: bool,
    /// Switch for the `nightly` pipeline.
    pub nightly: bool,
    /// Switch for the `release` pipeline.
    pub release: bool,
}

/// A validated concept row together with the registry file it was read from.
#[derive(Debug, Clone)]
pub struct LoadedConcept {
    /// File name (an entry of `CONCEPT_FILES`, not a full path) that declared the row.
    pub source_file: String,
    /// The deserialized and validated row.
    pub row: ConceptRow,
}

/// Loads the concept registry of the discovered corpus workspace.
///
/// The workspace root is taken from `CorpusPaths::discover()`; registry files
/// are read from `crates/perl-corpus/concepts` below it, and fixture paths are
/// resolved against the same root.
///
/// # Errors
///
/// Returns whatever error reading, parsing or validating the registry files
/// produces.
pub fn load_concept_registry() -> Result<Vec<LoadedConcept>> {
    let workspace = super::files::CorpusPaths::discover().root;
    load_concept_registry_from(&workspace.join("crates/perl-corpus/concepts"), &workspace)
}

fn load_concept_registry_from(
    concept_dir: &Path,
    workspace_root: &Path,
) -> Result<Vec<LoadedConcept>> {
    let mut concepts = Vec::new();

    for file_name in CONCEPT_FILES {
        let path = concept_dir.join(file_name);
        let content = fs::read_to_string(&path)
            .with_context(|| format!("reading concept registry file {}", path.display()))?;

        let parsed: ConceptRegistryFile =
            toml::from_str(&content).with_context(|| format!("parsing {}", path.display()))?;

        for row in parsed.concept {
            validate_row(&row, workspace_root, &path)?;
            concepts.push(LoadedConcept { source_file: (*file_name).to_string(), row });
        }
    }

    concepts
        .sort_by(|a, b| a.row.id.cmp(&b.row.id).then_with(|| a.source_file.cmp(&b.source_file)));

    let mut seen = BTreeSet::new();
    for concept in &concepts {
        if !seen.insert(concept.row.id.clone()) {
            bail!("duplicate concept id: {}", concept.row.id);
        }
    }

    Ok(concepts)
}

/// Checks a single registry row.
///
/// The row passes when its status is listed in `KNOWN_STATUS` and every
/// fixture entry (floors first, then variants) resolves to an existing path.
/// `source_file` is only used to name the registry file in the status error.
///
/// # Errors
///
/// Returns an error describing the first unknown status or missing fixture.
fn validate_row(row: &ConceptRow, workspace_root: &Path, source_file: &Path) -> Result<()> {
    let status = row.status.as_str();
    let status_is_known = KNOWN_STATUS.iter().any(|known| *known == status);
    if !status_is_known {
        bail!(
            "unknown status '{}' for concept '{}' in {}",
            row.status,
            row.id,
            source_file.display()
        );
    }

    let declared = row.fixtures.floors.iter().chain(&row.fixtures.variants);
    for fixture in declared {
        let resolved = fixture_path(workspace_root, fixture);
        if resolved.exists() {
            continue;
        }
        bail!(
            "fixture path '{}' for concept '{}' does not exist (resolved to {})",
            fixture,
            row.id,
            resolved.display()
        );
    }

    Ok(())
}

/// Resolves a fixture entry from the registry to a filesystem path.
///
/// Absolute entries are returned unchanged; relative entries are placed
/// beneath `workspace_root`.
fn fixture_path(workspace_root: &Path, fixture: &str) -> PathBuf {
    // `Path::join` discards the base when its argument is absolute, which
    // covers both cases in a single call.
    workspace_root.join(fixture)
}

#[cfg(test)]
mod tests {
    use super::*;
    use perl_tdd_support::must;
    use std::time::{SystemTime, UNIX_EPOCH};

    /// Scratch directory that is removed again when the guard is dropped, so
    /// test runs do not pile up directories in the system temp dir.
    struct TempRoot(PathBuf);

    impl TempRoot {
        /// Path of the scratch directory.
        fn path(&self) -> &Path {
            &self.0
        }
    }

    impl Drop for TempRoot {
        fn drop(&mut self) {
            // Best effort: a failed cleanup must not mask the test outcome.
            let _ = fs::remove_dir_all(&self.0);
        }
    }

    /// Creates a fresh scratch directory whose name combines `prefix`, the
    /// process id and the current time in nanoseconds.
    fn temp_dir(prefix: &str) -> TempRoot {
        let mut dir = std::env::temp_dir();
        let nanos = SystemTime::now().duration_since(UNIX_EPOCH).unwrap_or_default().as_nanos();
        dir.push(format!("{}_{}_{}", prefix, std::process::id(), nanos));
        must(fs::create_dir_all(&dir));
        TempRoot(dir)
    }

    /// Writes a full set of registry files below `path/concepts`: `body` goes
    /// into `lexer.toml`, every other file is left empty.
    fn write_registry(path: &Path, body: &str) {
        let concept_dir = path.join("concepts");
        must(fs::create_dir_all(&concept_dir));

        for file in CONCEPT_FILES {
            let content = if *file == "lexer.toml" { body } else { "" };
            must(fs::write(concept_dir.join(file), content));
        }
    }

    /// Rows come back sorted by id regardless of their order in the file.
    #[test]
    fn registry_loads_deterministically() {
        let scratch = temp_dir("concept_registry_deterministic");
        let root = scratch.path();
        must(fs::create_dir_all(root.join("fixtures")));
        must(fs::write(root.join("fixtures/ok.pl"), "my $x = 1;\n"));

        write_registry(
            root,
            r#"
[[concept]]
id = "z.last"
status = "seed"
scope = { crates = ["perl-parser"], risk_tags = ["parser"] }
fixtures = { floors = ["fixtures/ok.pl"], variants = [] }
expect = { panic = false, timeout = false, mode = "parse" }
snapshots = { tokens = true, ast = true, spans = false }
run = { pr = true, nightly = false, release = false }

[[concept]]
id = "a.first"
status = "seed"
scope = { crates = ["perl-lexer"], risk_tags = ["lexer"] }
fixtures = { floors = ["fixtures/ok.pl"], variants = [] }
expect = { panic = false, timeout = false, mode = "lex" }
snapshots = { tokens = true, ast = false, spans = true }
run = { pr = true, nightly = true, release = false }
"#,
        );

        let loaded = must(load_concept_registry_from(&root.join("concepts"), root));
        let ids: Vec<_> = loaded.iter().map(|c| c.row.id.as_str()).collect();
        assert_eq!(ids, vec!["a.first", "z.last"]);
    }

    /// Two rows with the same id make the load fail.
    #[test]
    fn duplicate_ids_are_rejected() -> Result<()> {
        let scratch = temp_dir("concept_registry_dupe");
        let root = scratch.path();
        must(fs::create_dir_all(root.join("fixtures")));
        must(fs::write(root.join("fixtures/ok.pl"), "my $x = 1;\n"));

        write_registry(
            root,
            r#"
[[concept]]
id = "dup.id"
status = "seed"
scope = { crates = ["perl-parser"], risk_tags = ["parser"] }
fixtures = { floors = ["fixtures/ok.pl"], variants = [] }
expect = { panic = false, timeout = false, mode = "parse" }
snapshots = { tokens = true, ast = true, spans = false }
run = { pr = true, nightly = true, release = false }

[[concept]]
id = "dup.id"
status = "seed"
scope = { crates = ["perl-parser"], risk_tags = ["parser"] }
fixtures = { floors = ["fixtures/ok.pl"], variants = [] }
expect = { panic = false, timeout = false, mode = "parse" }
snapshots = { tokens = true, ast = true, spans = false }
run = { pr = true, nightly = true, release = false }
"#,
        );

        let Err(err) = load_concept_registry_from(&root.join("concepts"), root) else {
            bail!("expected duplicate id failure");
        };
        assert!(err.to_string().contains("duplicate concept id"));
        Ok(())
    }

    /// A fixture entry that does not exist on disk makes the load fail.
    #[test]
    fn missing_fixture_path_is_rejected() -> Result<()> {
        let scratch = temp_dir("concept_registry_fixture");
        let root = scratch.path();

        write_registry(
            root,
            r#"
[[concept]]
id = "bad.fixture"
status = "seed"
scope = { crates = ["perl-parser"], risk_tags = ["parser"] }
fixtures = { floors = ["fixtures/does-not-exist.pl"], variants = [] }
expect = { panic = false, timeout = false, mode = "parse" }
snapshots = { tokens = true, ast = false, spans = false }
run = { pr = true, nightly = false, release = false }
"#,
        );

        let Err(err) = load_concept_registry_from(&root.join("concepts"), root) else {
            bail!("expected fixture existence failure");
        };
        assert!(err.to_string().contains("does not exist"));
        Ok(())
    }

    /// A top-level table other than `concept` makes parsing fail.
    #[test]
    fn unknown_top_level_sections_are_rejected() -> Result<()> {
        // The directory prefix deliberately contains neither "unknown" nor
        // "extra": the registry path is part of the error text, and the
        // assertion below must not be satisfied by the path alone.
        let scratch = temp_dir("concept_registry_section");
        let root = scratch.path();
        must(fs::create_dir_all(root.join("fixtures")));
        must(fs::write(root.join("fixtures/ok.pl"), "my $x = 1;\n"));

        write_registry(
            root,
            r#"
[[concept]]
id = "good.one"
status = "seed"
scope = { crates = ["perl-parser"], risk_tags = ["parser"] }
fixtures = { floors = ["fixtures/ok.pl"], variants = [] }
expect = { panic = false, timeout = false, mode = "parse" }
snapshots = { tokens = true, ast = true, spans = false }
run = { pr = true, nightly = true, release = false }

[extra]
hello = "world"
"#,
        );

        let Err(err) = load_concept_registry_from(&root.join("concepts"), root) else {
            bail!("expected unknown section failure");
        };
        // `{:#}` renders the whole context chain; plain `to_string()` would
        // only show the outermost "parsing <path>" context and hide the
        // deserializer's message.
        let message = format!("{err:#}");
        assert!(message.contains("unknown") || message.contains("extra"));
        Ok(())
    }
}