parsuna 0.1.0

Parsuna: recoverable, pull-based parsers with precise errors
Documentation
use std::fmt::Write;
use std::path::PathBuf;

use crate::codegen::rust;
use crate::codegen::EmittedFile;
use crate::lowering::StateTable;

/// Emit the files of the Python-extension crate for `st`.
///
/// Returns three files, in this order: `src/lib.rs`, `Cargo.toml` and
/// `pyproject.toml`. The crate is named after `st.grammar_name`, falling
/// back to `parser` when that name is empty.
///
/// # Panics
///
/// Panics if `rust::emit` yields no file at all.
pub fn emit(st: &StateTable) -> Vec<EmittedFile> {
    // Only the first file produced by the Rust backend is embedded.
    let parser_source = rust::emit(st).into_iter().next().unwrap().contents;

    let crate_name = if !st.grammar_name.is_empty() {
        st.grammar_name.clone()
    } else {
        String::from("parser")
    };

    let mut token_names: Vec<String> = Vec::new();
    for token in st.tokens.iter() {
        token_names.push(token.name.clone());
    }

    let mut rule_names: Vec<String> = Vec::new();
    for (rule, _) in st.entry_states.iter() {
        rule_names.push(rule.clone());
    }

    let lib_rs = build_lib_rs(
        &parser_source,
        &crate_name,
        &st.rule_kinds,
        &token_names,
        &rule_names,
    );
    let cargo_toml = build_cargo_toml(&crate_name);
    let pyproject_toml = build_pyproject(&crate_name);

    // Pair a crate-relative path with its generated text.
    let file = |path: &str, contents: String| EmittedFile {
        path: PathBuf::from(path),
        contents,
    };

    vec![
        file("src/lib.rs", lib_rs),
        file("Cargo.toml", cargo_toml),
        file("pyproject.toml", pyproject_toml),
    ]
}

/// Assemble the text of the generated crate's `src/lib.rs`.
///
/// The output consists of, in order:
///
/// 1. a crate-level doc header followed by a blank line,
/// 2. `rust_file`, minus every line that starts with `#![`,
/// 3. the fixed `PY_BINDINGS_CORE` text,
/// 4. a `RuleKind` class with one class attribute per entry of
///    `rule_kinds` and a `name` lookup keyed by each entry's position,
/// 5. a `TokenKind` class with one class attribute per entry of `tokens`
///    plus `EOF` and `ERROR`,
/// 6. one `parse_<rule>` function per entry of `rules`,
/// 7. the `#[pyo3::pymodule]` initializer, a function called `name`.
///
/// `name` is also written into the `module = "..."` option of the two
/// generated classes.
fn build_lib_rs(
    rust_file: &str,
    name: &str,
    rule_kinds: &[String],
    tokens: &[String],
    rules: &[String],
) -> String {
    let mut lib = String::new();

    // Crate documentation header, closed by an empty line.
    lib.push_str(concat!(
        "//! Generated by parsuna — do not edit by hand.\n",
        "//!\n",
        "//! This crate compiles to a Python extension module exposing the\n",
        "//! grammar's parser. From Python: `import <name>; p = <name>.parse_<rule>(src)`\n",
        "//! then iterate `p` to receive Event objects.\n",
        "\n",
    ));

    // Embed the generated Rust source, skipping its `#![...]` lines.
    for kept in rust_file.lines().filter(|text| !text.starts_with("#![")) {
        lib.push_str(kept);
        lib.push('\n');
    }
    lib.push('\n');

    lib.push_str(PY_BINDINGS_CORE);
    lib.push('\n');

    // --- RuleKind: class attributes plus an id -> name lookup ---
    writeln!(
        lib,
        "#[pyo3::pyclass(frozen, module = \"{name}\", name = \"RuleKind\")]"
    )
    .unwrap();
    lib.push_str("struct PyRuleKind;\n#[pyo3::pymethods]\nimpl PyRuleKind {\n");
    for kind in rule_kinds {
        let ident = crate::codegen::common::pascal(kind);
        writeln!(
            lib,
            "    #[classattr] pub const {ident}: u16 = RuleKind::{ident}.id();"
        )
        .unwrap();
    }
    lib.push_str(concat!(
        "    #[staticmethod]\n",
        "    fn name(kind: u16) -> &'static str {\n",
        "        match kind {\n",
    ));
    for (index, kind) in rule_kinds.iter().enumerate() {
        writeln!(lib, "            {index} => \"{kind}\",").unwrap();
    }
    lib.push_str(concat!(
        "            _ => \"?\",\n",
        "        }\n",
        "    }\n",
        "}\n",
        "\n",
    ));

    // --- TokenKind: class attributes, then the EOF / ERROR constants ---
    writeln!(
        lib,
        "#[pyo3::pyclass(frozen, module = \"{name}\", name = \"TokenKind\")]"
    )
    .unwrap();
    lib.push_str("struct PyTokenKind;\n#[pyo3::pymethods]\nimpl PyTokenKind {\n");
    for token in tokens {
        let ident = crate::codegen::common::pascal(token);
        writeln!(
            lib,
            "    #[classattr] pub const {ident}: i16 = TokenKind::{ident} as i16;"
        )
        .unwrap();
    }
    lib.push_str(concat!(
        "    #[classattr] pub const EOF: i16 = parsuna_rt::TOKEN_EOF;\n",
        "    #[classattr] pub const ERROR: i16 = parsuna_rt::TOKEN_ERROR;\n",
        "}\n",
        "\n",
    ));

    // --- One `parse_<rule>` entry point per rule ---
    for rule in rules {
        // The entry constant is referenced by the upper-cased rule name.
        let entry = rule.to_uppercase();
        writeln!(
            lib,
            "/// Parse the `{rule}` rule from a string and return a [`PyParser`]."
        )
        .unwrap();
        writeln!(lib, "#[pyo3::pyfunction(name = \"parse_{rule}\")]").unwrap();
        writeln!(lib, "fn parse_{rule}_py(src: &str) -> PyParser {{").unwrap();
        lib.push_str("    let lex = parsuna_rt::StreamingLexer::new(std::io::Cursor::new(src.as_bytes().to_vec()), &LEXER_CONFIG);\n");
        writeln!(
            lib,
            "    PyParser {{ inner: Parser::new(lex, ENTRY_{entry}) }}"
        )
        .unwrap();
        lib.push_str("}\n");
    }
    lib.push('\n');

    // --- Module initializer: register every class, then every function ---
    lib.push_str("#[pyo3::pymodule]\n");
    writeln!(
        lib,
        "fn {name}(_py: pyo3::Python, m: &pyo3::types::PyModule) -> pyo3::PyResult<()> {{"
    )
    .unwrap();
    for class in [
        "PyPos",
        "PySpan",
        "PyError",
        "PyEvent",
        "PyParser",
        "PyRuleKind",
        "PyTokenKind",
    ] {
        writeln!(lib, "    m.add_class::<{class}>()?;").unwrap();
    }
    for rule in rules {
        writeln!(
            lib,
            "    m.add_function(pyo3::wrap_pyfunction!(parse_{rule}_py, m)?)?;"
        )
        .unwrap();
    }
    lib.push_str("    Ok(())\n}\n");

    lib
}

/// Render the `Cargo.toml` of the generated extension crate.
///
/// `name` is used verbatim as both the package name and the `[lib]` name.
/// The manifest has three sections (`[package]`, `[lib]`,
/// `[dependencies]`) separated by one blank line each and ends with a
/// newline.
fn build_cargo_toml(name: &str) -> String {
    let package = format!("[package]\nname = \"{name}\"\nversion = \"0.1.0\"\nedition = \"2021\"\n");
    let lib = format!("[lib]\nname = \"{name}\"\ncrate-type = [\"cdylib\"]\n");
    // The dependency section does not depend on the crate name.
    let dependencies = concat!(
        "[dependencies]\n",
        "# Point this `path` at your local parsuna-rt checkout, or replace with a\n",
        "# crates.io dependency once parsuna-rt is published.\n",
        "parsuna-rt = { path = \"../../runtimes/rust\" }\n",
        "pyo3 = { version = \"0.20\", features = [\"extension-module\"] }\n",
    );

    // Each section already ends in a newline, so joining on "\n" leaves
    // exactly one blank line between sections.
    [package.as_str(), lib.as_str(), dependencies].join("\n")
}

/// Render the `pyproject.toml` of the generated extension crate.
///
/// `name` is used verbatim as the project name and inside the project
/// description. The build backend is maturin, and the document ends with
/// a newline.
fn build_pyproject(name: &str) -> String {
    // Fixed preamble up to the start of the `[project]` table.
    let mut toml = String::from(concat!(
        "[build-system]\n",
        "requires = [\"maturin>=1.0,<2.0\"]\n",
        "build-backend = \"maturin\"\n",
        "\n",
        "[project]\n",
    ));

    // The only two lines that mention the crate name.
    toml.push_str(&format!("name = \"{name}\"\n"));
    toml.push_str("version = \"0.1.0\"\n");
    toml.push_str(&format!(
        "description = \"Generated parser for the `{name}` grammar (parsuna).\"\n"
    ));

    // Remainder of `[project]` and the maturin settings.
    toml.push_str(concat!(
        "requires-python = \">=3.8\"\n",
        "classifiers = [\n",
        "    \"Programming Language :: Rust\",\n",
        "    \"Programming Language :: Python :: Implementation :: CPython\",\n",
        "]\n",
        "\n",
        "[tool.maturin]\n",
        "features = [\"pyo3/extension-module\"]\n",
    ));

    toml
}

// Fixed Rust source text that `build_lib_rs` appends verbatim to the
// generated `src/lib.rs`.
//
// It declares the pyo3 wrapper classes exposed to Python as `Pos`, `Span`,
// `Error`, `Event` and `Parser`, together with the `to_py_*` helpers that
// convert `parsuna_rt` values into those wrappers.
//
// The text refers to `Event`, `Parser` and `TokenKind` without defining
// them; presumably they are provided by the generated Rust source that is
// written ahead of this text — confirm against the Rust backend.
//
// NOTE: everything between the raw-string delimiters is emitted output;
// editing it changes the generated crate.
const PY_BINDINGS_CORE: &str = r#"
use pyo3::prelude::*;

/// Source position: byte offset plus 1-based line/column.
#[pyclass(frozen, get_all, name = "Pos")]
#[derive(Clone, Copy, Debug)]
struct PyPos { offset: u32, line: u32, column: u32 }

#[pymethods]
impl PyPos {
    fn __repr__(&self) -> String { format!("Pos({}, {}, {})", self.offset, self.line, self.column) }
}

/// Half-open span `[start, end)` over the source.
#[pyclass(frozen, get_all, name = "Span")]
#[derive(Clone, Copy, Debug)]
struct PySpan { start: PyPos, end: PyPos }

#[pymethods]
impl PySpan {
    fn __repr__(&self) -> String {
        format!("Span({}:{}-{}:{})", self.start.line, self.start.column, self.end.line, self.end.column)
    }
}

/// A recoverable parse or lex error.
#[pyclass(frozen, get_all, name = "Error")]
#[derive(Clone, Debug)]
struct PyError { message: String }

#[pymethods]
impl PyError {
    fn __repr__(&self) -> String {
        format!("Error({:?})", self.message)
    }
}

/// A single pull-parser event. `tag` is one of "enter", "exit", "token",
/// or "error"; the populated payload field depends on the tag.
#[pyclass(frozen, get_all, name = "Event")]
#[derive(Clone, Debug)]
struct PyEvent {
    tag: String,
    span: PySpan,
    kind: Option<i32>,
    text: Option<String>,
    error: Option<PyError>,
}

#[pymethods]
impl PyEvent {
    fn is_enter(&self) -> bool { self.tag == "enter" }
    fn is_exit(&self) -> bool { self.tag == "exit" }
    fn is_token(&self) -> bool { self.tag == "token" }
    fn is_error(&self) -> bool { self.tag == "error" }
    fn __repr__(&self) -> String {
        match (self.tag.as_str(), self.text.as_deref(), self.error.as_ref()) {
            ("enter", _, _) => format!("Event(enter rule={})", self.kind.unwrap_or(0)),
            ("exit", _, _)  => format!("Event(exit rule={})", self.kind.unwrap_or(0)),
            ("token", Some(t), _) => format!("Event(token kind={} text={:?})", self.kind.unwrap_or(0), t),
            ("token", None, _)    => format!("Event(token kind={})", self.kind.unwrap_or(0)),
            ("error", _, Some(d)) => format!("Event(error {:?})", d.message),
            _ => "Event(?)".to_string(),
        }
    }
}

fn to_py_pos(p: parsuna_rt::Pos) -> PyPos { PyPos { offset: p.offset, line: p.line, column: p.column } }
fn to_py_span(s: parsuna_rt::Span) -> PySpan { PySpan { start: to_py_pos(s.start), end: to_py_pos(s.end) } }
fn to_py_diag(d: parsuna_rt::Error) -> PyError {
    PyError { message: d.message.into_owned() }
}
fn to_py_event(ev: Event) -> PyEvent {
    match ev {
        parsuna_rt::Event::Enter { rule, pos } => PyEvent {
            tag: "enter".into(),
            span: to_py_span(parsuna_rt::Span::point(pos)),
            kind: Some(rule as i32),
            text: None,
            error: None,
        },
        parsuna_rt::Event::Exit { rule, pos } => PyEvent {
            tag: "exit".into(),
            span: to_py_span(parsuna_rt::Span::point(pos)),
            kind: Some(rule as i32),
            text: None,
            error: None,
        },
        parsuna_rt::Event::Error(d) => {
            let span = to_py_span(d.span);
            PyEvent { tag: "error".into(), span, kind: None, text: None, error: Some(to_py_diag(d)) }
        }
        parsuna_rt::Event::Token(t) => {
            let span = to_py_span(t.span);
            PyEvent { tag: "token".into(), span, kind: Some(t.kind as i32), text: Some(t.text.into_owned()), error: None }
        }
    }
}

/// Pull-based parser. Iterate to walk the parse as a sequence of
/// [`PyEvent`] values, or call `next_event` manually.
#[pyclass(unsendable, name = "Parser")]
struct PyParser { inner: Parser<'static, parsuna_rt::StreamingLexer<std::io::Cursor<Vec<u8>>, TokenKind>> }

#[pymethods]
impl PyParser {
    fn next_event(&mut self) -> Option<PyEvent> { self.inner.next_event().map(to_py_event) }

    fn __iter__(slf: PyRef<Self>) -> PyRef<Self> { slf }
    fn __next__(&mut self) -> Option<PyEvent> { self.next_event() }
}
"#;