// subplot 0.4.0
//
// Tools for specifying, documenting, and implementing automated acceptance tests for systems and software.
// Documentation
use lazy_static::lazy_static;
use log::trace;
use pandoc_ast::{Attr, Block, Inline, Map, MetaValue, Pandoc};
use pulldown_cmark::{CodeBlockKind, Event, Options, Parser, Tag};
use regex::Regex;
use serde::Deserialize;
use std::collections::BTreeMap;
use std::path::{Path, PathBuf};

lazy_static! {
    // Pattern that recognises a YAML block at the beginning of a file.
    //
    // Only blank lines (empty, or containing just spaces/tabs) may come
    // before the block. The block starts with a line of three or more
    // dashes and ends with a line of three or more dots; everything after
    // it is captured as `text`.
    //
    // The blank-line prefix must NOT be written as `\S*\n`: that matches
    // any line without whitespace, so a one-word first line of real
    // content would be skipped as if it were empty and lost.
    static ref LEADING_YAML_PATTERN: Regex = Regex::new(r"^(?:[ \t]*\n)*(?P<yaml>-{3,}\n([^.].*\n)*\.{3,}\n)(?P<text>(.*\n)*)$").unwrap();


    // Pattern that recognises a YAML block at the end of a file.
    //
    // Everything before the block is captured as `text`; only blank lines
    // (empty, or containing just spaces/tabs) may follow the block.
    static ref TRAILING_YAML_PATTERN: Regex = Regex::new(r"(?P<text>(.*\n)*)\n*(?P<yaml>-{3,}\n([^.].*\n)*\.{3,}\n)(?:[ \t]*\n)*$").unwrap();
}

/// An abstract syntax tree representation of a Markdown file.
///
/// This represents a Markdown file as an abstract syntax tree
/// compatible with Pandoc's AST. The document YAML metadata MUST be
/// at the top or bottom of the file, excluding leading or trailing
/// empty lines.
///
/// Values are created by parsing text via the `FromStr` implementation
/// and converted to a `pandoc_ast::Pandoc` with `to_pandoc`.
#[derive(Debug)]
pub struct AbstractSyntaxTree {
    // Block-level elements of the document body, in document order.
    blocks: Vec<Block>,
    // Document metadata, keyed by field name, built from the YAML block.
    // Empty when the input has no metadata block.
    meta: Map<String, MetaValue>,
}

impl AbstractSyntaxTree {
    // Create a new AST.
    //
    // Note that this is not public.
    fn new(meta: Map<String, MetaValue>, blocks: Vec<Block>) -> Self {
        Self { blocks, meta }
    }

    /// Return a Pandoc-compatible AST.
    pub fn to_pandoc(&self) -> Pandoc {
        Pandoc {
            meta: self.meta.clone(),
            blocks: self.blocks.clone(),
            pandoc_api_version: vec![1, 20],
        }
    }
}

impl std::str::FromStr for AbstractSyntaxTree {
    type Err = Error;

    /// Create an abstract syntax tree from a string.
    ///
    /// A YAML metadata block is looked for first at the start of the
    /// input, then at the end. When one is found it becomes the document
    /// metadata and only the remaining text is parsed as Markdown. When
    /// none is found, the whole input is parsed as Markdown and the
    /// metadata is empty.
    ///
    /// # Errors
    ///
    /// Returns an error if a metadata block is found but cannot be
    /// deserialized as document metadata.
    fn from_str(markdown: &str) -> Result<Self, Self::Err> {
        trace!("Parsing markdown");
        let ast = if let Some((yaml, markdown)) = get_yaml(&LEADING_YAML_PATTERN, markdown) {
            trace!("Found leading YAML: {:?}", yaml);
            let meta = Metadata::new(yaml)?.to_map();
            let blocks = parse_blocks(markdown);
            AbstractSyntaxTree::new(meta, blocks)
        } else if let Some((yaml, markdown)) = get_yaml(&TRAILING_YAML_PATTERN, markdown) {
            trace!("Found trailing YAML: {:?}", yaml);
            let meta = Metadata::new(yaml)?.to_map();
            // Parse only the text in front of the metadata block. Parsing
            // the full input here would feed the YAML itself to the
            // Markdown parser and turn it into spurious body blocks.
            let blocks = parse_blocks(markdown);
            AbstractSyntaxTree::new(meta, blocks)
        } else {
            trace!("No YAML to be found");
            let blocks = parse_blocks(markdown);
            AbstractSyntaxTree::new(Map::new(), blocks)
        };
        trace!("Parsing markdown: OK");
        Ok(ast)
    }
}

// Split `markdown` into a YAML metadata block and the remaining text,
// using `pat` to locate them.
//
// `pat` is expected to define the named groups `yaml` and `text`.
// Returns `(yaml, text)` as slices of `markdown`, or `None` when the
// pattern does not match or either group did not participate.
//
// Panics if the captured YAML does not start with "---" or does not end
// with "...\n".
fn get_yaml<'a>(pat: &Regex, markdown: &'a str) -> Option<(&'a str, &'a str)> {
    trace!("Markdown: {:?}", markdown);
    let captures = match pat.captures(markdown) {
        Some(captures) => captures,
        None => {
            trace!("YAML regex does not match");
            return None;
        }
    };
    trace!("YAML regex matches: {:?}", captures);

    let yaml_match = captures.name("yaml");
    let text_match = captures.name("text");
    trace!("YAML metadata: {:?}", yaml_match);
    trace!("markdown: {:?}", text_match);

    match (yaml_match, text_match) {
        (Some(yaml_match), Some(text_match)) => {
            trace!("YAML regex captures YAML and text");
            let yaml = &markdown[yaml_match.start()..yaml_match.end()];
            let text = &markdown[text_match.start()..text_match.end()];
            assert!(yaml.starts_with("---"));
            assert!(yaml.ends_with("...\n"));
            Some((yaml, text))
        }
        _ => {
            trace!("YAML regex fails to capture YAML");
            None
        }
    }
}

// Parse Markdown into a sequence of Blocks.
//
// Only paragraphs, headings, code blocks and images produce blocks.
// Emphasis, strong and strikethrough spans are turned into nested
// inlines; every other construct is ignored.
//
// Known limitation: text events are collected no matter which construct
// they appear in, and only the block kinds listed above drain the
// collection. Text belonging to a construct we do not handle therefore
// stays collected and ends up in the next block that drains it.
fn parse_blocks(markdown: &str) -> Vec<Block> {
    trace!("Parsing blocks");

    // Define the Markdown parser.
    let mut options = Options::empty();
    options.insert(Options::ENABLE_TABLES);
    options.insert(Options::ENABLE_FOOTNOTES);
    options.insert(Options::ENABLE_STRIKETHROUGH);
    options.insert(Options::ENABLE_TASKLISTS);
    options.insert(Options::ENABLE_SMART_PUNCTUATION);
    let parser = Parser::new_ext(markdown, options);

    // The sequence of blocks that represents the parsed document.
    let mut blocks = vec![];

    // The current set of inline elements we've collected. This gets
    // emptied whenever we finish a block.
    let mut inlines: Vec<Inline> = vec![];

    // For every emphasis-like span that is currently open, the length
    // `inlines` had when the span started. This lets the end event wrap
    // only the span's own content instead of everything collected so far
    // (otherwise `foo *bar*` would put "foo " inside the emphasis).
    let mut span_starts: Vec<usize> = vec![];

    for event in parser {
        trace!("Parsing event: {:?}", event);
        match event {
            // We ignore these for now. They're not needed for codegen.
            Event::Html(_)
            | Event::FootnoteReference(_)
            | Event::SoftBreak
            | Event::HardBreak
            | Event::Rule
            | Event::TaskListMarker(_) => (),

            // Inline text of various kinds.
            Event::Text(text) => inlines.push(inline_text(&text)),
            Event::Code(text) => inlines.push(inline_code(&text)),

            // Remember where an emphasis-like span begins.
            Event::Start(Tag::Emphasis)
            | Event::Start(Tag::Strong)
            | Event::Start(Tag::Strikethrough) => span_starts.push(inlines.len()),

            // For everything else we only handle the end events.
            Event::Start(_) => (),

            // End of a block or inline.
            Event::End(tag) => match tag {
                // Wrap the inlines collected since the matching start
                // event and put the wrapper back in their place.
                Tag::Emphasis | Tag::Strong | Tag::Strikethrough => {
                    // Falling back to 0 and clamping keep `split_off`
                    // from panicking should the bookkeeping ever be off.
                    let start = span_starts.pop().unwrap_or(0).min(inlines.len());
                    let mut span = inlines.split_off(start);
                    inline_from_inlines(&tag, &mut span);
                    inlines.append(&mut span);
                }
                Tag::Paragraph => blocks.push(paragraph(&mut inlines)),
                Tag::Heading(level, _fragment, _classes) => {
                    blocks.push(heading(level as i64, &mut inlines))
                }
                Tag::CodeBlock(kind) => blocks.push(code_block(&kind, &mut inlines)),
                Tag::Image(_link, dest, title) => blocks.push(image_block(&dest, &title)),
                // We don't handle anything else yet.
                _ => (),
            },
        }
    }

    // Inlines may be left over here; see the limitation described above
    // this function.

    trace!("Parsing blocks: OK");
    blocks
}

// Wrap plain text in a string inline.
fn inline_text(text: &str) -> Inline {
    let owned = String::from(text);
    Inline::Str(owned)
}

// Wrap text in an inline code element with empty attributes
// (no id, no classes, no key/value pairs).
fn inline_code(text: &str) -> Inline {
    Inline::Code((String::new(), Vec::new(), Vec::new()), String::from(text))
}

// Turn the collected inlines into a paragraph, leaving `inlines` empty.
fn paragraph(inlines: &mut Vec<Inline>) -> Block {
    let content = std::mem::take(inlines);
    Block::Para(content)
}

// Turn the collected inlines into a heading of the given level with
// empty attributes, leaving `inlines` empty.
fn heading(level: i64, inlines: &mut Vec<Inline>) -> Block {
    let title = std::mem::take(inlines);
    Block::Header(level, (String::new(), Vec::new(), Vec::new()), title)
}

// Build a paragraph whose only content is an image.
//
// The image gets empty attributes and an empty list of inlines; `dest`
// and `title` become its target pair.
fn image_block(dest: &str, title: &str) -> Block {
    let target = (dest.to_string(), title.to_string());
    let image = Inline::Image((String::new(), Vec::new(), Vec::new()), Vec::new(), target);
    Block::Para(vec![image])
}

fn code_block(kind: &CodeBlockKind, inlines: &mut Vec<Inline>) -> Block {
    trace!("code block: {:?}", kind);
    let attr = if let CodeBlockKind::Fenced(lang) = kind {
        trace!("fenced code block, lang={:?}", lang);
        parse_code_block_attrs(lang)
    } else {
        trace!("indented code block");
        parse_code_block_attrs("")
    };
    trace!("code block attrs: {:?}", attr);
    let mut code = String::new();
    for inline in inlines.drain(0..) {
        let text = plain_text_inline(inline);
        code.push_str(&text);
    }
    // pulldown_cmark and pandoc differ in their codeblock handling,
    // pulldown_cmark has an extra newline which we trim for now to be
    // compatible with pandoc's parsing
    if !code.is_empty() {
        assert_eq!(code.pop(), Some('\n'));
    }
    Block::CodeBlock(attr, code)
}

// Flatten an inline into its plain text.
//
// Strings and inline code yield their text; emphasis yields the
// concatenated text of its children. Panics on any other kind of inline.
fn plain_text_inline(inline: Inline) -> String {
    match inline {
        Inline::Str(text) | Inline::Code(_, text) => text,
        Inline::Emph(children) => children.into_iter().map(plain_text_inline).collect(),
        other => panic!("not text in code block: {:?}", other),
    }
}

// Parse the info string of a code block into attributes.
//
// Three shapes of input are recognised:
//
// * an empty string gives empty attributes;
// * `{...}` is split on ASCII whitespace, where `#x` sets the id (the
//   last one wins), `.x` adds a class, `k=v` adds a key/value pair, and
//   any other word is ignored;
// * anything else is taken verbatim as a single class.
fn parse_code_block_attrs(attrs: &str) -> Attr {
    trace!("parsing code block attrs: {:?}", attrs);
    let mut id = String::new();
    let mut classes = Vec::new();
    let mut keyvalues = Vec::new();

    let braced = attrs
        .strip_prefix('{')
        .and_then(|rest| rest.strip_suffix('}'));
    match braced {
        Some(inner) => {
            for token in inner.split_ascii_whitespace() {
                if let Some(name) = token.strip_prefix('#') {
                    id = name.to_string();
                } else if let Some(class) = token.strip_prefix('.') {
                    classes.push(class.to_string());
                } else if let Some(eq) = token.find('=') {
                    let (key, value) = (&token[..eq], &token[eq + 1..]);
                    keyvalues.push((key.to_string(), value.to_string()));
                }
            }
        }
        None if !attrs.is_empty() => classes.push(attrs.to_string()),
        None => {}
    }

    (id, classes, keyvalues)
}

// Replace the contents of `inlines` with a single inline that wraps
// them, chosen by `tag`: emphasis, strong or strikeout.
//
// Must only be called with one of those three tags; any other tag is a
// programming error and panics.
fn inline_from_inlines(tag: &Tag, inlines: &mut Vec<Inline>) {
    // Move the children out rather than cloning and then clearing: the
    // result is the same without a deep copy of every inline.
    let children = std::mem::take(inlines);

    let inline = match tag {
        Tag::Emphasis => Inline::Emph(children),
        Tag::Strong => Inline::Strong(children),
        Tag::Strikethrough => Inline::Strikeout(children),
        _ => unreachable!(),
    };

    inlines.push(inline);
}

/// Errors from Markdown parsing.
///
/// Both variants are transparent wrappers: their message and source are
/// those of the wrapped error.
#[derive(Debug, thiserror::Error)]
pub enum Error {
    /// A regular expression error.
    // NOTE(review): nothing in the code visible here constructs this
    // variant; it may be used elsewhere or be a leftover.
    #[error(transparent)]
    Regex(#[from] regex::Error),

    /// The YAML metadata block could not be deserialized.
    #[error(transparent)]
    Yaml(#[from] serde_yaml::Error),
}

// Document metadata.
//
// This is expressed in the Markdown input file as an embedded YAML
// block.
//
// Note that this structure needs to be able to capture any metadata
// block we can work with, in any input file. By being strict here we
// make it easier to tell the user when a metadata block has, say, a
// misspelled field.
//
// Strictness comes from `deny_unknown_fields`: a key that is not listed
// below makes deserialization fail.
#[derive(Debug, Default, Deserialize)]
#[serde(deny_unknown_fields)]
struct Metadata {
    // Document title. The only field that is neither optional nor
    // defaulted, so it has to be present in the YAML.
    title: String,
    // Optional document subtitle.
    subtitle: Option<String>,
    // Optional author.
    author: Option<String>,
    // Optional date, kept as free-form text.
    date: Option<String>,
    // Optional list of class names.
    // NOTE(review): how these are used is not visible here.
    classes: Option<Vec<String>>,
    // Optional list of bibliography file paths.
    bibliography: Option<Vec<PathBuf>>,
    // Optional list of bindings file paths.
    bindings: Option<Vec<PathBuf>>,
    // Optional document class name.
    documentclass: Option<String>,
    // File paths keyed by name (the unit test uses "python" as a key).
    // Defaults to an empty map when the YAML has no `impls` key.
    #[serde(default)]
    impls: BTreeMap<String, Vec<PathBuf>>,
}

impl Metadata {
    // Deserialize metadata from the text of a YAML block.
    //
    // Fails with a YAML error if the text is not valid YAML or does not
    // fit the `Metadata` structure.
    fn new(yaml_text: &str) -> Result<Self, Error> {
        trace!("Parsing YAML");
        Ok(serde_yaml::from_str::<Self>(yaml_text)?)
    }

    // Convert the metadata into a Pandoc metadata map.
    //
    // The title is always present. Optional fields appear only when they
    // are set, and `impls` only when it is not empty. Paths are rendered
    // as strings.
    fn to_map(&self) -> Map<String, MetaValue> {
        trace!("Creating metadata map from parsed YAML");
        let mut map: Map<String, MetaValue> = Map::new();

        map.insert(s("title"), meta_string(&self.title));
        if let Some(subtitle) = self.subtitle.as_deref() {
            map.insert(s("subtitle"), meta_string(subtitle));
        }
        if let Some(author) = self.author.as_deref() {
            map.insert(s("author"), meta_string(author));
        }
        if let Some(date) = self.date.as_deref() {
            map.insert(s("date"), meta_string(date));
        }
        if let Some(classes) = self.classes.as_deref() {
            map.insert(s("classes"), meta_strings(classes));
        }
        if !self.impls.is_empty() {
            let impls = self
                .impls
                .iter()
                .map(|(name, paths)| (name.to_owned(), Box::new(meta_path_bufs(paths))))
                .collect();
            map.insert(s("impls"), MetaValue::MetaMap(impls));
        }
        if let Some(bibliography) = self.bibliography.as_deref() {
            map.insert(s("bibliography"), meta_path_bufs(bibliography));
        }
        if let Some(bindings) = self.bindings.as_deref() {
            map.insert(s("bindings"), meta_path_bufs(bindings));
        }
        if let Some(documentclass) = self.documentclass.as_deref() {
            map.insert(s("documentclass"), meta_string(documentclass));
        }

        trace!("Created metadata map from parsed YAML");
        map
    }
}

// Shorthand for turning a string slice into an owned `String`.
fn s(s: &str) -> String {
    String::from(s)
}

// Wrap a string in a metadata string value.
fn meta_string(s: &str) -> MetaValue {
    let owned = String::from(s);
    MetaValue::MetaString(owned)
}

// Wrap a list of strings in a metadata list of string values,
// keeping their order.
fn meta_strings(v: &[String]) -> MetaValue {
    let items = v.iter().map(|item| meta_string(item.as_str())).collect();
    MetaValue::MetaList(items)
}

// Wrap a path in a metadata string value, using the path's display form.
fn meta_path_buf(p: &Path) -> MetaValue {
    let shown = p.display().to_string();
    meta_string(&shown)
}

// Wrap a list of paths in a metadata list of string values,
// keeping their order.
fn meta_path_bufs(v: &[PathBuf]) -> MetaValue {
    let items = v.iter().map(|path| meta_path_buf(path.as_path())).collect();
    MetaValue::MetaList(items)
}

// Unit tests for attribute parsing, Markdown parsing and metadata
// deserialization.
#[cfg(test)]
mod test {
    use super::{parse_code_block_attrs, AbstractSyntaxTree, Metadata};
    use super::{Block, Inline};
    use std::path::PathBuf;
    use std::str::FromStr;

    // Info strings: empty, bare word, braced id, and braced id + class +
    // key/value pair.
    #[test]
    fn code_block_attrs() {
        assert_eq!(parse_code_block_attrs(""), ("".to_string(), vec![], vec![]));
        assert_eq!(
            parse_code_block_attrs("foo"),
            ("".to_string(), vec!["foo".to_string()], vec![])
        );
        assert_eq!(
            parse_code_block_attrs("{#foo}"),
            ("foo".to_string(), vec![], vec![])
        );
        assert_eq!(
            parse_code_block_attrs("{#foo .file bar=yo}"),
            (
                "foo".to_string(),
                vec!["file".to_string()],
                vec![("bar".to_string(), "yo".to_string())]
            )
        );
    }

    // Empty input gives a document with no blocks and no metadata.
    #[test]
    fn empty_input() {
        let ast = AbstractSyntaxTree::from_str("").unwrap();
        let doc = ast.to_pandoc();
        assert!(doc.blocks.is_empty());
        assert!(doc.meta.is_empty());
        assert!(!doc.pandoc_api_version.is_empty());
    }

    // A heading followed by a paragraph, without any metadata.
    #[test]
    fn simple() {
        let ast = AbstractSyntaxTree::from_str(
            "\
            # Introduction \n\
            \n\
            First paragraph.\n\
            ",
        )
        .unwrap();
        let doc = ast.to_pandoc();
        assert!(doc.meta.is_empty());
        assert!(!doc.pandoc_api_version.is_empty());

        let attr = ("".to_string(), vec![], vec![]);
        let h = Block::Header(1, attr, vec![Inline::Str("Introduction".to_string())]);
        let para = Block::Para(vec![Inline::Str("First paragraph.".to_string())]);
        assert_eq!(doc.blocks, &[h, para]);
    }

    // Metadata block at the start of the input, after empty lines.
    // Only the metadata keys are checked, not the body.
    #[test]
    fn parses_leading_meta() {
        let markdown = "\n\n---\ntitle: Foo Bar\n...\nfoobar\n";
        let ast = AbstractSyntaxTree::from_str(markdown).unwrap();
        let doc = ast.to_pandoc();
        let keys: Vec<String> = doc.meta.keys().cloned().collect();
        assert_eq!(keys, ["title"]);
    }

    // Metadata block at the end of the input, before empty lines.
    // Only the metadata keys are checked, not the body.
    #[test]
    fn parses_trailing_meta() {
        let markdown = "foobar\n---\ntitle: Foo Bar\n...\n\n\n";
        let ast = AbstractSyntaxTree::from_str(markdown).unwrap();
        let doc = ast.to_pandoc();
        let keys: Vec<String> = doc.meta.keys().cloned().collect();
        assert_eq!(keys, ["title"]);
    }

    // Deserialize metadata that uses most of the supported fields and
    // check each value.
    #[test]
    fn full_meta() {
        let meta = Metadata::new(
            "\
title: Foo Bar
date: today
classes: [json, text]
impls:
  python:
   - foo.py
   - bar.py
bibliography:
- foo.bib
- bar.bib
bindings:
- foo.yaml
- bar.yaml
",
        )
        .unwrap();
        assert_eq!(meta.title, "Foo Bar");
        assert_eq!(meta.date.unwrap(), "today");
        assert_eq!(meta.classes.unwrap(), &["json", "text"]);
        assert_eq!(
            meta.bibliography.unwrap(),
            &[path("foo.bib"), path("bar.bib")]
        );
        assert_eq!(
            meta.bindings.unwrap(),
            &[path("foo.yaml"), path("bar.yaml")]
        );
        assert!(!meta.impls.is_empty());
        for (k, v) in meta.impls.iter() {
            assert_eq!(k, "python");
            assert_eq!(v, &[path("foo.py"), path("bar.py")]);
        }
    }

    // Shorthand for building a `PathBuf` from a string.
    fn path(s: &str) -> PathBuf {
        PathBuf::from(s)
    }
}