use lazy_static::lazy_static;
use log::trace;
use pandoc_ast::{Attr, Block, Inline, Map, MetaValue, Pandoc};
use pulldown_cmark::{CodeBlockKind, Event, Options, Parser, Tag};
use regex::Regex;
use serde::Deserialize;
use std::collections::BTreeMap;
use std::path::{Path, PathBuf};
lazy_static! {
// Matches a YAML metadata block near the start of a document.
//
// Any number of lines consisting only of non-whitespace characters
// (including empty lines) may precede the block and are skipped. The `yaml`
// group is a line of three or more dashes, followed by content lines whose
// first character is not a dot, followed by a line of three or more dots.
// Everything after the block is captured in the `text` group.
//
// NOTE(review): the skipped prefix is written with `\S*` (non-whitespace),
// so a leading line such as a bare word is skipped and not captured;
// `\s*` may have been intended — confirm.
static ref LEADING_YAML_PATTERN: Regex = Regex::new(r"^(?:\S*\n)*(?P<yaml>-{3,}\n([^.].*\n)*\.{3,}\n)(?P<text>(.*\n)*)$").unwrap();
// Matches a YAML metadata block near the end of a document.
//
// The `text` group captures the lines before the block; the `yaml` group has
// the same shape as in the leading pattern. Any number of lines consisting
// only of non-whitespace characters (including empty lines) may follow the
// block and are not captured.
static ref TRAILING_YAML_PATTERN: Regex = Regex::new(r"(?P<text>(.*\n)*)\n*(?P<yaml>-{3,}\n([^.].*\n)*\.{3,}\n)(?:\S*\n)*$").unwrap();
}
/// A parsed markdown document: its content blocks plus its metadata.
///
/// Values are created by parsing text via the `FromStr` implementation and
/// can be converted into a `pandoc_ast::Pandoc` with `to_pandoc`.
#[derive(Debug)]
pub struct AbstractSyntaxTree {
// Content blocks, in the order they were produced by the parser.
blocks: Vec<Block>,
// Document metadata, keyed by field name.
meta: Map<String, MetaValue>,
}
impl AbstractSyntaxTree {
fn new(meta: Map<String, MetaValue>, blocks: Vec<Block>) -> Self {
Self { blocks, meta }
}
pub fn to_pandoc(&self) -> Pandoc {
Pandoc {
meta: self.meta.clone(),
blocks: self.blocks.clone(),
pandoc_api_version: vec![1, 20],
}
}
}
impl std::str::FromStr for AbstractSyntaxTree {
    type Err = Error;

    /// Parse a markdown document, with an optional YAML metadata block, into
    /// an abstract syntax tree.
    ///
    /// A metadata block is looked for first at the start of the document and
    /// then at the end. When one is found, it is deserialized into the
    /// document metadata and only the remaining text is parsed as markdown.
    /// When none is found, the metadata is empty and the whole input is
    /// parsed as markdown.
    ///
    /// # Errors
    ///
    /// Returns `Error::Yaml` if a metadata block is found but cannot be
    /// deserialized.
    fn from_str(markdown: &str) -> Result<Self, Self::Err> {
        trace!("Parsing markdown");
        let ast = if let Some((yaml, text)) = get_yaml(&LEADING_YAML_PATTERN, markdown) {
            trace!("Found leading YAML: {:?}", yaml);
            let meta = Metadata::new(yaml)?.to_map();
            let blocks = parse_blocks(text);
            AbstractSyntaxTree::new(meta, blocks)
        } else if let Some((yaml, text)) = get_yaml(&TRAILING_YAML_PATTERN, markdown) {
            trace!("Found trailing YAML: {:?}", yaml);
            let meta = Metadata::new(yaml)?.to_map();
            // Parse only the text captured before the metadata block. Parsing
            // the full input here would turn the YAML itself into content.
            let blocks = parse_blocks(text);
            AbstractSyntaxTree::new(meta, blocks)
        } else {
            trace!("No YAML to be found");
            let blocks = parse_blocks(markdown);
            AbstractSyntaxTree::new(Map::new(), blocks)
        };
        trace!("Parsing markdown: OK");
        Ok(ast)
    }
}
/// Split `markdown` into a YAML metadata block and the remaining text.
///
/// `pat` must define the named groups `yaml` and `text`. Returns
/// `Some((yaml, text))` when the pattern matches and both groups took part
/// in the match, and `None` otherwise. Both returned slices borrow from
/// `markdown`.
///
/// # Panics
///
/// Panics if the captured YAML does not start with `---` or does not end
/// with `...` followed by a newline.
fn get_yaml<'a>(pat: &Regex, markdown: &'a str) -> Option<(&'a str, &'a str)> {
    trace!("Markdown: {:?}", markdown);

    let captures = match pat.captures(markdown) {
        Some(captures) => captures,
        None => {
            trace!("YAML regex does not match");
            return None;
        }
    };
    trace!("YAML regex matches: {:?}", captures);

    let yaml_match = captures.name("yaml");
    let text_match = captures.name("text");
    trace!("YAML metadata: {:?}", yaml_match);
    trace!("markdown: {:?}", text_match);

    match (yaml_match, text_match) {
        (Some(yaml_match), Some(text_match)) => {
            trace!("YAML regex captures YAML and text");
            let yaml = yaml_match.as_str();
            let text = text_match.as_str();
            assert!(yaml.starts_with("---"));
            assert!(yaml.ends_with("...\n"));
            Some((yaml, text))
        }
        _ => {
            trace!("YAML regex fails to capture YAML");
            None
        }
    }
}
/// Parse markdown text into a list of Pandoc blocks.
///
/// Only paragraphs, headings, code blocks and images produce blocks. Inline
/// text, inline code, emphasis, strong emphasis and strikethrough are
/// collected as the inline content of the block being built. All other
/// events (HTML, footnote references, breaks, rules, task list markers and
/// any other container) are ignored.
fn parse_blocks(markdown: &str) -> Vec<Block> {
    trace!("Parsing blocks");

    let mut options = Options::empty();
    options.insert(Options::ENABLE_TABLES);
    options.insert(Options::ENABLE_FOOTNOTES);
    options.insert(Options::ENABLE_STRIKETHROUGH);
    options.insert(Options::ENABLE_TASKLISTS);
    options.insert(Options::ENABLE_SMART_PUNCTUATION);
    let parser = Parser::new_ext(markdown, options);

    let mut blocks = vec![];
    // Inline content collected for the block currently being built.
    let mut inlines: Vec<Inline> = vec![];
    // For every currently open emphasis-like span, the index in `inlines`
    // at which its content starts.
    let mut span_starts: Vec<usize> = vec![];

    for event in parser {
        trace!("Parsing event: {:?}", event);
        match event {
            // Ignored: these do not contribute to the blocks we build.
            Event::Html(_)
            | Event::FootnoteReference(_)
            | Event::SoftBreak
            | Event::HardBreak
            | Event::Rule
            | Event::TaskListMarker(_) => (),

            // Inline content for the block being built.
            Event::Text(text) => inlines.push(inline_text(&text)),
            Event::Code(text) => inlines.push(inline_code(&text)),

            // A block we build is starting. Drop inlines left behind by
            // containers we do not handle, so that they do not leak into
            // this block. Such leftovers were never part of any block.
            Event::Start(Tag::Paragraph)
            | Event::Start(Tag::Heading(..))
            | Event::Start(Tag::CodeBlock(_)) => {
                inlines.clear();
                span_starts.clear();
            }

            // Remember where the content of an emphasis-like span begins, so
            // that closing it wraps only the span's own inlines.
            Event::Start(Tag::Emphasis)
            | Event::Start(Tag::Strong)
            | Event::Start(Tag::Strikethrough) => span_starts.push(inlines.len()),

            // Everything else is handled when it ends, if at all.
            Event::Start(_) => (),

            Event::End(tag) => match tag {
                Tag::Emphasis | Tag::Strong | Tag::Strikethrough => {
                    // Wrap only the inlines collected since the span opened;
                    // text preceding the span stays outside of it.
                    let start = span_starts.pop().unwrap_or(0).min(inlines.len());
                    let mut span = inlines.split_off(start);
                    inline_from_inlines(&tag, &mut span);
                    inlines.append(&mut span);
                }
                Tag::Paragraph => blocks.push(paragraph(&mut inlines)),
                Tag::Heading(level, _fragment, _classes) => {
                    blocks.push(heading(level as i64, &mut inlines))
                }
                Tag::CodeBlock(kind) => blocks.push(code_block(&kind, &mut inlines)),
                Tag::Image(_link, dest, title) => blocks.push(image_block(&dest, &title)),
                // Nothing else is handled.
                _ => (),
            },
        }
    }

    trace!("Parsing blocks: OK");
    blocks
}
/// Wrap plain text in a string inline.
fn inline_text(text: &str) -> Inline {
    let owned = String::from(text);
    Inline::Str(owned)
}
/// Wrap text in an inline code element with empty attributes.
fn inline_code(text: &str) -> Inline {
    let no_attrs = (String::new(), Vec::new(), Vec::new());
    Inline::Code(no_attrs, text.to_owned())
}
/// Build a paragraph block from the collected inlines.
///
/// The inlines are moved into the block, leaving `inlines` empty.
fn paragraph(inlines: &mut Vec<Inline>) -> Block {
    let content = std::mem::take(inlines);
    Block::Para(content)
}
/// Build a heading block of the given level from the collected inlines.
///
/// The heading gets empty attributes. The inlines are moved into the block,
/// leaving `inlines` empty.
fn heading(level: i64, inlines: &mut Vec<Inline>) -> Block {
    let content = std::mem::take(inlines);
    let no_attrs = (String::new(), Vec::new(), Vec::new());
    Block::Header(level, no_attrs, content)
}
/// Build a paragraph block holding a single image.
///
/// The image has empty attributes and no alternative-text inlines; its
/// target is the pair of `dest` and `title`.
fn image_block(dest: &str, title: &str) -> Block {
    let no_attrs = (String::new(), Vec::new(), Vec::new());
    let target = (dest.to_owned(), title.to_owned());
    let image = Inline::Image(no_attrs, Vec::new(), target);
    Block::Para(vec![image])
}
fn code_block(kind: &CodeBlockKind, inlines: &mut Vec<Inline>) -> Block {
trace!("code block: {:?}", kind);
let attr = if let CodeBlockKind::Fenced(lang) = kind {
trace!("fenced code block, lang={:?}", lang);
parse_code_block_attrs(lang)
} else {
trace!("indented code block");
parse_code_block_attrs("")
};
trace!("code block attrs: {:?}", attr);
let mut code = String::new();
for inline in inlines.drain(0..) {
let text = plain_text_inline(inline);
code.push_str(&text);
}
if !code.is_empty() {
assert_eq!(code.pop(), Some('\n'));
}
Block::CodeBlock(attr, code)
}
/// Extract the plain text of an inline.
///
/// String and code inlines yield their text; an emphasis inline yields the
/// concatenated text of its children.
///
/// # Panics
///
/// Panics on any other kind of inline.
fn plain_text_inline(inline: Inline) -> String {
    match inline {
        Inline::Str(text) | Inline::Code(_, text) => text,
        Inline::Emph(children) => children.into_iter().map(plain_text_inline).collect(),
        other => panic!("not text in code block: {:?}", other),
    }
}
/// Parse the info string of a fenced code block into Pandoc attributes.
///
/// If the string is wrapped in braces, its whitespace-separated words are
/// interpreted as follows: `#name` sets the identifier (the last one wins),
/// `.name` adds a class, and `key=value` adds a key/value pair; other words
/// are ignored. Otherwise a non-empty string becomes the sole class, and an
/// empty string yields empty attributes.
fn parse_code_block_attrs(attrs: &str) -> Attr {
    trace!("parsing code block attrs: {:?}", attrs);
    let mut id = String::new();
    let mut classes = Vec::new();
    let mut keyvalues = Vec::new();

    let braced = attrs
        .strip_prefix('{')
        .and_then(|rest| rest.strip_suffix('}'));
    match braced {
        Some(inner) => {
            for word in inner.split_ascii_whitespace() {
                if let Some(name) = word.strip_prefix('#') {
                    id = name.to_owned();
                } else if let Some(class) = word.strip_prefix('.') {
                    classes.push(class.to_owned());
                } else if let Some((key, value)) = word.split_once('=') {
                    keyvalues.push((key.to_owned(), value.to_owned()));
                }
            }
        }
        None if attrs.is_empty() => (),
        None => classes.push(attrs.to_owned()),
    }

    (id, classes, keyvalues)
}
/// Replace the collected inlines with a single inline that wraps them.
///
/// The kind of wrapper is chosen by `tag`: emphasis, strong emphasis or
/// strikeout. After the call, `inlines` holds exactly one element.
///
/// # Panics
///
/// Panics if `tag` is not `Tag::Emphasis`, `Tag::Strong` or
/// `Tag::Strikethrough`.
fn inline_from_inlines(tag: &Tag, inlines: &mut Vec<Inline>) {
    // Move the children out instead of cloning and then clearing them.
    let children = std::mem::take(inlines);
    let wrapped = match tag {
        Tag::Emphasis => Inline::Emph(children),
        Tag::Strong => Inline::Strong(children),
        Tag::Strikethrough => Inline::Strikeout(children),
        _ => unreachable!(),
    };
    inlines.push(wrapped);
}
/// Errors that can occur while parsing a markdown document.
#[derive(Debug, thiserror::Error)]
pub enum Error {
/// A regular expression error, passed through unchanged.
#[error(transparent)]
Regex(#[from] regex::Error),
/// A YAML error, passed through unchanged; produced when the document
/// metadata cannot be deserialized.
#[error(transparent)]
Yaml(#[from] serde_yaml::Error),
}
/// Document metadata as deserialized from the YAML metadata block.
///
/// Unknown fields in the YAML are rejected (`deny_unknown_fields`).
#[derive(Debug, Default, Deserialize)]
#[serde(deny_unknown_fields)]
struct Metadata {
// Document title; the only field that is neither optional nor defaulted.
title: String,
// Optional subtitle.
subtitle: Option<String>,
// Optional author.
author: Option<String>,
// Optional date, kept as free-form text.
date: Option<String>,
// Optional list of class names.
classes: Option<Vec<String>>,
// Optional list of bibliography file paths.
bibliography: Option<Vec<PathBuf>>,
// Optional list of bindings file paths.
bindings: Option<Vec<PathBuf>>,
// Optional document class name.
documentclass: Option<String>,
// Lists of file paths keyed by name; empty when absent from the YAML.
#[serde(default)]
impls: BTreeMap<String, Vec<PathBuf>>,
}
impl Metadata {
    /// Deserialize metadata from YAML text.
    ///
    /// # Errors
    ///
    /// Returns `Error::Yaml` if the text cannot be deserialized.
    fn new(yaml_text: &str) -> Result<Self, Error> {
        trace!("Parsing YAML");
        let parsed: Self = serde_yaml::from_str(yaml_text)?;
        Ok(parsed)
    }

    /// Convert the metadata into a Pandoc metadata map.
    ///
    /// The title is always present. Optional fields are included only when
    /// they are set, and `impls` only when it is not empty. Paths are stored
    /// as strings.
    fn to_map(&self) -> Map<String, MetaValue> {
        trace!("Creating metadata map from parsed YAML");
        let mut map: Map<String, MetaValue> = Map::new();

        map.insert(s("title"), meta_string(&self.title));
        if let Some(subtitle) = self.subtitle.as_deref() {
            map.insert(s("subtitle"), meta_string(subtitle));
        }
        if let Some(author) = self.author.as_deref() {
            map.insert(s("author"), meta_string(author));
        }
        if let Some(date) = self.date.as_deref() {
            map.insert(s("date"), meta_string(date));
        }
        if let Some(classes) = self.classes.as_deref() {
            map.insert(s("classes"), meta_strings(classes));
        }

        if !self.impls.is_empty() {
            let mut impls = Map::new();
            for (name, paths) in &self.impls {
                impls.insert(name.clone(), Box::new(meta_path_bufs(paths)));
            }
            map.insert(s("impls"), MetaValue::MetaMap(impls));
        }

        if let Some(bibliography) = self.bibliography.as_deref() {
            map.insert(s("bibliography"), meta_path_bufs(bibliography));
        }
        if let Some(bindings) = self.bindings.as_deref() {
            map.insert(s("bindings"), meta_path_bufs(bindings));
        }
        if let Some(documentclass) = self.documentclass.as_deref() {
            map.insert(s("documentclass"), meta_string(documentclass));
        }

        trace!("Created metadata map from parsed YAML");
        map
    }
}
/// Shorthand for turning a string slice into an owned `String`.
fn s(s: &str) -> String {
    String::from(s)
}
/// Wrap a string in a metadata string value.
fn meta_string(s: &str) -> MetaValue {
    let owned = s.to_owned();
    MetaValue::MetaString(owned)
}
/// Convert a slice of strings into a metadata list of string values.
fn meta_strings(v: &[String]) -> MetaValue {
    let items = v.iter().map(|item| meta_string(item.as_str())).collect();
    MetaValue::MetaList(items)
}
/// Convert a path into a metadata string value, using the path's display
/// form.
fn meta_path_buf(p: &Path) -> MetaValue {
    let shown = p.display().to_string();
    meta_string(&shown)
}
/// Convert a slice of paths into a metadata list of string values.
fn meta_path_bufs(v: &[PathBuf]) -> MetaValue {
    let items = v.iter().map(|path| meta_path_buf(path.as_path())).collect();
    MetaValue::MetaList(items)
}
#[cfg(test)]
mod test {
    use super::{parse_code_block_attrs, AbstractSyntaxTree, Metadata};
    use super::{Block, Inline};
    use std::path::PathBuf;
    use std::str::FromStr;

    // Info strings: empty, bare word, and braced id/class/key=value forms.
    #[test]
    fn code_block_attrs() {
        assert_eq!(parse_code_block_attrs(""), ("".to_string(), vec![], vec![]));
        assert_eq!(
            parse_code_block_attrs("foo"),
            ("".to_string(), vec!["foo".to_string()], vec![])
        );
        assert_eq!(
            parse_code_block_attrs("{#foo}"),
            ("foo".to_string(), vec![], vec![])
        );
        assert_eq!(
            parse_code_block_attrs("{#foo .file bar=yo}"),
            (
                "foo".to_string(),
                vec!["file".to_string()],
                vec![("bar".to_string(), "yo".to_string())]
            )
        );
    }

    // An empty document has no blocks and no metadata.
    #[test]
    fn empty_input() {
        let ast = AbstractSyntaxTree::from_str("").unwrap();
        let doc = ast.to_pandoc();
        assert!(doc.blocks.is_empty());
        assert!(doc.meta.is_empty());
        assert!(!doc.pandoc_api_version.is_empty());
    }

    // A heading followed by a paragraph, without metadata.
    #[test]
    fn simple() {
        let ast = AbstractSyntaxTree::from_str(
            "\
            # Introduction \n\
            \n\
            First paragraph.\n\
            ",
        )
        .unwrap();
        let doc = ast.to_pandoc();
        assert!(doc.meta.is_empty());
        assert!(!doc.pandoc_api_version.is_empty());

        let attr = ("".to_string(), vec![], vec![]);
        let h = Block::Header(1, attr, vec![Inline::Str("Introduction".to_string())]);
        let para = Block::Para(vec![Inline::Str("First paragraph.".to_string())]);
        assert_eq!(doc.blocks, &[h, para]);
    }

    // Metadata block at the start of the document, after empty lines.
    #[test]
    fn parses_leading_meta() {
        let markdown = "\n\n---\ntitle: Foo Bar\n...\nfoobar\n";
        let ast = AbstractSyntaxTree::from_str(markdown).unwrap();
        let doc = ast.to_pandoc();
        let keys: Vec<String> = doc.meta.keys().cloned().collect();
        assert_eq!(keys, ["title"]);
    }

    // Metadata block at the end of the document, followed by empty lines.
    #[test]
    fn parses_trailing_meta() {
        let markdown = "foobar\n---\ntitle: Foo Bar\n...\n\n\n";
        let ast = AbstractSyntaxTree::from_str(markdown).unwrap();
        let doc = ast.to_pandoc();
        let keys: Vec<String> = doc.meta.keys().cloned().collect();
        assert_eq!(keys, ["title"]);
    }

    // Every supported metadata field, deserialized directly from YAML.
    //
    // The lines of the YAML literal start at column 0 on purpose, and
    // `python` must be indented so that it nests under `impls`; otherwise it
    // is a top-level key, which `Metadata` rejects as unknown.
    #[test]
    fn full_meta() {
        let meta = Metadata::new(
            "\
title: Foo Bar
date: today
classes: [json, text]
impls:
  python:
  - foo.py
  - bar.py
bibliography:
- foo.bib
- bar.bib
bindings:
- foo.yaml
- bar.yaml
",
        )
        .unwrap();
        assert_eq!(meta.title, "Foo Bar");
        assert_eq!(meta.date.unwrap(), "today");
        assert_eq!(meta.classes.unwrap(), &["json", "text"]);
        assert_eq!(
            meta.bibliography.unwrap(),
            &[path("foo.bib"), path("bar.bib")]
        );
        assert_eq!(
            meta.bindings.unwrap(),
            &[path("foo.yaml"), path("bar.yaml")]
        );
        assert!(!meta.impls.is_empty());
        for (k, v) in meta.impls.iter() {
            assert_eq!(k, "python");
            assert_eq!(v, &[path("foo.py"), path("bar.py")]);
        }
    }

    // Helper for building expected path values.
    fn path(s: &str) -> PathBuf {
        PathBuf::from(s)
    }
}