use markdown::{
mdast::{Code, Node, Text},
to_mdast, ParseOptions,
};
use serde::de::DeserializeOwned;
use serde_yaml::Value;
use std::collections::VecDeque;
use thiserror::Error;
/// Public error type returned by [`find_yaml`].
///
/// This is an opaque wrapper around the private `ExtractionErrorImpl`, so the
/// concrete failure categories stay an implementation detail. The
/// `#[error(transparent)]` attribute delegates the error's `Display` output and
/// `source()` to the wrapped value, and `#[from]` generates the
/// `From<ExtractionErrorImpl>` conversion used by `.into()`.
#[derive(Error, Debug)]
#[error(transparent)]
pub struct ExtractionError(#[from] ExtractionErrorImpl);
/// Internal classification of why YAML extraction failed.
#[derive(Error, Debug)]
enum ExtractionErrorImpl {
/// The input parsed as YAML, but converting the parsed value into the
/// requested type failed (see the `serde_yaml::from_value` call in `extract_yaml`).
#[error("The YAML was valid, but it didn't match the expected format: {0}")]
YamlFoundButFormatWrong(serde_yaml::Error),
/// The input could not be parsed as YAML at all. `#[from]` lets the `?`
/// operator convert a `serde_yaml::Error` into this variant.
#[error("YAML parsing failed with: {0}")]
ParseError(#[from] serde_yaml::Error),
/// The string handed to the parser was empty. `find_yaml` also uses this as
/// its starting error, so it is what gets reported when no candidate produced
/// any other error.
#[error("The string to parse was empty")]
NoneFound,
}
impl ExtractionErrorImpl {
    /// Numeric priority of an error; a higher number is more informative.
    ///
    /// A format mismatch outranks a parse failure, which outranks "nothing found".
    fn priority(&self) -> u8 {
        match self {
            Self::YamlFoundButFormatWrong(_) => 2,
            Self::ParseError(_) => 1,
            Self::NoneFound => 0,
        }
    }

    /// Returns whichever of the two errors is the more informative one.
    ///
    /// When both errors have the same priority the first argument (`a`) wins,
    /// so `b` is only chosen when it is strictly more informative.
    fn most_representative(a: Self, b: Self) -> Self {
        if b.priority() > a.priority() {
            b
        } else {
            a
        }
    }
}
/// Parses `code_block` as YAML and deserializes it into `T`.
///
/// # Errors
///
/// * `NoneFound` when `code_block` is the empty string.
/// * `ParseError` when the text is not valid YAML.
/// * `YamlFoundButFormatWrong` when the YAML is valid but cannot be converted
///   into `T`.
fn extract_yaml<T: DeserializeOwned>(code_block: &str) -> Result<T, ExtractionErrorImpl> {
    if code_block.is_empty() {
        return Err(ExtractionErrorImpl::NoneFound);
    }
    // Parse into a generic `Value` first so that syntax errors and shape
    // mismatches can be reported as two distinct error variants.
    let document = serde_yaml::from_str::<Value>(code_block)?;
    serde_yaml::from_value(document).map_err(ExtractionErrorImpl::YamlFoundButFormatWrong)
}
pub fn find_yaml<T: DeserializeOwned>(text: &str) -> Result<Vec<T>, ExtractionError> {
let mut current_error = ExtractionErrorImpl::NoneFound;
if text.is_empty() {
return Err(current_error.into());
}
if !text.starts_with("```") {
match extract_yaml(text) {
Ok(o) => return Ok(vec![o]),
Err(e) => current_error = ExtractionErrorImpl::most_representative(current_error, e),
}
}
let options = ParseOptions::default();
let ast = to_mdast(text, &options).expect("we're not using MDX, so this should never fail");
let mut nodes = vec![ast];
let mut found: VecDeque<_> = VecDeque::new();
while let Some(node) = nodes.pop() {
if let Some(children) = node.children() {
children.iter().for_each(|child| nodes.push(child.clone()));
}
if let Node::Code(Code { value, lang, .. }) = node {
let lang = lang.unwrap_or_default();
match lang.as_str() {
"yaml" | "yml" | "json" | "" => {
let code_block = value.as_str();
match extract_yaml(code_block) {
Ok(o) => found.push_front(o),
Err(e) => {
current_error =
ExtractionErrorImpl::most_representative(current_error, e)
}
}
}
_ => {}
}
}
}
if !found.is_empty() {
Ok(found.into())
} else {
Err(current_error.into())
}
}
pub fn extract_labeled_text(text: &str) -> Vec<(String, String)> {
let options = ParseOptions::default();
let ast = to_mdast(text, &options).expect("markdown parsing can't fail");
let mut nodes = VecDeque::new();
nodes.push_back(ast);
let mut extracted_labels = Vec::new();
while let Some(node) = nodes.pop_front() {
let found = match &node {
Node::Text(Text { value, .. }) => {
extract_label_and_text(value.to_owned()).map(|(label, text)| (label, text))
}
Node::Paragraph(_) | Node::ListItem(_) => {
find_labeled_text(&node).map(|(label, text)| (label, text))
}
_ => None,
};
if let Some(kv) = found {
extracted_labels.push(kv)
} else if let Some(children) = node.children() {
for (index, child) in children.iter().cloned().enumerate() {
nodes.insert(index, child);
}
}
}
extracted_labels
}
fn find_labeled_text(n: &Node) -> Option<(String, String)> {
if let Node::Text(Text { value, .. }) = n {
extract_label_and_text(value.to_owned())
} else {
let children = n.children()?;
if children.len() == 2 {
let key = children
.get(0)
.map(inner_text)
.map(format_key)
.filter(|k| !k.is_empty());
let value = children.get(1).map(inner_text).map(format_value);
key.and_then(|key| value.map(|value| (key, value)))
} else {
None
}
}
}
/// Splits `text` at its first `:` into a trimmed `(label, text)` pair.
///
/// Returns `None` when there is no colon or when the label is empty after
/// trimming. The text part may be empty.
fn extract_label_and_text(text: String) -> Option<(String, String)> {
    let (raw_label, raw_body) = text.split_once(':')?;
    let label = raw_label.trim();
    if label.is_empty() {
        return None;
    }
    Some((label.to_owned(), raw_body.trim().to_owned()))
}
/// Concatenates the values of all text nodes found in `n` and its descendants,
/// in document order.
///
/// Returns an empty string when the subtree contains no text nodes.
fn inner_text(n: &Node) -> String {
    let mut text = String::new();
    // Depth-first, pre-order walk over borrowed nodes. A breadth-first walk
    // would emit shallow text before deeper text that precedes it in the
    // document (e.g. `[Emphasis[Text "a"], Text "b"]` would produce "ba").
    let mut stack: Vec<&Node> = vec![n];
    while let Some(node) = stack.pop() {
        if let Node::Text(Text { value, .. }) = node {
            text.push_str(value);
        }
        if let Some(children) = node.children() {
            // Reversed so the first child ends up on top of the stack.
            stack.extend(children.iter().rev());
        }
    }
    text
}
/// Normalises a label: trims surrounding whitespace, then drops one trailing
/// `:` if present. Nothing is trimmed again after the colon is removed.
fn format_key(s: String) -> String {
    let trimmed = s.trim();
    match trimmed.strip_suffix(':') {
        Some(without_colon) => without_colon.to_owned(),
        None => trimmed.to_owned(),
    }
}
/// Normalises a value: trims surrounding whitespace, drops one leading `:` if
/// present, then trims any whitespace that followed the colon.
fn format_value(s: String) -> String {
    let value = s.trim();
    // Fall back to the *trimmed* slice when there is no leading colon, so that
    // trailing whitespace is removed in both cases.
    value
        .strip_prefix(':')
        .unwrap_or(value)
        .trim_start()
        .to_owned()
}