#![forbid(unsafe_code)]
#![warn(missing_docs, future_incompatible, rust_2018_idioms)]
use std::ops::Range;
use pulldown_cmark::{DefaultBrokenLinkCallback, Event, OffsetIter, Parser, Tag, TagEnd};
pub use pulldown_cmark::CodeBlockKind;
/// A code block extracted from a Markdown document by [`CodeBlockExtractor`].
///
/// The field descriptions below state how the extractor in this file fills
/// each field.
#[derive(Debug, Clone, PartialEq)]
pub struct CodeBlock {
/// Kind of block (fenced or indented) as reported by the parser, converted
/// to an owned `'static` value.
pub kind: CodeBlockKind<'static>,
/// First whitespace-separated word of the info string; `None` when the
/// info string is empty or blank.
pub language: Option<String>,
/// Info string of a fenced block; the empty string for indented blocks.
pub info_string: String,
/// Text of the info string that follows the language word, trimmed;
/// `None` when nothing follows it.
pub attributes: Option<String>,
/// Concatenation of the text events the parser emitted inside the block.
pub source: String,
/// Byte offsets into the Markdown input, from the start of the block's
/// opening event to the end of its last text or closing event.
pub byte_range: Range<usize>,
/// Lines of the Markdown input covered by the block. The unit tests expect
/// a zero-based, end-exclusive range (`2..5` for a block occupying the
/// third to fifth line of the input).
pub line_range: Range<usize>,
/// Number of leading whitespace characters found in front of the block's
/// start offset (intended as the block's indentation).
pub indent: usize,
}
impl CodeBlock {
#[must_use]
pub fn is_fenced(&self) -> bool {
self.kind.is_fenced()
}
#[must_use]
pub fn has_info_word(&self, word: &str) -> bool {
self.info_string
.split_whitespace()
.any(|token| token == word)
}
pub fn attributes(&self) -> impl Iterator<Item = &str> {
self.attributes.as_deref().unwrap_or("").split_whitespace()
}
#[must_use]
pub fn has_attribute(&self, attribute: &str) -> bool {
self.attributes().any(|token| token == attribute)
}
}
/// Iterator that yields the [`CodeBlock`]s found in a Markdown document.
///
/// Create one with [`CodeBlockExtractor::from_markdown`] or [`code_blocks`].
pub struct CodeBlockExtractor<'a> {
// Event stream of the Markdown parser, paired with source byte ranges.
parser: OffsetIter<'a, DefaultBrokenLinkCallback>,
// Original input, kept so byte offsets can be turned into line numbers and
// indentation.
markdown: &'a str,
}
impl<'a> CodeBlockExtractor<'a> {
    /// Creates an extractor over `markdown`.
    ///
    /// The text is parsed with [`Parser::new`]; no work happens until the
    /// returned iterator is advanced.
    #[must_use]
    pub fn from_markdown(markdown: &'a str) -> Self {
        let parser = Parser::new(markdown).into_offset_iter();
        Self { parser, markdown }
    }
}
impl Iterator for CodeBlockExtractor<'_> {
    type Item = CodeBlock;

    /// Skips parser events until a code block opens, then collects that block.
    ///
    /// Returns `None` once the parser has no further events.
    fn next(&mut self) -> Option<Self::Item> {
        // Only the opening event of a code block is of interest; every other
        // event is discarded.
        let (kind, start_range) = self.parser.find_map(|(event, range)| match event {
            Event::Start(Tag::CodeBlock(kind)) => Some((kind, range)),
            _ => None,
        })?;
        Some(self.collect_code_block(kind, start_range))
    }
}
impl CodeBlockExtractor<'_> {
    /// Consumes parser events up to the end of the code block that has just
    /// been opened and assembles the resulting [`CodeBlock`].
    ///
    /// `kind` and `start_range` are the payload and source range of the
    /// `Event::Start(Tag::CodeBlock(..))` event that opened the block.
    fn collect_code_block(
        &mut self,
        kind: CodeBlockKind<'_>,
        start_range: Range<usize>,
    ) -> CodeBlock {
        let mut source = String::new();
        // Falls back to the end of the opening event's range if the parser
        // produces no further events for this block.
        let mut end_offset = start_range.end;
        for (event, range) in &mut self.parser {
            match event {
                Event::Text(text) => {
                    source.push_str(&text);
                    end_offset = range.end;
                }
                Event::End(TagEnd::CodeBlock) => {
                    end_offset = range.end;
                    break;
                }
                _ => {}
            }
        }

        let kind = kind.into_static();
        let info_string = match &kind {
            CodeBlockKind::Fenced(info_string) => info_string.to_string(),
            CodeBlockKind::Indented => String::new(),
        };
        let (language, attributes) = parse_info_string(&info_string);
        let (start_line, indent) = start_line_and_indent(self.markdown, start_range.start);

        CodeBlock {
            kind,
            language,
            info_string,
            attributes,
            source,
            byte_range: start_range.start..end_offset,
            // The end is exclusive: counting the lines that start before
            // `end_offset` gives the index one past the block's last line.
            line_range: start_line..line_number(self.markdown, end_offset),
            indent,
        }
    }
}

/// Locates byte `offset` within `markdown` as `(line index, indent)`.
///
/// The line index is zero-based and equals the number of `\n` characters in
/// front of `offset`. The indent is the number of leading whitespace
/// characters on the part of that same line that precedes `offset`.
///
/// An `offset` that is out of bounds or not on a character boundary is
/// treated like offset `0`.
fn start_line_and_indent(markdown: &str, offset: usize) -> (usize, usize) {
    let before = markdown.get(..offset).unwrap_or("");
    let line_index = before.bytes().filter(|&byte| byte == b'\n').count();
    // `str::lines` yields no trailing empty line, so `lines().last()` would
    // return the *previous* line whenever `offset` sits at the start of a
    // line. Splitting at the last newline always yields the current line.
    let current_line = before.rsplit_once('\n').map_or(before, |(_, line)| line);
    let indent = current_line
        .chars()
        .take_while(|character| character.is_whitespace())
        .count();
    (line_index, indent)
}
/// Returns an iterator over the code blocks found in `markdown`.
///
/// Shorthand for [`CodeBlockExtractor::from_markdown`].
#[must_use]
pub fn code_blocks(markdown: &str) -> CodeBlockExtractor<'_> {
CodeBlockExtractor::from_markdown(markdown)
}
/// Splits a fence info string into `(language, attributes)`.
///
/// The language is the first whitespace-separated word of the trimmed info
/// string. The attributes are whatever follows it, trimmed. Either part is
/// `None` when it would be empty.
fn parse_info_string(info_string: &str) -> (Option<String>, Option<String>) {
    let info = info_string.trim();
    // Without any inner whitespace the whole string is the language word.
    let (language, rest) = info
        .split_once(char::is_whitespace)
        .unwrap_or((info, ""));
    let owned_if_present = |text: &str| (!text.is_empty()).then(|| text.to_string());
    (owned_if_present(language), owned_if_present(rest.trim()))
}
/// Counts the lines of `markdown` that begin before byte `offset`.
///
/// Lines are delimited as by [`str::lines`]; a partial line directly in front
/// of `offset` counts as a line.
///
/// # Panics
///
/// Panics if `offset` is past the end of `markdown` or not on a character
/// boundary.
fn line_number(markdown: &str, offset: usize) -> usize {
    let (preceding, _) = markdown.split_at(offset);
    preceding.lines().count()
}
// Unit tests for code-block extraction from Markdown input.
#[cfg(test)]
mod tests {
use super::{CodeBlockExtractor, CodeBlockKind, code_blocks};
// A fenced block after a heading: checks kind, language, attributes, info
// string, source text, line range and both token-lookup helpers.
#[test]
fn extracts_fenced_code_blocks() {
let markdown = "# Title\n\n```rust mdcr-skip key=value\nfn main() {}\n```\n";
let blocks = CodeBlockExtractor::from_markdown(markdown).collect::<Vec<_>>();
assert_eq!(blocks.len(), 1);
assert!(matches!(blocks[0].kind, CodeBlockKind::Fenced(_)));
assert_eq!(blocks[0].language.as_deref(), Some("rust"));
assert_eq!(blocks[0].attributes.as_deref(), Some("mdcr-skip key=value"));
assert_eq!(blocks[0].info_string, "rust mdcr-skip key=value");
assert_eq!(blocks[0].source, "fn main() {}\n");
// Zero-based, end-exclusive: the fence opens on the third line of the input.
assert_eq!(blocks[0].line_range, 2..5);
assert!(blocks[0].has_info_word("mdcr-skip"));
assert!(blocks[0].has_attribute("mdcr-skip"));
}
// The attribute text keeps its original spacing, while `attributes()` yields
// the individual whitespace-separated tokens.
#[test]
fn extracts_fenced_code_block_attributes() {
let markdown = "```rust a b c\nfn main() {}\n```\n";
let blocks = code_blocks(markdown).collect::<Vec<_>>();
assert_eq!(blocks[0].language.as_deref(), Some("rust"));
assert_eq!(blocks[0].attributes.as_deref(), Some("a b c"));
assert_eq!(blocks[0].attributes().collect::<Vec<_>>(), ["a", "b", "c"]);
}
// An indented block has no info string and therefore no language.
#[test]
fn extracts_indented_code_blocks() {
let markdown = "Before\n\n indented\n\nAfter\n";
let blocks = code_blocks(markdown).collect::<Vec<_>>();
assert_eq!(blocks.len(), 1);
assert!(matches!(blocks[0].kind, CodeBlockKind::Indented));
assert_eq!(blocks[0].language, None);
assert_eq!(blocks[0].source, "indented\n");
}
}