Skip to main content

pulldown_cmark_codeblock/
lib.rs

1//! Extract Markdown code blocks from Markdown documents parsed with
2//! [`pulldown-cmark`](https://crates.io/crates/pulldown-cmark).
3//!
4//! `pulldown-cmark` already exposes the fenced code block info string through
5//! `Event::Start(Tag::CodeBlock(CodeBlockKind::Fenced(info_string)))`. This
6//! crate builds on top of that lower-level event stream and returns complete,
7//! ready-to-use code block records:
8//!
9//! - fenced or indented block kind
10//! - language parsed from the first info string word
11//! - raw info string
12//! - remaining attributes as a raw string or token iterator
13//! - code block source text
14//! - byte range covering the whole block
15//! - zero-based line range covering the whole block
16//! - indentation before the opening marker
17//!
18//! # Example
19//!
20//! ````rust
21//! use pulldown_cmark_codeblock::{code_blocks, CodeBlockKind};
22//!
23//! let markdown = "# Title\n\n```rust runnable key=value\nfn main() {}\n```\n";
24//! let block = code_blocks(markdown).next().unwrap();
25//!
26//! assert!(matches!(block.kind, CodeBlockKind::Fenced(_)));
27//! assert_eq!(block.language.as_deref(), Some("rust"));
28//! assert_eq!(block.info_string, "rust runnable key=value");
29//! assert_eq!(block.attributes.as_deref(), Some("runnable key=value"));
30//! assert_eq!(block.attributes().collect::<Vec<_>>(), ["runnable", "key=value"]);
31//! assert_eq!(block.source, "fn main() {}\n");
32//! assert_eq!(block.line_range, 2..5);
33//! ````
34//!
35//! # API
36//!
37//! Use [`code_blocks`] for the concise iterator API:
38//!
39//! ````rust
40//! use pulldown_cmark_codeblock::code_blocks;
41//!
42//! let markdown = "Before\n\n```rust\nfn main() {}\n```\n";
43//! let blocks = code_blocks(markdown).collect::<Vec<_>>();
44//!
45//! assert_eq!(blocks.len(), 1);
46//! assert_eq!(blocks[0].language.as_deref(), Some("rust"));
47//! ````
48//!
49//! Use [`CodeBlockExtractor::from_markdown`] when you prefer constructing the
50//! iterator explicitly:
51//!
52//! ````rust
53//! use pulldown_cmark_codeblock::CodeBlockExtractor;
54//!
55//! let markdown = "```rust\nfn main() {}\n```\n";
56//! let blocks = CodeBlockExtractor::from_markdown(markdown).collect::<Vec<_>>();
57//!
58//! assert_eq!(blocks[0].source, "fn main() {}\n");
59//! ````
60//!
61//! Each extracted [`CodeBlock`] exposes:
62//!
63//! - [`CodeBlock::kind`]: [`CodeBlockKind::Fenced`] or
64//!   [`CodeBlockKind::Indented`]
65//! - [`CodeBlock::language`]: first info string word for fenced blocks
66//! - [`CodeBlock::info_string`]: complete fenced block info string
67//! - [`CodeBlock::attributes`]: remaining info string after the language
68//! - [`CodeBlock::source`]: code block body
69//! - [`CodeBlock::byte_range`]: byte range covering opening marker, body, and
70//!   closing marker
71//! - [`CodeBlock::line_range`]: zero-based line range covering the whole block
72//! - [`CodeBlock::indent`]: whitespace indentation before the opening marker
73//!
74//! It also provides helper methods:
75//!
76//! - [`CodeBlock::is_fenced`]
77//! - [`CodeBlock::has_info_word`]
78//! - [`CodeBlock::attributes`]
79//! - [`CodeBlock::has_attribute`]
80//!
81//! ````rust
82//! use pulldown_cmark_codeblock::code_blocks;
83//!
84//! let markdown = "```rust a b c\nfn main() {}\n```\n";
85//! let block = code_blocks(markdown).next().unwrap();
86//!
87//! assert!(block.is_fenced());
88//! assert!(block.has_info_word("rust"));
89//! assert!(block.has_attribute("b"));
90//! assert_eq!(block.attributes().collect::<Vec<_>>(), ["a", "b", "c"]);
91//! ````
92//!
93//! Indented code blocks are also returned. They do not have an info string,
94//! language, or attributes.
95//!
96//! ```rust
97//! use pulldown_cmark_codeblock::{code_blocks, CodeBlockKind};
98//!
99//! let markdown = "Before\n\n    indented\n\nAfter\n";
100//! let block = code_blocks(markdown).next().unwrap();
101//!
102//! assert!(matches!(block.kind, CodeBlockKind::Indented));
103//! assert_eq!(block.language, None);
104//! assert_eq!(block.info_string, "");
105//! assert_eq!(block.attributes, None);
106//! assert_eq!(block.source, "indented\n");
107//! ```
108#![forbid(unsafe_code)]
109#![warn(missing_docs, future_incompatible, rust_2018_idioms)]
110
111use std::ops::Range;
112
113use pulldown_cmark::{DefaultBrokenLinkCallback, Event, OffsetIter, Parser, Tag, TagEnd};
114
115pub use pulldown_cmark::CodeBlockKind;
116
117/// A code block extracted from a Markdown document.
118#[derive(Debug, Clone, PartialEq)]
119pub struct CodeBlock {
120    /// The code block kind.
121    pub kind: CodeBlockKind<'static>,
122    /// First word from the fenced code block info string.
123    ///
124    /// This is [`None`] for indented code blocks and for fenced code blocks
125    /// without an info string.
126    pub language: Option<String>,
127    /// Complete fenced code block info string, without the opening backticks.
128    ///
129    /// This is empty for indented code blocks.
130    pub info_string: String,
131    /// Remaining info string words after the language, if present.
132    pub attributes: Option<String>,
133    /// Code block body.
134    pub source: String,
135    /// Byte range covering the opening marker, body, and closing marker.
136    pub byte_range: Range<usize>,
137    /// Zero-based line range covering the opening marker, body, and closing marker.
138    pub line_range: Range<usize>,
139    /// Whitespace indentation before the opening marker.
140    pub indent: usize,
141}
142
143impl CodeBlock {
144    /// Returns true for fenced code blocks.
145    #[must_use]
146    pub fn is_fenced(&self) -> bool {
147        self.kind.is_fenced()
148    }
149
150    /// Returns true when the full info string contains `word` as a whitespace-separated token.
151    #[must_use]
152    pub fn has_info_word(&self, word: &str) -> bool {
153        self.info_string
154            .split_whitespace()
155            .any(|token| token == word)
156    }
157
158    /// Returns fenced code block attributes as whitespace-separated tokens.
159    ///
160    /// For ```` ```rust a b c ````, this yields `a`, `b`, and `c`.
161    pub fn attributes(&self) -> impl Iterator<Item = &str> {
162        self.attributes.as_deref().unwrap_or("").split_whitespace()
163    }
164
165    /// Returns true when the fenced code block attributes contain `attribute`.
166    #[must_use]
167    pub fn has_attribute(&self, attribute: &str) -> bool {
168        self.attributes().any(|token| token == attribute)
169    }
170}
171
172/// Iterator over Markdown code blocks.
173pub struct CodeBlockExtractor<'a> {
174    parser: OffsetIter<'a, DefaultBrokenLinkCallback>,
175    markdown: &'a str,
176}
177
178impl<'a> CodeBlockExtractor<'a> {
179    /// Creates an extractor from Markdown source.
180    #[must_use]
181    pub fn from_markdown(markdown: &'a str) -> Self {
182        Self {
183            parser: Parser::new(markdown).into_offset_iter(),
184            markdown,
185        }
186    }
187}
188
189impl Iterator for CodeBlockExtractor<'_> {
190    type Item = CodeBlock;
191
192    fn next(&mut self) -> Option<Self::Item> {
193        while let Some((event, range)) = self.parser.next() {
194            if let Event::Start(Tag::CodeBlock(kind)) = event {
195                return Some(self.collect_code_block(kind, range));
196            }
197        }
198
199        None
200    }
201}
202
203impl CodeBlockExtractor<'_> {
204    fn collect_code_block(
205        &mut self,
206        kind: CodeBlockKind<'_>,
207        start_range: Range<usize>,
208    ) -> CodeBlock {
209        let mut source = String::new();
210        let mut end_offset = start_range.end;
211
212        for (event, range) in &mut self.parser {
213            match event {
214                Event::Text(text) => {
215                    source.push_str(&text);
216                    end_offset = range.end;
217                }
218                Event::End(TagEnd::CodeBlock) => {
219                    end_offset = range.end;
220                    break;
221                }
222                _ => {}
223            }
224        }
225
226        let kind = kind.into_static();
227        let info_string = match &kind {
228            CodeBlockKind::Fenced(info_string) => info_string.to_string(),
229            CodeBlockKind::Indented => String::new(),
230        };
231        let (language, attributes) = parse_info_string(&info_string);
232        let indent = self
233            .markdown
234            .get(..start_range.start)
235            .and_then(|source| source.lines().last())
236            .unwrap_or("")
237            .chars()
238            .take_while(|character| character.is_whitespace())
239            .count();
240
241        CodeBlock {
242            kind,
243            language,
244            info_string,
245            attributes,
246            source,
247            byte_range: start_range.start..end_offset,
248            line_range: line_number(self.markdown, start_range.start)
249                .saturating_sub(usize::from(indent > 0))
250                ..line_number(self.markdown, end_offset),
251            indent,
252        }
253    }
254}
255
/// Returns an iterator over the code blocks in `markdown`.
///
/// This is a shorthand for [`CodeBlockExtractor::from_markdown`].
#[must_use]
pub fn code_blocks(markdown: &str) -> CodeBlockExtractor<'_> {
    CodeBlockExtractor::from_markdown(markdown)
}
261
/// Splits an info string into `(language, attributes)`.
///
/// The language is the first whitespace-separated word of the trimmed info
/// string; the attributes are whatever follows it, trimmed. Either part is
/// `None` when it would be empty.
fn parse_info_string(info_string: &str) -> (Option<String>, Option<String>) {
    // At most two pieces: the first word, then the untouched remainder.
    let mut pieces = info_string.trim().splitn(2, char::is_whitespace);

    let language = pieces
        .next()
        .filter(|word| !word.is_empty())
        .map(str::to_owned);
    let attributes = pieces
        .next()
        .map(str::trim)
        .filter(|rest| !rest.is_empty())
        .map(str::to_owned);

    (language, attributes)
}
280
/// Counts the lines of `markdown` that begin before byte `offset`.
///
/// A partial line immediately before `offset` is counted as a line. Panics if
/// `offset` is out of bounds or not on a character boundary.
fn line_number(markdown: &str, offset: usize) -> usize {
    let preceding = &markdown[..offset];

    preceding.lines().fold(0, |seen, _| seen + 1)
}
284
#[cfg(test)]
mod tests {
    use super::{CodeBlockExtractor, CodeBlockKind, code_blocks};

    /// A fenced block with language and attributes is fully described.
    #[test]
    fn extracts_fenced_code_blocks() {
        let blocks: Vec<_> = CodeBlockExtractor::from_markdown(
            "# Title\n\n```rust mdcr-skip key=value\nfn main() {}\n```\n",
        )
        .collect();

        assert_eq!(blocks.len(), 1);

        let block = &blocks[0];
        assert!(matches!(block.kind, CodeBlockKind::Fenced(_)));
        assert_eq!(block.language.as_deref(), Some("rust"));
        assert_eq!(block.attributes.as_deref(), Some("mdcr-skip key=value"));
        assert_eq!(block.info_string, "rust mdcr-skip key=value");
        assert_eq!(block.source, "fn main() {}\n");
        assert_eq!(block.line_range, 2..5);
        assert!(block.has_info_word("mdcr-skip"));
        assert!(block.has_attribute("mdcr-skip"));
    }

    /// Attributes are exposed both as a raw string and as tokens.
    #[test]
    fn extracts_fenced_code_block_attributes() {
        let blocks: Vec<_> = code_blocks("```rust a b c\nfn main() {}\n```\n").collect();

        let block = &blocks[0];
        assert_eq!(block.language.as_deref(), Some("rust"));
        assert_eq!(block.attributes.as_deref(), Some("a b c"));

        let tokens: Vec<_> = block.attributes().collect();
        assert_eq!(tokens, ["a", "b", "c"]);
    }

    /// Indented blocks are returned without a language.
    #[test]
    fn extracts_indented_code_blocks() {
        let blocks: Vec<_> = code_blocks("Before\n\n    indented\n\nAfter\n").collect();

        assert_eq!(blocks.len(), 1);

        let block = &blocks[0];
        assert!(matches!(block.kind, CodeBlockKind::Indented));
        assert_eq!(block.language, None);
        assert_eq!(block.source, "indented\n");
    }
}