pulldown_cmark_codeblock/lib.rs
1//! Extract Markdown code blocks from Markdown documents parsed with
2//! [`pulldown-cmark`](https://crates.io/crates/pulldown-cmark).
3//!
4//! `pulldown-cmark` already exposes the fenced code block info string through
5//! `Event::Start(Tag::CodeBlock(CodeBlockKind::Fenced(info_string)))`. This
6//! crate builds on top of that lower-level event stream and returns complete,
7//! ready-to-use code block records:
8//!
9//! - fenced or indented block kind
10//! - language parsed from the first info string word
11//! - raw info string
12//! - remaining attributes as a raw string or token iterator
13//! - code block source text
14//! - byte range covering the whole block
15//! - zero-based line range covering the whole block
16//! - indentation before the opening marker
17//!
18//! # Example
19//!
20//! ````rust
21//! use pulldown_cmark_codeblock::{code_blocks, CodeBlockKind};
22//!
23//! let markdown = "# Title\n\n```rust runnable key=value\nfn main() {}\n```\n";
24//! let block = code_blocks(markdown).next().unwrap();
25//!
26//! assert!(matches!(block.kind, CodeBlockKind::Fenced(_)));
27//! assert_eq!(block.language.as_deref(), Some("rust"));
28//! assert_eq!(block.info_string, "rust runnable key=value");
29//! assert_eq!(block.attributes.as_deref(), Some("runnable key=value"));
30//! assert_eq!(block.attributes().collect::<Vec<_>>(), ["runnable", "key=value"]);
31//! assert_eq!(block.source, "fn main() {}\n");
32//! assert_eq!(block.line_range, 2..5);
33//! ````
34//!
35//! # API
36//!
37//! Use [`code_blocks`] for the concise iterator API:
38//!
39//! ````rust
40//! use pulldown_cmark_codeblock::code_blocks;
41//!
42//! let markdown = "Before\n\n```rust\nfn main() {}\n```\n";
43//! let blocks = code_blocks(markdown).collect::<Vec<_>>();
44//!
45//! assert_eq!(blocks.len(), 1);
46//! assert_eq!(blocks[0].language.as_deref(), Some("rust"));
47//! ````
48//!
49//! Use [`CodeBlockExtractor::from_markdown`] when you prefer constructing the
50//! iterator explicitly:
51//!
52//! ````rust
53//! use pulldown_cmark_codeblock::CodeBlockExtractor;
54//!
55//! let markdown = "```rust\nfn main() {}\n```\n";
56//! let blocks = CodeBlockExtractor::from_markdown(markdown).collect::<Vec<_>>();
57//!
58//! assert_eq!(blocks[0].source, "fn main() {}\n");
59//! ````
60//!
61//! Each extracted [`CodeBlock`] exposes:
62//!
63//! - [`CodeBlock::kind`]: [`CodeBlockKind::Fenced`] or
64//! [`CodeBlockKind::Indented`]
65//! - [`CodeBlock::language`]: first info string word for fenced blocks
66//! - [`CodeBlock::info_string`]: complete fenced block info string
67//! - [`CodeBlock::attributes`]: remaining info string after the language
68//! - [`CodeBlock::source`]: code block body
69//! - [`CodeBlock::byte_range`]: byte range covering opening marker, body, and
70//! closing marker
71//! - [`CodeBlock::line_range`]: zero-based line range covering the whole block
72//! - [`CodeBlock::indent`]: whitespace indentation before the opening marker
73//!
74//! It also provides helper methods:
75//!
76//! - [`CodeBlock::is_fenced`]
77//! - [`CodeBlock::has_info_word`]
//! - [`CodeBlock::attributes()`]
79//! - [`CodeBlock::has_attribute`]
80//!
81//! ````rust
82//! use pulldown_cmark_codeblock::code_blocks;
83//!
84//! let markdown = "```rust a b c\nfn main() {}\n```\n";
85//! let block = code_blocks(markdown).next().unwrap();
86//!
87//! assert!(block.is_fenced());
88//! assert!(block.has_info_word("rust"));
89//! assert!(block.has_attribute("b"));
90//! assert_eq!(block.attributes().collect::<Vec<_>>(), ["a", "b", "c"]);
91//! ````
92//!
93//! Indented code blocks are also returned. They do not have an info string,
94//! language, or attributes.
95//!
96//! ```rust
97//! use pulldown_cmark_codeblock::{code_blocks, CodeBlockKind};
98//!
//! let markdown = "Before\n\n    indented\n\nAfter\n";
100//! let block = code_blocks(markdown).next().unwrap();
101//!
102//! assert!(matches!(block.kind, CodeBlockKind::Indented));
103//! assert_eq!(block.language, None);
104//! assert_eq!(block.info_string, "");
105//! assert_eq!(block.attributes, None);
106//! assert_eq!(block.source, "indented\n");
107//! ```
108#![forbid(unsafe_code)]
109#![warn(missing_docs, future_incompatible, rust_2018_idioms)]
110
111use std::ops::Range;
112
113use pulldown_cmark::{DefaultBrokenLinkCallback, Event, OffsetIter, Parser, Tag, TagEnd};
114
115pub use pulldown_cmark::CodeBlockKind;
116
117/// A code block extracted from a Markdown document.
118#[derive(Debug, Clone, PartialEq)]
119pub struct CodeBlock {
120 /// The code block kind.
121 pub kind: CodeBlockKind<'static>,
122 /// First word from the fenced code block info string.
123 ///
124 /// This is [`None`] for indented code blocks and for fenced code blocks
125 /// without an info string.
126 pub language: Option<String>,
127 /// Complete fenced code block info string, without the opening backticks.
128 ///
129 /// This is empty for indented code blocks.
130 pub info_string: String,
131 /// Remaining info string words after the language, if present.
132 pub attributes: Option<String>,
133 /// Code block body.
134 pub source: String,
135 /// Byte range covering the opening marker, body, and closing marker.
136 pub byte_range: Range<usize>,
137 /// Zero-based line range covering the opening marker, body, and closing marker.
138 pub line_range: Range<usize>,
139 /// Whitespace indentation before the opening marker.
140 pub indent: usize,
141}
142
143impl CodeBlock {
144 /// Returns true for fenced code blocks.
145 #[must_use]
146 pub fn is_fenced(&self) -> bool {
147 self.kind.is_fenced()
148 }
149
150 /// Returns true when the full info string contains `word` as a whitespace-separated token.
151 #[must_use]
152 pub fn has_info_word(&self, word: &str) -> bool {
153 self.info_string
154 .split_whitespace()
155 .any(|token| token == word)
156 }
157
158 /// Returns fenced code block attributes as whitespace-separated tokens.
159 ///
160 /// For ```` ```rust a b c ````, this yields `a`, `b`, and `c`.
161 pub fn attributes(&self) -> impl Iterator<Item = &str> {
162 self.attributes.as_deref().unwrap_or("").split_whitespace()
163 }
164
165 /// Returns true when the fenced code block attributes contain `attribute`.
166 #[must_use]
167 pub fn has_attribute(&self, attribute: &str) -> bool {
168 self.attributes().any(|token| token == attribute)
169 }
170}
171
172/// Iterator over Markdown code blocks.
173pub struct CodeBlockExtractor<'a> {
174 parser: OffsetIter<'a, DefaultBrokenLinkCallback>,
175 markdown: &'a str,
176}
177
178impl<'a> CodeBlockExtractor<'a> {
179 /// Creates an extractor from Markdown source.
180 #[must_use]
181 pub fn from_markdown(markdown: &'a str) -> Self {
182 Self {
183 parser: Parser::new(markdown).into_offset_iter(),
184 markdown,
185 }
186 }
187}
188
189impl Iterator for CodeBlockExtractor<'_> {
190 type Item = CodeBlock;
191
192 fn next(&mut self) -> Option<Self::Item> {
193 while let Some((event, range)) = self.parser.next() {
194 if let Event::Start(Tag::CodeBlock(kind)) = event {
195 return Some(self.collect_code_block(kind, range));
196 }
197 }
198
199 None
200 }
201}
202
203impl CodeBlockExtractor<'_> {
204 fn collect_code_block(
205 &mut self,
206 kind: CodeBlockKind<'_>,
207 start_range: Range<usize>,
208 ) -> CodeBlock {
209 let mut source = String::new();
210 let mut end_offset = start_range.end;
211
212 for (event, range) in &mut self.parser {
213 match event {
214 Event::Text(text) => {
215 source.push_str(&text);
216 end_offset = range.end;
217 }
218 Event::End(TagEnd::CodeBlock) => {
219 end_offset = range.end;
220 break;
221 }
222 _ => {}
223 }
224 }
225
226 let kind = kind.into_static();
227 let info_string = match &kind {
228 CodeBlockKind::Fenced(info_string) => info_string.to_string(),
229 CodeBlockKind::Indented => String::new(),
230 };
231 let (language, attributes) = parse_info_string(&info_string);
232 let indent = self
233 .markdown
234 .get(..start_range.start)
235 .and_then(|source| source.lines().last())
236 .unwrap_or("")
237 .chars()
238 .take_while(|character| character.is_whitespace())
239 .count();
240
241 CodeBlock {
242 kind,
243 language,
244 info_string,
245 attributes,
246 source,
247 byte_range: start_range.start..end_offset,
248 line_range: line_number(self.markdown, start_range.start)
249 .saturating_sub(usize::from(indent > 0))
250 ..line_number(self.markdown, end_offset),
251 indent,
252 }
253 }
254}
255
/// Returns an iterator over the code blocks of `markdown`.
///
/// This is a shorthand for [`CodeBlockExtractor::from_markdown`].
#[must_use]
pub fn code_blocks(markdown: &str) -> CodeBlockExtractor<'_> {
    CodeBlockExtractor::from_markdown(markdown)
}
261
/// Splits a fenced code block info string into `(language, attributes)`.
///
/// The language is the first whitespace-separated word; the attributes are the
/// trimmed remainder. Either part is `None` when it would be empty, so a blank
/// info string yields `(None, None)`.
fn parse_info_string(info_string: &str) -> (Option<String>, Option<String>) {
    let mut parts = info_string.trim().splitn(2, char::is_whitespace);

    // After trimming, the first piece is empty only when the whole info string
    // was blank.
    let language = match parts.next() {
        Some(word) if !word.is_empty() => word,
        _ => return (None, None),
    };
    let attributes = parts
        .next()
        .map(str::trim)
        .filter(|rest| !rest.is_empty());

    (Some(language.to_owned()), attributes.map(str::to_owned))
}
280
/// Counts the lines covered by `markdown[..offset]`.
///
/// A final line without a terminating `\n` is counted too, so for an offset at
/// the start of a line the result is that line's zero-based index, while for an
/// offset inside a line it is one more than that index.
///
/// Panics if `offset` is out of bounds or not on a character boundary.
fn line_number(markdown: &str, offset: usize) -> usize {
    let prefix = &markdown[..offset];

    // One piece per `\n`-terminated line, plus one for a trailing partial line;
    // an empty prefix produces no pieces.
    prefix.split_inclusive('\n').count()
}
284
#[cfg(test)]
mod tests {
    use super::{CodeBlockExtractor, CodeBlockKind, code_blocks};

    /// A fenced block exposes its language, attributes, body, and line range.
    #[test]
    fn extracts_fenced_code_blocks() {
        let markdown = "# Title\n\n```rust mdcr-skip key=value\nfn main() {}\n```\n";

        let blocks = CodeBlockExtractor::from_markdown(markdown).collect::<Vec<_>>();

        assert_eq!(blocks.len(), 1);
        assert!(matches!(blocks[0].kind, CodeBlockKind::Fenced(_)));
        assert_eq!(blocks[0].language.as_deref(), Some("rust"));
        assert_eq!(blocks[0].attributes.as_deref(), Some("mdcr-skip key=value"));
        assert_eq!(blocks[0].info_string, "rust mdcr-skip key=value");
        assert_eq!(blocks[0].source, "fn main() {}\n");
        assert_eq!(blocks[0].line_range, 2..5);
        assert!(blocks[0].has_info_word("mdcr-skip"));
        assert!(blocks[0].has_attribute("mdcr-skip"));
    }

    /// Attributes are available both as a raw string and as separate tokens.
    #[test]
    fn extracts_fenced_code_block_attributes() {
        let markdown = "```rust a b c\nfn main() {}\n```\n";

        let blocks = code_blocks(markdown).collect::<Vec<_>>();

        assert_eq!(blocks[0].language.as_deref(), Some("rust"));
        assert_eq!(blocks[0].attributes.as_deref(), Some("a b c"));
        assert_eq!(blocks[0].attributes().collect::<Vec<_>>(), ["a", "b", "c"]);
    }

    /// An indented block is returned without language information.
    #[test]
    fn extracts_indented_code_blocks() {
        // An indented code block needs at least four leading spaces; with fewer
        // the line is an ordinary paragraph and no block would be extracted.
        let markdown = "Before\n\n    indented\n\nAfter\n";

        let blocks = code_blocks(markdown).collect::<Vec<_>>();

        assert_eq!(blocks.len(), 1);
        assert!(matches!(blocks[0].kind, CodeBlockKind::Indented));
        assert_eq!(blocks[0].language, None);
        assert_eq!(blocks[0].source, "indented\n");
    }
}
328}