pulldown_cmark_frontmatter/
lib.rs

1#![doc = include_str!("../README.md")]
2#![forbid(unsafe_code)]
3#![warn(
4    clippy::cargo,
5    missing_docs,
6    clippy::pedantic,
7    future_incompatible,
8    rust_2018_idioms
9)]
10#![allow(
11    clippy::option_if_let_else,
12    clippy::module_name_repetitions,
13    clippy::missing_errors_doc
14)]
15
16use std::vec;
17
18use pulldown_cmark::{CodeBlockKind, CowStr, Event, DefaultBrokenLinkCallback};
19
/// Extracts [`Frontmatter`] from any `Iterator<Item =
/// pulldown_cmark::Event<'_>>`.
///
/// This type implements `Iterator<Item = pulldown_cmark::Event<'_>>`, so it can
/// be used interchangeably with any Markdown processing code that previously
/// interacted with [`pulldown_cmark::Parser`].
///
/// This type's [`Event`] iterator will look for a top-level
/// heading (h1) and/or a code block at the start of the document. If either or
/// both are detected, [`FrontmatterExtractor::frontmatter`] will be populated
/// with the detected [`Frontmatter`].
///
/// If a code block is detected in the frontmatter, the code block's
/// [`Event`]s will not be returned when iterating. The top-level
/// heading's events will be returned as they appear in the original iterator.
pub struct FrontmatterExtractor<'a, T>
where
    T: Iterator<Item = Event<'a>>,
{
    /// The detected frontmatter, if any.
    pub frontmatter: Option<Frontmatter<'a>>,
    // The wrapped event iterator that is scanned for frontmatter.
    source: T,
    // Where the extractor currently is within the document.
    state: DocumentAttributeParserState<'a>,
}
44
45impl<'a, T> FrontmatterExtractor<'a, T>
46where
47    T: Iterator<Item = Event<'a>>,
48{
49    /// Returns a new instance that extracts frontmatter from the provided
50    /// [`Event`] iterator.
51    pub fn new(parser: T) -> Self {
52        Self {
53            source: parser,
54            frontmatter: None,
55            state: DocumentAttributeParserState::Parsing,
56        }
57    }
58
59    fn frontmater_mut(&mut self) -> &mut Frontmatter<'a> {
60        if self.frontmatter.is_none() {
61            self.frontmatter = Some(Frontmatter {
62                title: None,
63                code_block: None,
64            });
65        }
66
67        self.frontmatter.as_mut().expect("always initialized")
68    }
69
70    /// Scans the start of the document looking for [`Frontmatter`]. If
71    /// frontmatter is detected, it will be returned.
72    ///
73    /// This function will not consume the original iterator completely. It will
74    /// stop as soon as it is done detecting the frontmatter.
75    pub fn extract(mut self) -> Option<Frontmatter<'a>> {
76        while let Some(_) = self.next() {
77            if matches!(self.state, DocumentAttributeParserState::InDocument) {
78                break;
79            }
80        }
81
82        self.frontmatter
83    }
84
85    /// Scans the start of the document looking for [`Frontmatter`]. If
86    /// frontmatter is detected, it will be returned.
87    ///
88    /// The underlying iterator will be advanced to find the frontmatter, and
89    /// any [`Event`]s that would normally be returned will be buffered so that
90    /// they can still be returned from [`FrontmatterExtractor::next()`].
91    pub fn extract_buffered(&mut self) -> Option<&Frontmatter<'a>> {
92        let mut buffered_events = Vec::new();
93        while let Some(event) = self.next() {
94            buffered_events.push(event);
95            if self.extracted() {
96                break;
97            }
98        }
99
100        self.state = DocumentAttributeParserState::InDocumentBuffered(buffered_events.into_iter());
101
102        self.frontmatter.as_ref()
103    }
104
105    /// Returns true once the extractor is finished extracting the frontmatter.
106    ///
107    /// [`self.frontmatter`](Self::frontmatter) may not contain the full data
108    /// from the underlying document until this function returns true.
109    #[must_use]
110    pub const fn extracted(&self) -> bool {
111        matches!(self.state, DocumentAttributeParserState::InDocument)
112    }
113}
114
115impl<'a> FrontmatterExtractor<'a, pulldown_cmark::Parser<'a, DefaultBrokenLinkCallback>> {
116    /// Returns an instance that parses `markdown` with the default
117    /// [`pulldown_cmark::Parser`].
118    #[must_use]
119    pub fn from_markdown(markdown: &'a str) -> Self {
120        Self::new(pulldown_cmark::Parser::new(markdown))
121    }
122}
123
124impl<'a, T> Iterator for FrontmatterExtractor<'a, T>
125where
126    T: Iterator<Item = Event<'a>>,
127{
128    type Item = Event<'a>;
129
130    fn next(&mut self) -> Option<Self::Item> {
131        match &mut self.state {
132            DocumentAttributeParserState::InDocumentBuffered(buffered) => {
133                if let Some(event) = buffered.next() {
134                    return Some(event);
135                }
136
137                self.state = DocumentAttributeParserState::InDocument;
138                return self.source.next();
139            }
140            DocumentAttributeParserState::InDocument => return self.source.next(),
141            _ => {}
142        }
143
144        loop {
145            match self.source.next()? {
146                Event::Text(text) if self.state.in_document_title() => {
147                    self.frontmater_mut().title_mut().push_str(&text);
148                    return Some(Event::Text(text));
149                }
150                Event::Text(text) if self.state.in_code() => {
151                    let language = match self.state.code_block_kind() {
152                        CodeBlockKind::Indented => None,
153                        CodeBlockKind::Fenced(language) => Some(language),
154                    };
155                    let frontmatter = self.frontmater_mut();
156                    frontmatter.code_block = Some(CodeBlock {
157                        source: text,
158                        language,
159                    });
160                }
161                Event::Start(pulldown_cmark::Tag::Heading {
162                    level: pulldown_cmark::HeadingLevel::H1,
163                    id,
164                    classes,
165                    attrs,
166                }) if !self.state.in_document() => {
167                    self.state = DocumentAttributeParserState::InTitle;
168                    return Some(Event::Start(pulldown_cmark::Tag::Heading {
169                        level: pulldown_cmark::HeadingLevel::H1,
170                        id,
171                        classes,
172                        attrs,
173                    }));
174                }
175                Event::End(pulldown_cmark::TagEnd::Heading (
176                    pulldown_cmark::HeadingLevel::H1,
177                )) if !self.state.in_document() => {
178                    self.state = DocumentAttributeParserState::Parsing;
179                    return Some(Event::End(pulldown_cmark::TagEnd::Heading (
180                        pulldown_cmark::HeadingLevel::H1,
181                    )));
182                }
183                Event::Start(pulldown_cmark::Tag::CodeBlock(kind)) if !self.state.in_document() => {
184                    self.state = DocumentAttributeParserState::InAttributeCodeBlock(kind);
185                }
186                Event::End(pulldown_cmark::TagEnd::CodeBlock) if !self.state.in_document() => {
187                    self.state = DocumentAttributeParserState::InDocument;
188                }
189                other => {
190                    if !self.state.in_document_title() {
191                        self.state = DocumentAttributeParserState::InDocument;
192                    }
193
194                    return Some(other);
195                }
196            }
197        }
198    }
199}
200
/// Tracks where a [`FrontmatterExtractor`] is while scanning the document.
enum DocumentAttributeParserState<'a> {
    /// Still looking for frontmatter. This is the initial state, and the state
    /// restored once the top-level heading ends.
    Parsing,
    /// Between the start and end of the top-level (h1) heading.
    InTitle,
    /// Inside the frontmatter code block; carries the kind of that block.
    InAttributeCodeBlock(CodeBlockKind<'a>),
    /// Events buffered by `extract_buffered` that are replayed before reading
    /// from the source again.
    InDocumentBuffered(vec::IntoIter<Event<'a>>),
    /// Frontmatter detection is over; events come straight from the source.
    InDocument,
}
208
209impl<'a> DocumentAttributeParserState<'a> {
210    pub fn in_document(&self) -> bool {
211        matches!(self, Self::InDocument)
212    }
213
214    pub fn in_code(&self) -> bool {
215        matches!(self, Self::InAttributeCodeBlock(_))
216    }
217
218    pub fn code_block_kind(&self) -> CodeBlockKind<'a> {
219        if let Self::InAttributeCodeBlock(kind) = self {
220            kind.clone()
221        } else {
222            CodeBlockKind::Indented
223        }
224    }
225
226    pub fn in_document_title(&self) -> bool {
227        matches!(self, Self::InTitle)
228    }
229}
230
/// Metadata stored within a Markdown document.
///
/// Both fields are optional: a document may begin with a top-level heading, a
/// code block, both, or neither.
#[derive(Debug, Clone)]
pub struct Frontmatter<'a> {
    /// The top-level heading's plain-text contents, if the document began with
    /// a top-level heading.
    pub title: Option<String>,
    /// The frontmatter code block, if detected.
    pub code_block: Option<CodeBlock<'a>>,
}
240
241impl<'a> Frontmatter<'a> {
242    fn title_mut(&mut self) -> &mut String {
243        if self.title.is_none() {
244            self.title = Some(String::new());
245        }
246
247        self.title.as_mut().expect("always initialized")
248    }
249}
250
/// A code block from a Markdown document's [`Frontmatter`].
#[derive(Clone, Debug)]
pub struct CodeBlock<'a> {
    /// The contents of the code block.
    pub source: CowStr<'a>,
    /// The language of the code block, which is the identifier following the
    /// three backticks in a fenced Markdown code block. This is `None` for an
    /// indented code block.
    pub language: Option<CowStr<'a>>,
}
260
/// A leading h1 followed by a fenced TOML block is captured as frontmatter,
/// and only the code block is removed from the rendered HTML.
#[test]
fn attribute_parser_test() {
    #[derive(serde::Serialize, serde::Deserialize, Debug)]
    struct Attributes {
        hello: String,
    }

    let markdown = r#"# My **Document**

```toml
hello = "world"
```

This is regular text
"#;

    // Render through the extractor so it sees every event.
    let mut extractor = FrontmatterExtractor::from_markdown(markdown);
    let mut rendered = String::new();
    pulldown_cmark::html::push_html(&mut rendered, &mut extractor);
    assert_eq!(
        rendered,
        "<h1>My <strong>Document</strong></h1>\n<p>This is regular text</p>\n"
    );

    let Frontmatter { title, code_block } = extractor
        .frontmatter
        .expect("frontmatter not detected");

    // Inline formatting is dropped from the title; only the text remains.
    assert_eq!(title.as_deref(), Some("My Document"));

    let CodeBlock { source, language } = code_block.expect("code block not detected");
    assert_eq!(language, Some(CowStr::from("toml")));

    let attributes: Attributes = toml::from_str(&source).unwrap();
    assert_eq!(attributes.hello, "world");
}
293
/// `extract_buffered` yields the frontmatter up front, and the events it
/// consumed are still rendered afterwards.
#[test]
fn extract_buffered() {
    let mut extractor =
        FrontmatterExtractor::from_markdown("# Heading\n\n    hello world\n\nBody");

    let detected = extractor.extract_buffered().unwrap();
    assert_eq!(detected.title.as_deref(), Some("Heading"));
    let block = detected.code_block.as_ref().unwrap();
    assert_eq!(block.source.as_ref(), "hello world\n");

    // The heading was buffered, so it must still appear in the output.
    let mut rendered = String::new();
    pulldown_cmark::html::push_html(&mut rendered, extractor);
    assert_eq!(rendered, "<h1>Heading</h1>\n<p>Body</p>\n");
}
307
/// An indented (non-fenced) code block after the h1 is also treated as
/// frontmatter, with no language attached.
#[test]
fn indented_parse_test() {
    #[derive(serde::Serialize, serde::Deserialize, Debug)]
    struct Attributes {
        hello: String,
    }

    let markdown = r#"# My **Document**

    hello = "world"

This is regular text
"#;

    // Render through the extractor so it sees every event.
    let mut extractor = FrontmatterExtractor::from_markdown(markdown);
    let mut rendered = String::new();
    pulldown_cmark::html::push_html(&mut rendered, &mut extractor);
    assert_eq!(
        rendered,
        "<h1>My <strong>Document</strong></h1>\n<p>This is regular text</p>\n"
    );

    let Frontmatter { title, code_block } = extractor
        .frontmatter
        .expect("frontmatter not detected");
    assert_eq!(title.as_deref(), Some("My Document"));

    let CodeBlock { source, language } = code_block.expect("code block not detected");
    assert_eq!(language, None);

    let attributes: Attributes = toml::from_str(&source).unwrap();
    assert_eq!(attributes.hello, "world");
}