concisemark/
lib.rs

1//! # ConciseMark - a simplified markdown parsing library
2//!
//! ConciseMark can render markdown into an HTML or LaTeX page, for example
4//!
5//!     use concisemark::Page;
6//!
7//!     let content = "# Title";
8//!     let page = Page::new(content);
9//!     let html = page.render();
10//!
11//! The output html will be
12//!
13//! ```text
14//! <div><h1>Title</h1></div>
15//! ```
16//!
17//! The outermost `div` is the root of the rendered html page.
18//!
//! If you want to render the markdown into a pretty PDF document, you may be interested in
//! [`Page::render_latex`]; take a look at it!
21//!
22//! ## Hook
23//!
//! [`Page`] maintains an AST structure which you can use to hook the nodes you are
//! interested in; please see its documentation for more information.
26//!
27pub mod meta;
28pub mod node;
29mod parser;
30mod render;
31pub mod token;
32pub mod utils;
33
34use meta::Meta;
35use node::Node;
36use parser::Parser;
37
/// Options attached to a page.
///
/// Currently an empty placeholder reserved for future usage. The standard
/// traits are derived so that callers can build it with
/// `PageOptions::default()`, clone it, and compare it.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct PageOptions {}
41
/// A markdown page
pub struct Page {
    /// Meta information for the page, such as author, tags ...
    pub meta: Option<Meta>,
    /// Page AST (abstract syntax tree), see [`Page::transform`] to learn how to modify it
    pub ast: Node,
    /// The markdown file content (with `meta` stripped). `ast` does not store any text but only
    /// node ranges, so `content` is necessary to retrieve node text with `ast` information.
    pub content: String,
    /// Page options, a placeholder for future usage
    pub options: Option<PageOptions>,
}
54
55impl Page {
56    /// Create a new markdown page from `content`
57    pub fn new<S: AsRef<str>>(content: S) -> Self {
58        let (meta, ast, content) = Parser::new(content).parse();
59        Self {
60            meta,
61            ast,
62            content,
63            options: None,
64        }
65    }
66
67    pub fn with_options(mut self, options: PageOptions) -> Self {
68        self.options = Some(options);
69        self
70    }
71
72    /// Render markdown into HTML page
73    ///
74    ///     use concisemark::Page;
75    ///
76    ///     let content = "# Title";
77    ///     let page = Page::new(content);
78    ///     let html = page.render();
79    ///
80    /// The output html will be
81    ///
82    /// ```text
83    /// <div><h1>Title</h1></div>
84    /// ```
85    pub fn render(&self) -> String {
86        self.render_with_hook(&|_| None)
87    }
88
89    /// Render markdown into XeLaTex source
90    ///
91    /// Note that latex can not embed image from url, you must download the image and fix the
92    /// image path to generate a working tex file, the following is a dirty and quick example.
93    ///
94    ///     use concisemark::Page;
95    ///     use concisemark::node::Node;
96    ///     use concisemark::node::NodeTagName;
97    ///     use concisemark::utils;
98    ///
99    ///     use std::fs::OpenOptions;
100    ///     use std::process::Command;
101    ///     use std::io::Write;
102    ///
103    ///     use indoc::indoc;
104    ///
105    ///     let content = indoc! {r#"
106    ///         ![animal-online](https://cn.bing.com/th?id=OHR.NorwayMuskox_EN-CN7806818932_1920x1080.jpg&w=720)
107    ///
108    ///         ![animal-offlie](assets/th.jpg)
109    ///     "#
110    ///     };
111    ///     let manifest_dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR"));
112    ///     let draft_dir = manifest_dir.join("draft");
113    ///     std::fs::create_dir_all(draft_dir.as_path()).unwrap();
114    ///
115    ///     let page = Page::new(content);
116    ///     let hook = |node: &Node| -> Result<(), ()> {
117    ///         let mut nodedata = node.data.borrow_mut();
118    ///         if nodedata.tag.name == NodeTagName::Image {
119    ///             let src = nodedata.tag.attrs.get("src").unwrap().to_owned();
120    ///             let name = nodedata.tag.attrs.get("name").unwrap().to_owned();
121    ///             let output_path;
122    ///             if src.starts_with("https://") || src.starts_with("http://") {
123    ///                 output_path = utils::download_image_fs(src, draft_dir.as_path(), name).unwrap();
124    ///             } else {
125    ///                 output_path = manifest_dir.join(src);
126    ///             }
127    ///             nodedata.tag.attrs.insert("src".to_owned(), format!("{}", output_path.display()));
128    ///         }
129    ///         Ok(())
130    ///     };
131    ///     page.transform(hook);
132    ///
133    ///     let setup = include_str!("../assets/setup.tex");
134    ///     let wanted = indoc! {r#"
135    ///         \begin{document}
136    ///         \begin{figure}[H]
137    ///         \centerline{\includegraphics[width=0.7\textwidth]{PLACEHOLDER_ONLINE}}
138    ///         \caption{animal-online}
139    ///         \end{figure}
140    ///         \begin{figure}[H]
141    ///         \centerline{\includegraphics[width=0.7\textwidth]{PLACEHOLDER_OFFLINE}}
142    ///         \caption{animal-offlie}
143    ///         \end{figure}
144    ///         \end{document}
145    ///     "#};
146    ///     let wanted = wanted.replace(
147    ///         "PLACEHOLDER_ONLINE",
148    ///         &format!("{}", manifest_dir.join("draft").join("animal-online.jpg").display())
149    ///     ).replace(
150    ///         "PLACEHOLDER_OFFLINE",
151    ///         &format!("{}", manifest_dir.join("assets").join("th.jpg").display())
152    ///     );
153    ///     let pagesrc = &page.render_latex()[setup.len()..];
154    ///     assert_eq!(wanted.trim(), pagesrc.trim());
155    ///
156    ///     let latex = page.render_latex();
157    ///     let texfile = draft_dir.join("output.tex");
158    ///     let mut f = OpenOptions::new().truncate(true).write(true).create(true).open(&texfile).unwrap();
159    ///     f.write(latex.as_bytes()).unwrap();
160    ///     let mut cmd = Command::new("xelatex");
161    ///     cmd.current_dir(&draft_dir);
162    ///     cmd.arg(&texfile);
163    ///     _ = cmd.output();
164    pub fn render_latex(&self) -> String {
165        let mut page = include_str!("../assets/setup.tex").to_owned();
166        let mut document = render::latex::Cmd::new("document").enclosed();
167        if let Some(meta) = &self.meta {
168            let title =
169                render::latex::Cmd::new("title").with_posarg(&meta.title);
170            document.append_cmd(&title);
171            if let Some(authors) = &meta.authors {
172                let authors = render::latex::Cmd::new("author")
173                    .with_posarg(authors.join(", "));
174                document.append_cmd(&authors);
175            }
176            let date = render::latex::Cmd::new("date")
177                .with_posarg(meta.date.to_string());
178            document.append_cmd(&date);
179            let maketitle = render::latex::Cmd::new("maketitle");
180            document.append_cmd(&maketitle);
181        }
182        document
183            .append(render::latex::generate(&self.ast, self.content.as_str()));
184        page.push_str(&document.to_string());
185        page
186    }
187
188    /// Render markdown into HTML page with hook
189    ///
190    /// If the hook returns None, then the default rendering function will be used or else
191    /// use the returned value as render result.
192    pub fn render_with_hook<F>(&self, hook: &F) -> String
193    where
194        F: Fn(&Node) -> Option<String>,
195    {
196        render::html::generate(&self.ast, self.content.as_str(), Some(hook))
197    }
198
199    /// Modify markdown AST node with hook.
200    ///
201    /// The error status of the hook function (when returns an Err) will not stop the transform
202    /// process, instead it will print the error as a log message.
203    ///
204    /// The following is an exmaple to change image url
205    ///
206    ///     use concisemark::node::{Node, NodeTagName};
207    ///     use concisemark::Page;
208    ///
209    ///     let content = "![imgs](/path/to/image.jpg)";
210    ///     let page = Page::new(content);
211    ///     let hook = |node: &Node| -> Result<(), ()> {
212    ///         let mut nodedata = node.data.borrow_mut();
213    ///         if nodedata.tag.name == NodeTagName::Image {
214    ///             let src = nodedata.tag.attrs.get("src").unwrap().to_owned();
215    ///             let src = if src.starts_with("/") {
216    ///                 format!("https://example.com{src}")
217    ///             } else {
218    ///                 format!("https://example.com/{src}")
219    ///             };
220    ///             nodedata.tag.attrs.insert("src".to_owned(), src);
221    ///         }
222    ///         Ok(())
223    ///     };
224    ///     let img = &page.ast.children()[0].children()[0];
225    ///     assert_eq!(img.data.borrow().tag.attrs.get("src").map(|s| s.as_str()), Some("/path/to/image.jpg"));
226    ///     page.transform(hook);
227    ///     assert_eq!(img.data.borrow().tag.attrs.get("src").map(|s| s.as_str()), Some("https://example.com/path/to/image.jpg"));
228    ///
229    pub fn transform<F, E>(&self, hook: F)
230    where
231        F: Fn(&Node) -> Result<(), E>,
232    {
233        self.ast.transform::<F, E>(&hook)
234    }
235}
236
237#[cfg(test)]
238mod tests {
239    use html5ever::{
240        driver::ParseOpts, local_name, namespace_url, ns, parse_fragment,
241        tendril::TendrilSink, tree_builder::TreeSink, QualName,
242    };
243    use indoc::indoc;
244    use markup5ever_rcdom::{Handle, NodeData, RcDom};
245    use node::NodeTagName;
246
247    use crate::*;
248
249    fn is_self_closing_tag(tag: &str) -> bool {
250        let self_closing_tag_list = vec![
251            // svg tags
252            "circle", "ellipse", "line", "path", "polygon", "polyline", "rect",
253            "stop", "use", // void tags
254            "area", "base", "br", "col", "command", "embed", "hr", "img",
255            "input", "keygen", "link", "meta", "param", "source", "track",
256            "wbr",
257        ];
258        self_closing_tag_list.iter().any(|&i| i == tag)
259    }
260
261    fn get_html_outline(dirty_html: &str) -> String {
262        fn walker(indent: usize, node: &Handle) -> String {
263            let indentstr = " ".repeat(indent);
264            let mut outline = indentstr.to_string();
265            if let NodeData::Element { ref name, .. } = node.data {
266                if is_self_closing_tag(&name.local) {
267                    outline += &format!("<{}", name.local);
268                } else {
269                    outline += &format!("<{}>\n", name.local);
270                }
271            }
272
273            for child in node.children.borrow().iter() {
274                if let NodeData::Element { .. } = child.data {
275                    outline += &walker(indent + 2, child);
276                }
277            }
278
279            if let NodeData::Element { ref name, .. } = node.data {
280                if is_self_closing_tag(&name.local) {
281                    outline += "/>\n";
282                } else {
283                    outline += &format!("{}</{}>\n", indentstr, name.local);
284                }
285            }
286
287            outline
288        }
289
290        let parser = parse_fragment(
291            RcDom::default(),
292            ParseOpts::default(),
293            QualName::new(None, ns!(html), local_name!("body")),
294            vec![],
295        );
296        let mut dom = parser.one(dirty_html);
297        let html = dom.get_document();
298        let body = &html.children.borrow()[0];
299        let mut outline = String::new();
300        for child in body.children.borrow().iter() {
301            outline += &walker(0, child);
302        }
303        outline
304    }
305
306    #[test]
307    fn test_heading() {
308        let tcases = [
309            ("# title", "1"),
310            ("## title", "2"),
311            ("### title", "3"),
312            ("#### title", "4"),
313            ("##### title", "5"),
314            ("###### title", "6"),
315            ("####### title", "6"),
316        ];
317        for (content, level) in tcases {
318            let page = Page::new(content);
319            let ast = page.ast.data.borrow();
320            assert_eq!(ast.tag.name, NodeTagName::Section);
321            assert_eq!(ast.children[0].borrow().tag.name, NodeTagName::Heading);
322            assert_eq!(
323                ast.children[0]
324                    .borrow()
325                    .tag
326                    .attrs
327                    .get("level")
328                    .map(|s| s.as_str()),
329                Some(level)
330            );
331        }
332    }
333
334    #[test]
335    fn test_list() {
336        let content = indoc! {r#"
337        - [nvim](https://neovim.io/) >= 0.7.0
338
339            nvim is great!
340
341        - [rust](https://www.rust-lang.org/tools/install) >= 1.64
342        "#};
343
344        let page = Page::new(content);
345        let html = page.render();
346        let outline = get_html_outline(html.as_str());
347        assert_eq!(
348            outline,
349            indoc! {r#"
350            <div>
351              <ul>
352                <li>
353                  <a>
354                  </a>
355                  <p>
356                  </p>
357                </li>
358                <li>
359                  <a>
360                  </a>
361                </li>
362              </ul>
363            </div>
364        "#}
365        );
366    }
367
368    #[test]
369    fn test_meta() {
370        let meta = r#"
371<!---
372title = "title"
373subtitle = "subtitle"
374date = "2023-08-27 10:39:05"
375authors = ["example <example@gmail>"]
376tags = []
377-->
378
379example
380
381"#;
382        let page = Page::new(meta);
383        assert!(page.meta.is_some());
384        let meta = page.meta.clone().unwrap();
385        assert_eq!(meta.title, "title");
386        assert_eq!(meta.subtitle, Some("subtitle".to_owned()));
387        assert_eq!(
388            format!("{}", meta.date.format("%Y-%m-%d %H:%M:%S")),
389            "2023-08-27 10:39:05"
390        );
391    }
392
393    #[test]
394    fn test_emphasis() {
395        let content = indoc! {r#"
396        This is a sentence with emphasis *itaclics* and **bold**.
397        "#};
398        let page = Page::new(content);
399        let html = page.render();
400        let wanted_html = indoc! {r#"
401        <div><p>This is a sentence with emphasis <em> itaclics </em>and <strong> bold </strong>. </p></div>
402        "#};
403        assert_eq!(html, wanted_html.trim());
404
405        let content = include_str!("../testdata/emphasis_01.md");
406        let page = Page::new(content);
407        let html = page.render();
408        assert_eq!(html, include_str!("../testdata/emphasis_01.html").trim());
409    }
410    #[test]
411    fn test_backquote_00() {
412        let content = include_str!("../testdata/backquote_00.md");
413        let page = Page::new(content);
414        let html = page.render();
415        let wanted_html = "<div><blockquote><p>a simple blockquote with very long body really long body ... </p></blockquote></div>";
416        assert_eq!(html, wanted_html);
417    }
418
419    #[test]
420    fn test_backquote_01() {
421        let content = include_str!("../testdata/backquote_01.md");
422        let page = Page::new(content);
423        let html = page.render();
424        let wanted_html = "<div><ul><li>title <blockquote><p>a simple line <br/>abc <strong> line </strong> <em> line </em>test </p></blockquote></li></ul></div>";
425        assert_eq!(html, wanted_html);
426    }
427
428    #[test]
429    fn test_backquote_02() {
430        let content = include_str!("../testdata/backquote_02.md");
431        let wanted_html = "<div><blockquote><p>a simple line <br/>line test </p></blockquote></div>";
432        let page = Page::new(content);
433        let html = page.render();
434        assert_eq!(html, wanted_html.trim());
435    }
436
437    #[test]
438    fn test_backquote_rich() {
439        let content = indoc! {r#"
440        > a simple line
441        >
442        > abc **line**
443        > *line*
444        test
445        "#};
446        let wanted_html = indoc! {r#"
447        <div><blockquote><p>a simple line <br/>abc <strong> line </strong> <em> line </em>test </p></blockquote></div>
448        "#};
449        let page = Page::new(content);
450        let html = page.render();
451        assert_eq!(html, wanted_html.trim());
452    }
453
454    #[test]
455    fn test_backquote_unicode() {
456        let content = indoc! {r#"
457        这是摘要
458
459        >测试
460        >
461        > 再次测试
462        "#};
463        let wanted_html = indoc! {r#"
464        <div><p>这是摘要</p><blockquote><p>测试<br/>再次测试</p></blockquote></div>
465        "#};
466        let page = Page::new(content);
467        let html = page.render();
468        assert_eq!(html, wanted_html.trim());
469    }
470
471    #[test]
472    fn test_para_ending_whitesapce_00() {
473        // require space between `2008 年` and `8 月 8 日`
474        let content = include_str!("../testdata/para_ending_whitespace_00.md");
475        let wanted_html = indoc! {r#"
476        <div><p>北京奥运会开幕式时间为 2008 年 8 月 8 日</p></div>
477        "#};
478        let page = Page::new(content);
479        let html = page.render();
480        assert_eq!(html, wanted_html.trim());
481    }
482
483    #[test]
484    fn test_para_ending_whitesapce_01() {
485        // require no space between `这是一段长` and `文本`, and
486        // no space between `这是一段引用` and `文本`
487        let content = include_str!("../testdata/para_ending_whitespace_01.md");
488        let wanted_html = indoc! {r#"
489        <div><p>这是一段长文本</p><blockquote><p>这是一段引用文本</p></blockquote></div>
490        "#};
491        let page = Page::new(content);
492        let html = page.render();
493        assert_eq!(html, wanted_html.trim());
494    }
495
496    #[test]
497    fn test_math_mode() {
498        let content = include_str!("../testdata/math_mode.md");
499        let page = Page::new(content);
500        let nodes = node::find_nodes_by_tag(&page.ast, node::NodeTagName::Math);
501        assert_eq!(nodes.len(), 2);
502        assert!(nodes[0].is_inlined(content));
503        assert!(!nodes[1].is_inlined(content));
504    }
505
506    #[test]
507    fn test_zh_cn_hybrid_in_para() {
508        let content = include_str!("../testdata/zh_cn_hybrid_in_para.md");
509        let page = Page::new(content);
510        let html = page.render();
511        assert_eq!(html, "<div><p>这是 2 根韭菜</p></div>");
512    }
513
514    #[test]
515    fn test_codeblock_00() {
516        let content = include_str!("../testdata/codeblock_00.md");
517        let page = Page::new(content);
518        let html = page.render();
519        assert_eq!(
520            html,
521            include_str!("../testdata/codeblock_00.html").trim_end()
522        );
523    }
524
525    #[test]
526    fn test_html_char_escape_00() {
527        let content = include_str!("../testdata/html_char_escape_00.md");
528        let page = Page::new(content);
529        let html = page.render();
530        assert_eq!(
531            html,
532            include_str!("../testdata/html_char_escape_00.html").trim_end()
533        );
534    }
535
536    #[test]
537    fn test_heading_00() {
538        let content = include_str!("../testdata/heading_00.md");
539        let page = Page::new(content);
540        let html = page.render();
541        assert_eq!(
542            html,
543            include_str!("../testdata/heading_00.html").trim_end()
544        );
545    }
546}