lssg_lib/lmarkdown/
lexer.rs

1use std::{
2    collections::{BTreeMap, HashMap},
3    io::Read,
4};
5
6use log::warn;
7
8use crate::{
9    char_reader::CharReader,
10    html::{self, parse_html_block},
11    parse_error::ParseError,
12};
13
/// Remove any trailing new line and strip leading/trailing spaces from every
/// line, dropping lines that become empty after trimming.
fn sanitize_text(text: String) -> String {
    text.lines()
        .map(str::trim)
        .filter(|line| !line.is_empty())
        .collect::<Vec<_>>()
        .join("\n")
}
26
27fn read_inline_tokens(text: &String) -> Result<Vec<Token>, ParseError> {
28    let mut reader = CharReader::<&[u8]>::from_string(text);
29
30    let mut tokens = vec![];
31    while let Some(c) = reader.peek_char(0)? {
32        if c == '<' {
33            // inline comment
34            if let Some(text) = reader.peek_until_match_inclusive("-->")? {
35                reader.consume(4)?; // skip start
36                let text = reader.consume_string(text.len() - 4 - 3)?;
37                reader.consume(3)?; // skip end
38                tokens.push(Token::Comment { raw: text });
39                continue;
40            }
41
42            if let Some((tag, attributes, content)) = html::parse_html_block(&mut reader)? {
43                let content = sanitize_text(content);
44                tokens.push(Token::Html {
45                    tag,
46                    attributes,
47                    tokens: read_inline_tokens(&content)?,
48                });
49                continue;
50            }
51        }
52
53        // https://spec.commonmark.org/0.30/#links
54        if c == '[' {
55            if let Some(raw_text) = reader.peek_until_from(1, |c| c == ']')? {
56                let href_start = 1 + raw_text.len();
57                if let Some('(') = reader.peek_char(href_start)? {
58                    if let Some(raw_href) = reader.peek_until_from(href_start + 1, |c| c == ')')? {
59                        reader.consume(1)?;
60                        let text = reader.consume_string(raw_text.len() - 1)?;
61                        reader.consume(2)?;
62                        let href = reader.consume_string(raw_href.len() - 1)?;
63                        reader.consume(1)?;
64                        let text = read_inline_tokens(&text)?;
65                        tokens.push(Token::Link { tokens: text, href });
66                        continue;
67                    }
68                }
69            }
70        }
71
72        if c == '*' {}
73
74        let c = reader.consume_char().unwrap().expect("has to be a char");
75        if let Some(Token::Text { text }) = tokens.last_mut() {
76            text.push(c)
77        } else {
78            tokens.push(Token::Text { text: c.into() })
79        }
80    }
81
82    return Ok(tokens);
83}
84
85// official spec: https://spec.commonmark.org/0.30/
86// https://github.com/markedjs/marked/blob/master/src/Lexer.ts
87// https://github.com/songquanpeng/md2html/blob/main/lexer/lexer.go
88// demo: https://marked.js.org/demo/
89// demo: https://spec.commonmark.org/dingus/
90/// A function to get the next markdown token using recrusive decent.
91/// Will first parse a block token (token for a whole line and then parse for any inline tokens when needed.
92pub fn read_token(reader: &mut CharReader<impl Read>) -> Result<Token, ParseError> {
93    match reader.peek_char(0)? {
94        None => return Ok(Token::EOF),
95        Some(c) => {
96            // if you start a new block with a newline skip it
97            if c == '\n' {
98                reader.consume_until_inclusive(|c| c == '\n' || c == '\r')?;
99                return Ok(Token::Space);
100            }
101
102            // if starts with comment in toml format it is an attribute
103            if reader.has_read() == false {
104                if c == '<' {
105                    if reader.peek_string(4)? == "<!--" {
106                        if let Some(comment) = reader.peek_until_match_inclusive("-->")? {
107                            match toml::from_str(&comment[4..comment.len() - 3]) {
108                                Ok(toml::Value::Table(table)) => {
109                                    reader.consume_until_inclusive(|c| c == '>')?;
110                                    return Ok(Token::Attributes { table });
111                                }
112                                Ok(_) => warn!("Attributes is not a table"),
113                                Err(e) => warn!("Not parsing possible Attributes: {e}"),
114                            }
115                        }
116                    }
117                }
118                if let Some((tag, attributes, content)) = parse_html_block(reader)? {
119                    let tokens = read_inline_tokens(&content)?;
120                    return Ok(Token::Html {
121                        tag,
122                        attributes,
123                        tokens,
124                    });
125                }
126            }
127
128            // Heading (#*{depth} {text})
129            if c == '#' {
130                let chars: Vec<char> = reader.peek_string(7)?.chars().collect();
131                let mut ignore = false;
132                let mut depth: u8 = 0;
133                for c in chars {
134                    match c {
135                        ' ' => break,
136                        '#' => depth += 1,
137                        _ => ignore = true,
138                    }
139                }
140                if ignore == false {
141                    let text: String = sanitize_text(
142                        reader
143                            .consume_until_inclusive(|c| c == '\n')?
144                            .chars()
145                            .skip(depth as usize + 1)
146                            .collect(),
147                    );
148                    let tokens = read_inline_tokens(&text)?;
149                    return Ok(Token::Heading { depth, tokens });
150                }
151            }
152
153            if c == '<' {
154                // comment
155                if "<!--" == reader.peek_string(4)? {
156                    if let Some(text) = reader.peek_until_match_inclusive("-->")? {
157                        reader.consume(4)?; // skip start
158                        let text = reader.consume_string(text.len() - 4 - 3)?;
159                        reader.consume(3)?; // skip end
160                        return Ok(Token::Comment { raw: text });
161                    }
162                }
163
164                if let Some((tag, attributes, content)) = html::parse_html_block(reader)? {
165                    let content = sanitize_text(content);
166                    let tokens = read_inline_tokens(&content)?;
167                    return Ok(Token::Html {
168                        tag,
169                        attributes,
170                        tokens,
171                    });
172                }
173            }
174
175            // https://spec.commonmark.org/0.30/#paragraphs
176            let text = sanitize_text(reader.consume_until_match_inclusive("\n")?);
177            let tokens = read_inline_tokens(&text)?;
178            return Ok(Token::Paragraph { tokens });
179        }
180    };
181}
182
183/// https://github.com/markedjs/marked/blob/master/src/Tokenizer.js
184#[derive(Debug, Clone, PartialEq)]
185pub enum Token {
186    Attributes {
187        table: toml::map::Map<String, toml::Value>,
188    },
189    Heading {
190        /// 0-6
191        depth: u8,
192        tokens: Vec<Token>,
193    },
194    Html {
195        tag: String,
196        attributes: HashMap<String, String>,
197        tokens: Vec<Token>,
198    },
199    /// Anything that is not an already declared inline element
200    Paragraph {
201        tokens: Vec<Token>,
202    },
203    Bold {
204        text: String,
205    },
206    Italic {
207        text: String,
208    },
209    Code {
210        language: String,
211        code: String,
212    },
213    // Space {
214    //     raw: String,
215    // },
216    Link {
217        /// The text portion of a link that contains Tokens
218        tokens: Vec<Token>,
219        href: String,
220    },
221    Text {
222        text: String,
223    },
224    Comment {
225        raw: String,
226    },
227    Break {
228        raw: String,
229    },
230    /// Indicating of a space between paragraphs
231    Space,
232    EOF,
233}
234
235impl Token {
236    pub fn is_text(&self) -> bool {
237        match self {
238            Token::Heading { .. }
239            | Token::Paragraph { .. }
240            | Token::Bold { .. }
241            | Token::Italic { .. }
242            | Token::Code { .. }
243            | Token::Link { .. }
244            | Token::Text { .. }
245            | Token::Html { .. } => true,
246            _ => false,
247        }
248    }
249}