lssg_lib/lmarkdown/lexer.rs

use std::{collections::HashMap, io::Read};

use log::warn;

use crate::{
    char_reader::CharReader,
    html::{self, parse_html_block},
    parse_error::ParseError,
};

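/// Trims each line, drops blank lines, and rejoins the remainder with `\n`.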
fn sanitize_text(text: String) -> String {
    let mut lines = vec![];
    for line in text.lines() {
        let trimmed = line.trim();
        if !trimmed.is_empty() {
            lines.push(trimmed);
        }
    }
    lines.join("\n")
}

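/// Lexes inline tokens (comments, HTML blocks, `[text](href)` links) inside a
/// block of text; anything unrecognized is accumulated into `Text` tokens.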
fn read_inline_tokens(text: &String) -> Result<Vec<Token>, ParseError> {
    let mut reader = CharReader::<&[u8]>::from_string(text);

    let mut tokens = vec![];
    while let Some(c) = reader.peek_char(0)? {
        if c == '<' {
            if reader.peek_string(4)? == "<!--" {
                if let Some(text) = reader.peek_until_match_inclusive("-->")? {
                    reader.consume(4)?; // skip "<!--"
                    let text = reader.consume_string(text.len() - 4 - 3)?;
                    reader.consume(3)?; // skip "-->"
                    tokens.push(Token::Comment { raw: text });
                    continue;
                }
            }

            if let Some((tag, attributes, content)) = html::parse_html_block(&mut reader)? {
                let content = sanitize_text(content);
                tokens.push(Token::Html {
                    tag,
                    attributes,
                    tokens: read_inline_tokens(&content)?,
                });
                continue;
            }
        }

        if c == '[' {
            // Try to lex a `[text](href)` link.
            if let Some(raw_text) = reader.peek_until_from(1, |c| c == ']')? {
                let href_start = 1 + raw_text.len();
                if let Some('(') = reader.peek_char(href_start)? {
                    if let Some(raw_href) = reader.peek_until_from(href_start + 1, |c| c == ')')? {
                        reader.consume(1)?; // skip '['
                        let text = reader.consume_string(raw_text.len() - 1)?;
                        reader.consume(2)?; // skip "]("
                        let href = reader.consume_string(raw_href.len() - 1)?;
                        reader.consume(1)?; // skip ')'
                        let text = read_inline_tokens(&text)?;
                        tokens.push(Token::Link { tokens: text, href });
                        continue;
                    }
                }
            }
        }

        // TODO: '*' emphasis (Bold/Italic) is not lexed yet.

        // Anything else is plain text; extend the previous Text token if
        // there is one.
        let c = reader.consume_char()?.expect("has to be a char");
        if let Some(Token::Text { text }) = tokens.last_mut() {
            text.push(c)
        } else {
            tokens.push(Token::Text { text: c.into() })
        }
    }

    Ok(tokens)
}

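/// Reads the next block-level token from `reader`: runs of newlines
/// (`Space`), a leading `<!-- ... -->` comment whose body is a TOML table
/// (`Attributes`, e.g. a file starting with `<!-- title = "Home" -->`),
/// HTML blocks, `#` headings, comments, and paragraphs; returns `EOF` once
/// the input is exhausted.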
pub fn read_token(reader: &mut CharReader<impl Read>) -> Result<Token, ParseError> {
    match reader.peek_char(0)? {
        None => Ok(Token::EOF),
        Some(c) => {
            if c == '\n' {
                reader.consume_until_inclusive(|c| c == '\n' || c == '\r')?;
                return Ok(Token::Space);
            }

            // Tokens that are only recognized before anything has been read.
            if !reader.has_read() {
                if c == '<' && reader.peek_string(4)? == "<!--" {
                    if let Some(comment) = reader.peek_until_match_inclusive("-->")? {
                        // A leading comment whose body parses as a TOML table
                        // becomes an Attributes token.
                        match toml::from_str(&comment[4..comment.len() - 3]) {
                            Ok(toml::Value::Table(table)) => {
                                reader.consume_until_inclusive(|c| c == '>')?;
                                return Ok(Token::Attributes { table });
                            }
                            Ok(_) => warn!("Attributes comment is not a TOML table"),
                            Err(e) => warn!("Could not parse possible attributes comment: {e}"),
                        }
                    }
                }
                if let Some((tag, attributes, content)) = parse_html_block(reader)? {
                    let tokens = read_inline_tokens(&content)?;
                    return Ok(Token::Html {
                        tag,
                        attributes,
                        tokens,
                    });
                }
            }

            if c == '#' {
                // Peek up to 7 chars and count the leading '#'s; any
                // non-'#', non-space char before the first space means this
                // is not a heading.
                let chars: Vec<char> = reader.peek_string(7)?.chars().collect();
                let mut ignore = false;
                let mut depth: u8 = 0;
                for c in chars {
                    match c {
                        ' ' => break,
                        '#' => depth += 1,
                        _ => {
                            ignore = true;
                            break;
                        }
                    }
                }
                if !ignore {
                    // Drop the '#'s and the following space; the rest of the
                    // line is the heading text.
                    let text: String = sanitize_text(
                        reader
                            .consume_until_inclusive(|c| c == '\n')?
                            .chars()
                            .skip(depth as usize + 1)
                            .collect(),
                    );
                    let tokens = read_inline_tokens(&text)?;
                    return Ok(Token::Heading { depth, tokens });
                }
            }

            if c == '<' {
                if reader.peek_string(4)? == "<!--" {
                    if let Some(text) = reader.peek_until_match_inclusive("-->")? {
                        reader.consume(4)?; // skip "<!--"
                        let text = reader.consume_string(text.len() - 4 - 3)?;
                        reader.consume(3)?; // skip "-->"
                        return Ok(Token::Comment { raw: text });
                    }
                }

                if let Some((tag, attributes, content)) = html::parse_html_block(reader)? {
                    let content = sanitize_text(content);
                    let tokens = read_inline_tokens(&content)?;
                    return Ok(Token::Html {
                        tag,
                        attributes,
                        tokens,
                    });
                }
            }

            // Anything else: a paragraph running to the end of the line.
            let text = sanitize_text(reader.consume_until_match_inclusive("\n")?);
            let tokens = read_inline_tokens(&text)?;
            Ok(Token::Paragraph { tokens })
        }
    }
}

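/// A lexed markdown token. `Bold`, `Italic`, `Code`, and `Break` are declared
/// but not yet produced by this lexer (see the `'*'` TODO in
/// `read_inline_tokens`).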
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    Attributes {
        table: toml::map::Map<String, toml::Value>,
    },
    Heading {
        depth: u8,
        tokens: Vec<Token>,
    },
    Html {
        tag: String,
        attributes: HashMap<String, String>,
        tokens: Vec<Token>,
    },
    Paragraph {
        tokens: Vec<Token>,
    },
    Bold {
        text: String,
    },
    Italic {
        text: String,
    },
    Code {
        language: String,
        code: String,
    },
    Link {
        tokens: Vec<Token>,
        href: String,
    },
    Text {
        text: String,
    },
    Comment {
        raw: String,
    },
    Break {
        raw: String,
    },
    Space,
    EOF,
}

impl Token {
    /// Whether this token carries user-visible text content.
    pub fn is_text(&self) -> bool {
        matches!(
            self,
            Token::Heading { .. }
                | Token::Paragraph { .. }
                | Token::Bold { .. }
                | Token::Italic { .. }
                | Token::Code { .. }
                | Token::Link { .. }
                | Token::Text { .. }
                | Token::Html { .. }
        )
    }
}
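
// A minimal smoke test, added as an illustrative sketch: it drives
// `read_token` over a heading followed by a paragraph, using only APIs
// already referenced above (`CharReader::<&[u8]>::from_string`,
// `read_token`). It assumes `parse_html_block` yields `Ok(None)` on
// non-HTML input, as the control flow in `read_token` implies.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn lexes_heading_then_paragraph() {
        let input = String::from("## Title\nSome text\n");
        let mut reader = CharReader::<&[u8]>::from_string(&input);

        // "## " gives a depth-2 heading with the rest of the line as text.
        assert_eq!(
            read_token(&mut reader).unwrap(),
            Token::Heading {
                depth: 2,
                tokens: vec![Token::Text {
                    text: "Title".into()
                }]
            }
        );
        // The following line lexes as a paragraph.
        assert_eq!(
            read_token(&mut reader).unwrap(),
            Token::Paragraph {
                tokens: vec![Token::Text {
                    text: "Some text".into()
                }]
            }
        );
        // The reader is exhausted afterwards.
        assert_eq!(read_token(&mut reader).unwrap(), Token::EOF);
    }
}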