// kami_parser/lexer.rs

/// Kind of a [`Token`] produced while lexing Kami markup.
///
/// The variants documented with a delimiter are the ones `tokenize` in this
/// file creates; the remaining ones are never produced by `tokenize` and are
/// presumably assigned by later parsing stages (confirm against the callers).
#[derive(Clone, Debug)]
pub(crate) enum TokenType {
	/// Plain text; also the class of a freshly created token.
	Put,
	/// `*...*`
	Bold,
	/// `**...**`
	Strong,
	/// `_..._`
	Italic,
	/// `__...__`
	Emphasis,
	/// `[...]`
	LinkName,
	/// `(...)` directly following a `LinkName` token.
	LinkDir,
	/// `{...}`; when a previous token exists the text is stored in that
	/// token's `attributes` instead of being pushed as its own token.
	Attr,
	/// `~...~`
	Sub,
	/// `^...^`
	Sup,
	/// `@...@`
	Span,
	/// `` `...` ``
	Code,
	/// `~~...~~`
	Strike,
	/// `--...--`
	Under,
	/// Run of `#` starting at the very first byte of the input.
	Header,
	/// `<...>`
	Html,
	/// Bullet marker made of `*` characters followed by a space at the start of the input.
	ListEl,
	/// Header marker followed by `.` and a space.
	NumberedListEl,
	/// Escaped `n` (backslash + `n`); the token content is the literal `BR`.
	LineBreak,
	/// Not produced by `tokenize`; `Token::n_para` builds tokens of this class.
	Para,
	/// Not produced by `tokenize` (presumably an unordered list container).
	UList,
	/// Not produced by `tokenize` (presumably an ordered list container).
	OList,
	/// Not produced by `tokenize` (presumably a whole list block).
	ListBlock,
	/// `!...!`
	Image,
	/// Raw text opened by an escaped `=` and closed by an unescaped `=`.
	Raw,
	/// Input that starts with `|`; the rest of the input is kept verbatim.
	TableRow,
	/// Not produced by `tokenize` (presumably one cell of a table row).
	TableCell,
	/// Not produced by `tokenize` (presumably a header cell or row).
	TableHeader,
	/// Not produced by `tokenize` (presumably a whole table).
	Table,
}
33
34#[derive(Clone, Debug)]
35pub(crate) struct Token {
36	pub(crate) class: TokenType,
37	pub(crate) content: String,
38	pub(crate) subtokens: Vec<Token>,
39	pub(crate) attributes: String,
40}
41
42impl Token {
43	pub(crate) fn new() -> Self {
44		Self { class: TokenType::Put, content: String::new(), subtokens: Vec::new(), attributes: String::new() }
45	}
46	pub(crate) fn n_para() -> Self {
47		Self { class: TokenType::Para, content: String::new(), subtokens: Vec::new(), attributes: String::new() }
48	}
49	pub(crate) fn init(class: TokenType, content: String) -> Self {
50		Self { class: class, content: content, subtokens: Vec::new(), attributes: String::new() }
51	}
52	pub(crate) fn init_sub(class: TokenType, tcontent: Vec<Self>, content: String) -> Self {
53		Self { class: class, content: content, subtokens: tcontent, attributes: String::new() }
54	}
55	pub(crate) fn tokenize_content(self: &mut Self, borders: usize) {
56		self.subtokens = tokenize(&self.content[borders..self.content.len()-borders]).0;
57	}
58	pub(crate) fn tokenize_unclosed(self: &mut Self, borders: usize) {
59		self.subtokens = tokenize(&self.content[borders..self.content.len()]).0;
60	}
61}
62
63pub(crate) fn tokenize(input: &str) -> (Vec<Token>, String) {
64	let mut tokens:Vec<Token> = vec![];
65	let mut current_token: Token = Token::new();
66	let mut escaping = false;
67
68	let mut warnings = String::new();
69
70	let mut nlist_wait_space = false;
71
72	let mut strong_wait = false; // Variable used for closing a STRONG token
73	for (pos, cha) in input.char_indices() {
74		if cha == '\\'{
75			if escaping {
76				escaping = false;
77				current_token.content += &cha.to_string();
78			} else {
79				escaping = true;
80				match current_token.class {
81					TokenType::Bold | TokenType::Italic | TokenType::Strong | TokenType::Emphasis | TokenType::LinkName | TokenType::Sub | TokenType::Sup | TokenType::Code | TokenType::Span | TokenType::Under | TokenType::Strike => current_token.content +=  &cha.to_string(),
82					_ => (),
83				}
84			}
85		} else {
86			match current_token.class {
87				TokenType::Put => {
88					match cha {
89						'*' => {
90							if !escaping {
91								push_token(&mut tokens, &current_token);
92								current_token = Token::init(TokenType::Bold, cha.to_string());
93							} else { current_token.content += &cha.to_string(); }
94						},
95						'_' => {
96							if !escaping {
97								push_token(&mut tokens, &current_token);
98								current_token = Token::init(TokenType::Italic, cha.to_string());
99							} else { current_token.content += &cha.to_string(); }
100						},
101						'=' => {
102							if escaping {
103								push_token(&mut tokens, &current_token);
104								current_token = Token::init(TokenType::Raw, String::new());
105							} else { current_token.content += &cha.to_string(); }
106						},
107						'[' => {
108							if !escaping {
109								push_token(&mut tokens, &current_token);
110								current_token = Token::init(TokenType::LinkName, cha.to_string());
111							} else { current_token.content += &cha.to_string(); }
112						},
113						'~' => {
114							if !escaping {
115								push_token(&mut tokens, &current_token);
116								current_token = Token::init(TokenType::Sub, cha.to_string());
117							} else { current_token.content += &cha.to_string(); }
118						},
119						'^' => {
120							if !escaping {
121								push_token(&mut tokens, &current_token);
122								current_token = Token::init(TokenType::Sup, cha.to_string());
123							} else { current_token.content += &cha.to_string(); }
124						},
125						'!' => {
126							if !escaping {
127								push_token(&mut tokens, &current_token);
128								current_token = Token::init(TokenType::Image, cha.to_string());
129							} else { current_token.content += &cha.to_string(); }
130						},
131						'`' => {
132							if !escaping {
133								push_token(&mut tokens, &current_token);
134								current_token = Token::init(TokenType::Code, cha.to_string());
135							} else { current_token.content += &cha.to_string(); }
136						},
137						'@' => {
138							if !escaping {
139								push_token(&mut tokens, &current_token);
140								current_token = Token::init(TokenType::Span, cha.to_string());
141							} else { current_token.content += &cha.to_string(); }
142						},
143						'-' => {
144							if !escaping {
145								push_token(&mut tokens, &current_token);
146								current_token = Token::init(TokenType::Under, cha.to_string());
147							} else { current_token.content += &cha.to_string(); }
148						},
149						'#' => {
150							if pos == 0 { current_token = Token::init(TokenType::Header, cha.to_string()); }
151							else { current_token.content += &cha.to_string(); }
152						},
153						'<' => {
154							if !escaping {
155								push_token(&mut tokens, &current_token);
156								current_token = Token::init(TokenType::Html, cha.to_string());
157							} else { current_token.content += &cha.to_string(); }
158						},
159						'(' => {
160							if !escaping {
161								match tokens.last() {
162									None => current_token.content += &cha.to_string(),
163									Some(last_token) => {
164										match last_token.class {
165											TokenType::LinkName => current_token = Token::init(TokenType::LinkDir, cha.to_string()),
166											_ => current_token.content += &cha.to_string(),
167										}
168									}
169								}
170							} else { current_token.content += &cha.to_string(); }
171						},
172						'{' => {
173							if !escaping {
174								push_token(&mut tokens, &current_token);
175								match tokens.last() {
176									None => current_token = Token::init(TokenType::Attr, cha.to_string()),
177									Some(last_token) => {
178										match last_token.class {
179											TokenType::Put => {
180												tokens.pop();
181												current_token.content += &cha.to_string();
182											},
183											_ => current_token = Token::init(TokenType::Attr, cha.to_string()),
184										}
185									}
186								}
187							} else { current_token.content += &cha.to_string(); }
188						},
189						'|' => {
190							if !escaping {
191								if tokens.is_empty() && current_token.content.is_empty() {
192									push_token(&mut tokens, &current_token);
193									current_token = Token::init(TokenType::TableRow, String::new());
194								} else { current_token.content += &cha.to_string() }
195							} else { current_token.content += &cha.to_string() }
196						}
197						'n' => {
198							if escaping {
199								push_token(&mut tokens, &current_token);
200								current_token = Token::init(TokenType::LineBreak, String::from("BR"));
201								push_token(&mut tokens, &current_token);
202								current_token = Token::new();
203							} else {
204								current_token.content += &cha.to_string();
205							}
206						},
207						_ => current_token.content += &cha.to_string(),
208					}
209				},
210				TokenType::TableRow => current_token.content += &cha.to_string(),
211				TokenType::Raw => {
212					match cha {
213						'=' => {
214							if !escaping {
215								push_token(&mut tokens, &current_token);
216								current_token = Token::new();
217							}
218						},
219						_ => current_token.content += &cha.to_string()
220					}
221				}
222				TokenType::Bold => {
223					current_token.content += &cha.to_string();
224					match cha {
225						'*' => {
226							if current_token.content == "**" && !escaping { current_token.class = TokenType::Strong; }
227							else if !escaping {
228								current_token.tokenize_content(1);
229								push_token(&mut tokens, &current_token);
230								current_token = Token::new();
231							}
232						},
233						' ' => {
234							if current_token.content == "* " && !escaping {
235								if pos != 1 { current_token.class = TokenType::Put;	}
236								else {
237									current_token.class = TokenType::ListEl;
238									push_token(&mut tokens, &current_token);
239									current_token = Token::new();
240								}
241							}
242						},
243						_ => (),
244					}
245				},
246				TokenType::Strong => {
247					current_token.content += &cha.to_string();
248					match cha {
249						'*' => {
250							if !(pos == 2 && current_token.content == "***") {
251								if !escaping && !strong_wait { strong_wait = true; }
252								else if !escaping && strong_wait {
253									current_token.tokenize_content(2);
254									push_token(&mut tokens, &current_token);
255									current_token = Token::new();
256									strong_wait = false;
257								} else { strong_wait = false; }
258							} else {
259								current_token.class = TokenType::ListEl;
260							}
261						},
262						' ' => {
263							if current_token.content == "** " && !escaping {
264								if pos != 2 { current_token.class = TokenType::Put; }
265								else {
266									current_token.class = TokenType::ListEl;
267									push_token(&mut tokens, &current_token);
268									current_token = Token::new();
269								}
270							}
271						},
272						_ => (),
273					}
274				},
275				TokenType::ListEl => {
276					current_token.content += &cha.to_string();
277					match cha {
278						'*' => (),
279						' ' => {
280							push_token(&mut tokens, &current_token);
281							current_token = Token::new();
282						},
283						_ => current_token.class = TokenType::Put,
284					}
285				}
286				TokenType::Italic => {
287					current_token.content += &cha.to_string();
288					match cha {
289						'_' => {
290							if current_token.content == "__" && !escaping { current_token.class = TokenType::Emphasis; }
291							else if !escaping {
292								current_token.tokenize_content(1);
293								push_token(&mut tokens, &current_token);
294								current_token = Token::new();
295							}
296						},
297						' ' => if current_token.content == "_ " && !escaping { current_token.class = TokenType::Put },
298						_ => (),
299					}
300				},
301				TokenType::Emphasis => {
302					current_token.content += &cha.to_string();
303					match cha {
304						'_' => {
305							if !escaping && !strong_wait { strong_wait = true; }
306							else if !escaping && strong_wait {
307								current_token.tokenize_content(2);
308								push_token(&mut tokens, &current_token);
309								current_token = Token::new();
310								strong_wait = false;
311							} else { strong_wait = false; }
312						},
313						' ' => if current_token.content == "__ " && !escaping { current_token.class = TokenType::Put },
314						_ => (),
315					}
316				},
317				TokenType::Sub => {
318					current_token.content += &cha.to_string();
319					match cha {
320						'~' => {
321							if !escaping {
322								if current_token.content == "~~" {
323									current_token.class = TokenType::Strike;
324								} else {
325									current_token.tokenize_content(1);
326									push_token(&mut tokens, &current_token);
327									current_token = Token::new();
328								}
329							} 
330						},
331						' ' => if current_token.content == "~ " && !escaping { current_token.class = TokenType::Put },
332						_ => (),
333					}
334				},
335				TokenType::Image => {
336					current_token.content += &cha.to_string();
337					match cha {
338						'!' => {
339							if !escaping {
340								if current_token.content == "!!" {
341									current_token.class = TokenType::Put;
342								} else {
343									push_token(&mut tokens, &current_token);
344									current_token = Token::new();
345								}
346							}
347						},
348						' ' => {
349							if !escaping {
350								if current_token.content == "! " {
351									current_token.class = TokenType::Put;
352								}
353							}
354						},
355						_ => (),
356					}
357				},
358				TokenType::Sup => {
359					current_token.content += &cha.to_string();
360					match cha {
361						'^' => {
362							if !escaping {
363								current_token.tokenize_content(1);
364								push_token(&mut tokens, &current_token);
365								current_token = Token::new();
366							} 
367						},
368						' ' => if current_token.content == "^ " && !escaping { current_token.class = TokenType::Put },
369						_ => (),
370					}
371				},
372				TokenType::Span => {
373					current_token.content += &cha.to_string();
374					match cha {
375						'@' => {
376							if !escaping {
377								current_token.tokenize_content(1);
378								push_token(&mut tokens, &current_token);
379								current_token = Token::new();
380							} 
381						},
382						' ' => if current_token.content == "@ " && !escaping { current_token.class = TokenType::Put },
383						_ => (),
384					}
385				},
386				TokenType::Code => {
387					current_token.content += &cha.to_string();
388					match cha {
389						'`' => {
390							if !escaping {
391								current_token.tokenize_content(1);
392								push_token(&mut tokens, &current_token);
393								current_token = Token::new();
394							} 
395						},
396						_ => (),
397					}
398				},
399				TokenType::Strike => {
400					current_token.content += &cha.to_string();
401					match cha {
402						'~' => {
403							if !escaping && !strong_wait { strong_wait = true; }
404							else if !escaping && strong_wait {
405								current_token.tokenize_content(2);
406								push_token(&mut tokens, &current_token);
407								current_token = Token::new();
408								strong_wait = false;
409							} else { strong_wait = false; }
410						},
411						' ' => if current_token.content == "~~ " && !escaping { current_token.class = TokenType::Put },
412						_ => (),
413					}
414				},
415				TokenType::Under => {
416					if current_token.content == "-" {
417						current_token.content += &cha.to_string();
418						match cha {
419							'-' => (),
420							_ => current_token.class = TokenType::Put,
421						}
422					}
423					else {
424						current_token.content += &cha.to_string();
425						match cha {
426							'-' => {
427								if !escaping && !strong_wait { strong_wait = true; }
428								else if !escaping && strong_wait {
429									current_token.tokenize_content(2);
430									push_token(&mut tokens, &current_token);
431									current_token = Token::new();
432									strong_wait = false;
433								} else { strong_wait = false; }
434							},
435							' ' => if current_token.content == "-- " && !escaping { current_token.class = TokenType::Put },
436							_ => (),
437						}
438					}
439				},
440				TokenType::Html => {
441					current_token.content += &cha.to_string();
442					match cha {
443						'>' => {
444							if !escaping {
445								current_token.tokenize_content(1);
446								push_token(&mut tokens, &current_token);
447								current_token = Token::new();
448							}
449						},
450						' ' => if current_token.content == "< " && !escaping { current_token.class = TokenType::Put },
451						_ => (),
452					}
453				},
454				TokenType::LinkName => {
455					current_token.content += &cha.to_string();
456					match cha {
457						']' => {
458							if !escaping {
459								current_token.tokenize_content(1);
460								push_token(&mut tokens, &current_token);
461								current_token = Token::new();
462							}
463						},
464						_ => (),
465					}
466				},
467				TokenType::LinkDir => {
468					current_token.content += &cha.to_string();
469					match cha {
470						')' => {
471							if !escaping {
472								push_token(&mut tokens, &current_token);
473								current_token = Token::new();
474							}
475						},
476						_ => (),
477					}
478				},
479				TokenType::Attr => {
480					current_token.content += &cha.to_string();
481					match cha {
482						'}' => {
483							if !escaping {
484								match tokens.last_mut() {
485									None => {
486										push_token(&mut tokens, &current_token);
487									},
488									Some(last_token) =>	{
489										last_token.attributes = current_token.content.clone();
490									},
491								}
492								current_token = Token::new();
493							}
494						},
495						_ => (),
496					}
497				},
498				TokenType::Header => {
499					match cha {
500						'#' => {
501							nlist_wait_space = false;
502							current_token.content += &cha.to_string();
503						},
504						'{' => {
505							nlist_wait_space = false;
506							push_token(&mut tokens, &current_token);
507							current_token = Token::init(TokenType::Attr, cha.to_string());
508						},
509						' ' => {
510							if !nlist_wait_space {
511								push_token(&mut tokens, &current_token);
512								current_token = Token::new();
513							} else {
514								current_token.content += " ";
515								current_token.class = TokenType::NumberedListEl;
516								push_token(&mut tokens, &current_token);
517								current_token = Token::new();
518							}
519							nlist_wait_space = false;
520						},
521						'.' => {
522							if !nlist_wait_space { nlist_wait_space = true; }
523							else { 
524								push_token(&mut tokens, &current_token);
525								current_token = Token::new();
526								current_token.content += &cha.to_string();
527								nlist_wait_space = false;
528							}
529						}
530						_ => {
531							nlist_wait_space = false;
532							push_token(&mut tokens, &current_token);
533							current_token = Token::new();
534							current_token.content += &cha.to_string();
535						}
536					}
537				},
538				_ => panic!("Reached undefined token type {:?}", current_token.class),
539			}
540		}
541		if escaping && cha != '\\' { escaping = false; }
542	}
543	if !current_token.content.is_empty() {
544		match current_token.class {
545			TokenType::Put | TokenType::TableRow => (),
546			_ => warnings += &format!("WARNING: Unclosed {:?} token at {}\n", current_token.class, current_token.content),
547		}
548		match current_token.class {
549			TokenType::Bold | TokenType::Italic | TokenType::Sub | TokenType::Sup | TokenType::LinkName | TokenType::LinkDir | TokenType::Attr | TokenType::Image | TokenType::Html | TokenType::Code | TokenType::Span => {
550				push_token(&mut tokens, &Token::init(TokenType::Put, current_token.content[0..1].to_string()));
551				current_token.tokenize_unclosed(1);
552				tokens.append(&mut current_token.subtokens);
553			},
554			TokenType::Strong | TokenType::Emphasis | TokenType::Strike => {
555				push_token(&mut tokens, &Token::init(TokenType::Put, current_token.content[0..2].to_string()));
556				current_token.tokenize_unclosed(2);
557				tokens.append(&mut current_token.subtokens);
558			},
559			TokenType::Under => {
560				if current_token.content == "-" {
561					push_token(&mut tokens, &Token::init(TokenType::Put, current_token.content));
562					tokens.append(&mut current_token.subtokens);
563				} else {
564					push_token(&mut tokens, &Token::init(TokenType::Put, current_token.content[0..2].to_string()));
565					current_token.tokenize_unclosed(2);
566					tokens.append(&mut current_token.subtokens);
567				}
568			},
569			TokenType::Put | TokenType::TableRow => push_token(&mut tokens, &current_token),
570			_ => { 
571				push_token(&mut tokens, &current_token);
572				warnings += "The unclosing of the last token was impossible to handle for Kami, so the raw text has been outputted. Please contact the project maintainer about this.\n";
573			}
574		}
575	}
576	(tokens, warnings)
577}
578
579pub(crate) fn push_token(list: &mut Vec<Token>, token: &Token) {
580	if token.content != "" || !token.subtokens.is_empty() { list.push(token.clone()); }
581}
582
583