lang_pt/lib.rs

//! Language parsing tool (lang_pt) is a library to generate a recursive descent top-down parser that parses languages or text into an Abstract Syntax Tree ([AST](ASTNode)).
//!
//! # Overview
//! Parsers for languages like JavaScript are often handwritten because of the complexity of those languages.
//! However, writing a custom parser by hand increases the development and maintenance cost of the parser.
//! This library was created to reduce that effort when building a parser for a high-level language (HLL).
//! Its goal is to be flexible enough to support a wide range of grammars while keeping performance comparable to a custom-written parser.
//!
//!
//! # Design
//!
//! A language parser is usually developed either by writing custom code by hand or by using a parser generator tool.
//! When building a parser with a parser generator, the grammar for the language is written in a Domain-Specific Language (DSL) specified by the generator tool.
//! The generator then compiles the grammar and emits parser code in the target runtime language.
//! This library instead provides a set of production utilities for implementing the grammar directly in the Rust programming language.
//! Therefore, instead of writing the grammar in a generator-specific language, one can use utilities
//! like [Concat](crate::production::Concat), [Union](crate::production::Union), etc.
//! to implement concatenation and alternation of symbols.
//!
//! This parsing tool is also equipped with utilities like [Lookahead](crate::production::Lookahead), [Validator](crate::production::Validator),
//! and [NonStructural](crate::production::NonStructural) to support custom validation, precedence-based parsing, etc.
//! It can therefore be used to parse a wide range of languages that often require custom functionality to be injected into the grammar.
//! Moreover, the library also includes production utilities like [SeparatedList](crate::production::SeparatedList) and [Suffixes](crate::production::Suffixes)
//! to ease writing the grammar for a language.
//!
//! # Example
//!
//! The following is a JSON parser implemented using lang_pt.
//! ```
//! // # Tokenization
//!
//! use lang_pt::production::ProductionBuilder;
//! use lang_pt::{
//!     lexeme::{Pattern, Punctuations},
//!     production::{Concat, EOFProd, Node, SeparatedList, TokenField, TokenFieldSet, Union},
//!     DefaultParser, NodeImpl, TokenImpl, Tokenizer,
//! };
//! use std::rc::Rc;
//!
//! #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Copy)]
//! // JSON token
//! pub enum JSONToken {
//!     EOF,
//!     String,
//!     Space,
//!     Colon,
//!     Comma,
//!     Number,
//!     Constant,
//!     OpenBrace,
//!     CloseBrace,
//!     OpenBracket,
//!     CloseBracket,
//! }
//!
//! #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Copy)]
//! // Node value for AST
//! pub enum JSONNode {
//!     Key,
//!     String,
//!     Number,
//!     Constant,
//!     Array,
//!     Object,
//!     Item,
//!     Main,
//!     NULL,
//! }
//!
//! impl TokenImpl for JSONToken {
//!     fn eof() -> Self { JSONToken::EOF }
//!     fn is_structural(&self) -> bool {
//!         match self {
//!             JSONToken::Space => false,
//!             _ => true,
//!         }
//!     }
//! }
//! impl NodeImpl for JSONNode {
//!     fn null() -> Self { JSONNode::NULL }
//! }
//!
//! let punctuations = Rc::new(
//!     Punctuations::new(vec![
//!         ("{", JSONToken::OpenBrace),
//!         ("}", JSONToken::CloseBrace),
//!         ("[", JSONToken::OpenBracket),
//!         ("]", JSONToken::CloseBracket),
//!         (",", JSONToken::Comma),
//!         (":", JSONToken::Colon),
//!     ])
//!     .unwrap(),
//! );
//!
//! let dq_string = Rc::new(
//!     Pattern::new(
//!         JSONToken::String,
//!         r#"^"([^"\\\r\n]|(\\[^\S\r\n]*[\r\n][^\S\r\n]*)|\\.)*""#, //["\\bfnrtv]
//!     )
//!     .unwrap(),
//! );
//!
//! let lex_space = Rc::new(Pattern::new(JSONToken::Space, r"^\s+").unwrap());
//! let number_literal = Rc::new(
//!     Pattern::new(JSONToken::Number, r"^([0-9]+)(\.[0-9]+)?([eE][+-]?[0-9]+)?").unwrap(),
//! );
//! let const_literal = Rc::new(Pattern::new(JSONToken::Constant, r"^(true|false|null)").unwrap());
//!
//! let tokenizer = Tokenizer::new(vec![
//!     lex_space,
//!     punctuations,
//!     dq_string,
//!     number_literal,
//!     const_literal,
//! ]);
//!
//! // # Parser
//!
//! let eof = Rc::new(EOFProd::new(None));
//!
//! let json_key = Rc::new(TokenField::new(JSONToken::String, Some(JSONNode::Key)));
//!
//! let json_primitive_values = Rc::new(TokenFieldSet::new(vec![
//!     (JSONToken::String, Some(JSONNode::String)),
//!     (JSONToken::Constant, Some(JSONNode::Constant)),
//!     (JSONToken::Number, Some(JSONNode::Number)),
//! ]));
//!
//!
//! let hidden_open_brace = Rc::new(TokenField::new(JSONToken::OpenBrace, None));
//! let hidden_close_brace = Rc::new(TokenField::new(JSONToken::CloseBrace, None));
//! let hidden_open_bracket = Rc::new(TokenField::new(JSONToken::OpenBracket, None));
//! let hidden_close_bracket = Rc::new(TokenField::new(JSONToken::CloseBracket, None));
//! let hidden_comma = Rc::new(TokenField::new(JSONToken::Comma, None));
//! let hidden_colon = Rc::new(TokenField::new(JSONToken::Colon, None));
//! let json_object = Rc::new(Concat::init("json_object"));
//! let json_value_union = Rc::new(Union::init("json_value_union"));
//! let json_object_item = Rc::new(Concat::new(
//!     "json_object_item",
//!     vec![
//!         json_key.clone(),
//!         hidden_colon.clone(),
//!         json_value_union.clone(),
//!     ],
//! ));
//!
//! let json_object_item_node = Rc::new(Node::new(&json_object_item, Some(JSONNode::Item)));
//! let json_object_item_list =
//!     Rc::new(SeparatedList::new(&json_object_item_node, &hidden_comma, true).into_nullable());
//! let json_array_item_list =
//!     Rc::new(SeparatedList::new(&json_value_union, &hidden_comma, true).into_nullable());
//! let json_array_node = Rc::new(
//!     Concat::new(
//!         "json_array",
//!         vec![
//!             hidden_open_bracket.clone(),
//!             json_array_item_list.clone(),
//!             hidden_close_bracket.clone(),
//!         ],
//!     )
//!     .into_node(Some(JSONNode::Array)),
//! );
//!
//! let json_object_node = Rc::new(Node::new(&json_object, Some(JSONNode::Object)));
//!
//! json_value_union
//!     .set_symbols(vec![
//!         json_primitive_values.clone(),
//!         json_object_node.clone(),
//!         json_array_node.clone(),
//!     ])
//!     .unwrap();
//!
//! json_object
//!     .set_symbols(vec![
//!         hidden_open_brace.clone(),
//!         json_object_item_list,
//!         hidden_close_brace.clone(),
//!     ])
//!     .unwrap();
//!
//! let main = Rc::new(Concat::new("root", vec![json_value_union, eof]));
//! let main_node = Rc::new(Node::new(&main, Some(JSONNode::Main)));
//! let parser = DefaultParser::new(Rc::new(tokenizer), main_node).unwrap();
//!
//! ```

//! # License
//! [lang_pt](crate) is provided under the MIT license. See [LICENSE](../LICENSE).
mod ast_node;
mod cache;
mod code;
mod doc;
mod error;
pub mod examples;
mod field_tree;
mod filtered_stream;
mod impl_default;
mod lex;
pub mod lexeme;
mod logger;
mod parsing;
mod position;
pub mod production;
mod success_data;
mod tokenization;
mod wrapper_index;

use once_cell::unsync::OnceCell;
use std::collections::{HashMap, HashSet};
use std::fmt::{Debug, Display, Write};
use std::hash::Hash;
use std::rc::Rc;

/// A trait to be implemented by the token type used during lexical analysis and parsing.
///
/// The [eof](TokenImpl::eof) implementation provides the default token created when the tokenizer or the parser
/// encounters the end of the input.
/// The [is_structural](TokenImpl::is_structural) implementation is used to filter the tokens after lexical analysis:
/// non-structural tokens like whitespace and line breaks in a language such as JavaScript do not carry any grammatical meaning.
/// Therefore, these tokens can be omitted from the token stream to simplify the grammar and optimize parser performance.
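///
/// A minimal sketch of an implementation (the token names are illustrative):
/// ```
/// use lang_pt::TokenImpl;
///
/// #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
/// enum MyToken { EOF, Space, Ident }
///
/// impl TokenImpl for MyToken {
///     fn eof() -> Self { MyToken::EOF }
///     // Whitespace carries no grammatical meaning, so it is marked non-structural and filtered out.
///     fn is_structural(&self) -> bool { !matches!(self, MyToken::Space) }
/// }
/// ```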
pub trait TokenImpl: Copy + Debug + Eq + Hash + Ord {
    fn eof() -> Self;
    fn is_structural(&self) -> bool;
}

/// A trait to be implemented by the node value type assigned to the associated [ASTNode].
///
/// When the parser encounters a null production during parsing,
/// this implementation creates the default node value assigned to the corresponding [ASTNode].
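///
/// A minimal sketch of an implementation (the node names are illustrative):
/// ```
/// use lang_pt::NodeImpl;
///
/// #[derive(Debug, Clone)]
/// enum MyNode { NULL, Ident, Expr }
///
/// impl NodeImpl for MyNode {
///     // Placeholder node value used for a null production.
///     fn null() -> Self { MyNode::NULL }
/// }
/// ```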
pub trait NodeImpl: Debug + Clone {
    /// Default node value placeholder for a null production.
    fn null() -> Self;
}

#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
/// A wrapper to indicate the index of the tokenized data in the [TokenStream].
pub struct TokenPtr(usize);

#[derive(Clone)]
/// A node of the Abstract Syntax Tree (AST) built from the parsed input.
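///
/// A minimal sketch of walking a parsed tree (the node type and formatting are illustrative):
/// ```
/// use lang_pt::ASTNode;
///
/// // Recursively print each node value together with its byte positions in the input.
/// fn print_tree<T: std::fmt::Debug>(node: &ASTNode<T>, depth: usize) {
///     println!("{}{:?} [{}..{}]", "  ".repeat(depth), node.node, node.start, node.end);
///     for child in &node.children {
///         print_tree(child, depth + 1);
///     }
/// }
/// ```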
pub struct ASTNode<TNode> {
    pub node: TNode,
    pub bound: Option<(TokenPtr, TokenPtr)>, // Start and end positions in the lexical stream generated by the tokenizer.
    pub start: usize, // Actual starting position in the parsed UTF-8 slice. This is different from the starting position of the parsed string.
    pub end: usize, // Actual end position in the parsed UTF-8 slice. This is different from the end of the parsed string.
    pub children: Vec<ASTNode<TNode>>, // Children of this abstract syntax tree node.
}

#[derive(Debug, Hash, Clone, PartialEq, Eq)]
/// Element of the tokenized data.
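///
/// A minimal sketch (the token value is illustrative):
/// ```
/// use lang_pt::Lex;
///
/// // A token value together with its start and end byte positions in the input.
/// let lex = Lex { token: 3u8, start: 0, end: 5 };
/// assert_eq!(lex.end - lex.start, 5);
/// ```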
pub struct Lex<TToken> {
    pub token: TToken,
    pub start: usize,
    pub end: usize,
}

/// An interface implemented by all lexeme utilities, which are the primary elements of a tokenizer.
pub trait ILexeme {
    type Token: Copy + Debug + Eq + Ord;
    type State: Copy + Debug + Eq + Ord;

    /// Primary tokenization method implemented by each lexeme utility.
    /// The analyzer calls this method on each lexeme utility at incremental locations of the input to create tokens.
    fn consume(
        &self,
        code: &Code,
        pointer: usize,
        tokenized_stream: &Vec<Lex<Self::Token>>,
        state_stack: &mut Vec<Self::State>,
    ) -> Option<Lex<Self::Token>>;

    fn get_grammar_field(&self) -> Vec<(Self::Token, String)>;
}

/// A trait consisting of the [tokenize](ITokenization::tokenize) method, which takes UTF-8 input bytes and produces a token stream.
///
/// This interface is implemented by [Tokenizer] and [CombinedTokenizer].
pub trait ITokenization {
    type Token;
    fn tokenize(&self, code: &Code) -> Result<Vec<Lex<Self::Token>>, ParseError>;
    fn build_grammar(&self) -> Result<String, std::fmt::Error>;
}

/// Base tokenization structure for lexical analysis.
///
/// The [Tokenizer] implements [ITokenization], where the [tokenize](ITokenization::tokenize) method
/// from this trait splits the input string into a token stream and returns the result.
/// The [Tokenizer] object consists of lexeme utilities.
/// During tokenization, each lexeme utility is called sequentially to split the input into tokens.
///
pub struct Tokenizer<TToken = i8, TState = u8> {
    lexers: Vec<Rc<dyn ILexeme<Token = TToken, State = TState>>>,
}

/// A state-based tokenizer for lexical analysis.
///
/// A [CombinedTokenizer] consists of multiple sets of lexeme utilities.
/// During tokenization, the lexeme utilities corresponding to the current state are called sequentially to split the input into tokens.
/// A [StateMixin][crate::lexeme::StateMixin] or [ThunkStateMixin][crate::lexeme::ThunkStateMixin] can be used to change the state stack during tokenization.
///
/// Tokenizing complex language syntax, such as template literals in JavaScript,
/// requires implementing a separate state to tokenize the template-literal part of the input.
/// Thus, a [CombinedTokenizer] allows defining a multi-state lexer for tokenizing relatively complex language syntax.
/// Similar to the [Tokenizer], a [CombinedTokenizer] also implements [ITokenization],
/// where the [tokenize](ITokenization::tokenize) method splits the input string into a stream of tokens.
///
pub struct CombinedTokenizer<TT = i8, TS = u8> {
    analyzers: Vec<(TS, Vec<Rc<dyn ILexeme<Token = TT, State = TS>>>)>,
    default_state: TS,
    debug: OnceCell<Log<&'static str>>,
}

#[derive(Debug)]
/// An error returned due to failed validation of production utilities and grammar.
pub struct ImplementationError {
    message: String,
    what: String,
}

#[derive(Debug, Clone)]
/// An error indicating failure while consuming input into the [AST](crate::ASTNode).
///
/// When a production fails to parse the input, the parser tries an alternative production or backtracks.
/// However, a [Validation](crate::ProductionError::Validation) error simply terminates parsing and returns an [Err] result.
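///
/// A minimal sketch of constructing the variants (position and message values are illustrative):
/// ```
/// use lang_pt::ProductionError;
///
/// // An `Unparsed` error lets the parser try an alternative production or backtrack.
/// let recoverable = ProductionError::Unparsed;
/// assert!(matches!(recoverable, ProductionError::Unparsed));
///
/// // A `Validation` error carries the failing input position and a message, and aborts parsing.
/// let fatal = ProductionError::Validation(10, "Unexpected keyword".to_string());
/// assert!(matches!(fatal, ProductionError::Validation(10, _)));
/// ```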
pub enum ProductionError {
    Unparsed,
    Validation(usize, String),
}

#[derive(Debug)]
/// An error returned when the parser fails to parse the input because of a syntax error.
pub struct ParseError {
    pub pointer: usize,
    pub message: String,
}

#[derive(Debug, Clone)]
/// A wrapper implementation of the tokenized data.
pub struct TokenStream<'lex, TNode> {
    filtered_stream: Vec<TokenPtr>,
    original_stream: &'lex Vec<Lex<TNode>>,
}

#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
/// A wrapper implementation to indicate the indices of structural tokens of the [TokenStream].
pub struct FltrPtr(usize);

#[derive(Debug, Clone)]
/// An [Ok] result value returned from a [production](IProduction) utility
/// when it successfully consumes a production [derivation](IProduction::advance_token_ptr()).
pub struct SuccessData<I, TNode> {
    pub consumed_index: I,
    pub children: Vec<ASTNode<TNode>>,
}

#[derive(Hash, PartialEq, Eq, Clone, Copy, PartialOrd, Ord)]
/// A unique key to save and retrieve parsed results for the Packrat parsing technique.
pub struct CacheKey(usize);

/// A result returned from a [Production](IProduction) when it tries to [consume][IProduction::advance_token_ptr] input.
pub type ParsedResult<I, TToken> = Result<SuccessData<I, TToken>, ProductionError>;

/// A structure to store the maximum successfully parsed position and the parsed results for the Packrat parsing technique.
pub struct Cache<TP, TToken> {
    parsed_result_cache: HashMap<(CacheKey, usize), ParsedResult<TP, TToken>>,
    max_parsed_point: usize,
}

/// A trait implemented by production utilities, which are used to write the various production rules of the grammar.
pub trait IProduction: Display {
    type Node: NodeImpl;
    type Token: TokenImpl;
    /// Whether the production is nullable.
    fn is_nullable(&self) -> bool;

    /// Whether the production is nullable and the parsed tree should be hidden from the [ASTNode].
    fn is_nullable_n_hidden(&self) -> bool;

    /// Validate whether any first-set child production is left recursive and return whether the production is nullable.
    fn obtain_nullability<'id>(
        &'id self,
        visited: HashMap<&'id str, usize>,
    ) -> Result<bool, ImplementationError>;
    // fn obtain_hidden_stat<'id>(&'id self, visited: &mut HashSet<&'id str>) -> bool;
    fn impl_first_set(&self, first_set: &mut HashSet<Self::Token>);
    // fn has_first_set(&self, lex_index: LexIndex, stream: &LexStream<Self::Token>) -> bool;

    /// Write the grammar for the production.
    fn impl_grammar(
        &self,
        writer: &mut dyn Write,
        added_rules: &mut HashSet<&'static str>,
    ) -> Result<(), std::fmt::Error>;

    /// Validate this production and all child productions for left recursion.
    fn validate<'id>(
        &'id self,
        connected_sets: HashMap<&'id str, usize>,
        visited_prod: &mut HashSet<&'id str>,
    ) -> Result<(), ImplementationError>;

    /// Consume input from the filtered token stream.
    fn advance_fltr_ptr(
        &self,
        code: &Code,
        index: FltrPtr,
        token_stream: &TokenStream<Self::Token>,
        cache: &mut Cache<FltrPtr, Self::Node>,
    ) -> ParsedResult<FltrPtr, Self::Node>;

    /// Consume tokenized input data.
    fn advance_token_ptr(
        &self,
        code: &Code,
        index: TokenPtr,
        token_stream: &TokenStream<Self::Token>,
        cache: &mut Cache<FltrPtr, Self::Node>,
    ) -> ParsedResult<TokenPtr, Self::Node>;

    /// Consume UTF-8 byte string input.
    fn advance_ptr(
        &self,
        code: &Code,
        index: usize,
        cache: &mut Cache<usize, Self::Node>,
    ) -> ParsedResult<usize, Self::Node>;

    fn build_grammar(&self) -> Result<String, std::fmt::Error> {
        let mut writer = String::new();
        writeln!(writer, "{}", self)?;
        self.impl_grammar(&mut writer, &mut HashSet::new())?;
        Ok(writer)
    }
}

/// A parser structure to construct a tokenizer-based parsing program.
pub struct DefaultParser<TN: NodeImpl = u8, TL: TokenImpl = i8> {
    tokenizer: Rc<dyn ITokenization<Token = TL>>,
    root: Rc<dyn IProduction<Node = TN, Token = TL>>,
    #[cfg(debug_assertions)]
    debug_production_map: HashMap<&'static str, Rc<dyn IProduction<Node = TN, Token = TL>>>,
}

/// A parser structure for parsing input without a tokenizer.
pub struct LexerlessParser<TN: NodeImpl = u8, TL: TokenImpl = i8> {
    root: Rc<dyn IProduction<Node = TN, Token = TL>>,
    #[cfg(debug_assertions)]
    debug_production_map: HashMap<&'static str, Rc<dyn IProduction<Node = TN, Token = TL>>>,
}

#[derive(Clone, Debug)]
struct FieldTree<T> {
    token: Option<T>,
    children: Vec<(u8, FieldTree<T>)>,
}

#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
/// The line and column information at a code point.
pub struct Position {
    pub line: usize,
    pub column: usize,
}

/// A wrapper around the input to be parsed, along with line information.
pub struct Code<'c> {
    pub value: &'c [u8],
    line_breaks: OnceCell<Vec<usize>>,
}

#[derive(Debug, PartialEq, Eq, Clone, Copy)]
/// An enum to assign different debugging levels to lexeme and production utilities.
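///
/// A minimal sketch of selecting a debug level (the production name is illustrative):
/// ```
/// use lang_pt::Log;
///
/// // Request verbose debug output, tagged with a production name.
/// let debug_level: Log<&str> = Log::Verbose("json_object");
/// assert_eq!(debug_level, Log::Verbose("json_object"));
/// ```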
pub enum Log<T> {
    None,
    Default(T),
    Success(T),
    Result(T),
    Verbose(T),
}