json_five/rt/tokenize.rs

use crate::tokenize::{TokType, TokenSpan, TokenizationError};

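/// Positional metadata for a [Token] within its source document.
///
/// Line and column numbers are 1-based. `start_byte_offset` and `end_byte_offset`
/// are byte offsets into the source, and `start_char_index` counts Unicode
/// scalar values (codepoints) from the start of the document.
///
/// # Examples
///
/// A minimal sketch of inspecting the context of the first token produced by
/// [source_to_tokens] (here, the single leading whitespace character):
///
/// ```rust
/// use json_five::source_to_tokens;
///
/// let tokens = source_to_tokens(" {}").unwrap();
/// let ctx = tokens[0].context.as_ref().unwrap();
/// assert_eq!((ctx.start_lineno, ctx.start_colno), (1, 1));
/// assert_eq!((ctx.start_byte_offset, ctx.end_byte_offset), (0, 1));
/// ```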
#[derive(Debug, PartialEq, Clone)]
pub struct TokenContext {
    pub start_lineno: usize,
    pub start_colno: usize,
    pub start_byte_offset: usize,
    pub start_char_index: usize,
    pub end_byte_offset: usize,
}

/// Represents a source token
///
/// Unlike the spans found in [crate::tokenize::Tokens], these tokens are
/// owned values containing an owned String of the lexeme from the source.
///
/// The typical way to obtain a [Token] is from the [source_to_tokens] function,
/// but tokens can also be created without a source document.
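///
/// # Examples
///
/// A minimal sketch of building a [Token] by hand. A hand-made token has no
/// source document behind it, so `context` is left as [None]:
///
/// ```rust
/// use json_five::rt::tokenize::Token;
/// use json_five::tokenize::TokType;
///
/// let tok = Token {
///     lexeme: String::from(" "),
///     tok_type: TokType::Whitespace,
///     context: None,
/// };
/// assert_eq!(tok.lexeme, " ");
/// ```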
#[derive(Debug, PartialEq, Clone)]
pub struct Token {
    /// The contents of the token, exactly as it appears in source
    pub lexeme: String,

    /// The type of the token
    pub tok_type: TokType,

    /// Contextual information about the Token's position in the source document (if available).
    /// Because tokens can be created without a source doc, this field is an [Option] and may be [None].
    pub context: Option<TokenContext>,
}

/// Generate a Vec of [Token]s from a str.
///
/// This function is the complement of [tokens_to_source]. The typical workflow
/// is to use this function to generate a Vec of tokens, modify it
/// (e.g., add/remove/replace tokens), then use [tokens_to_source] to turn it back into JSON5 source.
///
/// Unlike the tokenizing functions available in the [crate::tokenize] module, this function
/// produces owned [Token] objects containing (among other fields) an owned String of the lexeme,
/// rather than a [crate::tokenize::Tokens] struct.
///
/// # Examples
///
/// ```rust
/// use json_five::{source_to_tokens, tokens_to_source};
/// use json_five::rt::tokenize::Token;
/// use json_five::tokenize::TokType::Whitespace;
///
/// let tokens = source_to_tokens("  {my: 'json5'}  ").unwrap();
///
/// // remove all Whitespace tokens
/// let new_tokens: Vec<Token> = tokens.into_iter().filter(|tok| tok.tok_type != Whitespace).collect();
///
/// // turn tokens back into source
/// let new_source = tokens_to_source(&new_tokens);
/// assert_eq!(new_source, String::from("{my:'json5'}"));
/// ```
pub fn source_to_tokens(text: &str) -> Result<Vec<Token>, TokenizationError> {
    use crate::tokenize::tokenize_rt_str;
    // TODO: instead of going through the entire input to get the tokens
    //       and then going through it again to get context
    //       we should write a solution that does everything in one pass

    let tokens = tokenize_rt_str(text)?;
    let mut source_tokens: Vec<Token> = Vec::with_capacity(tokens.tok_spans.len());
    let mut spans = tokens.tok_spans.iter();
    let mut current_span = spans.next().unwrap(); // there is always at least an EOF span
    let mut next_span: &TokenSpan;
    match spans.next() {
        None => {
            // empty doc: emit a lone EOF token
            source_tokens.push(Token{lexeme: String::new(), tok_type: TokType::EOF, context: Some(TokenContext{start_byte_offset: 0, start_colno: 1, start_lineno: 1, start_char_index: 0, end_byte_offset: 0})});
            return Ok(source_tokens)
        }
        Some(span) => {
            next_span = span;
        }
    }
    // 1-based position of the character currently being visited
    let mut lineno: usize = 1;
    let mut colno: usize = 0; // incremented before use, so the first char is column 1

    // position at which the current token begins
    let mut start_lineno = lineno;
    let mut start_colno = 1;
    let mut start_char_index = 0;

    let mut current_token_buffer = String::with_capacity(current_span.2 - current_span.0);
    for (codepoint_off, (byte_off, char)) in text.char_indices().enumerate() {
        colno += 1;
        current_token_buffer.push(char);
        // Span offsets are byte offsets, so the current token is complete once the
        // buffer holds that many bytes. Compare against the span's byte length rather
        // than the buffer's capacity(), which is only guaranteed to be *at least*
        // what was requested.
        let span_len = current_span.2 - current_span.0;
        if char == '\n' {
            if current_token_buffer.len() == span_len {
                // the newline finishes the current token; emit it and advance to the next span
                let context = TokenContext{start_byte_offset: current_span.0, start_lineno, start_colno, start_char_index, end_byte_offset: current_span.2};
                let token = Token{lexeme: current_token_buffer, tok_type: current_span.1.clone(), context: Some(context)};
                source_tokens.push(token);
                match spans.next() {
                    None => {
                        // next_span is the last span; it must be EOF
                        assert_eq!(next_span.1, TokType::EOF, "Unexpected end of document with a token remaining: {:?}", next_span);
                        let context = TokenContext{start_lineno: lineno, start_colno: colno, start_byte_offset: next_span.0, start_char_index: codepoint_off, end_byte_offset: next_span.0};
                        let token = Token{lexeme: String::with_capacity(0), context: Some(context), tok_type: TokType::EOF};
                        source_tokens.push(token);
                        return Ok(source_tokens)
                    }
                    Some(span) => {
                        current_span = next_span;
                        next_span = span;
                        current_token_buffer = String::with_capacity(current_span.2 - current_span.0);
                        // the next token starts at the beginning of the next line
                        lineno += 1;
                        colno = 0;
                        start_lineno = lineno;
                        start_colno = 1;
                        start_char_index = codepoint_off + 1;
                        continue
                    }
                }
            } else {
                // a newline in the middle of a (multi-line) token
                lineno += 1;
                colno = 0;
            }
        }
        if current_token_buffer.len() < span_len {
            // still inside the current token
            continue
        }
        if current_token_buffer.len() == span_len {
            // token complete; emit it and advance to the next span
            let context = TokenContext{start_byte_offset: current_span.0, start_lineno, start_colno, start_char_index, end_byte_offset: current_span.2};
            let token = Token{lexeme: current_token_buffer, tok_type: current_span.1.clone(), context: Some(context)};
            source_tokens.push(token);
            match spans.next() {
                None => {
                    // next_span is the last span; it must be EOF
                    assert_eq!(next_span.1, TokType::EOF, "Unexpected end of document with a token remaining: {:?}", next_span);
                    let context = TokenContext{start_lineno: lineno, start_colno: colno, start_byte_offset: next_span.0, start_char_index: codepoint_off, end_byte_offset: next_span.0};
                    let token = Token{lexeme: String::with_capacity(0), context: Some(context), tok_type: TokType::EOF};
                    source_tokens.push(token);
                    return Ok(source_tokens)
                }
                Some(span) => {
                    current_span = next_span;
                    next_span = span;
                    current_token_buffer = String::with_capacity(current_span.2 - current_span.0);
                    start_lineno = lineno;
                    start_colno = colno + 1;
                    start_char_index = codepoint_off + 1;
                    continue
                }
            }
        }
        if byte_off > current_span.2 {
            unreachable!("moved beyond current span")
        }
    }
    // every document ends with an EOF span, so the loop always returns before falling through
    unreachable!("Unexpected end of document");
}

/// Generate a String from a Vec of [Token]s
///
/// This function is the complement of [source_to_tokens]. The typical workflow
/// is to use [source_to_tokens] to generate a Vec of tokens, modify it
/// (e.g., add/remove/replace tokens), then use this function to turn it back into JSON5 source.
///
/// # Examples
///
/// ```rust
/// use json_five::{source_to_tokens, tokens_to_source};
/// use json_five::rt::tokenize::Token;
/// use json_five::tokenize::TokType::Whitespace;
///
/// let tokens = source_to_tokens("  {my: 'json5'}  ").unwrap();
///
/// // remove all Whitespace tokens
/// let new_tokens: Vec<Token> = tokens.into_iter().filter(|tok| tok.tok_type != Whitespace).collect();
///
/// // turn tokens back into source
/// let new_source = tokens_to_source(&new_tokens);
/// assert_eq!(new_source, String::from("{my:'json5'}"));
/// ```
pub fn tokens_to_source(tokens: &Vec<Token>) -> String {
    // preallocate the exact output size, then concatenate the lexemes
    let mut size = 0_usize;
    for tok in tokens.iter() {
        size += tok.lexeme.len();
    }
    let mut ret = String::with_capacity(size);
    for tok in tokens.iter() {
        ret.push_str(&tok.lexeme);
    }
    ret
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test() {
        let text = "{\"foo\":\"bar\"}";
        let tokens = source_to_tokens(text).unwrap();
        assert_eq!(text, tokens_to_source(&tokens));
    }

    #[test]
    fn test_rt() {
        let text = r#"// A JSON5 document
    {my: "value",
//               ^^^^^
// There is trailing whitespace above that will be formatted
     another: "value"
     }  // there is also trailing whitespace at the end of the doc
    "#;
        let tokens = source_to_tokens(text).unwrap();
        println!("{:?}", tokens);
        assert_eq!(text, tokens_to_source(&tokens));
    }
}