// json_five/rt/tokenize.rs

use crate::tokenize::{TokType, TokenSpan, TokenizationError};

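/// Positional context for a [Token] within the source document it came from.
///
/// Line and column numbers are 1-based; `start_byte_offset`, `start_char_index`,
/// and `end_byte_offset` are 0-based offsets into the source text.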
#[derive(Debug, PartialEq, Clone)]
pub struct TokenContext {
    pub start_lineno: usize,
    pub start_colno: usize,
    pub start_byte_offset: usize,
    pub start_char_index: usize,
    pub end_byte_offset: usize,
}

/// Represents a source token
///
/// Unlike the spans found in [crate::tokenize::Tokens], these tokens are
/// owned values containing an owned String of the lexeme from the source.
///
/// The typical way to obtain a [Token] is from the [source_to_tokens] function. But
/// tokens can be created without source documents, too.
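///
/// # Examples
///
/// A minimal sketch of creating a [Token] by hand, with no source document
/// (so `context` is [None]):
///
/// ```rust
/// use json_five::rt::tokenize::Token;
/// use json_five::tokenize::TokType;
///
/// let tok = Token {
///     lexeme: String::from("  "),
///     tok_type: TokType::Whitespace,
///     context: None,
/// };
/// assert!(tok.context.is_none());
/// ```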
#[derive(Debug, PartialEq, Clone)]
pub struct Token {
    /// The contents of the token, exactly as it appears in source
    pub lexeme: String,

    /// The type of the token
    pub tok_type: TokType,

    /// Contextual information about the Token's position in the source document (if available).
    /// Because tokens can be created without a source doc, this field is an [Option] and may be [None].
    pub context: Option<TokenContext>,
}

/// Generate a Vec of [Token]s from a str.
///
/// This function is complementary to [tokens_to_source]. The typical workflow
/// is to use this function to generate a Vec of tokens, modify it
/// (e.g., add, remove, or replace tokens), then use [tokens_to_source] to turn it back into JSON5 source.
///
/// Unlike the tokenizing functions available in the [crate::tokenize] module, this function
/// produces owned [Token] objects containing (among other fields) an owned String of the lexeme,
/// rather than a [crate::tokenize::Tokens] struct.
///
/// # Examples
///
/// ```rust
/// use json_five::{source_to_tokens, tokens_to_source};
/// use json_five::rt::tokenize::Token;
/// use json_five::tokenize::TokType::Whitespace;
///
/// let tokens = source_to_tokens("  {my: 'json5'}  ").unwrap();
///
/// // remove all Whitespace tokens
/// let new_tokens: Vec<Token> = tokens.into_iter().filter(|tok| tok.tok_type != Whitespace).collect();
///
/// // turn tokens back into source
/// let new_source = tokens_to_source(&new_tokens);
/// assert_eq!(new_source, String::from("{my:'json5'}"))
/// ```
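///
/// Tokens produced from source also carry positional context. A minimal sketch,
/// assuming only that the first span starts at the very beginning of the document:
///
/// ```rust
/// use json_five::source_to_tokens;
///
/// let tokens = source_to_tokens("{}").unwrap();
/// let ctx = tokens[0].context.as_ref().unwrap();
/// assert_eq!((ctx.start_lineno, ctx.start_colno, ctx.start_byte_offset), (1, 1, 0));
/// ```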
///
pub fn source_to_tokens(text: &str) -> Result<Vec<Token>, TokenizationError> {
    use crate::tokenize::tokenize_rt_str;
    // TODO: instead of going through the entire input to get the tokens
    //       and then going through it again to get context
    //       we should write a solution that does everything in one pass

    let tokens = tokenize_rt_str(text)?;
    let mut source_tokens: Vec<Token> = Vec::with_capacity(tokens.tok_spans.len());
    let mut spans = tokens.tok_spans.iter();
    let mut current_span = spans.next().unwrap(); // there will always be at least an EOF span
    let mut next_span: &TokenSpan;
    match spans.next() {
        None => {
            // empty doc
            source_tokens.push(Token {
                lexeme: String::new(),
                tok_type: TokType::EOF,
                context: Some(TokenContext {
                    start_lineno: 1,
                    start_colno: 1,
                    start_byte_offset: 0,
                    start_char_index: 0,
                    end_byte_offset: 0,
                }),
            });
            return Ok(source_tokens)
        }
        Some(span) => {
            next_span = span;
        }
    }
    let mut lineno: usize = 1;
    let mut colno: usize = 0;

    let mut start_lineno = lineno;
    let mut start_colno = 1;
    let mut start_char_index = 0;

    // Track the current span's expected byte length explicitly: `String::with_capacity`
    // guarantees only *at least* the requested capacity, so comparing `len()` against
    // `capacity()` to detect the end of a token is unreliable.
    let mut target_len = current_span.2 - current_span.0;
    let mut current_token_buffer = String::with_capacity(target_len);
    for (codepoint_off, ch) in text.chars().enumerate() {
        colno += 1;
        current_token_buffer.push(ch);
        if ch == '\n' {
            if current_token_buffer.len() == target_len {
                let context = TokenContext{start_byte_offset: current_span.0, start_lineno, start_colno, start_char_index, end_byte_offset: current_span.2};
                let token = Token{lexeme: current_token_buffer, tok_type: current_span.1.clone(), context: Some(context)};
                source_tokens.push(token);
                match spans.next() {
                    None => {
                        assert_eq!(next_span.1, TokType::EOF, "Unexpected end of document with a non-EOF token remaining: {:?}", next_span);
                        let context = TokenContext{start_lineno:lineno, start_colno: colno, start_byte_offset: next_span.0, start_char_index: codepoint_off, end_byte_offset: next_span.0};
                        let token = Token{lexeme: String::with_capacity(0), context: Some(context), tok_type: TokType::EOF};
                        source_tokens.push(token);
                        return Ok(source_tokens)
                    }
                    Some(span) => {
                        current_span = next_span;
                        next_span = span;
                        target_len = current_span.2 - current_span.0;
                        current_token_buffer = String::with_capacity(target_len);
                        lineno += 1;
                        colno = 0;
                        start_lineno = lineno;
                        start_colno = 1;
                        start_char_index = codepoint_off + 1;
                        continue
                    }
                }
            } else {
                lineno += 1;
                colno = 0;
            }
        }
        if current_token_buffer.len() < target_len {
            continue
        }
        if current_token_buffer.len() == target_len {

            let context = TokenContext{start_byte_offset: current_span.0, start_lineno, start_colno, start_char_index, end_byte_offset: current_span.2};
            let token = Token{lexeme: current_token_buffer, tok_type: current_span.1.clone(), context: Some(context)};
            source_tokens.push(token);
            match spans.next() {
                None => {
                    assert_eq!(next_span.1, TokType::EOF, "Unexpected end of document with a non-EOF token remaining: {:?}", next_span);
                    let context = TokenContext{start_lineno:lineno, start_colno: colno, start_byte_offset: next_span.0, start_char_index: codepoint_off, end_byte_offset: next_span.0};
                    let token = Token{lexeme: String::with_capacity(0), context: Some(context), tok_type: TokType::EOF};
                    source_tokens.push(token);
                    return Ok(source_tokens)
                }
                Some(span) => {
                    current_span = next_span;
                    next_span = span;
                    target_len = current_span.2 - current_span.0;
                    current_token_buffer = String::with_capacity(target_len);
                    start_lineno = lineno;
                    start_colno = colno + 1;
                    start_char_index = codepoint_off + 1;
                    continue
                }
            }
        }
        // Falling through both branches above means the buffer overshot the span
        // boundary, which is impossible for spans aligned to char boundaries.
        unreachable!("moved beyond current span");
    }
    unreachable!("Unexpected end of document");
}

/// Generate a String from a Vec of [Token]s
///
/// This function is complementary to [source_to_tokens]. The typical workflow
/// is to use [source_to_tokens] to generate a Vec of tokens, modify it
/// (e.g., add, remove, or replace tokens), then use this function to turn it back into JSON5 source.
///
/// # Examples
///
/// ```rust
/// use json_five::{source_to_tokens, tokens_to_source};
/// use json_five::rt::tokenize::Token;
/// use json_five::tokenize::TokType::Whitespace;
///
/// let tokens = source_to_tokens("  {my: 'json5'}  ").unwrap();
///
/// // remove all Whitespace tokens
/// let new_tokens: Vec<Token> = tokens.into_iter().filter(|tok| tok.tok_type != Whitespace).collect();
///
/// // turn tokens back into source
/// let new_source = tokens_to_source(&new_tokens);
/// assert_eq!(new_source, String::from("{my:'json5'}"))
/// ```
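///
/// Hand-made tokens (with `context: None`) round-trip just as well. A minimal
/// sketch that prepends a whitespace token:
///
/// ```rust
/// use json_five::{source_to_tokens, tokens_to_source};
/// use json_five::rt::tokenize::Token;
/// use json_five::tokenize::TokType::Whitespace;
///
/// let mut tokens = source_to_tokens("{my:'json5'}").unwrap();
/// tokens.insert(0, Token { lexeme: String::from("  "), tok_type: Whitespace, context: None });
/// assert_eq!(tokens_to_source(&tokens), "  {my:'json5'}");
/// ```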
pub fn tokens_to_source(tokens: &[Token]) -> String {
    let size: usize = tokens.iter().map(|tok| tok.lexeme.len()).sum();
    let mut ret = String::with_capacity(size);
    for tok in tokens {
        ret.push_str(&tok.lexeme);
    }
    ret
}

#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn test_round_trip() {
        let text = "{\"foo\":\"bar\"}";
        let tokens = source_to_tokens(text).unwrap();
        assert_eq!(text, tokens_to_source(&tokens));
    }
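
    // Sanity-check positional context; this assumes (per the implementation above)
    // that the first emitted span starts at line 1, column 1, offset 0.
    #[test]
    fn test_first_token_context() {
        let tokens = source_to_tokens("{\"foo\": \"bar\"}").unwrap();
        let ctx = tokens[0].context.as_ref().unwrap();
        assert_eq!(ctx.start_lineno, 1);
        assert_eq!(ctx.start_colno, 1);
        assert_eq!(ctx.start_byte_offset, 0);
        assert_eq!(ctx.start_char_index, 0);
    }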

    #[test]
    fn test_rt() {
        let text = r#"// A JSON5 document
    {my: "value",
//               ^^^^^
// There is trailing whitespace above that will be formatted
     another: "value"
     }  // there is also trailing whitespace at the end of the doc
    "#;
        let tokens = source_to_tokens(text).unwrap();
        println!("{:?}", tokens);
        assert_eq!(text, tokens_to_source(&tokens));
    }
}