json_five/rt/tokenize.rs

use crate::tokenize::{TokType, TokenSpan, TokenizationError};

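/// Source-position metadata attached to a [`Token`]. Line and column numbers
/// are 1-based; offsets index bytes (or codepoints, for `start_char_index`)
/// into the original source text.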
#[derive(Debug, PartialEq, Clone)]
pub struct TokenContext {
    /// 1-based line number where the token starts
    pub start_lineno: usize,
    /// 1-based column number where the token starts
    pub start_colno: usize,
    /// byte offset into the source where the token starts
    pub start_byte_offset: usize,
    /// codepoint (not byte) index of the token's first character
    pub start_char_index: usize,
    /// byte offset into the source just past the token's last byte
    pub end_byte_offset: usize,
}

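/// A materialized token: the exact source text (`lexeme`) for one
/// [`TokenSpan`], plus optional position context.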
#[derive(Debug, PartialEq, Clone)]
pub struct Token {
    /// the exact source text of the token; concatenating all lexemes
    /// reproduces the original document byte-for-byte
    pub lexeme: String,
    pub tok_type: TokType,
    /// position information; always populated by [`source_to_tokens`]
    pub context: Option<TokenContext>,
}


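/// Tokenize `text` into [`Token`]s whose concatenated lexemes reproduce the
/// input exactly, attaching line/column/byte-offset context to each token.
///
/// A minimal round-trip sketch (the module path here is assumed from this
/// file's location in the crate):
///
/// ```ignore
/// use json_five::rt::tokenize::{source_to_tokens, tokens_to_source};
///
/// let text = "{key: 'value'} // trailing comment";
/// let tokens = source_to_tokens(text).unwrap();
/// assert_eq!(tokens_to_source(&tokens), text);
/// ```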
pub fn source_to_tokens(text: &str) -> Result<Vec<Token>, TokenizationError> {
    use crate::tokenize::tokenize_rt_str;
    // TODO: instead of walking the entire input once to produce the spans
    //       and then walking it again to attach context, do everything in
    //       a single pass

    let tokens = tokenize_rt_str(text)?;
    let mut source_tokens: Vec<Token> = Vec::with_capacity(tokens.tok_spans.len());
    let mut spans = tokens.tok_spans.iter();
    let mut current_span = spans.next().expect("the tokenizer always emits at least an EOF span");
    let mut next_span: &TokenSpan;
    match spans.next() {
        None => {
            // empty document: emit a single EOF token
            source_tokens.push(Token {
                lexeme: String::new(),
                tok_type: TokType::EOF,
                context: Some(TokenContext {
                    start_lineno: 1,
                    start_colno: 1,
                    start_byte_offset: 0,
                    start_char_index: 0,
                    end_byte_offset: 0,
                }),
            });
            return Ok(source_tokens);
        }
        Some(span) => {
            next_span = span;
        }
    }
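    // Line/column tracking: lineno is 1-based; colno starts at 0 because it is
    // incremented before each character is examined, so the first character of
    // a line is reported at colno == 1.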
    let mut lineno: usize = 1;
    let mut colno: usize = 0;

    let mut start_lineno = lineno;
    let mut start_colno = 1;
    let mut start_char_index = 0;

    let mut current_token_buffer = String::with_capacity(current_span.2 - current_span.0);
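    // Walk the source one character at a time, accumulating bytes into
    // current_token_buffer. Each TokenSpan is consumed here as
    // (start_byte, tok_type, end_byte), so the current token is complete once
    // the buffer holds exactly end_byte - start_byte bytes.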
    for (codepoint_off, (byte_off, ch)) in text.char_indices().enumerate() {
        colno += 1;
        current_token_buffer.push(ch);
        if ch == '\n' {
            // NB: compare against the span's byte length rather than the buffer's
            // capacity: `String::with_capacity` only guarantees *at least* the
            // requested capacity, so `len() == capacity()` is not a reliable
            // end-of-token check.
            if current_token_buffer.len() == current_span.2 - current_span.0 {
                let context = TokenContext {
                    start_byte_offset: current_span.0,
                    start_lineno,
                    start_colno,
                    start_char_index,
                    end_byte_offset: current_span.2,
                };
                let token = Token { lexeme: current_token_buffer, tok_type: current_span.1.clone(), context: Some(context) };
                source_tokens.push(token);
                match spans.next() {
                    None => {
                        assert_eq!(next_span.1, TokType::EOF, "spans ended while a non-EOF token remains: {:?}", next_span);
                        let context = TokenContext {
                            start_lineno: lineno,
                            start_colno: colno,
                            start_byte_offset: next_span.0,
                            start_char_index: codepoint_off,
                            end_byte_offset: next_span.0,
                        };
                        let token = Token { lexeme: String::new(), tok_type: TokType::EOF, context: Some(context) };
                        source_tokens.push(token);
                        return Ok(source_tokens)
                    }
                    Some(span) => {
                        current_span = next_span;
                        next_span = span;
                        current_token_buffer = String::with_capacity(current_span.2 - current_span.0);
                        lineno += 1;
                        colno = 0;
                        start_lineno = lineno;
                        start_colno = 1;
                        start_char_index = codepoint_off + 1;
                        continue
                    }
                }
            } else {
                lineno += 1;
                colno = 0;
            }
        }
        if current_token_buffer.len() < current_span.2 - current_span.0 {
            continue
        }
        if current_token_buffer.len() == current_span.2 - current_span.0 {
            let context = TokenContext {
                start_byte_offset: current_span.0,
                start_lineno,
                start_colno,
                start_char_index,
                end_byte_offset: current_span.2,
            };
            let token = Token { lexeme: current_token_buffer, tok_type: current_span.1.clone(), context: Some(context) };
            source_tokens.push(token);
            match spans.next() {
                None => {
                    assert_eq!(next_span.1, TokType::EOF, "spans ended while a non-EOF token remains: {:?}", next_span);
                    let context = TokenContext {
                        start_lineno: lineno,
                        start_colno: colno,
                        start_byte_offset: next_span.0,
                        start_char_index: codepoint_off,
                        end_byte_offset: next_span.0,
                    };
                    let token = Token { lexeme: String::new(), tok_type: TokType::EOF, context: Some(context) };
                    source_tokens.push(token);
                    return Ok(source_tokens)
                }
                Some(span) => {
                    current_span = next_span;
                    next_span = span;
                    current_token_buffer = String::with_capacity(current_span.2 - current_span.0);
                    start_lineno = lineno;
                    start_colno = colno + 1;
                    start_char_index = codepoint_off + 1;
                    continue
                }
            }
        }
        if byte_off > current_span.2 {
            unreachable!("moved beyond current span")
        }
    }
    unreachable!("Unexpected end of document");
}


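/// Reassemble source text by concatenating token lexemes. Because
/// [`source_to_tokens`] preserves every byte of the input (including
/// whitespace and comments), this is the exact inverse of tokenization.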
pub fn tokens_to_source(tokens: &[Token]) -> String {
    // Preallocate the exact output size, then concatenate the lexemes.
    let size: usize = tokens.iter().map(|tok| tok.lexeme.len()).sum();
    let mut ret = String::with_capacity(size);
    for tok in tokens {
        ret.push_str(&tok.lexeme);
    }
    ret
}



#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn test_simple_round_trip() {
        let text = "{\"foo\":\"bar\"}";
        let tokens = source_to_tokens(text).unwrap();
        assert_eq!(text, tokens_to_source(&tokens));
    }
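
    // A sketch of the empty-document path: the early-return branch in
    // source_to_tokens implies the underlying tokenizer accepts empty input
    // and emits a lone EOF span; this test assumes that holds.
    #[test]
    fn test_empty_doc() {
        let tokens = source_to_tokens("").unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].tok_type, TokType::EOF);
        assert_eq!(tokens_to_source(&tokens), "");
    }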

    #[test]
    fn test_rt() {
        let text = r#"// A JSON5 document
    {my: "value",
//               ^^^^^
// There is trailing whitespace above that will be formatted
     another: "value"
     }  // there is also trailing whitespace at the end of the doc
    "#;
        let tokens = source_to_tokens(text).unwrap();
        println!("{:?}", tokens);
        assert_eq!(text, tokens_to_source(&tokens));
    }
}