// json_five/rt/tokenize.rs
use crate::tokenize::{TokType, TokenSpan, TokenizationError};
/// Position of a [Token] within the source document it was read from.
///
/// [source_to_tokens] fills these fields in while walking the source text.
/// Byte offsets are taken from the tokenizer's span for the token; line, column
/// and character positions are counted in `char`s, not bytes.
#[derive(Debug, PartialEq, Clone)]
pub struct TokenContext {
    /// Line on which the token starts. The first line is 1 and the count advances on every `'\n'`.
    pub start_lineno: usize,
    /// Column at which the token starts, counted in characters. The first column of a line is 1.
    pub start_colno: usize,
    /// Byte offset in the source text at which the token starts.
    pub start_byte_offset: usize,
    /// Index of the character at which the token starts, counted in `char`s from 0.
    pub start_char_index: usize,
    /// Byte offset in the source text at which the token ends; the lexeme occupies
    /// `start_byte_offset..end_byte_offset`.
    pub end_byte_offset: usize,
}
/// Represents a source token
///
/// Unlike the spans found in [crate::tokenize::Tokens], these tokens are
/// of an owned type containing an owned String of the lexeme from source.
///
/// The typical way to obtain a [Token] is from the [source_to_tokens] function. But
/// tokens can be created without source documents, too; such tokens simply carry
/// no [TokenContext].
///
/// [tokens_to_source] rebuilds source text from the `lexeme` of each token only, so
/// the `tok_type` and `context` fields do not influence the generated output.
#[derive(Debug, PartialEq, Clone)]
pub struct Token {
    /// The contents of the token, exactly as it appears in source
    pub lexeme: String,
    /// The type of the token
    pub tok_type: TokType,
    /// Contextual information about the Token's position in the source document (if available)
    /// Because tokens can be created without a source doc, this field is an [Option] and may be [None].
    pub context: Option<TokenContext>
}
/// Generate a Vec of [Token]s from a str.
///
/// This function is complementary with [tokens_to_source]. The typical workflow
/// is to use this function to generate a Vec of tokens, do something to modify it (
/// e.g., add/remove/replace tokens) then use [tokens_to_source] to turn it back into JSON5 source.
///
/// Unlike the tokenizing functions available in the [crate::tokenize] module, this function
/// produces owned [Token] objects containing (among other fields) an owned String of the lexeme,
/// rather than a [crate::tokenize::Tokens] struct.
///
/// Every returned token carries a [TokenContext]. The last token is always an `EOF`
/// token with an empty lexeme.
///
/// # Errors
///
/// Returns the [TokenizationError] reported by the underlying tokenizer when `text`
/// cannot be tokenized.
///
/// # Panics
///
/// Panics if the tokenizer output is internally inconsistent: the final span of a
/// non-empty document is not `EOF`, or a span does not describe a valid slice of `text`.
///
/// # Examples
///
/// ```rust
/// use json_five::{source_to_tokens, tokens_to_source};
/// use json_five::rt::tokenize::Token;
/// use json_five::tokenize::TokType::Whitespace;
///
/// let tokens = source_to_tokens(" {my: 'json5'} ").unwrap();
///
/// // remove all Whitespace tokens
/// let new_tokens:Vec<Token> = tokens.into_iter().filter(|tok| tok.tok_type != Whitespace).collect();
///
/// // turn tokens back into source
/// let new_source = tokens_to_source(&new_tokens);
/// assert_eq!(new_source, String::from("{my:'json5'}"))
/// ```
///
pub fn source_to_tokens(text: &str) -> Result<Vec<Token>, TokenizationError> {
    use crate::tokenize::tokenize_rt_str;
    // TODO: instead of going through the entire input to get the tokens
    //       and then going through it again to get context
    //       we should write a solution that does everything in one pass
    let tokens = tokenize_rt_str(text)?;
    let span_count = tokens.tok_spans.len();
    let mut source_tokens: Vec<Token> = Vec::with_capacity(span_count.max(1));

    // Empty doc: the tokenizer produced nothing but EOF.
    if span_count < 2 {
        source_tokens.push(Token {
            lexeme: String::new(),
            tok_type: TokType::EOF,
            context: Some(TokenContext {
                start_lineno: 1,
                start_colno: 1,
                start_byte_offset: 0,
                start_char_index: 0,
                end_byte_offset: 0,
            }),
        });
        return Ok(source_tokens);
    }

    // Running position in the document.
    let mut lineno: usize = 1; // line of the next character to be consumed
    let mut colno: usize = 0; // column of the last character consumed on `lineno` (0 = none yet)
    let mut char_index: usize = 0; // number of characters consumed so far

    // (line, column, char index) of the most recently consumed character, recorded
    // *before* a newline moves the position to the next line.
    let mut last_char_position: (usize, usize, usize) = (1, 1, 0);

    for (idx, span) in tokens.tok_spans.iter().enumerate() {
        if idx + 1 == span_count {
            // The final span closes the document and must be EOF.
            assert_eq!(span.1, TokType::EOF, "Unexpected end of document while token remaining {:?}", span);
            // NOTE(review): the EOF token's line/column/char index are those of the last
            // consumed character, while its byte offsets point one past the end of the
            // text. This mirrors the long-standing behavior of this function and is kept
            // for compatibility -- confirm whether "one past the last character" was intended.
            let (eof_lineno, eof_colno, eof_char_index) = last_char_position;
            source_tokens.push(Token {
                lexeme: String::new(),
                tok_type: TokType::EOF,
                context: Some(TokenContext {
                    start_lineno: eof_lineno,
                    start_colno: eof_colno,
                    start_byte_offset: span.0,
                    start_char_index: eof_char_index,
                    end_byte_offset: span.0,
                }),
            });
            break;
        }

        // Slice the lexeme straight out of the source instead of accumulating characters
        // until a buffer is "full": `String::with_capacity` only guarantees *at least*
        // the requested capacity, so capacity is not a reliable end-of-token marker.
        let lexeme = span_lexeme(text, span);
        let context = TokenContext {
            start_lineno: lineno,
            start_colno: colno + 1,
            start_byte_offset: span.0,
            start_char_index: char_index,
            end_byte_offset: span.2,
        };

        // Advance the running position over the token; tokens may span several lines.
        for ch in lexeme.chars() {
            colno += 1;
            last_char_position = (lineno, colno, char_index);
            char_index += 1;
            if ch == '\n' {
                lineno += 1;
                colno = 0;
            }
        }

        source_tokens.push(Token {
            lexeme: lexeme.to_string(),
            tok_type: span.1.clone(),
            context: Some(context),
        });
    }
    Ok(source_tokens)
}

/// Returns the part of `text` covered by `span`, panicking if the span is not a valid slice.
fn span_lexeme<'a>(text: &'a str, span: &TokenSpan) -> &'a str {
    text.get(span.0..span.2)
        .unwrap_or_else(|| panic!("token span {:?} is not a valid slice of the source text", span))
}
/// Generate a String from a sequence of [Token]s
///
/// This function is complementary with [source_to_tokens]. The typical workflow
/// is to use [source_to_tokens] to generate a Vec of tokens, do something to modify it (
/// e.g., add/remove/replace tokens) then use this function to turn it back into JSON5 source.
///
/// The output is the concatenation of every token's `lexeme`, in order. No other
/// field of the tokens is consulted, so tokens without context are handled the same
/// way as tokens read from a document. Any slice of tokens is accepted; a
/// `&Vec<Token>` can be passed as-is.
///
/// # Examples
///
/// ```rust
/// use json_five::{source_to_tokens, tokens_to_source};
/// use json_five::rt::tokenize::Token;
/// use json_five::tokenize::TokType::Whitespace;
///
/// let tokens = source_to_tokens(" {my: 'json5'} ").unwrap();
///
/// // remove all Whitespace tokens
/// let new_tokens:Vec<Token> = tokens.into_iter().filter(|tok| tok.tok_type != Whitespace).collect();
///
/// // turn tokens back into source
/// let new_source = tokens_to_source(&new_tokens);
/// assert_eq!(new_source, String::from("{my:'json5'}"))
/// ```
pub fn tokens_to_source(tokens: &[Token]) -> String {
    // Size the output once up front so appending never has to reallocate.
    let total_len: usize = tokens.iter().map(|tok| tok.lexeme.len()).sum();
    let mut source = String::with_capacity(total_len);
    source.extend(tokens.iter().map(|tok| tok.lexeme.as_str()));
    source
}
// Compile the tests only for test builds; otherwise the module (and its
// `use super::*`) is built into the library and reported as unused.
#[cfg(test)]
mod tests {
    use super::*;

    /// A compact document survives a tokenize -> serialize round trip unchanged.
    #[test]
    fn test() {
        let text = "{\"foo\":\"bar\"}";
        let tokens = source_to_tokens(text).unwrap();
        assert_eq!(text, tokens_to_source(&tokens));
    }

    /// Comments, newlines and whitespace are preserved by the round trip.
    #[test]
    fn test_rt() {
        let text = r#"// A JSON5 document
{my: "value",
// ^^^^^
// There is trailing whitespace above that will be formatted
another: "value"
} // there is also trailing whitespace at the end of the doc
"#;
        let tokens = source_to_tokens(text).unwrap();
        println!("{:?}", tokens);
        assert_eq!(text, tokens_to_source(&tokens));
    }

    /// An empty document still yields tokens (at least EOF) and round-trips to an empty string.
    #[test]
    fn test_empty_document() {
        let tokens = source_to_tokens("").unwrap();
        assert!(!tokens.is_empty());
        assert_eq!("", tokens_to_source(&tokens));
    }
}