use crate::buffer::{Lexeme, Position, Token};
use crate::errors::*;
use crate::util::LineIterator;
use std::cmp;
use syntect::parsing::{ParseState, ScopeStack, ScopeStackOp, SyntaxReference, SyntaxSet};
use unicode_segmentation::UnicodeSegmentation;
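
/// Translates buffer contents into a stream of `Token`s: `Token::Lexeme`
/// values carry a slice of the original data along with its scope stack and
/// position, and `Token::Newline` marks each line boundary.
///
/// A minimal usage sketch, not compiled as a doctest; it assumes syntect's
/// bundled defaults include a Rust syntax definition:
///
/// ```ignore
/// use syntect::parsing::SyntaxSet;
///
/// let syntaxes = SyntaxSet::load_defaults_newlines();
/// let def = syntaxes.find_syntax_by_extension("rs").unwrap();
/// let mut tokens = TokenIterator::new("struct Buffer {}\n", def, &syntaxes).unwrap();
///
/// for token in &mut tokens {
///     // Render or inspect each token here.
/// }
///
/// // Errors don't surface through `next`; check for them afterwards.
/// assert!(tokens.error.is_none());
/// ```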
pub struct TokenIterator<'a> {
scopes: ScopeStack,
parser: ParseState,
lines: LineIterator<'a>,
current_line: Option<&'a str>,
current_byte_offset: usize,
current_position: Position,
line_events: Vec<(usize, ScopeStackOp)>,
syntaxes: &'a SyntaxSet,
pub error: Option<Error>,
}

impl<'a> TokenIterator<'a> {
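    /// Builds an iterator over `data`, eagerly parsing the first line so
    /// iteration can begin immediately; fails if that initial parse does.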
pub fn new(
data: &'a str,
def: &'a SyntaxReference,
syntaxes: &'a SyntaxSet,
) -> Result<TokenIterator<'a>> {
let mut token_iterator = TokenIterator {
scopes: ScopeStack::new(),
parser: ParseState::new(def),
lines: LineIterator::new(data),
current_line: None,
current_byte_offset: 0,
current_position: Position { line: 0, offset: 0 },
line_events: Vec::new(),
syntaxes,
error: None,
};
token_iterator.parse_next_line()?;
Ok(token_iterator)
}
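
    // `Iterator::next` can't return a `Result`, so errors are stashed in
    // `self.error` and the token stream simply ends; callers can inspect
    // the field once iteration is complete.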
    fn next_token(&mut self) -> Option<Token<'a>> {
        match self.build_next_token() {
            Ok(Some(token)) => return Some(token),
            Ok(None) => (),
            Err(e) => {
                self.error = Some(e);
                return None;
            }
        }

        // The current line is exhausted; advance to the next one.
        if let Err(e) = self.parse_next_line() {
            self.error = Some(e);
            return None;
        }

        // Another line means the previous one ended with a line break,
        // which we surface as an explicit token.
        if self.current_line.is_some() {
            Some(Token::Newline)
        } else {
            None
        }
    }
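
    // Produces the next lexeme on the current line by draining its scope
    // events: the text between the current byte offset and the next event
    // becomes a `Token::Lexeme` tagged with the scopes in effect at its start.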
fn build_next_token(&mut self) -> Result<Option<Token<'a>>> {
let mut lexeme = None;
if let Some(line) = self.current_line {
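            // Exclude the trailing newline, if any, from emitted lexemes.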
let end_of_line = if line.ends_with('\n') {
line.len() - 1
} else {
line.len()
};
            // Events were reversed in `parse_next_line`, so `pop` yields
            // them in left-to-right order.
            while let Some((event_offset, scope_change)) = self.line_events.pop() {
                if event_offset > self.current_byte_offset {
                    // Never capture the trailing newline as part of a lexeme.
                    let end_of_token = cmp::min(event_offset, end_of_line);

                    // Guard against events at or beyond the end of the line
                    // producing an empty lexeme.
                    if end_of_token > self.current_byte_offset {
                        lexeme = Some(Token::Lexeme(Lexeme {
                            value: &line[self.current_byte_offset..end_of_token],
                            scope: self.scopes.clone(),
                            position: self.current_position,
                        }));

                        // Positions advance by grapheme, not by byte, so
                        // multi-byte characters occupy a single offset.
                        self.current_position.offset += line[self.current_byte_offset..end_of_token]
                            .graphemes(true)
                            .count();
                    }

                    self.current_byte_offset = event_offset;
                }

                // Apply the scope change before returning so the next lexeme
                // starts from an up-to-date scope stack.
                self.scopes.apply(&scope_change)?;

                if lexeme.is_some() {
                    return Ok(lexeme);
                }
            }
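
            // Emit any text remaining after the line's final scope event.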
if self.current_byte_offset < end_of_line {
lexeme = Some(Token::Lexeme(Lexeme {
value: &line[self.current_byte_offset..end_of_line],
scope: self.scopes.clone(),
position: self.current_position,
}));
}
}
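
        // The line is fully consumed; clear it so the next call advances
        // to the following one.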
self.current_line = None;
Ok(lexeme)
}
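
    // Advances to the next line, if any, storing its scope events and
    // resetting per-line state; clears `current_line` at end of input.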
fn parse_next_line(&mut self) -> Result<()> {
if let Some((line_number, line)) = self.lines.next() {
let mut line_events = self.parser.parse_line(line, self.syntaxes)?;
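            // Reverse the events so they can be consumed in order via `pop`.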
line_events.reverse();
self.line_events = line_events;
self.current_line = Some(line);
self.current_position = Position {
line: line_number,
offset: 0,
};
self.current_byte_offset = 0;
} else {
self.current_line = None;
}
Ok(())
}
}
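
// Failure is signalled through the `error` field rather than the iterator
// protocol: on error, `next` stores it and ends the stream.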
impl<'a> Iterator for TokenIterator<'a> {
type Item = Token<'a>;
fn next(&mut self) -> Option<Self::Item> {
self.next_token()
}
}

#[cfg(test)]
mod tests {
use super::TokenIterator;
use crate::buffer::{Lexeme, Position, ScopeStack, Token};
use syntect::parsing::{Scope, SyntaxSet};

    #[test]
fn token_iterator_returns_correct_tokens() {
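        // Walk a small Rust snippet, checking each lexeme's value, scope
        // stack, and position, including recovery on the stray "garbage"
        // after the struct's closing brace.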
let syntax_set = SyntaxSet::load_defaults_newlines();
let def = syntax_set.find_syntax_by_extension("rs");
let iterator = TokenIterator::new(
"struct Buffer {\n// comment\n data: String\n}garbage\n\n",
def.unwrap(),
&syntax_set,
)
.unwrap();
let mut scope_stack = ScopeStack::new();
let mut expected_tokens = Vec::new();
scope_stack.push(Scope::new("source.rust").unwrap());
scope_stack.push(Scope::new("meta.struct.rust").unwrap());
scope_stack.push(Scope::new("storage.type.struct.rust").unwrap());
expected_tokens.push(Token::Lexeme(Lexeme {
value: "struct",
scope: scope_stack.clone(),
position: Position { line: 0, offset: 0 },
}));
scope_stack.pop();
expected_tokens.push(Token::Lexeme(Lexeme {
value: " ",
scope: scope_stack.clone(),
position: Position { line: 0, offset: 6 },
}));
scope_stack.push(Scope::new("entity.name.struct.rust").unwrap());
expected_tokens.push(Token::Lexeme(Lexeme {
value: "Buffer",
scope: scope_stack.clone(),
position: Position { line: 0, offset: 7 },
}));
scope_stack.pop();
expected_tokens.push(Token::Lexeme(Lexeme {
value: " ",
scope: scope_stack.clone(),
position: Position {
line: 0,
offset: 13,
},
}));
scope_stack.push(Scope::new("meta.block.rust").unwrap());
scope_stack.push(Scope::new("punctuation.section.block.begin.rust").unwrap());
expected_tokens.push(Token::Lexeme(Lexeme {
value: "{",
scope: scope_stack.clone(),
position: Position {
line: 0,
offset: 14,
},
}));
expected_tokens.push(Token::Newline);
scope_stack.pop();
scope_stack.push(Scope::new("comment.line.double-slash.rust").unwrap());
scope_stack.push(Scope::new("punctuation.definition.comment.rust").unwrap());
expected_tokens.push(Token::Lexeme(Lexeme {
value: "//",
scope: scope_stack.clone(),
position: Position { line: 1, offset: 0 },
}));
scope_stack.pop();
expected_tokens.push(Token::Lexeme(Lexeme {
value: " comment",
scope: scope_stack.clone(),
position: Position { line: 1, offset: 2 },
}));
expected_tokens.push(Token::Newline);
scope_stack.pop();
expected_tokens.push(Token::Lexeme(Lexeme {
value: " ",
scope: scope_stack.clone(),
position: Position { line: 2, offset: 0 },
}));
scope_stack.push(Scope::new("variable.other.member.rust").unwrap());
expected_tokens.push(Token::Lexeme(Lexeme {
value: "data",
scope: scope_stack.clone(),
position: Position { line: 2, offset: 2 },
}));
scope_stack.pop();
scope_stack.push(Scope::new("punctuation.separator.rust").unwrap());
expected_tokens.push(Token::Lexeme(Lexeme {
value: ":",
scope: scope_stack.clone(),
position: Position { line: 2, offset: 6 },
}));
scope_stack.pop();
expected_tokens.push(Token::Lexeme(Lexeme {
value: " String",
scope: scope_stack.clone(),
position: Position { line: 2, offset: 7 },
}));
expected_tokens.push(Token::Newline);
scope_stack.push(Scope::new("punctuation.section.block.end.rust").unwrap());
expected_tokens.push(Token::Lexeme(Lexeme {
value: "}",
scope: scope_stack.clone(),
position: Position { line: 3, offset: 0 },
}));
scope_stack.pop();
scope_stack.pop();
scope_stack.pop();
expected_tokens.push(Token::Lexeme(Lexeme {
value: "garbage",
scope: scope_stack.clone(),
position: Position { line: 3, offset: 1 },
}));
expected_tokens.push(Token::Newline);
expected_tokens.push(Token::Newline);
let actual_tokens: Vec<Token> = iterator.collect();
for (index, token) in expected_tokens.into_iter().enumerate() {
assert_eq!(token, actual_tokens[index]);
}
}

    #[test]
fn token_iterator_handles_content_without_trailing_newline() {
let syntax_set = SyntaxSet::load_defaults_newlines();
let def = syntax_set.find_syntax_plain_text();
let iterator = TokenIterator::new("struct", def, &syntax_set).unwrap();
let mut expected_tokens = Vec::new();
expected_tokens.push(Token::Lexeme(Lexeme {
value: "struct",
scope: ScopeStack::from_vec(vec![Scope::new("text.plain").unwrap()]),
position: Position { line: 0, offset: 0 },
}));
let actual_tokens: Vec<Token> = iterator.collect();
for (index, token) in expected_tokens.into_iter().enumerate() {
assert_eq!(token, actual_tokens[index]);
}
}

    #[test]
fn token_iterator_handles_unicode_characters() {
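        // "€" is three bytes but one grapheme, so the following token
        // starts at offset 1, not 3.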
let syntax_set = SyntaxSet::load_defaults_newlines();
let def = syntax_set.find_syntax_by_extension("rs");
let iterator = TokenIterator::new("€16", def.unwrap(), &syntax_set).unwrap();
let mut scope_stack = ScopeStack::new();
let mut expected_tokens = Vec::new();
scope_stack.push(Scope::new("source.rust").unwrap());
expected_tokens.push(Token::Lexeme(Lexeme {
value: "€",
scope: scope_stack.clone(),
position: Position { line: 0, offset: 0 },
}));
scope_stack.push(Scope::new("constant.numeric.integer.decimal.rust").unwrap());
expected_tokens.push(Token::Lexeme(Lexeme {
value: "16",
scope: scope_stack.clone(),
position: Position { line: 0, offset: 1 },
}));
let actual_tokens: Vec<Token> = iterator.collect();
for (index, token) in expected_tokens.into_iter().enumerate() {
assert_eq!(token, actual_tokens[index]);
}
}
}