use std::cmp;
use buffer::{Lexeme, Position, Token};
use syntect::parsing::{ParseState, ScopeStack, ScopeStackOp, SyntaxDefinition};
use util::LineIterator;
use unicode_segmentation::UnicodeSegmentation;
/// Iterator that turns source text into a stream of `Token`s, one line at a
/// time, using a syntect `ParseState` to assign a scope stack to each lexeme.
pub struct TokenIterator<'a> {
/// Scope stack in effect at the current offset; updated as each pending
/// scope event is applied.
scopes: ScopeStack,
/// Syntect parser state, fed one line at a time.
parser: ParseState,
/// Source of `(line number, line)` pairs for the underlying data.
lines: LineIterator<'a>,
/// Line currently being tokenized; `None` once it has been fully consumed
/// or when the input is exhausted.
current_line: Option<&'a str>,
/// Byte offset into `current_line` at which the next lexeme starts.
current_byte_offset: usize,
/// Position assigned to the next lexeme; its offset is advanced by the
/// grapheme count of each lexeme produced from a scope event.
current_position: Position,
/// Scope events still pending for `current_line`, stored in the reverse of
/// the order returned by the parser so that `pop()` yields them in the
/// parser's original order.
line_events: Vec<(usize, ScopeStackOp)>,
}
impl<'a> TokenIterator<'a> {
    /// Creates a token iterator over `data`, using `def` to drive the parser.
    ///
    /// The first line is parsed immediately, so the returned iterator is
    /// ready to produce its first token.
    pub fn new(data: &'a str, def: &SyntaxDefinition) -> TokenIterator<'a> {
        let mut token_iterator = TokenIterator {
            scopes: ScopeStack::new(),
            parser: ParseState::new(def),
            lines: LineIterator::new(data),
            current_line: None,
            current_byte_offset: 0,
            current_position: Position { line: 0, offset: 0 },
            line_events: Vec::new(),
        };

        token_iterator.parse_next_line();
        token_iterator
    }

    /// Returns the next token in the stream.
    ///
    /// Lexemes are produced from the current line until it is used up; the
    /// next line is then parsed and, if one exists, a `Token::Newline` is
    /// returned to mark the line break. Returns `None` when no lines remain.
    fn next_token(&mut self) -> Option<Token<'a>> {
        if let Some(token) = self.build_next_token() {
            return Some(token);
        }

        // The current line is spent; move on to the next one, if any.
        self.parse_next_line();
        self.current_line.map(|_| Token::Newline)
    }

    /// Builds the next lexeme from the current line, if any text remains.
    ///
    /// Pending scope events are applied in order. Whenever an event lies
    /// beyond the current byte offset, the text up to that event (excluding
    /// any trailing newline) is emitted as a lexeme carrying the scope stack
    /// that was in effect before the event is applied. Once the events run
    /// out, any remaining text is emitted as a final lexeme and the line is
    /// marked as consumed.
    ///
    /// Empty lexemes are never produced: an event positioned after the
    /// trailing newline, when all visible text has already been emitted, only
    /// advances the offset and updates the scope stack.
    fn build_next_token(&mut self) -> Option<Token<'a>> {
        let line = match self.current_line {
            Some(line) => line,
            None => return None,
        };

        // Lexemes never include the line's trailing newline.
        let end_of_line = if line.ends_with('\n') {
            line.len() - 1
        } else {
            line.len()
        };

        while let Some((event_offset, scope_change)) = self.line_events.pop() {
            let mut lexeme = None;

            // Several events can share one offset; text is only captured when
            // an event moves past the current offset, so the lexeme carries
            // every scope applied up to that point.
            if event_offset > self.current_byte_offset {
                let end_of_token = cmp::min(event_offset, end_of_line);

                // Clamping to the end of the line can leave nothing to emit
                // (the event sits after the newline); skip the lexeme then.
                if end_of_token > self.current_byte_offset {
                    let text = &line[self.current_byte_offset..end_of_token];
                    lexeme = Some(Token::Lexeme(Lexeme {
                        value: text,
                        scope: self.scopes.clone(),
                        position: self.current_position.clone(),
                    }));

                    // Positions are counted in grapheme clusters, not bytes.
                    self.current_position.offset += text.graphemes(true).count();
                }

                self.current_byte_offset = event_offset;
            }

            // The scope change takes effect for the text after this event.
            self.scopes.apply(&scope_change);

            if lexeme.is_some() {
                return lexeme;
            }
        }

        // No events remain, so whatever happens next this line is finished.
        self.current_line = None;

        if self.current_byte_offset < end_of_line {
            Some(Token::Lexeme(Lexeme {
                value: &line[self.current_byte_offset..end_of_line],
                scope: self.scopes.clone(),
                position: self.current_position.clone(),
            }))
        } else {
            None
        }
    }

    /// Advances to the next line and parses it.
    ///
    /// On success the line's scope events are stored in reverse order (so
    /// they can be consumed with `pop()`), and the byte offset and position
    /// are reset to the start of the new line. When no lines remain,
    /// `current_line` is cleared.
    fn parse_next_line(&mut self) {
        match self.lines.next() {
            Some((line_number, line)) => {
                let mut line_events = self.parser.parse_line(line);
                line_events.reverse();

                self.line_events = line_events;
                self.current_line = Some(line);
                self.current_position = Position { line: line_number, offset: 0 };
                self.current_byte_offset = 0;
            }
            None => self.current_line = None,
        }
    }
}
impl<'a> Iterator for TokenIterator<'a> {
type Item = Token<'a>;
fn next(&mut self) -> Option<Self::Item> {
self.next_token()
}
}
// Unit tests for `TokenIterator`.
//
// Each test builds the list of expected tokens by hand and compares it,
// index by index, against the tokens collected from the iterator. Only the
// expected tokens are checked: extra tokens produced by the iterator beyond
// the expected list are not detected, while too few tokens cause an
// out-of-bounds panic on `actual_tokens[index]`.
#[cfg(test)]
mod tests {
use super::TokenIterator;
use buffer::{Lexeme, Position, ScopeStack, Token};
use syntect::parsing::{Scope, SyntaxSet};
// Tokenizes a small multi-line Rust snippet and checks lexeme values,
// scope stacks, positions, and the newline tokens between lines.
#[test]
fn token_iterator_returns_correct_tokens() {
let syntax_set = SyntaxSet::load_defaults_newlines();
let def = syntax_set.find_syntax_by_extension("rs");
let iterator = TokenIterator::new("struct Buffer {\n// comment\n data: String\n}garbage\n\n", def.unwrap());
// `scope_stack` is mutated step by step to mirror the scope expected for
// each successive lexeme; the order of pushes and pops below matters.
let mut scope_stack = ScopeStack::new();
let mut expected_tokens = Vec::new();
// Line 0: `struct Buffer {`
scope_stack.push(Scope::new("source.rust").unwrap());
scope_stack.push(Scope::new("meta.struct.rust").unwrap());
scope_stack.push(Scope::new("storage.type.struct.rust").unwrap());
expected_tokens.push(Token::Lexeme(Lexeme{
value: "struct",
scope: scope_stack.clone(),
position: Position{ line: 0, offset: 0 }
}));
scope_stack.pop();
expected_tokens.push(Token::Lexeme(Lexeme{
value: " ",
scope: scope_stack.clone(),
position: Position{ line: 0, offset: 6 }
}));
scope_stack.push(Scope::new("entity.name.struct.rust").unwrap());
expected_tokens.push(Token::Lexeme(Lexeme{
value: "Buffer",
scope: scope_stack.clone(),
position: Position{ line: 0, offset: 7 }
}));
scope_stack.pop();
expected_tokens.push(Token::Lexeme(Lexeme{
value: " ",
scope: scope_stack.clone(),
position: Position{ line: 0, offset: 13 }
}));
scope_stack.push(Scope::new("meta.block.rust").unwrap());
scope_stack.push(Scope::new("punctuation.section.block.begin.rust").unwrap());
expected_tokens.push(Token::Lexeme(Lexeme{
value: "{",
scope: scope_stack.clone(),
position: Position{ line: 0, offset: 14 }
}));
// The trailing newline is not part of any lexeme; it shows up as its own token.
expected_tokens.push(Token::Newline);
// Line 1: `// comment`
scope_stack.pop();
scope_stack.push(Scope::new("comment.line.double-slash.rust").unwrap());
scope_stack.push(Scope::new("punctuation.definition.comment.rust").unwrap());
expected_tokens.push(Token::Lexeme(Lexeme{
value: "//",
scope: scope_stack.clone(),
position: Position{ line: 1, offset: 0 }
}));
scope_stack.pop();
expected_tokens.push(Token::Lexeme(Lexeme{
value: " comment",
scope: scope_stack.clone(),
position: Position{ line: 1, offset: 2 }
}));
expected_tokens.push(Token::Newline);
// Line 2: ` data: String`
scope_stack.pop();
expected_tokens.push(Token::Lexeme(Lexeme{
value: " ",
scope: scope_stack.clone(),
position: Position{ line: 2, offset: 0 }
}));
scope_stack.push(Scope::new("variable.other.member.rust").unwrap());
expected_tokens.push(Token::Lexeme(Lexeme{
value: "data",
scope: scope_stack.clone(),
position: Position{ line: 2, offset: 2 }
}));
scope_stack.pop();
scope_stack.push(Scope::new("punctuation.separator.rust").unwrap());
expected_tokens.push(Token::Lexeme(Lexeme{
value: ":",
scope: scope_stack.clone(),
position: Position{ line: 2, offset: 6 }
}));
scope_stack.pop();
expected_tokens.push(Token::Lexeme(Lexeme{
value: " String",
scope: scope_stack.clone(),
position: Position{ line: 2, offset: 7 }
}));
expected_tokens.push(Token::Newline);
// Line 3: `}garbage`
scope_stack.push(Scope::new("punctuation.section.block.end.rust").unwrap());
expected_tokens.push(Token::Lexeme(Lexeme{
value: "}",
scope: scope_stack.clone(),
position: Position{ line: 3, offset: 0 }
}));
// Drop back to the bare `source.rust` scope for the text after the struct.
scope_stack.pop();
scope_stack.pop();
scope_stack.pop();
expected_tokens.push(Token::Lexeme(Lexeme{
value: "garbage",
scope: scope_stack.clone(),
position: Position{ line: 3, offset: 1 }
}));
// The input ends with two consecutive newlines.
expected_tokens.push(Token::Newline);
expected_tokens.push(Token::Newline);
let actual_tokens: Vec<Token> = iterator.collect();
for (index, token) in expected_tokens.into_iter().enumerate() {
assert_eq!(token, actual_tokens[index]);
}
}
// Input with no trailing newline must still yield its final lexeme.
#[test]
fn token_iterator_handles_content_without_trailing_newline() {
let syntax_set = SyntaxSet::load_defaults_newlines();
let def = syntax_set.find_syntax_plain_text();
let iterator = TokenIterator::new("struct", def);
let mut expected_tokens = Vec::new();
expected_tokens.push(
Token::Lexeme(Lexeme{
value: "struct",
scope: ScopeStack::from_vec(vec![
Scope::new("text.plain").unwrap(),
]),
position: Position{ line: 0, offset: 0 }
})
);
let actual_tokens: Vec<Token> = iterator.collect();
for (index, token) in expected_tokens.into_iter().enumerate() {
assert_eq!(token, actual_tokens[index]);
}
}
// A multi-byte character must advance the position offset by one, not by
// its byte length: the lexeme after `€` is expected at offset 1.
#[test]
fn token_iterator_handles_unicode_characters() {
let syntax_set = SyntaxSet::load_defaults_newlines();
let def = syntax_set.find_syntax_by_extension("rs");
let iterator = TokenIterator::new("€16", def.unwrap());
let mut scope_stack = ScopeStack::new();
let mut expected_tokens = Vec::new();
scope_stack.push(Scope::new("source.rust").unwrap());
expected_tokens.push(Token::Lexeme(Lexeme{
value: "€",
scope: scope_stack.clone(),
position: Position{ line: 0, offset: 0 }
}));
scope_stack.push(Scope::new("constant.numeric.integer.decimal.rust").unwrap());
expected_tokens.push(Token::Lexeme(Lexeme{
value: "16",
scope: scope_stack.clone(),
position: Position{ line: 0, offset: 1 }
}));
let actual_tokens: Vec<Token> = iterator.collect();
for (index, token) in expected_tokens.into_iter().enumerate() {
assert_eq!(token, actual_tokens[index]);
}
}
}