#[macro_use]
extern crate log;
pub mod error;
mod manual_scanner;
mod tokenizer;
pub mod tokens;
pub use crate::tokenizer::{JSBuffer, Tokenizer};
pub mod prelude {
pub use super::{
tokenize, tokens::prelude::*, Item, OpenCurlyKind, Position, Scanner, ScannerState,
SourceLocation,
};
}
use crate::{tokenizer::RawKeyword, tokens::prelude::*};
use error::{Error, RawError};
pub use manual_scanner::{ManualScanner, ScannerState as ManualState};
type Res<T> = Result<T, Error>;
mod look_behind;
use look_behind::{Brace, LookBehind, MetaToken, Paren};
/// Scan all of `text` and return just the tokens it contains.
///
/// A [`Scanner`] is driven over the input; the span and location attached to
/// every scanned item are discarded and only the token is kept.
///
/// # Errors
///
/// Returns the first error the scanner yields. Tokens scanned before that
/// point are dropped.
pub fn tokenize(text: &str) -> Res<Vec<Token<&str>>> {
    let mut tokens = Vec::new();
    for item in Scanner::new(text) {
        tokens.push(item?.token);
    }
    Ok(tokens)
}
/// The line/column range an item covers in the source text.
///
/// `start` and `end` are both [`Position`]s. The scanner's own `locations`
/// test expects a one character token at the start of the input to run from
/// `1:1` to `1:2`.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct SourceLocation {
    /// Position at which the item begins.
    pub start: Position,
    /// Position at which the item ends.
    pub end: Position,
}
impl SourceLocation {
    /// Build a location from its two end points.
    #[inline]
    pub const fn new(start: Position, end: Position) -> Self {
        SourceLocation { start, end }
    }
}
/// A line/column pair identifying a single point in the source text.
///
/// Positions are ordered by line first and by column second, and are
/// displayed as `line:column`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Position {
    /// Line number of the point.
    pub line: usize,
    /// Column number of the point within its line.
    pub column: usize,
}
impl Position {
    /// Build a position from a line and a column.
    #[inline]
    pub const fn new(line: usize, column: usize) -> Self {
        Position { line, column }
    }
}
impl ::std::fmt::Display for Position {
    /// Writes the position as `line:column`.
    fn fmt(&self, f: &mut ::std::fmt::Formatter) -> ::std::fmt::Result {
        let Position { line, column } = self;
        write!(f, "{}:{}", line, column)
    }
}
impl ::std::cmp::Ord for Position {
    /// Lexicographic comparison: the line decides, the column breaks ties.
    fn cmp(&self, other: &Self) -> ::std::cmp::Ordering {
        (self.line, self.column).cmp(&(other.line, other.column))
    }
}
impl ::std::cmp::PartialOrd for Position {
    /// Positions are totally ordered, so this always returns `Some`.
    fn partial_cmp(&self, other: &Self) -> Option<::std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}
/// A `start..end` index range into the source text.
///
/// The scanner's tests slice the original `&str` with `span.start..span.end`,
/// so both ends are byte indexes and `end` is exclusive.
#[derive(Debug, PartialEq, Clone, Copy)]
pub struct Span {
    /// Index at which the range begins.
    pub start: usize,
    /// Index at which the range ends.
    pub end: usize,
}
// A span has a length but no `is_empty`; silence the clippy lint that asks
// for the pair.
#[allow(clippy::len_without_is_empty)]
impl Span {
    /// Build a span from its two indexes.
    #[inline]
    pub const fn new(start: usize, end: usize) -> Self {
        Span { start, end }
    }
    /// Number of indexes covered, computed as `end - start`.
    ///
    /// The subtraction is unchecked: a span whose `end` is smaller than its
    /// `start` overflows.
    #[inline]
    pub const fn len(self) -> usize {
        let Span { start, end } = self;
        end - start
    }
}
/// A single scanned token together with where it was found.
#[derive(Clone, Debug, PartialEq)]
pub struct Item<T> {
    /// The token itself.
    pub token: Token<T>,
    /// Index range of the token in the source text.
    pub span: Span,
    /// Line/column range of the token in the source text.
    pub location: SourceLocation,
}
impl<T> Item<T> {
    /// Bundle a token with its span and location.
    pub fn new(token: Token<T>, span: Span, location: SourceLocation) -> Self {
        Item {
            token,
            span,
            location,
        }
    }
    /// Same as [`Item::new`], but takes the span and location as loose
    /// numbers and assembles the `Span`/`SourceLocation` itself.
    fn new_(
        token: Token<T>,
        span_start: usize,
        span_end: usize,
        loc_start_line: usize,
        loc_start_col: usize,
        loc_end_line: usize,
        loc_end_col: usize,
    ) -> Self {
        let span = Span::new(span_start, span_end);
        let start = Position::new(loc_start_line, loc_start_col);
        let end = Position::new(loc_end_line, loc_end_col);
        Self::new(token, span, SourceLocation::new(start, end))
    }
    /// `true` when the token is a `Token::String`.
    pub fn is_string(&self) -> bool {
        matches!(&self.token, Token::String(_))
    }
    /// `true` when the token is `Token::EoF`.
    pub fn is_eof(&self) -> bool {
        matches!(&self.token, Token::EoF)
    }
    /// `true` when the token is any part of a template literal: head, body
    /// or tail.
    pub fn is_template(&self) -> bool {
        let token = &self.token;
        token.is_template_head() || token.is_template_body() || token.is_template_tail()
    }
}
/// Iterator over the tokens of a piece of JavaScript text.
///
/// Wraps a [`ManualScanner`] and keeps the extra bookkeeping (the last three
/// meaningful tokens plus stacks of open parens and braces) that is used to
/// decide whether a `/` starts a regular expression.
pub struct Scanner<'a> {
    /// The lower level scanner that actually produces tokens.
    manual_scanner: ManualScanner<'a>,
    /// The full text being scanned; used to compute error positions.
    original: &'a str,
    /// Set once an error has been reported; the iterator then yields `None`.
    errored: bool,
    /// The most recent non-comment tokens.
    last_three: LookBehind,
    /// Currently open `{`s.
    brace_stack: Vec<Brace>,
    /// Currently open `(`s.
    paren_stack: Vec<Paren>,
}
impl<'a> Scanner<'a> {
    /// Create a scanner positioned at the start of `text`.
    pub fn new(text: &'a str) -> Self {
        let manual_scanner = ManualScanner::new(text);
        Scanner {
            manual_scanner,
            original: text,
            errored: false,
            last_three: LookBehind::new(),
            brace_stack: Vec::new(),
            paren_stack: Vec::new(),
        }
    }
}
/// Iterating a scanner consumes the source one token at a time.
impl<'src> Iterator for Scanner<'src> {
    type Item = Res<Item<&'src str>>;
    /// Scan the next token and move the cursor past it.
    fn next(&mut self) -> Option<Self::Item> {
        let advance_cursor = true;
        self.get_next_token(advance_cursor)
    }
}
impl<'b> Scanner<'b> {
/// Peek at the next item without consuming it.
///
/// Delegates to `get_next_token(false)`, which scans one token and then
/// puts the underlying scanner back where it was.
pub fn look_ahead(&mut self) -> Option<Res<Item<&'b str>>> {
self.get_next_token(false)
}
/// Ask the underlying `ManualScanner` to skip comments, logging a debug
/// message under the `ress` target first.
///
/// # Errors
///
/// Whatever error the underlying `skip_comments` call reports.
pub fn skip_comments(&mut self) -> Res<()> {
debug!(target: "ress", "skipping comments");
self.manual_scanner.skip_comments()
}
/// Take a snapshot of the scanner that can later be handed to `set_state`.
///
/// The snapshot holds the underlying scanner's state, the look-behind
/// buffer and the paren stack. The brace stack and the `errored` flag are
/// not part of it.
pub fn get_state(&self) -> ScannerState {
    let manual_state = self.manual_scanner.get_state();
    let last_three = self.last_three.clone();
    let paren_stack = self.paren_stack.clone();
    ScannerState {
        manual_state,
        last_three,
        paren_stack,
    }
}
#[inline]
pub fn set_state(&mut self, state: ScannerState) {
let ScannerState {
manual_state,
last_three,
paren_stack,
} = state;
self.last_three = last_three;
self.paren_stack = paren_stack;
self.manual_scanner.set_state(manual_state);
}
/// Scan one token, optionally leaving the cursor where it was.
///
/// Shared implementation of `next` (`advance_cursor == true`) and
/// `look_ahead` (`advance_cursor == false`).
///
/// Returns `None` once an error has been reported, once the underlying
/// scanner is at end of file, or when the underlying scanner itself yields
/// `None`. Otherwise returns the scanned item or the error that occurred.
#[inline]
fn get_next_token(&mut self, advance_cursor: bool) -> Option<Res<Item<&'b str>>> {
// Once an error has been handed out the iterator is finished.
if self.errored {
return None;
}
if self.manual_scanner.eof {
debug!("end of iterator, returning None");
return None;
};
// Remember where the underlying scanner is so a look-ahead can rewind.
let state = self.manual_scanner.get_state();
// `?` ends the iteration when the underlying scanner has nothing left.
let next = match self.manual_scanner.next_token()? {
Ok(n) => n,
Err(e) => {
self.errored = true;
return Some(Err(e));
}
};
// The underlying scanner reports a division punctuator first; when the
// look-behind says a regular expression can start here, it is asked to
// re-scan as a regex, given the length of what it already consumed.
let ret = if next.token.is_div_punct() && self.is_regex_start() {
self.manual_scanner.next_regex(next.span.len())?
} else {
Ok(next)
};
if advance_cursor {
// Only consumed tokens are recorded in the look-behind bookkeeping.
if let Ok(i) = &ret {
if let Err(e) = self.keep_books(&i) {
return Some(Err(e));
}
}
} else {
// Look-ahead: rewind the underlying scanner.
self.manual_scanner.set_state(state);
}
Some(ret)
}
/// Record a consumed item in the look-behind bookkeeping.
///
/// Parens and braces are routed to their dedicated handlers, comments are
/// ignored, and every other token is pushed onto the look-behind buffer
/// together with the underlying scanner's current new line count.
///
/// # Errors
///
/// Fails when a `)` or `}` has no matching opener.
#[inline]
fn keep_books(&mut self, item: &Item<&'b str>) -> Res<()> {
    let line = self.manual_scanner.new_line_count as u32;
    match &item.token {
        Token::Punct(Punct::OpenParen) => self.handle_open_paren_books(),
        Token::Punct(Punct::OpenBrace) => self.handle_open_brace_books(),
        Token::Punct(Punct::CloseParen) => self.handle_close_paren_books(item.span.start)?,
        Token::Punct(Punct::CloseBrace) => self.handle_close_brace_books(item.span.start)?,
        Token::Punct(_) => self.last_three.push((&item.token, line).into()),
        other if !other.is_comment() => self.last_three.push((other, line).into()),
        _ => {}
    }
    Ok(())
}
/// Bookkeeping for a consumed `(`.
///
/// Works out two facts about the paren from the tokens before it and
/// stores them on both the paren stack and the look-behind buffer:
///
/// * `func_expr` - derived from a `function` keyword one or two tokens
///   back and from whether the token before that keyword passes
///   `check_for_expression`.
/// * `conditional` - whether the previous token is `if`, `for`, `while` or
///   `with`.
///
/// NOTE(review): the anonymous case (`function (`) negates the
/// `check_for_expression` result while the named case (`function name (`)
/// does not. That asymmetry is kept exactly as it was; confirm it is
/// intended.
#[inline]
fn handle_open_paren_books(&mut self) {
    let func_expr = match (self.last_three.one(), self.last_three.two()) {
        // `function (` - look at the token before `function`.
        (Some(MetaToken::Keyword(RawKeyword::Function, _)), Some(before)) => {
            !Self::check_for_expression(*before)
        }
        (Some(MetaToken::Keyword(RawKeyword::Function, _)), _) => false,
        // `function name (` - look one token further back.
        (_, Some(MetaToken::Keyword(RawKeyword::Function, _))) => {
            match self.last_three.three() {
                Some(before) => Self::check_for_expression(*before),
                None => false,
            }
        }
        _ => false,
    };
    let conditional = matches!(
        self.last_three.one(),
        Some(prev) if Self::check_token_for_conditional(*prev)
    );
    let paren = Paren {
        func_expr,
        conditional,
    };
    self.paren_stack.push(paren);
    self.last_three.push(MetaToken::OpenParen(paren));
}
#[inline]
fn handle_open_brace_books(&mut self) {
let is_block = if let Some(last) = self.last_three.one() {
match last {
MetaToken::Punct(Punct::OpenParen)
| MetaToken::Punct(Punct::OpenBracket)
| MetaToken::OpenParen(_)
| MetaToken::OpenBrace(_, _) => false,
MetaToken::Punct(Punct::Colon) => {
if let Some(parent) = self.brace_stack.last() {
parent.is_block
} else {
false
}
}
MetaToken::Punct(_) => !Self::is_op(*last),
MetaToken::Keyword(RawKeyword::Return, line)
| MetaToken::Keyword(RawKeyword::Yield, line) => {
if let Some(last) = self.last_three.two() {
last.line_number() != *line
} else {
false
}
}
MetaToken::Keyword(RawKeyword::Case, _) => false,
MetaToken::Keyword(_, _) => !Self::is_op(*last),
_ => true,
}
} else {
true
};
let paren = if let Some(MetaToken::CloseParen(open)) = self.last_three.one() {
Some(*open)
} else {
None
};
let brace = look_behind::Brace { is_block, paren };
self.brace_stack.push(brace);
self.last_three.push(MetaToken::OpenBrace(
brace,
self.manual_scanner.new_line_count as u32,
));
}
#[inline]
fn handle_close_paren_books(&mut self, start: usize) -> Res<()> {
let paren = if let Some(paren) = self.paren_stack.pop() {
paren
} else {
self.errored = true;
return self.error(RawError {
idx: start,
msg: "Unmatched open close paren".to_string(),
});
};
self.last_three.push(MetaToken::CloseParen(paren));
Ok(())
}
/// Bookkeeping for a consumed `}`.
///
/// Pops the matching `{` off the brace stack and records the closing brace,
/// carrying the opener's facts, in the look-behind buffer.
///
/// # Errors
///
/// When the brace stack is empty the scanner is marked as errored and an
/// error located at `start` is returned. Marking the scanner as errored
/// mirrors what the unmatched `)` path does, so the iterator stops after
/// reporting the error instead of carrying on with an inconsistent brace
/// stack.
#[inline]
fn handle_close_brace_books(&mut self, start: usize) -> Res<()> {
    match self.brace_stack.pop() {
        Some(open) => {
            self.last_three.push(MetaToken::CloseBrace(open));
            Ok(())
        }
        None => {
            self.errored = true;
            self.error(RawError {
                idx: start,
                msg: "unmatched close brace".to_string(),
            })
        }
    }
}
/// Decide whether a `/` at the current position starts a regular
/// expression rather than being a division operator.
///
/// The decision is based only on the most recent look-behind token:
///
/// * nothing yet - regex
/// * any keyword except `this` - regex
/// * any punctuator except `]` - regex
/// * `)` - regex only when the paren belonged to a conditional
/// * `}` - regex only when the brace closed a block that was not preceded
///   by a paren flagged as a function expression
/// * `(` or `{` - regex
/// * anything else - not a regex
fn is_regex_start(&self) -> bool {
    let prev = match self.last_three.one() {
        Some(prev) => prev,
        None => return true,
    };
    match prev {
        MetaToken::Keyword(keyword, _) => !matches!(keyword, RawKeyword::This),
        MetaToken::Punct(punct) => !matches!(punct, Punct::CloseBracket),
        MetaToken::CloseParen(open) => open.conditional,
        MetaToken::CloseBrace(close) => {
            close.is_block
                && match &close.paren {
                    Some(open) => !open.func_expr,
                    None => true,
                }
        }
        MetaToken::OpenParen(_) | MetaToken::OpenBrace(_, _) => true,
        _ => false,
    }
}
/// `true` when `tok` is one of the keywords `if`, `for`, `while` or `with`.
fn check_token_for_conditional(tok: MetaToken) -> bool {
    matches!(
        tok,
        MetaToken::Keyword(RawKeyword::If, _)
            | MetaToken::Keyword(RawKeyword::For, _)
            | MetaToken::Keyword(RawKeyword::While, _)
            | MetaToken::Keyword(RawKeyword::With, _)
    )
}
/// `true` when `token` is an operator (see `is_op`) or one of the keywords
/// `return` and `case`.
fn check_for_expression(token: MetaToken) -> bool {
    Self::is_op(token)
        || matches!(
            token,
            MetaToken::Keyword(RawKeyword::Return, _) | MetaToken::Keyword(RawKeyword::Case, _)
        )
}
/// `true` when `tok` is one of the punctuators or keywords this scanner
/// treats as an operator.
///
/// The accepted punctuators are grouped below purely for readability; the
/// set is the union of all groups. Any token that is neither a punctuator
/// nor a keyword is never an operator.
fn is_op(tok: MetaToken) -> bool {
    match tok {
        MetaToken::Punct(punct) => {
            let assignment = matches!(
                punct,
                Punct::Equal
                    | Punct::PlusEqual
                    | Punct::DashEqual
                    | Punct::AsteriskEqual
                    | Punct::ForwardSlashEqual
                    | Punct::PercentEqual
                    | Punct::DoubleLessThanEqual
                    | Punct::DoubleGreaterThanEqual
                    | Punct::TripleGreaterThanEqual
                    | Punct::AmpersandEqual
                    | Punct::PipeEqual
                    | Punct::CaretEqual
            );
            let separator = matches!(punct, Punct::Comma | Punct::QuestionMark | Punct::Colon);
            let arithmetic = matches!(
                punct,
                Punct::Plus
                    | Punct::Dash
                    | Punct::Asterisk
                    | Punct::ForwardSlash
                    | Punct::Percent
            );
            let bitwise = matches!(
                punct,
                Punct::DoubleLessThan
                    | Punct::DoubleGreaterThan
                    | Punct::TripleGreaterThan
                    | Punct::Ampersand
                    | Punct::Pipe
                    | Punct::Caret
                    | Punct::Tilde
            );
            let logical = matches!(
                punct,
                Punct::DoubleAmpersand | Punct::DoublePipe | Punct::Bang
            );
            let comparison = matches!(
                punct,
                Punct::TripleEqual
                    | Punct::DoubleEqual
                    | Punct::GreaterThanEqual
                    | Punct::LessThanEqual
                    | Punct::LessThan
                    | Punct::GreaterThan
                    | Punct::BangEqual
                    | Punct::BangDoubleEqual
            );
            let update = matches!(punct, Punct::DoublePlus | Punct::DoubleDash);
            assignment || separator || arithmetic || bitwise || logical || comparison || update
        }
        MetaToken::Keyword(keyword, _) => matches!(
            keyword,
            RawKeyword::InstanceOf
                | RawKeyword::In
                | RawKeyword::Delete
                | RawKeyword::Void
                | RawKeyword::TypeOf
                | RawKeyword::Throw
                | RawKeyword::New
        ),
        _ => false,
    }
}
/// Owned text for `span`, as reported by the underlying `ManualScanner`.
///
/// The `get_str` test expects this to equal the source text of the token
/// the span came from. Presumably `None` means the span does not fit the
/// source; confirm against `ManualScanner::string_for`.
pub fn string_for(&self, span: &Span) -> Option<String> {
self.manual_scanner.string_for(span)
}
/// Borrowed text for `span`, as reported by the underlying `ManualScanner`.
///
/// The returned slice lives as long as the scanned source (`'b`).
/// Presumably `None` means the span does not fit the source; confirm
/// against `ManualScanner::str_for`.
pub fn str_for(&self, span: &Span) -> Option<&'b str> {
self.manual_scanner.str_for(span)
}
/// Translate a byte index into the source text into a `(line, column)` pair.
///
/// * `idx` - byte offset into the text the scanner was created with.
///
/// Lines are counted from 1. The column is the number of bytes between the
/// start of the line and `idx`, so it starts at 0. `\n`, `\u{2028}`,
/// `\u{2029}` and `\r` each end a line, except that a `\r` immediately
/// followed by `\n` is counted once, as a single line break.
///
/// An `idx` at or past the end of the text yields the position just after
/// the last character.
pub fn position_for(&self, idx: usize) -> (usize, usize) {
    let mut line_ct = 1;
    let mut column = 0;
    // `char_indices` yields byte offsets, the same unit as `idx`; counting
    // characters instead would drift on non-ASCII input.
    let mut chars = self.original.char_indices().peekable();
    while let Some((offset, c)) = chars.next() {
        if offset >= idx {
            break;
        }
        match c {
            '\r' => {
                // In a `\r\n` pair the `\n` that follows does the counting.
                if !matches!(chars.peek(), Some((_, '\n'))) {
                    line_ct += 1;
                    column = 0;
                }
            }
            '\n' | '\u{2028}' | '\u{2029}' => {
                line_ct += 1;
                column = 0;
            }
            _ => column += c.len_utf8(),
        }
    }
    (line_ct, column)
}
/// Returns the underlying `ManualScanner`'s `pending_new_line` flag.
///
/// NOTE(review): presumably this says whether a line break was seen before
/// the next token; confirm against `ManualScanner`.
pub fn has_pending_new_line(&self) -> bool {
self.manual_scanner.pending_new_line
}
/// Turn a `RawError` (byte index plus message) into a full `Error` by
/// resolving the index to a line and column with `position_for`.
///
/// Always returns `Err`; the generic `T` only lets callers use the result
/// directly as their own return value.
fn error<T>(&self, raw_error: RawError) -> Res<T> {
    let RawError { idx, msg } = raw_error;
    let (line, column) = self.position_for(idx);
    Err(Error {
        line,
        column,
        msg,
        idx,
    })
}
}
/// `true` when `c` is one of the four line terminator characters: line
/// feed, carriage return, U+2028 or U+2029.
#[inline]
fn is_line_term(c: char) -> bool {
    matches!(c, '\n' | '\r' | '\u{2028}' | '\u{2029}')
}
/// What an open curly brace belongs to.
///
/// Re-exported through the prelude; nothing else in this file uses it.
/// NOTE(review): presumably `Template` marks the `${` of a template
/// literal and `Block` any other `{`; confirm against the users of the
/// type.
#[derive(Clone, Copy, PartialEq, Debug)]
pub enum OpenCurlyKind {
Template,
Block,
}
/// A snapshot of a `Scanner`, produced by `Scanner::get_state` and consumed
/// by `Scanner::set_state`.
///
/// The scanner's brace stack and `errored` flag are not part of the
/// snapshot.
#[derive(Clone)]
pub struct ScannerState {
// State of the underlying `ManualScanner`.
pub manual_state: ManualState,
// The look-behind buffer of recent tokens.
pub last_three: LookBehind,
// The stack of currently open parens.
pub paren_stack: Vec<Paren>,
}
#[cfg(test)]
mod test {
use super::{tokens::*, *};
// Scans a small script that starts with a hashbang and compares every
// token with a hand written expectation.
// NOTE: `zip` stops at the shorter side, so a scanner that ends early is
// not detected here.
#[test]
fn tokenizer() {
let js = "#!/usr/bin/env node
'use strict';
function thing() {
let x = 0;
console.log('stuff');
}";
let expectation = vec![
Token::Comment(Comment {
kind: tokens::CommentKind::Hashbang,
content: "/usr/bin/env node",
tail_content: None,
}),
Token::String(StringLit::single("use strict", false)),
Token::Punct(Punct::SemiColon),
Token::Keyword(Keyword::Function("function".into())),
Token::Ident("thing".into()),
Token::Punct(Punct::OpenParen),
Token::Punct(Punct::CloseParen),
Token::Punct(Punct::OpenBrace),
Token::Keyword(Keyword::Let("let".into())),
Token::Ident("x".into()),
Token::Punct(Punct::Equal),
Token::Number("0".into()),
Token::Punct(Punct::SemiColon),
Token::Ident("console".into()),
Token::Punct(Punct::Period),
Token::Ident("log".into()),
Token::Punct(Punct::OpenParen),
Token::String(StringLit::single("stuff", false)),
Token::Punct(Punct::CloseParen),
Token::Punct(Punct::SemiColon),
Token::Punct(Punct::CloseBrace),
Token::EoF,
];
for (lhs, rhs) in Scanner::new(js).zip(expectation.into_iter()) {
let lhs = lhs.unwrap();
assert_eq!(lhs.token, rhs);
}
}
// Scans an immediately invoked function expression and checks the token
// stream through the shared `validate` helper.
#[test]
fn tok_scanner() {
let s = super::Scanner::new(
"(function() {
this.x = 100;
this.y = 0;
})();",
);
let expected = vec![
Token::Punct(Punct::OpenParen),
Token::Keyword(Keyword::Function("function")),
Token::Punct(Punct::OpenParen),
Token::Punct(Punct::CloseParen),
Token::Punct(Punct::OpenBrace),
Token::Keyword(Keyword::This("this")),
Token::Punct(Punct::Period),
Token::Ident("x".into()),
Token::Punct(Punct::Equal),
Token::Number("100".into()),
Token::Punct(Punct::SemiColon),
Token::Keyword(Keyword::This("this")),
Token::Punct(Punct::Period),
Token::Ident("y".into()),
Token::Punct(Punct::Equal),
Token::Number("0".into()),
Token::Punct(Punct::SemiColon),
Token::Punct(Punct::CloseBrace),
Token::Punct(Punct::CloseParen),
Token::Punct(Punct::OpenParen),
Token::Punct(Punct::CloseParen),
Token::Punct(Punct::SemiColon),
Token::EoF,
];
validate(s, expected);
}
// Smoke test: runs the scanner over the whole of jQuery. The file is
// embedded at compile time, so `node_modules` must be installed to build
// the tests.
// NOTE: the collected results are discarded, so scan errors do not fail
// this test; only a panic would.
#[test]
fn tok_scanner_jq() {
let js = include_str!("../node_modules/jquery/dist/jquery.js");
let t = Scanner::new(js);
let _: Vec<_> = t.collect();
}
// Every item returned by `look_ahead` must equal the item the following
// `next` call returns.
#[test]
fn look_ahead() {
let js = "function() { return; }";
let mut s = Scanner::new(js);
while let Some(peek) = s.look_ahead() {
let peek = peek.unwrap();
if let Some(next) = s.next() {
let next = next.unwrap();
assert_eq!(peek, next);
}
}
}
// Test helper: walks the scanner and the expected tokens in lock step and
// asserts they match, reporting the index of the first mismatch.
// NOTE: `zip` stops at the shorter side, so missing or extra trailing
// tokens go unnoticed.
fn validate(s: Scanner, expected: Vec<Token<&str>>) {
for (i, (lhs, rhs)) in s.zip(expected.into_iter()).enumerate() {
let lhs = lhs.unwrap();
println!("{:?}, {:?}", lhs.token, rhs);
assert_eq!((i, lhs.token), (i, rhs));
}
}
// The source is written with exactly one space between tokens, so
// splitting on ' ' gives the text `string_for` should return for each
// token's span.
#[test]
fn get_str() {
let js = "function ( ) { return ; }";
let mut s = Scanner::new(js);
let strs = js.split(' ');
for (i, p) in strs.enumerate() {
let item = s.next().unwrap().unwrap();
let q = s.string_for(&item.span).unwrap();
assert_eq!((i, p.to_string()), (i, q))
}
}
// For every token in esprima's source, slicing the source with the token's
// span must give the same text as the token's `to_string()`. The file is
// embedded at compile time, so `node_modules` must be installed to build
// the tests.
#[test]
fn spans() {
let js = include_str!("../node_modules/esprima/dist/esprima.js");
let s = Scanner::new(js);
for item in s {
let item = item.unwrap();
let from_stream = &js[item.span.start..item.span.end];
// Regexes are printed to help debugging when the assertion below fails.
if item.token.is_regex() {
println!("{:?} - {:?}", from_stream, item.token);
}
let token = item.token.to_string();
assert_eq!(
from_stream, token,
"token mismatch {:?} \n{}\n{}\n",
item, from_stream, token
);
}
}
// A `/` at the very start of the input begins a regular expression; the
// escaped slashes inside the pattern must not end it early.
#[test]
fn local_host_regex() {
let js = r#"/^(http|https):\/\/(localhost|127\.0\.0\.1)/"#;
let regex = RegEx::from_parts(r"^(http|https):\/\/(localhost|127\.0\.0\.1)", None);
let mut s = Scanner::new(js);
let r = s.next().unwrap().unwrap();
assert_eq!(r.token, Token::RegEx(regex));
}
// A `/` directly after an open paren is scanned as a regular expression,
// including its `g` flag.
#[test]
fn regex_replace() {
let expect = vec![
Token::Ident("ident".into()),
Token::Punct(Punct::Period),
Token::Ident("replace".into()),
Token::Punct(Punct::OpenParen),
Token::RegEx(RegEx::from_parts("%(\\d)", Some("g"))),
Token::Punct(Punct::Comma),
Token::String(StringLit::single("", false)),
Token::Punct(Punct::CloseParen),
];
let js = r#"ident.replace(/%(\d)/g, '')"#;
let s = Scanner::new(js);
for (i, (exp, item)) in expect.iter().zip(s).enumerate() {
assert_eq!((i, exp), (i, &item.unwrap().token));
}
}
// A string literal broken by a raw new line must be reported as an error
// on line 3, column 17.
// NOTE: the assertions only run if the scanner yields an `Err`; if it
// never does, this test passes without checking anything.
#[test]
fn error() {
let js = "
(function() {
let x = 'asdf
';
})()";
for item in Scanner::new(js) {
match item {
Ok(_) => (),
Err(e) => {
assert_eq!(e.line, 3);
assert_eq!(e.column, 17);
}
}
}
}
// Checks the start/end `SourceLocation` of every token in a snippet that
// contains multi line tokens: a string with a line continuation, a
// template literal and a block comment.
// NOTE: `zip` stops at the shorter side, so a scanner that ends early is
// not detected here.
#[test]
fn locations() {
let js = r"(function() {
let x = 'asdf\
';
let y = `asd
f`;
/*
* things
*/
})();";
let expectation = vec![
SourceLocation::new(Position::new(1, 1), Position::new(1, 2)),
SourceLocation::new(Position::new(1, 2), Position::new(1, 10)),
SourceLocation::new(Position::new(1, 10), Position::new(1, 11)),
SourceLocation::new(Position::new(1, 11), Position::new(1, 12)),
SourceLocation::new(Position::new(1, 13), Position::new(1, 14)),
SourceLocation::new(Position::new(2, 5), Position::new(2, 8)),
SourceLocation::new(Position::new(2, 9), Position::new(2, 10)),
SourceLocation::new(Position::new(2, 11), Position::new(2, 12)),
SourceLocation::new(Position::new(2, 13), Position::new(3, 1)),
SourceLocation::new(Position::new(3, 1), Position::new(3, 2)),
SourceLocation::new(Position::new(4, 5), Position::new(4, 8)),
SourceLocation::new(Position::new(4, 9), Position::new(4, 10)),
SourceLocation::new(Position::new(4, 11), Position::new(4, 12)),
SourceLocation::new(Position::new(4, 13), Position::new(5, 2)),
SourceLocation::new(Position::new(5, 2), Position::new(5, 3)),
SourceLocation::new(Position::new(6, 5), Position::new(8, 6)),
SourceLocation::new(Position::new(9, 1), Position::new(9, 2)),
SourceLocation::new(Position::new(9, 2), Position::new(9, 3)),
SourceLocation::new(Position::new(9, 3), Position::new(9, 4)),
SourceLocation::new(Position::new(9, 4), Position::new(9, 5)),
SourceLocation::new(Position::new(9, 5), Position::new(9, 6)),
];
for (i, (lhs, rhs)) in Scanner::new(js).zip(expectation.iter()).enumerate() {
let item = lhs.expect("error parsing item");
assert_eq!((i, item.location), (i, *rhs))
}
}
// `Position` is displayed as `line:column`.
#[test]
fn position_display() {
assert_eq!(format!("{}", Position::new(1, 25)), "1:25".to_string(),);
assert_eq!(format!("{}", Position::new(25, 0)), "25:0".to_string(),);
}
// `Position` orders by line first and by column within the same line.
#[test]
fn position_ord() {
assert!(
Position::new(1, 25) < Position::new(2, 25),
"line 1 not less than line 2"
);
assert!(
Position::new(2, 25) > Position::new(1, 25),
"line 2 not greater than line 1"
);
assert!(
Position::new(1, 1) < Position::new(1, 5),
"same line, col 1 not less than col 5"
);
assert!(
Position::new(1, 5) > Position::new(1, 1),
"same line, col 5 not greater than col 1"
);
}
// After `skip_comments`, the next item must not be a comment. The source
// mixes hashbang, line, block and HTML style comments; only the first four
// rounds are checked.
#[test]
fn skip_comments() {
let js = "#! /bin/node;
'use strict';
// comment 1
let x = 1;
/*
Lots of information
in a multi line comment
let q = 0;
*/
let y = 1;
// more than one
/* comment type */
<!-- could be skipped -->
ley z = 9;";
let mut s = Scanner::new(js);
for i in 0..4 {
s.skip_comments().unwrap();
assert!(
!s.next().unwrap().unwrap().token.is_comment(),
" failed to skip comment on iter {}",
i
);
}
}
// A `/` after `=` starts a regular expression; here it is never closed,
// so the fourth item must be an error.
#[test]
fn invalid_regex_flags_for_error() {
let mut s = Scanner::new("let x = /asdf");
let _let = s.next();
let _x = s.next();
let _eq = s.next();
let re = s.next().unwrap();
assert!(re.is_err(), "regex was not an error");
}
// A template literal with two substitutions scans as head, expression,
// body, expression, tail; the third item must be the template body.
#[test]
fn template_with_middle() {
let mut s = Scanner::new("`asdf${0}qwerty${1}poiuy`");
let _head = s.next().unwrap().unwrap();
let _zero = s.next().unwrap().unwrap();
let middle = s.next().unwrap().unwrap();
assert!(middle.token.is_template_body(), "middle was not a template");
let _one = s.next().unwrap().unwrap();
let _tail = s.next().unwrap().unwrap();
}
// A lone `)` is an error; unwrapping it panics with the scanner's
// message, which `should_panic` matches as a substring.
#[test]
#[should_panic = "Unmatched open close paren"]
fn unmatched_close_paren_error() {
Scanner::new(")").next().unwrap().unwrap();
}
// A lone `}` is an error; unwrapping it panics with the scanner's
// message, which `should_panic` matches as a substring.
#[test]
#[should_panic = "unmatched close brace"]
fn unmatched_close_brace_error() {
Scanner::new("}").next().unwrap().unwrap();
}
// `this` is the one keyword after which `/` is a division operator and not
// the start of a regular expression.
#[test]
fn this_over_number() {
let mut s = Scanner::new("this / 100");
let _this = s.next().unwrap().unwrap();
let div = s.next().unwrap().unwrap();
assert!(
div.token.matches_punct(Punct::ForwardSlash),
"regex with leading this"
);
let _one_hundred = s.next().unwrap().unwrap();
}
// After any keyword other than `this`, a `/` starts a regular expression.
#[test]
fn keyword_regex() {
let mut s = Scanner::new("break /a/");
let _break = s.next().unwrap().unwrap();
let re = s.next().unwrap().unwrap();
assert!(re.token.is_regex(), "regex was not a regex: {:?}", re);
}
}