use crate::token::{Position, Span, Token, TokenKind};
/// Character-level lexer that turns an input string into a flat list of
/// [`Token`]s, tracking line/column positions as it goes.
///
/// Build one with [`Lexer::new`] and consume it with [`Lexer::tokenize`].
#[derive(Debug, Clone)]
pub struct Lexer {
    /// The input decoded into `(byte offset, char)` pairs, in input order.
    chars: Vec<(usize, char)>,
    /// Index into `chars` of the next character that has not been consumed.
    pos: usize,
    /// 1-based line number of the next unconsumed character.
    line: usize,
    /// 1-based column of the next unconsumed character, counted in `char`s.
    column: usize,
    /// `true` after a `{` has been lexed and until the next `}` is lexed.
    /// While set, `:` is emitted as its own token instead of being text.
    in_directive: bool,
}
impl Lexer {
#[must_use]
pub fn new(input: &str) -> Self {
Self {
chars: input.char_indices().collect(),
pos: 0,
line: 1,
column: 1,
in_directive: false,
}
}
#[must_use]
pub fn tokenize(mut self) -> Vec<Token> {
let mut tokens = Vec::new();
while !self.is_at_end() {
if let Some(token) = self.next_token() {
tokens.push(token);
}
}
let eof_pos = Position::new(self.line, self.column);
tokens.push(Token::new(TokenKind::Eof, Span::new(eof_pos, eof_pos)));
tokens
}
fn is_at_end(&self) -> bool {
self.pos >= self.chars.len()
}
fn peek(&self) -> Option<char> {
self.chars.get(self.pos).map(|&(_, ch)| ch)
}
fn advance(&mut self) -> Option<char> {
if let Some(&(_, ch)) = self.chars.get(self.pos) {
self.pos += 1;
if ch == '\n' {
self.line += 1;
self.column = 1;
} else {
self.column += 1;
}
Some(ch)
} else {
None
}
}
fn next_token(&mut self) -> Option<Token> {
let ch = self.peek()?;
match ch {
'\n' => self.lex_newline(),
'\r' => self.lex_carriage_return(),
'{' => self.lex_single(TokenKind::DirectiveOpen, true),
'}' => self.lex_single(TokenKind::DirectiveClose, false),
'[' => self.lex_single(TokenKind::ChordOpen, false),
']' => self.lex_single(TokenKind::ChordClose, false),
':' if self.in_directive => self.lex_single(TokenKind::Colon, false),
_ => self.lex_text(),
}
}
fn lex_single(&mut self, kind: TokenKind, enter_directive: bool) -> Option<Token> {
let start = Position::new(self.line, self.column);
self.advance();
let end = Position::new(self.line, self.column);
if enter_directive {
self.in_directive = true;
}
if kind == TokenKind::DirectiveClose {
self.in_directive = false;
}
Some(Token::new(kind, Span::new(start, end)))
}
fn lex_newline(&mut self) -> Option<Token> {
let start = Position::new(self.line, self.column);
self.advance(); let end = Position::new(self.line, self.column);
Some(Token::new(TokenKind::Newline, Span::new(start, end)))
}
fn lex_carriage_return(&mut self) -> Option<Token> {
let start = Position::new(self.line, self.column);
let is_crlf = self.chars.get(self.pos + 1).map(|&(_, c)| c) == Some('\n');
if is_crlf {
self.advance(); self.advance(); let end = Position::new(self.line, self.column);
Some(Token::new(TokenKind::Newline, Span::new(start, end)))
} else {
self.lex_text()
}
}
fn lex_text(&mut self) -> Option<Token> {
let start = Position::new(self.line, self.column);
let mut buf = String::new();
while let Some(ch) = self.peek() {
match ch {
'\n' => break,
'\r' => {
let is_crlf = self.chars.get(self.pos + 1).map(|&(_, c)| c) == Some('\n');
if is_crlf {
break;
}
self.advance();
buf.push(ch);
}
'\\' => {
if let Some(&(_, next_ch)) = self.chars.get(self.pos + 1) {
if is_special(next_ch, self.in_directive) {
self.advance(); self.advance(); buf.push(next_ch);
} else {
self.advance();
buf.push(ch);
}
} else {
self.advance();
buf.push(ch);
}
}
'{' | '}' | '[' | ']' => break,
':' if self.in_directive => break,
_ => {
self.advance();
buf.push(ch);
}
}
}
if buf.is_empty() {
return None;
}
let end = Position::new(self.line, self.column);
Some(Token::new(TokenKind::Text(buf), Span::new(start, end)))
}
}
/// Reports whether `ch` is a structural character: one of `{`, `}`, `[`, `]`
/// always, and `:` only when `in_directive` is `true`.
fn is_special(ch: char, in_directive: bool) -> bool {
    match ch {
        '{' | '}' | '[' | ']' => true,
        ':' => in_directive,
        _ => false,
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::token::TokenKind::*;

    /// Lexes `input` and returns only the token kinds, dropping the spans.
    fn kinds(input: &str) -> Vec<TokenKind> {
        Lexer::new(input)
            .tokenize()
            .into_iter()
            .map(|t| t.kind)
            .collect()
    }

    // --- Basic tokens ---

    /// Empty input still yields the terminating `Eof` token.
    #[test]
    fn empty_input() {
        let tokens = Lexer::new("").tokenize();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, Eof);
    }

    #[test]
    fn single_newline() {
        assert_eq!(kinds("\n"), vec![Newline, Eof]);
    }

    /// A `\r\n` pair is a single `Newline` token.
    #[test]
    fn crlf_newline() {
        assert_eq!(kinds("\r\n"), vec![Newline, Eof]);
    }

    /// A `\r` that is not followed by `\n` stays part of the text run.
    #[test]
    fn lone_carriage_return_is_text() {
        assert_eq!(kinds("a\rb"), vec![Text("a\rb".into()), Eof]);
    }

    #[test]
    fn plain_text() {
        assert_eq!(kinds("Hello world"), vec![Text("Hello world".into()), Eof]);
    }

    // --- Directives ---

    #[test]
    fn simple_directive() {
        assert_eq!(
            kinds("{title: My Song}"),
            vec![
                DirectiveOpen,
                Text("title".into()),
                Colon,
                Text(" My Song".into()),
                DirectiveClose,
                Eof,
            ],
        );
    }

    #[test]
    fn directive_without_value() {
        assert_eq!(
            kinds("{soc}"),
            vec![DirectiveOpen, Text("soc".into()), DirectiveClose, Eof],
        );
    }

    /// Whitespace around the colon is preserved inside the text tokens.
    #[test]
    fn directive_with_spaces_around_colon() {
        assert_eq!(
            kinds("{key : Am}"),
            vec![
                DirectiveOpen,
                Text("key ".into()),
                Colon,
                Text(" Am".into()),
                DirectiveClose,
                Eof,
            ],
        );
    }

    /// Outside `{...}` a colon is ordinary text.
    #[test]
    fn colon_outside_directive_is_text() {
        assert_eq!(kinds("Note: hello"), vec![Text("Note: hello".into()), Eof]);
    }

    // --- Chords ---

    #[test]
    fn chord_in_lyrics() {
        assert_eq!(
            kinds("[Am]Hello [G]world"),
            vec![
                ChordOpen,
                Text("Am".into()),
                ChordClose,
                Text("Hello ".into()),
                ChordOpen,
                Text("G".into()),
                ChordClose,
                Text("world".into()),
                Eof,
            ],
        );
    }

    #[test]
    fn chord_only_line() {
        assert_eq!(
            kinds("[Am] [G] [C]"),
            vec![
                ChordOpen,
                Text("Am".into()),
                ChordClose,
                Text(" ".into()),
                ChordOpen,
                Text("G".into()),
                ChordClose,
                Text(" ".into()),
                ChordOpen,
                Text("C".into()),
                ChordClose,
                Eof,
            ],
        );
    }

    // --- Multiple lines ---

    #[test]
    fn multiple_lines() {
        assert_eq!(
            kinds("{title: Test}\n[Am]Hello\nWorld"),
            vec![
                DirectiveOpen,
                Text("title".into()),
                Colon,
                Text(" Test".into()),
                DirectiveClose,
                Newline,
                ChordOpen,
                Text("Am".into()),
                ChordClose,
                Text("Hello".into()),
                Newline,
                Text("World".into()),
                Eof,
            ],
        );
    }

    /// Each line break, including the one ending an empty line, is a token.
    #[test]
    fn empty_lines() {
        assert_eq!(
            kinds("Hello\n\nWorld"),
            vec![
                Text("Hello".into()),
                Newline,
                Newline,
                Text("World".into()),
                Eof,
            ],
        );
    }

    #[test]
    fn text_only_lines() {
        assert_eq!(
            kinds("just text here"),
            vec![Text("just text here".into()), Eof],
        );
    }

    // --- Escapes ---

    /// `\{` yields a literal `{` and the backslash is dropped.
    #[test]
    fn escaped_brace_in_text() {
        assert_eq!(
            kinds("hello \\{ world"),
            vec![Text("hello { world".into()), Eof],
        );
    }

    #[test]
    fn escaped_bracket() {
        assert_eq!(
            kinds("\\[not a chord\\]"),
            vec![Text("[not a chord]".into()), Eof],
        );
    }

    #[test]
    fn escaped_colon_inside_directive() {
        assert_eq!(
            kinds("{comment: 10\\:30 AM}"),
            vec![
                DirectiveOpen,
                Text("comment".into()),
                Colon,
                Text(" 10:30 AM".into()),
                DirectiveClose,
                Eof,
            ],
        );
    }

    /// Outside a directive `:` is not special, so `\:` keeps its backslash.
    #[test]
    fn escaped_colon_outside_directive_keeps_backslash() {
        assert_eq!(kinds("a\\:b"), vec![Text("a\\:b".into()), Eof]);
    }

    /// A backslash before a non-special character is kept verbatim.
    #[test]
    fn backslash_before_normal_char() {
        assert_eq!(kinds("back\\slash"), vec![Text("back\\slash".into()), Eof]);
    }

    #[test]
    fn backslash_at_end_of_input() {
        assert_eq!(kinds("end\\"), vec![Text("end\\".into()), Eof]);
    }

    // --- Spans ---

    /// Spans are half-open: `end` is the position just past the token.
    #[test]
    fn spans_for_directive() {
        let tokens = Lexer::new("{t: X}").tokenize();
        // `{`
        assert_eq!(tokens[0].span.start, Position::new(1, 1));
        assert_eq!(tokens[0].span.end, Position::new(1, 2));
        // `t`
        assert_eq!(tokens[1].span.start, Position::new(1, 2));
        assert_eq!(tokens[1].span.end, Position::new(1, 3));
        // `:`
        assert_eq!(tokens[2].span.start, Position::new(1, 3));
        assert_eq!(tokens[2].span.end, Position::new(1, 4));
        // ` X`
        assert_eq!(tokens[3].span.start, Position::new(1, 4));
        assert_eq!(tokens[3].span.end, Position::new(1, 6));
        // `}`
        assert_eq!(tokens[4].span.start, Position::new(1, 6));
        assert_eq!(tokens[4].span.end, Position::new(1, 7));
        // Eof
        assert_eq!(tokens[5].span.start, Position::new(1, 7));
    }

    /// A newline token ends at column 1 of the following line.
    #[test]
    fn spans_across_lines() {
        let tokens = Lexer::new("AB\nCD").tokenize();
        assert_eq!(tokens[0].span.start, Position::new(1, 1));
        assert_eq!(tokens[0].span.end, Position::new(1, 3));
        assert_eq!(tokens[1].span.start, Position::new(1, 3));
        assert_eq!(tokens[1].span.end, Position::new(2, 1));
        assert_eq!(tokens[2].span.start, Position::new(2, 1));
        assert_eq!(tokens[2].span.end, Position::new(2, 3));
    }

    /// `\r\n` produces the same spans as a bare `\n`.
    #[test]
    fn spans_across_crlf_lines() {
        let tokens = Lexer::new("AB\r\nCD").tokenize();
        assert_eq!(tokens[0].span.start, Position::new(1, 1));
        assert_eq!(tokens[0].span.end, Position::new(1, 3));
        assert_eq!(tokens[1].span.start, Position::new(1, 3));
        assert_eq!(tokens[1].span.end, Position::new(2, 1));
        assert_eq!(tokens[2].span.start, Position::new(2, 1));
        assert_eq!(tokens[2].span.end, Position::new(2, 3));
    }

    // --- Edge cases ---

    #[test]
    fn nested_braces_produce_separate_tokens() {
        let toks = kinds("{{inner}}");
        assert_eq!(
            toks,
            vec![
                DirectiveOpen,
                DirectiveOpen,
                Text("inner".into()),
                DirectiveClose,
                DirectiveClose,
                Eof,
            ],
        );
    }

    /// Brackets are tokenized as chord delimiters even inside a directive.
    #[test]
    fn bracket_inside_directive() {
        assert_eq!(
            kinds("{comment: [Am] chord}"),
            vec![
                DirectiveOpen,
                Text("comment".into()),
                Colon,
                Text(" ".into()),
                ChordOpen,
                Text("Am".into()),
                ChordClose,
                Text(" chord".into()),
                DirectiveClose,
                Eof,
            ],
        );
    }

    #[test]
    fn empty_directive() {
        assert_eq!(kinds("{}"), vec![DirectiveOpen, DirectiveClose, Eof]);
    }

    #[test]
    fn empty_chord() {
        assert_eq!(kinds("[]"), vec![ChordOpen, ChordClose, Eof]);
    }

    /// A newline inside braces is emitted as a plain `Newline` token.
    #[test]
    fn directive_across_lines_is_not_special() {
        let toks = kinds("{title\nvalue}");
        assert_eq!(
            toks,
            vec![
                DirectiveOpen,
                Text("title".into()),
                Newline,
                Text("value".into()),
                DirectiveClose,
                Eof,
            ],
        );
    }

    /// Every unescaped colon inside a directive is its own `Colon` token.
    #[test]
    fn multiple_colons_in_directive() {
        assert_eq!(
            kinds("{meta: key:value}"),
            vec![
                DirectiveOpen,
                Text("meta".into()),
                Colon,
                Text(" key".into()),
                Colon,
                Text("value".into()),
                DirectiveClose,
                Eof,
            ],
        );
    }

    #[test]
    fn unicode_text() {
        assert_eq!(
            kinds("[Am]こんにちは"),
            vec![
                ChordOpen,
                Text("Am".into()),
                ChordClose,
                Text("こんにちは".into()),
                Eof,
            ],
        );
    }

    /// End-to-end check on a small song: two directives, a blank line, and
    /// two chorded lyric lines.
    #[test]
    fn full_song_snippet() {
        // Line breaks are written as explicit `\n` escapes so the blank line
        // between the directives and the lyrics (the source of the two
        // consecutive `Newline` tokens below) does not depend on how the
        // source file itself is laid out.
        let input = "{title: Amazing Grace}\n\
                     {artist: John Newton}\n\
                     \n\
                     [G]Amazing [G7]grace, how [C]sweet the [G]sound\n\
                     [G]That saved a [Em]wretch like [D]me";
        let toks = kinds(input);
        assert_eq!(
            toks,
            vec![
                // {title: Amazing Grace}
                DirectiveOpen,
                Text("title".into()),
                Colon,
                Text(" Amazing Grace".into()),
                DirectiveClose,
                Newline,
                // {artist: John Newton}
                DirectiveOpen,
                Text("artist".into()),
                Colon,
                Text(" John Newton".into()),
                DirectiveClose,
                Newline,
                // blank line
                Newline,
                // first lyric line
                ChordOpen,
                Text("G".into()),
                ChordClose,
                Text("Amazing ".into()),
                ChordOpen,
                Text("G7".into()),
                ChordClose,
                Text("grace, how ".into()),
                ChordOpen,
                Text("C".into()),
                ChordClose,
                Text("sweet the ".into()),
                ChordOpen,
                Text("G".into()),
                ChordClose,
                Text("sound".into()),
                Newline,
                // second lyric line
                ChordOpen,
                Text("G".into()),
                ChordClose,
                Text("That saved a ".into()),
                ChordOpen,
                Text("Em".into()),
                ChordClose,
                Text("wretch like ".into()),
                ChordOpen,
                Text("D".into()),
                ChordClose,
                Text("me".into()),
                Eof,
            ],
        );
    }
}