use crate::identifier::{Identifier, ReservedWord};
use super::{
segment::{Segment, Segmenter, Syntax},
token::{Punct, Token},
};
use std::collections::VecDeque;
use thiserror::Error as ThisError;
/// An error encountered while converting a segment of input text into a token.
///
/// Each variant's `#[error]` string is the user-facing message.
#[derive(ThisError, Clone, Debug, PartialEq, Eq)]
pub enum ScanError {
/// A string constant was not terminated.
#[error("Unterminated string constant.")]
ExpectedQuote,
/// A number is missing its exponent; the payload is the offending segment
/// text.
#[error("Missing exponent following `{0}`")]
ExpectedExponent(String),
/// A hex string has an odd number of hex digits; the payload is that number.
#[error("String of hex digits has {0} characters, which is not a multiple of 2.")]
OddLengthHexString(usize),
/// A hex string contains the given character, which is not an ASCII hex
/// digit.
#[error("Invalid hex digit {0:?}.")]
BadHexDigit(char),
/// The bytes decoded from a hex string end in the middle of a UTF-8
/// sequence.
#[error("Incomplete UTF-8 sequence `{substring}` starting {offset} digits into hex string.")]
IncompleteUtf8 {
// The hex digits from `offset` through the end of the hex string.
substring: String,
// Position of the truncated sequence, counted in hex digits.
offset: usize,
},
/// The bytes decoded from a hex string contain an invalid UTF-8 sequence.
#[error("Invalid UTF-8 sequence `{substring}` starting {offset} digits into hex string.")]
BadUtf8 {
// The hex digits that encode the invalid sequence.
substring: String,
// Position of the invalid sequence, counted in hex digits.
offset: usize,
},
/// The body of a Unicode string is not 1 to 8 bytes long; the payload is its
/// actual length in bytes.
#[error("Unicode string contains {0} bytes, which is not in the valid range of 1 to 8 bytes.")]
BadLengthUnicodeString(usize),
/// The given value parsed as a hexadecimal number but is not a valid
/// Unicode scalar value.
#[error("U+{0:04X} is not a valid Unicode code point.")]
BadCodePoint(u32),
/// The body of a Unicode string could not be parsed as a hexadecimal number.
#[error("Expected hexadecimal Unicode code point.")]
ExpectedCodePoint,
/// The segmenter reported that `DO REPEAT` was nested too deeply.
#[error("`DO REPEAT` nested too deeply.")]
DoRepeatOverflow,
/// The input contains the given character where no token can start.
#[error("Unexpected character {0:?} in input.")]
UnexpectedChar(char),
}
/// What to do with the tokens at the front of a token queue, as decided by
/// [`merge_tokens`].
#[derive(Clone, Debug)]
pub enum MergeAction {
/// Pass the first token through unchanged.
Copy,
/// Replace the first `n` tokens by the single replacement `token`.
Expand {
// Number of leading tokens that the replacement covers.
n: usize,
// The merged token that takes their place.
token: Token,
},
}
/// Marker error meaning that a decision cannot be made yet because more input
/// (further lookahead tokens) is needed.
#[derive(Copy, Clone, Debug)]
pub struct Incomplete;
impl Segment {
pub fn to_token(self, s: &str) -> Option<Result<Token, ScanError>> {
match self {
Segment::Number => Some(Ok(Token::Number(s.parse().unwrap()))),
Segment::QuotedString => {
let mut chars = s.chars();
let quote = chars.next().unwrap();
let s = chars.as_str().strip_suffix(quote).unwrap();
let (single_quote, double_quote) = match quote {
'\'' => ("'", "''"),
'"' => ("\"", "\"\""),
_ => unreachable!(),
};
Some(Ok(Token::String(s.replace(double_quote, single_quote))))
}
Segment::HexString => {
let s = &s[2..s.len() - 1];
for c in s.chars() {
if !c.is_ascii_hexdigit() {
return Some(Err(ScanError::BadHexDigit(c)));
}
}
if s.len() % 2 != 0 {
return Some(Err(ScanError::OddLengthHexString(s.len())));
}
let bytes = s
.as_bytes()
.chunks_exact(2)
.map(|pair| {
let hi = char::from(pair[0]).to_digit(16).unwrap() as u8;
let lo = char::from(pair[1]).to_digit(16).unwrap() as u8;
hi * 16 + lo
})
.collect::<Vec<_>>();
match String::from_utf8(bytes) {
Ok(string) => Some(Ok(Token::String(string))),
Err(error) => {
let details = error.utf8_error();
let offset = details.valid_up_to() * 2;
let end = details
.error_len()
.map(|len| offset + len * 2)
.unwrap_or(s.len());
let substring = String::from(&s[offset..end]);
Some(Err(if details.error_len().is_some() {
ScanError::BadUtf8 { substring, offset }
} else {
ScanError::IncompleteUtf8 { substring, offset }
}))
}
}
}
Segment::UnicodeString => {
let s = &s[2..s.len() - 1];
if !(1..=8).contains(&s.len()) {
return Some(Err(ScanError::BadLengthUnicodeString(s.len())));
}
let Ok(code_point) = u32::from_str_radix(s, 16) else {
return Some(Err(ScanError::ExpectedCodePoint));
};
let Some(c) = char::from_u32(code_point) else {
return Some(Err(ScanError::BadCodePoint(code_point)));
};
Some(Ok(Token::String(String::from(c))))
}
Segment::UnquotedString
| Segment::DoRepeatCommand
| Segment::InlineData
| Segment::Document
| Segment::MacroBody
| Segment::MacroName => Some(Ok(Token::String(String::from(s)))),
Segment::Identifier => {
if let Ok(reserved_word) = ReservedWord::try_from(s) {
match reserved_word {
ReservedWord::And => Some(Ok(Token::Punct(Punct::And))),
ReservedWord::Or => Some(Ok(Token::Punct(Punct::Or))),
ReservedWord::Not => Some(Ok(Token::Punct(Punct::Not))),
ReservedWord::Eq => Some(Ok(Token::Punct(Punct::Eq))),
ReservedWord::Ge => Some(Ok(Token::Punct(Punct::Ge))),
ReservedWord::Gt => Some(Ok(Token::Punct(Punct::Gt))),
ReservedWord::Le => Some(Ok(Token::Punct(Punct::Le))),
ReservedWord::Lt => Some(Ok(Token::Punct(Punct::Lt))),
ReservedWord::Ne => Some(Ok(Token::Punct(Punct::Ne))),
ReservedWord::All => Some(Ok(Token::Punct(Punct::All))),
ReservedWord::By => Some(Ok(Token::Punct(Punct::By))),
ReservedWord::To => Some(Ok(Token::Punct(Punct::To))),
ReservedWord::With => Some(Ok(Token::Punct(Punct::With))),
}
} else {
Some(Ok(Token::Id(Identifier::new(s).unwrap())))
}
}
Segment::Punct => match s {
"(" => Some(Ok(Token::Punct(Punct::LParen))),
")" => Some(Ok(Token::Punct(Punct::RParen))),
"[" => Some(Ok(Token::Punct(Punct::LSquare))),
"]" => Some(Ok(Token::Punct(Punct::RSquare))),
"{" => Some(Ok(Token::Punct(Punct::LCurly))),
"}" => Some(Ok(Token::Punct(Punct::RCurly))),
"," => Some(Ok(Token::Punct(Punct::Comma))),
"=" => Some(Ok(Token::Punct(Punct::Equals))),
"-" => Some(Ok(Token::Punct(Punct::Dash))),
"&" => Some(Ok(Token::Punct(Punct::And))),
"|" => Some(Ok(Token::Punct(Punct::Or))),
"+" => Some(Ok(Token::Punct(Punct::Plus))),
"/" => Some(Ok(Token::Punct(Punct::Slash))),
"*" => Some(Ok(Token::Punct(Punct::Asterisk))),
"<" => Some(Ok(Token::Punct(Punct::Lt))),
">" => Some(Ok(Token::Punct(Punct::Gt))),
"~" => Some(Ok(Token::Punct(Punct::Not))),
":" => Some(Ok(Token::Punct(Punct::Colon))),
";" => Some(Ok(Token::Punct(Punct::Semicolon))),
"**" => Some(Ok(Token::Punct(Punct::Exp))),
"<=" => Some(Ok(Token::Punct(Punct::Le))),
"<>" => Some(Ok(Token::Punct(Punct::Ne))),
"~=" => Some(Ok(Token::Punct(Punct::Ne))),
">=" => Some(Ok(Token::Punct(Punct::Ge))),
"!" => Some(Ok(Token::Punct(Punct::Bang))),
"%" => Some(Ok(Token::Punct(Punct::Percent))),
"?" => Some(Ok(Token::Punct(Punct::Question))),
"`" => Some(Ok(Token::Punct(Punct::Backtick))),
"_" => Some(Ok(Token::Punct(Punct::Underscore))),
"." => Some(Ok(Token::Punct(Punct::Dot))),
"!*" => Some(Ok(Token::Punct(Punct::BangAsterisk))),
_ => unreachable!("bad punctuator {s:?}"),
},
Segment::Shbang
| Segment::Spaces
| Segment::Comment
| Segment::Newline
| Segment::CommentCommand => None,
Segment::DoRepeatOverflow => Some(Err(ScanError::DoRepeatOverflow)),
Segment::StartDocument => Some(Ok(Token::Id(Identifier::new("DOCUMENT").unwrap()))),
Segment::StartCommand | Segment::SeparateCommands | Segment::EndCommand => {
Some(Ok(Token::End))
}
Segment::ExpectedQuote => Some(Err(ScanError::ExpectedQuote)),
Segment::ExpectedExponent => Some(Err(ScanError::ExpectedExponent(String::from(s)))),
Segment::UnexpectedChar => {
Some(Err(ScanError::UnexpectedChar(s.chars().next().unwrap())))
}
}
}
}
/// Decides how the tokens at the front of a token stream should be merged.
///
/// `input(i)` must yield the `i`th upcoming token: `Ok(Some(token))` when it
/// is available, `Ok(None)` when the stream ends before it, and
/// `Err(Incomplete)` when it is not known yet.
///
/// Two merges are recognized:
///
/// - A dash followed by a number whose sign is positive becomes one negative
///   number (2 tokens consumed).
/// - A string followed by one or more `+ string` pairs becomes the
///   concatenation of all those strings.
///
/// Anything else is passed through with [`MergeAction::Copy`].  Returns
/// `Ok(None)` when there is no token at index 0.
///
/// # Errors
///
/// Returns `Err(Incomplete)` as soon as `input` does for a token that the
/// decision depends on.
pub fn merge_tokens<'a, F>(input: F) -> Result<Option<MergeAction>, Incomplete>
where
    F: Fn(usize) -> Result<Option<&'a Token>, Incomplete>,
{
    let first = match input(0)? {
        Some(token) => token,
        None => return Ok(None),
    };

    let action = match first {
        Token::Punct(Punct::Dash) => match input(1)? {
            Some(&Token::Number(value)) if value.is_sign_positive() => MergeAction::Expand {
                n: 2,
                token: Token::Number(-value),
            },
            _ => MergeAction::Copy,
        },
        Token::String(_) => {
            // Count how many `+ string` pairs follow the first string.  The
            // string after a `+` is only inspected when the `+` is there.
            let mut pairs = 0;
            loop {
                if !matches!(input(pairs * 2 + 1)?, Some(Token::Punct(Punct::Plus))) {
                    break;
                }
                if !matches!(input(pairs * 2 + 2)?, Some(Token::String(_))) {
                    break;
                }
                pairs += 1;
            }

            if pairs == 0 {
                MergeAction::Copy
            } else {
                // The strings sit at the even indexes 0, 2, ..., 2 * pairs;
                // all of them were seen above, so the lookups cannot fail.
                let joined: String = (0..=pairs)
                    .map(|k| match input(k * 2).unwrap().unwrap() {
                        Token::String(piece) => piece.as_str(),
                        _ => unreachable!(),
                    })
                    .collect();
                MergeAction::Expand {
                    n: pairs * 2 + 1,
                    token: Token::String(joined),
                }
            }
        }
        _ => MergeAction::Copy,
    };
    Ok(Some(action))
}
/// An iterator that splits a string into segments and yields each
/// token-producing segment's text along with its unmerged token.
pub struct StringSegmenter<'a> {
// The part of the input that has not been consumed yet.
input: &'a str,
// Segmenter state carried from one segment to the next.
segmenter: Segmenter,
}
impl<'a> StringSegmenter<'a> {
    /// Creates a segmenter over `input`.
    ///
    /// `mode` and `is_snippet` are handed to [`Segmenter::new`] unchanged.
    pub fn new(input: &'a str, mode: Syntax, is_snippet: bool) -> Self {
        let segmenter = Segmenter::new(mode, is_snippet);
        Self { input, segmenter }
    }
}
impl<'a> Iterator for StringSegmenter<'a> {
    /// The text of a segment paired with the token (or error) it scans to.
    type Item = (&'a str, Result<Token, ScanError>);

    /// Yields the next segment that produces a token, silently consuming any
    /// segments before it that do not.  Returns `None` once the segmenter
    /// reports no further segments.
    ///
    /// # Panics
    ///
    /// Panics if the segmenter reports that it needs more input, even though
    /// it is always told that the input is complete.
    fn next(&mut self) -> Option<Self::Item> {
        while let Some((seg_len, seg_type)) = self.segmenter.push(self.input, true).unwrap() {
            let (text, remainder) = self.input.split_at(seg_len);
            self.input = remainder;
            match seg_type.to_token(text) {
                Some(token) => return Some((text, token)),
                None => continue,
            }
        }
        None
    }
}
/// An iterator that scans a string into tokens, merging adjacent tokens as
/// decided by [`merge_tokens`].
pub struct StringScanner<'a> {
// The part of the input that has not been consumed yet.
input: &'a str,
// Set once the segmenter has reported that no segments remain.
eof: bool,
// Segmenter state carried from one segment to the next.
segmenter: Segmenter,
// Tokens that were scanned but not yet yielded, awaiting a merge decision.
tokens: VecDeque<Token>,
}
impl<'a> StringScanner<'a> {
    /// Creates a scanner over `input`.
    ///
    /// `mode` and `is_snippet` are handed to [`Segmenter::new`] unchanged.
    pub fn new(input: &'a str, mode: Syntax, is_snippet: bool) -> Self {
        let segmenter = Segmenter::new(mode, is_snippet);
        let tokens = VecDeque::with_capacity(1);
        Self {
            input,
            eof: false,
            segmenter,
            tokens,
        }
    }

    /// Tries to take one (possibly merged) token off the front of the queue.
    ///
    /// With `eof` set, a lookahead past the end of the queue means "no more
    /// tokens"; otherwise it means the decision has to wait.
    ///
    /// Returns `Ok(None)` if the queue is empty and `eof` is set, and
    /// `Err(Incomplete)` if more tokens are needed to decide.
    fn merge(&mut self, eof: bool) -> Result<Option<Result<Token, ScanError>>, Incomplete> {
        let action = merge_tokens(|index| match self.tokens.get(index) {
            Some(token) => Ok(Some(token)),
            None if eof => Ok(None),
            None => Err(Incomplete),
        })?;

        let merged = match action {
            None => return Ok(None),
            Some(MergeAction::Copy) => self.tokens.pop_front().unwrap(),
            Some(MergeAction::Expand { n, token }) => {
                // Discard the tokens that the replacement stands for.
                self.tokens.drain(..n);
                token
            }
        };
        Ok(Some(Ok(merged)))
    }

    /// Turns the scanner into an iterator over bare tokens.
    ///
    /// # Panics
    ///
    /// The returned iterator panics when it reaches a token that failed to
    /// scan.
    pub fn unwrapped(self) -> impl Iterator<Item = Token> + use<'a> {
        self.map(|result| result.ok().unwrap())
    }
}
impl Iterator for StringScanner<'_> {
type Item = Result<Token, ScanError>;
fn next(&mut self) -> Option<Self::Item> {
loop {
if let Ok(Some(token)) = self.merge(self.eof) {
return Some(token);
}
let Some((seg_len, seg_type)) = self.segmenter.push(self.input, true).unwrap() else {
self.eof = true;
return self.merge(true).unwrap();
};
let (s, rest) = self.input.split_at(seg_len);
match seg_type.to_token(s) {
Some(Err(error)) => {
if let Ok(Some(token)) = self.merge(true) {
return Some(token);
}
self.input = rest;
return Some(Err(error));
}
Some(Ok(token)) => {
self.tokens.push_back(token);
}
None => (),
}
self.input = rest;
}
}
}
#[cfg(test)]
mod tests;