use std::{
borrow::{Borrow, Cow},
collections::VecDeque,
fmt::{Debug, Formatter, Result as FmtResult, Write},
fs,
io::Result as IoResult,
iter::once,
mem::take,
ops::{Range, RangeInclusive},
path::Path,
rc::Rc,
sync::Arc,
};
use chardetng::EncodingDetector;
use encoding_rs::{Encoding, UTF_8};
use unicode_width::{UnicodeWidthChar, UnicodeWidthStr};
use crate::{
lex::scan::merge_tokens,
macros::{MacroSet, ParseStatus, Parser, macro_tokens_to_syntax},
message::{Category, Diagnostic, Location, Point, Severity},
settings::Settings,
};
use super::{
scan::{MergeAction, ScanError, StringScanner},
segment::{Segmenter, Syntax},
token::Token,
};
/// How the lexer responds to an error in the syntax it reads.
#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
pub enum ErrorHandling {
    /// Discard the rest of the input line and continue, as for input read
    /// from a terminal.
    Terminal,
    /// Continue to the next command, except for cascading failures.
    #[default]
    Continue,
    /// Continue, even for cascading failures.
    Ignore,
    /// Stop processing.
    Stop,
}
/// A syntax file, with its contents decoded into a string.
pub struct SyntaxFile {
    /// The name of the file, for use in diagnostics, if any.
    file_name: Option<Arc<String>>,
    /// The encoding that the contents were decoded from.
    #[allow(dead_code)]
    encoding: &'static Encoding,
    /// The decoded contents of the file.
    contents: String,
    /// The byte offset in `contents` of the start of each line.
    lines: Vec<usize>,
}
impl SyntaxFile {
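    /// Reads the file at `path` and returns a [SyntaxFile] for it, decoded
    /// with `encoding`, or with an encoding guessed from the file's contents
    /// if `encoding` is `None`.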
pub fn for_file<P>(path: P, encoding: Option<&'static Encoding>) -> IoResult<Self>
where
P: AsRef<Path>,
{
let bytes = fs::read(path.as_ref())?;
let encoding = encoding.unwrap_or_else(|| {
let mut encoding_detector = EncodingDetector::new();
encoding_detector.feed(&bytes, true);
encoding_detector.guess(None, true)
});
let (contents, _malformed) = encoding.decode_with_bom_removal(&bytes);
Ok(Self::new(
contents.to_string(),
Some(path.as_ref().to_string_lossy().to_string()),
encoding,
))
}
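    /// Creates a [SyntaxFile] for `contents`, recording `file_name` for use
    /// in diagnostics and `encoding` as the encoding that the contents were
    /// decoded from, and indexing the start of each line in `contents`.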
pub fn new(contents: String, file_name: Option<String>, encoding: &'static Encoding) -> Self {
let lines = once(0)
.chain(contents.match_indices('\n').map(|(index, _s)| index + 1))
.filter(|index| *index < contents.len())
.collect::<Vec<_>>();
Self {
file_name: file_name.map(Arc::new),
encoding,
contents,
lines,
}
}
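    /// Creates a [SyntaxFile] for `contents`, without a file name.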
pub fn for_string(contents: String) -> Self {
Self::new(contents, None, UTF_8)
}
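    /// Returns the 1-based line and column [Point] for byte `offset` within
    /// the file's contents.  The column is measured in display width columns,
    /// not bytes.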
fn offset_to_point(&self, offset: usize) -> Point {
let line = self
.lines
.partition_point(|&line_start| line_start <= offset);
Point {
line: line as i32,
column: Some(
self.contents
.get(self.lines[line - 1]..offset)
.unwrap_or_default()
.width() as i32
+ 1,
),
}
}
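    /// Returns the contents of 1-based `line_number`, without its trailing
    /// new-line sequence, or an empty string if `line_number` is out of
    /// range.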
fn get_line(&self, line_number: i32) -> &str {
if (1..=self.lines.len() as i32).contains(&line_number) {
let line_number = line_number as usize;
let start = self.lines[line_number - 1];
let end = self.lines.get(line_number).copied().unwrap_or(
self.contents[start..]
.find('\n')
.map(|ofs| ofs + start)
.unwrap_or(self.contents.len()),
);
self.contents[start..end].strip_newline()
} else {
""
}
}
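    /// Returns a [Location] that spans the tokens in `range` within this
    /// file.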
fn token_location(&self, range: RangeInclusive<&LexToken>) -> Location {
Location {
file_name: self.file_name.clone(),
span: Some(
self.offset_to_point(range.start().pos.start)
..self.offset_to_point(range.end().pos.end),
),
omit_underlines: false,
}
}
}
impl Default for SyntaxFile {
fn default() -> Self {
Self::new(String::new(), None, UTF_8)
}
}
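/// Extension trait for stripping one trailing new-line sequence from a
/// string.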
trait StripNewline {
fn strip_newline(&self) -> &str;
}
impl StripNewline for str {
fn strip_newline(&self) -> &str {
        self.strip_suffix("\r\n")
            .or_else(|| self.strip_suffix('\n'))
            .or_else(|| self.strip_suffix('\r'))
            .unwrap_or(self)
}
}
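/// Truncates `s` to roughly 64 display columns, appending `...` if anything
/// was cut off, and borrows `s` unchanged if it is already short enough.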
fn ellipsize(s: &str) -> Cow<'_, str> {
if s.width() > 64 {
let mut out = String::new();
let mut width = 0;
for c in s.chars() {
out.push(c);
width += c.width().unwrap_or(0);
if width > 64 {
break;
}
}
out.push_str("...");
Cow::from(out)
} else {
Cow::from(s)
}
}
/// A [Token], plus the syntax that it was tokenized from.
pub struct LexToken {
    /// The token.
    pub token: Token,
    /// The file that the token was read from.
    pub file: Arc<SyntaxFile>,
    /// The byte range of the token's syntax within `file.contents`.
    pos: Range<usize>,
    /// If the token came from a macro expansion, the expansion and the
    /// token's position within it.
    macro_rep: Option<MacroRepresentation>,
}
impl Debug for LexToken {
fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
self.token.fmt(f)
}
}
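/// An error from scanning tokens, with the byte range of the syntax that
/// produced it.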
#[allow(dead_code)]
struct LexError {
error: ScanError,
pos: Range<usize>,
}
impl Borrow<Token> for LexToken {
fn borrow(&self) -> &Token {
&self.token
}
}
impl LexToken {
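    /// Returns the syntax that this token was read from.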
fn representation(&self) -> &str {
&self.file.contents[self.pos.clone()]
}
}
/// A token's provenance within a macro expansion.
struct MacroRepresentation {
    /// The whole macro expansion, as syntax.
    expansion: Arc<String>,
    /// The token's byte range within `expansion`.
    pos: RangeInclusive<usize>,
}
pub struct Tokens {
tokens: Vec<LexToken>,
}
impl Tokens {
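    /// Creates a new [Tokens].  Panics if `tokens` is empty or does not end
    /// in [Token::End].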
fn new(tokens: Vec<LexToken>) -> Self {
assert!(matches!(tokens.last().unwrap().token, Token::End));
Self { tokens }
}
}
impl Debug for Tokens {
fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
write!(f, "Tokens {{ ")?;
for (index, token) in self.tokens.iter().enumerate() {
if index > 0 {
write!(f, ", ")?;
}
write!(f, "{:?}", token.representation())?;
}
write!(f, " }}")
}
}
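/// An iterator over the [LexToken]s of a [TokenSlice].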
pub struct TokenSliceIter<'a> {
slice: &'a TokenSlice,
rest: Range<usize>,
}
impl<'a> TokenSliceIter<'a> {
pub fn new(slice: &'a TokenSlice) -> Self {
Self {
slice,
rest: slice.range.clone(),
}
}
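    /// Returns the tokens that this iterator has not yet returned, as a new
    /// [TokenSlice].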
pub fn remainder(&self) -> TokenSlice {
TokenSlice {
backing: self.slice.backing.clone(),
range: self.rest.clone(),
}
}
}
impl<'a> Iterator for TokenSliceIter<'a> {
type Item = &'a LexToken;
fn next(&mut self) -> Option<Self::Item> {
if self.rest.is_empty() {
None
} else {
self.rest.start += 1;
Some(&self.slice.backing.tokens[self.rest.start - 1])
}
}
}
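/// A range of tokens within a [Tokens], cheap to clone and subslice.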
#[derive(Clone)]
pub struct TokenSlice {
backing: Rc<Tokens>,
range: Range<usize>,
}
impl Debug for TokenSlice {
fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
write!(f, "TokenSlice {{ ")?;
for (index, token) in self.tokens().iter().enumerate() {
if index > 0 {
write!(f, ", ")?;
}
write!(f, "{:?}", token.representation())?;
}
write!(f, " }}")
}
}
#[allow(missing_docs)]
impl TokenSlice {
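    /// Creates a [TokenSlice] covering all of `backing` except its trailing
    /// [Token::End].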
pub fn new(backing: Rc<Tokens>) -> Self {
let range = 0..backing.tokens.len() - 1;
Self { backing, range }
}
fn tokens(&self) -> &[LexToken] {
&self.backing.tokens[self.range.clone()]
}
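    /// Returns the [Token] at `index` within this slice, or `None` if
    /// `index` is out of range.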
pub fn get_token(&self, index: usize) -> Option<&Token> {
self.get(index).map(|token| &token.token)
}
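    /// Returns the [LexToken] at `index` within this slice, or `None` if
    /// `index` is out of range.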
pub fn get(&self, index: usize) -> Option<&LexToken> {
self.tokens().get(index)
}
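    /// Returns a [Diagnostic] with [Severity::Error] and message `text`,
    /// located at this slice's tokens.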
pub fn error<S>(&self, text: S) -> Diagnostic
where
S: ToString,
{
self.diagnostic(Severity::Error, text.to_string())
}
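    /// Returns a [Diagnostic] with [Severity::Warning] and message `text`,
    /// located at this slice's tokens.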
pub fn warning<S>(&self, text: S) -> Diagnostic
where
S: ToString,
{
self.diagnostic(Severity::Warning, text.to_string())
}
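    /// Returns the subslice of this slice designated by `range`, which is
    /// interpreted relative to this slice.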
pub fn subslice(&self, range: Range<usize>) -> Self {
debug_assert!(range.start <= range.end);
debug_assert!(range.end <= self.len());
let start = self.range.start + range.start;
let end = start + range.len();
Self {
backing: self.backing.clone(),
range: start..end,
}
}
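    /// Returns the first token in this slice.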
pub fn first(&self) -> &LexToken {
&self.backing.tokens[self.range.start]
}
fn last(&self) -> &LexToken {
&self.backing.tokens[self.range.end - 1]
}
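    /// Returns an empty slice positioned just past the end of this slice.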
pub fn end(&self) -> Self {
self.subslice(self.len()..self.len())
}
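    /// Returns the [SyntaxFile] that contains this slice, or `None` if its
    /// first and last tokens come from different files.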
fn file(&self) -> Option<&Arc<SyntaxFile>> {
let first = self.first();
let last = self.last();
if Arc::ptr_eq(&first.file, &last.file) {
Some(&first.file)
} else {
None
}
}
pub fn len(&self) -> usize {
self.tokens().len()
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
pub fn iter(&self) -> TokenSliceIter<'_> {
TokenSliceIter::new(self)
}
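    /// If any token in this slice arose through macro expansion, returns the
    /// syntax of the macro call that produced it, if it can be recovered.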
fn get_macro_call(&self) -> Option<&str> {
if self.iter().any(|token| token.macro_rep.is_some()) {
let token0 = self.first();
let token1 = self.last();
if let Some(file) = self.file() {
let start = token0.pos.start;
let end = token1.pos.end;
if start < end {
return Some(&file.contents[start..end]);
}
}
}
None
}
    fn location(&self) -> Location {
        if let Some(file) = self.file() {
            file.token_location(self.first()..=self.last())
        } else {
            let first = self.first();
            first.file.token_location(first..=first)
        }
}
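    /// Returns the subslice that begins with the first occurrence of `token`,
    /// or an empty slice at the end if `token` does not occur.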
pub fn skip_to(&self, token: &Token) -> Self {
self.skip_until(|t| t == token)
}
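    /// Returns the subslice that begins with the first token for which `f`
    /// returns true, or an empty slice at the end if there is none.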
pub fn skip_until<F>(&self, f: F) -> Self
where
F: Fn(&Token) -> bool,
{
for (index, token) in self.iter().enumerate() {
if f(&token.token) {
return self.subslice(index..self.len());
}
}
self.end()
}
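    /// If this slice begins with `token`, returns the remainder after it;
    /// otherwise returns `None`.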
pub fn skip(&self, token: &Token) -> Option<Self> {
self.skip_if(|t| t == token)
}
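    /// If `f` returns true for this slice's first token, returns the
    /// remainder after it; otherwise returns `None`.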
pub fn skip_if<F>(&self, f: F) -> Option<Self>
where
F: Fn(&Token) -> bool,
{
let mut iter = self.iter();
if iter.next().is_some_and(|token| f(&token.token)) {
Some(iter.remainder())
} else {
None
}
}
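    /// If this slice begins with a token that matches `keyword`, returns the
    /// remainder after it; otherwise returns `None`.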
pub fn skip_keyword(&self, keyword: &str) -> Option<Self> {
self.skip_if(|token| token.matches_keyword(keyword))
}
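    /// Scans `syntax` into tokens and, if this slice begins with exactly
    /// those tokens, returns the remainder after them; otherwise returns
    /// `None`.
    ///
    /// A sketch of intended use, where `subcommand` stands in for some
    /// [TokenSlice] (not compiled as a doctest):
    ///
    /// ```ignore
    /// if let Some(rest) = subcommand.skip_syntax("VARIABLES=") {
    ///     // `rest` begins just past `VARIABLES=`.
    /// }
    /// ```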
pub fn skip_syntax(&self, syntax: &str) -> Option<Self> {
let mut input = self.clone();
for token in StringScanner::new(syntax, Syntax::Interactive, true).unwrapped() {
input = input.skip(&token)?;
}
Some(input)
}
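    /// Returns a [Diagnostic] with the given `severity` and `text`, located
    /// at this slice's tokens, quoting the source lines that the slice spans
    /// and noting any macro expansion it arose from.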
pub fn diagnostic(&self, severity: Severity, text: String) -> Diagnostic {
let mut s = String::new();
if let Some(call) = self.get_macro_call() {
write!(&mut s, "In syntax expanded from `{}`: ", ellipsize(call)).unwrap();
}
if !text.is_empty() {
s.push_str(&text);
} else {
s.push_str("Syntax error.");
}
if !s.ends_with('.') {
s.push('.');
}
let location = self.location();
let mut source = Vec::new();
if let Some(Range {
start: Point { line: l0, .. },
end: Point { line: l1, .. },
}) = location.span
{
if let Some(file) = self.file() {
let lines = if l1 - l0 > 3 {
vec![l0, l0 + 1, l1]
} else {
(l0..=l1).collect()
};
for line_number in lines {
source.push((line_number, file.get_line(line_number).to_string()));
}
}
}
Diagnostic {
category: Category::Syntax,
severity,
location,
source,
stack: Vec::new(),
            command_name: None,
            text: s,
}
}
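    /// Splits this slice at each token for which `predicate` returns true,
    /// yielding the subslices between those tokens, like [slice::split].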
pub fn split<F>(&self, predicate: F) -> impl Iterator<Item = Self> + use<'_, F>
where
F: Fn(&LexToken) -> bool,
{
self.tokens().split(predicate).map(move |slice| {
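            // SAFETY: `slice` is a subslice of `self.tokens()`, so both
            // pointers are derived from the same allocation.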
let start_ofs = unsafe { slice.as_ptr().offset_from(self.tokens().as_ptr()) } as usize;
self.subslice(start_ofs..start_ofs + slice.len())
})
}
}
/// Reads commands from a [SyntaxFile], one at a time, as [Tokens].
pub struct Source {
    /// The file being read.
    file: Arc<SyntaxFile>,
    /// Divides the file into token-sized pieces of syntax.
    segmenter: Segmenter,
    /// Byte offset in `file.contents` of the next text to pass to the
    /// segmenter.
    seg_pos: usize,
    /// Tokens that have been read but not yet returned as part of a command.
    lookahead: VecDeque<LexToken>,
}
impl Source {
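    /// Creates a new [Source] that reads `file` with the default [Syntax].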
pub fn new_default(file: &Arc<SyntaxFile>) -> Self {
Self::new(file, Syntax::default())
}
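    /// Creates a new [Source] that reads `file` with the given `syntax`.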
pub fn new(file: &Arc<SyntaxFile>, syntax: Syntax) -> Self {
Self {
file: file.clone(),
segmenter: Segmenter::new(syntax, false),
seg_pos: 0,
lookahead: VecDeque::new(),
}
}
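    /// Reads and returns the [Tokens] for the next command, expanding macro
    /// calls defined in `macros`, or returns `None` at end of input.
    ///
    /// A usage sketch, mirroring the test at the bottom of this file (not
    /// compiled as a doctest):
    ///
    /// ```ignore
    /// let file = Arc::new(SyntaxFile::for_string(String::from("LIST.")));
    /// let mut source = Source::new_default(&file);
    /// while let Some(tokens) = source.read_command(&MacroSet::new()) {
    ///     println!("{tokens:?}");
    /// }
    /// ```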
pub fn read_command(&mut self, macros: &MacroSet) -> Option<Tokens> {
loop {
if let Some(end) = self
.lookahead
.iter()
.position(|token| token.token == Token::End)
{
return Some(Tokens::new(self.lookahead.drain(..=end).collect()));
}
if !self.read_lookahead(macros) {
if self.lookahead.is_empty() {
return None;
}
let len = self.file.contents.len();
self.lookahead.push_back(LexToken {
token: Token::End,
file: self.file.clone(),
pos: len..len,
macro_rep: None,
});
}
}
}
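    /// Reads the next command's worth of tokens from the file, expanding
    /// macros and merging token sequences, and appends them to the lookahead
    /// queue.  Returns false if no tokens remain to be read.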
fn read_lookahead(&mut self, macros: &MacroSet) -> bool {
let mut errors = Vec::new();
let mut pp = VecDeque::new();
while let Some((seg_len, seg_type)) = self
.segmenter
.push(&self.file.contents[self.seg_pos..], true)
.unwrap()
{
let pos = self.seg_pos..self.seg_pos + seg_len;
self.seg_pos += seg_len;
match seg_type.to_token(&self.file.contents[pos.clone()]) {
None => (),
Some(Ok(token)) => {
let end = token == Token::End;
pp.push_back(LexToken {
file: self.file.clone(),
token,
pos,
macro_rep: None,
});
if end {
break;
}
}
Some(Err(error)) => errors.push(LexError { error, pos }),
}
}
if pp.is_empty() {
return false;
}
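        // Expand macros, if macro expansion is enabled and any macros are
        // defined; otherwise, pass the tokens through unchanged.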
let mut merge = if !Settings::global().macros.expand || macros.is_empty() {
take(&mut pp)
} else {
let mut merge = VecDeque::new();
while !pp.is_empty() {
self.expand_macro(macros, &mut pp, &mut merge);
}
merge
};
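        // Merge sequences of tokens that combine into a single token (see
        // `merge_tokens`), then move the results into the lookahead queue.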
while let Ok(Some(result)) =
merge_tokens(|index| Ok(merge.get(index).map(|token| &token.token)))
{
match result {
MergeAction::Copy => self.lookahead.push_back(merge.pop_front().unwrap()),
MergeAction::Expand { n, token } => {
let first = &merge[0];
let last = &merge[n - 1];
self.lookahead.push_back(LexToken {
file: self.file.clone(),
token,
pos: first.pos.start..last.pos.end,
macro_rep: match (&first.macro_rep, &last.macro_rep) {
(Some(a), Some(b)) if Arc::ptr_eq(&a.expansion, &b.expansion) => {
Some(MacroRepresentation {
expansion: a.expansion.clone(),
pos: *a.pos.start()..=*b.pos.end(),
})
}
_ => None,
},
});
merge.drain(..n);
}
}
}
true
}
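    /// If the tokens at the head of `src` are a macro call, expands the call
    /// and appends the expansion to `dst`; otherwise, moves a single token
    /// from `src` to `dst`.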
fn expand_macro(
&self,
macros: &MacroSet,
src: &mut VecDeque<LexToken>,
dst: &mut VecDeque<LexToken>,
) {
let Some(mut parser) = Parser::new(macros, &src[0].token) else {
dst.push_back(src.pop_front().unwrap());
return;
};
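        // Feed tokens to the macro call parser until the call is complete or
        // input runs out.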
for token in src.range(1..) {
if parser.push(&token.token, &self.file.contents[token.pos.clone()], &|e| {
println!("{e:?}")
}) == ParseStatus::Complete
{
break;
}
}
let call = parser.finish();
if call.is_empty() {
dst.push_back(src.pop_front().unwrap());
return;
}
let c0 = &src[0];
let c1 = &src[call.len() - 1];
let mut expansion = Vec::new();
call.expand(
self.segmenter.syntax(),
self.file.token_location(c0..=c1),
&mut expansion,
|e| println!("{e:?}"),
);
        if Settings::global().macros.print_expansions {
            // TODO: print the macro expansion when this setting is enabled.
        }
        // Render the expansion as a syntax string, recording each token's
        // byte range within it.
        let mut macro_rep = String::new();
        let mut pos = Vec::with_capacity(expansion.len());
        for [prefix, token] in macro_tokens_to_syntax(expansion.as_slice()) {
            macro_rep.push_str(prefix);
            let len = macro_rep.len();
            macro_rep.push_str(token);
            pos.push(len..=len + token.len() - 1);
        }
let macro_rep = Arc::new(macro_rep);
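        // Each expanded token gets the file position of the entire macro
        // call, plus its own byte range within the expansion text.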
for (index, token) in expansion.into_iter().enumerate() {
let lt = LexToken {
file: self.file.clone(),
token: token.token,
pos: c0.pos.start..c1.pos.end,
macro_rep: Some(MacroRepresentation {
expansion: Arc::clone(¯o_rep),
pos: pos[index].clone(),
}),
};
dst.push_back(lt);
}
src.drain(..call.len());
}
}
#[cfg(test)]
mod new_lexer_tests {
use std::sync::Arc;
use encoding_rs::UTF_8;
use crate::macros::MacroSet;
use super::{Source, SyntaxFile};
#[test]
fn test() {
let code = r#"DATA LIST LIST /A * B * X * Y * .
BEGIN DATA.
2 3 4 5
END DATA.
CROSSTABS VARIABLES X (1,7) Y (1,7) /TABLES X BY Y.
"#;
let file = Arc::new(SyntaxFile::new(
String::from(code),
Some(String::from("crosstabs.sps")),
UTF_8,
));
let mut source = Source::new_default(&file);
while let Some(tokens) = source.read_command(&MacroSet::new()) {
println!("{tokens:?}");
}
}
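    #[test]
    fn for_string() {
        // A minimal sketch of reading from a plain string with no file name,
        // assuming that `LIST.` scans as a single complete command.
        let file = Arc::new(SyntaxFile::for_string(String::from("LIST.")));
        let mut source = Source::new_default(&file);
        assert!(source.read_command(&MacroSet::new()).is_some());
        assert!(source.read_command(&MacroSet::new()).is_none());
    }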
}