use std::borrow::Cow;
use std::fmt::Debug;
use std::ops::Range;
use fancy_regex::Regex;
use super::markers::PositionMarker;
use super::segments::base::{ErasedSegment, SegmentBuilder};
use crate::core::config::FluffConfig;
use crate::core::dialects::base::Dialect;
use crate::core::errors::{SQLLexError, ValueError};
use crate::core::parser::segments::base::Tables;
use crate::core::slice_helpers::{is_zero_slice, offset_slice};
use crate::core::templaters::base::TemplatedFile;
use crate::dialects::SyntaxKind;
/// A raw lexed token: a piece of source text tagged with the name of the
/// matcher that produced it and the syntax kind it should become.
///
/// Borrows from the input string where possible (`Cow`) to avoid copies.
#[derive(Debug, Clone)]
pub struct Element<'a> {
// Name of the matcher/pattern that produced this element (e.g. "whitespace").
name: &'static str,
// The matched text; borrowed from the source unless it had to be built.
text: Cow<'a, str>,
syntax_kind: SyntaxKind,
}
impl<'a> Element<'a> {
/// Builds an element, converting `text` into a `Cow` (borrowed or owned).
fn new(name: &'static str, syntax_kind: SyntaxKind, text: impl Into<Cow<'a, str>>) -> Self {
Self { name, syntax_kind, text: text.into() }
}
}
/// An `Element` that has been located within the templated file: it carries
/// the byte range of the templated string that it covers.
#[derive(Debug)]
pub struct TemplateElement<'a> {
raw: Cow<'a, str>,
// Byte range of this element within the templated string.
template_slice: Range<usize>,
matcher: Info,
}
/// Identity (name and syntax kind) of the matcher that produced a
/// `TemplateElement`.
#[derive(Debug)]
struct Info {
name: &'static str,
syntax_kind: SyntaxKind,
}
impl<'a> TemplateElement<'a> {
pub fn from_element(element: Element<'a>, template_slice: Range<usize>) -> Self {
TemplateElement {
raw: element.text,
template_slice,
matcher: Info { name: element.name, syntax_kind: element.syntax_kind },
}
}
pub fn to_segment(
&self,
pos_marker: PositionMarker,
subslice: Option<Range<usize>>,
) -> ErasedSegment {
let slice = subslice.map_or_else(|| self.raw.as_ref(), |slice| &self.raw[slice]);
SegmentBuilder::token(0, slice, self.matcher.syntax_kind).with_position(pos_marker).finish()
}
}
/// Result of applying a `Matcher`: the unconsumed remainder of the input
/// plus any elements produced by the match.
#[derive(Debug)]
pub struct Match<'a> {
// The part of the input left after the match (the whole input if no match).
pub forward_string: &'a str,
pub elements: Vec<Element<'a>>,
}
impl Match<'_> {
/// True when the matcher produced at least one element.
pub fn is_non_empty(&self) -> bool {
!self.elements.is_empty()
}
}
/// A lexer rule: a primary pattern plus optional subdivision and
/// post-subdivision trimming patterns applied to each match.
#[derive(Debug, Clone)]
pub struct Matcher {
pattern: Pattern,
// Optional pattern used to split a match into smaller elements.
subdivider: Option<Pattern>,
// Optional pattern used to trim the pieces produced by the subdivider.
trim_post_subdivide: Option<Pattern>,
}
impl Matcher {
/// Wraps a pattern with no subdivision or post-subdivision trimming.
pub const fn new(pattern: Pattern) -> Self {
Self { pattern, subdivider: None, trim_post_subdivide: None }
}
/// Convenience constructor for a literal-string matcher.
pub const fn string(
name: &'static str,
pattern: &'static str,
syntax_kind: SyntaxKind,
) -> Self {
Self::new(Pattern::string(name, pattern, syntax_kind))
}
/// Convenience constructor for a regex matcher.
///
/// Panics if `pattern` is not a valid regex (see `Pattern::regex`).
pub fn regex(name: &'static str, pattern: &'static str, syntax_kind: SyntaxKind) -> Self {
Self::new(Pattern::regex(name, pattern, syntax_kind))
}
/// Sets a pattern used to split each match into smaller elements
/// (e.g. splitting a terminator match on semicolons).
pub fn subdivider(mut self, subdivider: Pattern) -> Self {
self.subdivider = Some(subdivider);
self
}
/// Sets a pattern used to trim the chunks produced by the subdivider
/// (e.g. peeling newlines off each chunk).
pub fn post_subdivide(mut self, trim_post_subdivide: Pattern) -> Self {
self.trim_post_subdivide = Some(trim_post_subdivide);
self
}
/// Name of the underlying pattern.
pub fn name(&self) -> &'static str {
self.pattern.name
}
/// Attempts to match the front of `forward_string`.
///
/// On success the matched prefix is (optionally) subdivided into elements
/// and the remainder is returned in `forward_string`; on failure the input
/// is returned untouched with no elements.
pub fn matches<'a>(&self, forward_string: &'a str) -> Match<'a> {
match self.pattern.matches(forward_string) {
Some(matched) => {
let new_elements = self.subdivide(matched, self.pattern.syntax_kind);
Match { forward_string: &forward_string[matched.len()..], elements: new_elements }
}
None => Match { forward_string, elements: Vec::new() },
}
}
/// Splits `matched` on every occurrence of the subdivider pattern.
///
/// Each chunk before a divider is passed through `trim_match`, then the
/// divider itself is emitted as its own element. Without a subdivider the
/// whole match becomes a single element of kind `matched_kind`.
fn subdivide<'a>(&self, matched: &'a str, matched_kind: SyntaxKind) -> Vec<Element<'a>> {
match &self.subdivider {
Some(subdivider) => {
let mut elem_buff = Vec::new();
let mut str_buff = matched;
while !str_buff.is_empty() {
// No further divider: trim whatever is left and stop.
let Some(div_pos) = subdivider.search(str_buff) else {
let mut trimmed_elems = self.trim_match(str_buff);
elem_buff.append(&mut trimmed_elems);
break;
};
// Emit the (trimmed) content before the divider, then the
// divider itself, and continue scanning after it.
let mut trimmed_elems = self.trim_match(&str_buff[..div_pos.start]);
let div_elem = Element::new(
subdivider.name,
subdivider.syntax_kind,
&str_buff[div_pos.start..div_pos.end],
);
elem_buff.append(&mut trimmed_elems);
elem_buff.push(div_elem);
str_buff = &str_buff[div_pos.end..];
}
elem_buff
}
None => {
vec![Element::new(self.name(), matched_kind, matched)]
}
}
}
/// Peels `trim_post_subdivide` matches off a subdivided chunk.
///
/// A match at the start of the remaining text becomes its own element; a
/// match reaching the end of the text flushes the accumulated content
/// first, then emits the trailing match. Returns an empty vec when no
/// trim pattern is configured.
///
/// NOTE(review): in the "match strictly inside" branch the matched text
/// itself is folded into `content_buff` along with the text before it —
/// confirm that is intentional rather than `..start`.
fn trim_match<'a>(&self, matched_str: &'a str) -> Vec<Element<'a>> {
let Some(trim_post_subdivide) = &self.trim_post_subdivide else {
return Vec::new();
};
// Helper to build an element carrying the trim pattern's name/kind.
let mk_element =
|text| Element::new(trim_post_subdivide.name, trim_post_subdivide.syntax_kind, text);
let mut elem_buff = Vec::new();
let mut content_buff = String::new();
let mut str_buff = matched_str;
while !str_buff.is_empty() {
let Some(trim_pos) = trim_post_subdivide.search(str_buff) else {
break;
};
let start = trim_pos.start;
let end = trim_pos.end;
if start == 0 {
// Match at the very start: emit it and advance past it.
elem_buff.push(mk_element(&str_buff[..end]));
str_buff = str_buff[end..].into();
} else if end == str_buff.len() {
// Match runs to the end: emit accumulated content plus the text
// before the match (under the trim pattern's name/kind), then
// the match itself, and finish.
let raw = format!("{}{}", content_buff, &str_buff[..start]);
elem_buff.push(Element::new(
trim_post_subdivide.name,
trim_post_subdivide.syntax_kind,
raw,
));
elem_buff.push(mk_element(&str_buff[start..end]));
content_buff.clear();
str_buff = "";
} else {
// Match strictly inside: accumulate up to and including the
// match, then continue after it.
content_buff.push_str(&str_buff[..end]);
str_buff = &str_buff[end..];
}
}
// Flush any leftover content under this matcher's own name/kind.
if !content_buff.is_empty() || !str_buff.is_empty() {
let raw = format!("{}{}", content_buff, str_buff);
elem_buff.push(Element::new(self.pattern.name, self.pattern.syntax_kind, raw));
}
elem_buff
}
}
/// A named search pattern (literal string or compiled regex) tagged with the
/// syntax kind its matches should produce.
#[derive(Debug, Clone)]
pub struct Pattern {
name: &'static str,
syntax_kind: SyntaxKind,
kind: SearchPatternKind,
}
/// The underlying match strategy for a `Pattern`.
#[derive(Debug, Clone)]
pub enum SearchPatternKind {
String(&'static str),
Regex(Regex),
}
impl Pattern {
    /// A pattern that matches a literal string prefix.
    pub const fn string(
        name: &'static str,
        template: &'static str,
        syntax_kind: SyntaxKind,
    ) -> Self {
        Self { name, syntax_kind, kind: SearchPatternKind::String(template) }
    }

    /// A pattern backed by a compiled regex.
    ///
    /// Panics if `regex` is not a valid pattern.
    pub fn regex(name: &'static str, regex: &'static str, syntax_kind: SyntaxKind) -> Self {
        Self { name, syntax_kind, kind: SearchPatternKind::Regex(Regex::new(regex).unwrap()) }
    }

    /// Returns the matched prefix of `forward_string`, or `None` when the
    /// pattern does not match at position zero.
    fn matches<'a>(&self, forward_string: &'a str) -> Option<&'a str> {
        match &self.kind {
            SearchPatternKind::String(template) => {
                forward_string.starts_with(template).then_some(*template)
            }
            SearchPatternKind::Regex(regex) => regex
                .find(forward_string)
                .ok()
                .flatten()
                .filter(|found| found.start() == 0)
                .map(|found| found.as_str()),
        }
    }

    /// Finds the first occurrence of the pattern anywhere in
    /// `forward_string`, returning its byte range.
    fn search(&self, forward_string: &str) -> Option<Range<usize>> {
        match &self.kind {
            SearchPatternKind::String(template) => {
                let start = forward_string.find(template)?;
                Some(start..start + template.len())
            }
            SearchPatternKind::Regex(regex) => {
                regex.find(forward_string).ok().flatten().map(|found| found.range())
            }
        }
    }
}
/// The lexer: turns raw SQL text (or an already-templated file) into
/// segments using the matchers of the dialect configured in `config`.
pub struct Lexer<'a> {
config: &'a FluffConfig,
// Fallback matcher used when no dialect matcher can make progress.
last_resort_lexer: Matcher,
}
/// Input accepted by `Lexer::lex`: either a plain string or a file that has
/// already been through templating.
pub enum StringOrTemplate<'a> {
String(&'a str),
Template(TemplatedFile),
}
impl<'a> Lexer<'a> {
pub fn new(config: &'a FluffConfig, _dialect: Option<Dialect>) -> Self {
Lexer {
config,
last_resort_lexer: Matcher::regex("<unlexable>", r"[^\t\n.]*", SyntaxKind::Unlexable),
}
}
pub fn lex(
&self,
tables: &Tables,
raw: StringOrTemplate,
) -> Result<(Vec<ErasedSegment>, Vec<SQLLexError>), ValueError> {
let template;
let mut str_buff = match raw {
StringOrTemplate::String(s) => {
template = TemplatedFile::from_string(s.into());
s
}
StringOrTemplate::Template(slot) => {
template = slot;
template.templated_str.as_ref().unwrap()
}
};
let mut element_buffer: Vec<Element> = Vec::new();
let lexer_matchers = self.config.get_dialect().lexer_matchers();
loop {
let mut res = Lexer::lex_match(str_buff, lexer_matchers);
element_buffer.append(&mut res.elements);
if res.forward_string.is_empty() {
break;
}
let mut resort_res = self.last_resort_lexer.matches(str_buff);
if !resort_res.elements.is_empty() {
break;
}
str_buff = resort_res.forward_string;
element_buffer.append(&mut resort_res.elements);
}
let templated_buffer = Lexer::map_template_slices(element_buffer, &template);
let mut segments = self.elements_to_segments(templated_buffer, &template);
for seg in &mut segments {
seg.get_mut().set_id(tables.next_id())
}
Ok((segments, Vec::new()))
}
#[allow(dead_code)]
fn violations_from_segments(segments: Vec<ErasedSegment>) -> Vec<SQLLexError> {
segments
.into_iter()
.filter(|s| s.is_type(SyntaxKind::Unlexable))
.map(|s| {
SQLLexError::new(
format!(
"Unable to lex characters: {}",
s.raw().chars().take(10).collect::<String>()
),
s.get_position_marker().unwrap().clone(),
)
})
.collect()
}
fn lex_match<'b>(mut forward_string: &'b str, lexer_matchers: &[Matcher]) -> Match<'b> {
let mut elem_buff = Vec::new();
'main: loop {
if forward_string.is_empty() {
return Match { forward_string, elements: elem_buff };
}
for matcher in lexer_matchers {
let mut match_result = matcher.matches(forward_string);
if !match_result.elements.is_empty() {
elem_buff.append(&mut match_result.elements);
forward_string = match_result.forward_string;
continue 'main;
}
}
return Match { forward_string, elements: elem_buff };
}
}
fn map_template_slices<'b>(
elements: Vec<Element<'b>>,
template: &TemplatedFile,
) -> Vec<TemplateElement<'b>> {
let mut idx = 0;
let mut templated_buff: Vec<TemplateElement> = Vec::with_capacity(elements.len());
for element in elements {
let template_slice = offset_slice(idx, element.text.len());
idx += element.text.len();
let templated_string = template.get_templated_string().unwrap();
if templated_string[template_slice.clone()] != element.text {
panic!(
"Template and lexed elements do not match. This should never happen {:?} != \
{:?}",
element.text, &templated_string[template_slice]
);
}
templated_buff.push(TemplateElement::from_element(element, template_slice));
}
templated_buff
}
fn elements_to_segments(
&self,
elements: Vec<TemplateElement>,
templated_file: &TemplatedFile,
) -> Vec<ErasedSegment> {
let mut segments = iter_segments(elements, templated_file);
let position_maker = segments
.last()
.map(|segment| segment.get_position_marker().unwrap().end_point_marker())
.unwrap_or_else(|| {
PositionMarker::from_point(0, 0, templated_file.clone(), None, None)
});
segments.push(
SegmentBuilder::token(0, "", SyntaxKind::EndOfFile)
.with_position(position_maker)
.finish(),
);
segments
}
}
/// Converts templated elements into positioned segments by walking the
/// template's slice map (`sliced_file`) alongside the lexed elements.
///
/// NOTE(review): `consumed_element_length` is never reassigned (always 0),
/// so the "partially consumed element" arithmetic below is currently inert —
/// confirm whether partial consumption is still planned.
fn iter_segments(
lexed_elements: Vec<TemplateElement>,
templated_file: &TemplatedFile,
) -> Vec<ErasedSegment> {
let mut result: Vec<ErasedSegment> = Vec::with_capacity(lexed_elements.len());
// Index of the first template slice that may still overlap the current element.
let mut tfs_idx = 0;
let templated_file_slices = &templated_file.sliced_file;
for element in lexed_elements.into_iter() {
let consumed_element_length = 0;
// Source position remembered when an element spans a slice boundary.
let mut stashed_source_idx = None;
// Walk the slices starting from where the previous element left off,
// keeping `idx` as the absolute slice index.
for (idx, tfs) in templated_file_slices
.iter()
.skip(tfs_idx)
.enumerate()
.map(|(i, tfs)| (i + tfs_idx, tfs))
{
if is_zero_slice(&tfs.templated_slice) {
// Zero-length templated slice: nothing to consume here.
// NOTE(review): `_slice` is computed but unused and the handler
// is a no-op — presumably a stub for future handling.
let _slice = if idx + 1 < templated_file_slices.len() {
templated_file_slices[idx + 1].clone().into()
} else {
None
};
_handle_zero_length_slice();
continue;
}
if tfs.slice_type == "literal" {
// For literal slices, source and templated positions differ by a
// constant offset.
let tfs_offset = tfs.source_slice.start - tfs.templated_slice.start;
if element.template_slice.end <= tfs.templated_slice.end {
// Element ends within this slice: emit it in full.
let slice_start = stashed_source_idx.unwrap_or_else(|| {
element.template_slice.start + consumed_element_length + tfs_offset
});
result.push(element.to_segment(
PositionMarker::new(
slice_start..element.template_slice.end + tfs_offset,
element.template_slice.clone(),
templated_file.clone(),
None,
None,
),
Some(consumed_element_length..element.raw.len()),
));
// Only advance the slice cursor if the slice was used up exactly.
if element.template_slice.end == tfs.templated_slice.end {
tfs_idx += 1
}
break;
} else if element.template_slice.start == tfs.templated_slice.end {
// Element starts exactly at the end of this slice: move on.
continue;
} else {
// Element extends beyond this literal slice.
if element.matcher.name == "whitespace" {
if stashed_source_idx.is_some() {
panic!("Found literal whitespace with stashed idx!")
}
// Emit only the part of the whitespace that lies inside
// this slice; the loop continues with the next slice.
let incremental_length =
tfs.templated_slice.end - element.template_slice.start;
result.push(element.to_segment(
PositionMarker::new(
element.template_slice.start + consumed_element_length + tfs_offset
..tfs.templated_slice.end + tfs_offset,
element.template_slice.clone(),
templated_file.clone(),
None,
None,
),
offset_slice(consumed_element_length, incremental_length).into(),
));
} else {
// Non-whitespace spanning a boundary: remember a start
// position and keep scanning.
// NOTE(review): this adds the slice *index* `idx` to a
// template offset — verify the intended units; a source
// position seems more plausible here.
if stashed_source_idx.is_none() {
stashed_source_idx = (element.template_slice.start + idx).into();
continue;
}
}
}
} else if matches!(tfs.slice_type.as_str(), "templated" | "block_start") {
if !is_zero_slice(&tfs.templated_slice) {
// Non-empty block_start slices are not handled yet.
if tfs.slice_type == "block_start" {
unimplemented!()
}
if element.template_slice.end <= tfs.templated_slice.end {
// Element ends within this templated slice: emit it,
// anchored to the slice's source range.
let slice_start = if let Some(stashed_source_idx) = stashed_source_idx {
stashed_source_idx
} else {
tfs.source_slice.start + consumed_element_length
};
result.push(element.to_segment(
PositionMarker::new(
slice_start..tfs.source_slice.end,
element.template_slice.clone(),
templated_file.clone(),
None,
None,
),
Some(consumed_element_length..element.raw.len()),
));
if element.template_slice.end == tfs.templated_slice.end {
tfs_idx += 1
}
break;
} else {
// Element spanning beyond a templated slice: not handled yet.
unimplemented!()
}
}
}
}
}
result
}
/// Placeholder for handling zero-length template slices; currently a no-op.
fn _handle_zero_length_slice() {
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `matcher` consumes exactly `match_string` from the front
    /// of `in_string`, or nothing at all when `match_string` is `None`.
    fn assert_matches(in_string: &str, matcher: &Matcher, match_string: Option<&str>) {
        let res = matcher.matches(in_string);
        match match_string {
            Some(expected) => {
                assert_eq!(res.forward_string, &in_string[expected.len()..]);
                assert_eq!(res.elements.len(), 1);
                assert_eq!(res.elements[0].text, expected);
            }
            None => {
                assert_eq!(res.forward_string, in_string);
                assert_eq!(res.elements.len(), 0);
            }
        }
    }

    #[test]
    fn test_parser_lexer_trim_post_subdivide() {
        // A terminator matcher that subdivides on ";" and trims newlines.
        let matcher: Vec<Matcher> = vec![
            Matcher::regex(
                "function_script_terminator",
                r";\s+(?!\*)\/(?!\*)|\s+(?!\*)\/(?!\*)",
                SyntaxKind::StatementTerminator,
            )
            .subdivider(Pattern::string("semicolon", ";", SyntaxKind::Semicolon))
            .post_subdivide(Pattern::regex("newline", r"(\n|\r\n)+", SyntaxKind::Newline)),
        ];

        let res = Lexer::lex_match(";\n/\n", &matcher);
        let texts: Vec<&str> = res.elements.iter().map(|e| e.text.as_ref()).collect();
        assert_eq!(texts, [";", "\n", "/"]);
    }

    #[test]
    fn test_parser_lexer_regex() {
        let cases = [
            ("fsaljk", "f", "f"),
            ("fsaljk", r"f", "f"),
            ("fsaljk", r"[fas]*", "fsa"),
            (" \t fsaljk", r"[^\S\r\n]*", " \t "),
            (" \t \n fsaljk", r"[^\S\r\n]*", " \t "),
            ("'something boring' \t \n fsaljk", r"'[^']*'", "'something boring'"),
            (
                "' something exciting \t\n ' \t \n fsaljk",
                r"'[^']*'",
                "' something exciting \t\n '",
            ),
        ];
        for &(raw, pattern, expected) in &cases {
            let matcher = Matcher::regex("test", pattern, SyntaxKind::Word);
            assert_matches(raw, &matcher, Some(expected));
        }
    }

    #[test]
    fn test_parser_lexer_string() {
        let dot = Matcher::string("dot", ".", SyntaxKind::Dot);
        assert_matches(".fsaljk", &dot, Some("."));
        assert_matches("fsaljk", &dot, None);
    }

    #[test]
    fn test_parser_lexer_lex_match() {
        let matchers: Vec<Matcher> = vec![
            Matcher::string("dot", ".", SyntaxKind::Dot),
            Matcher::regex("test", "#[^#]*#", SyntaxKind::Dash),
        ];
        let res = Lexer::lex_match("..#..#..#", &matchers);
        assert_eq!(res.forward_string, "#");
        assert_eq!(res.elements.len(), 5);
        assert_eq!(res.elements[2].text, "#..#");
    }
}