use std::fmt::{Debug, Display, Formatter};
use std::ops::Range;
use dyn_clone::DynClone;
use fancy_regex::{Error, Regex};
use super::markers::PositionMarker;
use super::segments::meta::EndOfFile;
use crate::core::config::FluffConfig;
use crate::core::dialects::base::Dialect;
use crate::core::errors::{SQLLexError, ValueError};
use crate::core::parser::segments::base::{
Segment, SegmentConstructorFn, UnlexableSegment, UnlexableSegmentNewArgs,
};
use crate::core::slice_helpers::{is_zero_slice, offset_slice};
use crate::core::templaters::base::TemplatedFile;
use crate::helpers::Boxed;
/// A single raw token produced by lexing, tagged with the matcher that
/// produced it (so it can later be turned into a parser segment).
#[derive(Debug, Clone)]
pub struct LexedElement {
    // The raw text matched for this element.
    raw: String,
    // The matcher that produced this element; used by
    // `TemplateElement::to_segment` to construct the final segment.
    matcher: Box<dyn Matcher>,
}
impl LexedElement {
    /// Build an element from raw matched text and the matcher that found it.
    pub fn new(raw: String, matcher: Box<dyn Matcher>) -> Self {
        Self { raw, matcher }
    }
}
/// A `LexedElement` that has additionally been located within the templated
/// file via `template_slice`.
#[derive(Debug)]
pub struct TemplateElement {
    // The raw text matched for this element.
    raw: String,
    // Byte range of this element within the templated string.
    template_slice: Range<usize>,
    // The matcher that produced this element.
    matcher: Box<dyn Matcher>,
}
impl TemplateElement {
pub fn from_element(element: LexedElement, template_slice: Range<usize>) -> Self {
TemplateElement { raw: element.raw, template_slice, matcher: element.matcher }
}
pub fn to_segment(
&self,
pos_marker: PositionMarker,
subslice: Option<Range<usize>>,
) -> Box<dyn Segment> {
let slice = subslice.map_or_else(|| self.raw.clone(), |slice| self.raw[slice].to_string());
self.matcher.construct_segment(slice, pos_marker)
}
}
/// The result of applying a matcher: the elements it produced plus the
/// remaining text that was not consumed.
#[derive(Debug)]
pub struct LexMatch {
    // The unconsumed remainder of the input string.
    forward_string: String,
    // The elements produced by the match (empty if nothing matched).
    pub elements: Vec<LexedElement>,
}
impl LexMatch {
    /// True when this match produced at least one element.
    // `&self` instead of `self: &Self` removes the need for the
    // `needless_arbitrary_self_type` clippy suppression.
    pub fn is_non_empty(&self) -> bool {
        !self.elements.is_empty()
    }
}
/// Helper trait so `Box<dyn Matcher>` values can be cloned; implemented for
/// every `Matcher` by the blanket impl below.
pub trait CloneMatcher {
    /// Clone `self` into a fresh boxed trait object.
    fn clone_box(&self) -> Box<dyn Matcher>;
}
impl<T: Matcher + DynClone> CloneMatcher for T {
    fn clone_box(&self) -> Box<dyn Matcher> {
        dyn_clone::clone(self).boxed()
    }
}
/// The interface for a lexer matcher: something that can recognise a token at
/// the front of a string and turn it into lexed elements / parser segments.
// Using `&self` (rather than `self: &Self`) removes the need for the
// `needless_arbitrary_self_type` clippy suppression.
pub trait Matcher: Debug + DynClone + CloneMatcher + 'static {
    /// A name identifying this matcher (used in diagnostics and dispatch).
    fn get_name(&self) -> String;
    /// Match against the *start* of `forward_string`, returning the produced
    /// elements plus whatever text remains unconsumed.
    fn match_(&self, forward_string: String) -> Result<LexMatch, ValueError>;
    /// Search anywhere in `forward_string` for this matcher's pattern,
    /// returning the byte range of the first occurrence, if any.
    fn search(&self, forward_string: &str) -> Option<Range<usize>>;
    /// Optional matcher used to split a match into smaller pieces.
    fn get_sub_divider(&self) -> Option<Box<dyn Matcher>>;
    /// Optional matcher used to trim the pieces produced by subdivision.
    fn get_trim_post_subdivide(&self) -> Option<Box<dyn Matcher>>;
    /// Split `matched` on every occurrence of the sub-divider (if configured),
    /// trimming each intermediate chunk via `_trim_match`.
    fn _subdivide(&self, matched: LexedElement) -> Vec<LexedElement> {
        if let Some(sub_divider) = self.get_sub_divider() {
            let mut elem_buff: Vec<LexedElement> = vec![];
            let mut str_buff = matched.raw;
            while !str_buff.is_empty() {
                // No need to clone the matcher each iteration just to search.
                if let Some(div_pos) = sub_divider.search(&str_buff) {
                    // Everything before the divider gets trimmed; the divider
                    // itself becomes its own element.
                    let trimmed_elems = self._trim_match(&str_buff[..div_pos.start]);
                    let div_elem = LexedElement::new(
                        str_buff[div_pos.start..div_pos.end].to_string(),
                        sub_divider.clone(),
                    );
                    // `extend` moves the elements instead of cloning them.
                    elem_buff.extend(trimmed_elems);
                    elem_buff.push(div_elem);
                    str_buff = str_buff[div_pos.end..].to_string();
                } else {
                    // No further dividers: trim the remainder and stop.
                    let trimmed_elems = self._trim_match(&str_buff);
                    elem_buff.extend(trimmed_elems);
                    break;
                }
            }
            elem_buff
        } else {
            vec![matched]
        }
    }
    /// Trim a matched string using the `trim_post_subdivide` matcher,
    /// splitting leading/trailing trim-matches off as separate elements.
    /// Returns an empty vec when there is no trim matcher or no content.
    fn _trim_match(&self, matched_str: &str) -> Vec<LexedElement> {
        let mut elem_buff = Vec::new();
        let mut content_buff = String::new();
        let mut str_buff = String::from(matched_str);
        if let Some(trim_post_subdivide) = self.get_trim_post_subdivide() {
            while !str_buff.is_empty() {
                // Search directly on the boxed matcher; no clone required.
                if let Some(trim_pos) = trim_post_subdivide.search(&str_buff) {
                    let start = trim_pos.start;
                    let end = trim_pos.end;
                    if start == 0 {
                        // Trim match at the very start: emit it and advance.
                        elem_buff.push(LexedElement::new(
                            str_buff[..end].to_string(),
                            trim_post_subdivide.clone(),
                        ));
                        str_buff = str_buff[end..].to_string();
                    } else if end == str_buff.len() {
                        // Trim match at the very end: emit accumulated content
                        // then the trim match, and we are done.
                        elem_buff.push(LexedElement::new(
                            format!("{}{}", content_buff, &str_buff[..start]),
                            trim_post_subdivide.clone(),
                        ));
                        elem_buff.push(LexedElement::new(
                            str_buff[start..end].to_string(),
                            trim_post_subdivide.clone(),
                        ));
                        content_buff.clear();
                        str_buff.clear();
                    } else {
                        // Trim match in the middle: accumulate and continue.
                        content_buff.push_str(&str_buff[..end]);
                        str_buff = str_buff[end..].to_string();
                    }
                } else {
                    break;
                }
            }
            if !content_buff.is_empty() || !str_buff.is_empty() {
                // Whatever remains is attributed to this matcher itself.
                elem_buff.push(LexedElement::new(
                    format!("{}{}", content_buff, str_buff),
                    self.clone_box(),
                ));
            }
        }
        elem_buff
    }
    /// Construct a parser segment from raw text. Matchers that produce
    /// segments must override this; the default is a loud stub.
    fn construct_segment(&self, _raw: String, _pos_marker: PositionMarker) -> Box<dyn Segment> {
        unimplemented!("{}", std::any::type_name::<Self>());
    }
}
// Generate the `Clone` impl for `Box<dyn Matcher>` via the dyn-clone crate.
dyn_clone::clone_trait_object!(Matcher);
impl Display for dyn Matcher {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        write!(f, "Matcher({})", self.get_name())
    }
}
/// A matcher that recognises an exact literal string (`template`) at the
/// start of the input and builds segments via `segment_constructor`.
#[derive(Clone)]
pub struct StringLexer<SegmentArgs: 'static + Clone> {
    // Human-readable name for diagnostics.
    name: &'static str,
    // The literal text this lexer matches.
    template: &'static str,
    // Function used to construct the resulting segment.
    segment_constructor: SegmentConstructorFn<SegmentArgs>,
    // Extra arguments forwarded to `segment_constructor`.
    segment_args: SegmentArgs,
    // Optional matcher used to split a match into smaller pieces.
    sub_divider: Option<Box<dyn Matcher>>,
    // Optional matcher used to trim subdivided pieces.
    trim_post_subdivide: Option<Box<dyn Matcher>>,
}
impl<SegmentArgs: Clone + Debug> StringLexer<SegmentArgs> {
    /// Construct a lexer that matches the exact literal `template`.
    pub fn new(
        name: &'static str,
        template: &'static str,
        segment_constructor: SegmentConstructorFn<SegmentArgs>,
        segment_args: SegmentArgs,
        sub_divider: Option<Box<dyn Matcher>>,
        trim_post_subdivide: Option<Box<dyn Matcher>>,
    ) -> Self {
        StringLexer {
            name,
            template,
            segment_constructor,
            segment_args,
            sub_divider,
            trim_post_subdivide,
        }
    }
    /// Match the literal template against the start of `forward_string`.
    fn _match(&self, forward_string: &str) -> Option<LexedElement> {
        if forward_string.starts_with(self.template) {
            Some(LexedElement { raw: self.template.to_string(), matcher: Box::new(self.clone()) })
        } else {
            None
        }
    }
    /// Trimming is not supported for plain string lexers.
    /// NOTE(review): this panics, so `_subdivide` with a configured
    /// sub-divider will currently panic too — confirm this is intended.
    fn _trim_match(&self, _matched_string: String) -> Vec<LexedElement> {
        panic!("Not implemented")
    }
    /// Split a match on occurrences of the configured sub-divider.
    fn _subdivide(&self, matched: LexedElement) -> Vec<LexedElement> {
        if let Some(sub_divider) = &self.sub_divider {
            let mut elem_buff: Vec<LexedElement> = vec![];
            let mut str_buff = matched.raw;
            while !str_buff.is_empty() {
                // Use the divider already borrowed by the `if let` above
                // instead of the original `self.sub_divider.clone().unwrap()`.
                if let Some(div_pos) = sub_divider.search(&str_buff) {
                    let trimmed_elems = self._trim_match(str_buff[..div_pos.start].to_string());
                    let div_elem = LexedElement::new(
                        str_buff[div_pos.start..div_pos.end].to_string(),
                        sub_divider.clone(),
                    );
                    // `extend` moves the elements instead of cloning them.
                    elem_buff.extend(trimmed_elems);
                    elem_buff.push(div_elem);
                    str_buff = str_buff[div_pos.end..].to_string();
                } else {
                    let trimmed_elems = self._trim_match(str_buff);
                    elem_buff.extend(trimmed_elems);
                    break;
                }
            }
            elem_buff
        } else {
            vec![matched]
        }
    }
}
impl<SegmentArgs: Debug + Clone> Debug for StringLexer<SegmentArgs> {
    /// Debug-format as `StringLexer(<name>)`.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.write_fmt(format_args!("StringLexer({})", self.name))
    }
}
impl<SegmentArgs: Clone + Debug> Display for StringLexer<SegmentArgs> {
    /// Display as `StringLexer(<template>)` — note this shows the template,
    /// not the name.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.write_fmt(format_args!("StringLexer({})", self.template))
    }
}
impl<SegmentArgs: Clone + Debug> Matcher for StringLexer<SegmentArgs> {
    fn get_name(&self) -> String {
        self.template.to_string()
    }
    /// Match this lexer against the start of `forward_string`.
    ///
    /// # Errors
    /// Returns a `ValueError` if handed an empty string.
    fn match_(&self, forward_string: String) -> Result<LexMatch, ValueError> {
        // `is_empty` instead of `len() == 0` (idiomatic, clippy-clean).
        if forward_string.is_empty() {
            return Err(ValueError::new(String::from("Unexpected empty string!")));
        }
        match self._match(&forward_string) {
            Some(matched) => {
                let length = matched.raw.len();
                let new_elements = self._subdivide(matched);
                Ok(LexMatch {
                    forward_string: forward_string[length..].to_string(),
                    elements: new_elements,
                })
            }
            // `forward_string` is already owned — no extra copy needed.
            None => Ok(LexMatch { forward_string, elements: vec![] }),
        }
    }
    /// Find the first occurrence of the template anywhere in `forward_string`.
    fn search(&self, forward_string: &str) -> Option<Range<usize>> {
        // `Option::map` replaces the `is_some()`/`unwrap()` pair.
        forward_string
            .find(self.template)
            .map(|start| start..start + self.template.len())
    }
    fn get_sub_divider(&self) -> Option<Box<dyn Matcher>> {
        self.sub_divider.clone()
    }
    fn get_trim_post_subdivide(&self) -> Option<Box<dyn Matcher>> {
        self.trim_post_subdivide.clone()
    }
    /// Build a segment via the configured constructor function.
    fn construct_segment(&self, raw: String, pos_marker: PositionMarker) -> Box<dyn Segment> {
        (self.segment_constructor)(&raw, &pos_marker, self.segment_args.clone())
    }
}
/// A matcher that recognises a regular expression at the start of the input
/// and builds segments via `segment_constructor`.
#[derive(Clone)]
pub struct RegexLexer<SegmentArgs: 'static + Clone> {
    // Human-readable name for diagnostics.
    name: &'static str,
    // The compiled regex this lexer matches.
    template: Regex,
    // Function used to construct the resulting segment.
    segment_constructor: SegmentConstructorFn<SegmentArgs>,
    // Extra arguments forwarded to `segment_constructor`.
    segment_args: SegmentArgs,
    // Optional matcher used to split a match into smaller pieces.
    sub_divider: Option<Box<dyn Matcher>>,
    // Optional matcher used to trim subdivided pieces.
    trim_post_subdivide: Option<Box<dyn Matcher>>,
}
impl<SegmentArgs: Clone + Debug> RegexLexer<SegmentArgs> {
    /// Build a regex-based lexer.
    ///
    /// # Errors
    /// Fails if `regex` is not a valid pattern.
    pub fn new(
        name: &'static str,
        regex: &str,
        segment_constructor: SegmentConstructorFn<SegmentArgs>,
        segment_args: SegmentArgs,
        sub_divider: Option<Box<dyn Matcher>>,
        trim_post_subdivide: Option<Box<dyn Matcher>>,
    ) -> Result<Self, Error> {
        let template = Regex::new(regex)?;
        Ok(RegexLexer {
            name,
            template,
            segment_constructor,
            segment_args,
            sub_divider,
            trim_post_subdivide,
        })
    }
    /// Produce an element when the regex matches at position 0; any match
    /// starting later (or a regex error) yields `None`.
    pub fn _match(&self, forward_string: &str) -> Option<LexedElement> {
        match self.template.find(forward_string) {
            Ok(Some(found)) if found.start() == 0 => Some(LexedElement {
                raw: found.as_str().to_string(),
                matcher: Box::new(self.clone()),
            }),
            _ => None,
        }
    }
}
impl<SegmentArgs: Debug + Clone> Debug for RegexLexer<SegmentArgs> {
    /// Debug-format as `RegexLexer(<name>)`.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.write_fmt(format_args!("RegexLexer({})", self.name))
    }
}
impl<SegmentArgs: Clone + Debug> Display for RegexLexer<SegmentArgs> {
    /// Display as `RegexLexer(<pattern>)` — `get_name` returns the regex
    /// source, not the `name` field.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.write_fmt(format_args!("RegexLexer({})", self.get_name()))
    }
}
impl<SegmentArgs: Clone + Debug> Matcher for RegexLexer<SegmentArgs> {
    /// The regex source string serves as this matcher's name.
    fn get_name(&self) -> String {
        self.template.as_str().to_string()
    }
    /// Match the regex against the start of `forward_string`.
    ///
    /// # Errors
    /// Returns a `ValueError` if handed an empty string.
    fn match_(&self, forward_string: String) -> Result<LexMatch, ValueError> {
        // `is_empty` instead of `len() == 0` (idiomatic, clippy-clean).
        if forward_string.is_empty() {
            return Err(ValueError::new(String::from("Unexpected empty string!")));
        }
        match self._match(&forward_string) {
            Some(matched) => {
                let length = matched.raw.len();
                let new_elements = self._subdivide(matched);
                Ok(LexMatch {
                    forward_string: forward_string[length..].to_string(),
                    elements: new_elements,
                })
            }
            // `forward_string` is already owned — no extra copy needed.
            None => Ok(LexMatch { forward_string, elements: vec![] }),
        }
    }
    /// Find the first occurrence of the regex anywhere in `forward_string`.
    ///
    /// # Panics
    /// Panics on a zero-length match, which would otherwise make the lexer
    /// loop forever.
    fn search(&self, forward_string: &str) -> Option<Range<usize>> {
        if let Ok(Some(matched)) = self.template.find(forward_string) {
            if matched.as_str().is_empty() {
                panic!(
                    "Zero length Lex item returned from '{}'. Report this as a bug.",
                    self.get_name()
                );
            }
            return Some(matched.range());
        }
        None
    }
    fn get_sub_divider(&self) -> Option<Box<dyn Matcher>> {
        self.sub_divider.clone()
    }
    fn get_trim_post_subdivide(&self) -> Option<Box<dyn Matcher>> {
        self.trim_post_subdivide.clone()
    }
    /// Build a segment via the configured constructor function.
    fn construct_segment(&self, raw: String, pos_marker: PositionMarker) -> Box<dyn Segment> {
        (self.segment_constructor)(&raw, &pos_marker, self.segment_args.clone())
    }
}
/// The top-level lexer: drives the dialect's matchers over an input string
/// and falls back to `last_resort_lexer` for unlexable text.
pub struct Lexer {
    // Configuration (provides the dialect and its matchers).
    config: FluffConfig,
    // Fallback matcher producing `UnlexableSegment`s for unmatched text.
    last_resort_lexer: Box<dyn Matcher>,
}
/// Input to `Lexer::lex`: either a plain string or an already-templated file.
pub enum StringOrTemplate {
    String(String),
    Template(TemplatedFile),
}
impl Lexer {
pub fn new(config: FluffConfig, dialect: Option<Dialect>) -> Self {
let fluff_config = FluffConfig::from_kwargs(Some(config), dialect, None);
let last_resort_lexer = RegexLexer::new(
"last_resort",
"[^\t\n.]*",
&UnlexableSegment::new,
UnlexableSegmentNewArgs { expected: None },
None,
None,
)
.expect("Unable to create last resort lexer");
Lexer { config: fluff_config, last_resort_lexer: Box::new(last_resort_lexer) }
}
pub fn lex(
&self,
raw: StringOrTemplate,
) -> Result<(Vec<Box<dyn Segment>>, Vec<SQLLexError>), ValueError> {
let (mut str_buff, template) = match raw {
StringOrTemplate::String(s) => (s.clone(), TemplatedFile::from_string(s.to_string())),
StringOrTemplate::Template(f) => (f.to_string(), f),
};
let mut element_buffer: Vec<LexedElement> = Vec::new();
loop {
let res =
Lexer::lex_match(&str_buff, self.config.get_dialect().lexer_matchers()).unwrap();
element_buffer.extend(res.elements);
if !res.forward_string.is_empty() {
let resort_res = self.last_resort_lexer.match_(str_buff.to_string())?;
str_buff = resort_res.forward_string;
element_buffer.extend(resort_res.elements);
} else {
break;
}
}
let templated_buffer = Lexer::map_template_slices(element_buffer, template.clone());
let segments = self.elements_to_segments(templated_buffer, template);
Ok((segments, Vec::new()))
}
fn violations_from_segments<T: Debug + Clone>(segments: Vec<impl Segment>) -> Vec<SQLLexError> {
segments
.into_iter()
.filter(|s| s.is_type("unlexable"))
.map(|s| {
SQLLexError::new(
format!(
"Unable to lex characters: {}",
s.get_raw().unwrap().chars().take(10).collect::<String>()
),
s.get_position_marker().unwrap(),
)
})
.collect()
}
fn lex_match(
forward_string: &str,
lexer_matchers: &[Box<dyn Matcher>],
) -> Result<LexMatch, ValueError> {
let mut elem_buff: Vec<LexedElement> = vec![];
let mut forward_string = forward_string.to_string();
loop {
if forward_string.is_empty() {
return Ok(LexMatch {
forward_string: forward_string.to_string(),
elements: elem_buff,
});
};
let mut matched = false;
for matcher in lexer_matchers {
let res = matcher.match_(forward_string.to_string())?;
if !res.elements.is_empty() {
elem_buff.append(res.elements.clone().as_mut());
forward_string = res.forward_string;
matched = true;
break;
}
}
if !matched {
return Ok(LexMatch {
forward_string: forward_string.to_string(),
elements: elem_buff,
});
}
}
}
fn map_template_slices(
elements: Vec<LexedElement>,
template: TemplatedFile,
) -> Vec<TemplateElement> {
let mut idx = 0;
let mut templated_buff: Vec<TemplateElement> = vec![];
for element in elements {
let template_slice = offset_slice(idx, element.raw.len());
idx += element.raw.len();
templated_buff
.push(TemplateElement::from_element(element.clone(), template_slice.clone()));
let templated_string = template.get_templated_string().unwrap();
if templated_string[template_slice.clone()] != element.raw {
panic!(
"Template and lexed elements do not match. This should never happen {:?} != \
{:?}",
element.raw, &templated_string[template_slice]
);
}
}
return templated_buff;
}
fn elements_to_segments(
&self,
elements: Vec<TemplateElement>,
templated_file: TemplatedFile,
) -> Vec<Box<dyn Segment>> {
let mut segments = iter_segments(elements, templated_file.clone());
let position_maker = segments
.last()
.map(|segment| segment.get_position_marker().unwrap())
.unwrap_or_else(|| PositionMarker::from_point(0, 0, templated_file, None, None));
segments.push(EndOfFile::new(position_maker));
segments
}
}
/// Walk the templated-file slices alongside the lexed elements, producing a
/// positioned segment for each element.
///
/// NOTE(review): "templated" and "block_start" slices are unimplemented, so
/// this currently only handles purely literal files — confirm before use with
/// real templating.
fn iter_segments(
    lexed_elements: Vec<TemplateElement>,
    templated_file: TemplatedFile,
) -> Vec<Box<dyn Segment>> {
    let mut result = Vec::new();
    // NOTE(review): this outer index is immutable and never advanced, so the
    // inner loop rescans the slices from the start for every element; the
    // `tfs_idx += 1` below only touches the shadowed loop variable. Looks
    // like it should be `mut` and updated — TODO confirm against upstream.
    let tfs_idx = 0;
    let templated_file_slices = templated_file.clone().sliced_file;
    for (_idx, element) in lexed_elements.into_iter().enumerate() {
        // NOTE(review): never incremented; always zero within this body.
        let consumed_element_length = 0;
        let mut stashed_source_idx = None;
        for (mut tfs_idx, tfs) in templated_file_slices
            .iter()
            .skip(tfs_idx)
            .enumerate()
            .map(|(i, tfs)| (i + tfs_idx, tfs))
        {
            // Zero-length templated slices carry no text; delegate and skip.
            if is_zero_slice(tfs.templated_slice.clone()) {
                // Peek at the next slice (unused for now — `_slice`).
                let _slice = if tfs_idx + 1 < templated_file_slices.len() {
                    templated_file_slices[tfs_idx + 1].clone().into()
                } else {
                    None
                };
                _handle_zero_length_slice();
                continue;
            }
            if tfs.slice_type == "literal" {
                // Offset translating templated coordinates to source ones.
                let tfs_offset = tfs.source_slice.start - tfs.templated_slice.start;
                if element.template_slice.end <= tfs.templated_slice.end {
                    // Element fits entirely inside this literal slice: emit it.
                    let slice_start = stashed_source_idx.unwrap_or_else(|| {
                        element.template_slice.start + consumed_element_length + tfs_offset
                    });
                    result.push(element.to_segment(
                        PositionMarker::new(
                            slice_start..element.template_slice.end + tfs_offset,
                            element.template_slice.clone(),
                            templated_file.clone(),
                            None,
                            None,
                        ),
                        Some(consumed_element_length..element.raw.len()),
                    ));
                    if element.template_slice.end == tfs.templated_slice.end {
                        tfs_idx += 1
                    }
                    break;
                } else if element.template_slice.start == tfs.templated_slice.end {
                    // Element begins exactly where this slice ends; move on.
                    continue;
                } else {
                    // Element straddles the end of this slice.
                    if element.matcher.get_name() == "whitespace" {
                        if stashed_source_idx.is_some() {
                            panic!("Found literal whitespace with stashed idx!")
                        }
                        // Emit only the portion inside this slice.
                        let incremental_length =
                            tfs.templated_slice.end - element.template_slice.start;
                        result.push(element.to_segment(
                            PositionMarker::new(
                                element.template_slice.start + consumed_element_length + tfs_offset
                                    ..tfs.templated_slice.end + tfs_offset,
                                element.template_slice.clone(),
                                templated_file.clone(),
                                None,
                                None,
                            ),
                            offset_slice(consumed_element_length, incremental_length).into(),
                        ));
                    } else {
                        // Stash the source start and keep scanning slices.
                        // NOTE(review): adding `tfs_idx` to a template offset
                        // mixes an index with a byte position — TODO confirm.
                        if stashed_source_idx.is_none() {
                            stashed_source_idx = (element.template_slice.start + tfs_idx).into();
                            continue;
                        }
                    }
                }
            } else if matches!(tfs.slice_type.as_str(), "templated" | "block_start") {
                unimplemented!();
            }
        }
    }
    result
}
/// Placeholder for handling zero-length template slices; currently a no-op.
fn _handle_zero_length_slice() {
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::parser::segments::base::{
        CodeSegment, CodeSegmentNewArgs, NewlineSegment, NewlineSegmentNewArgs,
    };
    // Assert that `matcher` consumes exactly `match_string` from the front of
    // `in_string` (or matches nothing when `match_string` is None).
    fn assert_matches(in_string: &str, matcher: &impl Matcher, match_string: Option<&str>) {
        let res = matcher.match_(in_string.to_string()).unwrap();
        if let Some(match_string) = match_string {
            assert_eq!(res.forward_string, in_string[match_string.len()..]);
            assert_eq!(res.elements.len(), 1);
            assert_eq!(res.elements[0].raw, match_string);
        } else {
            assert_eq!(res.forward_string, in_string);
            assert_eq!(res.elements.len(), 0);
        }
    }
    // A matcher with both a sub-divider (";") and a trim matcher (newlines)
    // should split ";\n/" into three separate elements.
    #[test]
    fn test__parser__lexer_trim_post_subdivide() {
        let matcher: Vec<Box<dyn Matcher>> = vec![Box::new(
            RegexLexer::new(
                "function_script_terminator",
                r";\s+(?!\*)\/(?!\*)|\s+(?!\*)\/(?!\*)",
                &CodeSegment::new,
                CodeSegmentNewArgs {
                    code_type: "function_script_terminator",
                    instance_types: vec![],
                    trim_start: None,
                    trim_chars: None,
                    source_fixes: None,
                },
                Some(Box::new(StringLexer::new(
                    "semicolon",
                    ";",
                    &CodeSegment::new,
                    CodeSegmentNewArgs {
                        code_type: "semicolon",
                        instance_types: vec![],
                        trim_start: None,
                        trim_chars: None,
                        source_fixes: None,
                    },
                    None,
                    None,
                ))),
                Some(Box::new(
                    RegexLexer::new(
                        "newline",
                        r"(\n|\r\n)+",
                        &NewlineSegment::new,
                        NewlineSegmentNewArgs {},
                        None,
                        None,
                    )
                    .unwrap(),
                )),
            )
            .unwrap(),
        )];
        let res = Lexer::lex_match(";\n/\n", &matcher).unwrap();
        assert_eq!(res.elements[0].raw, ";");
        assert_eq!(res.elements[1].raw, "\n");
        assert_eq!(res.elements[2].raw, "/");
        assert_eq!(res.elements.len(), 3);
    }
    // Table-driven check that RegexLexer matches the expected prefix for a
    // variety of patterns and inputs.
    #[test]
    fn test__parser__lexer_regex() {
        let tests = &[
            ("fsaljk", "f", "f"),
            ("fsaljk", r"f", "f"),
            ("fsaljk", r"[fas]*", "fsa"),
            (" \t fsaljk", r"[^\S\r\n]*", " \t "),
            (" \t \n fsaljk", r"[^\S\r\n]*", " \t "),
            ("'something boring' \t \n fsaljk", r"'[^']*'", "'something boring'"),
            (
                "' something exciting \t\n ' \t \n fsaljk",
                r"'[^']*'",
                "' something exciting \t\n '",
            ),
        ];
        for (raw, reg, res) in tests {
            let matcher = RegexLexer::new(
                "test",
                reg,
                &CodeSegment::new,
                CodeSegmentNewArgs {
                    code_type: "",
                    instance_types: vec![],
                    trim_start: None,
                    trim_chars: None,
                    source_fixes: None,
                },
                None,
                None,
            )
            .unwrap();
            assert_matches(raw, &matcher, Some(res));
        }
    }
    // A literal StringLexer matches only at the very start of the input.
    #[test]
    fn test__parser__lexer_string() {
        let matcher = StringLexer::new(
            "dot",
            ".",
            &CodeSegment::new,
            CodeSegmentNewArgs {
                code_type: "dot",
                instance_types: vec![],
                trim_start: None,
                trim_chars: None,
                source_fixes: None,
            },
            None,
            None,
        );
        assert_matches(".fsaljk", &matcher, Some("."));
        assert_matches("fsaljk", &matcher, None);
    }
    // lex_match tries matchers in order and stops once nothing can consume
    // the remaining text (here the trailing unpaired "#").
    #[test]
    fn test__parser__lexer_lex_match() {
        let matchers: Vec<Box<dyn Matcher>> = vec![
            Box::new(StringLexer::new(
                "dot",
                ".",
                &CodeSegment::new,
                CodeSegmentNewArgs {
                    code_type: "",
                    instance_types: vec![],
                    trim_start: None,
                    trim_chars: None,
                    source_fixes: None,
                },
                None,
                None,
            )),
            Box::new(
                RegexLexer::new(
                    "test",
                    r"#[^#]*#",
                    &CodeSegment::new,
                    CodeSegmentNewArgs {
                        code_type: "",
                        instance_types: vec![],
                        trim_start: None,
                        trim_chars: None,
                        source_fixes: None,
                    },
                    None,
                    None,
                )
                .unwrap(),
            ),
        ];
        let res = Lexer::lex_match("..#..#..#", &matchers).unwrap();
        assert_eq!(res.forward_string, "#");
        assert_eq!(res.elements.len(), 5);
        assert_eq!(res.elements[2].raw, "#..#");
    }
}