use std::sync::Arc;
use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
use dashmap::DashMap;
use grep_searcher::LineStep;
use log::Level::Trace;
use once_cell::sync::Lazy;
use regex::bytes::Regex;
use crate::{stats::CodeStats, utils::ext::SliceExt, Config, LanguageType};
/// Lexical state kept while scanning source text of a single language:
/// which string literal (if any) is open and which multi-line comments are
/// currently open.
#[derive(Clone, Debug)]
pub(crate) struct SyntaxCounter {
/// Syntax tables and matchers for the language, obtained from `SharedMatchers::new`.
pub(crate) shared: Arc<SharedMatchers>,
/// Closing delimiter of the string literal currently open; `None` when no literal is open.
pub(crate) quote: Option<&'static str>,
/// Set by `parse_quote`: `true` when the open literal was started by a doc-quote delimiter.
pub(crate) quote_is_doc_quote: bool,
/// Closing delimiters of the multi-line comments currently open, innermost last.
pub(crate) stack: Vec<&'static str>,
/// Set by `parse_quote`: `true` when the open literal was started by a verbatim-string
/// delimiter. `parse_end_of_quote` does not treat backslashes as escapes while this is set.
pub(crate) quote_is_verbatim: bool,
}
/// Result of analysing a region written in an embedded ("child") language,
/// as produced by `SyntaxCounter::parse_context`.
#[derive(Clone, Debug)]
pub(crate) struct FileContext {
/// Which kind of embedding was found, and the embedded language where applicable.
pub(crate) language: LanguageContext,
/// Statistics computed for the embedded region.
pub(crate) stats: CodeStats,
/// Byte offset into the scanned buffer recorded by `parse_context` as the end of the region.
pub(crate) end: usize,
}
impl FileContext {
pub fn new(language: LanguageContext, end: usize, stats: CodeStats) -> Self {
Self {
language,
stats,
end,
}
}
}
/// The kind of embedded-language region recognised by `SyntaxCounter::parse_context`.
#[derive(Clone, Debug)]
pub(crate) enum LanguageContext {
/// Contents of a `<script>`, `<style>` or `<template>` element.
Html {
/// Language the element's contents were parsed as.
language: LanguageType,
},
/// A fenced code block introduced by three backticks and a language identifier.
Markdown {
/// `true` when a closing fence was found for the block.
balanced: bool,
/// Language named on the opening fence.
language: LanguageType,
},
/// A run of `///` or `//!` comment lines whose text was parsed as Markdown.
Rust,
}
/// Per-language syntax tables and matchers. Built once per `LanguageType` by
/// `SharedMatchers::init` and shared through an `Arc`.
///
/// Every table is copied from the `LanguageType` accessor named in its comment.
#[derive(Clone, Debug)]
pub(crate) struct SharedMatchers {
/// The language these tables describe.
pub language: LanguageType,
/// From `allows_nested()`. When set, `parse_multi_line_comment` pushes a new
/// comment even while another one is already open.
pub allows_nested: bool,
/// From `doc_quotes()`: (opening, closing) delimiter pairs of doc strings.
pub doc_quotes: &'static [(&'static str, &'static str)],
/// Automaton built from `important_syntax()`. A line it does not match can
/// be classified without multi-line analysis.
pub important_syntax: AhoCorasick<u16>,
/// From `any_comments()`.
pub any_comments: &'static [&'static str],
/// From `is_fortran()`.
pub is_fortran: bool,
/// From `is_literate()`. When set, every skippable non-blank line is
/// counted as a comment by `can_perform_single_line_analysis`.
pub is_literate: bool,
/// From `line_comments()`: prefixes that start a line comment.
pub line_comments: &'static [&'static str],
/// From `any_multi_line_comments()`: (opening, closing) pairs consulted by `line_is_comment`.
pub any_multi_line_comments: &'static [(&'static str, &'static str)],
/// From `multi_line_comments()`: (opening, closing) pairs of multi-line comments.
pub multi_line_comments: &'static [(&'static str, &'static str)],
/// From `nested_comments()`: (opening, closing) pairs that may always nest.
pub nested_comments: &'static [(&'static str, &'static str)],
/// From `quotes()`: (opening, closing) pairs of ordinary string literals.
pub string_literals: &'static [(&'static str, &'static str)],
/// From `verbatim_quotes()`: (opening, closing) pairs of verbatim string literals.
pub verbatim_string_literals: &'static [(&'static str, &'static str)],
}
impl SharedMatchers {
    /// Returns the shared matcher set for `language`.
    ///
    /// The set is built with [`SharedMatchers::init`] the first time a
    /// language is requested and cached in a process-wide map; later calls
    /// hand out another `Arc` to the cached value.
    pub fn new(language: LanguageType) -> Arc<Self> {
        static MATCHERS: Lazy<DashMap<LanguageType, Arc<SharedMatchers>>> = Lazy::new(DashMap::new);

        Arc::clone(
            MATCHERS
                .entry(language)
                .or_insert_with(|| Arc::new(Self::init(language)))
                .value(),
        )
    }

    /// Builds a fresh, uncached matcher set for `language` by copying its
    /// syntax tables and compiling the "important syntax" automaton.
    ///
    /// # Panics
    ///
    /// Panics if the automaton cannot be built from the language's patterns.
    pub fn init(language: LanguageType) -> Self {
        /// Compiles `pattern` into a DFA-backed automaton with `u16` state ids.
        fn init_corasick(pattern: &[&'static str], anchored: bool) -> AhoCorasick<u16> {
            AhoCorasickBuilder::new()
                .anchored(anchored)
                .byte_classes(false)
                .dfa(true)
                .prefilter(true)
                .build_with_size(pattern)
                .unwrap()
        }

        Self {
            language,
            allows_nested: language.allows_nested(),
            doc_quotes: language.doc_quotes(),
            important_syntax: init_corasick(language.important_syntax(), false),
            any_comments: language.any_comments(),
            is_fortran: language.is_fortran(),
            is_literate: language.is_literate(),
            line_comments: language.line_comments(),
            any_multi_line_comments: language.any_multi_line_comments(),
            multi_line_comments: language.multi_line_comments(),
            nested_comments: language.nested_comments(),
            string_literals: language.quotes(),
            verbatim_string_literals: language.verbatim_quotes(),
        }
    }
}
/// Outcome of `SyntaxCounter::perform_multi_line_analysis`.
#[derive(Debug)]
pub(crate) enum AnalysisReport {
/// The scan finished within the current language. The flag is the final
/// value of the `ended_with_comments` tracker.
Normal(bool),
/// An embedded-language region was found; the scan stopped and returned its context.
ChildLanguage(FileContext),
}
impl SyntaxCounter {
/// Creates a counter for `language` in plain mode: no string literal open
/// and no multi-line comment on the stack.
pub(crate) fn new(language: LanguageType) -> Self {
    let shared = SharedMatchers::new(language);

    Self {
        shared,
        quote: None,
        quote_is_doc_quote: false,
        stack: Vec::with_capacity(1),
        quote_is_verbatim: false,
    }
}
/// Returns `true` when the counter is neither inside a string literal nor
/// inside a multi-line comment.
pub(crate) fn is_plain_mode(&self) -> bool {
    !(self._is_string_mode() || self._is_comment_mode())
}
/// Returns `true` while a string literal is open, i.e. `quote` holds its
/// closing delimiter.
pub(crate) fn _is_string_mode(&self) -> bool {
self.quote.is_some()
}
/// Returns `true` while at least one multi-line comment is open, i.e. the
/// delimiter stack is non-empty.
pub(crate) fn _is_comment_mode(&self) -> bool {
!self.stack.is_empty()
}
/// Reports whether `window` begins with one of the language's line-comment
/// prefixes.
///
/// Always returns `false` while a string literal or a multi-line comment is
/// open, since a comment prefix has no meaning there.
#[inline]
pub(crate) fn parse_line_comment(&self, window: &[u8]) -> bool {
    let inside_string_or_comment = self.quote.is_some() || !self.stack.is_empty();
    if inside_string_or_comment {
        return false;
    }

    let matched_prefix = self
        .shared
        .line_comments
        .iter()
        .find(|prefix| window.starts_with(prefix.as_bytes()));

    match matched_prefix {
        Some(prefix) => {
            trace!("Start {:?}", prefix);
            true
        }
        None => false,
    }
}
/// Tries to classify `line` without running the multi-line state machine.
///
/// Returns `true` and bumps exactly one counter in `stats` when the counter
/// is in plain mode and the line is either blank or contains none of the
/// language's "important syntax". In the latter case the line counts as a
/// comment if the language is literate or the line starts with a
/// line-comment prefix, and as code otherwise.
///
/// Returns `false`, leaving `stats` untouched, when the line needs full
/// analysis.
pub(crate) fn can_perform_single_line_analysis(
    &self,
    line: &[u8],
    stats: &mut crate::stats::CodeStats,
) -> bool {
    if !self.is_plain_mode() {
        return false;
    }

    if line.trim().is_empty() {
        stats.blanks += 1;
        trace!("Blank No.{}", stats.blanks);
        return true;
    }

    if self.shared.important_syntax.is_match(line) {
        return false;
    }

    trace!("^ Skippable");

    let counts_as_comment = self.shared.is_literate
        || self
            .shared
            .line_comments
            .iter()
            .any(|prefix| line.starts_with(prefix.as_bytes()));

    if counts_as_comment {
        stats.comments += 1;
        trace!("Comment No.{}", stats.comments);
    } else {
        stats.code += 1;
        trace!("Code No.{}", stats.code);
    }

    true
}
/// Walks `lines[start..end]` one byte offset at a time, updating the
/// string/comment state as delimiters are met.
///
/// At each offset the checks run in this order: end of the open string or
/// multi-line comment, embedded-language region, start of a string or
/// multi-line comment, start of a line comment. The scan stops early when
/// the remaining input is blank, when a line comment starts, or when an
/// embedded region is found.
///
/// Returns [`AnalysisReport::ChildLanguage`] for an embedded region;
/// otherwise [`AnalysisReport::Normal`] carrying whether the last thing
/// seen was the end of a string/comment or the start of a line comment.
pub(crate) fn perform_multi_line_analysis(
    &mut self,
    lines: &[u8],
    start: usize,
    end: usize,
    config: &Config,
) -> AnalysisReport {
    let mut ended_with_comments = false;
    // Offsets still to step over because they belong to a delimiter that
    // has already been consumed.
    let mut pending = 0;

    for offset in start..end {
        if pending != 0 {
            pending -= 1;
            continue;
        }

        let window = &lines[offset..];
        if window.trim().is_empty() {
            break;
        }

        ended_with_comments = false;

        let closed = match self.parse_end_of_quote(window) {
            Some(len) => Some(len),
            None => self.parse_end_of_multi_line(window),
        };
        match closed {
            Some(len) => {
                ended_with_comments = true;
                // The current offset is the delimiter's first byte.
                pending = len - 1;
                continue;
            }
            // Still inside a string literal: nothing else can start here.
            None if self.quote.is_some() => continue,
            None => {}
        }

        if let Some(child) = self.parse_context(lines, offset, end, config) {
            return AnalysisReport::ChildLanguage(child);
        }

        let opened = match self.parse_quote(window) {
            Some(len) => Some(len),
            None => self.parse_multi_line_comment(window),
        };
        if let Some(len) = opened {
            pending = len - 1;
            continue;
        }

        if self.parse_line_comment(window) {
            ended_with_comments = true;
            break;
        }
    }

    AnalysisReport::Normal(ended_with_comments)
}
/// Decides whether `line` should be counted as a comment, given the state
/// the counter is in after the line has been analysed.
///
/// * While a string literal is open, the line is a comment only when that
///   literal is a doc string and `config.treat_doc_strings_as_comments` is
///   `Some(true)`.
/// * Otherwise the line is a comment when the analysis started inside a
///   comment, when the trimmed line starts with a line-comment prefix or is
///   wrapped in one multi-line comment pair, or when the trimmed line starts
///   with the opener of the comment that is still open.
///
/// `_ended_with_comments` is accepted for interface compatibility and not
/// consulted.
pub(crate) fn line_is_comment(
    &self,
    line: &[u8],
    config: &crate::Config,
    _ended_with_comments: bool,
    started_in_comments: bool,
) -> bool {
    if self.quote.is_some() {
        // Inside a literal, only doc strings can count, and only on request.
        return self.quote_is_doc_quote && config.treat_doc_strings_as_comments == Some(true);
    }

    let closes_doc_quote = self
        .shared
        .doc_quotes
        .iter()
        .any(|(_, close)| line.contains_slice(close.as_bytes()));
    if closes_doc_quote && started_in_comments {
        return true;
    }

    let trimmed = line.trim();

    let has_line_comment_prefix = self
        .shared
        .line_comments
        .iter()
        .any(|prefix| trimmed.starts_with(prefix.as_bytes()));
    if has_line_comment_prefix {
        return true;
    }

    let wrapped_in_multi_line = self
        .shared
        .any_multi_line_comments
        .iter()
        .any(|(open, close)| {
            trimmed.starts_with(open.as_bytes()) && trimmed.ends_with(close.as_bytes())
        });
    if wrapped_in_multi_line || started_in_comments {
        return true;
    }

    // Last resort: the line opens the comment that is still on the stack.
    self.stack.last().map_or(false, |innermost| {
        self.shared
            .any_multi_line_comments
            .iter()
            .any(|(open, close)| close == innermost && trimmed.starts_with(open.as_bytes()))
    })
}
/// Looks for a region written in an embedded language beginning at byte
/// offset `start` of `lines`, parses it, and returns its statistics.
///
/// `start..end` bounds the text in which the *opening* construct is searched;
/// closing constructs are searched from the opening up to the end of `lines`.
///
/// Returns `None` when a string literal or multi-line comment is open, when
/// the language has no embedded contexts, or when no complete region is
/// recognised at `start`. Three families of host language are handled:
///
/// * Markdown: fenced code blocks with a language identifier.
/// * Rust: runs of `///` or `//!` lines, parsed as Markdown.
/// * HTML-like (HTML, RubyHtml, Svelte, Vue): `<script>`, `<style>` and
///   `<template>` elements.
#[inline]
pub(crate) fn parse_context(
&mut self,
lines: &[u8],
start: usize,
end: usize,
config: &Config,
) -> Option<FileContext> {
use std::str::FromStr;
// Embedded regions are only recognised in plain mode.
if self.quote.is_some() || !self.stack.is_empty() {
return None;
}
match self.shared.language {
LanguageType::Markdown | LanguageType::UnrealDeveloperMarkdown => {
static STARTING_MARKDOWN_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r#"^```\S+\s"#).unwrap());
static ENDING_MARKDOWN_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r#"```\s?"#).unwrap());
// Cheap substring test before running the regexes.
if !lines[start..end].contains_slice(b"```") {
return None;
}
// The opening fence must sit exactly at `start` (the regex is anchored).
let opening_fence = STARTING_MARKDOWN_REGEX.find(&lines[start..end])?;
let start_of_code = start + opening_fence.end();
// The closing fence is searched through the rest of the buffer, not just up to `end`.
let closing_fence = ENDING_MARKDOWN_REGEX.find(&lines[start_of_code..]);
if let Some(m) = &closing_fence {
trace!("{:?}", String::from_utf8_lossy(m.as_bytes()))
}
// Without a closing fence the block extends to the end of the buffer.
let end_of_code = closing_fence
.map(|fence| start_of_code + fence.start())
.unwrap_or_else(|| lines.len());
// Offset just past the closing fence; reported as the end of the region.
let end_of_code_block = closing_fence
.map(|fence| start_of_code + fence.end())
.unwrap_or_else(|| lines.len());
let balanced = closing_fence.is_some();
// Text after the three backticks, with surrounding whitespace removed.
let identifier = &opening_fence.as_bytes().trim()[3..];
// The identifier may be a comma-separated list; use the first entry
// that names a known language, and give up if none does.
let language = identifier
.split(|&b| b == b',')
.filter_map(|s| LanguageType::from_str(&String::from_utf8_lossy(s)).ok())
.next()?;
trace!(
"{} BLOCK: {:?}",
language,
String::from_utf8_lossy(&lines[start_of_code..end_of_code])
);
let stats =
language.parse_from_slice(&lines[start_of_code..end_of_code].trim(), config);
Some(FileContext::new(
LanguageContext::Markdown { balanced, language },
end_of_code_block,
stats,
))
}
LanguageType::Rust => {
let rest = &lines[start..];
// Only doc comments (`///` outer, `//!` inner) open a Markdown region.
let comment_syntax = if rest.trim_start().starts_with(b"///") {
b"///"
} else if rest.trim_start().starts_with(b"//!") {
b"//!"
} else {
return None;
};
let mut stepper = LineStep::new(b'\n', start, lines.len());
let mut markdown = Vec::new();
let mut end_of_block = lines.len();
// Collect consecutive lines that begin with the same doc-comment
// prefix. The block ends at the start of the first line that does
// not, or at the end of the last matching line if input runs out.
while let Some((start, end)) = stepper.next(lines) {
if lines[start..].trim().starts_with(comment_syntax) {
trace!("{}", String::from_utf8_lossy(&lines[start..end]));
let line = lines[start..end].trim_start();
// Drop the three-byte comment prefix; `min` guards short lines.
let stripped_line = &line[3.min(line.len())..];
markdown.extend_from_slice(stripped_line);
end_of_block = end;
} else {
end_of_block = start;
break;
}
}
trace!("Markdown found: {:?}", String::from_utf8_lossy(&markdown));
let doc_block = LanguageType::Markdown.parse_from_slice(markdown.trim(), config);
Some(FileContext::new(
LanguageContext::Rust,
end_of_block,
doc_block,
))
}
#[allow(clippy::trivial_regex)]
LanguageType::Html
| LanguageType::RubyHtml
| LanguageType::Svelte
| LanguageType::Vue => {
// Opening-tag regexes are anchored at `start`; capture group 1 is
// the optional `type` / `lang` attribute value.
static START_SCRIPT: Lazy<Regex> =
Lazy::new(|| Regex::new(r#"^<script(?:.*type="(.*)")?.*?>"#).unwrap());
static END_SCRIPT: Lazy<Regex> = Lazy::new(|| Regex::new(r#"</script>"#).unwrap());
static START_STYLE: Lazy<Regex> =
Lazy::new(|| Regex::new(r#"^<style(?:.*lang="(.*)")?.*?>"#).unwrap());
static END_STYLE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"</style>"#).unwrap());
static START_TEMPLATE: Lazy<Regex> =
Lazy::new(|| Regex::new(r#"^<template(?:.*lang="(.*)")?.*?>"#).unwrap());
static END_TEMPLATE: Lazy<Regex> =
Lazy::new(|| Regex::new(r#"</template>"#).unwrap());
// The three branches below share one shape: locate the closing tag
// (bail out with `None` if it is missing), pick the language from
// the attribute or a per-element default, skip elements whose
// content is only whitespace, and report the offset of the closing
// tag as the end of the region.
if let Some(captures) = START_SCRIPT.captures(&lines[start..end]) {
let start_of_code = start + captures.get(0).unwrap().end();
let closing_tag = END_SCRIPT.find(&lines[start_of_code..])?;
let end_of_code = start_of_code + closing_tag.start();
// `type` is interpreted as a MIME type; default is JavaScript.
let language = captures
.get(1)
.and_then(|m| {
LanguageType::from_mime(&String::from_utf8_lossy(m.as_bytes().trim()))
})
.unwrap_or(LanguageType::JavaScript);
let script_contents = &lines[start_of_code..end_of_code];
if script_contents.trim().is_empty() {
return None;
}
let stats = language.parse_from_slice(
script_contents.trim_first_and_last_line_of_whitespace(),
config,
);
Some(FileContext::new(
LanguageContext::Html { language },
end_of_code,
stats,
))
} else if let Some(captures) = START_STYLE.captures(&lines[start..end]) {
let start_of_code = start + captures.get(0).unwrap().end();
let closing_tag = END_STYLE.find(&lines[start_of_code..])?;
let end_of_code = start_of_code + closing_tag.start();
// `lang` is lower-cased and looked up by name; default is CSS.
let language = captures
.get(1)
.and_then(|m| {
LanguageType::from_str(
&String::from_utf8_lossy(m.as_bytes().trim()).to_lowercase(),
)
.ok()
})
.unwrap_or(LanguageType::Css);
let style_contents = &lines[start_of_code..end_of_code];
if style_contents.trim().is_empty() {
return None;
}
let stats = language.parse_from_slice(
style_contents.trim_first_and_last_line_of_whitespace(),
config,
);
Some(FileContext::new(
LanguageContext::Html { language },
end_of_code,
stats,
))
} else if let Some(captures) = START_TEMPLATE.captures(&lines[start..end]) {
let start_of_code = start + captures.get(0).unwrap().end();
let closing_tag = END_TEMPLATE.find(&lines[start_of_code..])?;
let end_of_code = start_of_code + closing_tag.start();
// `lang` is lower-cased and looked up by name; default is HTML.
let language = captures
.get(1)
.and_then(|m| {
LanguageType::from_str(
&String::from_utf8_lossy(m.as_bytes().trim()).to_lowercase(),
)
.ok()
})
.unwrap_or(LanguageType::Html);
let template_contents = &lines[start_of_code..end_of_code];
if template_contents.trim().is_empty() {
return None;
}
let stats = language.parse_from_slice(
template_contents.trim_first_and_last_line_of_whitespace(),
config,
);
Some(FileContext::new(
LanguageContext::Html { language },
end_of_code,
stats,
))
} else {
None
}
}
// No embedded contexts are defined for any other language.
_ => None,
}
}
/// Checks whether a string literal opens at the start of `window` and, if
/// so, enters string mode.
///
/// Delimiter tables are tried in priority order: doc quotes, verbatim
/// string literals, ordinary string literals. On a match the closing
/// delimiter is stored in `self.quote`, the doc/verbatim flags are updated,
/// and the length of the opening delimiter is returned so the caller can
/// skip it.
///
/// Returns `None` while inside a multi-line comment or when nothing matches.
#[inline]
pub(crate) fn parse_quote(&mut self, window: &[u8]) -> Option<usize> {
    if !self.stack.is_empty() {
        return None;
    }

    // First (open, close) pair of `table` whose opener prefixes `window`.
    let opening = |table: &'static [(&'static str, &'static str)]| {
        table
            .iter()
            .copied()
            .find(|(open, _)| window.starts_with(open.as_bytes()))
    };

    let (open, close, is_doc, is_verbatim) =
        if let Some((open, close)) = opening(self.shared.doc_quotes) {
            trace!("Start Doc {:?}", open);
            (open, close, true, false)
        } else if let Some((open, close)) = opening(self.shared.verbatim_string_literals) {
            trace!("Start verbatim {:?}", open);
            (open, close, false, true)
        } else if let Some((open, close)) = opening(self.shared.string_literals) {
            trace!("Start {:?}", open);
            (open, close, false, false)
        } else {
            return None;
        };

    self.quote = Some(close);
    self.quote_is_verbatim = is_verbatim;
    self.quote_is_doc_quote = is_doc;
    Some(open.len())
}
/// Checks whether the open string literal ends at the start of `window`, or
/// whether `window` starts with an escape sequence that must be stepped over.
///
/// Returns the number of bytes the caller should skip:
///
/// * the length of the closing delimiter when the open literal ends here
///   (string mode is left);
/// * `2` when `window` starts with a backslash followed by another backslash
///   or by a string-opening delimiter, unless the current literal is
///   verbatim.
///
/// Returns `None` otherwise.
///
/// The escaped-backslash case matters: without it, in `"a\\"` the second
/// backslash would be paired with the closing quote, the quote would be taken
/// for an escaped one, and the literal would never be closed.
#[inline]
pub(crate) fn parse_end_of_quote(&mut self, window: &[u8]) -> Option<usize> {
    if let Some(quote) = self.quote {
        if window.starts_with(quote.as_bytes()) {
            self.quote = None;
            trace!("End {:?}", quote);
            return Some(quote.len());
        }
    }

    // Backslash escapes are only meaningful outside verbatim literals.
    if self.quote_is_verbatim || !window.starts_with(br"\") {
        return None;
    }

    // `window` is non-empty here, so slicing off the backslash is safe.
    let escaped = &window[1..];
    let escapes_backslash = escaped.starts_with(br"\");
    let escapes_quote = self
        .shared
        .string_literals
        .iter()
        .any(|(open, _)| escaped.starts_with(open.as_bytes()));

    if escapes_backslash || escapes_quote {
        // Skip the backslash together with the byte it escapes.
        Some(2)
    } else {
        None
    }
}
/// Checks whether a multi-line comment opens at the start of `window`.
///
/// Ordinary multi-line pairs are tried before nested-comment pairs. When an
/// opener matches, its closing delimiter is pushed on the stack if no
/// comment is open yet, if the language allows nesting, or if the pair is
/// one of the language's nested-comment pairs. The opener's length is
/// returned in every matching case, pushed or not, so the caller can skip it.
///
/// Returns `None` while inside a string literal or when nothing matches.
#[inline]
pub(crate) fn parse_multi_line_comment(&mut self, window: &[u8]) -> Option<usize> {
    if self.quote.is_some() {
        return None;
    }

    let (open, close) = self
        .shared
        .multi_line_comments
        .iter()
        .chain(self.shared.nested_comments)
        .copied()
        .find(|(open, _)| window.starts_with(open.as_bytes()))?;

    let may_open = self.stack.is_empty()
        || self.shared.allows_nested
        || self.shared.nested_comments.contains(&(open, close));

    if may_open {
        self.stack.push(close);

        if log_enabled!(Trace) && self.shared.allows_nested {
            trace!("Start nested {:?}", open);
        } else {
            trace!("Start {:?}", open);
        }
    }

    Some(open.len())
}
/// Checks whether the innermost open multi-line comment ends at the start
/// of `window`.
///
/// On a match the delimiter is popped from the stack and its length is
/// returned so the caller can skip it. Returns `None` when no comment is
/// open or `window` does not start with the innermost closing delimiter.
#[inline]
pub(crate) fn parse_end_of_multi_line(&mut self, window: &[u8]) -> Option<usize> {
    match self.stack.last().copied() {
        Some(close) if window.starts_with(close.as_bytes()) => {
            self.stack.pop();

            if log_enabled!(Trace) {
                if self.stack.is_empty() {
                    trace!("End {:?}", close);
                } else {
                    trace!("End {:?}. Still in comments.", close);
                }
            }

            Some(close.len())
        }
        _ => None,
    }
}
}