use std::sync::OnceLock;
use regex::Regex;
use serde::Deserialize;
use crate::analyzer::trie::{self, TokenTrie};
/// Description of one language, deserialized with serde.
///
/// Only `name` is required in the input; every other deserialized field falls
/// back to its `Default` value when absent. The two cache fields are skipped
/// by serde and are filled lazily by [`Language::tokens`] and
/// [`Language::complexity_patterns`].
#[derive(Debug, Deserialize)]
pub struct Language {
/// Language name (for example `"Rust"`).
pub name: String,
/// File extensions associated with the language (for example `".rs"`).
#[serde(default)]
pub extensions: Vec<String>,
/// File names associated with the language.
/// NOTE(review): presumably exact names such as `Makefile`; the matching
/// code is not in this file.
#[serde(default)]
pub filenames: Vec<String>,
/// Line-comment markers (for example `"//"`).
#[serde(default)]
pub line_comments: Vec<String>,
/// Block-comment marker pairs (for example `("/*", "*/")`).
///
/// The input form is a list of string lists; see
/// `deserialize_block_comments` for how each inner list becomes a pair.
#[serde(default, deserialize_with = "deserialize_block_comments")]
pub block_comments: Vec<(String, String)>,
/// String-literal delimiter descriptions.
#[serde(default)]
pub string_delimiters: Vec<StringDelimiter>,
/// Optional regex source that `complexity_patterns` compiles into
/// `ComplexityPatterns::function_re`.
#[serde(default)]
pub function_pattern: Option<String>,
/// Keywords that `complexity_patterns` combines into one alternation regex
/// (`ComplexityPatterns::keywords_re`).
#[serde(default)]
pub complexity_keywords: Vec<String>,
/// NOTE(review): presumably whether block comments may nest; the consumer
/// of this flag is outside this file.
#[serde(default)]
pub nested_comments: bool,
/// Lazily built result of `trie::build_from_language`; never deserialized.
#[serde(skip)]
pub(crate) tokens_cache: OnceLock<(TokenTrie, u8)>,
/// Lazily compiled regexes; never deserialized.
#[serde(skip)]
pub(crate) complexity_cache: OnceLock<ComplexityPatterns>,
}
/// Compiled regexes produced by `Language::complexity_patterns`.
#[derive(Debug)]
pub struct ComplexityPatterns {
/// Regex compiled from `Language::function_pattern`; `None` when no pattern
/// is configured or the pattern fails to compile.
pub function_re: Option<Regex>,
/// Regex built from `Language::complexity_keywords`; `None` when there are
/// no keywords to match or the generated pattern fails to compile.
pub keywords_re: Option<Regex>,
}
impl Clone for Language {
    /// Copies every configuration field and gives the copy fresh, empty caches.
    ///
    /// The cached trie and regexes are not carried over; the clone rebuilds
    /// them on its first call to `tokens` / `complexity_patterns`.
    fn clone(&self) -> Self {
        // Exhaustive destructuring: a field added to `Language` without being
        // handled here becomes a compile error instead of a silent omission.
        let Self {
            name,
            extensions,
            filenames,
            line_comments,
            block_comments,
            string_delimiters,
            function_pattern,
            complexity_keywords,
            nested_comments,
            tokens_cache: _,
            complexity_cache: _,
        } = self;

        Self {
            name: name.clone(),
            extensions: extensions.clone(),
            filenames: filenames.clone(),
            line_comments: line_comments.clone(),
            block_comments: block_comments.clone(),
            string_delimiters: string_delimiters.clone(),
            function_pattern: function_pattern.clone(),
            complexity_keywords: complexity_keywords.clone(),
            nested_comments: *nested_comments,
            tokens_cache: OnceLock::new(),
            complexity_cache: OnceLock::new(),
        }
    }
}
impl Language {
    /// Returns the cached `(TokenTrie, u8)` pair for this language.
    ///
    /// The pair is produced by `trie::build_from_language` on the first call
    /// and reused afterwards.
    pub fn tokens(&self) -> &(TokenTrie, u8) {
        self.tokens_cache
            .get_or_init(|| trie::build_from_language(self))
    }

    /// Returns the cached, lazily compiled complexity regexes.
    ///
    /// * `function_re` is compiled from `function_pattern`.
    /// * `keywords_re` is a single capturing alternation of all non-empty
    ///   `complexity_keywords`, each escaped literally.
    ///
    /// A pattern that fails to compile yields `None` for that regex rather
    /// than an error (best-effort behaviour).
    pub fn complexity_patterns(&self) -> &ComplexityPatterns {
        self.complexity_cache.get_or_init(|| {
            /// Approximation of the regex `\w` class, used to decide where a
            /// `\b` assertion makes sense.
            fn is_word_char(c: char) -> bool {
                c.is_alphanumeric() || c == '_'
            }

            let function_re = self
                .function_pattern
                .as_deref()
                .and_then(|pattern| Regex::new(pattern).ok());

            // `\b` is only attached to the edges of a keyword that are word
            // characters. Wrapping the whole alternation in `\b...\b` would
            // stop operator keywords such as `&&` or `?` from matching in
            // ordinary code like `a && b`, because there is no word boundary
            // between a space and `&`.
            let alternatives: Vec<String> = self
                .complexity_keywords
                .iter()
                // An empty keyword would add an alternative that matches everywhere.
                .filter(|keyword| !keyword.is_empty())
                .map(|keyword| {
                    let lead = if keyword.starts_with(is_word_char) { r"\b" } else { "" };
                    let trail = if keyword.ends_with(is_word_char) { r"\b" } else { "" };
                    format!("{lead}{}{trail}", regex::escape(keyword))
                })
                .collect();

            let keywords_re = if alternatives.is_empty() {
                None
            } else {
                // Group 1 still captures the matched keyword, as before.
                Regex::new(&format!("({})", alternatives.join("|"))).ok()
            };

            ComplexityPatterns {
                function_re,
                keywords_re,
            }
        })
    }
}
impl Default for Language {
    /// Builds the placeholder language: named `"Unknown"`, with no extensions,
    /// filenames, comment markers, string delimiters, or complexity
    /// configuration, and with both caches empty.
    fn default() -> Self {
        Self {
            name: String::from("Unknown"),
            extensions: Vec::new(),
            filenames: Vec::new(),
            line_comments: Vec::new(),
            block_comments: Vec::new(),
            string_delimiters: Vec::new(),
            function_pattern: None,
            complexity_keywords: Vec::new(),
            nested_comments: false,
            tokens_cache: OnceLock::new(),
            complexity_cache: OnceLock::new(),
        }
    }
}
/// One string-literal delimiter description, deserialized as part of
/// `Language::string_delimiters`.
#[derive(Debug, Clone, Deserialize)]
pub struct StringDelimiter {
/// Text that opens the string literal (required in the input).
pub start: String,
/// Text that closes the string literal (required in the input).
pub end: String,
/// Optional escape text; `None` when the input omits it.
/// NOTE(review): presumably the escape prefix inside the literal (e.g. `\`);
/// the consumer is outside this file.
#[serde(default)]
pub escape: Option<String>,
}
/// Serde helper for `Language::block_comments`.
///
/// Reads a list of string lists and turns each inner list into a pair made of
/// its first two strings (e.g. `["/*", "*/"]` becomes `("/*", "*/")`).
/// Inner lists with fewer than two elements are dropped without error, and
/// elements beyond the second are ignored.
///
/// # Errors
///
/// Returns the deserializer's error when the input is not a list of lists of
/// strings.
fn deserialize_block_comments<'de, D>(deserializer: D) -> Result<Vec<(String, String)>, D::Error>
where
    D: serde::Deserializer<'de>,
{
    let entries = Vec::<Vec<String>>::deserialize(deserializer)?;

    let mut pairs = Vec::with_capacity(entries.len());
    for entry in entries {
        // Move the first two strings out of the inner list; shorter lists are skipped.
        let mut parts = entry.into_iter();
        if let (Some(open), Some(close)) = (parts.next(), parts.next()) {
            pairs.push((open, close));
        }
    }
    Ok(pairs)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The default language is the "Unknown" placeholder with empty lists.
    #[test]
    fn test_language_default() {
        let fallback = Language::default();

        assert_eq!(fallback.name, "Unknown");
        assert!(fallback.extensions.is_empty());
        assert!(fallback.line_comments.is_empty());
    }

    /// A TOML definition populates the matching fields, including the
    /// list-of-lists form of `block_comments`.
    #[test]
    fn test_language_deserialize() {
        let definition = r#"
name = "Rust"
extensions = [".rs"]
line_comments = ["//"]
block_comments = [["/*", "*/"]]
function_pattern = "fn\\s+\\w+"
complexity_keywords = ["if", "for", "while"]
nested_comments = true
"#;

        let parsed = toml::from_str::<Language>(definition).unwrap();

        assert_eq!(parsed.name, "Rust");
        assert_eq!(parsed.extensions, vec![".rs"]);
        assert_eq!(parsed.line_comments, vec!["//"]);

        let expected_blocks = vec![(String::from("/*"), String::from("*/"))];
        assert_eq!(parsed.block_comments, expected_blocks);

        assert!(parsed.nested_comments);
    }
}