Skip to main content

codelens_core/language/
definition.rs

1//! Language definition structures.
2
3use std::sync::OnceLock;
4
5use regex::Regex;
6use serde::Deserialize;
7
8use crate::analyzer::trie::{self, TokenTrie};
9
10/// Definition of a programming language.
11#[derive(Debug, Deserialize)]
12pub struct Language {
13    /// Language display name (e.g., "Rust", "Python").
14    pub name: String,
15
16    /// File extensions (e.g., [".rs", ".rlib"]).
17    #[serde(default)]
18    pub extensions: Vec<String>,
19
20    /// Special filenames (e.g., ["Makefile", "Dockerfile"]).
21    #[serde(default)]
22    pub filenames: Vec<String>,
23
24    /// Single-line comment prefixes (e.g., ["//", "#"]).
25    #[serde(default)]
26    pub line_comments: Vec<String>,
27
28    /// Block comment delimiters (e.g., [("/*", "*/")]).
29    #[serde(default, deserialize_with = "deserialize_block_comments")]
30    pub block_comments: Vec<(String, String)>,
31
32    /// String literal delimiters for accurate parsing.
33    #[serde(default)]
34    pub string_delimiters: Vec<StringDelimiter>,
35
36    /// Regex pattern to match function definitions.
37    #[serde(default)]
38    pub function_pattern: Option<String>,
39
40    /// Keywords that contribute to cyclomatic complexity.
41    #[serde(default)]
42    pub complexity_keywords: Vec<String>,
43
44    /// Whether block comments can be nested (e.g., Rust allows /* /* */ */).
45    #[serde(default)]
46    pub nested_comments: bool,
47
48    /// Lazily-built token trie and process mask.
49    #[serde(skip)]
50    pub(crate) tokens_cache: OnceLock<(TokenTrie, u8)>,
51
52    /// Lazily-compiled complexity regex patterns.
53    #[serde(skip)]
54    pub(crate) complexity_cache: OnceLock<ComplexityPatterns>,
55}
56
57/// Precompiled regex patterns for complexity analysis.
58#[derive(Debug)]
59pub struct ComplexityPatterns {
60    /// Compiled function_pattern regex.
61    pub function_re: Option<Regex>,
62    /// Single alternation regex matching all complexity keywords: `\b(if|else|...)\b`
63    pub keywords_re: Option<Regex>,
64}
65
66impl Clone for Language {
67    fn clone(&self) -> Self {
68        Self {
69            name: self.name.clone(),
70            extensions: self.extensions.clone(),
71            filenames: self.filenames.clone(),
72            line_comments: self.line_comments.clone(),
73            block_comments: self.block_comments.clone(),
74            string_delimiters: self.string_delimiters.clone(),
75            function_pattern: self.function_pattern.clone(),
76            complexity_keywords: self.complexity_keywords.clone(),
77            nested_comments: self.nested_comments,
78            tokens_cache: OnceLock::new(),
79            complexity_cache: OnceLock::new(),
80        }
81    }
82}
83
84impl Language {
85    /// Get the token trie and process mask, building them on first access.
86    pub fn tokens(&self) -> &(TokenTrie, u8) {
87        self.tokens_cache
88            .get_or_init(|| trie::build_from_language(self))
89    }
90
91    /// Get precompiled complexity regex patterns, building them on first access.
92    pub fn complexity_patterns(&self) -> &ComplexityPatterns {
93        self.complexity_cache.get_or_init(|| {
94            let function_re = self
95                .function_pattern
96                .as_ref()
97                .and_then(|p| Regex::new(p).ok());
98
99            let keywords_re = if self.complexity_keywords.is_empty() {
100                None
101            } else {
102                let alts: Vec<String> = self
103                    .complexity_keywords
104                    .iter()
105                    .map(|k| regex::escape(k))
106                    .collect();
107                let pattern = format!(r"\b({})\b", alts.join("|"));
108                Regex::new(&pattern).ok()
109            };
110
111            ComplexityPatterns {
112                function_re,
113                keywords_re,
114            }
115        })
116    }
117}
118
119impl Default for Language {
120    fn default() -> Self {
121        Self {
122            name: "Unknown".to_string(),
123            extensions: vec![],
124            filenames: vec![],
125            line_comments: vec![],
126            block_comments: vec![],
127            string_delimiters: vec![],
128            function_pattern: None,
129            complexity_keywords: vec![],
130            nested_comments: false,
131            tokens_cache: OnceLock::new(),
132            complexity_cache: OnceLock::new(),
133        }
134    }
135}
136
137/// String delimiter definition.
138#[derive(Debug, Clone, Deserialize)]
139pub struct StringDelimiter {
140    /// Opening delimiter.
141    pub start: String,
142    /// Closing delimiter.
143    pub end: String,
144    /// Escape character (if any).
145    #[serde(default)]
146    pub escape: Option<String>,
147}
148
149/// Custom deserializer for block comments that handles TOML array format.
150fn deserialize_block_comments<'de, D>(deserializer: D) -> Result<Vec<(String, String)>, D::Error>
151where
152    D: serde::Deserializer<'de>,
153{
154    let raw: Vec<Vec<String>> = Vec::deserialize(deserializer)?;
155    Ok(raw
156        .into_iter()
157        .filter_map(|pair| {
158            if pair.len() >= 2 {
159                Some((pair[0].clone(), pair[1].clone()))
160            } else {
161                None
162            }
163        })
164        .collect())
165}
166
#[cfg(test)]
mod tests {
    use super::*;

    /// The default language is the "Unknown" placeholder with empty lists.
    #[test]
    fn test_language_default() {
        let unknown = Language::default();

        assert_eq!(unknown.name, "Unknown");
        assert!(unknown.extensions.is_empty());
        assert!(unknown.line_comments.is_empty());
    }

    /// A TOML definition populates the name, lists, block comment pairs and
    /// the nested-comment flag.
    #[test]
    fn test_language_deserialize() {
        let config_text = r#"
            name = "Rust"
            extensions = [".rs"]
            line_comments = ["//"]
            block_comments = [["/*", "*/"]]
            function_pattern = "fn\\s+\\w+"
            complexity_keywords = ["if", "for", "while"]
            nested_comments = true
        "#;

        let parsed: Language = toml::from_str(config_text).unwrap();

        assert_eq!(parsed.name, "Rust");
        assert_eq!(parsed.extensions, vec![".rs"]);
        assert_eq!(parsed.line_comments, vec!["//"]);

        let expected_blocks = vec![(String::from("/*"), String::from("*/"))];
        assert_eq!(parsed.block_comments, expected_blocks);

        assert!(parsed.nested_comments);
    }
}