Skip to main content

codelens_core/analyzer/
file.rs

1//! Single file analyzer.
2
3use std::fs;
4use std::path::Path;
5use std::sync::Arc;
6
7use crate::config::Config;
8use crate::error::Result;
9use crate::language::{Language, LanguageRegistry};
10
11use super::complexity::ComplexityAnalyzer;
12use super::stats::{FileStats, LineStats};
13
/// Describes a string literal that is still open at the end of a line, so the
/// line counter can recognise where it finishes.
#[derive(Clone, Debug)]
struct StringDelimiter {
    /// Text that terminates the literal (for example `"`, `"""` or `"#`).
    end_pattern: String,
    /// `true` for raw strings, whose contents are not subject to escape processing.
    is_raw: bool,
    /// `true` for Python docstrings, which are tallied as comments instead of code.
    is_docstring: bool,
}
24
25/// Analyzes individual source files.
26pub struct FileAnalyzer {
27    registry: Arc<LanguageRegistry>,
28    complexity_analyzer: ComplexityAnalyzer,
29    min_lines: Option<usize>,
30    max_lines: Option<usize>,
31}
32
33impl FileAnalyzer {
34    /// Create a new file analyzer.
35    pub fn new(registry: Arc<LanguageRegistry>, config: &Config) -> Self {
36        Self {
37            registry,
38            complexity_analyzer: ComplexityAnalyzer::new(),
39            min_lines: config.filter.min_lines,
40            max_lines: config.filter.max_lines,
41        }
42    }
43
44    /// Analyze a single file.
45    ///
46    /// Returns `None` if the file's language is not recognized.
47    pub fn analyze(&self, path: &Path) -> Result<Option<FileStats>> {
48        // Detect language
49        let language = match self.registry.detect(path) {
50            Some(lang) => lang,
51            None => return Ok(None),
52        };
53
54        // Read file content
55        let content = match fs::read_to_string(path) {
56            Ok(c) => c,
57            Err(_) => {
58                // Try reading as lossy UTF-8
59                match fs::read(path) {
60                    Ok(bytes) => String::from_utf8_lossy(&bytes).into_owned(),
61                    Err(e) => {
62                        return Err(crate::error::Error::FileRead {
63                            path: path.to_path_buf(),
64                            source: e,
65                        })
66                    }
67                }
68            }
69        };
70
71        // Count lines
72        let lines = self.count_lines(&content, &language);
73
74        // Apply line filters
75        if let Some(min) = self.min_lines {
76            if lines.total < min {
77                return Ok(None);
78            }
79        }
80        if let Some(max) = self.max_lines {
81            if lines.total > max {
82                return Ok(None);
83            }
84        }
85
86        // Get file size
87        let size = fs::metadata(path).map(|m| m.len()).unwrap_or(0);
88
89        // Analyze complexity
90        let complexity = self.complexity_analyzer.analyze(&content, &language);
91
92        Ok(Some(FileStats {
93            path: path.to_path_buf(),
94            language: language.name.clone(),
95            lines,
96            size,
97            complexity,
98        }))
99    }
100
101    /// Count lines in file content.
102    fn count_lines(&self, content: &str, lang: &Language) -> LineStats {
103        let mut stats = LineStats::default();
104        let mut in_block_comment = false;
105        let mut block_comment_end = "";
106        let mut in_multiline_string = false;
107        let mut string_delimiter: Option<StringDelimiter> = None;
108
109        for line in content.lines() {
110            stats.total += 1;
111            let trimmed = line.trim();
112
113            // Empty line
114            if trimmed.is_empty() {
115                stats.blank += 1;
116                continue;
117            }
118
119            // Inside multiline string
120            if in_multiline_string {
121                if let Some(ref delim) = string_delimiter {
122                    // Docstrings count as comments, regular strings as code
123                    if delim.is_docstring {
124                        stats.comment += 1;
125                    } else {
126                        stats.code += 1;
127                    }
128                    if self.line_ends_string(line, delim) {
129                        in_multiline_string = false;
130                        string_delimiter = None;
131                    }
132                }
133                continue;
134            }
135
136            // Inside block comment
137            if in_block_comment {
138                stats.comment += 1;
139                if let Some(pos) = trimmed.find(block_comment_end) {
140                    // Check if there's code after the comment end
141                    let after = trimmed[pos + block_comment_end.len()..].trim();
142                    if !after.is_empty() && !self.starts_with_comment(after, lang) {
143                        // Line has code after comment - count as code too
144                        // But we already counted as comment, so adjust
145                        stats.comment -= 1;
146                        stats.code += 1;
147                    }
148                    in_block_comment = false;
149                }
150                continue;
151            }
152
153            // Check if line starts a multiline string
154            // starts_multiline_string only returns Some if the string is NOT closed on the same line
155            if let Some(delim) = self.starts_multiline_string(line, lang) {
156                // Docstrings count as comments, regular strings as code
157                if delim.is_docstring {
158                    stats.comment += 1;
159                } else {
160                    stats.code += 1;
161                }
162                in_multiline_string = true;
163                string_delimiter = Some(delim);
164                continue;
165            }
166
167            // Check for single-line Python docstring ("""...""" on one line)
168            if lang.name == "Python" {
169                if let Some(is_docstring) = self.is_single_line_docstring(trimmed) {
170                    if is_docstring {
171                        stats.comment += 1;
172                    } else {
173                        stats.code += 1;
174                    }
175                    continue;
176                }
177            }
178
179            // Check for block comment start
180            let mut found_block_start = false;
181            for (start, end) in &lang.block_comments {
182                if let Some(start_pos) = trimmed.find(start.as_str()) {
183                    // Check if it's inside a string (simplified check)
184                    let before = &trimmed[..start_pos];
185                    if self.is_in_string(before, lang) {
186                        continue;
187                    }
188
189                    found_block_start = true;
190                    let after_start = &trimmed[start_pos + start.len()..];
191
192                    if let Some(end_pos) = after_start.find(end.as_str()) {
193                        // Single-line block comment
194                        let after_end = after_start[end_pos + end.len()..].trim();
195                        if before.trim().is_empty() && after_end.is_empty() {
196                            stats.comment += 1;
197                        } else {
198                            // Mixed line - count as code
199                            stats.code += 1;
200                        }
201                    } else {
202                        // Multi-line block comment starts
203                        in_block_comment = true;
204                        block_comment_end = end;
205                        if before.trim().is_empty() {
206                            stats.comment += 1;
207                        } else {
208                            // Code before comment start
209                            stats.code += 1;
210                        }
211                    }
212                    break;
213                }
214            }
215
216            if found_block_start {
217                continue;
218            }
219
220            // Check for line comment
221            let is_line_comment = lang
222                .line_comments
223                .iter()
224                .any(|prefix| trimmed.starts_with(prefix.as_str()));
225
226            if is_line_comment {
227                stats.comment += 1;
228            } else {
229                stats.code += 1;
230            }
231        }
232
233        stats
234    }
235
236    /// Check if a line starts a multiline string literal.
237    /// Returns the delimiter info if a multiline string starts on this line.
238    fn starts_multiline_string(&self, line: &str, lang: &Language) -> Option<StringDelimiter> {
239        // Check for Rust raw strings: r#"..."# or r##"..."##
240        if lang.name == "Rust" {
241            if let Some(delim) = self.detect_rust_raw_string_start(line) {
242                return Some(delim);
243            }
244        }
245
246        // Check for Python triple-quoted strings (including docstrings)
247        if lang.name == "Python" {
248            for pattern in &["\"\"\"", "'''"] {
249                if let Some(pos) = line.find(pattern) {
250                    let before = &line[..pos];
251                    if !self.is_in_string(before, lang) {
252                        let after = &line[pos + 3..];
253                        // Check if it closes on the same line
254                        if after.find(pattern).is_none() {
255                            // Docstring: no assignment before the triple quotes
256                            let is_docstring = !before.contains('=');
257                            return Some(StringDelimiter {
258                                end_pattern: pattern.to_string(),
259                                is_raw: false,
260                                is_docstring,
261                            });
262                        }
263                    }
264                }
265            }
266        }
267
268        // Check for regular multiline strings (string not closed on same line)
269        let mut in_string = false;
270        let mut string_char = '"';
271        let mut escape_next = false;
272
273        let chars: Vec<char> = line.chars().collect();
274        let mut i = 0;
275        while i < chars.len() {
276            let c = chars[i];
277
278            if escape_next {
279                escape_next = false;
280                i += 1;
281                continue;
282            }
283
284            if c == '\\' && in_string {
285                escape_next = true;
286                i += 1;
287                continue;
288            }
289
290            if (c == '"' || c == '\'') && !in_string {
291                // Calculate byte position for string slice
292                let byte_pos: usize = chars[..i].iter().map(|ch| ch.len_utf8()).sum();
293                let before = &line[..byte_pos];
294                if !self.is_in_string(before, lang) {
295                    in_string = true;
296                    string_char = c;
297                }
298            } else if c == string_char && in_string {
299                in_string = false;
300            }
301
302            i += 1;
303        }
304
305        if in_string {
306            return Some(StringDelimiter {
307                end_pattern: string_char.to_string(),
308                is_raw: false,
309                is_docstring: false,
310            });
311        }
312
313        None
314    }
315
316    /// Detect Rust raw string start (r#"..."# or r##"..."##, etc.)
317    fn detect_rust_raw_string_start(&self, line: &str) -> Option<StringDelimiter> {
318        let bytes = line.as_bytes();
319        let len = bytes.len();
320        let mut i = 0;
321
322        while i < len {
323            // Look for 'r' followed by optional '#' and '"'
324            if bytes[i] == b'r' && i + 1 < len {
325                let start = i;
326                i += 1;
327
328                // Count the number of '#' after 'r'
329                let mut hash_count = 0;
330                while i < len && bytes[i] == b'#' {
331                    hash_count += 1;
332                    i += 1;
333                }
334
335                // Check for opening '"'
336                if i < len && bytes[i] == b'"' {
337                    // Check that 'r' is not part of an identifier
338                    if start == 0 || !bytes[start - 1].is_ascii_alphanumeric() {
339                        // Build the closing pattern: "# repeated hash_count times
340                        let end_pattern = format!("\"{}", "#".repeat(hash_count));
341
342                        // Check if it closes on the same line
343                        let after_quote = &line[i + 1..];
344                        if after_quote.find(&end_pattern).is_none() {
345                            return Some(StringDelimiter {
346                                end_pattern,
347                                is_raw: true,
348                                is_docstring: false,
349                            });
350                        }
351                    }
352                }
353            }
354            i += 1;
355        }
356
357        None
358    }
359
360    /// Check if a line ends the current multiline string.
361    fn line_ends_string(&self, line: &str, delim: &StringDelimiter) -> bool {
362        if delim.is_raw {
363            // For raw strings, just look for the closing pattern
364            line.contains(&delim.end_pattern)
365        } else {
366            // For regular strings, need to handle escapes
367            let mut chars = line.chars().peekable();
368            let target: Vec<char> = delim.end_pattern.chars().collect();
369
370            while let Some(c) = chars.next() {
371                if c == '\\' {
372                    // Skip escaped character
373                    chars.next();
374                    continue;
375                }
376
377                if !target.is_empty() && c == target[0] {
378                    // Check if this matches the closing pattern
379                    let mut matched = true;
380                    for expected in target.iter().skip(1) {
381                        if chars.next() != Some(*expected) {
382                            matched = false;
383                            break;
384                        }
385                    }
386                    if matched {
387                        return true;
388                    }
389                }
390            }
391            false
392        }
393    }
394
395    /// Check if a line is a single-line Python docstring.
396    /// Returns Some(true) if it's a docstring, Some(false) if it's a regular string assignment,
397    /// None if it doesn't contain a complete triple-quoted string.
398    fn is_single_line_docstring(&self, trimmed: &str) -> Option<bool> {
399        for pattern in &["\"\"\"", "'''"] {
400            if let Some(start_pos) = trimmed.find(pattern) {
401                let after_start = &trimmed[start_pos + 3..];
402                // Check if it closes on the same line
403                if let Some(end_pos) = after_start.find(pattern) {
404                    // Make sure there's nothing significant after the closing quotes
405                    let after_end = after_start[end_pos + 3..].trim();
406                    if after_end.is_empty() || after_end.starts_with('#') {
407                        // It's a complete triple-quoted string on one line
408                        let before = &trimmed[..start_pos];
409                        // Docstring: no assignment before the triple quotes
410                        return Some(!before.contains('='));
411                    }
412                }
413            }
414        }
415        None
416    }
417
418    /// Check if a string position is likely inside a string literal.
419    fn is_in_string(&self, text: &str, _lang: &Language) -> bool {
420        // Simplified check: count unescaped quotes
421        let mut in_string = false;
422        let mut chars = text.chars().peekable();
423
424        while let Some(c) = chars.next() {
425            match c {
426                '"' | '\'' => {
427                    in_string = !in_string;
428                }
429                '\\' => {
430                    // Skip escaped character
431                    chars.next();
432                }
433                _ => {}
434            }
435        }
436
437        in_string
438    }
439
440    /// Check if text starts with a comment.
441    fn starts_with_comment(&self, text: &str, lang: &Language) -> bool {
442        lang.line_comments
443            .iter()
444            .any(|prefix| text.starts_with(prefix.as_str()))
445            || lang
446                .block_comments
447                .iter()
448                .any(|(start, _)| text.starts_with(start.as_str()))
449    }
450}
451
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal Rust language definition used by the line-counting tests.
    fn rust_language() -> Language {
        Language {
            name: "Rust".to_string(),
            extensions: vec![".rs".to_string()],
            filenames: vec![],
            line_comments: vec!["//".to_string()],
            block_comments: vec![("/*".to_string(), "*/".to_string())],
            string_delimiters: vec![],
            function_pattern: None,
            complexity_keywords: vec![],
            nested_comments: true,
        }
    }

    /// Counts the lines of `content` as Rust, using an empty registry and the
    /// default configuration.
    fn count_rust(content: &str) -> LineStats {
        let registry = Arc::new(LanguageRegistry::empty());
        let analyzer = FileAnalyzer::new(registry, &Config::default());
        analyzer.count_lines(content, &rust_language())
    }

    #[test]
    fn test_count_lines_basic() {
        let stats = count_rust("fn main() {\n    println!(\"hello\");\n}\n");

        assert_eq!(stats.total, 3);
        assert_eq!(stats.code, 3);
        assert_eq!(stats.blank, 0);
        assert_eq!(stats.comment, 0);
    }

    #[test]
    fn test_count_lines_with_comments() {
        let stats = count_rust(
            "// This is a comment\nfn main() {\n    /* block comment */\n    println!(\"hello\");\n}\n",
        );

        assert_eq!(stats.total, 5);
        assert_eq!(stats.code, 3);
        assert_eq!(stats.comment, 2);
        assert_eq!(stats.blank, 0);
    }

    #[test]
    fn test_count_lines_multiline_comment() {
        let stats = count_rust("/*\n * Multi-line\n * comment\n */\nfn main() {}\n");

        assert_eq!(stats.total, 5);
        assert_eq!(stats.code, 1);
        assert_eq!(stats.comment, 4);
        assert_eq!(stats.blank, 0);
    }

    #[test]
    fn test_count_lines_multiline_string() {
        // The middle line looks like a comment but sits inside a string literal.
        let stats = count_rust("let s = \"hello\n// not a comment\nworld\";\n");

        assert_eq!(stats.total, 3);
        assert_eq!(stats.code, 3, "All lines should be code (inside string)");
        assert_eq!(stats.comment, 0, "No comments - // is inside string");
        assert_eq!(stats.blank, 0);
    }

    #[test]
    fn test_count_lines_raw_string() {
        // Both comment-looking lines sit inside a raw string literal.
        let stats =
            count_rust("let s = r#\"hello\n// not a comment\n/* also not */\nworld\"#;\n");

        assert_eq!(stats.total, 4);
        assert_eq!(stats.code, 4, "All lines should be code (inside raw string)");
        assert_eq!(stats.comment, 0, "No comments - everything is inside raw string");
        assert_eq!(stats.blank, 0);
    }
}
541}