codegraph_c/
preprocessor.rs

1//! C Preprocessor simulation layer
2//!
3//! This module provides a lightweight preprocessing layer that handles common
4//! C macros without requiring actual header files. Unlike a real preprocessor,
5//! this doesn't expand macros textually but instead helps tree-sitter parse
6//! code that uses common macro patterns.
7//!
8//! Key strategies:
9//! 1. Macro recognition: Identify and annotate common macro patterns
10//! 2. Attribute simulation: Convert __attribute__ and similar to parseable form
11//! 3. Kernel macro handling: Special support for Linux kernel macros
12
13use std::collections::HashMap;
14
/// Semantic category assigned to a recognised macro name.
///
/// The category is what [`CPreprocessor::classify_macro`] returns. `Eq` and
/// `Hash` are derived so a kind can be used directly as a `HashMap`/`HashSet`
/// key (for example to bucket macros by category).
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum MacroKind {
    /// Type-like macro (expands to a type): u8, u16, size_t, etc.
    TypeAlias,
    /// Attribute macro: __init, __exit, __user, __packed, etc.
    Attribute,
    /// Function-like macro that wraps a function definition
    FunctionWrapper,
    /// Module/export macro: MODULE_LICENSE, EXPORT_SYMBOL, etc.
    ModuleDeclaration,
    /// Locking primitive: DEFINE_MUTEX, spin_lock, etc.
    LockingPrimitive,
    /// Memory allocation: kmalloc, kfree, etc.
    MemoryOperation,
    /// Conditional compilation marker
    ConditionalMarker,
    /// Generic macro call (the fallback when no other category matches)
    Generic,
}
35
36/// Information about a recognized macro
37#[derive(Debug, Clone)]
38pub struct MacroInfo {
39    pub name: String,
40    pub kind: MacroKind,
41    pub expansion_hint: Option<String>,
42}
43
/// C preprocessor simulation used to make source easier to parse.
///
/// The struct is a set of lookup tables keyed by macro name. `Debug` and
/// `Clone` are derived so that values can be logged and so that containing
/// types can derive those traits themselves.
#[derive(Debug, Clone)]
pub struct CPreprocessor {
    /// Known type-like macros, mapped to the text they stand for
    type_macros: HashMap<String, String>,
    /// Known attribute macros (can be stripped)
    attribute_macros: Vec<String>,
    /// Known function wrapper macros, mapped to the return type they imply
    function_wrappers: HashMap<String, String>,
    /// Known module declaration macros
    module_macros: Vec<String>,
}
55
56impl Default for CPreprocessor {
57    fn default() -> Self {
58        Self::new()
59    }
60}
61
62impl CPreprocessor {
63    pub fn new() -> Self {
64        let mut preprocessor = Self {
65            type_macros: HashMap::new(),
66            attribute_macros: Vec::new(),
67            function_wrappers: HashMap::new(),
68            module_macros: Vec::new(),
69        };
70        preprocessor.init_kernel_macros();
71        preprocessor.init_standard_macros();
72        preprocessor
73    }
74
75    /// Initialize Linux kernel-specific macros
76    fn init_kernel_macros(&mut self) {
77        // Integer types
78        for (macro_name, expansion) in [
79            ("u8", "unsigned char"),
80            ("u16", "unsigned short"),
81            ("u32", "unsigned int"),
82            ("u64", "unsigned long long"),
83            ("s8", "signed char"),
84            ("s16", "signed short"),
85            ("s32", "signed int"),
86            ("s64", "signed long long"),
87            ("__u8", "unsigned char"),
88            ("__u16", "unsigned short"),
89            ("__u32", "unsigned int"),
90            ("__u64", "unsigned long long"),
91            ("__s8", "signed char"),
92            ("__s16", "signed short"),
93            ("__s32", "signed int"),
94            ("__s64", "signed long long"),
95            ("__le16", "unsigned short"),
96            ("__le32", "unsigned int"),
97            ("__le64", "unsigned long long"),
98            ("__be16", "unsigned short"),
99            ("__be32", "unsigned int"),
100            ("__be64", "unsigned long long"),
101            ("bool", "_Bool"),
102            ("atomic_t", "int"),
103            ("atomic64_t", "long long"),
104            ("spinlock_t", "int"),
105            ("rwlock_t", "int"),
106            ("mutex", "int"),
107            ("size_t", "unsigned long"),
108            ("ssize_t", "long"),
109            ("ptrdiff_t", "long"),
110            ("uintptr_t", "unsigned long"),
111            ("intptr_t", "long"),
112            ("phys_addr_t", "unsigned long long"),
113            ("dma_addr_t", "unsigned long long"),
114            ("resource_size_t", "unsigned long long"),
115            ("gfp_t", "unsigned int"),
116            ("fmode_t", "unsigned int"),
117            ("umode_t", "unsigned short"),
118            ("dev_t", "unsigned int"),
119            ("loff_t", "long long"),
120            ("pid_t", "int"),
121            ("uid_t", "unsigned int"),
122            ("gid_t", "unsigned int"),
123            ("ktime_t", "long long"),
124        ] {
125            self.type_macros
126                .insert(macro_name.to_string(), expansion.to_string());
127        }
128
129        // Attribute macros (can be stripped for parsing)
130        self.attribute_macros.extend(
131            [
132                // Section/init attributes
133                "__init",
134                "__exit",
135                "__initdata",
136                "__exitdata",
137                "__initconst",
138                "__devinit",
139                "__devexit",
140                // Compiler hints
141                "__cold",
142                "__hot",
143                "__pure",
144                "__const",
145                "__noreturn",
146                "__malloc",
147                "__weak",
148                "__alias",
149                "__always_inline",
150                "__noinline",
151                "noinline",
152                "inline",
153                "__inline",
154                "__inline__",
155                "__section",
156                "__visible",
157                "__flatten",
158                // Address space annotations
159                "__user",
160                "__kernel",
161                "__iomem",
162                "__percpu",
163                "__rcu",
164                "__force",
165                "__bitwise",
166                "__safe",
167                // Unused/maybe annotations (common error source)
168                "__maybe_unused",
169                "__always_unused",
170                "__unused",
171                // Packing and alignment
172                "__packed",
173                "__aligned",
174                "__cacheline_aligned",
175                "__cacheline_aligned_in_smp",
176                "__page_aligned_data",
177                "__page_aligned_bss",
178                // Deprecation
179                "__deprecated",
180                "__deprecated_for_modules",
181                // Locking annotations
182                "__must_check",
183                "__must_hold",
184                "__acquires",
185                "__releases",
186                "__acquire",
187                "__release",
188                "__cond_lock",
189                // Memory placement
190                "__read_mostly",
191                "__ro_after_init",
192                // Calling conventions
193                "asmlinkage",
194                "fastcall",
195                "regparm",
196                // Export symbols
197                "EXPORT_SYMBOL",
198                "EXPORT_SYMBOL_GPL",
199                "EXPORT_SYMBOL_NS",
200                "EXPORT_SYMBOL_NS_GPL",
201                // Branch prediction
202                "likely",
203                "unlikely",
204                // Memory access
205                "ACCESS_ONCE",
206                "READ_ONCE",
207                "WRITE_ONCE",
208                // Checksum types
209                "__wsum",
210                "__sum16",
211                "__be16",
212                "__be32",
213                "__be64",
214                "__le16",
215                "__le32",
216                "__le64",
217            ]
218            .iter()
219            .map(|s| s.to_string()),
220        );
221
222        // Function wrapper macros
223        for (wrapper, ret_type) in [
224            ("SYSCALL_DEFINE0", "long"),
225            ("SYSCALL_DEFINE1", "long"),
226            ("SYSCALL_DEFINE2", "long"),
227            ("SYSCALL_DEFINE3", "long"),
228            ("SYSCALL_DEFINE4", "long"),
229            ("SYSCALL_DEFINE5", "long"),
230            ("SYSCALL_DEFINE6", "long"),
231            ("COMPAT_SYSCALL_DEFINE0", "long"),
232            ("COMPAT_SYSCALL_DEFINE1", "long"),
233            ("COMPAT_SYSCALL_DEFINE2", "long"),
234            ("COMPAT_SYSCALL_DEFINE3", "long"),
235            ("COMPAT_SYSCALL_DEFINE4", "long"),
236            ("COMPAT_SYSCALL_DEFINE5", "long"),
237            ("COMPAT_SYSCALL_DEFINE6", "long"),
238            ("__setup", "int"),
239            ("early_param", "int"),
240            ("core_param", "int"),
241            ("module_param", "void"),
242            ("module_param_named", "void"),
243            ("DEFINE_PER_CPU", "void"),
244            ("DECLARE_PER_CPU", "void"),
245        ] {
246            self.function_wrappers
247                .insert(wrapper.to_string(), ret_type.to_string());
248        }
249
250        // Module declaration macros
251        self.module_macros.extend(
252            [
253                "MODULE_LICENSE",
254                "MODULE_AUTHOR",
255                "MODULE_DESCRIPTION",
256                "MODULE_VERSION",
257                "MODULE_ALIAS",
258                "MODULE_DEVICE_TABLE",
259                "MODULE_FIRMWARE",
260                "MODULE_INFO",
261                "MODULE_PARM_DESC",
262                "module_init",
263                "module_exit",
264                "late_initcall",
265                "subsys_initcall",
266                "fs_initcall",
267                "device_initcall",
268                "arch_initcall",
269                "core_initcall",
270                "postcore_initcall",
271            ]
272            .iter()
273            .map(|s| s.to_string()),
274        );
275    }
276
277    /// Initialize standard C macros
278    fn init_standard_macros(&mut self) {
279        // Standard C types
280        for (macro_name, expansion) in [
281            ("NULL", "((void*)0)"),
282            ("EOF", "(-1)"),
283            ("true", "1"),
284            ("false", "0"),
285            ("TRUE", "1"),
286            ("FALSE", "0"),
287        ] {
288            self.type_macros
289                .insert(macro_name.to_string(), expansion.to_string());
290        }
291    }
292
293    /// Check if an identifier is a known type macro
294    pub fn is_type_macro(&self, name: &str) -> bool {
295        self.type_macros.contains_key(name)
296    }
297
298    /// Get the expansion hint for a type macro
299    pub fn get_type_expansion(&self, name: &str) -> Option<&str> {
300        self.type_macros.get(name).map(|s| s.as_str())
301    }
302
303    /// Check if an identifier is a known attribute macro
304    pub fn is_attribute_macro(&self, name: &str) -> bool {
305        self.attribute_macros.contains(&name.to_string())
306    }
307
308    /// Check if an identifier is a function wrapper macro
309    pub fn is_function_wrapper(&self, name: &str) -> bool {
310        self.function_wrappers.contains_key(name)
311    }
312
313    /// Check if an identifier is a module declaration macro
314    pub fn is_module_macro(&self, name: &str) -> bool {
315        self.module_macros.contains(&name.to_string())
316    }
317
318    /// Classify a macro by name
319    pub fn classify_macro(&self, name: &str) -> MacroKind {
320        if self.is_type_macro(name) {
321            MacroKind::TypeAlias
322        } else if self.is_attribute_macro(name) {
323            MacroKind::Attribute
324        } else if self.is_function_wrapper(name) {
325            MacroKind::FunctionWrapper
326        } else if self.is_module_macro(name) {
327            MacroKind::ModuleDeclaration
328        } else if name.starts_with("DEFINE_")
329            || name.starts_with("DECLARE_")
330            || name.contains("_LOCK")
331            || name.contains("_MUTEX")
332        {
333            MacroKind::LockingPrimitive
334        } else if name.contains("alloc")
335            || name.contains("free")
336            || name.starts_with("k")
337                && (name.contains("alloc") || name.contains("free") || name.contains("zalloc"))
338        {
339            MacroKind::MemoryOperation
340        } else if name.starts_with("CONFIG_")
341            || name.starts_with("IS_ENABLED")
342            || name.starts_with("IS_BUILTIN")
343            || name.starts_with("IS_MODULE")
344        {
345            MacroKind::ConditionalMarker
346        } else {
347            MacroKind::Generic
348        }
349    }
350
351    /// Preprocess source code to make it more parseable
352    ///
353    /// This performs lightweight transformations:
354    /// - Strips problematic attributes
355    /// - Normalizes some macro patterns
356    pub fn preprocess(&self, source: &str) -> String {
357        let mut result = String::with_capacity(source.len());
358
359        for line in source.lines() {
360            let processed = self.process_line(line);
361            result.push_str(&processed);
362            result.push('\n');
363        }
364
365        result
366    }
367
368    /// Process a single line
369    fn process_line(&self, line: &str) -> String {
370        let trimmed = line.trim();
371
372        // Skip empty lines and comments
373        if trimmed.is_empty() || trimmed.starts_with("//") {
374            return line.to_string();
375        }
376
377        // Handle #include - keep as-is (tree-sitter handles these)
378        if trimmed.starts_with("#include") {
379            return line.to_string();
380        }
381
382        // Handle preprocessor directives
383        // Note: We keep #if/#ifdef/#else/#endif as-is because converting them to comments
384        // can break code structure (e.g., nested conditionals become orphaned code blocks).
385        // Tree-sitter's error tolerance handles these better when left intact.
386
387        // Strip #define macros that often cause parsing issues
388        // (function-like macros mid-declaration)
389        if trimmed.starts_with("#define ")
390            || trimmed.starts_with("#undef ")
391            || trimmed.starts_with("#pragma ")
392            || trimmed.starts_with("#error ")
393            || trimmed.starts_with("#warning ")
394        {
395            // Convert to empty comment to preserve line numbers
396            return "/* preprocessor directive stripped */".to_string();
397        }
398
399        // Strip known attribute macros that confuse the parser
400        let mut result = line.to_string();
401        for attr in &self.attribute_macros {
402            // Handle both plain attributes and function-like attributes
403            // e.g., __init, __section(".text")
404            let patterns = [format!("{attr} "), format!("{attr}\t"), format!("{attr}(")];
405
406            for pattern in &patterns {
407                if result.contains(pattern.as_str()) {
408                    // For function-like attributes, need to handle parentheses
409                    if pattern.ends_with('(') {
410                        // Find matching closing paren and remove the whole thing
411                        if let Some(start) = result.find(attr) {
412                            if let Some(paren_start) = result[start..].find('(') {
413                                let abs_paren = start + paren_start;
414                                let mut depth = 1;
415                                let mut end = abs_paren + 1;
416                                for (i, c) in result[abs_paren + 1..].char_indices() {
417                                    match c {
418                                        '(' => depth += 1,
419                                        ')' => {
420                                            depth -= 1;
421                                            if depth == 0 {
422                                                end = abs_paren + 1 + i + 1;
423                                                break;
424                                            }
425                                        }
426                                        _ => {}
427                                    }
428                                }
429                                result = format!("{}{}", &result[..start], &result[end..]);
430                            }
431                        }
432                    } else {
433                        result = result.replace(pattern, "");
434                    }
435                }
436            }
437        }
438
439        // Handle offsetof(type, member) - common source of errors
440        // Replace with 0 (a constant that tree-sitter can parse)
441        while let Some(start) = result.find("offsetof(") {
442            let rest = &result[start + 9..]; // after "offsetof("
443            let mut depth = 1;
444            let mut end_paren = 0;
445
446            for (i, c) in rest.char_indices() {
447                match c {
448                    '(' => depth += 1,
449                    ')' => {
450                        depth -= 1;
451                        if depth == 0 {
452                            end_paren = i;
453                            break;
454                        }
455                    }
456                    _ => {}
457                }
458            }
459
460            if end_paren > 0 {
461                result = format!(
462                    "{}0{}",
463                    &result[..start],
464                    &result[start + 9 + end_paren + 1..]
465                );
466            } else {
467                break;
468            }
469        }
470
471        // Handle container_of(ptr, type, member) - 6.4% of errors
472        // Replace with a simpler cast expression: ((type *)ptr)
473        while let Some(start) = result.find("container_of(") {
474            let rest = &result[start + 13..]; // after "container_of("
475            let mut depth = 1;
476            let mut end_paren = 0;
477            let mut first_comma = None;
478
479            for (i, c) in rest.char_indices() {
480                match c {
481                    '(' => depth += 1,
482                    ')' => {
483                        depth -= 1;
484                        if depth == 0 {
485                            end_paren = i;
486                            break;
487                        }
488                    }
489                    ',' if depth == 1 && first_comma.is_none() => {
490                        first_comma = Some(i);
491                    }
492                    _ => {}
493                }
494            }
495
496            if end_paren > 0 {
497                // Extract ptr (first argument)
498                let ptr = if let Some(comma_pos) = first_comma {
499                    rest[..comma_pos].trim()
500                } else {
501                    "ptr"
502                };
503                // Replace with (void*)ptr - simple cast that tree-sitter can parse
504                let replacement = format!("((void*){ptr})");
505                result = format!(
506                    "{}{}{}",
507                    &result[..start],
508                    replacement,
509                    &result[start + 13 + end_paren + 1..]
510                );
511            } else {
512                break;
513            }
514        }
515
516        // Handle __attribute__((...)) - complex case
517        while let Some(start) = result.find("__attribute__") {
518            if let Some(paren_start) = result[start..].find("((") {
519                let abs_start = start + paren_start;
520                let mut depth = 2; // Starting after "(("
521                let mut end = abs_start + 2;
522                for (i, c) in result[abs_start + 2..].char_indices() {
523                    match c {
524                        '(' => depth += 1,
525                        ')' => {
526                            depth -= 1;
527                            if depth == 0 {
528                                end = abs_start + 2 + i + 1;
529                                break;
530                            }
531                        }
532                        _ => {}
533                    }
534                }
535                result = format!("{}{}", &result[..start], &result[end..]);
536            } else {
537                break;
538            }
539        }
540
541        result
542    }
543
544    /// Get information about all recognized macros in source
545    pub fn analyze_macros(&self, source: &str) -> Vec<MacroInfo> {
546        let mut macros = Vec::new();
547
548        // Simple token extraction for macro detection
549        for word in source.split(|c: char| !c.is_alphanumeric() && c != '_') {
550            if word.is_empty() {
551                continue;
552            }
553
554            let kind = self.classify_macro(word);
555            if kind != MacroKind::Generic {
556                macros.push(MacroInfo {
557                    name: word.to_string(),
558                    kind: kind.clone(),
559                    expansion_hint: self.get_type_expansion(word).map(|s| s.to_string()),
560                });
561            }
562        }
563
564        // Deduplicate
565        macros.sort_by(|a, b| a.name.cmp(&b.name));
566        macros.dedup_by(|a, b| a.name == b.name);
567
568        macros
569    }
570}
571
#[cfg(test)]
mod tests {
    use super::*;

    /// Names registered as type macros are reported; unknown names are not.
    #[test]
    fn test_type_macro_recognition() {
        let pp = CPreprocessor::new();
        for known in ["u8", "u32", "size_t"] {
            assert!(pp.is_type_macro(known));
        }
        assert!(!pp.is_type_macro("unknown_type"));
    }

    /// Names registered as attribute macros are reported; other names are not.
    #[test]
    fn test_attribute_macro_recognition() {
        let pp = CPreprocessor::new();
        for known in ["__init", "__exit", "__user"] {
            assert!(pp.is_attribute_macro(known));
        }
        assert!(!pp.is_attribute_macro("regular_function"));
    }

    /// One representative name per category is classified as expected.
    #[test]
    fn test_macro_classification() {
        let pp = CPreprocessor::new();
        let expectations = [
            ("u32", MacroKind::TypeAlias),
            ("__init", MacroKind::Attribute),
            ("MODULE_LICENSE", MacroKind::ModuleDeclaration),
            ("DEFINE_MUTEX", MacroKind::LockingPrimitive),
            ("CONFIG_DEBUG", MacroKind::ConditionalMarker),
        ];
        for (name, expected) in expectations {
            assert_eq!(pp.classify_macro(name), expected);
        }
    }

    /// A plain attribute is removed while the rest of the declaration stays.
    #[test]
    fn test_preprocess_strips_attributes() {
        let processed =
            CPreprocessor::new().preprocess("static __init int my_init(void) { return 0; }");
        assert!(!processed.contains("__init"));
        assert!(processed.contains("static"));
        assert!(processed.contains("int my_init"));
    }

    /// The GNU `__attribute__((...))` form is removed as a whole.
    #[test]
    fn test_preprocess_handles_attribute_syntax() {
        let processed = CPreprocessor::new().preprocess("void __attribute__((packed)) my_struct;");
        assert!(!processed.contains("__attribute__"));
        assert!(processed.contains("void"));
    }

    /// Recognised names in a snippet all show up in the analysis result.
    #[test]
    fn test_analyze_macros() {
        let pp = CPreprocessor::new();
        let found = pp.analyze_macros("u32 foo; __init static int bar(size_t n) { return 0; }");

        let names: Vec<&str> = found.iter().map(|info| info.name.as_str()).collect();
        for expected in ["u32", "__init", "size_t"] {
            assert!(names.contains(&expected));
        }
    }

    /// `#include` lines survive preprocessing verbatim.
    #[test]
    fn test_preprocess_preserves_includes() {
        let includes = ["#include <linux/module.h>", "#include \"myheader.h\""];
        let processed = CPreprocessor::new().preprocess(&includes.join("\n"));
        for include in includes {
            assert!(processed.contains(include));
        }
    }
}