codegraph_c/pipeline/
mod.rs

1//! Layered processing pipeline for C source code
2//!
3//! This module provides a multi-stage processing pipeline that transforms C source code
4//! to make it more parseable by tree-sitter, while preserving semantic information.
5//!
6//! ## Pipeline Stages
7//!
8//! 1. **Platform Detection** - Identify target platform (Linux, FreeBSD, Darwin)
9//! 2. **Header Stub Injection** - Inject type definitions for known headers
10//! 3. **Conditional Evaluation** - Strip `#if 0` blocks, optionally evaluate simple conditions
11//! 4. **GCC Neutralization** - Remove/replace GCC extensions
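//! 5. **Attribute Stripping** - Remove platform-specific attributes
//! 6. **Macro Neutralization** - Neutralize kernel macros (`likely`/`unlikely`, `BUILD_BUG_ON`, etc.)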
13//!
14//! After these stages, the code is ready for tree-sitter parsing.
15
16mod conditionals;
17mod gcc;
18mod macros;
19
20pub use conditionals::{evaluate_conditionals, ConditionalStrategy};
21pub use gcc::{GccNeutralizer, NeutralizedSource, TransformKind, Transformation};
22pub use macros::{MacroNeutralizer, MacroStats};
23
24use crate::platform::{DetectionResult, HeaderStubs, PlatformRegistry};
25
/// Configuration for the processing pipeline.
///
/// Each flag enables or disables one stage of [`Pipeline::process`]; the
/// stages themselves always run in the same fixed order.
#[derive(Debug, Clone)]
pub struct PipelineConfig {
    /// Whether to inject header stubs.
    ///
    /// When enabled and a platform module is found for the detected (or
    /// forced) platform, the stubs matching the source's includes are
    /// prepended to the source.
    pub inject_stubs: bool,
    /// Strategy for handling preprocessor conditionals.
    ///
    /// Passed as-is to `evaluate_conditionals`; this stage has no separate
    /// on/off flag.
    pub conditional_strategy: ConditionalStrategy,
    /// Whether to neutralize GCC extensions.
    pub neutralize_gcc: bool,
    /// Whether to strip platform-specific attributes.
    ///
    /// Only has an effect when a platform module is found, because the list
    /// of attributes to strip comes from that module.
    pub strip_attributes: bool,
    /// Whether to neutralize kernel macros.
    pub neutralize_macros: bool,
    /// Optional platform ID to force (bypasses detection).
    ///
    /// When set, the pipeline reports this ID with confidence `1.0` and a
    /// single matched pattern of `"forced"`.
    pub force_platform: Option<String>,
}
42
impl Default for PipelineConfig {
    /// Full processing: every optional stage enabled, conditionals handled
    /// with `ConditionalStrategy::EvaluateSimple`, and the platform detected
    /// from the source rather than forced.
    fn default() -> Self {
        Self {
            inject_stubs: true,
            conditional_strategy: ConditionalStrategy::EvaluateSimple,
            neutralize_gcc: true,
            strip_attributes: true,
            neutralize_macros: true,
            // `None` means "run platform detection".
            force_platform: None,
        }
    }
}
55
56impl PipelineConfig {
57    /// Configuration for minimal processing
58    pub fn minimal() -> Self {
59        Self {
60            inject_stubs: false,
61            conditional_strategy: ConditionalStrategy::KeepAll,
62            neutralize_gcc: false,
63            strip_attributes: false,
64            neutralize_macros: false,
65            force_platform: None,
66        }
67    }
68
69    /// Configuration optimized for kernel code
70    pub fn for_kernel_code() -> Self {
71        Self {
72            inject_stubs: true,
73            conditional_strategy: ConditionalStrategy::EvaluateSimple,
74            neutralize_gcc: true,
75            strip_attributes: true,
76            neutralize_macros: true,
77            force_platform: Some("linux".to_string()),
78        }
79    }
80}
81
/// Result of pipeline processing, as returned by [`Pipeline::process`].
#[derive(Debug)]
pub struct PipelineResult {
    /// Processed source code ready for parsing.
    pub source: String,
    /// Detected or forced platform.
    pub platform: DetectionResult,
    /// GCC transformation records (for position mapping).
    ///
    /// Empty when GCC neutralization is disabled in the configuration.
    pub transformations: Vec<Transformation>,
    /// Statistics about processing.
    pub stats: PipelineStats,
}
94
/// Statistics about pipeline processing.
///
/// Counters for stages that did not run keep their `Default` value.
#[derive(Debug, Default, Clone)]
pub struct PipelineStats {
    /// Number of lines in the injected stub text that contain `typedef`
    /// (used as the measure of how many stubs were injected).
    pub stubs_injected: usize,
    /// Number of conditional blocks stripped, as reported by
    /// `evaluate_conditionals`.
    pub conditionals_stripped: usize,
    /// Number of GCC extensions neutralized (one per transformation record).
    pub gcc_neutralized: usize,
    /// Number of attributes stripped.
    pub attributes_stripped: usize,
    /// Statistics about macro neutralization.
    pub macro_stats: MacroStats,
    /// Original source length, in bytes.
    pub original_length: usize,
    /// Processed source length, in bytes.
    pub processed_length: usize,
}
113
/// The processing pipeline.
///
/// Holds the state shared between calls to [`Pipeline::process`]: the
/// platform registry and the GCC neutralizer.
pub struct Pipeline {
    /// Used for platform detection and for looking up platform modules by ID.
    registry: PlatformRegistry,
    /// Used by the GCC neutralization stage.
    gcc_neutralizer: GccNeutralizer,
}
119
impl Default for Pipeline {
    /// Equivalent to [`Pipeline::new`].
    fn default() -> Self {
        Self::new()
    }
}
125
126impl Pipeline {
127    pub fn new() -> Self {
128        Self {
129            registry: PlatformRegistry::new(),
130            gcc_neutralizer: GccNeutralizer::new(),
131        }
132    }
133
134    /// Process source code through the pipeline
135    pub fn process(&self, source: &str, config: &PipelineConfig) -> PipelineResult {
136        let mut stats = PipelineStats {
137            original_length: source.len(),
138            ..Default::default()
139        };
140
141        // Step 1: Platform detection
142        let platform = if let Some(ref forced) = config.force_platform {
143            DetectionResult {
144                platform_id: forced.clone(),
145                confidence: 1.0,
146                matched_patterns: vec!["forced".to_string()],
147            }
148        } else {
149            self.registry.detect(source)
150        };
151
152        let platform_module = self.registry.get(&platform.platform_id);
153
154        // Step 2: Header stub injection
155        let mut processed = source.to_string();
156        if config.inject_stubs {
157            if let Some(module) = platform_module {
158                let stubs = module.header_stubs().get_for_includes(source);
159                if !stubs.is_empty() {
160                    stats.stubs_injected = stubs.lines().filter(|l| l.contains("typedef")).count();
161                    processed = format!("{stubs}\n{processed}");
162                }
163            }
164        }
165
166        // Step 3: Conditional evaluation
167        let (processed, conditionals_stripped) =
168            evaluate_conditionals(&processed, config.conditional_strategy.clone());
169        stats.conditionals_stripped = conditionals_stripped;
170
171        // Step 4: GCC extension neutralization
172        let (processed, transformations) = if config.neutralize_gcc {
173            let result = self.gcc_neutralizer.neutralize(&processed);
174            stats.gcc_neutralized = result.transformations.len();
175            (result.code, result.transformations)
176        } else {
177            (processed, Vec::new())
178        };
179
180        // Step 5: Attribute stripping
181        let processed = if config.strip_attributes {
182            if let Some(module) = platform_module {
183                let (stripped, count) =
184                    Self::strip_attributes(&processed, module.attributes_to_strip());
185                stats.attributes_stripped = count;
186                stripped
187            } else {
188                processed
189            }
190        } else {
191            processed
192        };
193
194        // Step 6: Macro neutralization (kernel macros like likely/unlikely, BUILD_BUG_ON, etc.)
195        let processed = if config.neutralize_macros {
196            let mut macro_neutralizer = MacroNeutralizer::new();
197            let result = macro_neutralizer.neutralize(&processed);
198            stats.macro_stats = macro_neutralizer.stats().clone();
199            result
200        } else {
201            processed
202        };
203
204        stats.processed_length = processed.len();
205
206        PipelineResult {
207            source: processed,
208            platform,
209            transformations,
210            stats,
211        }
212    }
213
214    /// Strip platform-specific attributes from source
215    fn strip_attributes(source: &str, attributes: &[&str]) -> (String, usize) {
216        let mut result = source.to_string();
217        let mut count = 0;
218
219        for attr in attributes {
220            // Count occurrences before stripping
221            let before_count = result.matches(attr).count();
222
223            // Handle both plain attributes and function-like attributes
224            let patterns = [format!("{attr} "), format!("{attr}\t"), format!("{attr}(")];
225
226            for pattern in &patterns {
227                while result.contains(pattern.as_str()) {
228                    if pattern.ends_with('(') {
229                        // For function-like attributes, find and remove with parentheses
230                        if let Some(start) = result.find(attr) {
231                            if let Some(paren_start) = result[start..].find('(') {
232                                let abs_paren = start + paren_start;
233                                let mut depth = 1;
234                                let mut end = abs_paren + 1;
235                                for (i, c) in result[abs_paren + 1..].char_indices() {
236                                    match c {
237                                        '(' => depth += 1,
238                                        ')' => {
239                                            depth -= 1;
240                                            if depth == 0 {
241                                                end = abs_paren + 1 + i + 1;
242                                                break;
243                                            }
244                                        }
245                                        _ => {}
246                                    }
247                                }
248                                result = format!("{}{}", &result[..start], &result[end..]);
249                            } else {
250                                break;
251                            }
252                        } else {
253                            break;
254                        }
255                    } else {
256                        result = result.replacen(pattern, "", 1);
257                    }
258                }
259            }
260
261            let after_count = result.matches(attr).count();
262            count += before_count.saturating_sub(after_count);
263        }
264
265        (result, count)
266    }
267
    /// Get the platform registry for direct access.
    ///
    /// Returns a shared borrow of the registry this pipeline uses for
    /// detection and platform lookup.
    pub fn registry(&self) -> &PlatformRegistry {
        &self.registry
    }
272
273    /// Get header stubs for a platform
274    pub fn get_stubs(&self, platform_id: &str) -> Option<&HeaderStubs> {
275        self.registry.get(platform_id).map(|p| p.header_stubs())
276    }
277}
278
#[cfg(test)]
mod tests {
    //! Tests for the pipeline configuration presets and for each processing
    //! stage, mostly exercised in isolation by enabling a single stage.

    use super::*;

    // The default preset enables the optional stages checked here.
    #[test]
    fn test_pipeline_default_config() {
        let config = PipelineConfig::default();
        assert!(config.inject_stubs);
        assert!(config.neutralize_gcc);
        assert!(config.strip_attributes);
    }

    // The minimal preset disables the optional stages checked here.
    #[test]
    fn test_pipeline_minimal_config() {
        let config = PipelineConfig::minimal();
        assert!(!config.inject_stubs);
        assert!(!config.neutralize_gcc);
        assert!(!config.strip_attributes);
    }

    // The kernel preset enables processing and forces the Linux platform.
    #[test]
    fn test_pipeline_kernel_config() {
        let config = PipelineConfig::for_kernel_code();
        assert!(config.inject_stubs);
        assert!(config.neutralize_gcc);
        assert_eq!(config.force_platform, Some("linux".to_string()));
    }

    // With the minimal preset, plain C passes through unchanged.
    #[test]
    fn test_pipeline_basic_processing() {
        let pipeline = Pipeline::new();
        let config = PipelineConfig::minimal();

        let source = "int main() { return 0; }";
        let result = pipeline.process(source, &config);

        assert_eq!(result.source, source);
        assert_eq!(result.stats.original_length, source.len());
    }

    // End-to-end run with the default preset on a small Linux module:
    // detection, stub availability and `__init` stripping.
    #[test]
    fn test_pipeline_with_linux_detection() {
        let pipeline = Pipeline::new();
        let config = PipelineConfig::default();

        let source = r#"
#include <linux/module.h>
#include <linux/kernel.h>

MODULE_LICENSE("GPL");

static int __init my_init(void) {
    printk(KERN_INFO "Hello\n");
    return 0;
}
"#;

        let result = pipeline.process(source, &config);

        // Should detect Linux platform
        assert_eq!(result.platform.platform_id, "linux");
        assert!(result.platform.confidence > 0.5);

        // Verify stubs were available for the headers
        let stubs = pipeline.get_stubs("linux");
        assert!(stubs.is_some());
        let stubs = stubs.unwrap();
        assert!(stubs.has_stub("linux/module.h"));
        assert!(stubs.has_stub("linux/kernel.h"));

        // Processed source should have type definitions from stubs
        // Note: stubs_injected counts typedef lines
        // NOTE(review): this passes if any one of the three conditions holds,
        // so it is a weak check on what was actually injected.
        assert!(
            result.source.contains("extern int printk")
                || result.source.contains("typedef")
                || result.stats.stubs_injected > 0
        );

        // Should have stripped __init attribute
        assert!(!result.source.contains("__init"));
    }

    // Only the conditional stage is active: an `#if 0` block must disappear
    // while the surrounding declarations stay.
    #[test]
    fn test_pipeline_conditional_stripping() {
        let pipeline = Pipeline::new();
        let config = PipelineConfig {
            inject_stubs: false,
            conditional_strategy: ConditionalStrategy::EvaluateSimple,
            neutralize_gcc: false,
            strip_attributes: false,
            neutralize_macros: false,
            force_platform: None,
        };

        let source = r#"
int a;
#if 0
int b;
#endif
int c;
"#;

        let result = pipeline.process(source, &config);

        // Should have stripped the #if 0 block
        assert!(result.source.contains("int a;"));
        assert!(!result.source.contains("int b;"));
        assert!(result.source.contains("int c;"));
        assert!(result.stats.conditionals_stripped > 0);
    }

    // Only GCC neutralization is active: `__attribute__` must be removed and counted.
    #[test]
    fn test_pipeline_gcc_neutralization() {
        let pipeline = Pipeline::new();
        let config = PipelineConfig {
            inject_stubs: false,
            conditional_strategy: ConditionalStrategy::KeepAll,
            neutralize_gcc: true,
            strip_attributes: false,
            neutralize_macros: false,
            force_platform: None,
        };

        let source = "void __attribute__((packed)) foo(void) {}";
        let result = pipeline.process(source, &config);

        assert!(!result.source.contains("__attribute__"));
        assert!(result.stats.gcc_neutralized > 0);
    }

    // Only attribute stripping is active; the platform is forced to Linux so
    // that a platform module (and its attribute list) is available.
    #[test]
    fn test_pipeline_attribute_stripping() {
        let pipeline = Pipeline::new();
        let config = PipelineConfig {
            inject_stubs: false,
            conditional_strategy: ConditionalStrategy::KeepAll,
            neutralize_gcc: false,
            strip_attributes: true,
            neutralize_macros: false,
            force_platform: Some("linux".to_string()),
        };

        let source = "static __init int my_init(void) { return 0; }";
        let result = pipeline.process(source, &config);

        assert!(!result.source.contains("__init"));
        assert!(result.stats.attributes_stripped > 0);
    }

    // A forced platform is reported verbatim with confidence 1.0, even for
    // source that carries no platform hints.
    #[test]
    fn test_pipeline_forced_platform() {
        let pipeline = Pipeline::new();
        let config = PipelineConfig {
            force_platform: Some("linux".to_string()),
            ..PipelineConfig::minimal()
        };

        let source = "int main() { return 0; }";
        let result = pipeline.process(source, &config);

        assert_eq!(result.platform.platform_id, "linux");
        assert!((result.platform.confidence - 1.0).abs() < f32::EPSILON);
    }

    // Smoke test: the length statistics are populated after a kernel-preset run.
    #[test]
    fn test_pipeline_stats() {
        let pipeline = Pipeline::new();
        let config = PipelineConfig::for_kernel_code();

        let source = r#"
#include <linux/types.h>

static __init int my_init(void) {
    u32 x = 0;
    return x;
}
"#;

        let result = pipeline.process(source, &config);

        // Check stats
        assert!(result.stats.original_length > 0);
        assert!(result.stats.processed_length > 0);
    }
}
463}