codegraph_c/pipeline/
gcc.rs

1//! GCC extension neutralization
2//!
3//! This module provides transformation of GCC-specific extensions into
4//! standard C that tree-sitter can parse. It tracks all transformations
5//! for position mapping back to the original source.
6
7use regex::Regex;
8use std::sync::LazyLock;
9
/// Type of transformation applied
///
/// Each [`Transformation`] carries one of these tags so that consumers can tell which
/// rewrite produced a given span.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum TransformKind {
    /// `__attribute__((...))` removal
    Attribute,
    /// `__extension__` removal
    Extension,
    /// `typeof(...)` → `int` replacement.
    ///
    /// `GccNeutralizer::neutralize` records this kind for every typeof spelling it
    /// rewrites (`typeof`, `__typeof__` and `__typeof`).
    Typeof,
    /// Statement expression `({ ... })` → `(0)`
    StatementExpression,
    /// `__asm__(...)` removal (the construct is replaced by a literal `0`)
    Asm,
    /// `__restrict` / `__restrict__` removal
    Restrict,
    /// `__inline__` / `__inline` removal
    Inline,
    /// `__volatile__` / `__volatile` removal
    Volatile,
    /// `__typeof__` → `int` replacement.
    ///
    /// Not produced by `GccNeutralizer::neutralize` in this file, which tags
    /// `__typeof__` rewrites as [`TransformKind::Typeof`].
    TypeofUnderscore,
    /// alignof/sizeof handling.
    ///
    /// In this file the tag is recorded for `__builtin_offsetof(...)` and
    /// `__builtin_types_compatible_p(...)`, both of which are replaced by `0`.
    AlignofSizeof,
}
34
/// Record of a transformation
///
/// One record is pushed for every span that `GccNeutralizer::neutralize` rewrites.
///
/// NOTE(review): `neutralize` fills both `*_start` fields with the match offset inside
/// its working buffer at the moment of the rewrite. Once an earlier rewrite has changed
/// the buffer's length, that offset no longer equals the offset in the pristine input —
/// confirm that position-mapping consumers account for this.
#[derive(Debug, Clone)]
pub struct Transformation {
    /// Byte offset in original source where transformation started
    /// (see the note on the struct about how `neutralize` measures it)
    pub original_start: usize,
    /// Byte length in original source that was transformed
    pub original_length: usize,
    /// Byte offset in transformed source
    /// (`neutralize` stores the same value here as in `original_start`)
    pub transformed_start: usize,
    /// Byte length in transformed source (0 when the span was simply removed)
    pub transformed_length: usize,
    /// Kind of transformation
    pub kind: TransformKind,
    /// Original text that was transformed
    pub original_text: String,
}
51
/// Result of neutralization
///
/// Returned by `GccNeutralizer::neutralize`: the rewritten text together with the log of
/// rewrites that produced it.
#[derive(Debug)]
pub struct NeutralizedSource {
    /// Transformed code
    pub code: String,
    /// List of transformations applied, in the order they were performed
    pub transformations: Vec<Transformation>,
}
60
/// GCC extension neutralizer
///
/// A stateless handle: all matching is done with module-level static regexes, so values
/// of this type are free to create, copy around and print.
#[derive(Debug, Clone)]
pub struct GccNeutralizer {
    // Patterns are defined as static LazyLock regexes
    // This struct is kept for future extensibility
}
66
67// Regex patterns compiled once
68static RE_ATTRIBUTE: LazyLock<Regex> =
69    LazyLock::new(|| Regex::new(r"__attribute__\s*\(\(").unwrap());
70static RE_EXTENSION: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"__extension__\s*").unwrap());
71static RE_TYPEOF: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"typeof\s*\(").unwrap());
72static RE_TYPEOF_UNDERSCORE: LazyLock<Regex> =
73    LazyLock::new(|| Regex::new(r"__typeof__\s*\(").unwrap());
74static RE_TYPEOF_SINGLE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"__typeof\s*\(").unwrap());
75static RE_ASM: LazyLock<Regex> =
76    LazyLock::new(|| Regex::new(r"__asm__\s*(?:volatile\s*)?\(").unwrap());
77static RE_ASM_VOLATILE: LazyLock<Regex> =
78    LazyLock::new(|| Regex::new(r"__asm\s+volatile\s*\(").unwrap());
79static RE_RESTRICT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"__restrict\s+").unwrap());
80static RE_RESTRICT_UNDERSCORE: LazyLock<Regex> =
81    LazyLock::new(|| Regex::new(r"__restrict__\s+").unwrap());
82static RE_INLINE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"__inline__\s+").unwrap());
83static RE_INLINE_SINGLE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"__inline\s+").unwrap());
84static RE_VOLATILE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"__volatile__\s+").unwrap());
85static RE_VOLATILE_SINGLE: LazyLock<Regex> =
86    LazyLock::new(|| Regex::new(r"__volatile\s+").unwrap());
87static RE_STATEMENT_EXPR: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\(\{").unwrap());
88static RE_BUILTIN_OFFSETOF: LazyLock<Regex> =
89    LazyLock::new(|| Regex::new(r"__builtin_offsetof\s*\(").unwrap());
90static RE_BUILTIN_TYPES_COMPATIBLE: LazyLock<Regex> =
91    LazyLock::new(|| Regex::new(r"__builtin_types_compatible_p\s*\(").unwrap());
92
/// `Default` simply delegates to [`GccNeutralizer::new`].
impl Default for GccNeutralizer {
    /// Returns the same value as [`GccNeutralizer::new`].
    fn default() -> Self {
        Self::new()
    }
}
98
99impl GccNeutralizer {
    /// Creates a neutralizer.
    ///
    /// The struct has no fields, so there is nothing to configure or initialise.
    pub fn new() -> Self {
        Self {}
    }
103
104    /// Neutralize GCC extensions in source code
105    pub fn neutralize(&self, source: &str) -> NeutralizedSource {
106        let mut code = source.to_string();
107        let mut transformations = Vec::new();
108
109        // Process each pattern type
110        // Order matters - some patterns may be nested
111
112        // 1. __attribute__((...))
113        while let Some(m) = RE_ATTRIBUTE.find(&code) {
114            let start = m.start();
115            if let Some((end, _original)) = Self::find_double_paren_end(&code, start + m.len()) {
116                let original_text = code[start..end].to_string();
117                transformations.push(Transformation {
118                    original_start: start,
119                    original_length: end - start,
120                    transformed_start: start,
121                    transformed_length: 0,
122                    kind: TransformKind::Attribute,
123                    original_text,
124                });
125                code = format!("{}{}", &code[..start], &code[end..]);
126            } else {
127                break;
128            }
129        }
130
131        // 2. __extension__
132        while let Some(m) = RE_EXTENSION.find(&code) {
133            let start = m.start();
134            let end = m.end();
135            transformations.push(Transformation {
136                original_start: start,
137                original_length: end - start,
138                transformed_start: start,
139                transformed_length: 0,
140                kind: TransformKind::Extension,
141                original_text: code[start..end].to_string(),
142            });
143            code = format!("{}{}", &code[..start], &code[end..]);
144        }
145
146        // 3. typeof(...) → int
147        for regex in [&*RE_TYPEOF, &*RE_TYPEOF_UNDERSCORE, &*RE_TYPEOF_SINGLE] {
148            while let Some(m) = regex.find(&code) {
149                let start = m.start();
150                if let Some((end, _)) = Self::find_matching_paren(&code, m.end() - 1) {
151                    let original_text = code[start..end].to_string();
152                    transformations.push(Transformation {
153                        original_start: start,
154                        original_length: end - start,
155                        transformed_start: start,
156                        transformed_length: 3, // "int"
157                        kind: TransformKind::Typeof,
158                        original_text,
159                    });
160                    code = format!("{}int{}", &code[..start], &code[end..]);
161                } else {
162                    break;
163                }
164            }
165        }
166
167        // 4. __asm__ volatile(...) - replace entire statement with empty
168        for regex in [&*RE_ASM, &*RE_ASM_VOLATILE] {
169            while let Some(m) = regex.find(&code) {
170                let start = m.start();
171                if let Some((end, _)) = Self::find_matching_paren(&code, m.end() - 1) {
172                    // Find the semicolon after the asm statement
173                    let stmt_end = code[end..].find(';').map(|i| end + i + 1).unwrap_or(end);
174                    let original_text = code[start..stmt_end].to_string();
175                    transformations.push(Transformation {
176                        original_start: start,
177                        original_length: stmt_end - start,
178                        transformed_start: start,
179                        transformed_length: 4, // "0 ; "
180                        kind: TransformKind::Asm,
181                        original_text,
182                    });
183                    // Replace with a simple expression statement
184                    code = format!("{}0{}", &code[..start], &code[stmt_end..]);
185                } else {
186                    break;
187                }
188            }
189        }
190
191        // 5. __restrict / __restrict__
192        for regex in [&*RE_RESTRICT, &*RE_RESTRICT_UNDERSCORE] {
193            while let Some(m) = regex.find(&code) {
194                let start = m.start();
195                let end = m.end();
196                transformations.push(Transformation {
197                    original_start: start,
198                    original_length: end - start,
199                    transformed_start: start,
200                    transformed_length: 0,
201                    kind: TransformKind::Restrict,
202                    original_text: code[start..end].to_string(),
203                });
204                code = format!("{}{}", &code[..start], &code[end..]);
205            }
206        }
207
208        // 6. __inline__ / __inline
209        for regex in [&*RE_INLINE, &*RE_INLINE_SINGLE] {
210            while let Some(m) = regex.find(&code) {
211                let start = m.start();
212                let end = m.end();
213                transformations.push(Transformation {
214                    original_start: start,
215                    original_length: end - start,
216                    transformed_start: start,
217                    transformed_length: 0,
218                    kind: TransformKind::Inline,
219                    original_text: code[start..end].to_string(),
220                });
221                code = format!("{}{}", &code[..start], &code[end..]);
222            }
223        }
224
225        // 7. __volatile__ / __volatile
226        for regex in [&*RE_VOLATILE, &*RE_VOLATILE_SINGLE] {
227            while let Some(m) = regex.find(&code) {
228                let start = m.start();
229                let end = m.end();
230                transformations.push(Transformation {
231                    original_start: start,
232                    original_length: end - start,
233                    transformed_start: start,
234                    transformed_length: 0,
235                    kind: TransformKind::Volatile,
236                    original_text: code[start..end].to_string(),
237                });
238                code = format!("{}{}", &code[..start], &code[end..]);
239            }
240        }
241
242        // 8. Statement expressions ({ ... }) → (0)
243        while let Some(m) = RE_STATEMENT_EXPR.find(&code) {
244            let start = m.start();
245            if let Some((end, _)) = Self::find_statement_expr_end(&code, start) {
246                let original_text = code[start..end].to_string();
247                transformations.push(Transformation {
248                    original_start: start,
249                    original_length: end - start,
250                    transformed_start: start,
251                    transformed_length: 3, // "(0)"
252                    kind: TransformKind::StatementExpression,
253                    original_text,
254                });
255                code = format!("{}(0){}", &code[..start], &code[end..]);
256            } else {
257                break;
258            }
259        }
260
261        // 9. __builtin_offsetof → 0
262        while let Some(m) = RE_BUILTIN_OFFSETOF.find(&code) {
263            let start = m.start();
264            if let Some((end, _)) = Self::find_matching_paren(&code, m.end() - 1) {
265                let original_text = code[start..end].to_string();
266                transformations.push(Transformation {
267                    original_start: start,
268                    original_length: end - start,
269                    transformed_start: start,
270                    transformed_length: 1, // "0"
271                    kind: TransformKind::AlignofSizeof,
272                    original_text,
273                });
274                code = format!("{}0{}", &code[..start], &code[end..]);
275            } else {
276                break;
277            }
278        }
279
280        // 10. __builtin_types_compatible_p → 0
281        while let Some(m) = RE_BUILTIN_TYPES_COMPATIBLE.find(&code) {
282            let start = m.start();
283            if let Some((end, _)) = Self::find_matching_paren(&code, m.end() - 1) {
284                let original_text = code[start..end].to_string();
285                transformations.push(Transformation {
286                    original_start: start,
287                    original_length: end - start,
288                    transformed_start: start,
289                    transformed_length: 1, // "0"
290                    kind: TransformKind::AlignofSizeof,
291                    original_text,
292                });
293                code = format!("{}0{}", &code[..start], &code[end..]);
294            } else {
295                break;
296            }
297        }
298
299        NeutralizedSource {
300            code,
301            transformations,
302        }
303    }
304
305    /// Find the end of a double-parenthesis expression like __attribute__((...))
306    fn find_double_paren_end(code: &str, start: usize) -> Option<(usize, String)> {
307        let bytes = code.as_bytes();
308        let mut depth = 2; // Already inside "(("
309        let mut i = start;
310
311        while i < bytes.len() && depth > 0 {
312            match bytes[i] {
313                b'(' => depth += 1,
314                b')' => depth -= 1,
315                b'"' => {
316                    // Skip string literal
317                    i += 1;
318                    while i < bytes.len() && bytes[i] != b'"' {
319                        if bytes[i] == b'\\' {
320                            i += 1;
321                        }
322                        i += 1;
323                    }
324                }
325                b'\'' => {
326                    // Skip char literal
327                    i += 1;
328                    while i < bytes.len() && bytes[i] != b'\'' {
329                        if bytes[i] == b'\\' {
330                            i += 1;
331                        }
332                        i += 1;
333                    }
334                }
335                _ => {}
336            }
337            i += 1;
338        }
339
340        if depth == 0 {
341            Some((i, code[start..i].to_string()))
342        } else {
343            None
344        }
345    }
346
347    /// Find the end of a parenthesized expression
348    fn find_matching_paren(code: &str, start: usize) -> Option<(usize, String)> {
349        let bytes = code.as_bytes();
350        if start >= bytes.len() || bytes[start] != b'(' {
351            return None;
352        }
353
354        let mut depth = 1;
355        let mut i = start + 1;
356
357        while i < bytes.len() && depth > 0 {
358            match bytes[i] {
359                b'(' => depth += 1,
360                b')' => depth -= 1,
361                b'"' => {
362                    // Skip string literal
363                    i += 1;
364                    while i < bytes.len() && bytes[i] != b'"' {
365                        if bytes[i] == b'\\' {
366                            i += 1;
367                        }
368                        i += 1;
369                    }
370                }
371                b'\'' => {
372                    // Skip char literal
373                    i += 1;
374                    while i < bytes.len() && bytes[i] != b'\'' {
375                        if bytes[i] == b'\\' {
376                            i += 1;
377                        }
378                        i += 1;
379                    }
380                }
381                _ => {}
382            }
383            i += 1;
384        }
385
386        if depth == 0 {
387            Some((i, code[start..i].to_string()))
388        } else {
389            None
390        }
391    }
392
393    /// Find the end of a statement expression ({ ... })
394    fn find_statement_expr_end(code: &str, start: usize) -> Option<(usize, String)> {
395        let bytes = code.as_bytes();
396        if start + 1 >= bytes.len() || bytes[start] != b'(' || bytes[start + 1] != b'{' {
397            return None;
398        }
399
400        let mut paren_depth = 1;
401        let mut brace_depth = 1;
402        let mut i = start + 2;
403
404        while i < bytes.len() && (paren_depth > 0 || brace_depth > 0) {
405            match bytes[i] {
406                b'(' => paren_depth += 1,
407                b')' => paren_depth -= 1,
408                b'{' => brace_depth += 1,
409                b'}' => brace_depth -= 1,
410                b'"' => {
411                    // Skip string literal
412                    i += 1;
413                    while i < bytes.len() && bytes[i] != b'"' {
414                        if bytes[i] == b'\\' {
415                            i += 1;
416                        }
417                        i += 1;
418                    }
419                }
420                b'\'' => {
421                    // Skip char literal
422                    i += 1;
423                    while i < bytes.len() && bytes[i] != b'\'' {
424                        if bytes[i] == b'\\' {
425                            i += 1;
426                        }
427                        i += 1;
428                    }
429                }
430                _ => {}
431            }
432            i += 1;
433        }
434
435        if paren_depth == 0 && brace_depth == 0 {
436            Some((i, code[start..i].to_string()))
437        } else {
438            None
439        }
440    }
441}
442
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs a freshly constructed neutralizer over `source`.
    fn run(source: &str) -> NeutralizedSource {
        GccNeutralizer::new().neutralize(source)
    }

    #[test]
    fn test_neutralize_attribute() {
        let result = run("void __attribute__((packed)) foo(void) {}");

        assert!(!result.code.contains("__attribute__"));
        assert!(result.code.contains("void  foo(void) {}"));
        assert!(!result.transformations.is_empty());
        assert_eq!(result.transformations[0].kind, TransformKind::Attribute);
    }

    #[test]
    fn test_neutralize_attribute_nested() {
        let result = run("void __attribute__((unused, aligned(16))) bar(void) {}");

        assert!(!result.code.contains("__attribute__"));
        assert!(result.code.contains("void  bar(void) {}"));
    }

    #[test]
    fn test_neutralize_extension() {
        let result = run("__extension__ union { int x; float y; };");

        assert!(!result.code.contains("__extension__"));
        assert!(result.code.contains("union { int x; float y; };"));
    }

    #[test]
    fn test_neutralize_typeof() {
        let result = run("typeof(foo) bar;");

        assert!(!result.code.contains("typeof"));
        assert!(result.code.contains("int bar;"));
    }

    #[test]
    fn test_neutralize_typeof_underscore() {
        let result = run("__typeof__(x) y;");

        assert!(!result.code.contains("__typeof__"));
        assert!(result.code.contains("int y;"));
    }

    #[test]
    fn test_neutralize_asm() {
        let result = run("void foo(void) { __asm__ volatile(\"nop\"); }");

        assert!(!result.code.contains("__asm__"));
        // The whole asm statement, semicolon included, collapses to a bare `0`.
        assert!(result.code.contains("{ 0 }"));
    }

    #[test]
    fn test_neutralize_restrict() {
        let result = run("void foo(int * __restrict p) {}");

        assert!(!result.code.contains("__restrict"));
        assert!(result.code.contains("int * p"));
    }

    #[test]
    fn test_neutralize_inline() {
        let result = run("__inline__ void foo(void) {}");

        assert!(!result.code.contains("__inline__"));
        assert!(result.code.contains("void foo(void)"));
    }

    #[test]
    fn test_neutralize_statement_expression() {
        let result = run("int x = ({ int y = 5; y + 1; });");

        assert!(!result.code.contains("({"));
        assert!(result.code.contains("int x = (0);"));
    }

    #[test]
    fn test_neutralize_builtin_offsetof() {
        let result = run("int off = __builtin_offsetof(struct foo, bar);");

        assert!(!result.code.contains("__builtin_offsetof"));
        assert!(result.code.contains("int off = 0;"));
    }

    #[test]
    fn test_neutralize_multiple() {
        let source = r#"
__extension__ struct {
    __attribute__((packed)) int x;
} __attribute__((aligned(16)));
"#;
        let result = run(source);

        assert!(!result.code.contains("__extension__"));
        assert!(!result.code.contains("__attribute__"));
    }

    #[test]
    fn test_transformation_tracking() {
        let result = run("__attribute__((unused)) int x;");

        assert!(!result.transformations.is_empty());
        let first = &result.transformations[0];
        assert_eq!(first.kind, TransformKind::Attribute);
        assert!(first.original_text.contains("__attribute__"));
    }

    #[test]
    fn test_no_false_positives() {
        let result = run("int attribute_count; int typeof_var;");

        // Ordinary identifiers that merely contain a keyword must survive.
        assert!(result.code.contains("attribute_count"));
        // `typeof_var` contains "typeof" but is never followed by "(".
        assert!(result.code.contains("typeof_var"));
    }

    #[test]
    fn test_string_literal_handling() {
        // Note: the regex-based implementation may still rewrite patterns that occur
        // inside string literals. That is a known limitation which a real tokenizer
        // could remove; this test only covers a literal without any GCC keyword.
        let result = run("char *s = \"test\";");

        assert!(result.code.contains("\"test\""));
    }
}
600}