Skip to main content

sloc_languages/
cpp_style.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright (C) 2026 Nima Shafie <nimzshafie@gmail.com>
3
4//! Lexical style-guide adherence analysis for C and C++ source files.
5//!
//! The analyser scans each file line by line and tallies observable
//! style signals: indentation type/width, brace placement, pointer-declarator
//! alignment, control-flow spacing, and line-length compliance.  These raw
//! counts are then scored against five well-known style guides (LLVM, Google,
//! Mozilla, Microsoft, WebKit) to produce per-file adherence percentages.
11//!
12//! All analysis is purely lexical — no AST is built — so results are
13//! approximate.  The intent is to give a quick directional signal, not an
14//! exact conformance report.
15
16use serde::{Deserialize, Serialize};
17
18// ─── Observable style signals ────────────────────────────────────────────────
19
/// Detected leading-whitespace style.
///
/// Serialized with `snake_case` variant names; the default value is
/// [`IndentStyle::Unknown`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "snake_case")]
pub enum IndentStyle {
    /// Indentation is predominantly done with tab characters.
    Tabs,
    /// Indentation is predominantly done with a 2-space unit.
    Spaces2,
    /// Indentation is predominantly done with a 4-space unit.
    Spaces4,
    /// 8-space indentation.
    // NOTE(review): the classifier in this file never returns this variant.
    Spaces8,
    /// No single indentation style dominates.
    Mixed,
    /// No indented lines were observed, so no style could be inferred.
    #[default]
    Unknown,
}
32
33impl IndentStyle {
34    pub fn display(self) -> &'static str {
35        match self {
36            Self::Tabs => "Tabs",
37            Self::Spaces2 => "2-Space",
38            Self::Spaces4 => "4-Space",
39            Self::Spaces8 => "8-Space",
40            Self::Mixed => "Mixed",
41            Self::Unknown => "—",
42        }
43    }
44}
45
/// Detected opening-brace placement style.
///
/// Serialized with `snake_case` variant names; the default value is
/// [`BraceStyle::Unknown`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "snake_case")]
pub enum BraceStyle {
    /// K&R / Attach — `{` on the same line as the preceding statement.
    Attach,
    /// Allman / "Broken" — `{` on its own line.
    Allman,
    /// Neither placement clearly dominates.
    Mixed,
    /// No opening braces were classified, so no style could be inferred.
    #[default]
    Unknown,
}
58
59impl BraceStyle {
60    pub fn display(self) -> &'static str {
61        match self {
62            Self::Attach => "K&R / Attach",
63            Self::Allman => "Allman",
64            Self::Mixed => "Mixed",
65            Self::Unknown => "—",
66        }
67    }
68}
69
/// Detected pointer / reference declarator alignment.
///
/// Serialized with `snake_case` variant names; the default value is
/// [`PointerStyle::Unknown`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "snake_case")]
pub enum PointerStyle {
    /// `Type* var` — `*` glued to the type keyword.
    WithType,
    /// `Type *var` — `*` glued to the variable name.
    WithName,
    /// `Type * var` — `*` in the middle (both sides spaced).
    // NOTE(review): the classifier in this file never returns this variant.
    Middle,
    /// Neither alignment clearly dominates.
    Mixed,
    /// No pointer/reference declarators were classified.
    #[default]
    Unknown,
}
84
85impl PointerStyle {
86    pub fn display(self) -> &'static str {
87        match self {
88            Self::WithType => "Type* var",
89            Self::WithName => "Type *var",
90            Self::Middle => "Type * var",
91            Self::Mixed => "Mixed",
92            Self::Unknown => "—",
93        }
94    }
95}
96
97// ─── Output types ─────────────────────────────────────────────────────────────
98
/// Adherence percentage for one named style guide.
///
/// One entry is produced per guide by the scoring table in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StyleGuideScore {
    /// Short guide name, e.g. `"LLVM"`, `"Google"`.
    pub name: String,
    /// Key characteristics used in scoring, as a human-readable summary.
    pub description: String,
    /// Computed adherence, 0–100 (rounded and clamped weighted sum).
    pub score_pct: u8,
}
109
/// Full lexical style analysis result for a single C/C++ file.
///
/// Holds the classified styles, the raw per-line signal counts they were
/// derived from, and the resulting per-guide adherence scores.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct CppStyleAnalysis {
    // ── classified styles ──────────────────────────────────────────────────
    /// Indentation style inferred from the tab / 2-space / 4-space line counts.
    pub indent_style: IndentStyle,
    /// Brace placement inferred from the Allman / attach brace counts.
    pub brace_style: BraceStyle,
    /// Pointer alignment inferred from the with-type / with-name counts.
    pub pointer_style: PointerStyle,

    // ── raw signal counts ─────────────────────────────────────────────────
    /// Lines that begin with a tab character.
    pub tab_indented_lines: u32,
    /// Lines whose leading-space count is divisible by 2 but not by 4.
    pub space2_indented_lines: u32,
    /// Lines whose (non-zero) leading-space count is divisible by 4.
    pub space4_indented_lines: u32,
    /// Lines that consist solely of `{` once trimmed.
    pub allman_braces: u32,
    /// Lines ending in ` {` whose preceding text looks like a block head.
    pub attach_braces: u32,
    /// `*` / `&` occurrences attached to the preceding word (`Type* var`).
    pub ptr_with_type: u32,
    /// `*` / `&` occurrences attached to the following word (`Type *var`).
    pub ptr_with_name: u32,
    /// `*` / `&` occurrences spaced on both sides.
    // NOTE(review): the scanner in this file never increments this counter.
    pub ptr_middle: u32,
    /// Lines containing a control-flow keyword followed by ` (`.
    pub space_before_paren: u32,
    /// Lines containing a control-flow keyword immediately followed by `(`.
    pub no_space_before_paren: u32,
    /// Lines longer than 80 bytes.
    pub lines_over_80: u32,
    /// Lines longer than 100 bytes.
    pub lines_over_100: u32,
    /// Length in bytes of the longest line.
    pub max_line_length: u32,
    /// Total number of lines examined.
    pub total_lines: u32,
    /// `true` when some line, once trimmed, is exactly `#pragma once`.
    pub has_pragma_once: bool,

    // ── style-guide scores ────────────────────────────────────────────────
    /// One score entry per supported style guide.
    pub guide_scores: Vec<StyleGuideScore>,
    /// Name of the guide with the highest adherence score.
    pub dominant_guide: String,
    /// Adherence percentage of the dominant guide (0–100).
    pub dominant_score_pct: u8,
}
142
143// ─── Public entry point ───────────────────────────────────────────────────────
144
145/// Analyse `text` for C/C++ coding-style signals and return a scored result.
146#[must_use]
147pub fn analyze_cpp_style(text: &str) -> CppStyleAnalysis {
148    let mut tab_lines = 0u32;
149    let mut sp2_lines = 0u32;
150    let mut sp4_lines = 0u32;
151    let mut allman = 0u32;
152    let mut attach = 0u32;
153    let mut ptr_type = 0u32;
154    let mut ptr_name = 0u32;
155    let mut ptr_mid = 0u32;
156    let mut space_paren = 0u32;
157    let mut nospace_paren = 0u32;
158    let mut over_80 = 0u32;
159    let mut over_100 = 0u32;
160    let mut max_len = 0u32;
161    let mut pragma_once = false;
162    let mut total = 0u32;
163
164    let lines: Vec<&str> = text.lines().collect();
165
166    for line in &lines {
167        total += 1;
168
169        // Line length (byte count — acceptable approximation for ASCII-dominant code)
170        let len = line.len() as u32;
171        if len > max_len {
172            max_len = len;
173        }
174        if len > 80 {
175            over_80 += 1;
176        }
177        if len > 100 {
178            over_100 += 1;
179        }
180
181        let trimmed = line.trim();
182
183        // #pragma once
184        if trimmed == "#pragma once" {
185            pragma_once = true;
186        }
187
188        // ── Indentation ──────────────────────────────────────────────────
189        scan_indent(line, &mut tab_lines, &mut sp2_lines, &mut sp4_lines);
190
191        // ── Brace placement ──────────────────────────────────────────────
192        scan_braces(trimmed, &mut allman, &mut attach);
193
194        // ── Control-flow spacing ─────────────────────────────────────────
195        scan_paren_spacing(trimmed, &mut space_paren, &mut nospace_paren);
196
197        // ── Pointer/reference declarator alignment ────────────────────────
198        scan_pointer_style(trimmed, &mut ptr_type, &mut ptr_name, &mut ptr_mid);
199    }
200
201    let indent_style = classify_indent(tab_lines, sp2_lines, sp4_lines);
202    let brace_style = classify_braces(allman, attach);
203    let pointer_style = classify_pointers(ptr_type, ptr_name, ptr_mid);
204
205    let guide_scores = compute_guide_scores(
206        indent_style,
207        brace_style,
208        pointer_style,
209        over_80,
210        over_100,
211        total,
212        space_paren,
213        nospace_paren,
214    );
215
216    let (dominant_guide, dominant_score_pct) = guide_scores
217        .iter()
218        .max_by_key(|s| s.score_pct)
219        .map(|s| (s.name.clone(), s.score_pct))
220        .unwrap_or_else(|| (String::from("Unknown"), 0));
221
222    CppStyleAnalysis {
223        indent_style,
224        brace_style,
225        pointer_style,
226        tab_indented_lines: tab_lines,
227        space2_indented_lines: sp2_lines,
228        space4_indented_lines: sp4_lines,
229        allman_braces: allman,
230        attach_braces: attach,
231        ptr_with_type: ptr_type,
232        ptr_with_name: ptr_name,
233        ptr_middle: ptr_mid,
234        space_before_paren: space_paren,
235        no_space_before_paren: nospace_paren,
236        lines_over_80: over_80,
237        lines_over_100: over_100,
238        max_line_length: max_len,
239        total_lines: total,
240        has_pragma_once: pragma_once,
241        guide_scores,
242        dominant_guide,
243        dominant_score_pct,
244    }
245}
246
247// ─── Signal scanners ──────────────────────────────────────────────────────────
248
/// Classify the leading whitespace of one (untrimmed) `line` and bump the
/// matching counter.
///
/// * `tabs` — incremented when the line starts with a tab.
/// * `sp4`  — incremented when the number of leading spaces is a non-zero
///   multiple of 4.
/// * `sp2`  — incremented when the number of leading spaces is even but not a
///   multiple of 4.
///
/// Lines that are empty or contain only whitespace carry no code and are
/// ignored, as are unindented lines and lines with an odd number of leading
/// spaces (usually alignment padding).
fn scan_indent(line: &str, tabs: &mut u32, sp2: &mut u32, sp4: &mut u32) {
    // Trailing-whitespace residue on otherwise blank lines is not evidence of
    // an indentation convention.
    if line.trim().is_empty() {
        return;
    }
    if line.starts_with('\t') {
        *tabs += 1;
        return;
    }

    let leading = line.bytes().take_while(|&b| b == b' ').count();

    // Classify by the base indent unit inferred from the count.
    // 4-space code will have lines indented 4, 8, 12 … spaces (all divisible by 4).
    // 2-space code will have lines indented 2, 4, 6 … spaces, so only its odd
    // nesting depths (2, 6, 10 …) are distinguishable from 4-space code.
    match leading {
        0 => {} // not indented
        n if n % 4 == 0 => *sp4 += 1,
        n if n % 2 == 0 => *sp2 += 1,
        _ => {} // odd count — alignment padding
    }
}
277
278fn scan_braces(trimmed: &str, allman: &mut u32, attach: &mut u32) {
279    // Allman: the entire (trimmed) line is just `{`
280    if trimmed == "{" {
281        *allman += 1;
282        return;
283    }
284    // Attach: line ends with ` {` and has content before it that suggests
285    // a control/function/class head.
286    if trimmed.ends_with(" {") || trimmed.ends_with("\t{") {
287        let head = &trimmed[..trimmed.len() - 2];
288        if !head.is_empty() && looks_like_block_head(head) {
289            *attach += 1;
290        }
291    }
292}
293
/// Decide whether `head` — the text preceding a trailing ` {` — reads like the
/// opener of a block (function, control flow, type, namespace …) rather than
/// an assignment or initialiser.
///
/// Trailing whitespace on `head` is ignored.  The check is purely textual:
/// the head must end with `)` or one of a few trailing tokens, or contain one
/// of the block-introducing keywords followed by a space.
fn looks_like_block_head(head: &str) -> bool {
    // Tokens that may directly precede `{` in a block opener.
    const TRAILING_TOKENS: [&str; 6] = ["else", "try", "do", "noexcept", "const", "override"];
    // class / struct / enum / namespace / extern "C" blocks.
    const INTRODUCERS: [&str; 5] = ["class ", "struct ", "enum ", "namespace ", "extern "];

    let stem = head.trim_end();

    stem.ends_with(')')
        || TRAILING_TOKENS.iter().any(|tok| stem.ends_with(*tok))
        || INTRODUCERS.iter().any(|kw| stem.contains(*kw))
}
317
/// Tally control-flow paren spacing for one trimmed line.
///
/// Looks for the keywords `if`, `while`, `for`, `switch` and `catch` followed
/// by `(`.  A keyword followed by ` (` marks the line as "with space"; a
/// keyword followed directly by `(` marks it as "no space".  Each counter is
/// incremented at most once per line, and a line may bump both.
///
/// A keyword only counts when it starts at a word boundary, so identifiers
/// that merely end in a keyword (`sleep_for(`, `wait_for(`,
/// `context_switch(`) are not mistaken for control flow.
fn scan_paren_spacing(trimmed: &str, with_space: &mut u32, no_space: &mut u32) {
    const KEYWORDS: [&str; 5] = ["if", "while", "for", "switch", "catch"];

    let bytes = trimmed.as_bytes();
    let mut found_with = false;
    let mut found_without = false;

    for kw in KEYWORDS.iter() {
        for (start, _) in trimmed.match_indices(*kw) {
            // Reject matches glued to a preceding identifier character.
            let glued = start > 0
                && (bytes[start - 1].is_ascii_alphanumeric() || bytes[start - 1] == b'_');
            if glued {
                continue;
            }
            // Inspect what immediately follows the keyword.
            match &bytes[start + kw.len()..] {
                [b'(', ..] => found_without = true,
                [b' ', b'(', ..] => found_with = true,
                _ => {}
            }
        }
    }

    if found_with {
        *with_space += 1;
    }
    if found_without {
        *no_space += 1;
    }
}
352
/// Scan a single (trimmed) line for pointer/reference declarator patterns.
///
/// We look for `*` and `&` characters whose neighbourhood suggests a
/// type-qualifier context rather than a dereference or multiplication.
///
/// * `with_type` is incremented for a `*` / `&` directly preceded by an
///   identifier character and followed by an identifier character or a space.
/// * `with_name` is incremented for a `*` / `&` preceded by a space and
///   directly followed by an identifier character.
/// * `_middle` is accepted for signature symmetry but never modified.
///
/// The scan is byte-based and heuristic: text inside string and character
/// literals is skipped, but an unspaced multiplication such as `a*b` still
/// matches the `with_type` pattern.
fn scan_pointer_style(trimmed: &str, with_type: &mut u32, with_name: &mut u32, _middle: &mut u32) {
    // Skip comment lines (including `*`-prefixed block-comment continuation
    // lines) and preprocessor directives — they produce false positives.
    if trimmed.starts_with("//")
        || trimmed.starts_with('*')
        || trimmed.starts_with("/*")
        || trimmed.starts_with('#')
    {
        return;
    }

    let bytes = trimmed.as_bytes();
    let len = bytes.len();
    let mut i = 0;
    // Literal-tracking state; a quote directly preceded by a backslash is
    // treated as escaped and does not toggle it.
    let mut in_str = false;
    let mut in_char = false;

    while i < len {
        let b = bytes[i];

        if b == b'"' && !in_char && (i == 0 || bytes[i - 1] != b'\\') {
            in_str = !in_str;
        }
        if b == b'\'' && !in_str && (i == 0 || bytes[i - 1] != b'\\') {
            in_char = !in_char;
        }
        // Ignore everything inside a literal (the opening quote included).
        if in_str || in_char {
            i += 1;
            continue;
        }

        if b == b'*' || b == b'&' {
            // Skip any two-character run of `*` / `&` (`**`, `&&`, `*&`, `&*`).
            if i + 1 < len && (bytes[i + 1] == b'*' || bytes[i + 1] == b'&') {
                i += 2;
                continue;
            }
            // Skip when the next byte is `=`, `/` or `>` — e.g. `*=`, `&=`,
            // the comment terminator `*/`, or a template-closing `*>`.
            if i + 1 < len && (bytes[i + 1] == b'=' || bytes[i + 1] == b'/' || bytes[i + 1] == b'>')
            {
                i += 2;
                continue;
            }
            // Skip when the previous byte is `=`, `/` or `-` — e.g. `=*p`,
            // the comment opener `/*`, or `-*p`.
            if i > 0 && (bytes[i - 1] == b'=' || bytes[i - 1] == b'/' || bytes[i - 1] == b'-') {
                i += 1;
                continue;
            }

            // Classify by the immediate neighbours of the `*` / `&`.
            let pre_word = i > 0 && (bytes[i - 1].is_ascii_alphanumeric() || bytes[i - 1] == b'_');
            let pre_space = i > 0 && bytes[i - 1] == b' ';
            let post_word =
                i + 1 < len && (bytes[i + 1].is_ascii_alphanumeric() || bytes[i + 1] == b'_');
            let post_space = i + 1 < len && bytes[i + 1] == b' ';

            // `Type*var` or `Type* var` → with_type
            if pre_word && (post_word || post_space) {
                *with_type += 1;
            }
            // ` *var` → with_name
            else if pre_space && post_word {
                *with_name += 1;
            }
            // ` * ` (spaced on both sides) is deliberately not counted:
            // it is indistinguishable from multiplication, so `_middle` is
            // left untouched and the classifier works from the other two
            // counters only.
        }

        i += 1;
    }
}
426
427// ─── Classifiers ─────────────────────────────────────────────────────────────
428
429fn classify_indent(tabs: u32, sp2: u32, sp4: u32) -> IndentStyle {
430    let total = tabs + sp2 + sp4;
431    if total == 0 {
432        return IndentStyle::Unknown;
433    }
434
435    let tab_pct = tabs as f32 / total as f32;
436    let s2_pct = sp2 as f32 / total as f32;
437    let s4_pct = sp4 as f32 / total as f32;
438
439    if tab_pct >= 0.60 {
440        return IndentStyle::Tabs;
441    }
442    if s4_pct >= 0.60 {
443        return IndentStyle::Spaces4;
444    }
445    if s2_pct >= 0.60 {
446        return IndentStyle::Spaces2;
447    }
448
449    // Disambiguate 4-space vs 2-space when both are present.
450    // In a 4-space codebase, deeply-nested lines appear as sp4 while 2-level lines
451    // also appear in sp4; sp2 only shows up for 1-level nesting in a 2-space codebase.
452    // Heuristic: if sp4 > sp2*2, the base unit is likely 4 spaces.
453    if sp4 > sp2 * 2 && sp4 > tabs {
454        return IndentStyle::Spaces4;
455    }
456    if sp2 > sp4 && sp2 > tabs {
457        return IndentStyle::Spaces2;
458    }
459
460    IndentStyle::Mixed
461}
462
463fn classify_braces(allman: u32, attach: u32) -> BraceStyle {
464    let total = allman + attach;
465    if total == 0 {
466        return BraceStyle::Unknown;
467    }
468    let a_pct = allman as f32 / total as f32;
469    let k_pct = attach as f32 / total as f32;
470    if a_pct >= 0.65 {
471        BraceStyle::Allman
472    } else if k_pct >= 0.65 {
473        BraceStyle::Attach
474    } else {
475        BraceStyle::Mixed
476    }
477}
478
479fn classify_pointers(with_type: u32, with_name: u32, _middle: u32) -> PointerStyle {
480    let total = with_type + with_name;
481    if total == 0 {
482        return PointerStyle::Unknown;
483    }
484    let t = with_type as f32 / total as f32;
485    let n = with_name as f32 / total as f32;
486    if t >= 0.65 {
487        PointerStyle::WithType
488    } else if n >= 0.65 {
489        PointerStyle::WithName
490    } else {
491        PointerStyle::Mixed
492    }
493}
494
495// ─── Scoring helpers ─────────────────────────────────────────────────────────
496
/// Turn weighted feature values into a 0–100 percentage.
///
/// Each element of `features` is a `(weight, value)` pair with `value`
/// expected in `[0.0, 1.0]`.  The products are summed, scaled by 100, rounded
/// to the nearest integer and clamped to `0..=100`.
fn weighted_score(features: &[(f32, f32)]) -> u8 {
    let mut acc = 0.0_f32;
    for &(weight, value) in features {
        acc += weight * value;
    }
    let pct = (acc * 100.0).round();
    pct.clamp(0.0, 100.0) as u8
}
503
504fn score_indent_2(s: IndentStyle) -> f32 {
505    match s {
506        IndentStyle::Spaces2 => 1.0,
507        IndentStyle::Mixed => 0.35,
508        _ => 0.05,
509    }
510}
511
512fn score_indent_4(s: IndentStyle) -> f32 {
513    match s {
514        IndentStyle::Spaces4 => 1.0,
515        IndentStyle::Mixed => 0.35,
516        _ => 0.05,
517    }
518}
519
/// Feature value for compliance with an 80-column limit.
///
/// `over` is the number of over-long lines out of `total`.  The fraction of
/// offending lines is mapped onto a stepped scale; a file with no lines gets
/// full credit.
fn score_line80(over: u32, total: u32) -> f32 {
    // (exclusive upper bound on the offending fraction, awarded value)
    const TIERS: [(f32, f32); 4] = [(0.02, 1.00), (0.08, 0.75), (0.20, 0.45), (0.40, 0.20)];
    const FLOOR: f32 = 0.05;

    if total == 0 {
        return 1.0;
    }
    let ratio = over as f32 / total as f32;
    TIERS
        .iter()
        .find(|&&(limit, _)| ratio < limit)
        .map_or(FLOOR, |&(_, value)| value)
}
538
/// Feature value for compliance with a 100-column limit.
///
/// `over` is the number of over-long lines out of `total`.  The fraction of
/// offending lines is mapped onto a stepped scale; a file with no lines gets
/// full credit.
fn score_line100(over: u32, total: u32) -> f32 {
    if total == 0 {
        return 1.0;
    }
    match over as f32 / total as f32 {
        ratio if ratio < 0.03 => 1.00,
        ratio if ratio < 0.10 => 0.75,
        ratio if ratio < 0.25 => 0.45,
        _ => 0.10,
    }
}
555
556fn score_attach(s: BraceStyle) -> f32 {
557    match s {
558        BraceStyle::Attach => 1.0,
559        BraceStyle::Mixed => 0.40,
560        BraceStyle::Allman => 0.05,
561        BraceStyle::Unknown => 0.50,
562    }
563}
564
565fn score_allman(s: BraceStyle) -> f32 {
566    match s {
567        BraceStyle::Allman => 1.0,
568        BraceStyle::Mixed => 0.40,
569        BraceStyle::Attach => 0.05,
570        BraceStyle::Unknown => 0.50,
571    }
572}
573
574fn score_ptr(detected: PointerStyle, expected: PointerStyle) -> f32 {
575    if detected == expected {
576        return 1.0;
577    }
578    match detected {
579        PointerStyle::Mixed => 0.40,
580        PointerStyle::Unknown => 0.50,
581        _ => 0.05,
582    }
583}
584
/// Feature value for "space before control-flow paren": the fraction of
/// observed lines that use the spaced form, or a neutral 0.50 when no such
/// line was observed.
fn score_space_paren(with_space: u32, no_space: u32) -> f32 {
    match with_space + no_space {
        0 => 0.50,
        observed => with_space as f32 / observed as f32,
    }
}
592
593// ─── Guide score table ────────────────────────────────────────────────────────
594
595#[allow(clippy::too_many_arguments)]
596fn compute_guide_scores(
597    indent: IndentStyle,
598    braces: BraceStyle,
599    ptrs: PointerStyle,
600    over_80: u32,
601    over_100: u32,
602    total: u32,
603    space_paren: u32,
604    no_space_paren: u32,
605) -> Vec<StyleGuideScore> {
606    let l80 = score_line80(over_80, total);
607    let l100 = score_line100(over_100, total);
608    let att = score_attach(braces);
609    let all = score_allman(braces);
610    let pt = score_ptr(ptrs, PointerStyle::WithType);
611    let pn = score_ptr(ptrs, PointerStyle::WithName);
612    let sp = score_space_paren(space_paren, no_space_paren);
613
614    // LLVM: 2-space, 80-col, K&R braces, `*var` pointer, space-before-paren
615    let llvm = weighted_score(&[
616        (0.28, score_indent_2(indent)),
617        (0.20, l80),
618        (0.24, att),
619        (0.15, pn),
620        (0.13, sp),
621    ]);
622
623    // Google: 2-space, 80-col, K&R braces, `Type*` pointer, space-before-paren
624    let google = weighted_score(&[
625        (0.25, score_indent_2(indent)),
626        (0.20, l80),
627        (0.25, att),
628        (0.18, pt),
629        (0.12, sp),
630    ]);
631
632    // Mozilla: 4-space, 80-col, mixed braces (partial credit for both), `Type*`
633    let moz_brace = match braces {
634        BraceStyle::Attach => 0.60,
635        BraceStyle::Allman => 0.45,
636        BraceStyle::Mixed => 0.80,
637        BraceStyle::Unknown => 0.50,
638    };
639    let mozilla = weighted_score(&[
640        (0.28, score_indent_4(indent)),
641        (0.20, l80),
642        (0.22, moz_brace),
643        (0.18, pt),
644        (0.12, sp),
645    ]);
646
647    // Microsoft: 4-space, Allman braces, lenient line length (100-col), `*var`
648    let microsoft = weighted_score(&[
649        (0.32, score_indent_4(indent)),
650        (0.36, all),
651        (0.16, l100),
652        (0.16, pn),
653    ]);
654
655    // WebKit: 4-space, 80-col, K&R braces, `Type*`, space-before-paren
656    let webkit = weighted_score(&[
657        (0.28, score_indent_4(indent)),
658        (0.20, l80),
659        (0.24, att),
660        (0.16, pt),
661        (0.12, sp),
662    ]);
663
664    vec![
665        StyleGuideScore {
666            name: "LLVM".to_string(),
667            description: "2-space indent \u{00b7} 80-col \u{00b7} K&R braces \u{00b7} *var pointer"
668                .to_string(),
669            score_pct: llvm,
670        },
671        StyleGuideScore {
672            name: "Google".to_string(),
673            description:
674                "2-space indent \u{00b7} 80-col \u{00b7} K&R braces \u{00b7} Type* pointer"
675                    .to_string(),
676            score_pct: google,
677        },
678        StyleGuideScore {
679            name: "Mozilla".to_string(),
680            description:
681                "4-space indent \u{00b7} 80-col \u{00b7} mixed braces \u{00b7} Type* pointer"
682                    .to_string(),
683            score_pct: mozilla,
684        },
685        StyleGuideScore {
686            name: "Microsoft".to_string(),
687            description:
688                "4-space indent \u{00b7} Allman braces \u{00b7} 100-col \u{00b7} *var pointer"
689                    .to_string(),
690            score_pct: microsoft,
691        },
692        StyleGuideScore {
693            name: "WebKit".to_string(),
694            description:
695                "4-space indent \u{00b7} 80-col \u{00b7} K&R braces \u{00b7} Type* pointer"
696                    .to_string(),
697            score_pct: webkit,
698        },
699    ]
700}