/// Lexical context the scanner is in at a given character position.
///
/// Only characters seen in [`LexicalState::Code`] are inspected for braces;
/// every other variant describes a region whose contents are skipped until
/// its terminator is found.
#[derive(Debug, Clone, PartialEq, Eq)]
enum LexicalState {
    /// Ordinary source code.
    Code,
    /// Remainder of the current line after a line-comment marker.
    LineComment,
    /// Inside a block comment; `depth` is the nesting level (1 = outermost).
    BlockComment { depth: u32 },
    /// Inside a string closed by `quote`. When `escapable` is true a
    /// backslash causes the following character to be skipped.
    StringLiteral { quote: char, escapable: bool },
    /// Inside a raw string that is closed by `"` followed by `hashes` `#`s.
    RawString { hashes: u32 },
    /// Inside a triple-quoted string closed by three consecutive `quote`s.
    TripleQuote { quote: char },
    /// Inside a single-quoted character literal.
    CharLiteral,
    /// Inside a backtick-delimited template literal.
    TemplateLiteral,
}
/// Which raw / multi-line string syntax a language offers, if any.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RawStringSyntax {
    /// The language has no raw-string syntax the scanner needs to know about.
    None,
    /// `r"…"` / `r#"…"#` strings, optionally prefixed with `b`.
    RustHashes,
    /// Strings delimited by three identical quote characters (`"""` or `'''`).
    TripleQuote,
    /// Backtick-delimited strings in which backslashes are not escapes.
    GoBacktick,
}
/// Per-language switches that tell the scanner which comment and literal
/// forms it has to recognise.
#[derive(Debug, Clone)]
pub struct LexicalRules {
    /// Marker that starts a comment running to the end of the line, such as
    /// `"//"` or `"#"`. An empty string disables line comments.
    pub line_comment: &'static str,
    /// Whether `/* … */` block comments are recognised.
    pub block_comments: bool,
    /// Whether a `/*` inside a block comment opens a further nesting level.
    pub nested_comments: bool,
    /// Whether `'` introduces character literals. When false, `'` delimits an
    /// ordinary escapable string instead.
    pub char_literals: bool,
    /// Raw / multi-line string syntax supported by the language.
    pub raw_strings: RawStringSyntax,
    /// Whether backtick-delimited template literals are recognised.
    pub template_literals: bool,
}
impl LexicalRules {
    /// Rules for Rust: C-style comments that nest, character literals
    /// (distinguished from lifetimes by the scanner) and `r#"…"#` raw strings.
    pub fn rust() -> Self {
        Self {
            nested_comments: true,
            raw_strings: RawStringSyntax::RustHashes,
            ..Self::c()
        }
    }

    /// Rules for C: `//` line comments, non-nesting `/* … */` block comments,
    /// character literals, and no raw-string or template-literal syntax.
    ///
    /// This is the baseline the other C-family presets are derived from.
    pub fn c() -> Self {
        Self {
            line_comment: "//",
            block_comments: true,
            nested_comments: false,
            char_literals: true,
            raw_strings: RawStringSyntax::None,
            template_literals: false,
        }
    }

    /// Rules for C++; identical to [`LexicalRules::c`].
    pub fn cpp() -> Self {
        Self::c()
    }

    /// Rules for Go: the C baseline plus backtick-delimited raw strings.
    pub fn go() -> Self {
        Self {
            raw_strings: RawStringSyntax::GoBacktick,
            ..Self::c()
        }
    }

    /// Rules for Java: the C baseline plus triple-quoted text blocks.
    ///
    /// Text blocks (`"""…"""`) may contain unescaped `"` characters. Without
    /// triple-quote handling the opening `"""` is read as an empty string
    /// followed by the start of a one-quote string, so every embedded `"`
    /// flips the scanner between string and code.
    pub fn java() -> Self {
        Self {
            raw_strings: RawStringSyntax::TripleQuote,
            ..Self::c()
        }
    }

    /// Rules for JavaScript: the C baseline, except that `'` delimits strings
    /// rather than character literals and backtick template literals exist.
    pub fn javascript() -> Self {
        Self {
            char_literals: false,
            template_literals: true,
            ..Self::c()
        }
    }

    /// Rules for TypeScript; identical to [`LexicalRules::javascript`].
    pub fn typescript() -> Self {
        Self::javascript()
    }

    /// Rules for Python: `#` line comments, no block comments, `'` and `"`
    /// strings, and triple-quoted strings.
    pub fn python() -> Self {
        Self {
            line_comment: "#",
            block_comments: false,
            nested_comments: false,
            char_literals: false,
            raw_strings: RawStringSyntax::TripleQuote,
            template_literals: false,
        }
    }
}
/// What the scanner learned about a single line of source.
#[derive(Debug, Clone)]
pub struct ScannedLine {
    /// Line number the caller supplied for this line.
    pub line_number: u32,
    /// Opening braces minus closing braces found in code on this line.
    pub brace_delta: i32,
    /// Scanner brace depth once the line has been processed (never below 0).
    pub depth_after: u32,
    /// True when the line began inside a multi-line comment or string and no
    /// character on it was scanned as code.
    pub is_non_code: bool,
}
/// Line-by-line scanner that tracks brace depth while skipping comments and
/// string/character literals.
///
/// The lexical state and the depth are carried from one scanned line to the
/// next, so lines must be fed in source order.
pub struct CodeScanner {
    /// Language configuration chosen at construction time.
    rules: LexicalRules,
    /// Lexical context at the current scan position.
    state: LexicalState,
    /// Number of currently open braces, saturating at zero.
    depth: u32,
}
impl CodeScanner {
    /// How many characters after the first escaped character are searched for
    /// the closing `'` of an escaped character literal. Ten covers the longest
    /// common forms (`'\u{10FFFF}'`, `'\U0001F600'`) with a little slack.
    const MAX_ESCAPE_TAIL: usize = 10;

    /// Creates a scanner that starts in code at brace depth 0.
    pub fn new(rules: LexicalRules) -> Self {
        Self {
            rules,
            state: LexicalState::Code,
            depth: 0,
        }
    }

    /// Scans one line (without its line terminator) and reports its braces.
    ///
    /// Multi-line constructs (block comments, strings, raw strings, template
    /// literals) carry over to the next call, so lines must be supplied in
    /// source order. `line_number` is copied into the result unchanged.
    ///
    /// Only braces seen in code are counted; `brace_delta` is their net count
    /// for this line and `depth_after` the running depth (saturating at 0).
    pub fn scan_line(&mut self, line_number: u32, line: &str) -> ScannedLine {
        let chars: Vec<char> = line.chars().collect();
        let started_in_code = self.state == LexicalState::Code;
        let mut saw_code = false;
        let mut delta: i32 = 0;
        let mut i = 0;

        while i < chars.len() {
            // Every payload is `Copy`, so the clone is cheap and releases the
            // borrow on `self` that the step helpers need.
            i = match self.state.clone() {
                LexicalState::Code => {
                    saw_code = true;
                    self.step_code(&chars, i, &mut delta)
                }
                LexicalState::LineComment => break,
                LexicalState::BlockComment { depth } => {
                    self.step_block_comment(&chars, i, depth)
                }
                LexicalState::StringLiteral { quote, escapable } => {
                    self.step_quoted(&chars, i, quote, escapable)
                }
                LexicalState::RawString { hashes } => self.step_raw_string(&chars, i, hashes),
                LexicalState::TripleQuote { quote } => self.step_triple_quote(&chars, i, quote),
                LexicalState::CharLiteral => self.step_quoted(&chars, i, '\'', true),
                LexicalState::TemplateLiteral => self.step_quoted(&chars, i, '`', true),
            };
        }

        // Line comments end with the line. Character literals cannot span
        // lines either, so an unterminated one must not swallow what follows.
        if matches!(
            self.state,
            LexicalState::LineComment | LexicalState::CharLiteral
        ) {
            self.state = LexicalState::Code;
        }

        ScannedLine {
            line_number,
            brace_delta: delta,
            depth_after: self.depth,
            is_non_code: !started_in_code && !saw_code,
        }
    }

    /// Scans every line of `content`, numbering lines from 1.
    ///
    /// # Panics
    ///
    /// Panics if `content` has more lines than fit in a `u32`.
    pub fn scan_all(&mut self, content: &str) -> Vec<ScannedLine> {
        content
            .lines()
            .enumerate()
            .map(|(i, line)| self.scan_line(u32::try_from(i + 1).expect("line within u32"), line))
            .collect()
    }

    /// Returns the brace depth after everything scanned so far.
    pub fn current_depth(&self) -> u32 {
        self.depth
    }

    /// Handles the character at `i` while in code and returns the next index.
    ///
    /// Checks, in priority order: line comment, block comment, the configured
    /// raw-string syntax, template literal, character literal / quotes, and
    /// finally braces (which update `delta` and the running depth).
    fn step_code(&mut self, chars: &[char], i: usize, delta: &mut i32) -> usize {
        let c = chars[i];
        let rest = &chars[i..];

        if Self::starts_with_str(rest, self.rules.line_comment) {
            self.state = LexicalState::LineComment;
            return chars.len();
        }
        if self.rules.block_comments && rest.starts_with(&['/', '*']) {
            self.state = LexicalState::BlockComment { depth: 1 };
            return i + 2;
        }

        match self.rules.raw_strings {
            RawStringSyntax::RustHashes => {
                if let Some(hashes) = self.try_rust_raw_string(chars, i) {
                    self.state = LexicalState::RawString { hashes };
                    // Skip the optional `b`, the `r`, the hashes and the `"`.
                    let prefix = if c == 'b' { 2 } else { 1 };
                    // Widening u32 -> usize; the count came from a slice length.
                    return i + prefix + hashes as usize + 1;
                }
            }
            RawStringSyntax::TripleQuote => {
                if (c == '"' || c == '\'') && rest.starts_with(&[c; 3]) {
                    self.state = LexicalState::TripleQuote { quote: c };
                    return i + 3;
                }
            }
            RawStringSyntax::GoBacktick => {
                if c == '`' {
                    self.state = LexicalState::StringLiteral {
                        quote: '`',
                        escapable: false,
                    };
                    return i + 1;
                }
            }
            RawStringSyntax::None => {}
        }

        if self.rules.template_literals && c == '`' {
            self.state = LexicalState::TemplateLiteral;
            return i + 1;
        }

        match c {
            '\'' if self.rules.char_literals => {
                // Otherwise the quote belongs to something like a Rust
                // lifetime and is skipped as plain code.
                if self.is_char_literal(chars, i) {
                    self.state = LexicalState::CharLiteral;
                }
            }
            // Reached for `'` only when the language has no char literals.
            // A `b"…"` byte string needs no special case: the `b` is skipped
            // as code and the `"` that follows opens the string here.
            '"' | '\'' => {
                self.state = LexicalState::StringLiteral {
                    quote: c,
                    escapable: true,
                };
            }
            '{' => {
                *delta += 1;
                self.depth = self.depth.saturating_add(1);
            }
            '}' => {
                *delta -= 1;
                self.depth = self.depth.saturating_sub(1);
            }
            _ => {}
        }
        i + 1
    }

    /// Handles the character at `i` inside a block comment nested `depth`
    /// levels deep and returns the next index.
    fn step_block_comment(&mut self, chars: &[char], i: usize, depth: u32) -> usize {
        let rest = &chars[i..];
        if rest.starts_with(&['*', '/']) {
            self.state = if depth <= 1 {
                LexicalState::Code
            } else {
                LexicalState::BlockComment { depth: depth - 1 }
            };
            return i + 2;
        }
        if self.rules.nested_comments && rest.starts_with(&['/', '*']) {
            self.state = LexicalState::BlockComment {
                depth: depth.saturating_add(1),
            };
            return i + 2;
        }
        i + 1
    }

    /// Handles the character at `i` inside a literal closed by `quote` and
    /// returns the next index. When `escapable`, a backslash skips the
    /// character after it (possibly running past the end of the line).
    fn step_quoted(&mut self, chars: &[char], i: usize, quote: char, escapable: bool) -> usize {
        let c = chars[i];
        if escapable && c == '\\' {
            return i + 2;
        }
        if c == quote {
            self.state = LexicalState::Code;
        }
        i + 1
    }

    /// Handles the character at `i` inside a raw string that closes with `"`
    /// followed by `hashes` `#` characters, and returns the next index.
    fn step_raw_string(&mut self, chars: &[char], i: usize, hashes: u32) -> usize {
        if chars[i] == '"' {
            // Widening u32 -> usize.
            let wanted = hashes as usize;
            let found = chars[i + 1..]
                .iter()
                .take(wanted)
                .take_while(|&&c| c == '#')
                .count();
            if found == wanted {
                self.state = LexicalState::Code;
                return i + 1 + found;
            }
        }
        i + 1
    }

    /// Handles the character at `i` inside a triple-quoted string and returns
    /// the next index.
    fn step_triple_quote(&mut self, chars: &[char], i: usize, quote: char) -> usize {
        if chars[i..].starts_with(&[quote; 3]) {
            self.state = LexicalState::Code;
            i + 3
        } else {
            i + 1
        }
    }

    /// Returns true when `chars` begins with the non-empty string `prefix`.
    fn starts_with_str(chars: &[char], prefix: &str) -> bool {
        let mut remaining = chars.iter();
        !prefix.is_empty() && prefix.chars().all(|expected| remaining.next() == Some(&expected))
    }

    /// If a Rust raw string (`r"`, `r#"`, `br"`, …) starts at `i`, returns
    /// the number of `#` characters in its opening delimiter.
    fn try_rust_raw_string(&self, chars: &[char], i: usize) -> Option<u32> {
        let mut rest = chars.get(i..)?;
        if rest.first() == Some(&'b') {
            rest = &rest[1..];
        }
        if rest.first() != Some(&'r') {
            return None;
        }
        let after_r = &rest[1..];
        let hashes = after_r.iter().take_while(|&&c| c == '#').count();
        if after_r.get(hashes) == Some(&'"') {
            u32::try_from(hashes).ok()
        } else {
            None
        }
    }

    /// Decides whether the `'` at `i` opens a character literal rather than,
    /// for example, a Rust lifetime.
    ///
    /// * Unescaped: the quote must close right after one character (`'x'`).
    /// * Escaped (`'\…'`): a closing quote must follow within
    ///   [`Self::MAX_ESCAPE_TAIL`] characters of the first escaped character,
    ///   which covers multi-character escapes such as `'\x7b'` or `'\u{7B}'`.
    ///   A lifetime never starts with a backslash, so this cannot misfire on
    ///   one.
    fn is_char_literal(&self, chars: &[char], i: usize) -> bool {
        // The shortest literal, `'x'`, needs two characters after the quote.
        let body = match chars.get(i + 1..) {
            Some(body) if body.len() >= 2 => body,
            _ => return false,
        };
        if body[0] == '\\' {
            // `body[1]` is the escaped character (it may itself be `'`), so
            // the closing quote can only appear from `body[2]` onwards.
            return body[2..]
                .iter()
                .take(Self::MAX_ESCAPE_TAIL)
                .any(|&c| c == '\'');
        }
        body[1] == '\''
    }
}
/// Returns the index of the line that closes the block opened on
/// `scanned[open_idx]`.
///
/// The closing line is the first later line that has a negative
/// `brace_delta` and whose `depth_after` is one less than the opening line's
/// `depth_after` (saturating at 0). When `open_idx` is out of range or no such
/// line exists, the index of the last line is returned (0 for empty input).
pub fn find_closing_brace(scanned: &[ScannedLine], open_idx: usize) -> usize {
    let last_idx = scanned.len().saturating_sub(1);
    let opener = match scanned.get(open_idx) {
        Some(line) => line,
        None => return last_idx,
    };
    let wanted_depth = opener.depth_after.saturating_sub(1);
    let search_from = open_idx + 1;

    scanned[search_from..]
        .iter()
        .position(|candidate| candidate.brace_delta < 0 && candidate.depth_after == wanted_depth)
        .map_or(last_idx, |offset| search_from + offset)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Scans `source` with a fresh scanner configured by `rules`.
    fn scan(rules: LexicalRules, source: &str) -> Vec<ScannedLine> {
        CodeScanner::new(rules).scan_all(source)
    }

    /// Scans `source` with a fresh scanner using the Rust rules.
    fn scan_rust(source: &str) -> Vec<ScannedLine> {
        scan(LexicalRules::rust(), source)
    }

    /// Brace delta of the first line of `source` under `rules`.
    fn first_delta(rules: LexicalRules, source: &str) -> i32 {
        scan(rules, source)[0].brace_delta
    }

    /// Brace delta of the first line of `source` under the Rust rules.
    fn first_rust_delta(source: &str) -> i32 {
        first_delta(LexicalRules::rust(), source)
    }

    #[test]
    fn scanner_counts_braces_in_code() {
        let lines = scan_rust("fn foo() {\n bar();\n}");
        assert_eq!(lines[0].brace_delta, 1, "Opening brace on line 1");
        assert_eq!(lines[1].brace_delta, 0, "No braces on line 2");
        assert_eq!(lines[2].brace_delta, -1, "Closing brace on line 3");
        assert_eq!(lines[0].depth_after, 1);
        assert_eq!(lines[2].depth_after, 0);
    }

    #[test]
    fn scanner_ignores_braces_in_string_literal() {
        assert_eq!(
            first_rust_delta("let s = \"{\";"),
            0,
            "Brace inside string should not be counted"
        );
    }

    #[test]
    fn scanner_ignores_braces_in_line_comment() {
        assert_eq!(
            first_rust_delta("// { not a brace"),
            0,
            "Brace in line comment should not be counted"
        );
    }

    #[test]
    fn scanner_ignores_braces_in_block_comment() {
        assert_eq!(
            first_rust_delta("/* { } */"),
            0,
            "Braces in block comment should not be counted"
        );
    }

    #[test]
    fn scanner_ignores_braces_in_char_literal() {
        assert_eq!(
            first_rust_delta("let c = '{';"),
            0,
            "Brace in char literal should not be counted"
        );
    }

    #[test]
    fn scanner_handles_rust_raw_string() {
        assert_eq!(
            first_rust_delta("let s = r#\"{ }\"#;"),
            0,
            "Braces in raw string should not be counted"
        );
    }

    #[test]
    fn scanner_handles_nested_block_comments_rust() {
        let lines = scan_rust("/* outer /* inner { } */ still comment { } */\nlet x = 1;");
        let comment_line = &lines[0];
        assert_eq!(
            comment_line.brace_delta, 0,
            "Braces in nested block comment should not be counted"
        );
        assert!(
            comment_line.is_non_code || comment_line.brace_delta == 0,
            "Line should be non-code or have 0 delta"
        );
        assert!(!lines[1].is_non_code, "Line after comment is code");
    }

    #[test]
    fn scanner_non_nested_block_comments_c() {
        assert_eq!(
            first_delta(LexicalRules::c(), "/* start /* not nested */ {"),
            1,
            "After non-nested block comment ends, brace in code should be counted"
        );
    }

    #[test]
    fn scanner_handles_escape_in_string() {
        assert_eq!(
            first_rust_delta("let s = \"escaped \\\" still string {\";"),
            0,
            "Brace after escaped quote in string should not be counted"
        );
    }

    #[test]
    fn scanner_handles_multiline_string() {
        let lines = scan_rust("let s = \"hello\nworld { }\n\";");
        assert_eq!(lines.len(), 3);
        let middle = &lines[1];
        assert_eq!(
            middle.brace_delta, 0,
            "Braces on second line of multi-line string should not be counted"
        );
        assert!(
            middle.is_non_code,
            "Second line should be marked as non-code (inside string)"
        );
    }

    #[test]
    fn scanner_handles_js_template_literal() {
        assert_eq!(
            first_delta(LexicalRules::javascript(), "let s = `template { literal`;"),
            0,
            "Brace in template literal should not be counted"
        );
    }

    #[test]
    fn scanner_handles_go_backtick_raw_string() {
        assert_eq!(
            first_delta(LexicalRules::go(), "var s = `raw { string`"),
            0,
            "Brace in Go backtick raw string should not be counted"
        );
    }

    #[test]
    fn scanner_lifetime_not_char_literal() {
        assert_eq!(
            first_rust_delta("fn foo<'a>(x: &'a str) {"),
            1,
            "Opening brace should be counted; 'a is a lifetime, not char literal"
        );
    }

    #[test]
    fn scanner_mixed_code_and_string_on_one_line() {
        let lines = scan_rust("let x = \"{\"; let y = 1;");
        let only = &lines[0];
        assert!(
            !only.is_non_code,
            "Line with code and string should not be marked as non-code"
        );
        assert_eq!(
            only.brace_delta, 0,
            "Brace in string should not be counted"
        );
    }

    #[test]
    fn scanner_depth_tracks_across_lines() {
        let lines = scan_rust("fn foo() {\n if true {\n bar();\n }\n}");
        assert_eq!(lines[0].depth_after, 1, "After fn foo() {{");
        assert_eq!(lines[1].depth_after, 2, "After if true {{");
        assert_eq!(lines[2].depth_after, 2, "bar() stays at depth 2");
        assert_eq!(lines[3].depth_after, 1, "After inner }}");
        assert_eq!(lines[4].depth_after, 0, "After outer }}");
    }

    #[test]
    fn scanner_depth_saturates_at_zero() {
        let lines = scan_rust("}\n}");
        assert_eq!(
            lines[0].depth_after, 0,
            "Extra closing brace should saturate at 0"
        );
        assert_eq!(
            lines[1].depth_after, 0,
            "Second extra closing brace should still be 0"
        );
    }

    #[test]
    fn find_closing_brace_uses_scanned_depth() {
        let scanned = scan_rust("fn foo() {\n if true {\n bar();\n }\n}");
        let outer_close = find_closing_brace(&scanned, 0);
        assert_eq!(outer_close, 4, "Closing brace for fn foo() should be at line 5");
        let inner_close = find_closing_brace(&scanned, 1);
        assert_eq!(inner_close, 3, "Closing brace for if true should be at line 4");
    }

    #[test]
    fn find_closing_brace_skips_braces_in_strings() {
        let scanned = scan_rust("fn foo() {\n let s = \"}\";\n}");
        assert_eq!(
            find_closing_brace(&scanned, 0),
            2,
            "Should skip brace in string and find actual closing brace"
        );
    }
}