use std::collections::HashMap;
use std::path::{Path, PathBuf};
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use tracing::debug;
use tree_sitter::{Node, Tree};
use wide::{u8x32, u32x8, CmpEq};
use crate::callgraph::scanner::{ProjectScanner, ScanConfig};
use crate::error::{Result, BrrrError};
use crate::lang::LanguageRegistry;
/// Line-count metrics for one file or for an aggregate of files.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct LOCMetrics {
/// Total number of lines.
pub physical: u32,
/// Lines counted as source (code lines, including string-literal lines).
pub source: u32,
/// Logical size; the AST analyzer stores its statement count here.
pub logical: u32,
/// Lines that contain only a comment.
pub comment: u32,
/// Lines that are empty or whitespace-only.
pub blank: u32,
/// `source / comment`, or `0.0` when there are no comment lines.
pub code_to_comment_ratio: f64,
}
impl LOCMetrics {
    /// Builds a metrics record from raw line counts and derives
    /// `code_to_comment_ratio` (`source / comment`, `0.0` without comments).
    #[must_use]
    pub fn from_counts(physical: u32, source: u32, logical: u32, comment: u32, blank: u32) -> Self {
        let code_to_comment_ratio = match comment {
            0 => 0.0,
            comment_lines => f64::from(source) / f64::from(comment_lines),
        };
        Self {
            physical,
            source,
            logical,
            comment,
            blank,
            code_to_comment_ratio,
        }
    }

    /// Comment lines as a percentage of source lines (`0.0` when there is no
    /// source line).
    #[must_use]
    pub fn comment_density(&self) -> f64 {
        if self.source == 0 {
            return 0.0;
        }
        f64::from(self.comment) / f64::from(self.source) * 100.0
    }

    /// Blank lines as a percentage of physical lines (`0.0` for an empty
    /// record).
    #[must_use]
    pub fn blank_ratio(&self) -> f64 {
        if self.physical == 0 {
            return 0.0;
        }
        f64::from(self.blank) / f64::from(self.physical) * 100.0
    }
}
/// Size measurements for a single function found in a source file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FunctionSize {
/// Function name, or `<anonymous>` when no name node was found.
pub name: String,
/// File that contains the function.
pub file: PathBuf,
/// 1-based line on which the function starts.
pub line: usize,
/// 1-based line on which the function ends.
pub end_line: usize,
/// Source lines inside the function's line range.
pub sloc: u32,
/// Statement figure reported for the function's line range.
pub statements: u32,
/// Comment-only lines as a percentage of `sloc`.
pub comment_density: f64,
/// Whether `sloc` exceeds the size threshold in effect.
pub is_too_long: bool,
}
impl FunctionSize {
/// SLOC limit used when the caller does not supply a threshold; a function
/// is flagged as too long when its SLOC is strictly greater than this.
pub const DEFAULT_THRESHOLD: u32 = 50;
}
/// LOC analysis result for one file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileLOC {
/// Path of the analyzed file.
pub file: PathBuf,
/// Detected language name, or `None` when no language matched the path.
pub language: Option<String>,
/// Line metrics for the whole file.
pub metrics: LOCMetrics,
/// Per-function measurements (empty when the language is unknown).
pub functions: Vec<FunctionSize>,
/// Number of entries in `functions` flagged `is_too_long`.
pub oversized_functions: usize,
}
/// Metrics summed over all files of one language.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LanguageLOC {
/// Language name; files without a detected language are grouped as `unknown`.
pub language: String,
/// Number of files in the group.
pub file_count: usize,
/// Metrics summed over the group's files.
pub metrics: LOCMetrics,
/// Summed source lines divided by `file_count`.
pub avg_sloc_per_file: f64,
/// Summed per-function statement figures divided by the function count.
pub avg_statements_per_function: f64,
}
/// Project-wide totals and distribution statistics over the analyzed files.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LOCDistribution {
/// Sum of source lines over all files.
pub total_sloc: u32,
/// Sum of physical lines over all files.
pub total_physical: u32,
/// Sum of logical sizes over all files.
pub total_logical: u32,
/// Sum of comment-only lines over all files.
pub total_comment: u32,
/// Sum of blank lines over all files.
pub total_blank: u32,
/// `total_sloc / total_comment`, or `0.0` without comment lines.
pub code_to_comment_ratio: f64,
/// Blank lines as a percentage of physical lines.
pub blank_ratio: f64,
/// `total_sloc` divided by the number of files.
pub avg_sloc_per_file: f64,
/// Largest per-file source line count.
pub max_sloc: u32,
/// Smallest per-file source line count.
pub min_sloc: u32,
/// Median per-file source line count (integer mean of the two middle values
/// for an even number of files).
pub median_sloc: u32,
/// Number of functions found across all files.
pub total_functions: usize,
/// Summed function SLOC divided by `total_functions`.
pub avg_function_size: f64,
/// Number of functions flagged as too long across all files.
pub oversized_function_count: usize,
}
/// Complete result of a LOC analysis run over a file or directory.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LOCAnalysis {
/// Path that was analyzed.
pub path: PathBuf,
/// Language filter (directory runs) or detected language (single-file runs);
/// omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub language: Option<String>,
/// Per-file results for every file that was analyzed successfully.
pub files: Vec<FileLOC>,
/// Per-language aggregates, largest source line count first.
pub by_language: Vec<LanguageLOC>,
/// Totals and distribution statistics over `files`.
pub stats: LOCDistribution,
/// Files with the most source lines, largest first.
pub largest_files: Vec<FileRanking>,
/// Functions flagged as too long; omitted from serialized output when empty.
#[serde(skip_serializing_if = "Vec::is_empty")]
pub oversized_functions: Vec<FunctionSize>,
/// Files that could not be analyzed; omitted from serialized output when empty.
#[serde(skip_serializing_if = "Vec::is_empty")]
pub errors: Vec<LOCError>,
}
/// One entry in the "largest files" ranking.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileRanking {
/// Path of the ranked file.
pub file: PathBuf,
/// Source lines in the file.
pub sloc: u32,
/// The file's share of the total source lines, in percent.
pub percentage: f64,
}
/// A per-file failure recorded while analyzing a project.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LOCError {
/// File that could not be analyzed.
pub file: PathBuf,
/// Human-readable description of the failure.
pub message: String,
}
/// Classification of a single source line.
///
/// The enum is `#[repr(u8)]` with explicit discriminants because the counting
/// code reinterprets a `[LineType]` as raw bytes and compares against these
/// values.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
enum LineType {
/// Empty or whitespace-only line.
Blank = 0,
/// Line whose only content is a comment.
CommentOnly = 1,
/// Line containing code (the initial state of every line).
Code = 2,
/// Interior line of a multi-line string literal; counted as source.
StringLiteral = 3,
}
/// Per-line classification table for one file.
struct LineClassifier {
// One entry per line, indexed by 0-based line number.
line_types: Vec<LineType>,
// Number of lines; `new` sets this to the length of `line_types`.
total_lines: usize,
}
impl LineClassifier {
fn new(total_lines: usize) -> Self {
Self {
line_types: vec![LineType::Code; total_lines],
total_lines,
}
}
fn mark_blank(&mut self, line: usize) {
if line < self.total_lines {
self.line_types[line] = LineType::Blank;
}
}
#[allow(dead_code)]
fn mark_comment_only(&mut self, start_line: usize, end_line: usize) {
for line in start_line..=end_line.min(self.total_lines.saturating_sub(1)) {
if self.line_types[line] != LineType::Code {
self.line_types[line] = LineType::CommentOnly;
}
}
}
#[allow(dead_code)]
fn mark_string_literal(&mut self, start_line: usize, end_line: usize) {
for line in start_line..=end_line.min(self.total_lines.saturating_sub(1)) {
if self.line_types[line] == LineType::Blank {
self.line_types[line] = LineType::StringLiteral;
}
}
}
fn count(&self) -> (u32, u32, u32, u32) {
let bytes: &[u8] = unsafe {
std::slice::from_raw_parts(
self.line_types.as_ptr().cast::<u8>(),
self.line_types.len(),
)
};
let mut blank = 0u32;
let mut comment = 0u32;
let mut code = 0u32;
let mut string_literal = 0u32;
let blank_v = u8x32::splat(LineType::Blank as u8);
let comment_v = u8x32::splat(LineType::CommentOnly as u8);
let code_v = u8x32::splat(LineType::Code as u8);
let string_v = u8x32::splat(LineType::StringLiteral as u8);
let chunks = bytes.chunks_exact(32);
let remainder = chunks.remainder();
for chunk in chunks {
let arr: [u8; 32] = chunk.try_into().unwrap_or([0u8; 32]);
let data = u8x32::from(arr);
let blank_m = data.cmp_eq(blank_v);
let comment_m = data.cmp_eq(comment_v);
let code_m = data.cmp_eq(code_v);
let string_m = data.cmp_eq(string_v);
blank += Self::count_mask_matches(blank_m);
comment += Self::count_mask_matches(comment_m);
code += Self::count_mask_matches(code_m);
string_literal += Self::count_mask_matches(string_m);
}
for &byte in remainder {
match byte {
0 => blank += 1,
1 => comment += 1,
2 => code += 1,
3 => string_literal += 1,
_ => {} }
}
(blank, comment, code + string_literal, string_literal)
}
#[inline(always)]
#[allow(clippy::transmute_ptr_to_ref)]
fn count_mask_matches(mask: wide::u8x32) -> u32 {
Self::count_mask_matches_impl(mask)
}
}
// AVX2 build: count matching lanes with a single movemask + popcount.
#[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
impl LineClassifier {
/// Returns the number of lanes of `mask` whose most significant bit is set.
#[inline(always)]
fn count_mask_matches_impl(mask: wide::u8x32) -> u32 {
use std::arch::x86_64::{__m256i, _mm256_movemask_epi8};
#[allow(unsafe_code)]
// SAFETY: this item is only compiled when the `avx2` target feature is
// enabled, so the intrinsic is available. `transmute` checks at compile time
// that both types have the same size.
// NOTE(review): assumes `wide::u8x32` shares the bit layout of `__m256i` — confirm.
unsafe {
let m: __m256i = std::mem::transmute(mask);
// One bit per byte lane, taken from each lane's top bit.
let bits = _mm256_movemask_epi8(m) as u32;
bits.count_ones()
}
}
}
// Portable build: count matching lanes without target-specific intrinsics.
#[cfg(not(all(target_arch = "x86_64", target_feature = "avx2")))]
impl LineClassifier {
    /// Returns the number of lanes of `mask` whose lowest bit is set.
    #[inline(always)]
    fn count_mask_matches_impl(mask: wide::u8x32) -> u32 {
        let lanes: [u8; 32] = mask.into();
        // Each lane contributes its low bit (0 or 1) to the total.
        lanes.iter().map(|&lane| u32::from(lane & 1)).sum()
    }
}
/// Sums `slice` with wrapping `u32` arithmetic, adding eight values per step.
///
/// Returns `0` for an empty slice.
#[inline]
fn simd_sum_u32(slice: &[u32]) -> u32 {
    let mut lanes = u32x8::splat(0);
    let mut blocks = slice.chunks_exact(8);
    for block in blocks.by_ref() {
        // `chunks_exact(8)` always yields 8 elements; the fallback is unused.
        let values: [u32; 8] = block.try_into().unwrap_or([0u32; 8]);
        lanes += u32x8::from(values);
    }

    // Fold the eight partial sums together with the leftover elements.
    let partials: [u32; 8] = lanes.into();
    partials
        .iter()
        .chain(blocks.remainder())
        .fold(0u32, |total, &value| total.wrapping_add(value))
}
/// ASCII space.
const WS_SPACE: u8 = b' ';
/// ASCII horizontal tab.
const WS_TAB: u8 = b'\t';
/// ASCII line feed.
const WS_NEWLINE: u8 = b'\n';
/// ASCII carriage return.
const WS_CR: u8 = b'\r';

/// Returns `true` when `byte` is a space, tab, line feed or carriage return.
#[inline(always)]
fn is_ascii_ws(byte: u8) -> bool {
    [WS_SPACE, WS_TAB, WS_NEWLINE, WS_CR].contains(&byte)
}
/// Returns the index of the first byte that is not a space, tab, line feed or
/// carriage return, or `None` when `bytes` is empty or all whitespace.
#[inline]
fn find_first_nonws(bytes: &[u8]) -> Option<usize> {
    let spaces = u8x32::splat(WS_SPACE);
    let tabs = u8x32::splat(WS_TAB);
    let newlines = u8x32::splat(WS_NEWLINE);
    let returns = u8x32::splat(WS_CR);

    // Scan full 32-byte blocks front to back.
    let mut blocks = bytes.chunks_exact(32);
    for (block_no, block) in blocks.by_ref().enumerate() {
        // `chunks_exact(32)` always yields 32 bytes; the fallback is unused.
        let lanes: [u8; 32] = block.try_into().unwrap_or([0u8; 32]);
        let data = u8x32::from(lanes);
        let ws_mask = data.cmp_eq(spaces)
            | data.cmp_eq(tabs)
            | data.cmp_eq(newlines)
            | data.cmp_eq(returns);
        if let Some(lane) = find_first_zero_in_ws_mask(ws_mask) {
            return Some(block_no * 32 + lane);
        }
    }

    // Then the tail of fewer than 32 bytes, one byte at a time.
    let tail = blocks.remainder();
    let tail_start = bytes.len() - tail.len();
    tail.iter()
        .position(|&byte| !is_ascii_ws(byte))
        .map(|within_tail| tail_start + within_tail)
}
/// Returns the index of the last byte that is not a space, tab, line feed or
/// carriage return, or `None` when `bytes` is empty or all whitespace.
#[inline]
fn find_last_nonws(bytes: &[u8]) -> Option<usize> {
    // Split into a prefix made of whole 32-byte blocks and a shorter tail.
    let tail_start = bytes.len() - bytes.len() % 32;
    let (body, tail) = bytes.split_at(tail_start);

    // The tail sits at the end, so it is searched first.
    if let Some(within_tail) = tail.iter().rposition(|&byte| !is_ascii_ws(byte)) {
        return Some(tail_start + within_tail);
    }

    let spaces = u8x32::splat(WS_SPACE);
    let tabs = u8x32::splat(WS_TAB);
    let newlines = u8x32::splat(WS_NEWLINE);
    let returns = u8x32::splat(WS_CR);

    // Then the full blocks, back to front.
    for (block_no, block) in body.chunks_exact(32).enumerate().rev() {
        // `chunks_exact(32)` always yields 32 bytes; the fallback is unused.
        let lanes: [u8; 32] = block.try_into().unwrap_or([0u8; 32]);
        let data = u8x32::from(lanes);
        let ws_mask = data.cmp_eq(spaces)
            | data.cmp_eq(tabs)
            | data.cmp_eq(newlines)
            | data.cmp_eq(returns);
        if let Some(lane) = find_last_zero_in_ws_mask(ws_mask) {
            return Some(block_no * 32 + lane);
        }
    }
    None
}
/// AVX2 build: returns the index of the first lane of `mask` whose top bit is
/// clear (i.e. the first lane that did not match), or `None` if every lane
/// matched.
#[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
#[inline(always)]
fn find_first_zero_in_ws_mask(mask: u8x32) -> Option<usize> {
use std::arch::x86_64::{__m256i, _mm256_movemask_epi8};
#[allow(unsafe_code)]
// SAFETY: only compiled when the `avx2` target feature is enabled, so the
// intrinsic is available; `transmute` checks the sizes match at compile time.
// NOTE(review): assumes `wide::u8x32` shares the bit layout of `__m256i` — confirm.
unsafe {
let m: __m256i = std::mem::transmute(mask);
// One bit per lane, taken from each lane's top bit.
let bits = _mm256_movemask_epi8(m) as u32;
// After inversion a set bit marks a lane that did NOT match.
let inverted = !bits;
if inverted == 0 {
None } else {
Some(inverted.trailing_zeros() as usize)
}
}
}
/// AVX2 build: returns the index of the last lane of `mask` whose top bit is
/// clear (i.e. the last lane that did not match), or `None` if every lane
/// matched.
#[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
#[inline(always)]
fn find_last_zero_in_ws_mask(mask: u8x32) -> Option<usize> {
use std::arch::x86_64::{__m256i, _mm256_movemask_epi8};
#[allow(unsafe_code)]
// SAFETY: only compiled when the `avx2` target feature is enabled, so the
// intrinsic is available; `transmute` checks the sizes match at compile time.
// NOTE(review): assumes `wide::u8x32` shares the bit layout of `__m256i` — confirm.
unsafe {
let m: __m256i = std::mem::transmute(mask);
// One bit per lane, taken from each lane's top bit.
let bits = _mm256_movemask_epi8(m) as u32;
// After inversion a set bit marks a lane that did NOT match.
let inverted = !bits;
if inverted == 0 {
None } else {
// Highest set bit: bit 31 corresponds to lane 31.
Some(31 - inverted.leading_zeros() as usize)
}
}
}
/// Portable build: returns the index of the first all-zero lane of `mask`, or
/// `None` when no lane is zero.
#[cfg(not(all(target_arch = "x86_64", target_feature = "avx2")))]
#[inline(always)]
fn find_first_zero_in_ws_mask(mask: u8x32) -> Option<usize> {
    let lanes: [u8; 32] = mask.into();
    lanes
        .iter()
        .enumerate()
        .find(|&(_, &lane)| lane == 0)
        .map(|(index, _)| index)
}
/// Portable build: returns the index of the last all-zero lane of `mask`, or
/// `None` when no lane is zero.
#[cfg(not(all(target_arch = "x86_64", target_feature = "avx2")))]
#[inline(always)]
fn find_last_zero_in_ws_mask(mask: u8x32) -> Option<usize> {
    let lanes: [u8; 32] = mask.into();
    (0..lanes.len()).rev().find(|&index| lanes[index] == 0)
}
/// Column-wise copy of per-file metrics: one vector per metric, one element
/// per file, so each metric can be summed over a contiguous slice.
struct AggregateMetricsCollector {
// Physical line count of each file.
physical: Vec<u32>,
// Source line count of each file.
source: Vec<u32>,
// Logical size of each file.
logical: Vec<u32>,
// Comment-only line count of each file.
comment: Vec<u32>,
// Blank line count of each file.
blank: Vec<u32>,
}
impl AggregateMetricsCollector {
fn with_capacity(capacity: usize) -> Self {
Self {
physical: Vec::with_capacity(capacity),
source: Vec::with_capacity(capacity),
logical: Vec::with_capacity(capacity),
comment: Vec::with_capacity(capacity),
blank: Vec::with_capacity(capacity),
}
}
fn collect_from_file_locs(files: &[FileLOC]) -> Self {
let mut collector = Self::with_capacity(files.len());
for f in files {
collector.physical.push(f.metrics.physical);
collector.source.push(f.metrics.source);
collector.logical.push(f.metrics.logical);
collector.comment.push(f.metrics.comment);
collector.blank.push(f.metrics.blank);
}
collector
}
fn collect_from_file_loc_refs(files: &[&FileLOC]) -> Self {
let mut collector = Self::with_capacity(files.len());
for f in files {
collector.physical.push(f.metrics.physical);
collector.source.push(f.metrics.source);
collector.logical.push(f.metrics.logical);
collector.comment.push(f.metrics.comment);
collector.blank.push(f.metrics.blank);
}
collector
}
#[inline]
fn sum_all(&self) -> (u32, u32, u32, u32, u32) {
(
simd_sum_u32(&self.physical),
simd_sum_u32(&self.source),
simd_sum_u32(&self.logical),
simd_sum_u32(&self.comment),
simd_sum_u32(&self.blank),
)
}
}
/// Tree-sitter node kinds that are treated as comments.
const COMMENT_NODE_TYPES: &[&str] = &[
"comment", "line_comment", "block_comment", ];
/// Tree-sitter node kinds that are treated as string literals; the interior
/// lines of a multi-line node of these kinds are classified as string lines.
const STRING_NODE_TYPES: &[&str] = &[
"string", "string_literal", "raw_string_literal", "template_string", "interpreted_string_literal", "raw_string", ];
/// Tree-sitter node kinds that count as one statement each.
///
/// The list is a concatenation of per-grammar groups, so several kinds appear
/// more than once; membership tests are unaffected by the duplicates.
/// NOTE(review): the group labels below are inferred from the node names —
/// confirm against the grammars actually registered.
const STATEMENT_NODE_TYPES: &[&str] = &[
// Presumably Python.
"expression_statement", "return_statement", "if_statement", "for_statement",
"while_statement", "try_statement", "with_statement", "assert_statement",
"raise_statement", "pass_statement", "break_statement", "continue_statement",
"import_statement", "import_from_statement", "global_statement", "nonlocal_statement",
"delete_statement", "future_import_statement", "match_statement",
// Presumably JavaScript / TypeScript.
"expression_statement", "return_statement", "if_statement", "switch_statement",
"for_statement", "for_in_statement", "while_statement", "do_statement",
"try_statement", "with_statement", "throw_statement", "break_statement",
"continue_statement", "import_statement", "export_statement", "variable_declaration",
"lexical_declaration",
// Presumably Rust.
"expression_statement", "let_declaration", "return_expression", "if_expression",
"match_expression", "for_expression", "while_expression", "loop_expression",
"break_expression", "continue_expression", "macro_invocation",
// Presumably Go.
"expression_statement", "return_statement", "if_statement", "switch_statement",
"for_statement", "go_statement", "select_statement", "defer_statement",
"var_declaration", "short_var_declaration", "assignment_statement",
// Presumably Java.
"expression_statement", "return_statement", "if_statement", "switch_expression",
"for_statement", "enhanced_for_statement", "while_statement", "do_statement",
"try_statement", "throw_statement", "break_statement", "continue_statement",
"local_variable_declaration", "assert_statement",
// Presumably C / C++.
"expression_statement", "return_statement", "if_statement", "switch_statement",
"for_statement", "while_statement", "do_statement", "break_statement",
"continue_statement", "goto_statement", "declaration", "compound_statement",
];
/// Tree-sitter node kinds that are recorded as functions.
///
/// Like the statement table this is a concatenation of per-grammar groups, so
/// some kinds are listed twice.
/// NOTE(review): which grammar each line belongs to is not stated in the
/// source — confirm before relying on the grouping.
const FUNCTION_NODE_TYPES: &[&str] = &[
"function_definition",
"function_declaration", "method_definition", "arrow_function",
"function_item",
"function_declaration", "method_declaration",
"method_declaration", "constructor_declaration",
"function_definition",
];
struct ASTAnalyzer<'a> {
source: &'a [u8],
lines: Vec<&'a str>,
classifier: LineClassifier,
statement_count: u32,
function_ranges: Vec<(String, usize, usize)>, }
impl<'a> ASTAnalyzer<'a> {
fn new(source: &'a [u8]) -> Self {
let source_str = std::str::from_utf8(source).unwrap_or("");
let lines: Vec<&str> = source_str.lines().collect();
let total_lines = lines.len();
let mut classifier = LineClassifier::new(total_lines);
for (i, line) in lines.iter().enumerate() {
if line.trim().is_empty() {
classifier.mark_blank(i);
}
}
Self {
source,
lines,
classifier,
statement_count: 0,
function_ranges: Vec::new(),
}
}
fn analyze(&mut self, tree: &Tree, lang_name: &str) {
self.walk_node(tree.root_node(), lang_name, 0);
}
fn walk_node(&mut self, node: Node, lang_name: &str, depth: usize) {
let kind = node.kind();
let start_line = node.start_position().row;
let end_line = node.end_position().row;
if COMMENT_NODE_TYPES.contains(&kind) {
self.process_comment(node, start_line, end_line);
}
if STRING_NODE_TYPES.contains(&kind) && !self.is_docstring(node, lang_name) {
if end_line > start_line {
for line in (start_line + 1)..end_line {
if line < self.classifier.total_lines {
self.classifier.line_types[line] = LineType::StringLiteral;
}
}
}
}
if STATEMENT_NODE_TYPES.contains(&kind) {
self.statement_count += 1;
}
if FUNCTION_NODE_TYPES.contains(&kind) {
let name = self.extract_function_name(node);
self.function_ranges.push((name, start_line, end_line));
}
let mut cursor = node.walk();
for child in node.children(&mut cursor) {
self.walk_node(child, lang_name, depth + 1);
}
}
fn process_comment(&mut self, node: Node, start_line: usize, end_line: usize) {
for line_idx in start_line..=end_line.min(self.classifier.total_lines.saturating_sub(1)) {
if self.is_line_only_comment(line_idx, node) {
self.classifier.line_types[line_idx] = LineType::CommentOnly;
}
}
}
fn is_line_only_comment(&self, line_idx: usize, comment_node: Node) -> bool {
if line_idx >= self.lines.len() {
return false;
}
let line = self.lines[line_idx];
let line_bytes = line.as_bytes();
let line_start_byte = self.line_start_byte(line_idx);
let comment_start = comment_node.start_byte();
let comment_end = comment_node.end_byte();
let first_nonws = match find_first_nonws(line_bytes) {
Some(idx) => idx,
None => return false, };
let last_nonws = find_last_nonws(line_bytes).unwrap_or(first_nonws);
let content_start = line_start_byte + first_nonws;
let content_end = line_start_byte + last_nonws + 1;
comment_start <= content_start && comment_end >= content_end
}
fn line_start_byte(&self, line_idx: usize) -> usize {
let mut offset = 0;
for (i, line) in self.lines.iter().enumerate() {
if i == line_idx {
return offset;
}
offset += line.len() + 1; }
offset
}
fn is_docstring(&self, node: Node, lang_name: &str) -> bool {
if lang_name != "python" {
return false;
}
if let Some(parent) = node.parent() {
if parent.kind() == "expression_statement" {
if let Some(grandparent) = parent.parent() {
let gp_kind = grandparent.kind();
if gp_kind == "block" || gp_kind == "module" {
let mut cursor = grandparent.walk();
for (idx, child) in grandparent.children(&mut cursor).enumerate() {
if child.kind() == "expression_statement" && child.id() == parent.id() {
return idx < 3; }
if STATEMENT_NODE_TYPES.contains(&child.kind()) && child.id() != parent.id() {
return false;
}
}
}
}
}
}
false
}
fn extract_function_name(&self, node: Node) -> String {
let mut cursor = node.walk();
for child in node.children(&mut cursor) {
match child.kind() {
"identifier" | "name" | "property_identifier" => {
return child.utf8_text(self.source)
.unwrap_or("")
.to_string();
}
"function_declarator" | "declarator" => {
let mut inner_cursor = child.walk();
for inner in child.children(&mut inner_cursor) {
if inner.kind() == "identifier" {
return inner.utf8_text(self.source)
.unwrap_or("")
.to_string();
}
}
}
_ => {}
}
}
"<anonymous>".to_string()
}
fn get_metrics(&self) -> LOCMetrics {
let (blank, comment, source, _) = self.classifier.count();
let physical = self.classifier.total_lines as u32;
LOCMetrics::from_counts(physical, source, self.statement_count, comment, blank)
}
fn get_function_metrics(&self, file: &Path) -> Vec<FunctionSize> {
self.function_ranges.iter().map(|(name, start, end)| {
let sloc = self.count_sloc_in_range(*start, *end);
let statements = self.count_statements_in_range(*start, *end);
let comment_lines = self.count_comments_in_range(*start, *end);
let comment_density = if sloc > 0 {
f64::from(comment_lines) / f64::from(sloc) * 100.0
} else {
0.0
};
FunctionSize {
name: name.clone(),
file: file.to_path_buf(),
line: *start + 1, end_line: *end + 1,
sloc,
statements,
comment_density,
is_too_long: sloc > FunctionSize::DEFAULT_THRESHOLD,
}
}).collect()
}
fn count_sloc_in_range(&self, start: usize, end: usize) -> u32 {
let end_idx = end.min(self.classifier.total_lines.saturating_sub(1));
if start > end_idx {
return 0;
}
let slice = &self.classifier.line_types[start..=end_idx];
let bytes: &[u8] = unsafe {
std::slice::from_raw_parts(slice.as_ptr().cast::<u8>(), slice.len())
};
let mut count = 0u32;
let code_val = u8x32::splat(LineType::Code as u8);
let str_val = u8x32::splat(LineType::StringLiteral as u8);
let chunks = bytes.chunks_exact(32);
let remainder = chunks.remainder();
for chunk in chunks {
let arr: [u8; 32] = chunk.try_into().unwrap_or([0u8; 32]);
let data = u8x32::from(arr);
let mask_code = data.cmp_eq(code_val);
let mask_str = data.cmp_eq(str_val);
let mask = mask_code | mask_str;
count += LineClassifier::count_mask_matches(mask);
}
for &byte in remainder {
if byte == LineType::Code as u8 || byte == LineType::StringLiteral as u8 {
count += 1;
}
}
count
}
fn count_statements_in_range(&self, start: usize, end: usize) -> u32 {
let sloc = self.count_sloc_in_range(start, end);
(f64::from(sloc) * 0.8).round() as u32
}
fn count_comments_in_range(&self, start: usize, end: usize) -> u32 {
let mut count = 0u32;
for i in start..=end.min(self.classifier.total_lines.saturating_sub(1)) {
if self.classifier.line_types[i] == LineType::CommentOnly {
count += 1;
}
}
count
}
}
/// Analyzes LOC metrics for a file or a whole directory tree.
///
/// * `path` - file or directory to analyze.
/// * `language` - optional language filter applied when scanning a directory.
/// * `function_threshold` - SLOC limit above which a function is flagged;
///   defaults to [`FunctionSize::DEFAULT_THRESHOLD`].
///
/// # Errors
///
/// Returns an I/O "not found" error when `path` does not exist, and
/// `InvalidArgument` when the path is not valid UTF-8 or the scan finds no
/// source files. Scanner errors are propagated. Failures on individual files
/// are collected in [`LOCAnalysis::errors`] instead of aborting the run.
pub fn analyze_loc(
    path: impl AsRef<Path>,
    language: Option<&str>,
    function_threshold: Option<u32>,
) -> Result<LOCAnalysis> {
    let root = path.as_ref();
    let limit = function_threshold.unwrap_or(FunctionSize::DEFAULT_THRESHOLD);

    if !root.exists() {
        let not_found = std::io::Error::new(
            std::io::ErrorKind::NotFound,
            format!("Path not found: {}", root.display()),
        );
        return Err(BrrrError::Io(not_found));
    }

    // A single file takes the dedicated single-file path.
    if root.is_file() {
        return analyze_file_loc(root, limit);
    }

    let root_str = match root.to_str() {
        Some(text) => text,
        None => {
            return Err(BrrrError::InvalidArgument(
                "Invalid path encoding".to_string(),
            ))
        }
    };

    let scanner = ProjectScanner::new(root_str)?;
    let config = match language {
        Some(lang) => ScanConfig::for_language(lang),
        None => ScanConfig::default(),
    };
    let scan_result = scanner.scan_with_config(&config)?;

    if scan_result.files.is_empty() {
        return Err(BrrrError::InvalidArgument(format!(
            "No source files found in {} (filter: {:?})",
            root.display(),
            language
        )));
    }

    debug!("Analyzing {} files for LOC metrics", scan_result.files.len());

    // Files are analyzed in parallel; successes and failures are split after.
    let outcomes: Vec<std::result::Result<FileLOC, LOCError>> = scan_result
        .files
        .par_iter()
        .map(|file| analyze_single_file(file, limit))
        .collect();

    let mut files = Vec::with_capacity(outcomes.len());
    let mut errors = Vec::new();
    for outcome in outcomes {
        match outcome {
            Ok(file_loc) => files.push(file_loc),
            Err(failure) => errors.push(failure),
        }
    }

    let by_language = aggregate_by_language(&files);
    let stats = calculate_distribution(&files);
    let largest_files = get_largest_files(&files, &stats, 10);

    // Oversized functions across all files, largest first.
    let mut oversized_functions: Vec<FunctionSize> = Vec::new();
    for file_loc in &files {
        oversized_functions.extend(
            file_loc
                .functions
                .iter()
                .filter(|func| func.is_too_long)
                .cloned(),
        );
    }
    oversized_functions.sort_by_key(|func| std::cmp::Reverse(func.sloc));

    Ok(LOCAnalysis {
        path: root.to_path_buf(),
        language: language.map(String::from),
        files,
        by_language,
        stats,
        largest_files,
        oversized_functions,
        errors,
    })
}
/// Analyzes LOC metrics for a single file.
///
/// * `path` - file to analyze.
/// * `function_threshold` - SLOC limit above which a function is flagged.
///
/// # Errors
///
/// Returns an I/O "not found" error when `path` does not exist, and
/// `InvalidArgument` carrying the failure message when the file cannot be
/// read or parsed.
pub fn analyze_file_loc(
    path: impl AsRef<Path>,
    function_threshold: u32,
) -> Result<LOCAnalysis> {
    let path = path.as_ref();
    if !path.exists() {
        return Err(BrrrError::Io(std::io::Error::new(
            std::io::ErrorKind::NotFound,
            format!("File not found: {}", path.display()),
        )));
    }

    let file_loc = analyze_single_file(path, function_threshold)
        .map_err(|e| BrrrError::InvalidArgument(e.message))?;

    // View the one result as a one-element slice instead of deep-cloning it
    // for every aggregate.
    let single = std::slice::from_ref(&file_loc);
    let by_language = aggregate_by_language(single);
    let stats = calculate_distribution(single);
    let largest_files = get_largest_files(single, &stats, 1);

    let oversized_functions: Vec<FunctionSize> = file_loc
        .functions
        .iter()
        .filter(|f| f.is_too_long)
        .cloned()
        .collect();
    let language = file_loc.language.clone();

    Ok(LOCAnalysis {
        path: path.to_path_buf(),
        language,
        files: vec![file_loc],
        by_language,
        stats,
        largest_files,
        oversized_functions,
        errors: Vec::new(),
    })
}
/// Reads and analyzes one file.
///
/// When a language is detected for `path` the file is parsed and analyzed via
/// the AST, and each function's `is_too_long` flag is recomputed against
/// `threshold`. Otherwise a plain blank/non-blank line count is used and no
/// functions are reported.
///
/// Any read, parser-creation or parse failure is returned as a [`LOCError`]
/// naming the file.
fn analyze_single_file(path: &Path, threshold: u32) -> std::result::Result<FileLOC, LOCError> {
    let fail = |message: String| LOCError {
        file: path.to_path_buf(),
        message,
    };

    let source = std::fs::read(path).map_err(|e| fail(format!("Failed to read file: {}", e)))?;

    let registry = LanguageRegistry::global();
    let lang = registry.detect_language(path);
    let lang_name = lang.map(|l| l.name().to_string());

    let (metrics, functions) = match lang {
        Some(ref lang_impl) => {
            let mut parser = lang_impl
                .parser()
                .map_err(|e| fail(format!("Failed to create parser: {}", e)))?;
            let tree = match parser.parse(&source, None) {
                Some(tree) => tree,
                None => return Err(fail("Failed to parse file".to_string())),
            };

            let mut analyzer = ASTAnalyzer::new(&source);
            analyzer.analyze(&tree, lang_impl.name());

            let metrics = analyzer.get_metrics();
            let mut functions = analyzer.get_function_metrics(path);
            // The analyzer flags against the default limit; apply the caller's.
            functions
                .iter_mut()
                .for_each(|func| func.is_too_long = func.sloc > threshold);
            (metrics, functions)
        }
        None => (simple_line_count(&source), Vec::new()),
    };

    let oversized_functions = functions.iter().filter(|func| func.is_too_long).count();

    Ok(FileLOC {
        file: path.to_path_buf(),
        language: lang_name,
        metrics,
        functions,
        oversized_functions,
    })
}
/// Fallback line count for files without a known language.
///
/// Every non-blank line is counted as both a source line and a logical line;
/// comments are not detected (the comment count is `0`).
///
/// The bytes are decoded lossily, so a file that is not valid UTF-8 (for
/// example Latin-1) still has its lines counted instead of being reported as
/// empty. Invalid sequences become U+FFFD, which is not whitespace and does
/// not affect line boundaries.
fn simple_line_count(source: &[u8]) -> LOCMetrics {
    let text = String::from_utf8_lossy(source);

    let mut physical = 0u32;
    let mut blank = 0u32;
    for line in text.lines() {
        physical = physical.saturating_add(1);
        if line.trim().is_empty() {
            blank = blank.saturating_add(1);
        }
    }

    // `blank` never exceeds `physical`, so the subtraction cannot underflow.
    let code = physical - blank;
    LOCMetrics::from_counts(physical, code, code, 0, blank)
}
/// Groups `files` by language (files without one go under `unknown`) and sums
/// the metrics of each group.
///
/// The result is ordered by summed source lines, largest first.
fn aggregate_by_language(files: &[FileLOC]) -> Vec<LanguageLOC> {
    let mut groups: HashMap<String, Vec<&FileLOC>> = HashMap::new();
    for file in files {
        let key = file.language.as_deref().unwrap_or("unknown").to_string();
        groups.entry(key).or_default().push(file);
    }

    let mut summaries: Vec<LanguageLOC> = Vec::with_capacity(groups.len());
    for (language, members) in groups {
        let file_count = members.len();

        let (total_physical, total_source, total_logical, total_comment, total_blank) =
            AggregateMetricsCollector::collect_from_file_loc_refs(&members).sum_all();
        let metrics = LOCMetrics::from_counts(
            total_physical,
            total_source,
            total_logical,
            total_comment,
            total_blank,
        );

        let total_functions: usize = members.iter().map(|file| file.functions.len()).sum();
        let total_statements: u32 = members
            .iter()
            .flat_map(|file| file.functions.iter())
            .map(|func| func.statements)
            .sum();

        let avg_sloc_per_file = match file_count {
            0 => 0.0,
            count => f64::from(total_source) / count as f64,
        };
        let avg_statements_per_function = match total_functions {
            0 => 0.0,
            count => f64::from(total_statements) / count as f64,
        };

        summaries.push(LanguageLOC {
            language,
            file_count,
            metrics,
            avg_sloc_per_file,
            avg_statements_per_function,
        });
    }

    summaries.sort_by_key(|summary| std::cmp::Reverse(summary.metrics.source));
    summaries
}
/// Computes totals and distribution statistics over `files`.
///
/// Returns an all-zero record for an empty slice. The median of an even
/// number of files is the integer mean of the two middle values.
fn calculate_distribution(files: &[FileLOC]) -> LOCDistribution {
    let file_count = files.len();
    if file_count == 0 {
        return LOCDistribution {
            total_sloc: 0,
            total_physical: 0,
            total_logical: 0,
            total_comment: 0,
            total_blank: 0,
            code_to_comment_ratio: 0.0,
            blank_ratio: 0.0,
            avg_sloc_per_file: 0.0,
            max_sloc: 0,
            min_sloc: 0,
            median_sloc: 0,
            total_functions: 0,
            avg_function_size: 0.0,
            oversized_function_count: 0,
        };
    }

    let (total_physical, total_sloc, total_logical, total_comment, total_blank) =
        AggregateMetricsCollector::collect_from_file_locs(files).sum_all();

    let code_to_comment_ratio = match total_comment {
        0 => 0.0,
        comment_lines => f64::from(total_sloc) / f64::from(comment_lines),
    };
    let blank_ratio = match total_physical {
        0 => 0.0,
        physical_lines => f64::from(total_blank) / f64::from(physical_lines) * 100.0,
    };
    let avg_sloc_per_file = f64::from(total_sloc) / file_count as f64;

    // Order statistics over the per-file source line counts.
    let mut ordered: Vec<u32> = files.iter().map(|file| file.metrics.source).collect();
    ordered.sort_unstable();
    let min_sloc = ordered.first().copied().unwrap_or(0);
    let max_sloc = ordered.last().copied().unwrap_or(0);
    let middle = file_count / 2;
    let median_sloc = if file_count % 2 == 0 {
        // `file_count` is non-zero here, so an even count is at least 2.
        (ordered[middle - 1] + ordered[middle]) / 2
    } else {
        ordered[middle]
    };

    // Function-level totals.
    let total_functions: usize = files.iter().map(|file| file.functions.len()).sum();
    let total_func_sloc: u32 = files
        .iter()
        .flat_map(|file| file.functions.iter())
        .map(|func| func.sloc)
        .sum();
    let avg_function_size = match total_functions {
        0 => 0.0,
        count => f64::from(total_func_sloc) / count as f64,
    };
    let oversized_function_count: usize =
        files.iter().map(|file| file.oversized_functions).sum();

    LOCDistribution {
        total_sloc,
        total_physical,
        total_logical,
        total_comment,
        total_blank,
        code_to_comment_ratio,
        blank_ratio,
        avg_sloc_per_file,
        max_sloc,
        min_sloc,
        median_sloc,
        total_functions,
        avg_function_size,
        oversized_function_count,
    }
}
/// Returns up to `limit` files with the most source lines, largest first,
/// each with its percentage share of `stats.total_sloc` (`0.0` when the total
/// is zero).
fn get_largest_files(files: &[FileLOC], stats: &LOCDistribution, limit: usize) -> Vec<FileRanking> {
    let total = stats.total_sloc;
    let share_of_total = |sloc: u32| -> f64 {
        if total > 0 {
            f64::from(sloc) / f64::from(total) * 100.0
        } else {
            0.0
        }
    };

    let mut ranked: Vec<&FileLOC> = files.iter().collect();
    ranked.sort_by_key(|entry| std::cmp::Reverse(entry.metrics.source));
    ranked.truncate(limit);

    ranked
        .iter()
        .map(|entry| FileRanking {
            file: entry.file.clone(),
            sloc: entry.metrics.source,
            percentage: share_of_total(entry.metrics.source),
        })
        .collect()
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Write;
use tempfile::NamedTempFile;
fn create_temp_file(content: &str, extension: &str) -> NamedTempFile {
let mut file = tempfile::Builder::new()
.suffix(extension)
.tempfile()
.expect("Failed to create temp file");
file.write_all(content.as_bytes())
.expect("Failed to write to temp file");
file
}
#[test]
fn test_loc_metrics_basic() {
let source = r#"
# Comment
def hello():
"""Docstring"""
print("Hello")
# Another comment
def world():
pass
"#;
let file = create_temp_file(source, ".py");
let result = analyze_file_loc(file.path(), 50);
assert!(result.is_ok(), "Analysis should succeed");
let analysis = result.unwrap();
assert!(analysis.stats.total_physical > 0);
assert!(analysis.stats.total_comment > 0 || analysis.stats.total_sloc > 0);
}
#[test]
fn test_loc_metrics_empty_file() {
let source = "";
let file = create_temp_file(source, ".py");
let result = analyze_file_loc(file.path(), 50);
assert!(result.is_ok());
let analysis = result.unwrap();
assert_eq!(analysis.stats.total_sloc, 0);
}
#[test]
fn test_loc_metrics_blank_lines() {
let source = "def foo():\n pass\n\n\n\ndef bar():\n pass\n";
let file = create_temp_file(source, ".py");
let result = analyze_file_loc(file.path(), 50);
assert!(result.is_ok());
let analysis = result.unwrap();
assert!(analysis.stats.total_blank > 0, "Should detect blank lines");
}
#[test]
fn test_loc_metrics_comments_only() {
let source = r#"# Line 1
# Line 2
# Line 3
"#;
let file = create_temp_file(source, ".py");
let result = analyze_file_loc(file.path(), 50);
assert!(result.is_ok());
let analysis = result.unwrap();
assert!(analysis.stats.total_comment >= 2);
}
#[test]
fn test_loc_metrics_multiline_string() {
let source = r#"
def foo():
x = """
This is a
multi-line
string
"""
return x
"#;
let file = create_temp_file(source, ".py");
let result = analyze_file_loc(file.path(), 50);
assert!(result.is_ok());
let analysis = result.unwrap();
assert!(analysis.stats.total_sloc > 0);
}
#[test]
fn test_function_size_detection() {
let source = r#"
def small_function():
pass
def larger_function():
x = 1
y = 2
z = 3
return x + y + z
"#;
let file = create_temp_file(source, ".py");
let result = analyze_file_loc(file.path(), 50);
assert!(result.is_ok());
let analysis = result.unwrap();
assert!(!analysis.files.is_empty());
let file_loc = &analysis.files[0];
assert!(!file_loc.functions.is_empty(), "Should detect functions");
}
#[test]
fn test_typescript_loc() {
let source = r#"
// Single line comment
function hello(): void {
console.log("Hello");
}
/*
* Block comment
*/
const world = () => {
return "world";
};
"#;
let file = create_temp_file(source, ".ts");
let result = analyze_file_loc(file.path(), 50);
assert!(result.is_ok());
let analysis = result.unwrap();
assert!(analysis.stats.total_sloc > 0);
assert!(analysis.stats.total_comment > 0);
}
#[test]
fn test_rust_loc() {
let source = r#"
// Comment
fn main() {
let x = 42;
println!("{}", x);
}
/* Block comment */
fn helper() -> i32 {
0
}
"#;
let file = create_temp_file(source, ".rs");
let result = analyze_file_loc(file.path(), 50);
assert!(result.is_ok());
let analysis = result.unwrap();
assert!(analysis.stats.total_sloc > 0);
}
#[test]
fn test_code_to_comment_ratio() {
let metrics = LOCMetrics::from_counts(100, 80, 60, 20, 10);
assert!((metrics.code_to_comment_ratio - 4.0).abs() < 0.001);
assert!((metrics.comment_density() - 25.0).abs() < 0.001);
}
#[test]
fn test_loc_metrics_go() {
let source = r#"
package main
// Comment
func main() {
x := 42
fmt.Println(x)
}
"#;
let file = create_temp_file(source, ".go");
let result = analyze_file_loc(file.path(), 50);
assert!(result.is_ok());
let analysis = result.unwrap();
assert!(analysis.stats.total_sloc > 0);
}
#[test]
fn test_oversized_function_detection() {
let mut lines = vec!["def big_function():".to_string()];
for i in 0..60 {
lines.push(format!(" x{} = {}", i, i));
}
lines.push(" pass".to_string());
let source = lines.join("\n");
let file = create_temp_file(&source, ".py");
let result = analyze_file_loc(file.path(), 50);
assert!(result.is_ok());
let analysis = result.unwrap();
assert!(analysis.stats.oversized_function_count > 0 ||
!analysis.oversized_functions.is_empty() ||
analysis.files.iter().any(|f| f.oversized_functions > 0));
}
/// Checks `simd_sum_u32` on empty, single-element, short and longer inputs,
/// and on an input whose true sum exceeds `u32::MAX`.
#[test]
fn test_simd_sum_u32_correctness() {
    // Small helper so each case reads as "input -> expected sum".
    let check = |input: &[u32], want: u32| {
        assert_eq!(super::simd_sum_u32(input), want);
    };

    check(&[], 0);
    check(&[42], 42);
    check(&[1, 2, 3, 4, 5], 15);
    check(&[1, 2, 3, 4, 5, 6, 7, 8], 36);

    // Sums of 1..=n for lengths of 16, 17 and 100 elements.
    for (upper, want) in [(16u32, 136u32), (17, 153), (100, 5050)] {
        let values: Vec<u32> = (1..=upper).collect();
        check(&values, want);
    }

    // Eight copies of u32::MAX / 4 overflow u32; the expected value is the
    // wrapped product.
    let quarter = u32::MAX / 4;
    let big = vec![quarter; 8];
    check(&big, quarter.wrapping_mul(8));
}
/// Builds 20 synthetic `FileLOC` records whose counts grow by one per file and
/// checks the per-metric totals returned by the collector's `sum_all`.
#[test]
fn test_aggregate_metrics_collector() {
    let mut files: Vec<super::FileLOC> = Vec::with_capacity(20);
    for offset in 0..20 {
        // physical, source, logical and comment grow with `offset`; blank is fixed.
        let metrics = super::LOCMetrics::from_counts(
            100 + offset,
            80 + offset,
            60 + offset,
            10 + offset,
            10,
        );
        files.push(super::FileLOC {
            file: std::path::PathBuf::from(format!("test_{}.py", offset)),
            language: Some("python".to_string()),
            metrics,
            functions: Vec::new(),
            oversized_functions: 0,
        });
    }

    let collector = super::AggregateMetricsCollector::collect_from_file_locs(&files);
    let (physical, source, logical, comment, blank) = collector.sum_all();

    // Each total is 20 * base + (0 + 1 + ... + 19) = 20 * base + 190.
    assert_eq!(physical, 2190);
    assert_eq!(source, 1790);
    assert_eq!(logical, 1390);
    assert_eq!(comment, 390);
    // blank has no per-file offset: 20 * 10.
    assert_eq!(blank, 200);
}
/// An empty input has no non-whitespace byte, so the result is `None`.
#[test]
fn test_find_first_nonws_empty() {
    let empty: &[u8] = b"";
    let found = super::find_first_nonws(empty);
    assert_eq!(found, None);
}
/// Inputs consisting only of ASCII whitespace must yield `None`, for short
/// inputs and for one longer than 32 bytes.
#[test]
fn test_find_first_nonws_all_whitespace() {
    assert_eq!(super::find_first_nonws(b" "), None);
    assert_eq!(super::find_first_nonws(b"\t\t\t"), None);
    assert_eq!(super::find_first_nonws(b" \t \n \r "), None);
    // 64 spaces, built programmatically so the length is explicit. This
    // replaces a second `b" "` assertion that duplicated the first one.
    // NOTE(review): 64 was chosen to span two 32-byte chunks, assuming the
    // scanner works on `u8x32` lanes; confirm against the implementation.
    assert_eq!(super::find_first_nonws(&[b' '; 64]), None);
}
/// When the very first byte is not whitespace the reported index is 0.
#[test]
fn test_find_first_nonws_immediate() {
    let inputs: [&[u8]; 3] = [b"hello", b"x", b"#comment"];
    for bytes in inputs {
        assert_eq!(super::find_first_nonws(bytes), Some(0));
    }
}
/// The reported index equals the number of leading whitespace bytes.
#[test]
fn test_find_first_nonws_with_leading_spaces() {
    // Two leading spaces: 'h' sits at index 2.
    assert_eq!(super::find_first_nonws(b"  hello"), Some(2));
    // Two leading tabs: 'h' sits at index 2.
    assert_eq!(super::find_first_nonws(b"\t\thello"), Some(2));
    // Four leading spaces: 'c' sits at index 4.
    assert_eq!(super::find_first_nonws(b"    code"), Some(4));
}
/// Places a single `x` after 31, 32, 33 and 40 spaces and checks that its
/// index is reported exactly, probing lengths around 32 bytes.
#[test]
fn test_find_first_nonws_simd_boundary() {
    // (number of leading spaces, expected index of the `x`)
    for (padding, expected) in [
        (31, Some(31)),
        (32, Some(32)),
        (33, Some(33)),
        (40, Some(40)),
    ] {
        let mut buf = vec![b' '; padding];
        buf.push(b'x');
        assert_eq!(super::find_first_nonws(&buf), expected);
    }
}
/// 100 spaces followed by `content`: the first non-whitespace byte is at
/// index 100.
#[test]
fn test_find_first_nonws_large_input() {
    let buf: Vec<u8> = std::iter::repeat(b' ')
        .take(100)
        .chain(b"content".iter().copied())
        .collect();
    assert_eq!(super::find_first_nonws(&buf), Some(100));
}
/// An empty input has no non-whitespace byte, so the result is `None`.
#[test]
fn test_find_last_nonws_empty() {
    let empty: &[u8] = b"";
    let found = super::find_last_nonws(empty);
    assert_eq!(found, None);
}
/// Inputs consisting only of ASCII whitespace must yield `None`, for short
/// inputs and for one longer than 32 bytes.
#[test]
fn test_find_last_nonws_all_whitespace() {
    assert_eq!(super::find_last_nonws(b" "), None);
    assert_eq!(super::find_last_nonws(b"\t\n\r "), None);
    // 64 spaces, built programmatically so the length is explicit. This
    // replaces a second `b" "` assertion that duplicated the first one.
    // NOTE(review): 64 was chosen to span two 32-byte chunks, assuming the
    // scanner works on `u8x32` lanes; confirm against the implementation.
    assert_eq!(super::find_last_nonws(&[b' '; 64]), None);
}
/// With no trailing whitespace the last non-whitespace byte is the final byte.
#[test]
fn test_find_last_nonws_immediate() {
    // (input, expected index of the last non-whitespace byte)
    for (bytes, expected) in [(&b"hello"[..], Some(4)), (&b"x"[..], Some(0))] {
        assert_eq!(super::find_last_nonws(bytes), expected);
    }
}
/// Trailing spaces and tabs are skipped when locating the last
/// non-whitespace byte.
#[test]
fn test_find_last_nonws_with_trailing_spaces() {
    // (input, expected index of the last non-whitespace byte)
    let cases = [
        (&b"hello "[..], Some(4)),
        (&b"code\t\t"[..], Some(3)),
        (&b"x "[..], Some(0)),
    ];
    for (bytes, expected) in cases {
        assert_eq!(super::find_last_nonws(bytes), expected);
    }
}
/// Appends 27, 28 and 59 spaces to `hello` (total lengths 32, 33 and 64) and
/// checks that the final `o` at index 4 is still found.
#[test]
fn test_find_last_nonws_simd_boundary() {
    for trailing in [27, 28, 59] {
        let mut buf = b"hello".to_vec();
        buf.extend(std::iter::repeat(b' ').take(trailing));
        assert_eq!(super::find_last_nonws(&buf), Some(4));
    }
}
/// `content` followed by 100 spaces: the last non-whitespace byte is the
/// final `t` at index 6.
#[test]
fn test_find_last_nonws_large_input() {
    let buf: Vec<u8> = b"content"
        .iter()
        .copied()
        .chain(std::iter::repeat(b' ').take(100))
        .collect();
    assert_eq!(super::find_last_nonws(&buf), Some(6));
}
/// For inputs that contain at least one non-whitespace byte, both scanners
/// must find something and the first index must not exceed the last.
#[test]
fn test_find_nonws_consistency() {
    let inputs: [&[u8]; 4] = [
        b" hello world ",
        b"\thello\t",
        b" x ",
        b"no_whitespace",
    ];
    for bytes in inputs {
        let head = super::find_first_nonws(bytes);
        let tail = super::find_last_nonws(bytes);
        assert!(head.is_some(), "first should find non-ws in {:?}", bytes);
        assert!(tail.is_some(), "last should find non-ws in {:?}", bytes);
        // Both are known to be `Some` here, so unwrapping cannot panic.
        let (lo, hi) = (head.unwrap(), tail.unwrap());
        assert!(lo <= hi,
            "first {} should be <= last {} for {:?}",
            lo, hi, bytes);
    }
}
/// Space, tab, newline and carriage return are whitespace; letters, digits,
/// punctuation, NUL and 0xFF are not.
#[test]
fn test_is_ascii_ws() {
    for ws in [b' ', b'\t', b'\n', b'\r'] {
        assert!(super::is_ascii_ws(ws));
    }
    for other in [b'a', b'0', b'#', 0, 255] {
        assert!(!super::is_ascii_ws(other));
    }
}
}