use crate::result::Lang;
/// Heuristically decides whether `data` is binary rather than text.
///
/// The decision is based solely on NUL (`0x00`) bytes:
///
/// * empty input, or input that starts with a UTF-16 byte-order mark
///   (`FF FE` or `FE FF`), is text;
/// * input in which at most 10% of the bytes are NUL is text;
/// * input whose NULs are concentrated on one index parity (more than 40% of
///   all bytes are NULs on that parity, and at most 10% are NULs on the other
///   parity) is treated as BOM-less UTF-16 text;
/// * everything else is binary.
///
/// Requiring the *other* parity to be NUL-light matters: without it a
/// zero-filled or mostly-zero buffer would pass the parity test on both sides
/// and be reported as text.
pub fn is_binary(data: &[u8]) -> bool {
    /// Largest share of NUL bytes still considered plain text.
    const MAX_TEXT_NUL_RATIO: f64 = 0.10;
    /// Share of NUL bytes on one parity above which the data looks like UTF-16.
    const UTF16_PARITY_NUL_RATIO: f64 = 0.40;

    // A UTF-16 BOM (either endianness) marks the buffer as text outright.
    if matches!(data, [0xFF, 0xFE, ..] | [0xFE, 0xFF, ..]) {
        return false;
    }

    // Tally NUL bytes by index parity in a single pass: [even, odd].
    let mut nuls_by_parity = [0usize; 2];
    for (idx, &byte) in data.iter().enumerate() {
        if byte == 0 {
            nuls_by_parity[idx % 2] += 1;
        }
    }
    let [even_nuls, odd_nuls] = nuls_by_parity;

    // Also covers empty input, so the divisions below never see a zero length.
    let nul_count = even_nuls + odd_nuls;
    if nul_count == 0 {
        return false;
    }

    let len = data.len() as f64;
    if nul_count as f64 / len <= MAX_TEXT_NUL_RATIO {
        return false;
    }

    let even_ratio = even_nuls as f64 / len;
    let odd_ratio = odd_nuls as f64 / len;
    let looks_like_utf16 = (even_ratio > UTF16_PARITY_NUL_RATIO
        && odd_ratio <= MAX_TEXT_NUL_RATIO)
        || (odd_ratio > UTF16_PARITY_NUL_RATIO && even_ratio <= MAX_TEXT_NUL_RATIO);

    !looks_like_utf16
}
/// Reports whether `lang` is an identified language whose name is on the
/// `#`-comment list below.
///
/// Any value other than `Lang::Identified`, and any identified name not on the
/// list, yields `false`. The comparison is exact (case-sensitive).
fn is_hash_comment_lang(lang: &Lang) -> bool {
    const HASH_COMMENT_LANGS: [&str; 6] = ["Python", "Shell", "YAML", "Makefile", "Perl", "Ruby"];

    if let Lang::Identified(name) = lang {
        HASH_COMMENT_LANGS.contains(&name.as_str())
    } else {
        false
    }
}
/// Counts code, comment and blank lines in `data`.
///
/// Returns `(code, comment, blank)`. A line that contains both code and a
/// comment is counted once in `code` and once in `comment`. Empty input yields
/// `(0, 0, 0)`; a single trailing `\n` (optionally preceded by `\r`) does not
/// add an extra blank line.
///
/// Comment syntax depends on `lang`:
///
/// * for languages accepted by `is_hash_comment_lang`, only `#` starts a
///   comment. `//` and `/*` are ordinary code there (floor division, regex
///   delimiters, URLs, path globs such as `dir/*`), so they must not open a
///   comment;
/// * for every other language, `//` starts a line comment and `/* ... */`
///   delimits a block comment that may span lines. Whitespace-only lines
///   inside a block comment count as comment lines.
///
/// Comment markers inside `"..."` or `'...'` are ignored, with `\` escaping
/// the next byte. Quote tracking is deliberately per-line: an unterminated
/// quote is forgotten at the end of the line so a stray quote character cannot
/// affect the classification of the lines that follow.
pub fn count_lines(data: &[u8], lang: &Lang) -> (usize, usize, usize) {
    if data.is_empty() {
        return (0, 0, 0);
    }

    // Drop one trailing line terminator so it doesn't produce a phantom line.
    let data = data.strip_suffix(b"\n").unwrap_or(data);
    let data = data.strip_suffix(b"\r").unwrap_or(data);

    let hash_comments = is_hash_comment_lang(lang);
    // `#`-comment languages have no C-style comments.
    let c_style_comments = !hash_comments;

    let mut code = 0usize;
    let mut comment = 0usize;
    let mut blank = 0usize;
    // The only state carried from one line to the next.
    let mut in_block = false;

    for line in data.split(|&b| b == b'\n') {
        if line.iter().all(u8::is_ascii_whitespace) {
            if in_block {
                comment += 1;
            } else {
                blank += 1;
            }
            continue;
        }

        let mut has_code = false;
        let mut has_comment = false;
        // Quote byte of the string currently open on this line, if any.
        let mut open_quote: Option<u8> = None;

        let mut i = 0usize;
        while i < line.len() {
            let b = line[i];
            let next = line.get(i + 1).copied();

            if in_block {
                has_comment = true;
                if b == b'*' && next == Some(b'/') {
                    in_block = false;
                    i += 1; // consume the '/' of "*/"
                }
            } else if let Some(quote) = open_quote {
                has_code = true;
                if b == b'\\' {
                    i += 1; // skip the escaped byte
                } else if b == quote {
                    open_quote = None;
                }
            } else if b.is_ascii_whitespace() {
                // Whitespace outside strings/comments is neither code nor comment.
            } else if b == b'"' || b == b'\'' {
                open_quote = Some(b);
                has_code = true;
            } else if hash_comments && b == b'#' {
                has_comment = true;
                break;
            } else if c_style_comments && b == b'/' && next == Some(b'/') {
                has_comment = true;
                break;
            } else if c_style_comments && b == b'/' && next == Some(b'*') {
                has_comment = true;
                in_block = true;
                i += 1; // consume the '*' so "/*/" is not read as open + close
            } else {
                has_code = true;
            }
            i += 1;
        }

        if has_code {
            code += 1;
        }
        if has_comment {
            comment += 1;
        }
    }

    (code, comment, blank)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Wraps `name` in an identified language value.
    fn identified(name: &str) -> Lang {
        Lang::Identified(name.to_string())
    }

    /// Runs `count_lines` on `src` with the language set to Rust.
    fn count_rust(src: &[u8]) -> (usize, usize, usize) {
        count_lines(src, &identified("Rust"))
    }

    /// Runs `count_lines` on `src` with the language set to Python.
    fn count_python(src: &[u8]) -> (usize, usize, usize) {
        count_lines(src, &identified("Python"))
    }

    #[test]
    fn test_empty_file() {
        assert_eq!(count_rust(b""), (0, 0, 0));
    }

    #[test]
    fn test_trailing_newline() {
        // One trailing newline terminates the last line; a second adds a blank.
        assert_eq!(count_rust(b"line1\n"), (1, 0, 0));
        assert_eq!(count_rust(b"line1\n\n"), (1, 0, 1));
    }

    #[test]
    fn test_inline_comments() {
        // A trailing comment counts the line as both code and comment.
        assert_eq!(count_rust(b"code(); // comment"), (1, 1, 0));
        assert_eq!(count_rust(b"// full comment"), (0, 1, 0));
    }

    #[test]
    fn test_string_markers() {
        // Comment markers inside string literals are plain code.
        let line_marker = b"let x = \"// not a comment\";";
        let block_marker = b"let x = \"/* not a block */\";";
        assert_eq!(count_rust(line_marker), (1, 0, 0));
        assert_eq!(count_rust(block_marker), (1, 0, 0));
    }

    #[test]
    fn test_multiline_string() {
        let src = b"let x = \"\n continuation\n \";";
        assert_eq!(count_rust(src), (3, 0, 0));
    }

    #[test]
    fn test_hash_logic() {
        // `#` is code in Rust but starts a comment in Python.
        assert_eq!(count_rust(b"#attribute"), (1, 0, 0));
        assert_eq!(count_python(b"# comment"), (0, 1, 0));
    }

    #[test]
    fn test_block_comments() {
        let src = b"/*\n multi\n line\n */";
        assert_eq!(count_rust(src), (0, 4, 0));
    }

    #[test]
    fn test_utf16_not_binary() {
        // UTF-16LE byte-order mark followed by "hello".
        let mut utf16_le = vec![0xFF, 0xFE];
        for &ch in b"hello" {
            utf16_le.push(ch);
            utf16_le.push(0);
        }
        assert!(!is_binary(&utf16_le));
    }

    #[test]
    fn test_block_comment_in_string() {
        // A quoted block comment must not leak block state into the next line.
        let single = b"let x = \"/* not a block */\";";
        let followed = b"let x = \"/* not a block */\";\nlet y = 1;";
        assert_eq!(count_rust(single), (1, 0, 0));
        assert_eq!(count_rust(followed), (2, 0, 0));
    }
}