use std::fs::File;
use std::io::{self, BufRead, BufReader};
use std::path::Path;
use anyhow::{Context, Result};
use crate::languages::{find_language_index_for_path, language_markers_bytes};
use crate::types::FileCounts;
use once_cell::sync::OnceCell;
/// Settings that decide how `analyze_file` reads a file from disk.
struct AnalyzerConfig {
/// When `true`, `analyze_file` never attempts a memory map and always
/// falls back to buffered reads.
no_mmap: bool,
/// Minimum file length (as reported by file metadata, in bytes) at which
/// `analyze_file` attempts a memory map instead of buffered reads.
mmap_threshold: u64,
}
/// Global analyzer settings, written by `set_analyzer_config` and read by
/// `analyze_file`. While this cell is unset, `analyze_file` never memory-maps.
static ANALYZER_CONFIG: OnceCell<AnalyzerConfig> = OnceCell::new();
/// Installs the global analyzer configuration.
///
/// * `no_mmap` - when `true`, files are never memory-mapped.
/// * `mmap_threshold` - minimum file length in bytes at which a memory map is
///   attempted; `None` selects a default of 4 MiB.
///
/// Only the first call has any effect: the configuration lives in a
/// `OnceCell`, and the result of a failed `set` on an already-initialised
/// cell is deliberately discarded.
pub fn set_analyzer_config(no_mmap: bool, mmap_threshold: Option<u64>) {
    const DEFAULT_MMAP_THRESHOLD: u64 = 4 * 1024 * 1024;

    let config = AnalyzerConfig {
        no_mmap,
        mmap_threshold: mmap_threshold.unwrap_or(DEFAULT_MMAP_THRESHOLD),
    };
    // Best effort: a later call must not panic or overwrite the first one.
    let _ = ANALYZER_CONFIG.set(config);
}
/// Opens `path` and counts its lines via `analyze_reader`.
///
/// A memory map is attempted only when a configuration has been installed,
/// that configuration does not disable mmap, the file's metadata can be read,
/// and the file is at least `mmap_threshold` bytes long. If any of those
/// conditions fails, or the map itself cannot be created, the file is read
/// through a `BufReader` instead.
///
/// # Errors
///
/// Returns an error (with the path attached as context) if the file cannot be
/// opened, or whatever error `analyze_reader` reports.
pub fn analyze_file(path: &Path) -> Result<FileCounts> {
    let file = File::open(path).with_context(|| format!("open file: {}", path.display()))?;

    let try_mmap = match ANALYZER_CONFIG.get() {
        Some(cfg) if !cfg.no_mmap => file
            .metadata()
            .map(|meta| meta.len() >= cfg.mmap_threshold)
            .unwrap_or(false),
        _ => false,
    };

    if try_mmap {
        // SAFETY: the mapping is read-only and is dropped before this function
        // returns. NOTE(review): soundness still relies on no other process
        // truncating or rewriting the file while it is mapped - confirm that
        // this is acceptable for the callers of this tool.
        if let Ok(mapped) = unsafe { memmap2::Mmap::map(&file) } {
            let mut cursor = std::io::Cursor::new(&mapped[..]);
            return analyze_reader(&mut cursor, path);
        }
    }

    let mut buffered = BufReader::new(file);
    analyze_reader(&mut buffered, path)
}
/// Counts total, blank, comment and code lines in the bytes produced by
/// `reader`.
///
/// `path_hint` is used to look up the language (and therefore its comment
/// markers) and to label read errors; it is never opened here. When no
/// language matches, no comment markers are used.
///
/// Lines are split on `\n`; a single trailing `\r` is stripped from each line
/// before classification. A final line without a terminating newline is still
/// counted. An empty input yields the counts returned by
/// `FileCounts::one_file()` unchanged.
///
/// The input is scanned directly inside the reader's own buffer via
/// `fill_buf`/`consume`, so an in-memory reader (for example a `Cursor` over a
/// memory map) is processed without copying it through a scratch buffer.
///
/// # Errors
///
/// Returns the underlying I/O error, with `read: <path_hint>` as context, if
/// the reader fails. `Interrupted` errors are retried.
pub fn analyze_reader<R: BufRead + ?Sized>(reader: &mut R, path_hint: &Path) -> Result<FileCounts> {
    type MarkersTuple = (&'static [Vec<u8>], Option<(&'static [u8], &'static [u8])>);

    let mut counts = FileCounts::one_file();
    let (line_markers, block_markers): MarkersTuple =
        if let Some(idx) = find_language_index_for_path(path_hint) {
            language_markers_bytes(idx)
        } else {
            (&[], None)
        };
    // Markers of the block comment that is currently open, if any.
    let mut in_block: Option<(Vec<u8>, Vec<u8>)> = None;
    // Bytes of a line whose terminating newline has not been seen yet.
    let mut pending: Vec<u8> = Vec::new();

    loop {
        let chunk = match reader.fill_buf() {
            Ok(chunk) => chunk,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e).with_context(|| format!("read: {}", path_hint.display())),
        };

        if chunk.is_empty() {
            // End of input: flush a last line that had no trailing newline.
            if !pending.is_empty() {
                process_line(
                    &mut counts,
                    line_markers,
                    &block_markers,
                    &mut in_block,
                    trim_cr(&pending),
                );
            }
            break;
        }

        let mut start = 0;
        for newline in memchr::memchr_iter(b'\n', chunk) {
            let piece = &chunk[start..newline];
            if pending.is_empty() {
                // Common case: the whole line sits inside this chunk.
                process_line(
                    &mut counts,
                    line_markers,
                    &block_markers,
                    &mut in_block,
                    trim_cr(piece),
                );
            } else {
                // The line started in an earlier chunk; stitch it together.
                pending.extend_from_slice(piece);
                process_line(
                    &mut counts,
                    line_markers,
                    &block_markers,
                    &mut in_block,
                    trim_cr(&pending),
                );
                pending.clear();
            }
            start = newline + 1;
        }
        // Keep the unterminated tail (possibly empty) for the next chunk.
        pending.extend_from_slice(&chunk[start..]);

        let consumed = chunk.len();
        reader.consume(consumed);
    }

    Ok(counts)
}
/// Convenience wrapper around `analyze_reader` for callers that hold the
/// reader by value: the reader is moved in, borrowed mutably for the call,
/// and dropped when this function returns.
///
/// `path_hint` and the returned result are passed through to and from
/// `analyze_reader` unchanged.
pub fn analyze_reader_owned<R: BufRead>(mut reader: R, path_hint: &Path) -> Result<FileCounts> {
analyze_reader(&mut reader, path_hint)
}
/// Returns `s` with every leading ASCII-whitespace byte removed.
///
/// "Whitespace" is whatever `u8::is_ascii_whitespace` accepts. If `s`
/// consists solely of whitespace (or is empty) the result is an empty slice.
fn trim_ascii_start(s: &[u8]) -> &[u8] {
    let first_kept = s
        .iter()
        .position(|byte| !byte.is_ascii_whitespace())
        .unwrap_or(s.len());
    &s[first_kept..]
}
/// Removes at most one trailing carriage return (`\r`) from `s`.
///
/// Slices that do not end in `\r` (including the empty slice) are returned
/// unchanged.
fn trim_cr(s: &[u8]) -> &[u8] {
    match s {
        [body @ .., b'\r'] => body,
        _ => s,
    }
}
/// Returns the byte offset of the first occurrence of `needle` in `hay`, or
/// `None` if it does not occur.
///
/// An empty `needle` matches at offset 0 without consulting the searcher.
fn find_bytes(hay: &[u8], needle: &[u8]) -> Option<usize> {
    match needle {
        [] => Some(0),
        _ => memchr::memmem::find(hay, needle),
    }
}
/// Classifies a single line and updates `counts` accordingly.
///
/// Every call increments `counts.total` and exactly one of `counts.blank`,
/// `counts.comment` or `counts.code`.
///
/// * `line_markers` - prefixes that start a line comment.
/// * `block_markers` - optional `(start, end)` block comment delimiters.
/// * `in_block` - `Some` while a block comment opened on an earlier line is
///   still open; updated here when a block opens or closes.
/// * `raw` - the line's bytes; leading ASCII whitespace is ignored.
///
/// Classification order:
/// 1. A line that is empty after trimming is blank.
/// 2. Inside an open block, a line is a comment unless non-whitespace follows
///    the end marker on the closing line, in which case it is code. The text
///    after the end marker is not rescanned for another block start.
/// 3. A line that begins with a line marker (and does not begin with the
///    block start marker) is a comment. This runs before the block search so
///    that a block delimiter mentioned inside a line comment, such as
///    `// see /* here`, neither turns the line into code nor opens a block.
/// 4. A line containing the block start marker is a comment only if nothing
///    but the comment is on it; an unterminated block stays open for the
///    following lines.
/// 5. Anything else is code.
fn process_line(
    counts: &mut FileCounts,
    line_markers: &[Vec<u8>],
    block_markers: &Option<(&[u8], &[u8])>,
    in_block: &mut Option<(Vec<u8>, Vec<u8>)>,
    raw: &[u8],
) {
    counts.total += 1;

    let trimmed = trim_ascii_start(raw);
    if trimmed.is_empty() {
        counts.blank += 1;
        return;
    }

    // Inside an open block comment only the end marker matters.
    if let Some((_, end)) = &*in_block {
        let tail_start = find_bytes(trimmed, end).map(|idx| idx + end.len());
        match tail_start {
            Some(tail_start) => {
                let only_comment = trim_ascii_start(&trimmed[tail_start..]).is_empty();
                *in_block = None;
                if only_comment {
                    counts.comment += 1;
                } else {
                    counts.code += 1;
                }
            }
            None => counts.comment += 1,
        }
        return;
    }

    // Some languages use a block opener that itself begins with the line
    // marker; such lines must keep going through the block logic below.
    let opens_with_block = matches!(
        block_markers,
        Some((start, _)) if trimmed.starts_with(start)
    );
    if !opens_with_block && line_markers.iter().any(|marker| trimmed.starts_with(marker)) {
        counts.comment += 1;
        return;
    }

    if let Some((start, end)) = block_markers {
        if let Some(start_idx) = find_bytes(trimmed, start) {
            let nothing_before = trim_ascii_start(&trimmed[..start_idx]).is_empty();
            let body_start = start_idx + start.len();

            let is_comment = match find_bytes(&trimmed[body_start..], end) {
                // Block opens and closes on this line.
                Some(end_rel) => {
                    let after = &trimmed[body_start + end_rel + end.len()..];
                    nothing_before && trim_ascii_start(after).is_empty()
                }
                // Block stays open past the end of this line.
                None => {
                    *in_block = Some((start.to_vec(), end.to_vec()));
                    nothing_before
                }
            };

            if is_comment {
                counts.comment += 1;
            } else {
                counts.code += 1;
            }
            return;
        }
    }

    counts.code += 1;
}
/// End-to-end checks: each test writes a small fixture to a temporary
/// directory and asserts the counts `analyze_file` reports for it.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::tempdir;

    /// Writes `contents` to a new file called `file_name` inside a fresh
    /// temporary directory and returns the result of analysing that file.
    /// The file name matters: its extension selects the language.
    fn counts_for(file_name: &str, contents: &str) -> FileCounts {
        let dir = tempdir().unwrap();
        let path = dir.path().join(file_name);
        let mut f = std::fs::File::create(&path).unwrap();
        f.write_all(contents.as_bytes()).unwrap();
        analyze_file(&path).unwrap()
    }

    #[test]
    fn rust_line_and_block_comments() {
        let counts = counts_for(
            "sample.rs",
            "// line\ncode\n/* block */\ncode /* mid */ more\n/* start\ncontinued\nend */\n",
        );
        assert_eq!(counts.total, 7);
        assert_eq!(counts.comment, 5);
        assert_eq!(counts.code, 2);
        assert_eq!(counts.blank, 0);
    }

    #[test]
    fn python_triple_quoted_strings_treated_as_code() {
        let counts = counts_for(
            "doc.py",
            "\n\n\"\"\"Module docstring\nspans lines\n\"\"\"\n\n# comment line\nprint(1)\n",
        );
        assert_eq!(counts.total, 8);
        assert_eq!(counts.blank, 3);
        assert_eq!(counts.comment, 1);
        assert_eq!(counts.code, 4);
    }

    #[test]
    fn html_block_comments() {
        let counts = counts_for(
            "page.html",
            "<!-- head -->\n<div>content</div>\n<!-- start\ncontinued\nend -->\n<div><!-- mid --></div>\n",
        );
        assert_eq!(counts.total, 6);
        assert_eq!(counts.comment, 4);
        assert_eq!(counts.code, 2);
        assert_eq!(counts.blank, 0);
    }

    #[test]
    fn markdown_html_comments() {
        let counts = counts_for(
            "README.md",
            "# Title\n\n<!-- intro -->\nSome text paragraph.\n<!-- start\nmultiline\nend -->\n",
        );
        assert_eq!(counts.total, 7);
        assert_eq!(counts.blank, 1);
        assert_eq!(counts.comment, 4);
        assert_eq!(counts.code, 2);
    }

    #[test]
    fn ini_line_comments() {
        let counts = counts_for(
            "settings.ini",
            "; leading comment\n# another comment\n\n[section]\nkey=value\nkey2 = value2 # trailing\n",
        );
        assert_eq!(counts.total, 6);
        assert_eq!(counts.blank, 1);
        assert_eq!(counts.comment, 2);
        assert_eq!(counts.code, 3);
    }

    #[test]
    fn svg_xml_comments() {
        let counts = counts_for(
            "icon.svg",
            "<?xml version=\"1.0\"?>\n<!-- single -->\n<svg>\n <!-- start\n mid\n end -->\n</svg>\n",
        );
        assert_eq!(counts.total, 7);
        assert_eq!(counts.comment, 4);
        assert_eq!(counts.blank, 0);
        assert_eq!(counts.code, 3);
    }
}