mod c;
mod cpp;
mod elixir;
mod go;
mod haskell;
pub mod import;
mod java;
mod python;
mod ruby;
mod rust;
mod scala;
mod typescript;
use std::collections::HashMap;
use anyhow::Result;
use rayon::prelude::*;
use sha2::{Digest, Sha256};
use crate::analysis::walker::{Language, WalkedFile};
use crate::store::record::{TodoComment, TodoKind};
pub use import::{ImportKind, ImportStatement};
/// Result of statically analysing one source file.
///
/// `StaticFileAnalysis::empty` yields a value with every collection empty,
/// every counter zero and both optional fields `None`; the per-language
/// parsers and `analyze_file_bytes` fill in the rest.
#[derive(Debug, Clone)]
pub struct StaticFileAnalysis {
/// Path of the analysed file; `empty` copies it from `WalkedFile::rel_path`.
pub path: String,
/// Language of the file; `empty` copies it from `WalkedFile::language`.
pub language: Language,
/// Entry points reported by the language parser.
/// NOTE(review): what counts as an entry point is decided in the
/// per-language modules, which are not visible here.
pub entry_points: Vec<String>,
/// Exported type names reported by the language parser (criteria live in
/// the per-language modules).
pub exported_types: Vec<String>,
/// Import statements found in the file.
pub imports: Vec<ImportStatement>,
/// Marker comments found in the file — presumably produced via
/// `extract_todo`; confirm against the per-language parsers.
pub todos: Vec<TodoComment>,
/// Counter filled in by the language parser — presumably occurrences of
/// `unsafe`; confirm.
pub unsafe_count: u32,
/// Counter filled in by the language parser — presumably `unwrap`-style
/// calls; confirm.
pub unwrap_count: u32,
/// Counter filled in by the language parser — presumably panic sites;
/// confirm.
pub panic_count: u32,
/// Counter filled in by the language parser — presumably branching
/// constructs; confirm.
pub branch_count: u32,
/// Module-level documentation, when the parser found any.
pub module_doc: Option<String>,
/// Lowercase hex SHA-256 of the raw file bytes, set by
/// `analyze_file_bytes`; `None` when the bytes were never analysed.
pub content_hash: Option<String>,
/// Line count of the raw file bytes as computed by `count_lines`;
/// `0` when the bytes were never analysed.
pub line_count: u32,
}
impl StaticFileAnalysis {
    /// Builds a placeholder analysis for `file`.
    ///
    /// Only the identifying fields (`path`, `language`) are taken from the
    /// walked file; every list is empty, every counter is zero and the
    /// optional fields are `None`. Used whenever a file is skipped, cannot
    /// be read, or fails to parse.
    pub(crate) fn empty(file: &WalkedFile) -> Self {
        let path = file.rel_path.clone();
        let language = file.language;
        Self {
            // Identity of the file.
            path,
            language,
            // Nothing was extracted.
            imports: Vec::new(),
            todos: Vec::new(),
            entry_points: Vec::new(),
            exported_types: Vec::new(),
            module_doc: None,
            // No metrics were gathered.
            branch_count: 0,
            panic_count: 0,
            unwrap_count: 0,
            unsafe_count: 0,
            // The bytes were never inspected.
            line_count: 0,
            content_hash: None,
        }
    }
}
/// Analyses a single walked file.
///
/// Files in an unsupported language are answered with an empty analysis
/// without touching the disk. Files that cannot be read also yield an empty
/// analysis (the read failure is logged by `read_source_bytes`).
///
/// # Errors
///
/// Propagates any error returned by `analyze_file_bytes`.
pub fn parse_file(file: &WalkedFile) -> Result<StaticFileAnalysis> {
    if is_parseable_language(file.language) {
        if let Some(contents) = read_source_bytes(file) {
            return analyze_file_bytes(file, &contents);
        }
    }
    // Unsupported language or unreadable file: fall back to the placeholder.
    Ok(StaticFileAnalysis::empty(file))
}
pub fn parse_files_parallel(files: &[WalkedFile]) -> Vec<StaticFileAnalysis> {
files
.par_iter()
.map(|f| {
parse_file(f).unwrap_or_else(|e| {
tracing::warn!("parser: unexpected error on {}: {e}", f.rel_path);
StaticFileAnalysis::empty(f)
})
})
.collect()
}
/// Outcome of `hash_and_parse_parallel`.
pub struct HashParseOutput {
/// Files that were treated as changed and therefore (re)analysed.
pub parsed_files: Vec<WalkedFile>,
/// One analysis per entry of `parsed_files`, pushed in the same order.
pub analyses: Vec<StaticFileAnalysis>,
/// `rel_path` -> `mtime_secs` for every file in `parsed_files`.
pub new_mtimes: HashMap<String, u64>,
/// Number of entries in `parsed_files`.
pub parse_count: usize,
/// Number of files skipped because their stored mtime matched.
pub skipped_count: usize,
}
pub fn hash_and_parse_parallel(
files: &[WalkedFile],
stored_mtimes: &HashMap<String, u64>,
) -> HashParseOutput {
enum Slot {
Changed(Box<(WalkedFile, StaticFileAnalysis)>),
Unchanged,
}
let slots: Vec<Option<Slot>> = files
.par_iter()
.map(|f| {
if f.mtime_secs != 0 && stored_mtimes.get(&f.rel_path) == Some(&f.mtime_secs) {
return Some(Slot::Unchanged);
}
if !is_parseable_language(f.language) {
return Some(Slot::Changed(Box::new((
f.clone(),
StaticFileAnalysis::empty(f),
))));
}
let bytes = match std::fs::read(&f.abs_path) {
Ok(b) => b,
Err(_) => return None, };
let analysis = analyze_file_bytes(f, &bytes).unwrap_or_else(|e| {
tracing::warn!("parser: error on {}: {e}", f.rel_path);
StaticFileAnalysis::empty(f)
});
Some(Slot::Changed(Box::new((f.clone(), analysis))))
})
.collect();
let mut parsed_files = Vec::new();
let mut analyses = Vec::new();
let mut new_mtimes = HashMap::new();
let mut skipped_count = 0usize;
for slot in slots.into_iter().flatten() {
match slot {
Slot::Changed(boxed) => {
let (file, analysis) = *boxed;
new_mtimes.insert(file.rel_path.clone(), file.mtime_secs);
parsed_files.push(file);
analyses.push(analysis);
}
Slot::Unchanged => skipped_count += 1,
}
}
let parse_count = parsed_files.len();
HashParseOutput {
parsed_files,
analyses,
new_mtimes,
parse_count,
skipped_count,
}
}
/// Reports whether `language` has a dedicated parser in this module.
///
/// The set listed here must stay in sync with the arms of
/// `parse_file_from_source`; any other language is answered with an empty
/// analysis by the callers.
fn is_parseable_language(language: Language) -> bool {
    matches!(
        language,
        // C family
        Language::C
            | Language::Cpp
            // JVM
            | Language::Java
            | Language::Scala
            // Scripting
            | Language::JavaScript
            | Language::TypeScript
            | Language::Python
            | Language::Ruby
            // Functional
            | Language::Elixir
            | Language::Haskell
            // Systems
            | Language::Go
            | Language::Rust
    )
}
/// Analyses `bytes` as the contents of `file`.
///
/// The bytes are decoded lossily as UTF-8 for parsing, while the content
/// hash (lowercase hex SHA-256) and the line count are computed over the raw
/// bytes.
///
/// # Errors
///
/// Returns the language parser's error unchanged; in that case no hash or
/// line count is computed.
pub(crate) fn analyze_file_bytes(file: &WalkedFile, bytes: &[u8]) -> Result<StaticFileAnalysis> {
    let text = String::from_utf8_lossy(bytes);
    parse_file_from_source(file, &text).map(|mut parsed| {
        parsed.content_hash = Some(format!("{:x}", Sha256::digest(bytes)));
        parsed.line_count = count_lines(bytes);
        parsed
    })
}
/// Dispatches `source` to the parser matching `file.language`.
///
/// Languages without a parser produce an empty analysis. JavaScript shares
/// the TypeScript parser.
fn parse_file_from_source(file: &WalkedFile, source: &str) -> Result<StaticFileAnalysis> {
    let language = file.language;
    match language {
        Language::C => c::parse_c(file, source),
        Language::Cpp => cpp::parse_cpp(file, source),
        Language::Elixir => elixir::parse_elixir(file, source),
        Language::Go => go::parse_go(file, source),
        Language::Haskell => haskell::parse_haskell(file, source),
        Language::Java => java::parse_java(file, source),
        Language::JavaScript | Language::TypeScript => typescript::parse_typescript(file, source),
        Language::Python => python::parse_python(file, source),
        Language::Ruby => ruby::parse_ruby(file, source),
        Language::Rust => rust::parse_rust(file, source),
        Language::Scala => scala::parse_scala(file, source),
        // Anything else has no parser; report the file with no findings.
        _ => Ok(StaticFileAnalysis::empty(file)),
    }
}
/// Reads the whole file at `file.abs_path`.
///
/// Returns `None` when the read fails, after logging a warning that names
/// the file's relative path and the I/O error.
fn read_source_bytes(file: &WalkedFile) -> Option<Vec<u8>> {
    let outcome = std::fs::read(&file.abs_path);
    if let Err(e) = &outcome {
        tracing::warn!("parser: cannot read {}: {e}", file.rel_path);
    }
    outcome.ok()
}
/// Counts the lines in `bytes`.
///
/// Every `\n` terminates a line; a final run of bytes without a trailing
/// newline counts as one more line. Empty input has zero lines.
fn count_lines(bytes: &[u8]) -> u32 {
    match bytes.last() {
        // No bytes at all: no lines.
        None => 0,
        Some(&tail) => {
            let terminated = bytes.iter().filter(|&&byte| byte == b'\n').count() as u32;
            if tail == b'\n' {
                terminated
            } else {
                // The unterminated last line still counts.
                terminated + 1
            }
        }
    }
}
/// Turns a raw comment into a [`TodoComment`] when it starts with a known
/// marker.
///
/// Leading `/`, `*` and `#` characters and trailing `/` and `*` characters
/// are stripped (in that order), then surrounding whitespace is trimmed.
/// The remaining text is classified by its prefix, compared ASCII
/// case-insensitively:
///
/// * `TODO` -> `TodoKind::Todo`
/// * `FIXME` -> `TodoKind::Fixme`
/// * `HACK` -> `TodoKind::Hack`
/// * `DEPRECATED` -> `TodoKind::Deprecated`
/// * `NOTE`, `@ts-`, or text containing `type: ignore` anywhere
///   (case-sensitive) -> `TodoKind::Note`
///
/// Returns `None` when no marker matches. `line` is stored unchanged.
pub(crate) fn extract_todo(comment: &str, line: u32) -> Option<TodoComment> {
    /// ASCII case-insensitive "starts with" on raw bytes, so that a
    /// multi-byte character near the prefix boundary can never cause a
    /// slicing panic.
    fn leads_with(text: &[u8], marker: &[u8]) -> bool {
        text.len() >= marker.len() && text[..marker.len()].eq_ignore_ascii_case(marker)
    }

    let stripped = comment
        .trim_start_matches('/')
        .trim_start_matches('*')
        .trim_start_matches('#')
        .trim_end_matches('/')
        .trim_end_matches('*')
        .trim();
    let raw = stripped.as_bytes();

    // The prefix markers are mutually exclusive; only the `type: ignore`
    // substring test can overlap with them, so it is evaluated last.
    let kind = if leads_with(raw, b"TODO") {
        TodoKind::Todo
    } else if leads_with(raw, b"FIXME") {
        TodoKind::Fixme
    } else if leads_with(raw, b"HACK") {
        TodoKind::Hack
    } else if leads_with(raw, b"DEPRECATED") {
        TodoKind::Deprecated
    } else if leads_with(raw, b"NOTE")
        || leads_with(raw, b"@TS-")
        || stripped.contains("type: ignore")
    {
        TodoKind::Note
    } else {
        return None;
    };

    Some(TodoComment {
        text: stripped.to_owned(),
        line,
        kind,
    })
}
/// Collapses every run of whitespace in `s` into a single space and removes
/// leading and trailing whitespace.
///
/// Whitespace is judged per `char::is_whitespace`, so newlines and tabs are
/// folded together with ordinary spaces.
pub(crate) fn normalize_doc(s: &str) -> String {
    let mut collapsed = String::with_capacity(s.len());
    for word in s.split_whitespace() {
        // Separate words with exactly one space, never before the first one.
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
// Unit tests for the marker-comment extractor and the file-level parse entry
// points. The file-based tests write small Rust sources into a temporary
// directory (via the `tempfile` crate) and parse them from disk.
#[cfg(test)]
mod tests {
use super::*;
use std::path::PathBuf;
// A comment without any recognised marker yields no TodoComment.
#[test]
fn extract_todo_none_for_plain_comment() {
assert!(extract_todo("// nothing special", 1).is_none());
}
// `//` line comments are stripped and the line number is passed through.
#[test]
fn extract_todo_rust_line_comment() {
let t = extract_todo("// TODO: do something", 3).unwrap();
assert_eq!(t.kind, TodoKind::Todo);
assert_eq!(t.line, 3);
}
// Both the opening `/*` and the closing `*/` are stripped.
#[test]
fn extract_todo_rust_block_comment() {
let t = extract_todo("/* FIXME: clean up */", 10).unwrap();
assert_eq!(t.kind, TodoKind::Fixme);
}
// `///` doc comments are handled like ordinary line comments.
#[test]
fn extract_todo_rust_doc_comment() {
let t = extract_todo("/// TODO: document", 1).unwrap();
assert_eq!(t.kind, TodoKind::Todo);
}
// `#` comment prefixes are stripped as well.
#[test]
fn extract_todo_python_hash_comment() {
let t = extract_todo("# TODO: fix this", 5).unwrap();
assert_eq!(t.kind, TodoKind::Todo);
}
// TypeScript suppression directives (`@ts-...`) are recorded as notes.
#[test]
fn extract_todo_ts_ignore() {
let t = extract_todo("// @ts-ignore", 1).unwrap();
assert_eq!(t.kind, TodoKind::Note);
}
#[test]
fn extract_todo_ts_expect_error() {
let t = extract_todo("// @ts-expect-error", 1).unwrap();
assert_eq!(t.kind, TodoKind::Note);
}
// `type: ignore` comments are recorded as notes, with or without an error
// code suffix.
#[test]
fn extract_todo_python_type_ignore() {
let t = extract_todo("# type: ignore", 1).unwrap();
assert_eq!(t.kind, TodoKind::Note);
}
#[test]
fn extract_todo_python_type_ignore_with_code() {
let t = extract_todo("# type: ignore[attr-defined]", 1).unwrap();
assert_eq!(t.kind, TodoKind::Note);
}
// Marker prefixes are matched ASCII case-insensitively.
#[test]
fn extract_todo_case_insensitive() {
let t = extract_todo("// todo: lowercase", 1).unwrap();
assert_eq!(t.kind, TodoKind::Todo);
}
// An unsupported language must produce an empty analysis even though the
// path does not exist.
// NOTE(review): the assertion would also hold if the file were read and the
// read failed, so this does not strictly prove that no disk access happened.
#[test]
fn unsupported_language_skipped_without_disk_read() {
let f = WalkedFile {
abs_path: PathBuf::from("/nonexistent/file.txt"),
rel_path: "notes.txt".to_owned(),
language: Language::Unknown,
size_bytes: 0,
mtime_secs: 0,
};
let a = parse_file(&f).unwrap();
assert!(a.entry_points.is_empty());
}
// Results of the parallel parse must come back in input order.
// NOTE(review): `size_bytes: 20` does not match the 14 bytes written; no code
// visible in this file reads `size_bytes`, so this looks harmless — confirm.
#[test]
fn parse_files_parallel_preserves_order() {
use tempfile::TempDir;
let dir = TempDir::new().unwrap();
let files: Vec<WalkedFile> = (0..3)
.map(|i| {
let rel = format!("f{i}.rs");
let abs = dir.path().join(&rel);
std::fs::write(&abs, format!("pub fn f{i}() {{}}")).unwrap();
WalkedFile {
abs_path: abs,
rel_path: rel,
language: Language::Rust,
size_bytes: 20,
mtime_secs: 0,
}
})
.collect();
let results = parse_files_parallel(&files);
assert_eq!(results[0].path, "f0.rs");
assert_eq!(results[1].path, "f1.rs");
assert_eq!(results[2].path, "f2.rs");
}
// A file read from disk gets a content hash, and one newline-terminated line
// counts as one line.
// NOTE(review): the written content is 14 bytes, not 13 — see the note on
// `size_bytes` above.
#[test]
fn parse_file_populates_hash_and_line_count() {
use tempfile::TempDir;
let dir = TempDir::new().unwrap();
let abs = dir.path().join("f.rs");
std::fs::write(&abs, "pub fn f() {}\n").unwrap();
let file = WalkedFile {
abs_path: abs,
rel_path: "f.rs".to_string(),
language: Language::Rust,
size_bytes: 13,
mtime_secs: 0,
};
let analysis = parse_file(&file).unwrap();
assert!(analysis.content_hash.is_some());
assert_eq!(analysis.line_count, 1);
}
// A single line without a trailing newline still counts as one line.
// NOTE(review): the written content is 13 bytes, not 12.
#[test]
fn parse_file_counts_single_line_without_trailing_newline() {
use tempfile::TempDir;
let dir = TempDir::new().unwrap();
let abs = dir.path().join("f.rs");
std::fs::write(&abs, "pub fn f() {}").unwrap();
let file = WalkedFile {
abs_path: abs,
rel_path: "f.rs".to_string(),
language: Language::Rust,
size_bytes: 12,
mtime_secs: 0,
};
let analysis = parse_file(&file).unwrap();
assert_eq!(analysis.line_count, 1);
}
// An unterminated final line is counted in addition to the terminated ones.
#[test]
fn parse_file_counts_multiple_lines_without_trailing_newline() {
use tempfile::TempDir;
let dir = TempDir::new().unwrap();
let abs = dir.path().join("f.rs");
std::fs::write(&abs, "pub fn f() {}\npub fn g() {}").unwrap();
let file = WalkedFile {
abs_path: abs,
rel_path: "f.rs".to_string(),
language: Language::Rust,
size_bytes: 27,
mtime_secs: 0,
};
let analysis = parse_file(&file).unwrap();
assert_eq!(analysis.line_count, 2);
}
}