use std::collections::{HashMap, HashSet};
use std::path::{Path, PathBuf};
use std::process::Command;
use serde::Deserialize;
use super::types::{CrawledPackage, CrawlerOptions};
/// Batch size used when this file's tests build a `CrawlerOptions` value.
/// Only compiled for test builds.
#[cfg(test)]
const DEFAULT_BATCH_SIZE: usize = 100;
/// Directory names that the workspace walk refuses to descend into while
/// searching for nested `node_modules` folders.
const SKIP_DIRS: &[&str] = &[
"dist",
"build",
"coverage",
"tmp",
"temp",
"__pycache__",
"vendor",
];
/// The two `package.json` fields this module reads. Both are optional so a
/// manifest lacking either one still deserializes; callers reject it later.
#[derive(Deserialize)]
struct PackageJsonPartial {
// Package name as written in the manifest (may include an `@scope/` prefix).
name: Option<String>,
// Version string as written in the manifest.
version: Option<String>,
}
/// Loads the manifest at `pkg_json_path` and extracts its `(name, version)` pair.
///
/// Returns `None` when the file cannot be read, when it does not parse as
/// JSON of the expected shape, or when either field is absent or empty.
pub async fn read_package_json(pkg_json_path: &Path) -> Option<(String, String)> {
    let raw = tokio::fs::read_to_string(pkg_json_path).await.ok()?;
    let manifest = serde_json::from_str::<PackageJsonPartial>(&raw).ok()?;
    match (manifest.name, manifest.version) {
        // Both fields must be present and non-empty to be usable.
        (Some(name), Some(version)) if !name.is_empty() && !version.is_empty() => {
            Some((name, version))
        }
        _ => None,
    }
}
/// Splits an npm package name into `(namespace, bare_name)`.
///
/// A name that starts with `@` and contains a `/` is split at the first `/`;
/// the namespace keeps its leading `@`. Every other name is returned whole
/// with no namespace.
pub fn parse_package_name(full_name: &str) -> (Option<String>, String) {
    let scoped = if full_name.starts_with('@') {
        full_name.split_once('/')
    } else {
        None
    };
    match scoped {
        Some((scope, bare)) => (Some(scope.to_string()), bare.to_string()),
        None => (None, full_name.to_string()),
    }
}
/// Formats an npm package URL of the form `pkg:npm/[namespace/]name@version`.
///
/// The namespace, when given, is inserted verbatim ahead of the name.
pub fn build_npm_purl(namespace: Option<&str>, name: &str, version: &str) -> String {
    let qualified_name = namespace.map_or_else(|| name.to_string(), |ns| format!("{ns}/{name}"));
    format!("pkg:npm/{qualified_name}@{version}")
}
/// Runs `npm root -g` and returns its trimmed standard output, which this
/// module treats as a global `node_modules` directory.
///
/// # Errors
///
/// Returns a message when the command cannot be spawned, exits with a
/// non-zero status, or prints nothing usable.
pub fn get_npm_global_prefix() -> Result<String, String> {
    let output = Command::new("npm")
        .args(["root", "-g"])
        .stdin(std::process::Stdio::null())
        .stdout(std::process::Stdio::piped())
        .stderr(std::process::Stdio::piped())
        .output()
        .map_err(|e| format!("Failed to run `npm root -g`: {e}"))?;
    if !output.status.success() {
        return Err(
            "Failed to determine npm global prefix. Ensure npm is installed and in PATH."
                .to_string(),
        );
    }
    let root = String::from_utf8_lossy(&output.stdout).trim().to_string();
    // An empty path is never a usable root (joining onto it yields a relative
    // path), so report it as a failure like the pnpm/yarn/bun lookups do.
    if root.is_empty() {
        return Err("`npm root -g` succeeded but printed an empty path.".to_string());
    }
    Ok(root)
}
/// Runs `yarn global dir` and returns `<printed dir>/node_modules`.
///
/// Returns `None` when the command cannot be spawned, exits with a non-zero
/// status, or prints only whitespace.
pub fn get_yarn_global_prefix() -> Option<String> {
    use std::process::Stdio;

    let run = Command::new("yarn")
        .args(["global", "dir"])
        .stdin(Stdio::null())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .output()
        .ok()
        .filter(|run| run.status.success())?;
    let stdout = String::from_utf8_lossy(&run.stdout);
    let yarn_dir = stdout.trim();
    if yarn_dir.is_empty() {
        return None;
    }
    let modules_dir = Path::new(yarn_dir).join("node_modules");
    Some(modules_dir.to_string_lossy().into_owned())
}
/// Runs `pnpm root -g` and returns its trimmed standard output.
///
/// Returns `None` when the command cannot be spawned, exits with a non-zero
/// status, or prints only whitespace.
pub fn get_pnpm_global_prefix() -> Option<String> {
    use std::process::Stdio;

    let outcome = Command::new("pnpm")
        .args(["root", "-g"])
        .stdin(Stdio::null())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .output();
    match outcome {
        Ok(run) if run.status.success() => {
            let root = String::from_utf8_lossy(&run.stdout).trim().to_string();
            if root.is_empty() {
                None
            } else {
                Some(root)
            }
        }
        _ => None,
    }
}
/// Runs `bun pm bin -g` and derives a global `node_modules` path from it.
///
/// The printed bin directory's parent is taken as the bun root, and
/// `install/global/node_modules` is appended to it. Returns `None` when the
/// command cannot be spawned, exits with a non-zero status, prints only
/// whitespace, or the printed path has no parent.
pub fn get_bun_global_prefix() -> Option<String> {
    use std::process::Stdio;

    let run = Command::new("bun")
        .args(["pm", "bin", "-g"])
        .stdin(Stdio::null())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .output()
        .ok()?;
    if !run.status.success() {
        return None;
    }
    let stdout = String::from_utf8_lossy(&run.stdout);
    let bin_dir = stdout.trim();
    if bin_dir.is_empty() {
        return None;
    }
    let mut modules_dir = Path::new(bin_dir).parent()?.to_path_buf();
    for part in &["install", "global", "node_modules"] {
        modules_dir.push(part);
    }
    Some(modules_dir.to_string_lossy().into_owned())
}
/// Expands a path pattern beneath `base` into the directories that exist.
///
/// Each element of `segments` is either a literal path component or `"*"`,
/// which stands for every immediate subdirectory (per the entry's own
/// metadata). A path is returned only if it is a directory at every step;
/// an empty `segments` yields `base` itself. Unreadable directories and
/// entries contribute nothing.
fn find_node_dirs_sync(base: &Path, segments: &[&str]) -> Vec<PathBuf> {
    if !base.is_dir() {
        return Vec::new();
    }
    let (head, tail) = match segments.split_first() {
        Some((head, tail)) => (*head, tail),
        // Pattern exhausted: `base` is a match.
        None => return vec![base.to_path_buf()],
    };
    if head != "*" {
        return find_node_dirs_sync(&base.join(head), tail);
    }
    match std::fs::read_dir(base) {
        Ok(listing) => listing
            .flatten()
            .filter(|child| matches!(child.metadata(), Ok(meta) if meta.is_dir()))
            .flat_map(|child| find_node_dirs_sync(&child.path(), tail))
            .collect(),
        Err(_) => Vec::new(),
    }
}
/// Stateless crawler that discovers npm packages under `node_modules` directories.
pub struct NpmCrawler;
impl NpmCrawler {
pub fn new() -> Self {
Self
}
pub async fn get_node_modules_paths(&self, options: &CrawlerOptions) -> Result<Vec<PathBuf>, std::io::Error> {
if options.global || options.global_prefix.is_some() {
if let Some(ref custom) = options.global_prefix {
return Ok(vec![custom.clone()]);
}
return Ok(self.get_global_node_modules_paths());
}
Ok(self.find_local_node_modules_dirs(&options.cwd).await)
}
/// Scans every `node_modules` directory selected by `options` and returns
/// the packages found.
///
/// One `seen` set is shared across all directories, so a given purl is
/// reported at most once. A failure to resolve the directories is treated
/// as "nothing to scan".
pub async fn crawl_all(&self, options: &CrawlerOptions) -> Vec<CrawledPackage> {
    let roots = match self.get_node_modules_paths(options).await {
        Ok(roots) => roots,
        Err(_) => Vec::new(),
    };
    let mut seen_purls = HashSet::new();
    let mut collected = Vec::new();
    for root in roots.iter() {
        collected.extend(self.scan_node_modules(root, &mut seen_purls).await);
    }
    collected
}
/// Looks up specific packages, identified by purl, directly inside one
/// `node_modules` directory.
///
/// For each purl that parses as an npm purl, the manifest at
/// `<node_modules_path>/[namespace/]name/package.json` is read; the package
/// is reported when the manifest's version equals the purl's version. The
/// name recorded in the manifest is not compared.
///
/// The returned map is keyed by the purl exactly as the caller supplied it,
/// and the same string is stored in `CrawledPackage::purl`. This lets purls
/// that carry qualifiers (`?...`) be found: the parser drops qualifiers, so a
/// purl rebuilt from the parsed parts would not equal what the caller passed.
///
/// Purls that do not parse, or whose package is missing or at a different
/// version, are absent from the result. Currently always returns `Ok`.
pub async fn find_by_purls(
    &self,
    node_modules_path: &Path,
    purls: &[String],
) -> Result<HashMap<String, CrawledPackage>, std::io::Error> {
    let mut result: HashMap<String, CrawledPackage> = HashMap::new();
    for purl in purls {
        let (namespace, name, wanted_version) = match Self::parse_purl_components(purl) {
            Some(parts) => parts,
            None => continue,
        };
        // Scoped packages live one level down, at `node_modules/@scope/name`.
        let dir_key = match &namespace {
            Some(scope) => format!("{scope}/{name}"),
            None => name.clone(),
        };
        let pkg_path = node_modules_path.join(&dir_key);
        let pkg_json_path = pkg_path.join("package.json");
        match read_package_json(&pkg_json_path).await {
            Some((_, version)) if version == wanted_version => {
                result.insert(
                    purl.clone(),
                    CrawledPackage {
                        name,
                        version,
                        namespace,
                        purl: purl.clone(),
                        path: pkg_path,
                    },
                );
            }
            _ => {}
        }
    }
    Ok(result)
}
fn get_global_node_modules_paths(&self) -> Vec<PathBuf> {
let mut seen = HashSet::new();
let mut paths = Vec::new();
let mut add = |p: PathBuf| {
if p.is_dir() && seen.insert(p.clone()) {
paths.push(p);
}
};
if let Ok(npm_path) = get_npm_global_prefix() {
add(PathBuf::from(npm_path));
}
if let Some(pnpm_path) = get_pnpm_global_prefix() {
add(PathBuf::from(pnpm_path));
}
if let Some(yarn_path) = get_yarn_global_prefix() {
add(PathBuf::from(yarn_path));
}
if let Some(bun_path) = get_bun_global_prefix() {
add(PathBuf::from(bun_path));
}
if cfg!(target_os = "macos") {
let home = std::env::var("HOME").unwrap_or_default();
add(PathBuf::from("/opt/homebrew/lib/node_modules"));
add(PathBuf::from("/usr/local/lib/node_modules"));
if !home.is_empty() {
for p in find_node_dirs_sync(
&PathBuf::from(&home).join(".nvm/versions/node"),
&["*", "lib", "node_modules"],
) {
add(p);
}
for p in find_node_dirs_sync(
&PathBuf::from(&home).join(".volta/tools/image/node"),
&["*", "lib", "node_modules"],
) {
add(p);
}
for p in find_node_dirs_sync(
&PathBuf::from(&home).join(".fnm/node-versions"),
&["*", "installation", "lib", "node_modules"],
) {
add(p);
}
}
}
paths
}
/// Lists the `node_modules` directories of a project rooted at `start_path`.
///
/// `start_path/node_modules` comes first when it exists, followed by any
/// found by walking the subdirectories of `start_path`.
async fn find_local_node_modules_dirs(&self, start_path: &Path) -> Vec<PathBuf> {
    let mut found = Vec::new();
    let root_modules = start_path.join("node_modules");
    if is_dir(&root_modules).await {
        found.push(root_modules);
    }
    Self::find_workspace_node_modules(start_path, &mut found).await;
    found
}
/// Walks the subdirectories of `dir` recursively and appends every
/// `<subdir>/node_modules` directory to `results`.
///
/// Only real directories are visited. Entries named `node_modules`,
/// dot-entries and names listed in `SKIP_DIRS` are not entered. Unreadable
/// directories are silently skipped. The future is boxed because the
/// function calls itself.
fn find_workspace_node_modules<'a>(
    dir: &'a Path,
    results: &'a mut Vec<PathBuf>,
) -> std::pin::Pin<Box<dyn std::future::Future<Output = ()> + 'a>> {
    Box::pin(async move {
        let mut reader = match tokio::fs::read_dir(dir).await {
            Ok(reader) => reader,
            Err(_) => return,
        };
        // Read the whole listing up front, then process it.
        let mut children = Vec::new();
        while let Ok(Some(child)) = reader.next_entry().await {
            children.push(child);
        }
        for child in children {
            let is_real_dir = matches!(child.file_type().await, Ok(kind) if kind.is_dir());
            if !is_real_dir {
                continue;
            }
            let raw_name = child.file_name();
            let dir_name = raw_name.to_string_lossy();
            let excluded = dir_name == "node_modules"
                || dir_name.starts_with('.')
                || SKIP_DIRS.contains(&dir_name.as_ref());
            if excluded {
                continue;
            }
            let child_dir = dir.join(&raw_name);
            let child_modules = child_dir.join("node_modules");
            if is_dir(&child_modules).await {
                results.push(child_modules);
            }
            Self::find_workspace_node_modules(&child_dir, results).await;
        }
    })
}
/// Scans the top level of one `node_modules` directory for packages.
///
/// Dot-entries and entries named `node_modules` are skipped, as is anything
/// that is neither a directory nor a symlink. `@scope` entries are handed to
/// `scan_scoped_packages`. Every other entry is checked as a package and,
/// if it is a real directory, its own nested `node_modules` is scanned too.
/// `seen` suppresses purls that were already reported. An unreadable
/// directory yields an empty list.
async fn scan_node_modules(
    &self,
    node_modules_path: &Path,
    seen: &mut HashSet<String>,
) -> Vec<CrawledPackage> {
    let mut found = Vec::new();
    let mut reader = match tokio::fs::read_dir(node_modules_path).await {
        Ok(reader) => reader,
        Err(_) => return found,
    };
    let mut children = Vec::new();
    while let Ok(Some(child)) = reader.next_entry().await {
        children.push(child);
    }
    for child in children {
        let child_name = child.file_name().to_string_lossy().into_owned();
        if child_name == "node_modules" || child_name.starts_with('.') {
            continue;
        }
        let kind = match child.file_type().await {
            Ok(kind) => kind,
            Err(_) => continue,
        };
        let real_dir = kind.is_dir();
        if !(real_dir || kind.is_symlink()) {
            continue;
        }
        let child_path = node_modules_path.join(&child_name);
        if child_name.starts_with('@') {
            found.extend(Self::scan_scoped_packages(&child_path, seen).await);
            continue;
        }
        found.extend(Self::check_package(&child_path, seen).await);
        // Symlinked packages are reported but not descended into.
        if real_dir {
            found.extend(Self::scan_nested_node_modules(&child_path, seen).await);
        }
    }
    found
}
/// Scans the packages that sit directly inside an `@scope` directory.
///
/// Dot-entries are skipped, as is anything that is neither a directory nor
/// a symlink. Each remaining entry is checked as a package, and real
/// directories additionally have their nested `node_modules` scanned. An
/// unreadable scope directory yields an empty list. The future is boxed
/// because it takes part in a recursive call chain.
fn scan_scoped_packages<'a>(
    scope_path: &'a Path,
    seen: &'a mut HashSet<String>,
) -> std::pin::Pin<Box<dyn std::future::Future<Output = Vec<CrawledPackage>> + 'a>> {
    Box::pin(async move {
        let mut found = Vec::new();
        let mut reader = match tokio::fs::read_dir(scope_path).await {
            Ok(reader) => reader,
            Err(_) => return found,
        };
        let mut children = Vec::new();
        while let Ok(Some(child)) = reader.next_entry().await {
            children.push(child);
        }
        for child in children {
            let child_name = child.file_name().to_string_lossy().into_owned();
            if child_name.starts_with('.') {
                continue;
            }
            let kind = match child.file_type().await {
                Ok(kind) => kind,
                Err(_) => continue,
            };
            let real_dir = kind.is_dir();
            if !(real_dir || kind.is_symlink()) {
                continue;
            }
            let package_dir = scope_path.join(&child_name);
            found.extend(Self::check_package(&package_dir, seen).await);
            // Symlinked packages are reported but not descended into.
            if real_dir {
                found.extend(Self::scan_nested_node_modules(&package_dir, seen).await);
            }
        }
        found
    })
}
/// Recursively collects packages from the `node_modules` directory nested
/// inside `pkg_path`.
///
/// Dot-entries and entries named `node_modules` are skipped, as is anything
/// that is neither a directory nor a symlink. `@scope` entries are handed to
/// `scan_scoped_packages`. A symlinked package is still reported through
/// `check_package`, but the walk only descends into real directories. That
/// is the same rule `scan_node_modules` and `scan_scoped_packages` apply,
/// and it keeps link targets (and link cycles) from being walked again.
///
/// Returns an empty list when `pkg_path/node_modules` cannot be read. The
/// future is boxed because the function calls itself.
fn scan_nested_node_modules<'a>(
    pkg_path: &'a Path,
    seen: &'a mut HashSet<String>,
) -> std::pin::Pin<Box<dyn std::future::Future<Output = Vec<CrawledPackage>> + 'a>> {
    Box::pin(async move {
        let nested_nm = pkg_path.join("node_modules");
        let mut entries = match tokio::fs::read_dir(&nested_nm).await {
            Ok(rd) => rd,
            Err(_) => return Vec::new(),
        };
        let mut results = Vec::new();
        let mut entry_list = Vec::new();
        while let Ok(Some(entry)) = entries.next_entry().await {
            entry_list.push(entry);
        }
        for entry in entry_list {
            let name = entry.file_name();
            let name_str = name.to_string_lossy().to_string();
            if name_str.starts_with('.') || name_str == "node_modules" {
                continue;
            }
            let file_type = match entry.file_type().await {
                Ok(ft) => ft,
                Err(_) => continue,
            };
            if !file_type.is_dir() && !file_type.is_symlink() {
                continue;
            }
            let entry_path = nested_nm.join(&name_str);
            if name_str.starts_with('@') {
                let scoped = Self::scan_scoped_packages(&entry_path, seen).await;
                results.extend(scoped);
            } else {
                if let Some(pkg) = Self::check_package(&entry_path, seen).await {
                    results.push(pkg);
                }
                // Do not follow symlinks any deeper; only real directories
                // can hold a nested `node_modules` that belongs to this tree.
                if file_type.is_dir() {
                    let deeper = Self::scan_nested_node_modules(&entry_path, seen).await;
                    results.extend(deeper);
                }
            }
        }
        results
    })
}
/// Turns the directory `pkg_path` into a `CrawledPackage` if it holds a
/// usable `package.json` whose purl has not been reported yet.
///
/// Returns `None` when the manifest cannot be read or when the purl is
/// already in `seen`; otherwise the purl is recorded in `seen`.
async fn check_package(
    pkg_path: &Path,
    seen: &mut HashSet<String>,
) -> Option<CrawledPackage> {
    let manifest_path = pkg_path.join("package.json");
    let (full_name, version) = read_package_json(&manifest_path).await?;
    let (namespace, name) = parse_package_name(&full_name);
    let purl = build_npm_purl(namespace.as_deref(), &name, &version);
    // `insert` reports whether the purl was new, so one call both tests and records.
    if !seen.insert(purl.clone()) {
        return None;
    }
    Some(CrawledPackage {
        path: pkg_path.to_path_buf(),
        purl,
        namespace,
        version,
        name,
    })
}
/// Splits an npm purl into `(namespace, name, version)`.
///
/// Accepts `pkg:npm/[scope/]name@version`, optionally followed by
/// `?qualifiers` and/or `#subpath`, both of which are discarded. The scope
/// may be written literally (`@scope`) or percent-encoded (`%40scope`);
/// either way the returned namespace starts with `@`.
///
/// Returns `None` for other purl types, for a missing or empty version, and
/// for a scope with no name after it.
fn parse_purl_components(purl: &str) -> Option<(Option<String>, String, String)> {
    // Qualifiers and subpath both trail the version; cut at whichever comes first.
    let base = match purl.find(|c: char| c == '?' || c == '#') {
        Some(idx) => &purl[..idx],
        None => purl,
    };
    let rest = base.strip_prefix("pkg:npm/")?;
    // The last `@` separates the version; an earlier one can only be the scope marker.
    let at_idx = rest.rfind('@')?;
    let name_part = &rest[..at_idx];
    let version = &rest[at_idx + 1..];
    if name_part.is_empty() || version.is_empty() {
        return None;
    }
    let scoped = name_part
        .strip_prefix('@')
        .or_else(|| name_part.strip_prefix("%40"));
    match scoped {
        Some(after_marker) => {
            let (scope, name) = after_marker.split_once('/')?;
            if name.is_empty() {
                return None;
            }
            Some((Some(format!("@{scope}")), name.to_string(), version.to_string()))
        }
        None => Some((None, name_part.to_string(), version.to_string())),
    }
}
}
impl Default for NpmCrawler {
fn default() -> Self {
Self::new()
}
}
/// Reports whether `path` resolves to a directory, treating any metadata
/// error (including a missing path) as `false`.
async fn is_dir(path: &Path) -> bool {
    match tokio::fs::metadata(path).await {
        Ok(meta) => meta.is_dir(),
        Err(_) => false,
    }
}
// Unit tests for the name/purl helpers, manifest reading, directory pattern
// expansion and the crawler entry points. Filesystem tests run against
// temporary directories.
#[cfg(test)]
mod tests {
use super::*;
// A scoped name splits into its `@scope` namespace and bare name.
#[test]
fn test_parse_package_name_scoped() {
let (ns, name) = parse_package_name("@types/node");
assert_eq!(ns.as_deref(), Some("@types"));
assert_eq!(name, "node");
}
// An unscoped name has no namespace.
#[test]
fn test_parse_package_name_unscoped() {
let (ns, name) = parse_package_name("lodash");
assert!(ns.is_none());
assert_eq!(name, "lodash");
}
// A namespace is placed between the `pkg:npm/` prefix and the name.
#[test]
fn test_build_npm_purl_scoped() {
assert_eq!(
build_npm_purl(Some("@types"), "node", "20.0.0"),
"pkg:npm/@types/node@20.0.0"
);
}
// Without a namespace the purl is just `pkg:npm/name@version`.
#[test]
fn test_build_npm_purl_unscoped() {
assert_eq!(
build_npm_purl(None, "lodash", "4.17.21"),
"pkg:npm/lodash@4.17.21"
);
}
// Parsing a scoped purl recovers namespace, name and version.
#[test]
fn test_parse_purl_components_scoped() {
let (ns, name, ver) =
NpmCrawler::parse_purl_components("pkg:npm/@types/node@20.0.0").unwrap();
assert_eq!(ns.as_deref(), Some("@types"));
assert_eq!(name, "node");
assert_eq!(ver, "20.0.0");
}
// Parsing an unscoped purl yields no namespace.
#[test]
fn test_parse_purl_components_unscoped() {
let (ns, name, ver) =
NpmCrawler::parse_purl_components("pkg:npm/lodash@4.17.21").unwrap();
assert!(ns.is_none());
assert_eq!(name, "lodash");
assert_eq!(ver, "4.17.21");
}
// Non-npm purls and arbitrary strings are rejected.
#[test]
fn test_parse_purl_components_invalid() {
assert!(NpmCrawler::parse_purl_components("pkg:pypi/requests@2.0").is_none());
assert!(NpmCrawler::parse_purl_components("not-a-purl").is_none());
}
// A well-formed manifest yields its name and version.
#[tokio::test]
async fn test_read_package_json_valid() {
let dir = tempfile::tempdir().unwrap();
let pkg_json = dir.path().join("package.json");
tokio::fs::write(
&pkg_json,
r#"{"name": "test-pkg", "version": "1.0.0"}"#,
)
.await
.unwrap();
let result = read_package_json(&pkg_json).await;
assert!(result.is_some());
let (name, version) = result.unwrap();
assert_eq!(name, "test-pkg");
assert_eq!(version, "1.0.0");
}
// A manifest path that does not exist yields `None`.
#[tokio::test]
async fn test_read_package_json_missing() {
let dir = tempfile::tempdir().unwrap();
let pkg_json = dir.path().join("package.json");
assert!(read_package_json(&pkg_json).await.is_none());
}
// A file that is not JSON yields `None`.
#[tokio::test]
async fn test_read_package_json_invalid() {
let dir = tempfile::tempdir().unwrap();
let pkg_json = dir.path().join("package.json");
tokio::fs::write(&pkg_json, "not json").await.unwrap();
assert!(read_package_json(&pkg_json).await.is_none());
}
// A single unscoped package under `node_modules` is discovered from `cwd`.
#[tokio::test]
async fn test_crawl_all_basic() {
let dir = tempfile::tempdir().unwrap();
let nm = dir.path().join("node_modules");
let pkg_dir = nm.join("foo");
tokio::fs::create_dir_all(&pkg_dir).await.unwrap();
tokio::fs::write(
pkg_dir.join("package.json"),
r#"{"name": "foo", "version": "1.2.3"}"#,
)
.await
.unwrap();
let crawler = NpmCrawler::new();
let options = CrawlerOptions {
cwd: dir.path().to_path_buf(),
global: false,
global_prefix: None,
batch_size: DEFAULT_BATCH_SIZE,
};
let packages = crawler.crawl_all(&options).await;
assert_eq!(packages.len(), 1);
assert_eq!(packages[0].name, "foo");
assert_eq!(packages[0].version, "1.2.3");
assert_eq!(packages[0].purl, "pkg:npm/foo@1.2.3");
assert!(packages[0].namespace.is_none());
}
// A package inside an `@scope` directory is discovered with its namespace.
#[tokio::test]
async fn test_crawl_all_scoped() {
let dir = tempfile::tempdir().unwrap();
let nm = dir.path().join("node_modules");
let scope_dir = nm.join("@types").join("node");
tokio::fs::create_dir_all(&scope_dir).await.unwrap();
tokio::fs::write(
scope_dir.join("package.json"),
r#"{"name": "@types/node", "version": "20.0.0"}"#,
)
.await
.unwrap();
let crawler = NpmCrawler::new();
let options = CrawlerOptions {
cwd: dir.path().to_path_buf(),
global: false,
global_prefix: None,
batch_size: DEFAULT_BATCH_SIZE,
};
let packages = crawler.crawl_all(&options).await;
assert_eq!(packages.len(), 1);
assert_eq!(packages[0].name, "node");
assert_eq!(packages[0].namespace.as_deref(), Some("@types"));
assert_eq!(packages[0].purl, "pkg:npm/@types/node@20.0.0");
}
// A `*` segment expands to every matching subdirectory.
#[test]
fn test_find_node_dirs_sync_wildcard() {
let dir = tempfile::tempdir().unwrap();
let nm1 = dir.path().join("v18.0.0/lib/node_modules");
let nm2 = dir.path().join("v20.1.0/lib/node_modules");
std::fs::create_dir_all(&nm1).unwrap();
std::fs::create_dir_all(&nm2).unwrap();
let results = find_node_dirs_sync(dir.path(), &["*", "lib", "node_modules"]);
assert_eq!(results.len(), 2);
assert!(results.contains(&nm1));
assert!(results.contains(&nm2));
}
// A base directory that does not exist yields no results.
#[test]
fn test_find_node_dirs_sync_empty() {
let results = find_node_dirs_sync(Path::new("/nonexistent/path/xyz"), &["*", "lib"]);
assert!(results.is_empty());
}
// Literal segments resolve to exactly one existing path.
#[test]
fn test_find_node_dirs_sync_literal() {
let dir = tempfile::tempdir().unwrap();
let target = dir.path().join("lib/node_modules");
std::fs::create_dir_all(&target).unwrap();
let results = find_node_dirs_sync(dir.path(), &["lib", "node_modules"]);
assert_eq!(results.len(), 1);
assert_eq!(results[0], target);
}
// macOS only: global discovery must complete without panicking.
#[cfg(target_os = "macos")]
#[test]
fn test_macos_get_global_node_modules_paths_no_panic() {
let crawler = NpmCrawler::new();
let _paths = crawler.get_global_node_modules_paths();
}
// Installed packages are returned under their purl; a purl for a package
// that is not installed is absent from the result.
#[tokio::test]
async fn test_find_by_purls() {
let dir = tempfile::tempdir().unwrap();
let nm = dir.path().join("node_modules");
let foo_dir = nm.join("foo");
tokio::fs::create_dir_all(&foo_dir).await.unwrap();
tokio::fs::write(
foo_dir.join("package.json"),
r#"{"name": "foo", "version": "1.0.0"}"#,
)
.await
.unwrap();
let types_dir = nm.join("@types").join("node");
tokio::fs::create_dir_all(&types_dir).await.unwrap();
tokio::fs::write(
types_dir.join("package.json"),
r#"{"name": "@types/node", "version": "20.0.0"}"#,
)
.await
.unwrap();
let crawler = NpmCrawler::new();
let purls = vec![
"pkg:npm/foo@1.0.0".to_string(),
"pkg:npm/@types/node@20.0.0".to_string(),
"pkg:npm/not-installed@0.0.1".to_string(),
];
let result = crawler.find_by_purls(&nm, &purls).await.unwrap();
assert_eq!(result.len(), 2);
assert!(result.contains_key("pkg:npm/foo@1.0.0"));
assert!(result.contains_key("pkg:npm/@types/node@20.0.0"));
assert!(!result.contains_key("pkg:npm/not-installed@0.0.1"));
}
}