use chrono::Utc;
use crate::registry::package::PackageMetadata;
use crate::types::{AnalysisContext, Finding, FindingCategory, Severity};
use super::Analyzer;
/// Well-known package names that a candidate package name is compared against.
///
/// `HallucinationAnalyzer::analyze_metadata` walks this list in order and
/// reports the first entry the candidate appears to imitate (via
/// `detect_typosquat_technique` or a small Levenshtein distance). Order
/// therefore decides which entry is named in the finding when several match.
const TOP_PACKAGES: &[&str] = &[
"react",
"express",
"lodash",
"axios",
"chalk",
"commander",
"debug",
"dotenv",
"fs-extra",
"glob",
"inquirer",
"jest",
"moment",
"mongoose",
"next",
"nodemon",
"prettier",
"socket.io",
"typescript",
"uuid",
"webpack",
"yargs",
"bluebird",
"body-parser",
"cheerio",
"cors",
"dayjs",
"eslint",
"fastify",
"helmet",
"jsonwebtoken",
"knex",
"luxon",
"marked",
"mysql2",
"nanoid",
"passport",
"pg",
"pino",
"ramda",
"redis",
"rimraf",
"rxjs",
"semver",
"sequelize",
"sharp",
"underscore",
"validator",
"vue",
"zod",
];
/// Generic name fragments that trigger the low-severity
/// "Name matches common AI hallucination pattern" finding.
///
/// Matching is a substring test against the lowercased package name, so every
/// entry must itself be lowercase. Only the first matching fragment is
/// reported.
const HALLUCINATION_PATTERNS: &[&str] = &[
"utils-helper",
"data-processor",
"string-formatter",
"json-utils",
"file-helper",
"array-utils",
"object-helper",
"math-helper",
"date-utils",
"config-helper",
"log-helper",
"http-helper",
"api-helper",
"parse-helper",
"format-helper",
"convert-helper",
"validate-helper",
"transform-utils",
"common-utils",
"simple-utils",
];
/// Exact package names that are exempt from the typosquat / similar-name
/// checks (see `is_known_legitimate`).
///
/// The exemption is by exact, case-sensitive match and covers only the
/// name-similarity checks; the other metadata checks still run for these
/// packages. Note that `fastify`, `cors` and `helmet` also appear in
/// `TOP_PACKAGES`.
const KNOWN_LEGITIMATE: &[&str] = &[
"preact",
"reakt",
"rax",
"inferno",
"mithril",
"chokidar",
"chalk-cli",
"expressjs",
"koa",
"hapi",
"fastify",
"restify",
"micro",
"polka",
"morgan",
"cors",
"helmet",
"pino-pretty",
];
/// Returns `true` when `name` is exactly (case-sensitively) one of the
/// allow-listed names in `KNOWN_LEGITIMATE`.
fn is_known_legitimate(name: &str) -> bool {
    KNOWN_LEGITIMATE.iter().any(|&allowed| allowed == name)
}
/// Heuristically decides whether `name` looks like a plugin or extension of
/// the package `top` rather than an imitation of it.
///
/// Two shapes qualify:
/// * `name` begins with `top` immediately followed by `-` or `.`
///   (e.g. `react-dom`, `lodash.merge`);
/// * `name` contains `-` followed by `top` somewhere and is more than five
///   bytes longer than `top` (e.g. `eslint-plugin-react`).
fn is_plugin_or_extension(name: &str, top: &str) -> bool {
    let extends_top = name
        .strip_prefix(top)
        .map_or(false, |rest| rest.starts_with('-') || rest.starts_with('.'));

    let hyphen_then_top = ["-", top].concat();
    let embeds_top = name.len() > top.len() + 5 && name.contains(hyphen_then_top.as_str());

    extends_top || embeds_top
}
/// Returns `true` when `name` looks like `top` with visually confusable
/// characters swapped in.
///
/// The two names must have the same number of characters, differ in at most
/// two positions, and at least one differing position must be a known
/// look-alike pair (`0`/`o`, `1`/`l`, `1`/`i`, `l`/`i`, `m`/`n`, `v`/`u`,
/// `d`/`b`, `q`/`p`, in either direction, compared ASCII-case-insensitively).
/// Identical names return `false`.
fn has_homoglyphs(name: &str, top: &str) -> bool {
    // Unordered look-alike pairs; both orientations are accepted below.
    const CONFUSABLE: [(char, char); 8] = [
        ('0', 'o'),
        ('1', 'l'),
        ('1', 'i'),
        ('l', 'i'),
        ('m', 'n'),
        ('v', 'u'),
        ('d', 'b'),
        ('q', 'p'),
    ];

    // Compare lengths in characters, not bytes: the loop below pairs the names
    // up per character, and `zip` stops at the shorter one. With a byte-length
    // guard, a name containing a multi-byte character could pass the guard
    // while having fewer characters, so trailing characters of `top` would be
    // silently ignored.
    if name.chars().count() != top.chars().count() {
        return false;
    }

    let mut diff_count = 0usize;
    let mut has_glyph_swap = false;
    for (a, b) in name.chars().zip(top.chars()) {
        if a == b {
            continue;
        }
        diff_count += 1;
        if diff_count > 2 {
            return false;
        }
        let (x, y) = (a.to_ascii_lowercase(), b.to_ascii_lowercase());
        if CONFUSABLE
            .iter()
            .any(|&(p, q)| (x == p && y == q) || (x == q && y == p))
        {
            has_glyph_swap = true;
        }
    }

    // `diff_count <= 2` is guaranteed here by the early return above.
    has_glyph_swap
}
/// Identifies which typosquatting technique, if any, makes `name` an
/// imitation of `top`.
///
/// Checks run in this order and the first hit wins:
/// 1. homoglyph substitution (see `has_homoglyphs`);
/// 2. hyphen manipulation: the names differ, but are equal once every `-`
///    is removed;
/// 3. scope-squatting: `name` contains `/` and the part after the last `/`
///    equals `top`.
///
/// Returns a human-readable description of the technique, or `None`.
fn detect_typosquat_technique(name: &str, top: &str) -> Option<String> {
    /// Characters of `s` with every hyphen dropped.
    fn dehyphenated(s: &str) -> impl Iterator<Item = char> + '_ {
        s.chars().filter(|&c| c != '-')
    }

    if has_homoglyphs(name, top) {
        return Some(format!("homoglyph substitution (looks like '{}')", top));
    }

    if name != top && dehyphenated(name).eq(dehyphenated(top)) {
        return Some(format!("hyphen manipulation of '{}'", top));
    }

    match name.rsplit_once('/') {
        Some((_, bare)) if bare == top => Some(format!("scope-squatting of '{}'", top)),
        _ => None,
    }
}
/// Levenshtein distance between `a` and `b` divided by the length (in
/// characters) of the longer string.
///
/// Returns a value in `0.0..=1.0`; two empty strings yield `0.0`.
fn normalized_levenshtein(a: &str, b: &str) -> f64 {
    // Lengths are counted in characters so the ratio uses the same unit as
    // `levenshtein` and cannot be skewed by multi-byte characters.
    let max_len = a.chars().count().max(b.chars().count());
    if max_len == 0 {
        return 0.0;
    }
    levenshtein(a, b) as f64 / max_len as f64
}

/// Levenshtein edit distance between `a` and `b`, measured in Unicode scalar
/// values (`char`s): the minimum number of single-character insertions,
/// deletions and substitutions that turn `a` into `b`.
///
/// Uses the two-row dynamic-programming formulation, so memory is
/// proportional to the length of `b`.
fn levenshtein(a: &str, b: &str) -> usize {
    // Materialise `b` once so the row width and the final index are the
    // character count. Using `str::len` (bytes) here would make the rows wider
    // than the number of cells the inner loop fills, and the result would be
    // read from a cell that was never written.
    let b_chars: Vec<char> = b.chars().collect();
    let b_len = b_chars.len();
    let a_len = a.chars().count();

    if a_len == 0 {
        return b_len;
    }
    if b_len == 0 {
        return a_len;
    }

    // prev[j] = distance between the processed prefix of `a` and b[..j].
    let mut prev: Vec<usize> = (0..=b_len).collect();
    let mut curr = vec![0usize; b_len + 1];

    for (i, ca) in a.chars().enumerate() {
        curr[0] = i + 1;
        for (j, &cb) in b_chars.iter().enumerate() {
            let substitute = prev[j] + usize::from(ca != cb);
            let delete = prev[j + 1] + 1;
            let insert = curr[j] + 1;
            curr[j + 1] = substitute.min(delete).min(insert);
        }
        std::mem::swap(&mut prev, &mut curr);
    }

    prev[b_len]
}
/// Number of whole days between the RFC 3339 timestamp `iso` and the current
/// UTC time.
///
/// Returns `None` when `iso` cannot be parsed as RFC 3339. The result is
/// negative when the timestamp lies in the future.
fn days_since(iso: &str) -> Option<i64> {
    chrono::DateTime::parse_from_rfc3339(iso)
        .ok()
        .map(|timestamp| Utc::now().signed_duration_since(timestamp).num_days())
}
/// Stateless analyzer that inspects package metadata for signs that a package
/// is a hallucinated, typosquatted, or placeholder registration.
///
/// `Default` is derived; for this field-less type it is equivalent to
/// `HallucinationAnalyzer::new()`.
#[derive(Debug, Clone, Default)]
pub struct HallucinationAnalyzer;
impl HallucinationAnalyzer {
pub fn new() -> Self {
Self
}
pub fn analyze_metadata(&self, metadata: &PackageMetadata) -> Vec<Finding> {
let mut findings: Vec<Finding> = Vec::new();
let pkg_name = metadata.name.as_deref().unwrap_or("");
let version_count = metadata.versions.len();
let weekly_downloads: u64 = metadata
.extra
.get("weeklyDownloads")
.or_else(|| metadata.extra.get("weekly_downloads"))
.and_then(|v| v.as_u64())
.unwrap_or(0);
let days_since_creation: Option<i64> =
metadata.time.get("created").and_then(|ts| days_since(ts));
let latest_info = metadata.latest_version_info();
let has_install_scripts = latest_info
.map(|v| !v.install_scripts().is_empty())
.unwrap_or(false);
let description = metadata
.description
.as_deref()
.or_else(|| latest_info.and_then(|v| v.description.as_deref()))
.unwrap_or("");
let has_repo = metadata
.extra
.get("repository")
.map(|v| !v.is_null())
.unwrap_or(false);
let maintainer_has_email = metadata
.maintainers
.as_ref()
.and_then(|ms| ms.first())
.and_then(|m| m.email.as_ref())
.map(|e| !e.is_empty())
.unwrap_or(false);
let latest_version_str = metadata.latest_version().unwrap_or("");
if let Some(age) = days_since_creation {
if age <= 30 && weekly_downloads == 0 && has_install_scripts {
findings.push(Finding {
severity: Severity::Critical,
category: FindingCategory::HallucinatedPackage,
title: "Likely malicious hallucinated package".into(),
description: format!(
"Package '{}' was created {} days ago, has 0 weekly downloads, \
and contains install scripts. This is a strong signal of a \
malicious package registered to exploit AI hallucinations.",
pkg_name, age
),
file: None,
line: None,
snippet: None,
});
}
}
if let Some(age) = days_since_creation {
if age < 7 && weekly_downloads < 100 {
findings.push(Finding {
severity: Severity::High,
category: FindingCategory::HallucinatedPackage,
title: "Very new package with almost no downloads".into(),
description: format!(
"Package '{}' was created only {} days ago and has {} weekly \
downloads. Newly created packages with negligible adoption \
are a hallucination/typosquat risk.",
pkg_name, age, weekly_downloads
),
file: None,
line: None,
snippet: None,
});
}
}
if !is_known_legitimate(pkg_name) {
for &top in TOP_PACKAGES {
if pkg_name == top {
continue; }
if is_plugin_or_extension(pkg_name, top) {
continue;
}
if let Some(technique) = detect_typosquat_technique(pkg_name, top) {
findings.push(Finding {
severity: Severity::Critical,
category: FindingCategory::HallucinatedPackage,
title: format!("Typosquat of '{}' detected", top),
description: format!(
"Package '{}' appears to be a typosquat of the popular \
package '{}' via {}.",
pkg_name, top, technique
),
file: None,
line: None,
snippet: None,
});
break;
}
let norm_dist = normalized_levenshtein(pkg_name, top);
let dist = levenshtein(pkg_name, top);
if dist > 0 && dist <= 2 && norm_dist < 0.4 {
findings.push(Finding {
severity: Severity::High,
category: FindingCategory::HallucinatedPackage,
title: format!("Name suspiciously similar to '{}'", top),
description: format!(
"Package '{}' is only {} edit(s) away from the popular \
package '{}' (normalized distance: {:.2}). \
This may be a typosquat or hallucinated variant.",
pkg_name, dist, top, norm_dist
),
file: None,
line: None,
snippet: None,
});
break; }
}
}
if description.is_empty() || description.len() < 10 {
findings.push(Finding {
severity: Severity::Medium,
category: FindingCategory::HallucinatedPackage,
title: "Missing or very short description".into(),
description: format!(
"Package '{}' has no README or a description shorter than 10 \
characters, which is unusual for legitimate packages.",
pkg_name
),
file: None,
line: None,
snippet: None,
});
}
if !has_repo {
findings.push(Finding {
severity: Severity::Medium,
category: FindingCategory::HallucinatedPackage,
title: "No repository URL".into(),
description: format!(
"Package '{}' does not declare a source repository, making it \
harder to verify its legitimacy.",
pkg_name
),
file: None,
line: None,
snippet: None,
});
}
if version_count == 1 && (latest_version_str == "0.0.1" || latest_version_str == "1.0.0") {
findings.push(Finding {
severity: Severity::Medium,
category: FindingCategory::HallucinatedPackage,
title: "Single version published at 0.0.1 or 1.0.0".into(),
description: format!(
"Package '{}' has exactly one published version ({}). \
Combined with other signals this can indicate a placeholder \
or malicious registration.",
pkg_name, latest_version_str
),
file: None,
line: None,
snippet: None,
});
}
if !maintainer_has_email {
findings.push(Finding {
severity: Severity::Medium,
category: FindingCategory::HallucinatedPackage,
title: "Maintainer has no published email".into(),
description: format!(
"The primary maintainer of '{}' has no public email address, \
which can be a signal of a throwaway account.",
pkg_name
),
file: None,
line: None,
snippet: None,
});
}
let lower = pkg_name.to_lowercase();
for &pattern in HALLUCINATION_PATTERNS {
if lower.contains(pattern) {
findings.push(Finding {
severity: Severity::Low,
category: FindingCategory::HallucinatedPackage,
title: "Name matches common AI hallucination pattern".into(),
description: format!(
"Package name '{}' contains the generic fragment '{}', \
which AI assistants frequently hallucinate as a package \
name.",
pkg_name, pattern
),
file: None,
line: None,
snippet: None,
});
break; }
}
findings
}
}
/// Exposes `HallucinationAnalyzer` through the shared `Analyzer` trait.
impl Analyzer for HallucinationAnalyzer {
/// Identifier of this analyzer: the fixed string `"hallucination"`.
fn name(&self) -> &str {
"hallucination"
}
/// Delegates to `analyze_metadata`, using only the `metadata` field of the
/// analysis context.
fn analyze(&self, ctx: &AnalysisContext) -> Vec<Finding> {
self.analyze_metadata(ctx.metadata)
}
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_levenshtein_identical() {
        assert_eq!(levenshtein("react", "react"), 0);
    }

    #[test]
    fn test_levenshtein_one_edit() {
        // Substitution.
        assert_eq!(levenshtein("react", "reakt"), 1);
        assert_eq!(levenshtein("lodash", "lodasg"), 1);
        // Insertion / deletion.
        assert_eq!(levenshtein("axos", "axios"), 1);
        assert_eq!(levenshtein("express", "expres"), 1);
    }

    #[test]
    fn test_levenshtein_two_edits() {
        // An adjacent transposition costs two edits under plain Levenshtein.
        assert_eq!(levenshtein("lodash", "lodahs"), 2);
        assert_eq!(levenshtein("axios", "axois"), 2);
        // Two independent substitutions.
        assert_eq!(levenshtein("react", "reaxx"), 2);
    }

    #[test]
    fn test_levenshtein_empty() {
        assert_eq!(levenshtein("", ""), 0);
        assert_eq!(levenshtein("", "react"), 5);
        assert_eq!(levenshtein("react", ""), 5);
    }

    #[test]
    fn test_levenshtein_different() {
        assert!(levenshtein("react", "totally-different") > 2);
    }
}