use crate::schema::{Meta, Reference, ReferencesFile, Status};
use anyhow::{Context, Result};
use chrono::Utc;
use clap::Args;
use regex::Regex;
use serde::Serialize;
use std::collections::HashMap;
use std::path::{Path, PathBuf};
#[derive(Args)]
pub struct ScanArgs {
#[arg(required = true)]
pub files: Vec<PathBuf>,
#[arg(short, long, default_value = "references.yaml")]
pub output: PathBuf,
#[arg(long, default_value = "true")]
pub merge: bool,
}
#[derive(Debug, Serialize)]
pub struct ScanOutput {
pub file: String,
pub scanned_files: usize,
pub total_urls: usize,
pub new_urls: usize,
pub updated_urls: usize,
}
#[derive(Debug, Clone)]
struct FoundUrl {
url: String,
title: Option<String>,
source_file: String,
}
pub async fn run_scan(args: ScanArgs) -> Result<()> {
let output = scan_files(args).await?;
println!("{}", serde_json::to_string(&output)?);
Ok(())
}
pub(crate) async fn scan_files(args: ScanArgs) -> Result<ScanOutput> {
let files = expand_files(&args.files).await?;
if files.is_empty() {
return Ok(ScanOutput {
file: args.output.display().to_string(),
scanned_files: 0,
total_urls: 0,
new_urls: 0,
updated_urls: 0,
});
}
let mut all_urls: Vec<FoundUrl> = Vec::new();
for file in &files {
let content = tokio::fs::read_to_string(file)
.await
.with_context(|| format!("Failed to read {}", file.display()))?;
let found = extract_markdown_urls(&content, file);
all_urls.extend(found);
}
let mut url_map: HashMap<String, Reference> = HashMap::new();
for found in &all_urls {
let entry = url_map.entry(found.url.clone()).or_insert_with(|| {
let categories = infer_categories(&found.source_file);
Reference {
url: found.url.clone(),
title: found.title.clone().unwrap_or_else(|| found.url.clone()),
categories,
cited_in: Vec::new(),
status: Status::Pending,
verified: None,
notes: None,
}
});
if !entry.cited_in.contains(&found.source_file) {
entry.cited_in.push(found.source_file.clone());
}
if let Some(title) = &found.title {
if entry.title == entry.url && title != &entry.url {
entry.title.clone_from(title);
}
}
}
let (mut refs_file, _existing_count) = if args.merge && args.output.exists() {
let content = tokio::fs::read_to_string(&args.output).await?;
let existing: ReferencesFile = serde_yaml::from_str(&content)?;
let count = existing.references.len();
(existing, count)
} else {
let now = Utc::now().format("%Y-%m-%d").to_string();
(
ReferencesFile {
meta: Meta {
created: now,
last_verified: None,
tool: "ref".to_string(),
total_links: 0,
},
references: Vec::new(),
},
0,
)
};
let mut existing_urls: HashMap<String, usize> = HashMap::new();
for (i, r) in refs_file.references.iter().enumerate() {
existing_urls.insert(r.url.clone(), i);
}
let mut new_count = 0;
let mut updated_count = 0;
for (url, new_ref) in url_map {
if let Some(&idx) = existing_urls.get(&url) {
let existing = &mut refs_file.references[idx];
for cited in &new_ref.cited_in {
if !existing.cited_in.contains(cited) {
existing.cited_in.push(cited.clone());
updated_count += 1;
}
}
if existing.title == existing.url && new_ref.title != new_ref.url {
existing.title = new_ref.title;
}
} else {
refs_file.references.push(new_ref);
new_count += 1;
}
}
refs_file.meta.total_links = refs_file.references.len();
refs_file.references.sort_by(|a, b| a.url.cmp(&b.url));
let yaml = serde_yaml::to_string(&refs_file)?;
tokio::fs::write(&args.output, yaml).await?;
Ok(ScanOutput {
file: args.output.display().to_string(),
scanned_files: files.len(),
total_urls: refs_file.references.len(),
new_urls: new_count,
updated_urls: updated_count,
})
}
async fn expand_files(patterns: &[PathBuf]) -> Result<Vec<PathBuf>> {
let mut files = Vec::new();
for pattern in patterns {
let pattern_str = pattern.to_string_lossy();
if pattern_str.contains('*') {
for entry in glob::glob(&pattern_str)? {
let path = entry?;
if path.is_file() {
files.push(path);
}
}
} else if pattern.is_file() {
files.push(pattern.clone());
} else if pattern.is_dir() {
let mut entries = tokio::fs::read_dir(pattern).await?;
while let Some(entry) = entries.next_entry().await? {
let path = entry.path();
if path.is_file() {
if let Some(ext) = path.extension() {
if ext == "md" || ext == "markdown" {
files.push(path);
}
}
}
}
}
}
Ok(files)
}
fn extract_markdown_urls(content: &str, source_file: &Path) -> Vec<FoundUrl> {
let mut found = Vec::new();
let source = source_file.to_string_lossy().to_string();
let md_link_re = Regex::new(r"\[([^\]]+)\]\((https?://[^\s\)]+)\)").unwrap();
for cap in md_link_re.captures_iter(content) {
let title = cap[1].to_string();
let url = cap[2].to_string();
let url = url
.trim_end_matches([',', '.', ')', ']', ';', ':'])
.to_string();
found.push(FoundUrl {
url,
title: Some(title),
source_file: source.clone(),
});
}
let bare_url_re = Regex::new(r#"https?://[^\s\)>\]"'`]+"#).unwrap();
for mat in bare_url_re.find_iter(content) {
let url = mat.as_str();
let url = url
.trim_end_matches([',', '.', ')', ']', ';', ':'])
.to_string();
if !found.iter().any(|f| f.url == url) {
found.push(FoundUrl {
url,
title: None,
source_file: source.clone(),
});
}
}
found
}
fn infer_categories(path: &str) -> Vec<String> {
let mut categories = Vec::new();
let lower = path.to_lowercase();
if lower.contains("/docs/") || lower.contains("/doc/") || lower.starts_with("docs/") {
categories.push("documentation".to_string());
}
if lower.contains("/research/") || lower.starts_with("research/") {
categories.push("research".to_string());
}
if lower.contains("/adr/") || lower.starts_with("adr/") {
categories.push("architecture".to_string());
}
if lower.contains("/api/") || lower.starts_with("api/") {
categories.push("api".to_string());
}
if lower.contains("/test") || lower.starts_with("test") {
categories.push("testing".to_string());
}
if lower.contains("readme") {
categories.push("readme".to_string());
}
if lower.contains("changelog") {
categories.push("changelog".to_string());
}
if lower.contains("license") {
categories.push("legal".to_string());
}
if categories.is_empty() {
categories.push("general".to_string());
}
categories
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_extract_markdown_urls() {
let content = r"
# Test Document
Check out [Example Site](https://example.com) for more info.
Also see [GitHub](https://github.com/foo/bar).
Bare URL: https://bare.example.org/path
Another [link](https://another.com/page?q=1).
";
let found = extract_markdown_urls(content, Path::new("test.md"));
assert_eq!(found.len(), 4);
let example = found
.iter()
.find(|f| f.url == "https://example.com")
.unwrap();
assert_eq!(example.title, Some("Example Site".to_string()));
let bare = found
.iter()
.find(|f| f.url == "https://bare.example.org/path")
.unwrap();
assert_eq!(bare.title, None);
}
#[test]
fn test_infer_categories() {
assert!(infer_categories("docs/adr/ADR-001.md").contains(&"architecture".to_string()));
assert!(infer_categories("research/topic.md").contains(&"research".to_string()));
assert!(infer_categories("README.md").contains(&"readme".to_string()));
assert!(infer_categories("random.md").contains(&"general".to_string()));
}
}