use std::collections::BTreeMap;
use std::collections::HashMap;
use std::collections::HashSet;
use std::collections::hash_map::DefaultHasher;
use std::hash::Hash;
use std::hash::Hasher;
use std::path::Path;
use std::path::PathBuf;
use globset::Glob;
use globset::GlobSet;
use globset::GlobSetBuilder;
use ignore::gitignore::Gitignore;
use ignore::gitignore::GitignoreBuilder;
use serde::Deserialize;
use serde::Serialize;
use crate::Block;
use crate::BlockType;
use crate::MdtError;
use crate::MdtResult;
use crate::config::CONFIG_FILE_CANDIDATES;
use crate::config::CodeBlockFilter;
use crate::config::DEFAULT_MAX_FILE_SIZE;
use crate::config::MdtConfig;
use crate::config::PaddingConfig;
use crate::engine::validate_transformers;
use crate::index_cache;
use crate::index_cache::FileFingerprint;
use crate::index_cache::ProjectIndexCache;
use crate::parser::ParseDiagnostic;
use crate::parser::parse_with_diagnostics;
use crate::source_scanner::parse_source_with_diagnostics;
/// Options controlling which files a project scan visits and which blocks it
/// keeps; typically derived from a loaded config via [`ScanOptions::from_config`].
#[derive(Debug, Clone)]
pub struct ScanOptions {
    /// Gitignore-style patterns for paths to exclude from the walk.
    pub exclude_patterns: Vec<String>,
    /// Globs for extra files to force-include beyond the normal walk.
    pub include_set: GlobSet,
    /// Extra template directories (joined onto the scan root) to also scan.
    pub template_paths: Vec<PathBuf>,
    /// Maximum file size in bytes; a larger file aborts the scan with
    /// `MdtError::FileTooLarge`.
    pub max_file_size: u64,
    /// When `true`, `.gitignore` rules are not applied during the walk.
    pub disable_gitignore: bool,
    /// Filter applied to code blocks when parsing non-markdown source files.
    pub markdown_codeblocks: CodeBlockFilter,
    /// Block names dropped from the scan result.
    pub excluded_blocks: Vec<String>,
    /// When `true`, cache fingerprints also include a content hash
    /// (toggled by the `MDT_CACHE_VERIFY_HASH` environment variable).
    pub cache_verify_hash: bool,
}
impl Default for ScanOptions {
    /// All-permissive defaults: nothing excluded, no extra includes or
    /// templates, gitignore honored, and hash verification off.
    fn default() -> Self {
        Self {
            max_file_size: DEFAULT_MAX_FILE_SIZE,
            disable_gitignore: false,
            cache_verify_hash: false,
            exclude_patterns: Vec::new(),
            include_set: GlobSet::empty(),
            template_paths: Vec::new(),
            excluded_blocks: Vec::new(),
            markdown_codeblocks: CodeBlockFilter::default(),
        }
    }
}
impl ScanOptions {
    /// Derives scan options from an optionally loaded config, falling back to
    /// the default value for every absent field.
    ///
    /// Hash verification is controlled by the `MDT_CACHE_VERIFY_HASH`
    /// environment variable, not by the config file.
    pub fn from_config(config: Option<&MdtConfig>) -> Self {
        let include_patterns = config.map_or(&[][..], |c| &c.include.patterns[..]);
        Self {
            exclude_patterns: config
                .map(|c| c.exclude.patterns.clone())
                .unwrap_or_default(),
            include_set: build_glob_set(include_patterns),
            template_paths: config
                .map(|c| c.templates.paths.clone())
                .unwrap_or_default(),
            max_file_size: config.map_or(DEFAULT_MAX_FILE_SIZE, |c| c.max_file_size),
            disable_gitignore: config.is_some_and(|c| c.disable_gitignore),
            markdown_codeblocks: config
                .map(|c| c.exclude.markdown_codeblocks.clone())
                .unwrap_or_default(),
            excluded_blocks: config
                .map(|c| c.exclude.blocks.clone())
                .unwrap_or_default(),
            cache_verify_hash: std::env::var_os("MDT_CACHE_VERIFY_HASH").is_some(),
        }
    }
}
/// Flags for selectively downgrading diagnostic categories from errors to
/// ignorable notes; see [`ProjectDiagnostic::is_error`].
#[derive(Debug, Clone, Default)]
#[allow(clippy::struct_excessive_bools)]
pub struct ValidationOptions {
    /// Ignore blocks that are missing their closing tag.
    pub ignore_unclosed_blocks: bool,
    /// Ignore provider blocks that no consumer references.
    pub ignore_unused_blocks: bool,
    /// Reserved flag; not consulted by `is_error` in this file.
    pub ignore_invalid_names: bool,
    /// Ignore unknown transformers and bad transformer argument counts.
    pub ignore_invalid_transformers: bool,
}
/// The category and payload of a project diagnostic.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[non_exhaustive]
pub enum DiagnosticKind {
    /// A block opening tag with no matching closing tag.
    UnclosedBlock { name: String },
    /// A transformer name the engine does not recognize.
    UnknownTransformer { name: String },
    /// A known transformer invoked with the wrong number of arguments.
    InvalidTransformerArgs {
        name: String,
        /// Human-readable description of the expected argument count.
        expected: String,
        got: usize,
    },
    /// A provider block that no consumer block references.
    UnusedProvider { name: String },
}
/// A diagnostic anchored to a specific file and position within the project.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProjectDiagnostic {
    /// File the diagnostic was reported in.
    pub file: PathBuf,
    /// What went wrong.
    pub kind: DiagnosticKind,
    /// Line of the offending block's opening tag.
    pub line: usize,
    /// Column of the offending block's opening tag.
    pub column: usize,
}
impl ProjectDiagnostic {
    /// Reports whether this diagnostic counts as an error under the caller's
    /// ignore flags: a kind is an error exactly when its matching ignore flag
    /// is not set.
    pub fn is_error(&self, options: &ValidationOptions) -> bool {
        let ignored = match &self.kind {
            DiagnosticKind::UnclosedBlock { .. } => options.ignore_unclosed_blocks,
            DiagnosticKind::UnknownTransformer { .. }
            | DiagnosticKind::InvalidTransformerArgs { .. } => {
                options.ignore_invalid_transformers
            }
            DiagnosticKind::UnusedProvider { .. } => options.ignore_unused_blocks,
        };
        !ignored
    }

    /// Renders a human-readable, single-line message for this diagnostic.
    pub fn message(&self) -> String {
        match &self.kind {
            DiagnosticKind::UnclosedBlock { name } => {
                format!("missing closing tag for block `{name}`")
            }
            DiagnosticKind::UnknownTransformer { name } => {
                format!("unknown transformer `{name}`")
            }
            DiagnosticKind::InvalidTransformerArgs {
                name,
                expected,
                got,
            } => {
                format!("transformer `{name}` expects {expected} argument(s), got {got}")
            }
            DiagnosticKind::UnusedProvider { name } => {
                format!("provider block `{name}` has no consumers")
            }
        }
    }
}
/// The aggregated result of scanning a project: all providers keyed by name,
/// all consumers, and every diagnostic collected along the way.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Project {
    /// Provider blocks, keyed by block name (names are unique project-wide).
    pub providers: HashMap<String, ProviderEntry>,
    /// Consumer and inline blocks in file order.
    pub consumers: Vec<ConsumerEntry>,
    /// Parse and validation diagnostics gathered during the scan.
    pub diagnostics: Vec<ProjectDiagnostic>,
}
/// A scanned project bundled with config-derived runtime context.
#[derive(Debug)]
pub struct ProjectContext {
    /// The scan result.
    pub project: Project,
    /// Data loaded via the config's data sources (empty without a config).
    pub data: HashMap<String, serde_json::Value>,
    /// Padding settings from the config, if any.
    pub padding: Option<PaddingConfig>,
}
impl ProjectContext {
    /// Convenience wrapper over [`find_missing_providers`] for this
    /// context's project.
    pub fn find_missing_providers(&self) -> Vec<String> {
        find_missing_providers(&self.project)
    }
}
/// Public view of the telemetry recorded for the most recent cache-backed scan.
#[derive(Debug, Clone, Serialize)]
pub struct ProjectCacheLastScan {
    /// Wall-clock timestamp of the scan, in milliseconds since the Unix epoch.
    pub timestamp_unix_ms: u64,
    /// Whether the entire project was served from the cache.
    pub full_project_hit: bool,
    /// Number of files whose cached parse result was reused.
    pub reused_files: u64,
    /// Number of files that had to be re-parsed.
    pub reparsed_files: u64,
    /// Total files considered by the scan.
    pub total_files: u64,
}
/// Public view of cumulative cache telemetry across all recorded scans.
#[derive(Debug, Clone, Serialize)]
pub struct ProjectCacheTelemetry {
    /// Total scans recorded against this cache.
    pub scan_count: u64,
    /// Scans that were full-project cache hits.
    pub full_project_hit_count: u64,
    /// Running total of files reused from the cache.
    pub reused_file_count_total: u64,
    /// Running total of files re-parsed.
    pub reparsed_file_count_total: u64,
    /// Details of the most recent scan, if one was recorded.
    pub last_scan: Option<ProjectCacheLastScan>,
}
/// Progressive state of the on-disk cache artifact; each flag implies the
/// previous ones (valid ⇒ readable ⇒ exists).
#[derive(Debug, Clone, Serialize)]
pub struct ProjectCacheArtifactState {
    /// The cache file exists on disk.
    pub exists: bool,
    /// The cache file could be read.
    pub readable: bool,
    /// The cache deserialized fully and has a supported schema version.
    pub valid: bool,
}
/// Whether the on-disk cache is compatible with the current scan options.
#[derive(Debug, Clone, Serialize)]
pub struct ProjectCacheCompatibilityState {
    /// The cache's schema version equals the supported version.
    pub schema_supported: bool,
    /// The cache's project key matches the key derived from current options.
    pub project_key_matches: bool,
    /// Whether content-hash verification is enabled for this inspection.
    pub hash_verification_enabled: bool,
}
/// Full report produced by [`inspect_project_cache`].
#[derive(Debug, Clone, Serialize)]
pub struct ProjectCacheInspection {
    /// Location of the cache artifact on disk.
    pub path: PathBuf,
    /// Existence/readability/validity of the artifact.
    pub artifact: ProjectCacheArtifactState,
    /// Schema version read from the artifact, when parseable.
    pub schema_version: Option<u32>,
    /// Compatibility of the artifact with the current scan options.
    pub compatibility: ProjectCacheCompatibilityState,
    /// Telemetry carried by the artifact, when it deserialized fully.
    pub telemetry: Option<ProjectCacheTelemetry>,
}
impl ProjectCacheInspection {
    // Flat accessors over the nested state structs, for caller convenience.

    /// Whether the cache file exists on disk.
    pub fn exists(&self) -> bool {
        self.artifact.exists
    }
    /// Whether the cache file could be read.
    pub fn readable(&self) -> bool {
        self.artifact.readable
    }
    /// Whether the cache deserialized fully with a supported schema.
    pub fn valid(&self) -> bool {
        self.artifact.valid
    }
    /// Whether the cache's schema version is supported.
    pub fn schema_supported(&self) -> bool {
        self.compatibility.schema_supported
    }
    /// Whether the cache's project key matches the current options.
    pub fn project_key_matches(&self) -> bool {
        self.compatibility.project_key_matches
    }
    /// Whether content-hash verification was enabled for this inspection.
    pub fn hash_verification_enabled(&self) -> bool {
        self.compatibility.hash_verification_enabled
    }
}
// Field-for-field conversion from the internal cache telemetry type to its
// public, serializable counterpart.
impl From<index_cache::LastScanTelemetry> for ProjectCacheLastScan {
    fn from(value: index_cache::LastScanTelemetry) -> Self {
        Self {
            timestamp_unix_ms: value.timestamp_unix_ms,
            full_project_hit: value.full_project_hit,
            reused_files: value.reused_files,
            reparsed_files: value.reparsed_files,
            total_files: value.total_files,
        }
    }
}
// Field-for-field conversion; `last_scan` reuses the `LastScanTelemetry`
// conversion above via `map(Into::into)`.
impl From<index_cache::CacheTelemetry> for ProjectCacheTelemetry {
    fn from(value: index_cache::CacheTelemetry) -> Self {
        Self {
            scan_count: value.scan_count,
            full_project_hit_count: value.full_project_hit_count,
            reused_file_count_total: value.reused_file_count_total,
            reparsed_file_count_total: value.reparsed_file_count_total,
            last_scan: value.last_scan.map(Into::into),
        }
    }
}
/// A provider block together with its source file and the raw text between
/// its opening and closing tags.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProviderEntry {
    /// The parsed block.
    pub block: Block,
    /// File the block was found in.
    pub file: PathBuf,
    /// Text between the block's tags.
    pub content: String,
}
/// A consumer or inline block together with its source file and the raw text
/// between its opening and closing tags.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConsumerEntry {
    /// The parsed block.
    pub block: Block,
    /// File the block was found in.
    pub file: PathBuf,
    /// Text between the block's tags.
    pub content: String,
}
/// Scans `root` with default options (no config applied).
///
/// # Errors
/// Propagates any error from [`scan_project_with_options`].
pub fn scan_project(root: &Path) -> MdtResult<Project> {
    scan_project_with_options(root, &ScanOptions::default())
}
/// Loads the project config from `root` (if present), scans with options
/// derived from it, and bundles the config's data and padding settings into
/// a [`ProjectContext`].
///
/// # Errors
/// Propagates config-load, scan, and data-load failures.
pub fn scan_project_with_config(root: &Path) -> MdtResult<ProjectContext> {
    let config = MdtConfig::load(root)?;
    let options = ScanOptions::from_config(config.as_ref());
    let project = scan_project_with_options(root, &options)?;
    let padding = config.as_ref().and_then(|c| c.padding.clone());
    // Without a config there are no data sources to load.
    let data = if let Some(config) = config {
        config.load_data(root)?
    } else {
        HashMap::new()
    };
    Ok(ProjectContext {
        project,
        data,
        padding,
    })
}
/// Compiles `patterns` into a `GlobSet`, silently skipping any pattern that
/// fails to parse and falling back to an empty set if the final build fails.
fn build_glob_set(patterns: &[String]) -> GlobSet {
    let mut builder = GlobSetBuilder::new();
    for glob in patterns.iter().filter_map(|p| Glob::new(p).ok()) {
        builder.add(glob);
    }
    builder.build().unwrap_or_else(|_| GlobSet::empty())
}
/// Converts CRLF and bare CR line endings to LF.
///
/// The `contains` pre-check lets the common LF-only case return a plain copy
/// without running the two `replace` passes.
pub fn normalize_line_endings(content: &str) -> String {
    match content.contains('\r') {
        true => content.replace("\r\n", "\n").replace('\r', "\n"),
        false => content.to_string(),
    }
}
/// Builds a deterministic cache key covering every scan option that can
/// affect parse results; any change to these options invalidates the cache.
///
/// Lists are sorted so the key does not depend on configuration order, and
/// template paths are normalized to forward slashes for cross-platform
/// stability. List entries are joined with the unit-separator control
/// character so entries cannot collide with the `|`-delimited outer fields.
fn build_project_cache_key(options: &ScanOptions) -> String {
    const SEP: &str = "\u{1f}";
    let mut exclude_patterns = options.exclude_patterns.clone();
    exclude_patterns.sort();
    let mut template_paths: Vec<String> = options
        .template_paths
        .iter()
        .map(|path| path.to_string_lossy().replace('\\', "/"))
        .collect();
    template_paths.sort();
    let mut excluded_blocks = options.excluded_blocks.clone();
    excluded_blocks.sort();
    format!(
        "index-v2|max={}|disable_gitignore={}|markdown={:?}|exclude={}|templates={}|excluded_blocks={}|cache_verify_hash={}",
        options.max_file_size,
        options.disable_gitignore,
        options.markdown_codeblocks,
        exclude_patterns.join(SEP),
        template_paths.join(SEP),
        excluded_blocks.join(SEP),
        options.cache_verify_hash,
    )
}
/// Returns the on-disk location of the project index cache for `root`.
pub fn project_cache_path(root: &Path) -> PathBuf {
    index_cache::cache_path(root)
}
/// Reports the state of the on-disk index cache for `root` without mutating it.
///
/// Checks proceed in stages — exists → readable → parseable JSON → schema /
/// project-key compatibility → full deserialization — returning early at the
/// first failed stage so later fields keep their pessimistic defaults.
pub fn inspect_project_cache(root: &Path, options: &ScanOptions) -> ProjectCacheInspection {
    let path = project_cache_path(root);
    // Start fully pessimistic; each successful stage below upgrades one field.
    let mut inspection = ProjectCacheInspection {
        path: path.clone(),
        artifact: ProjectCacheArtifactState {
            exists: path.is_file(),
            readable: false,
            valid: false,
        },
        schema_version: None,
        compatibility: ProjectCacheCompatibilityState {
            schema_supported: false,
            project_key_matches: false,
            hash_verification_enabled: options.cache_verify_hash,
        },
        telemetry: None,
    };
    if !inspection.artifact.exists {
        return inspection;
    }
    let Ok(bytes) = std::fs::read(&path) else {
        return inspection;
    };
    inspection.artifact.readable = true;
    // Parse as loose JSON first so schema version and project key can still
    // be reported even when the full cache structure fails to deserialize.
    let Ok(value) = serde_json::from_slice::<serde_json::Value>(&bytes) else {
        return inspection;
    };
    let schema_version = value
        .get("schema_version")
        .and_then(serde_json::Value::as_u64)
        .and_then(|version| u32::try_from(version).ok());
    inspection.schema_version = schema_version;
    inspection.compatibility.schema_supported =
        schema_version == Some(index_cache::CACHE_SCHEMA_VERSION);
    let expected_project_key = build_project_cache_key(options);
    inspection.compatibility.project_key_matches = value
        .get("project_key")
        .and_then(serde_json::Value::as_str)
        .is_some_and(|key| key == expected_project_key);
    let Ok(cache) = serde_json::from_value::<ProjectIndexCache>(value) else {
        return inspection;
    };
    // "valid" requires a supported schema in addition to successful
    // deserialization.
    inspection.artifact.valid = inspection.compatibility.schema_supported;
    inspection.telemetry = Some(cache.telemetry.into());
    inspection
}
/// Stats every file in `files` and records one fingerprint per relative file
/// key. When `verify_hash` is set, each fingerprint also carries a content
/// hash of the file's bytes.
///
/// # Errors
/// Returns `MdtError::FileTooLarge` for any file exceeding `max_file_size`,
/// and propagates I/O errors from `metadata`/`read`.
fn collect_file_fingerprints(
    root: &Path,
    files: &[PathBuf],
    max_file_size: u64,
    verify_hash: bool,
) -> MdtResult<BTreeMap<String, FileFingerprint>> {
    let mut fingerprints = BTreeMap::new();
    for file in files {
        let metadata = std::fs::metadata(file)?;
        let size = metadata.len();
        if size > max_file_size {
            return Err(MdtError::FileTooLarge {
                path: file.display().to_string(),
                size,
                limit: max_file_size,
            });
        }
        // Only hash file contents when verification is requested; hashing is
        // a full read of every file.
        let content_hash = verify_hash.then(|| hash_file_contents(file)).transpose()?;
        let key = index_cache::relative_file_key(root, file);
        let fingerprint = index_cache::build_file_fingerprint(&metadata, content_hash);
        fingerprints.insert(key, fingerprint);
    }
    Ok(fingerprints)
}
/// Hashes a file's raw bytes with the std `DefaultHasher`.
///
/// # Errors
/// Propagates any I/O error from reading the file.
fn hash_file_contents(path: &Path) -> MdtResult<u64> {
    let mut hasher = DefaultHasher::new();
    std::fs::read(path)?.hash(&mut hasher);
    Ok(hasher.finish())
}
/// Converts a parser-level diagnostic into a project-level diagnostic
/// anchored at `file`, preserving its kind, line, and column.
fn parse_diagnostic_to_project(file: &Path, diag: ParseDiagnostic) -> ProjectDiagnostic {
    // Map each parser variant to its project-level kind plus position, then
    // build the diagnostic once.
    let (kind, line, column) = match diag {
        ParseDiagnostic::UnclosedBlock { name, line, column } => {
            (DiagnosticKind::UnclosedBlock { name }, line, column)
        }
        ParseDiagnostic::UnknownTransformer { name, line, column } => {
            (DiagnosticKind::UnknownTransformer { name }, line, column)
        }
        ParseDiagnostic::InvalidTransformerArgs {
            name,
            expected,
            got,
            line,
            column,
        } => (
            DiagnosticKind::InvalidTransformerArgs {
                name,
                expected,
                got,
            },
            line,
            column,
        ),
    };
    ProjectDiagnostic {
        file: file.to_path_buf(),
        kind,
        line,
        column,
    }
}
/// Reads and parses a single file into cacheable provider/consumer entries
/// plus diagnostics.
///
/// Markdown files are parsed directly; other files go through the source
/// scanner, which honors `options.markdown_codeblocks`.
///
/// # Errors
/// Propagates I/O errors from reading the file and parser errors.
fn parse_file_for_scan(
    file: &Path,
    options: &ScanOptions,
) -> MdtResult<index_cache::CachedFileData> {
    let raw_content = std::fs::read_to_string(file)?;
    // Normalize CRLF/CR so block offsets and content are platform-independent.
    let content = normalize_line_endings(&raw_content);
    let (blocks, parse_diagnostics) = if is_markdown_file(file) {
        parse_with_diagnostics(&content)?
    } else {
        parse_source_with_diagnostics(&content, &options.markdown_codeblocks)?
    };
    let mut diagnostics: Vec<ProjectDiagnostic> = parse_diagnostics
        .into_iter()
        .map(|diag| parse_diagnostic_to_project(file, diag))
        .collect();
    let mut providers = Vec::new();
    let mut consumers = Vec::new();
    // Only `*.t.md` template files may contribute provider blocks (see below).
    let is_template = file
        .file_name()
        .and_then(|name| name.to_str())
        .is_some_and(|name| name.ends_with(".t.md"));
    // First pass: validate transformer usage on every block — including
    // blocks that the exclusion list will drop in the second pass.
    for block in &blocks {
        if let Err(MdtError::InvalidTransformerArgs {
            name,
            expected,
            got,
        }) = validate_transformers(&block.transformers)
        {
            diagnostics.push(ProjectDiagnostic {
                file: file.to_path_buf(),
                kind: DiagnosticKind::InvalidTransformerArgs {
                    name,
                    expected,
                    got,
                },
                line: block.opening.start.line,
                column: block.opening.start.column,
            });
        }
    }
    // Second pass: bucket surviving blocks into providers and consumers.
    for block in blocks {
        if options
            .excluded_blocks
            .iter()
            .any(|name| name == &block.name)
        {
            continue;
        }
        let block_content = extract_content_between_tags(&content, &block);
        match block.r#type {
            BlockType::Provider => {
                // Provider blocks outside template files are silently dropped.
                if !is_template {
                    continue;
                }
                providers.push(ProviderEntry {
                    block,
                    file: file.to_path_buf(),
                    content: block_content,
                });
            }
            BlockType::Consumer | BlockType::Inline => {
                consumers.push(ConsumerEntry {
                    block,
                    file: file.to_path_buf(),
                    content: block_content,
                });
            }
        }
    }
    Ok(index_cache::CachedFileData {
        providers,
        consumers,
        diagnostics,
    })
}
/// Merges per-file cached parse data into a single [`Project`].
///
/// Files are processed in the order of `files`; files with no entry in
/// `file_data` are skipped. After merging, an `UnusedProvider` diagnostic is
/// appended for every provider that no consumer-type block references.
///
/// # Errors
/// Returns `MdtError::DuplicateProvider` when two files declare a provider
/// with the same name.
fn build_project_from_file_data(
    root: &Path,
    files: &[PathBuf],
    file_data: &BTreeMap<String, index_cache::CachedFileData>,
) -> MdtResult<Project> {
    let mut providers: HashMap<String, ProviderEntry> = HashMap::new();
    let mut consumers = Vec::new();
    let mut diagnostics = Vec::new();
    for file in files {
        let file_key = index_cache::relative_file_key(root, file);
        let Some(entry) = file_data.get(&file_key) else {
            continue;
        };
        diagnostics.extend(entry.diagnostics.iter().cloned());
        for provider in &entry.providers {
            // Provider names must be unique project-wide; report both the
            // original and the conflicting file.
            if let Some(existing) = providers.get(&provider.block.name) {
                return Err(MdtError::DuplicateProvider {
                    name: provider.block.name.clone(),
                    first_file: existing.file.display().to_string(),
                    second_file: provider.file.display().to_string(),
                });
            }
            providers.insert(provider.block.name.clone(), provider.clone());
        }
        consumers.extend(entry.consumers.iter().cloned());
    }
    // Only Consumer-type blocks count as references; Inline blocks do not
    // keep a provider "used".
    let referenced_names: HashSet<&str> = consumers
        .iter()
        .filter(|consumer| consumer.block.r#type == BlockType::Consumer)
        .map(|consumer| consumer.block.name.as_str())
        .collect();
    for (name, entry) in &providers {
        if !referenced_names.contains(name.as_str()) {
            diagnostics.push(ProjectDiagnostic {
                file: entry.file.clone(),
                kind: DiagnosticKind::UnusedProvider { name: name.clone() },
                line: entry.block.opening.start.line,
                column: entry.block.opening.start.column,
            });
        }
    }
    Ok(Project {
        providers,
        consumers,
        diagnostics,
    })
}
/// Scans `root` (plus any configured template directories and force-included
/// globs) into a [`Project`], reusing the on-disk index cache per file where
/// fingerprints still match.
///
/// Cache behavior: a full fingerprint match returns the cached project
/// immediately; otherwise each file is either reused from the cache or
/// re-parsed, and a fresh cache (carrying forward prior telemetry) is saved.
///
/// # Errors
/// Propagates walk, fingerprint, and parse errors, including
/// `MdtError::FileTooLarge` and `MdtError::DuplicateProvider`.
pub fn scan_project_with_options(root: &Path, options: &ScanOptions) -> MdtResult<Project> {
    let mut files = collect_files(root, &options.exclude_patterns, options.disable_gitignore)?;
    // Template directories may live outside the normal walk; merge their
    // files in without duplicating paths already found.
    for template_dir in &options.template_paths {
        let abs_dir = root.join(template_dir);
        if abs_dir.is_dir() {
            let extra_files = collect_files(
                &abs_dir,
                &options.exclude_patterns,
                options.disable_gitignore,
            )?;
            for f in extra_files {
                if !files.contains(&f) {
                    files.push(f);
                }
            }
        }
    }
    let custom_exclude = build_exclude_matcher(root, &options.exclude_patterns)?;
    // Include globs can pull in files the extension filter would skip.
    if !options.include_set.is_empty() {
        collect_included_files(
            root,
            root,
            &options.include_set,
            &custom_exclude,
            &mut files,
            true,
        )?;
    }
    let project_key = build_project_cache_key(options);
    let file_fingerprints = collect_file_fingerprints(
        root,
        &files,
        options.max_file_size,
        options.cache_verify_hash,
    )?;
    let mut cache = index_cache::load(root, &project_key);
    // Fast path: identical fingerprints mean the whole cached project is
    // still valid; record the hit and return it.
    if let Some(cached) = &mut cache {
        if cached.files == file_fingerprints {
            cached
                .telemetry
                .record_scan(true, files.len(), 0, files.len());
            index_cache::save(root, cached);
            return Ok(cached.project.clone());
        }
    }
    // Slow path: reuse per-file cache entries whose fingerprint is unchanged,
    // re-parse the rest.
    let mut merged_file_data = BTreeMap::new();
    let mut reused_file_count = 0usize;
    let mut reparsed_file_count = 0usize;
    for file in &files {
        let file_key = index_cache::relative_file_key(root, file);
        let fingerprint = file_fingerprints.get(&file_key);
        let cached_entry = cache.as_ref().and_then(|cached| {
            if cached.files.get(&file_key) == fingerprint {
                return cached.file_data.get(&file_key).cloned();
            }
            None
        });
        let entry = if let Some(entry) = cached_entry {
            reused_file_count = reused_file_count.saturating_add(1);
            entry
        } else {
            reparsed_file_count = reparsed_file_count.saturating_add(1);
            parse_file_for_scan(file, options)?
        };
        merged_file_data.insert(file_key, entry);
    }
    let project = build_project_from_file_data(root, &files, &merged_file_data)?;
    let mut next_cache = ProjectIndexCache::new(
        project_key,
        file_fingerprints,
        merged_file_data,
        project.clone(),
    );
    // Carry cumulative telemetry forward across cache rebuilds.
    if let Some(previous_cache) = cache {
        next_cache.telemetry = previous_cache.telemetry;
    }
    next_cache
        .telemetry
        .record_scan(false, reused_file_count, reparsed_file_count, files.len());
    index_cache::save(root, &next_cache);
    Ok(project)
}
/// Returns the text between a block's opening and closing tags.
///
/// Offsets come from the parser; if they are inverted, out of range, or do
/// not fall on UTF-8 character boundaries, an empty string is returned.
pub fn extract_content_between_tags(source: &str, block: &Block) -> String {
    let start_offset = block.opening.end.offset;
    let end_offset = block.closing.start.offset;
    if start_offset >= end_offset {
        return String::new();
    }
    // `str::get` (instead of `source[start..end]`) also rejects offsets that
    // split a multi-byte character, which the original explicit length check
    // could not catch and which would panic at the index expression.
    source
        .get(start_offset..end_offset)
        .map_or_else(String::new, str::to_string)
}
/// Compiles user-supplied exclude patterns into a gitignore-style matcher
/// rooted at `root`.
///
/// # Errors
/// Returns `MdtError::ConfigParse` when a pattern is invalid or the matcher
/// itself cannot be built.
fn build_exclude_matcher(root: &Path, patterns: &[String]) -> MdtResult<Gitignore> {
    let mut builder = GitignoreBuilder::new(root);
    for pattern in patterns {
        if let Err(e) = builder.add_line(None, pattern) {
            return Err(MdtError::ConfigParse(format!(
                "invalid exclude pattern `{pattern}`: {e}"
            )));
        }
    }
    builder
        .build()
        .map_err(|e| MdtError::ConfigParse(format!("failed to build exclude rules: {e}")))
}
/// Builds a gitignore matcher from `<root>/.gitignore`, or a match-nothing
/// matcher when the file is absent or the rules cannot be built.
fn build_gitignore(root: &Path) -> Gitignore {
    let mut builder = GitignoreBuilder::new(root);
    let gitignore_path = root.join(".gitignore");
    if gitignore_path.exists() {
        // A malformed .gitignore is tolerated (error discarded) so it never
        // aborts the scan.
        let _ = builder.add(gitignore_path);
    }
    // The previous fallback constructed a second empty GitignoreBuilder only
    // to fall back to `Gitignore::empty()` if that also failed; an empty
    // builder's output matches nothing, so go straight to `empty()`.
    builder.build().unwrap_or_else(|_| Gitignore::empty())
}
/// Recursively collects scannable files under `root`, applying gitignore
/// rules (unless disabled) and the user's exclude patterns, and returns the
/// result sorted for deterministic output.
///
/// # Errors
/// Propagates exclude-pattern, I/O, and symlink-cycle errors from the walk.
fn collect_files(
    root: &Path,
    exclude_patterns: &[String],
    disable_gitignore: bool,
) -> MdtResult<Vec<PathBuf>> {
    let gitignore = if disable_gitignore {
        Gitignore::empty()
    } else {
        build_gitignore(root)
    };
    let custom_exclude = build_exclude_matcher(root, exclude_patterns)?;
    let mut files = Vec::new();
    let mut visited_dirs = HashSet::new();
    walk_dir(
        root,
        root,
        &mut files,
        true,
        &gitignore,
        &custom_exclude,
        &mut visited_dirs,
    )?;
    files.sort();
    Ok(files)
}
/// Returns `true` for entry names the scanner always skips: hidden
/// dot-directories (except `.templates`), `node_modules`, and `target`.
fn is_ignored_directory_name(name: &str) -> bool {
    match name {
        ".templates" => false,
        "node_modules" | "target" => true,
        other => other.starts_with('.'),
    }
}
/// Returns `true` when `dir` contains any recognized config file, marking it
/// as the root of a nested project that this walk must not descend into.
fn has_project_config(dir: &Path) -> bool {
    for candidate in CONFIG_FILE_CANDIDATES {
        if dir.join(candidate).is_file() {
            return true;
        }
    }
    false
}
#[allow(clippy::only_used_in_recursion)]
/// Recursively walks `dir`, pushing scannable files into `files`.
///
/// Skips always-ignored names, gitignore matches, custom-exclude matches,
/// and (below the root) nested project directories. Directory identity is
/// tracked via canonical paths to detect symlink cycles.
///
/// # Errors
/// Returns `MdtError::SymlinkCycle` when a directory is visited twice, and
/// propagates `read_dir` I/O errors.
fn walk_dir(
    root: &Path,
    dir: &Path,
    files: &mut Vec<PathBuf>,
    is_root: bool,
    gitignore: &Gitignore,
    custom_exclude: &Gitignore,
    visited_dirs: &mut HashSet<PathBuf>,
) -> MdtResult<()> {
    if !dir.is_dir() {
        return Ok(());
    }
    // Canonicalize so the same directory reached via different symlinks is
    // recognized; fall back to the raw path when canonicalization fails.
    let canonical = dir.canonicalize().unwrap_or_else(|_| dir.to_path_buf());
    if !visited_dirs.insert(canonical.clone()) {
        return Err(MdtError::SymlinkCycle {
            path: dir.display().to_string(),
        });
    }
    let entries = std::fs::read_dir(dir)?;
    for entry in entries {
        let entry = entry?;
        let path = entry.path();
        // NOTE: applies to files too, not just directories — e.g. dotfiles
        // are skipped here as well.
        if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
            if is_ignored_directory_name(name) {
                continue;
            }
        }
        let is_dir = path.is_dir();
        if gitignore.matched(&path, is_dir).is_ignore() {
            continue;
        }
        if custom_exclude.matched(&path, is_dir).is_ignore() {
            continue;
        }
        if is_dir {
            // A subdirectory with its own config is a separate project;
            // don't descend (the scan root itself is exempt).
            if !is_root && has_project_config(&path) {
                continue;
            }
            walk_dir(
                root,
                &path,
                files,
                false,
                gitignore,
                custom_exclude,
                visited_dirs,
            )?;
        } else if is_scannable_file(&path) {
            files.push(path);
        }
    }
    Ok(())
}
/// Recursively adds files matching `include_set` (relative to `root`) to
/// `files`, skipping duplicates, excluded paths, always-ignored names, and
/// (below the root) nested project directories.
///
/// Unlike `walk_dir`, this pass does not consult `.gitignore` and does not
/// filter by file extension — include globs can pull in any file type.
///
/// # Errors
/// Propagates `read_dir` I/O errors.
fn collect_included_files(
    root: &Path,
    dir: &Path,
    include_set: &GlobSet,
    exclude_matcher: &Gitignore,
    files: &mut Vec<PathBuf>,
    is_root: bool,
) -> MdtResult<()> {
    if !dir.is_dir() {
        return Ok(());
    }
    let entries = std::fs::read_dir(dir)?;
    for entry in entries {
        let entry = entry?;
        let path = entry.path();
        if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
            if is_ignored_directory_name(name) {
                continue;
            }
        }
        let is_dir = path.is_dir();
        if exclude_matcher.matched(&path, is_dir).is_ignore() {
            continue;
        }
        // Globs are matched against the path relative to the scan root.
        if let Ok(rel_path) = path.strip_prefix(root) {
            if path.is_file() && include_set.is_match(rel_path) && !files.contains(&path) {
                files.push(path.clone());
            }
        }
        if is_dir {
            if !is_root && has_project_config(&path) {
                continue;
            }
            collect_included_files(root, &path, include_set, exclude_matcher, files, false)?;
        }
    }
    Ok(())
}
/// Returns `true` when `path` has an extension the scanner knows how to
/// parse: the markdown variants plus a fixed set of source-code extensions.
fn is_scannable_file(path: &Path) -> bool {
    path.extension()
        .and_then(|e| e.to_str())
        .is_some_and(|ext| {
            matches!(
                ext,
                "md" | "mdx" | "markdown" | "rs" | "ts" | "tsx" | "js" | "jsx"
                    | "py" | "go" | "java" | "kt" | "swift" | "c" | "cpp" | "h" | "cs"
            )
        })
}
/// Returns `true` for files with a markdown extension (`md`, `mdx`,
/// `markdown`); these are parsed with the markdown parser rather than the
/// source scanner.
fn is_markdown_file(path: &Path) -> bool {
    path.extension()
        .and_then(|e| e.to_str())
        .is_some_and(|ext| matches!(ext, "md" | "mdx" | "markdown"))
}
/// Returns `true` when the file name ends with `.t.md`, marking it as a
/// template file (the only files allowed to define provider blocks).
pub fn is_template_file(path: &Path) -> bool {
    let Some(name) = path.file_name().and_then(|n| n.to_str()) else {
        return false;
    };
    name.ends_with(".t.md")
}
/// Lists names referenced by consumer blocks that have no matching provider,
/// in first-reference order and without duplicates. Inline blocks are not
/// considered references.
pub fn find_missing_providers(project: &Project) -> Vec<String> {
    let mut missing: Vec<String> = Vec::new();
    let unresolved = project
        .consumers
        .iter()
        .filter(|c| c.block.r#type == BlockType::Consumer)
        .map(|c| &c.block.name)
        .filter(|name| !project.providers.contains_key(*name));
    for name in unresolved {
        if !missing.iter().any(|m| m == name) {
            missing.push(name.clone());
        }
    }
    missing
}
/// Validates that every consumer has a provider.
///
/// # Errors
/// Returns `MdtError::MissingProvider` naming the first missing provider.
pub fn validate_project(project: &Project) -> MdtResult<()> {
    match find_missing_providers(project).into_iter().next() {
        Some(name) => Err(MdtError::MissingProvider(name)),
        None => Ok(()),
    }
}