use crate::hashing::sha256_file;
use crate::policy::{CompiledPolicy, FileTypeClass, classify_file_type};
use orbok_core::{
FileStatus, JobType, OrbokResult, SourceId, now_iso8601, system_time_iso8601,
};
use orbok_db::Catalog;
use orbok_db::repo::{
FileRepository, IndexJobRepository, NewFile, ObservedMetadata, SourceRecord, SourceRepository,
};
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Instant;
/// Parameters for a single [`Scanner::scan`] run.
#[derive(Debug, Clone)]
pub struct ScanRequest {
/// Source to scan; the scan fails with `SourceNotFound` when no such source is registered.
pub source_id: SourceId,
/// When `true`, already-known files are re-hashed even if their size and modification
/// time match the stored record.
pub force_hash: bool,
/// When `true`, an extract job is enqueued for files the scan records as new or changed.
pub enqueue_index_jobs: bool,
}
/// Classification of what a scan concluded about a single file.
///
/// NOTE(review): this enum is not referenced by the scanner code visible in this file; the
/// variant names parallel the counters in [`ScanSummary`], but confirm the intended meaning
/// against its actual users.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ScanOutcomeKind {
New,
Unchanged,
Stale,
Unsupported,
PermissionDenied,
Failed,
}
/// Counters describing the result of one [`Scanner::scan`] run.
///
/// All counters start at zero (`Default`) and are only ever incremented by the scan.
#[derive(Debug, Clone, Default)]
pub struct ScanSummary {
/// Regular files reached by the directory walk, counted before per-file policy filters.
pub seen_files: u64,
/// Supported files that had no catalog record and were inserted.
pub new_files: u64,
/// Known files whose metadata or content hash matched the stored record.
pub unchanged_files: u64,
/// Known files whose content hash differed from the stored record.
pub stale_files: u64,
/// Value reported by the catalog when marking records not seen by this scan as missing;
/// stays zero when the scan was canceled.
pub missing_files: u64,
/// Files of an unsupported type recorded by this scan.
pub unsupported_files: u64,
/// Directories or files that could not be read because permission was denied.
pub permission_denied_files: u64,
/// Directory reads, directory entries, or files that failed for any other reason.
pub failed_files: u64,
/// Extract jobs enqueued by this scan.
pub queued_index_jobs: u64,
/// Wall-clock duration of the scan in milliseconds.
pub duration_ms: u64,
/// `true` when the cancel flag was observed set and the walk stopped early.
pub canceled: bool,
}
/// Walks a registered source directory and reconciles what it finds with the catalog.
pub struct Scanner<'a> {
// Borrowed catalog handle; every repository used during a scan is built from it.
catalog: &'a Catalog,
}
impl<'a> Scanner<'a> {
pub fn new(catalog: &'a Catalog) -> Self {
Self { catalog }
}
pub fn scan(&self, request: &ScanRequest, cancel: &AtomicBool) -> OrbokResult<ScanSummary> {
let started = Instant::now();
let scan_started_at = now_iso8601();
let mut summary = ScanSummary::default();
let sources = SourceRepository::new(self.catalog);
let source = sources
.get(&request.source_id)?
.ok_or(orbok_core::OrbokError::SourceNotFound)?;
let policy = CompiledPolicy::from_source(&source);
let root = PathBuf::from(&source.canonical_path);
let files = FileRepository::new(self.catalog);
let jobs = IndexJobRepository::new(self.catalog);
let mut stack = vec![root.clone()];
'walk: while let Some(dir) = stack.pop() {
if cancel.load(Ordering::Relaxed) {
summary.canceled = true;
break 'walk;
}
let entries = match std::fs::read_dir(&dir) {
Ok(entries) => entries,
Err(e) if e.kind() == std::io::ErrorKind::PermissionDenied => {
summary.permission_denied_files += 1;
continue;
}
Err(_) => {
summary.failed_files += 1;
continue;
}
};
for entry in entries {
if cancel.load(Ordering::Relaxed) {
summary.canceled = true;
break 'walk;
}
let Ok(entry) = entry else {
summary.failed_files += 1;
continue;
};
let path = entry.path();
let name = entry.file_name().to_string_lossy().into_owned();
if skip_component(&policy, &source, &name) {
continue;
}
let Ok(file_type) = entry.file_type() else {
summary.failed_files += 1;
continue;
};
if file_type.is_symlink() {
if !symlink_allowed(&policy, &root, &path) {
continue;
}
}
if path.is_dir() {
stack.push(path);
continue;
}
if !path.is_file() {
continue;
}
summary.seen_files += 1;
let outcome =
self.process_file(&source, &policy, &files, &jobs, &path, request, &mut summary);
match outcome {
Ok(()) => {}
Err(_) => summary.failed_files += 1,
}
}
}
if !summary.canceled {
summary.missing_files = files.mark_missing_unseen(&source.source_id, &scan_started_at)?;
sources.touch_scanned(&source.source_id)?;
}
summary.duration_ms = started.elapsed().as_millis() as u64;
tracing::info!(
source = source.source_id.as_str(),
seen = summary.seen_files,
new = summary.new_files,
stale = summary.stale_files,
missing = summary.missing_files,
"scan finished"
);
Ok(summary)
}
/// Reconciles one regular file found by the walk with its catalog record.
///
/// Files rejected by the include policy or the size policy are ignored. A file whose
/// metadata cannot be read because of missing permission is recorded with
/// `FileStatus::PermissionDenied`. Otherwise:
///
/// * no record yet: the file is inserted (hashed and `Discovered` when its type is
///   supported, `Unsupported` otherwise) and, for supported files, optionally queued;
/// * record exists, unsupported type: the record is kept `Unsupported` and is never
///   hashed or queued;
/// * record exists, size and mtime match and `force_hash` is off: counted as unchanged;
/// * record exists, content hash matches: counted as unchanged and the newly observed
///   metadata is stored so the file is not re-hashed on every later scan;
/// * record exists, content hash differs: marked `Stale` (if it had been indexed) or
///   `Discovered`, counted as stale, and optionally queued.
///
/// A record that was `Missing` is restored to `Indexed` or `Discovered` depending on
/// whether it has a `last_indexed_at` value.
///
/// # Errors
///
/// Propagates metadata errors other than permission-denied, hashing errors, and
/// repository errors.
// The repositories, policy and summary are built once per scan and threaded through.
#[allow(clippy::too_many_arguments)]
fn process_file(
    &self,
    source: &SourceRecord,
    policy: &CompiledPolicy,
    files: &FileRepository<'_>,
    jobs: &IndexJobRepository<'_>,
    path: &Path,
    request: &ScanRequest,
    summary: &mut ScanSummary,
) -> OrbokResult<()> {
    let file_name = path
        .file_name()
        .map(|n| n.to_string_lossy().into_owned())
        .unwrap_or_default();
    if !policy.file_included(&file_name) {
        return Ok(());
    }
    let metadata = match std::fs::metadata(path) {
        Ok(m) => m,
        Err(e) if e.kind() == std::io::ErrorKind::PermissionDenied => {
            summary.permission_denied_files += 1;
            self.upsert_status_only(source, files, path, FileStatus::PermissionDenied)?;
            return Ok(());
        }
        Err(e) => return Err(e.into()),
    };
    if !policy.size_allowed(metadata.len()) {
        return Ok(());
    }
    let supported = classify_file_type(path) == FileTypeClass::Supported;
    // The walk path rendered as text; it is the lookup key, not a re-canonicalized path.
    let canonical = path.to_string_lossy().into_owned();
    let mut observed = ObservedMetadata {
        file_size_bytes: metadata.len(),
        modified_at: metadata.modified().ok().map(system_time_iso8601),
        platform_file_key: platform_file_key(&metadata),
        content_hash: None,
    };

    let Some(record) = files.get_by_path(&source.source_id, &canonical)? else {
        // First sighting of this path.
        let status = if supported {
            observed.content_hash = Some(sha256_file(path)?);
            FileStatus::Discovered
        } else {
            summary.unsupported_files += 1;
            FileStatus::Unsupported
        };
        let record = files.insert(NewFile {
            source_id: source.source_id.clone(),
            original_path: canonical.clone(),
            canonical_path: canonical.clone(),
            display_path: display_path(&source.canonical_path, &canonical),
            extension: path
                .extension()
                .map(|e| e.to_string_lossy().to_ascii_lowercase()),
            metadata: observed,
            status,
        })?;
        if supported {
            summary.new_files += 1;
            if request.enqueue_index_jobs {
                jobs.enqueue(JobType::Extract, Some(&source.source_id), Some(&record.file_id))?;
                summary.queued_index_jobs += 1;
            }
        }
        return Ok(());
    };

    let was_missing = record.file_status == FileStatus::Missing;
    let metadata_unchanged = record.file_size_bytes == observed.file_size_bytes
        && record.modified_at == observed.modified_at;

    if !supported {
        // Unsupported types are tracked only: hashing them or queueing an extract job
        // would promote them to `Discovered` and hand the indexer files it cannot read.
        if metadata_unchanged && record.file_status == FileStatus::Unsupported {
            files.touch_seen(&record.file_id)?;
            summary.unchanged_files += 1;
        } else {
            files.update_observed(&record.file_id, &observed, FileStatus::Unsupported)?;
            summary.unsupported_files += 1;
        }
        return Ok(());
    }

    let restored_status = was_missing.then(|| {
        if record.last_indexed_at.is_some() {
            FileStatus::Indexed
        } else {
            FileStatus::Discovered
        }
    });

    if metadata_unchanged && !request.force_hash {
        match restored_status {
            Some(status) => files.update_observed(&record.file_id, &observed, status)?,
            None => files.touch_seen(&record.file_id)?,
        }
        summary.unchanged_files += 1;
        return Ok(());
    }

    let new_hash = sha256_file(path)?;
    let hash_unchanged = record.content_hash.as_deref() == Some(new_hash.as_str());
    observed.content_hash = Some(new_hash);

    if hash_unchanged {
        match restored_status {
            Some(status) => files.update_observed(&record.file_id, &observed, status)?,
            None if metadata_unchanged => files.touch_seen(&record.file_id)?,
            // Same content but a new mtime: store the new metadata (status untouched),
            // otherwise the mismatch forces a re-hash on every subsequent scan.
            None => files.update_observed(&record.file_id, &observed, record.file_status)?,
        }
        summary.unchanged_files += 1;
        return Ok(());
    }

    // Content changed. A record that is (or, before going missing, was) indexed still has
    // an index that is now out of date, so it becomes `Stale`; anything else starts over.
    let had_index = matches!(record.file_status, FileStatus::Indexed | FileStatus::Stale)
        || (was_missing && record.last_indexed_at.is_some());
    let status = if had_index {
        FileStatus::Stale
    } else {
        FileStatus::Discovered
    };
    files.update_observed(&record.file_id, &observed, status)?;
    summary.stale_files += 1;
    if request.enqueue_index_jobs {
        jobs.enqueue(JobType::Extract, Some(&source.source_id), Some(&record.file_id))?;
        summary.queued_index_jobs += 1;
    }
    Ok(())
}
/// Records `status` for `path` without any observed metadata.
///
/// An existing record only has its status replaced. When there is no record yet, a new
/// one is inserted with default metadata and no extension.
fn upsert_status_only(
    &self,
    source: &SourceRecord,
    files: &FileRepository<'_>,
    path: &Path,
    status: FileStatus,
) -> OrbokResult<()> {
    let path_text = path.to_string_lossy().into_owned();

    if let Some(existing) = files.get_by_path(&source.source_id, &path_text)? {
        return files.set_status(&existing.file_id, status);
    }

    let placeholder = NewFile {
        source_id: source.source_id.clone(),
        original_path: path_text.clone(),
        canonical_path: path_text.clone(),
        display_path: display_path(&source.canonical_path, &path_text),
        extension: None,
        metadata: ObservedMetadata::default(),
        status,
    };
    files.insert(placeholder)?;
    Ok(())
}
}
/// Decides whether the walk should skip a directory entry called `name`.
///
/// An entry is skipped when the compiled policy excludes the component, or when the
/// component is hidden and the source's hidden-file policy is `Exclude`.
fn skip_component(policy: &CompiledPolicy, source: &SourceRecord, name: &str) -> bool {
    policy.component_excluded(name)
        || (CompiledPolicy::component_hidden(name)
            && source.hidden_file_policy == orbok_core::HiddenFilePolicy::Exclude)
}
fn symlink_allowed(policy: &CompiledPolicy, root: &Path, path: &Path) -> bool {
match policy.symlink_policy {
orbok_core::SymlinkPolicy::Ignore => false,
orbok_core::SymlinkPolicy::FollowWithinSource
| orbok_core::SymlinkPolicy::FollowAllWithWarning => match std::fs::canonicalize(path) {
Ok(resolved) => resolved.starts_with(root),
Err(_) => false,
},
}
}
/// Renders `canonical` relative to `root` for display.
///
/// Returns the part of `canonical` below `root` with leading `/` and `\` separators
/// removed. Falls back to `canonical` unchanged when it is not located under `root` or
/// when nothing would remain (the path *is* the root).
///
/// The prefix must end on a separator boundary: with root `/data/src`, the sibling path
/// `/data/src2/file` is *not* treated as being inside the root.
fn display_path(root: &str, canonical: &str) -> String {
    const SEPARATORS: [char; 2] = ['/', '\\'];
    canonical
        .strip_prefix(root)
        // A plain string prefix also matches siblings such as `<root>2/...`; require the
        // match to stop exactly at a path separator (or at the end of the path).
        .filter(|rest| {
            rest.is_empty() || rest.starts_with(SEPARATORS) || root.ends_with(SEPARATORS)
        })
        .map(|rest| rest.trim_start_matches(SEPARATORS))
        .filter(|rest| !rest.is_empty())
        .map_or_else(|| canonical.to_string(), str::to_string)
}
/// Builds a platform-specific identity key for a file from its metadata.
///
/// On Unix the key is `"<dev>:<ino>"` taken from the metadata's device and inode numbers.
#[cfg(unix)]
fn platform_file_key(metadata: &std::fs::Metadata) -> Option<String> {
    use std::os::unix::fs::MetadataExt;
    let (device, inode) = (metadata.dev(), metadata.ino());
    Some(format!("{device}:{inode}"))
}

/// Builds a platform-specific identity key for a file from its metadata.
///
/// No key is produced on non-Unix platforms.
#[cfg(not(unix))]
fn platform_file_key(_metadata: &std::fs::Metadata) -> Option<String> {
    Option::None
}