use anyhow::{Context, Result};
use indexmap::IndexMap;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::io::Write as _;
use std::path::{Path, PathBuf};
use std::time::SystemTime;
use crate::bm25::{Bm25InvertedIndex, resolve_language, tokenize};
use crate::filter::extract_tags;
use crate::frontmatter;
use crate::link_graph::{
DEFAULT_FRONTMATTER_LINK_PROPERTIES, FileLinks, LinkGraph, LinkGraphVisitor,
};
use crate::links::Link;
use crate::scanner::{self, FileVisitor, FrontmatterCollector, ScanAction};
use crate::tasks::TaskExtractor;
use crate::types::{FindTaskInfo, OutlineSection, TaskCount};
/// Everything the index knows about a single scanned file.
///
/// Built by `scan_one_file`; serialized as part of a snapshot.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IndexEntry {
/// Path string the file was scanned under (copied verbatim from the scanner's `rel_path` argument).
pub rel_path: String,
/// File modification time formatted as `YYYY-MM-DDThh:mm:ssZ` (see `format_modified`).
pub modified: String,
/// Frontmatter properties collected for the file.
pub properties: IndexMap<String, serde_json::Value>,
/// Tags derived from `properties` via `extract_tags`.
pub tags: Vec<String>,
/// Outline sections; empty when the file was scanned without its body.
pub sections: Vec<OutlineSection>,
/// Tasks found in the body; empty when the file was scanned without its body.
pub tasks: Vec<FindTaskInfo>,
/// Links found in the file, each paired with a `usize` (the scanner binds it as `line`);
/// empty when the file was scanned without its body.
pub links: Vec<(usize, Link)>,
/// Stemmed tokens of title + body, present only when BM25 tokenization was requested.
/// Left out of the serialized form when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub bm25_tokens: Option<Vec<String>>,
/// Canonical name of the language used for tokenization, present only alongside `bm25_tokens`.
/// Left out of the serialized form when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub bm25_language: Option<String>,
}
/// Read-only access to an index of vault files, independent of how the index was produced
/// (fresh scan or loaded snapshot).
pub trait VaultIndex {
/// All entries held by the index.
fn entries(&self) -> &[IndexEntry];
/// Look up one entry by its `rel_path`; `None` when no entry has that path.
fn get(&self, rel_path: &str) -> Option<&IndexEntry>;
/// The link graph that belongs to this index.
fn link_graph(&self) -> &LinkGraph;
/// The BM25 inverted index, when the implementation carries one.
///
/// The default implementation returns `None`.
fn bm25_index(&self) -> Option<&Bm25InvertedIndex> {
None
}
}
/// Knobs controlling what `ScannedIndex::build` collects for each file.
#[derive(Debug, Clone)]
pub struct ScanOptions<'a> {
/// When `true`, sections, tasks and links are extracted and a link graph is built;
/// when `false`, those collections stay empty and the graph is the default one.
pub scan_body: bool,
/// When `true`, each entry gets `bm25_tokens` / `bm25_language` filled in.
pub bm25_tokenize: bool,
/// Language passed to `resolve_language` as the default when tokenizing.
pub default_language: Option<&'a str>,
/// Frontmatter property names handed to the link visitor;
/// `None` selects `DEFAULT_FRONTMATTER_LINK_PROPERTIES`.
pub frontmatter_link_props: Option<&'a [String]>,
}
/// Index produced by scanning files on disk (see `ScannedIndex::build`).
pub struct ScannedIndex {
// Entries sorted by `rel_path`.
entries: Vec<IndexEntry>,
// Maps each entry's `rel_path` to its position in `entries`.
path_index: HashMap<String, usize>,
// Link graph built from the scanned files (default graph when the body was not scanned).
graph: LinkGraph,
}
/// A non-fatal problem encountered while indexing one file.
///
/// `ScannedIndex::build` emits one of these for every file that was skipped
/// because its frontmatter could not be parsed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IndexWarning {
    /// Path string of the file that was skipped.
    pub rel_path: String,
    /// Human-readable description of the error (the error's `Display` output).
    pub message: String,
}
/// Result of `ScannedIndex::build`: the index plus any per-file warnings.
pub struct ScannedIndexBuild {
/// The index assembled from every file that scanned successfully.
pub index: ScannedIndex,
/// One warning per file that was skipped because of a frontmatter parse error.
pub warnings: Vec<IndexWarning>,
}
impl ScannedIndex {
    /// Scan every `(full_path, rel_path)` pair in `files` in parallel and assemble an index.
    ///
    /// * Files whose scan fails with a frontmatter parse error (as judged by
    ///   `frontmatter::is_parse_error`) are left out of the index and reported in
    ///   `ScannedIndexBuild::warnings`.
    /// * Any other scan error aborts the build and is returned.
    /// * Entries end up sorted by `rel_path`.
    /// * The link graph is only built when `options.scan_body` is set; otherwise the
    ///   default graph is used.
    pub fn build(
        files: &[(PathBuf, String)],
        site_prefix: Option<&str>,
        options: &ScanOptions<'_>,
    ) -> Result<ScannedIndexBuild> {
        // Resolve the frontmatter link properties once, up front, so every worker shares them.
        let link_props: Vec<String> = match options.frontmatter_link_props {
            Some(props) => props.to_vec(),
            None => DEFAULT_FRONTMATTER_LINK_PROPERTIES
                .iter()
                .map(|s| (*s).to_owned())
                .collect(),
        };

        // The indexed parallel collect keeps results in the same order as `files`.
        let scanned: Vec<Result<(IndexEntry, Option<FileLinks>)>> = files
            .par_iter()
            .map(|(full_path, rel_path)| {
                scan_one_file(
                    full_path,
                    rel_path,
                    options.scan_body,
                    options.bm25_tokenize,
                    options.default_language,
                    &link_props,
                )
            })
            .collect();

        let mut entries = Vec::with_capacity(files.len());
        let mut per_file_links: Vec<FileLinks> = Vec::with_capacity(files.len());
        let mut warnings: Vec<IndexWarning> = Vec::new();

        for ((_, rel_path), outcome) in files.iter().zip(scanned) {
            match outcome {
                Ok((entry, links)) => {
                    entries.push(entry);
                    per_file_links.extend(links);
                }
                Err(err) if frontmatter::is_parse_error(&err) => warnings.push(IndexWarning {
                    rel_path: rel_path.clone(),
                    message: err.to_string(),
                }),
                Err(err) => return Err(err),
            }
        }

        entries.sort_by(|lhs, rhs| lhs.rel_path.cmp(&rhs.rel_path));

        let graph = if options.scan_body {
            LinkGraph::from_file_links(per_file_links, site_prefix).graph
        } else {
            LinkGraph::default()
        };

        let mut path_index: HashMap<String, usize> = HashMap::with_capacity(entries.len());
        for (position, entry) in entries.iter().enumerate() {
            path_index.insert(entry.rel_path.clone(), position);
        }

        Ok(ScannedIndexBuild {
            index: ScannedIndex {
                entries,
                path_index,
                graph,
            },
            warnings,
        })
    }
}
impl VaultIndex for ScannedIndex {
    /// Entries in `rel_path` order.
    fn entries(&self) -> &[IndexEntry] {
        self.entries.as_slice()
    }

    /// Entry stored under `rel_path`, if any.
    fn get(&self, rel_path: &str) -> Option<&IndexEntry> {
        let position = *self.path_index.get(rel_path)?;
        Some(&self.entries[position])
    }

    /// Link graph built during the scan.
    fn link_graph(&self) -> &LinkGraph {
        &self.graph
    }
}
/// Metadata stored at the front of a snapshot, describing where and by whom it was written.
#[derive(Debug, Serialize, Deserialize)]
struct SnapshotHeader {
// Vault directory string the snapshot was written for (compared verbatim by `validate`).
vault_dir: String,
// Site prefix the snapshot was written for, if any (compared verbatim by `validate`).
site_prefix: Option<String>,
// Seconds since the Unix epoch at write time; 0 if the clock was before the epoch.
created_at: u64,
// Process id of the writer (`std::process::id()` at write time).
pid: u32,
}
/// Owned form of a snapshot, used when deserializing one from bytes.
#[derive(Serialize, Deserialize)]
struct SnapshotData {
header: SnapshotHeader,
entries: Vec<IndexEntry>,
graph: LinkGraph,
// Absent from the serialized form when `None`; defaults to `None` when missing on load.
#[serde(default, skip_serializing_if = "Option::is_none")]
bm25_index: Option<Bm25InvertedIndex>,
}
/// Borrowing counterpart of `SnapshotData`, used when writing a snapshot so the entries,
/// graph and BM25 index do not have to be cloned. Field names mirror `SnapshotData`.
#[derive(Serialize)]
struct SnapshotDataRef<'a> {
header: SnapshotHeader,
entries: &'a [IndexEntry],
graph: &'a LinkGraph,
// Absent from the serialized form when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
bm25_index: Option<&'a Bm25InvertedIndex>,
}
/// Index loaded from a snapshot file; unlike `ScannedIndex` it can be mutated in place
/// (entries removed, inserted, rescanned or renamed) and saved again.
pub struct SnapshotIndex {
// Entries sorted by `rel_path`.
entries: Vec<IndexEntry>,
// Maps each entry's `rel_path` to its position in `entries`.
path_index: HashMap<String, usize>,
graph: LinkGraph,
// Header read from the snapshot; reused by `save_to`.
header: SnapshotHeader,
bm25_index: Option<Bm25InvertedIndex>,
// Frontmatter link properties used when rescanning; `None` (the state after loading)
// means `DEFAULT_FRONTMATTER_LINK_PROPERTIES`.
frontmatter_link_props: Option<Vec<String>>,
}
impl SnapshotIndex {
pub fn remove_entry(&mut self, rel_path: &str) {
if let Some(&idx) = self.path_index.get(rel_path) {
self.entries.remove(idx);
self.rebuild_path_index();
}
}
pub fn insert_entry(&mut self, entry: IndexEntry) {
let pos = self
.entries
.binary_search_by(|e| e.rel_path.cmp(&entry.rel_path))
.unwrap_or_else(|i| i);
self.entries.insert(pos, entry);
self.rebuild_path_index();
}
pub fn get_mut(&mut self, rel_path: &str) -> Option<&mut IndexEntry> {
self.path_index
.get(rel_path)
.copied()
.map(|i| &mut self.entries[i])
}
pub fn graph_mut(&mut self) -> &mut LinkGraph {
&mut self.graph
}
pub fn set_frontmatter_link_props(&mut self, props: Option<Vec<String>>) {
self.frontmatter_link_props = props;
}
fn effective_frontmatter_link_props(&self) -> Vec<String> {
self.frontmatter_link_props.clone().unwrap_or_else(|| {
DEFAULT_FRONTMATTER_LINK_PROPERTIES
.iter()
.map(|s| (*s).to_owned())
.collect()
})
}
pub(crate) fn rescan_entry(&mut self, dir: &Path, rel_path: &str) -> Result<Option<FileLinks>> {
let Some(&idx) = self.path_index.get(rel_path) else {
return Ok(None);
};
let full_path = dir.join(rel_path);
let fm_props = self.effective_frontmatter_link_props();
let (entry, file_links) =
scan_one_file(&full_path, rel_path, true, false, None, &fm_props)?;
self.entries[idx] = entry;
Ok(file_links)
}
pub fn refresh_entry(&mut self, dir: &Path, rel_path: &str) -> Result<bool> {
match self.rescan_entry(dir, rel_path)? {
Some(_) => Ok(true),
None => Ok(false),
}
}
pub fn rename_entry(&mut self, dir: &Path, old_rel: &str, new_rel: &str) -> Result<bool> {
let Some(&old_idx) = self.path_index.get(old_rel) else {
return Ok(false);
};
let full_path = dir.join(new_rel);
let fm_props = self.effective_frontmatter_link_props();
let (entry, _file_links) =
scan_one_file(&full_path, new_rel, true, false, None, &fm_props)?;
self.entries.remove(old_idx);
let pos = self
.entries
.binary_search_by(|e| e.rel_path.cmp(&entry.rel_path))
.unwrap_or_else(|i| i);
self.entries.insert(pos, entry);
self.rebuild_path_index();
Ok(true)
}
fn rebuild_path_index(&mut self) {
self.path_index = self
.entries
.iter()
.enumerate()
.map(|(i, e)| (e.rel_path.clone(), i))
.collect();
}
pub fn save_to(&self, path: &Path) -> Result<()> {
write_snapshot(
self,
path,
&self.header.vault_dir,
self.header.site_prefix.as_deref(),
self.bm25_index.as_ref(),
)
}
fn load_inner(bytes: &[u8], warn: bool) -> Option<Self> {
match rmp_serde::from_slice::<SnapshotData>(bytes) {
Ok(data) => {
const MAX_ENTRIES: usize = 5_000_000;
const MAX_GRAPH_EDGES: usize = 50_000_000;
const MAX_BM25_POSTINGS: usize = 50_000_000;
if data.entries.len() > MAX_ENTRIES {
if warn {
eprintln!(
"warning: index file contains {} entries (limit {}); falling back to disk scan",
data.entries.len(),
MAX_ENTRIES
);
}
return None;
}
for entry in &data.entries {
let rel_path = &entry.rel_path;
if rel_path.contains('\0') {
if warn {
eprintln!(
"warning: index file contains unsafe path '{rel_path}'; falling back to disk scan"
);
}
return None;
}
if std::path::Path::new(rel_path.as_str()).is_absolute() {
if warn {
eprintln!(
"warning: index file contains unsafe path '{rel_path}'; falling back to disk scan"
);
}
return None;
}
if std::path::Path::new(rel_path.as_str())
.components()
.any(|c| {
matches!(
c,
std::path::Component::ParentDir
| std::path::Component::RootDir
| std::path::Component::Prefix(_)
)
})
{
if warn {
eprintln!(
"warning: index file contains unsafe path '{rel_path}'; falling back to disk scan"
);
}
return None;
}
}
let edge_count = data.graph.total_edges();
if edge_count > MAX_GRAPH_EDGES {
if warn {
eprintln!(
"warning: index file contains too many graph edges ({edge_count}); falling back to disk scan"
);
}
return None;
}
if let Some(ref bm25) = data.bm25_index {
let posting_count = bm25.total_postings();
if posting_count > MAX_BM25_POSTINGS {
if warn {
eprintln!(
"warning: index file contains too many BM25 postings ({posting_count}); falling back to disk scan"
);
}
return None;
}
}
if let Some(ref bm25) = data.bm25_index
&& !bm25.validate_doc_ids()
{
if warn {
eprintln!(
"warning: index file contains out-of-bounds BM25 doc_id; falling back to disk scan"
);
}
return None;
}
let mut entries = data.entries;
entries.sort_by(|a, b| a.rel_path.cmp(&b.rel_path));
let path_index: HashMap<String, usize> = entries
.iter()
.enumerate()
.map(|(i, e)| (e.rel_path.clone(), i))
.collect();
Some(Self {
entries,
path_index,
graph: data.graph,
header: data.header,
bm25_index: data.bm25_index,
frontmatter_link_props: None,
})
}
Err(e) => {
if warn {
eprintln!(
"warning: index file is incompatible ({e}); falling back to disk scan"
);
}
None
}
}
}
pub fn load(path: &Path) -> Result<Option<Self>> {
let Some(bytes) = read_index_bytes(path, true)? else {
return Ok(None);
};
Ok(Self::load_inner(&bytes, true))
}
fn load_silent(path: &Path) -> Result<Option<Self>> {
let Some(bytes) = read_index_bytes(path, false)? else {
return Ok(None);
};
Ok(Self::load_inner(&bytes, false))
}
pub fn validate(&self, vault_dir: &str, site_prefix: Option<&str>) -> bool {
self.header.vault_dir == vault_dir && self.header.site_prefix.as_deref() == site_prefix
}
pub fn save(
index: &dyn VaultIndex,
path: &Path,
vault_dir: &str,
site_prefix: Option<&str>,
bm25_index: Option<&Bm25InvertedIndex>,
) -> Result<()> {
write_snapshot(index, path, vault_dir, site_prefix, bm25_index)
}
pub fn bm25_index(&self) -> Option<&Bm25InvertedIndex> {
self.bm25_index.as_ref()
}
pub fn header_info(&self) -> (&str, Option<&str>, u64, u32) {
(
&self.header.vault_dir,
self.header.site_prefix.as_deref(),
self.header.created_at,
self.header.pid,
)
}
}
/// Largest index file that will be read into memory (512 MiB).
const MAX_INDEX_FILE_SIZE: u64 = 512 * 1024 * 1024;

/// Read the whole index file at `path` into memory, refusing files above
/// [`MAX_INDEX_FILE_SIZE`].
///
/// Returns `Ok(None)` (printing a warning to stderr when `warn` is set) if the file is
/// larger than the limit, either according to its metadata or according to the number of
/// bytes actually read.
///
/// # Errors
/// Returns an error when the file cannot be opened, inspected or read.
fn read_index_bytes(path: &Path, warn: bool) -> Result<Option<Vec<u8>>> {
    use std::io::Read as _;

    let file = std::fs::File::open(path)
        .with_context(|| format!("failed to open index file: {}", path.display()))?;
    let reported_len = file
        .metadata()
        .with_context(|| format!("failed to stat index file: {}", path.display()))?
        .len();
    if reported_len > MAX_INDEX_FILE_SIZE {
        if warn {
            eprintln!(
                "warning: index file is too large ({} bytes, limit {}); falling back to disk scan",
                reported_len, MAX_INDEX_FILE_SIZE
            );
        }
        return Ok(None);
    }

    // The capacity is only a hint; if the (already bounded) length somehow does not fit
    // in `usize`, start empty and let the vector grow.
    let mut bytes = Vec::with_capacity(usize::try_from(reported_len).unwrap_or(0));
    // Read at most one byte past the limit so an oversized file is detectable without
    // ever buffering more than `MAX_INDEX_FILE_SIZE + 1` bytes.
    file.take(MAX_INDEX_FILE_SIZE + 1)
        .read_to_end(&mut bytes)
        .with_context(|| format!("failed to read index file: {}", path.display()))?;

    // The file may have grown between the metadata call and the read; without this check
    // a truncated prefix of an oversized file would be handed to the deserializer.
    let read_len = u64::try_from(bytes.len()).unwrap_or(u64::MAX);
    if read_len > MAX_INDEX_FILE_SIZE {
        if warn {
            eprintln!(
                "warning: index file is too large (more than {} bytes); falling back to disk scan",
                MAX_INDEX_FILE_SIZE
            );
        }
        return Ok(None);
    }
    Ok(Some(bytes))
}
fn write_snapshot(
index: &dyn VaultIndex,
path: &Path,
vault_dir: &str,
site_prefix: Option<&str>,
bm25_index: Option<&Bm25InvertedIndex>,
) -> Result<()> {
let header = SnapshotHeader {
vault_dir: vault_dir.to_owned(),
site_prefix: site_prefix.map(str::to_owned),
created_at: SystemTime::now()
.duration_since(SystemTime::UNIX_EPOCH)
.map(|d| d.as_secs())
.unwrap_or(0),
pid: std::process::id(),
};
let stripped_entries: Vec<IndexEntry>;
let entries: &[IndexEntry] = if bm25_index.is_some() {
stripped_entries = index
.entries()
.iter()
.map(|e| {
let mut e = e.clone();
e.bm25_tokens = None;
e.bm25_language = None;
e
})
.collect();
&stripped_entries
} else {
index.entries()
};
let data = SnapshotDataRef {
header,
entries,
graph: index.link_graph(),
bm25_index,
};
let bytes = rmp_serde::to_vec_named(&data).context("failed to serialize index")?;
let parent = path
.parent()
.context("index path has no parent directory")?;
let mut tmp =
tempfile::NamedTempFile::new_in(parent).context("failed to create temp file for index")?;
tmp.write_all(&bytes)
.context("failed to write temp index")?;
tmp.persist(path)
.map_err(|e| e.error)
.with_context(|| format!("failed to rename index into place: {}", path.display()))?;
Ok(())
}
impl VaultIndex for SnapshotIndex {
    /// Entries in `rel_path` order.
    fn entries(&self) -> &[IndexEntry] {
        self.entries.as_slice()
    }

    /// Entry stored under `rel_path`, if any.
    fn get(&self, rel_path: &str) -> Option<&IndexEntry> {
        let position = *self.path_index.get(rel_path)?;
        Some(&self.entries[position])
    }

    /// Link graph stored in the snapshot.
    fn link_graph(&self) -> &LinkGraph {
        &self.graph
    }

    /// BM25 inverted index stored in the snapshot, if any.
    fn bm25_index(&self) -> Option<&Bm25InvertedIndex> {
        self.bm25_index.as_ref()
    }
}
fn is_pid_alive(pid: u32) -> bool {
if pid == 0 {
return false;
}
#[cfg(unix)]
{
if pid > i32::MAX as u32 {
return false;
}
let res = unsafe { libc::kill(pid.cast_signed(), 0) };
if res == 0 {
true
} else {
let errno = std::io::Error::last_os_error().raw_os_error().unwrap_or(0);
errno != libc::ESRCH
}
}
#[cfg(not(unix))]
{
let _ = pid;
true
}
}
/// Find snapshot files in `dir` whose writing process is no longer alive.
///
/// Only files whose name ends with `.hyalo-index` are considered. Files that cannot be
/// loaded as a snapshot are silently skipped. Each result is
/// `(path, vault_dir, created_at)` taken from the snapshot header.
///
/// Returns an empty list when `dir` cannot be read.
///
/// # Errors
/// Returns an error when iterating the directory yields an I/O error.
pub fn find_stale_indexes(dir: &Path) -> Result<Vec<(PathBuf, String, u64)>> {
    let Ok(listing) = std::fs::read_dir(dir) else {
        return Ok(Vec::new());
    };

    let mut stale = Vec::new();
    for dir_entry in listing {
        let path = dir_entry?.path();

        // Non-UTF-8 names and names without the index suffix are not ours.
        let is_index_file = path
            .file_name()
            .and_then(|name| name.to_str())
            .is_some_and(|name| name.ends_with(".hyalo-index"));
        if !is_index_file {
            continue;
        }

        let Ok(Some(snapshot)) = SnapshotIndex::load_silent(&path) else {
            continue;
        };
        let (vault_dir, _, created_at, pid) = snapshot.header_info();
        if is_pid_alive(pid) {
            continue;
        }
        stale.push((path, vault_dir.to_owned(), created_at));
    }
    Ok(stale)
}
/// Scan a single file and produce its index entry.
///
/// * `full_path` is the file to read; `rel_path` is stored in the entry and given to the
///   link visitor.
/// * With `scan_body`, sections, tasks and links are collected and the file's links are
///   returned as `Some(..)`; without it those collections are empty and `None` is returned.
/// * With `bm25_tokenize`, the title (frontmatter `title`, else the first level-1 heading,
///   else empty) and the body are stemmed into `bm25_tokens`, and the resolved language is
///   recorded in `bm25_language`.
///
/// # Errors
/// Propagates errors from scanning the file and from reading its modification time.
pub(crate) fn scan_one_file(
    full_path: &Path,
    rel_path: &str,
    scan_body: bool,
    bm25_tokenize: bool,
    default_language: Option<&str>,
    frontmatter_link_props: &[String],
) -> Result<(IndexEntry, Option<FileLinks>)> {
    let mut frontmatter_visitor = FrontmatterCollector::new(scan_body);
    let mut body_visitor = BodyCollector::new(bm25_tokenize);

    let (sections, tasks, links, file_links) = if scan_body {
        let mut outline_visitor = SectionScanner::new();
        let mut task_visitor = TaskExtractor::new();
        let mut link_visitor = LinkGraphVisitor::with_frontmatter_props(
            PathBuf::from(rel_path),
            frontmatter_link_props.to_vec(),
        );
        scanner::scan_file_multi(
            full_path,
            &mut [
                &mut frontmatter_visitor,
                &mut outline_visitor,
                &mut task_visitor,
                &mut link_visitor,
                &mut body_visitor,
            ],
        )?;

        let collected_links = link_visitor.into_file_links();
        // The entry keeps its own copy; the original goes back to the caller for the graph.
        let entry_links: Vec<(usize, Link)> = collected_links
            .links
            .iter()
            .map(|(line_no, link)| (*line_no, link.clone()))
            .collect();
        (
            outline_visitor.into_sections(),
            task_visitor.into_tasks(),
            entry_links,
            Some(collected_links),
        )
    } else {
        scanner::scan_file_multi(
            full_path,
            &mut [&mut frontmatter_visitor, &mut body_visitor],
        )?;
        (Vec::new(), Vec::new(), Vec::new(), None)
    };

    let props = frontmatter_visitor.into_props();
    let tags = extract_tags(&props);
    let modified = format_modified(full_path)?;

    let (bm25_tokens, bm25_language) = if bm25_tokenize {
        let body_text = body_visitor.into_body();
        let title = props
            .get("title")
            .and_then(|value| value.as_str())
            .or_else(|| {
                sections
                    .iter()
                    .find(|section| section.level == 1)
                    .and_then(|section| section.heading.as_deref())
            })
            .unwrap_or("");
        let declared_language = props.get("language").and_then(|value| value.as_str());
        let language = resolve_language(declared_language, None, default_language);
        let stemmer = rust_stemmers::Stemmer::create(language.to_algorithm());
        let combined = format!("{title} {body_text}");
        let tokens = tokenize(&combined, &stemmer);
        (Some(tokens), Some(language.canonical_name().to_owned()))
    } else {
        (None, None)
    };

    Ok((
        IndexEntry {
            rel_path: rel_path.to_owned(),
            modified,
            properties: props,
            tags,
            sections,
            tasks,
            links,
            bm25_tokens,
            bm25_language,
        },
        file_links,
    ))
}
/// Return the modification time of `path` formatted by [`format_iso8601`].
///
/// A modification time before the Unix epoch triggers a warning and is reported as the
/// epoch itself.
///
/// # Errors
/// Fails when the file's metadata or its modification time cannot be read.
pub fn format_modified(path: &Path) -> Result<String> {
    let mtime = std::fs::metadata(path)
        .with_context(|| format!("failed to read metadata for {}", path.display()))?
        .modified()
        .with_context(|| format!("mtime not available for {}", path.display()))?;

    let secs = match mtime.duration_since(SystemTime::UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs(),
        Err(_) => {
            crate::warn::warn(format!(
                "mtime for {} is before 1970-01-01; using epoch as fallback",
                path.display()
            ));
            0
        }
    };
    Ok(format_iso8601(secs))
}
/// Format `secs` (seconds since 1970-01-01T00:00:00Z) as `YYYY-MM-DDThh:mm:ssZ`.
///
/// The date is derived with pure integer arithmetic on the proleptic Gregorian calendar:
/// the day count is shifted so that each 400-year era starts on 1 March, which places the
/// leap day at the end of the (shifted) year.
pub fn format_iso8601(secs: u64) -> String {
    const SECS_PER_MIN: u64 = 60;
    const SECS_PER_HOUR: u64 = 60 * SECS_PER_MIN;
    const SECS_PER_DAY: u64 = 24 * SECS_PER_HOUR;
    const DAYS_PER_ERA: i64 = 146_097;
    // Days from 0000-03-01 to 1970-01-01.
    const EPOCH_SHIFT_DAYS: i64 = 719_468;

    // Time of day.
    let (days, secs_of_day) = (secs / SECS_PER_DAY, secs % SECS_PER_DAY);
    let hour = secs_of_day / SECS_PER_HOUR;
    let minute = secs_of_day % SECS_PER_HOUR / SECS_PER_MIN;
    let second = secs_of_day % SECS_PER_MIN;

    // Calendar date.
    let shifted = days.cast_signed() + EPOCH_SHIFT_DAYS;
    let era = shifted.div_euclid(DAYS_PER_ERA);
    let day_of_era = shifted.rem_euclid(DAYS_PER_ERA).cast_unsigned();
    let year_of_era =
        (day_of_era - day_of_era / 1460 + day_of_era / 36524 - day_of_era / 146_096) / 365;
    let day_of_year = day_of_era - (365 * year_of_era + year_of_era / 4 - year_of_era / 100);
    // Month index counted from March (0 = March, 11 = February).
    let shifted_month = (5 * day_of_year + 2) / 153;
    let day = day_of_year - (153 * shifted_month + 2) / 5 + 1;
    let month = if shifted_month < 10 {
        shifted_month + 3
    } else {
        shifted_month - 9
    };
    // January and February belong to the following civil year.
    let year = year_of_era.cast_signed() + era * 400 + i64::from(month <= 2);

    format!("{year:04}-{month:02}-{day:02}T{hour:02}:{minute:02}:{second:02}Z")
}
use crate::heading::parse_atx_heading;
use crate::links;
/// Mutable accumulator for one outline section while the body is being scanned;
/// turned into an `OutlineSection` by `finish`.
struct SectionBuilder {
// Heading level; 0 is used for the section that precedes the first heading.
level: u8,
// Heading text, `None` for the pre-heading section.
heading: Option<String>,
// Line number the section starts on.
line: usize,
// Links seen in this section, already formatted as strings.
links: Vec<String>,
// Number of task checkboxes seen in this section.
task_total: usize,
// Number of those checkboxes that are done.
task_done: usize,
// Languages of the code fences opened in this section (fences without a language are not recorded).
code_blocks: Vec<String>,
}
impl SectionBuilder {
    /// Start an empty section with the given level, heading and starting line.
    fn new(level: u8, heading: Option<String>, line: usize) -> Self {
        Self {
            level,
            heading,
            line,
            task_total: 0,
            task_done: 0,
            links: Vec::new(),
            code_blocks: Vec::new(),
        }
    }

    /// Convert the accumulated state into an `OutlineSection`.
    ///
    /// The task count is only present when at least one task was seen.
    fn finish(self) -> OutlineSection {
        let Self {
            level,
            heading,
            line,
            links,
            task_total,
            task_done,
            code_blocks,
        } = self;
        let tasks = (task_total > 0).then_some(TaskCount {
            total: task_total,
            done: task_done,
        });
        OutlineSection {
            level,
            heading,
            line,
            links,
            tasks,
            code_blocks,
        }
    }
}
/// Visitor that gathers a file's body (regular lines and code-block lines) into one
/// newline-joined string, for BM25 tokenization.
struct BodyCollector {
    /// Whether body text is wanted at all; when `false` nothing is buffered.
    active: bool,
    /// Lines collected so far, joined with `\n`.
    buf: String,
}

impl BodyCollector {
    /// Create a collector; pass `active = false` to make it a no-op.
    fn new(active: bool) -> Self {
        Self {
            active,
            buf: String::new(),
        }
    }

    /// Consume the collector and return the collected text (empty when inactive).
    fn into_body(self) -> String {
        self.buf
    }

    /// Append one raw line, inserting a newline separator when the buffer already has
    /// content (so empty lines at the very start are not preserved).
    ///
    /// Does nothing when the collector is inactive: the collector may sit in a visitor
    /// list next to visitors that do need the body, and must not buffer the whole file
    /// just to throw it away.
    fn append_line(&mut self, raw: &str) {
        if !self.active {
            return;
        }
        if !self.buf.is_empty() {
            self.buf.push('\n');
        }
        self.buf.push_str(raw);
    }
}

impl FileVisitor for BodyCollector {
    /// The body is only needed when the collector is active.
    fn needs_body(&self) -> bool {
        self.active
    }

    /// Collect a regular body line (the raw text, not the cleaned one).
    fn on_body_line(&mut self, raw: &str, _cleaned: &str, _line_num: usize) -> ScanAction {
        self.append_line(raw);
        ScanAction::Continue
    }

    /// Collect a line inside a code block.
    fn on_code_block_line(&mut self, raw: &str, _line_num: usize) -> ScanAction {
        self.append_line(raw);
        ScanAction::Continue
    }
}
struct SectionScanner {
current: SectionBuilder,
sections: Vec<OutlineSection>,
}
impl SectionScanner {
fn new() -> Self {
Self {
current: SectionBuilder::new(0, None, 1),
sections: Vec::new(),
}
}
fn into_sections(mut self) -> Vec<OutlineSection> {
let last = std::mem::replace(&mut self.current, SectionBuilder::new(0, None, 0));
let finished = last.finish();
let should_emit = finished.level > 0
|| !finished.links.is_empty()
|| finished.tasks.is_some()
|| !finished.code_blocks.is_empty();
if should_emit {
self.sections.push(finished);
}
self.sections
}
}
impl FileVisitor for SectionScanner {
fn on_body_line(&mut self, raw: &str, cleaned: &str, line_num: usize) -> ScanAction {
if let Some((level, heading_text)) = parse_atx_heading(raw) {
let finished = std::mem::replace(
&mut self.current,
SectionBuilder::new(level, Some(heading_text.to_owned()), line_num),
);
let should_emit = finished.level > 0
|| !finished.links.is_empty()
|| finished.task_total > 0
|| !finished.code_blocks.is_empty();
if should_emit {
self.sections.push(finished.finish());
}
return ScanAction::Continue;
}
let mut line_links: Vec<links::Link> = Vec::new();
links::extract_links_from_text(cleaned, &mut line_links);
for link in line_links {
self.current.links.push(format_link_string(&link));
}
if let Some((_status, done)) = crate::tasks::detect_task_checkbox(raw) {
self.current.task_total += 1;
if done {
self.current.task_done += 1;
}
}
ScanAction::Continue
}
fn on_code_fence_open(&mut self, _raw: &str, language: &str, _line_num: usize) -> ScanAction {
if !language.is_empty() {
self.current.code_blocks.push(language.to_owned());
}
ScanAction::Continue
}
}
/// Render a link back into source-like text.
///
/// * Wikilinks become `[[target|label]]`, or `[[target]]` without a usable label.
/// * Markdown links become `[label](target)`, or `[](target)` without a usable label.
///
/// An empty label is treated the same as a missing one.
fn format_link_string(link: &links::Link) -> String {
    let label = match &link.label {
        Some(text) if !text.is_empty() => Some(text),
        _ => None,
    };
    let target = &link.target;
    match (&link.kind, label) {
        (links::LinkKind::Wikilink, Some(label)) => format!("[[{target}|{label}]]"),
        (links::LinkKind::Wikilink, None) => format!("[[{target}]]"),
        (links::LinkKind::Markdown, Some(label)) => format!("[{label}]({target})"),
        (links::LinkKind::Markdown, None) => format!("[]({target})"),
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Drops the single leading newline of a raw string literal so fixtures can start on
    /// their own line.
    macro_rules! md {
        ($s:expr) => {
            $s.strip_prefix('\n').unwrap_or($s)
        };
    }

    /// Two-file vault: `a.md` and `b.md` link to each other; `a.md` also has two tasks.
    fn setup_vault() -> (tempfile::TempDir, Vec<(PathBuf, String)>) {
        let tmp = tempfile::tempdir().unwrap();
        fs::write(
            tmp.path().join("a.md"),
            md!(r"
---
title: Alpha
status: draft
tags:
- rust
- cli
---
# Introduction
See [[b]] for context.
## Tasks
- [ ] Write tests
- [x] Write code
"),
        )
        .unwrap();
        fs::write(
            tmp.path().join("b.md"),
            md!(r"
---
title: Beta
status: done
tags:
- rust
---
# Content
See [[a]] for details.
"),
        )
        .unwrap();
        let files = ["a.md", "b.md"]
            .into_iter()
            .map(|name| (tmp.path().join(name), name.to_owned()))
            .collect();
        (tmp, files)
    }

    /// Build an index without BM25 tokenization and with default link properties.
    fn build_index(files: &[(PathBuf, String)], scan_body: bool) -> ScannedIndexBuild {
        let options = ScanOptions {
            scan_body,
            bm25_tokenize: false,
            default_language: None,
            frontmatter_link_props: None,
        };
        ScannedIndex::build(files, None, &options).unwrap()
    }

    /// Serialize a snapshot holding one otherwise-empty entry at `rel_path`.
    fn snapshot_bytes(rel_path: &str, bm25_index: Option<Bm25InvertedIndex>) -> Vec<u8> {
        let data = SnapshotData {
            header: SnapshotHeader {
                vault_dir: "/tmp/vault".to_owned(),
                site_prefix: None,
                created_at: 0,
                pid: std::process::id(),
            },
            entries: vec![IndexEntry {
                rel_path: rel_path.to_owned(),
                modified: "2024-01-01T00:00:00Z".to_owned(),
                properties: IndexMap::default(),
                tags: vec![],
                sections: vec![],
                tasks: vec![],
                links: vec![],
                bm25_tokens: None,
                bm25_language: None,
            }],
            graph: LinkGraph::default(),
            bm25_index,
        };
        rmp_serde::to_vec_named(&data).unwrap()
    }

    fn make_snapshot_bytes(rel_path: &str) -> Vec<u8> {
        snapshot_bytes(rel_path, None)
    }

    #[test]
    fn scanned_index_builds_entries() {
        let (_tmp, files) = setup_vault();
        let build = build_index(&files, true);
        assert!(build.warnings.is_empty());
        assert_eq!(build.index.entries().len(), 2);
    }

    #[test]
    fn scanned_index_get_by_path() {
        let (_tmp, files) = setup_vault();
        let build = build_index(&files, true);
        let idx = &build.index;

        let a = idx.get("a.md").unwrap();
        assert_eq!(a.tags, vec!["rust", "cli"]);
        assert_eq!(a.properties.get("status").unwrap(), "draft");

        let b = idx.get("b.md").unwrap();
        assert_eq!(b.tags, vec!["rust"]);

        assert!(idx.get("c.md").is_none());
    }

    #[test]
    fn scanned_index_sections_and_tasks() {
        let (_tmp, files) = setup_vault();
        let build = build_index(&files, true);
        let a = build.index.get("a.md").unwrap();

        assert_eq!(a.sections.len(), 2);
        assert_eq!(a.sections[0].heading.as_deref(), Some("Introduction"));
        assert_eq!(a.sections[1].heading.as_deref(), Some("Tasks"));

        assert_eq!(a.tasks.len(), 2);
        assert!(!a.tasks[0].done);
        assert!(a.tasks[1].done);
    }

    #[test]
    fn scanned_index_link_graph() {
        let (_tmp, files) = setup_vault();
        let build = build_index(&files, true);
        let graph = build.index.link_graph();

        let a_backlinks = graph.backlinks("a");
        assert!(!a_backlinks.is_empty());
        let b_backlinks = graph.backlinks("b");
        assert!(!b_backlinks.is_empty());
    }

    #[test]
    fn scanned_index_outbound_links() {
        let (_tmp, files) = setup_vault();
        let build = build_index(&files, true);
        let a = build.index.get("a.md").unwrap();
        assert_eq!(a.links.len(), 1);
        assert_eq!(a.links[0].1.target, "b");
    }

    #[test]
    fn scanned_index_skips_broken_frontmatter() {
        let tmp = tempfile::tempdir().unwrap();
        fs::write(
            tmp.path().join("good.md"),
            md!(r"
---
title: Good
---
Content.
"),
        )
        .unwrap();
        fs::write(
            tmp.path().join("bad.md"),
            "---\n: invalid yaml [[[{\n---\nContent.\n",
        )
        .unwrap();
        let files = vec![
            (tmp.path().join("good.md"), "good.md".to_owned()),
            (tmp.path().join("bad.md"), "bad.md".to_owned()),
        ];

        let build = build_index(&files, true);

        assert_eq!(build.index.entries().len(), 1);
        assert_eq!(build.warnings.len(), 1);
        assert_eq!(build.warnings[0].rel_path, "bad.md");
    }

    #[test]
    fn scanned_index_modified_is_iso8601() {
        let (_tmp, files) = setup_vault();
        let build = build_index(&files, true);
        let a = build.index.get("a.md").unwrap();
        assert!(
            a.modified.contains('T') && a.modified.ends_with('Z'),
            "unexpected timestamp: {}",
            a.modified
        );
    }

    #[test]
    fn snapshot_roundtrip() {
        let (_tmp, files) = setup_vault();
        let build = build_index(&files, true);
        let index = &build.index;

        let snap_dir = tempfile::tempdir().unwrap();
        let snap_path = snap_dir.path().join(".hyalo-index");
        SnapshotIndex::save(index, &snap_path, "/tmp/vault", None, None).unwrap();
        let loaded = SnapshotIndex::load(&snap_path)
            .unwrap()
            .expect("snapshot should deserialize");

        assert_eq!(loaded.entries().len(), index.entries().len());
        let a = loaded.get("a.md").unwrap();
        assert_eq!(a.tags, vec!["rust", "cli"]);
        assert_eq!(a.properties.get("status").unwrap(), "draft");
        assert_eq!(a.sections.len(), 2);
        assert_eq!(a.tasks.len(), 2);
        assert_eq!(a.links.len(), 1);
        assert_eq!(a.links[0].1.target, "b");
        let bl = loaded.link_graph().backlinks("a");
        assert!(!bl.is_empty());
    }

    #[test]
    fn scanned_index_skip_body() {
        let (_tmp, files) = setup_vault();
        let build = build_index(&files, false);
        assert!(build.warnings.is_empty());
        let idx = &build.index;

        // Frontmatter-derived data is still present ...
        let a = idx.get("a.md").unwrap();
        assert_eq!(a.tags, vec!["rust", "cli"]);
        assert_eq!(a.properties.get("status").unwrap(), "draft");

        // ... while everything that needs the body is empty.
        assert!(a.sections.is_empty());
        assert!(a.tasks.is_empty());
        assert!(a.links.is_empty());
        assert!(idx.link_graph().backlinks("a").is_empty());
        assert!(idx.link_graph().backlinks("b").is_empty());
    }

    #[test]
    fn load_inner_rejects_parent_traversal() {
        let bytes = make_snapshot_bytes("../../escape.md");
        assert!(
            SnapshotIndex::load_inner(&bytes, false).is_none(),
            "snapshot with '..' path components must be rejected"
        );
    }

    #[test]
    fn load_inner_rejects_absolute_path() {
        let bytes = make_snapshot_bytes("/etc/passwd");
        assert!(
            SnapshotIndex::load_inner(&bytes, false).is_none(),
            "snapshot with absolute rel_path must be rejected"
        );
        #[cfg(windows)]
        {
            let bytes = make_snapshot_bytes("C:\\Windows\\System32\\config\\sam");
            assert!(
                SnapshotIndex::load_inner(&bytes, false).is_none(),
                "snapshot with Windows absolute rel_path must be rejected"
            );
        }
    }

    #[test]
    fn load_inner_rejects_null_byte() {
        let bytes = make_snapshot_bytes("foo\0bar.md");
        assert!(
            SnapshotIndex::load_inner(&bytes, false).is_none(),
            "snapshot with null-byte path must be rejected"
        );
    }

    #[test]
    fn load_inner_rejects_bm25_out_of_bounds_doc_id() {
        use crate::bm25::{Bm25InvertedIndex, Posting};
        use std::collections::HashMap;

        // A posting that points at document 999 while only one document exists.
        let postings: HashMap<String, Vec<Posting>> = HashMap::from([(
            "rust".to_owned(),
            vec![Posting {
                doc_id: 999,
                term_freq: 1,
                positions: vec![0],
            }],
        )]);
        let bad_bm25 =
            Bm25InvertedIndex::new_for_test(postings, vec![5], vec!["doc.md".to_owned()], 5.0);

        let bytes = snapshot_bytes("doc.md", Some(bad_bm25));
        assert!(
            SnapshotIndex::load_inner(&bytes, false).is_none(),
            "snapshot with out-of-bounds BM25 doc_id must be rejected (MED-1)"
        );
    }

    #[test]
    fn load_inner_rejects_bm25_mismatched_doc_lengths() {
        use crate::bm25::{Bm25InvertedIndex, Posting};
        use std::collections::HashMap;

        let postings: HashMap<String, Vec<Posting>> = HashMap::from([(
            "rust".to_owned(),
            vec![Posting {
                doc_id: 0,
                term_freq: 1,
                positions: vec![0],
            }],
        )]);
        // Two document lengths but only one document path.
        let bad_bm25 = Bm25InvertedIndex::new_for_test(
            postings,
            vec![5, 10],
            vec!["doc.md".to_owned()],
            7.5,
        );

        let bytes = snapshot_bytes("doc.md", Some(bad_bm25));
        assert!(
            SnapshotIndex::load_inner(&bytes, false).is_none(),
            "snapshot with mismatched BM25 doc_lengths/doc_paths must be rejected (MED-1)"
        );
    }

    #[test]
    fn is_pid_alive_zero_returns_false() {
        assert!(
            !is_pid_alive(0),
            "pid 0 must not be treated as an alive process"
        );
    }

    #[test]
    fn load_rejects_oversized_file() {
        let tmp = tempfile::tempdir().unwrap();
        let path = tmp.path().join("big.hyalo-index");
        // `set_len` grows the file to one byte past the limit without writing that much data.
        let f = std::fs::File::create(&path).unwrap();
        f.set_len(MAX_INDEX_FILE_SIZE + 1).unwrap();
        let result = SnapshotIndex::load(&path).unwrap();
        assert!(
            result.is_none(),
            "oversized index file must return Ok(None)"
        );
    }
}