use std::{
ops::Deref,
path::{Path, PathBuf},
sync::{
Arc,
atomic::{AtomicBool, Ordering},
},
time::{Instant, UNIX_EPOCH},
};
use papaya::{HashMap, HashSet};
use rayon::prelude::*;
use serde::{Serialize, Serializer, ser::SerializeSeq};
use walkdir::WalkDir;
use crate::Config;
use crate::config::TagSource;
use crate::tag_index::{TagIndex, TaggedPage};
/// Index of a content directory: the markdown pages and other files discovered by scanning
/// `root_dir`, the tag index built from page frontmatter, and the flags/notifiers that track
/// the scanning and metadata passes.
///
/// Only `index_file`, `markdown_files` and `other_files` are serialized; every other field is
/// marked `#[serde(skip)]`.
#[derive(Clone, Serialize)]
pub struct Repo {
/// Directory all scans are rooted at; relative folder paths are joined onto it.
#[serde(skip)]
root_dir: PathBuf,
/// Folder (relative to `root_dir`) holding static assets; removed from static-file URLs.
#[serde(skip)]
static_folder: String,
/// File extensions (without the dot) that mark a file as markdown.
#[serde(skip)]
markdown_extensions: Vec<String>,
/// File name of a folder's index page; it is dropped from the page URL.
pub index_file: String,
/// Directory names that are skipped while scanning.
#[serde(skip)]
ignore_dirs: Vec<String>,
/// Raw glob pattern strings as passed to `init` (the compiled form below is what scans use).
#[serde(skip)]
#[allow(dead_code)] ignore_globs: Vec<String>,
/// `ignore_globs` compiled once in `init`; patterns that failed to compile are left out.
#[serde(skip)]
compiled_ignore_globs: Vec<glob::Pattern>,
/// Canonicalized paths of folders that `scan_folder` has already processed.
#[serde(skip)]
pub scanned_folders: HashSet<PathBuf>,
/// Sub-folders found but not yet scanned: discovered path -> path relative to `root_dir`.
#[serde(skip)]
pub queued_folders: HashMap<PathBuf, PathBuf>,
/// Markdown files keyed by their path.
pub markdown_files: MarkdownFiles,
/// Every non-markdown file keyed by its path.
pub other_files: OtherFiles,
/// Tag index populated from the frontmatter fields named in `tag_sources`.
#[serde(skip)]
pub tag_index: Arc<TagIndex>,
/// Frontmatter fields whose values are indexed as tags.
#[serde(skip)]
tag_sources: Vec<TagSource>,
/// Set as soon as `ensure_text_extracted` starts its (single) run.
#[serde(skip)]
text_extracted: Arc<AtomicBool>,
/// Set as soon as `populate_media_metadata` starts its (single) run.
#[serde(skip)]
media_populated: Arc<AtomicBool>,
/// Set by `mark_scan_complete`.
#[serde(skip)]
scan_complete: Arc<AtomicBool>,
/// Wakes tasks blocked in `wait_for_scan`.
#[serde(skip)]
scan_notify: Arc<tokio::sync::Notify>,
/// Wakes tasks blocked in `wait_for_media`.
#[serde(skip)]
media_notify: Arc<tokio::sync::Notify>,
}
/// Concurrent map from a markdown file's path to its [`MarkdownInfo`].
///
/// Dereferences to the wrapped map and serializes as a flat sequence of the values; the path
/// keys are not emitted.
#[derive(Clone)]
pub struct MarkdownFiles(HashMap<PathBuf, MarkdownInfo>);

impl Deref for MarkdownFiles {
    type Target = HashMap<PathBuf, MarkdownInfo>;

    fn deref(&self) -> &Self::Target {
        let Self(inner) = self;
        inner
    }
}

impl Serialize for MarkdownFiles {
    /// Writes every stored [`MarkdownInfo`] as one element of a sequence.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        // The length is only a hint taken before iteration starts.
        let expected_len = self.0.len();
        let mut seq = serializer.serialize_seq(Some(expected_len))?;
        let guard = self.0.pin();
        guard
            .iter()
            .try_for_each(|(_path, info)| seq.serialize_element(info))?;
        seq.end()
    }
}
/// Concurrent map from a non-markdown file's path to its [`OtherFileInfo`].
///
/// Dereferences to the wrapped map and serializes as a flat sequence of the values; the path
/// keys are not emitted.
#[derive(Clone)]
pub struct OtherFiles(HashMap<PathBuf, OtherFileInfo>);

impl Deref for OtherFiles {
    type Target = HashMap<PathBuf, OtherFileInfo>;

    fn deref(&self) -> &Self::Target {
        let Self(inner) = self;
        inner
    }
}

impl Serialize for OtherFiles {
    /// Writes every stored [`OtherFileInfo`] as one element of a sequence.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        // The length is only a hint taken before iteration starts.
        let expected_len = self.0.len();
        let mut seq = serializer.serialize_seq(Some(expected_len))?;
        let guard = self.0.pin();
        guard
            .iter()
            .try_for_each(|(_path, info)| seq.serialize_element(info))?;
        seq.end()
    }
}
/// What the repo knows about one markdown file.
#[derive(Clone, Serialize)]
pub struct MarkdownInfo {
/// Path of the file on disk, as discovered by the scan or reported by the watcher.
pub raw_path: PathBuf,
/// URL the page is served at (see `build_markdown_url_path`).
pub url_path: String,
/// Creation time in whole seconds since the Unix epoch.
pub created: u64,
/// Last-modification time in whole seconds since the Unix epoch.
pub modified: u64,
/// Parsed frontmatter; `None` when it has not been read yet or could not be extracted.
pub frontmatter: Option<crate::markdown::SimpleMetadata>,
}
/// What the repo knows about one non-markdown file.
///
/// Only `url_path` and `metadata` are serialized.
#[derive(Clone, Serialize)]
pub struct OtherFileInfo {
/// Path of the file on disk, as discovered by the scan or reported by the watcher.
#[serde(skip)]
pub raw_path: PathBuf,
/// URL the file is served at (see `build_static_url_path`).
pub url_path: String,
/// File-system and media metadata; starts out empty and is filled in by later passes.
metadata: StaticFileMetadata,
/// Text extracted for search; `None` until extraction ran, or when nothing could be extracted.
#[serde(skip)]
pub extracted_text: Option<String>,
}
/// Upper bound (10 MiB, in bytes) on the size of a file that text extraction will read.
const MAX_TEXT_EXTRACTION_SIZE: u64 = 10 * 1024 * 1024;
impl OtherFileInfo {
/// Returns a short lowercase label for the file's kind: `"pdf"`, `"image"`, `"video"`,
/// `"audio"`, `"text"` or `"other"`.
pub fn filetype(&self) -> &'static str {
match &self.metadata.kind {
StaticFileKind::Pdf { .. } => "pdf",
StaticFileKind::Image { .. } => "image",
StaticFileKind::Video { .. } => "video",
StaticFileKind::Audio { .. } => "audio",
StaticFileKind::Text => "text",
StaticFileKind::Other => "other",
}
}
/// Reports whether this file is of a kind text extraction handles: a PDF or a plain-text file.
pub fn is_searchable(&self) -> bool {
    match self.metadata.kind {
        StaticFileKind::Pdf { .. } | StaticFileKind::Text => true,
        StaticFileKind::Image { .. }
        | StaticFileKind::Video { .. }
        | StaticFileKind::Audio { .. }
        | StaticFileKind::Other => false,
    }
}
/// Extracts searchable text from the file, dispatching on its kind.
///
/// Returns `None` when the file is larger than `MAX_TEXT_EXTRACTION_SIZE`, when its kind is
/// neither PDF nor plain text, or when extraction fails or yields only whitespace.
fn extract_text(&self) -> Option<String> {
    // The cached size is only present once basic metadata has been populated. Fall back to
    // asking the file system so the size limit still applies to freshly scanned entries.
    let known_size = self
        .metadata
        .file_size_bytes
        .or_else(|| std::fs::metadata(&self.raw_path).ok().map(|m| m.len()));
    if let Some(size) = known_size
        && size > MAX_TEXT_EXTRACTION_SIZE
    {
        tracing::debug!(
            "Skipping text extraction for {:?}: file too large ({} bytes)",
            self.raw_path,
            size
        );
        return None;
    }
    match &self.metadata.kind {
        StaticFileKind::Pdf { .. } => self.extract_pdf_text(),
        StaticFileKind::Text => self.extract_plain_text(),
        _ => None,
    }
}
/// Loads the PDF at `raw_path` and returns the trimmed text of all its pages.
///
/// Returns `None` (after a debug log for failures) when the document cannot be loaded, has no
/// pages, text extraction fails, or the extracted text is empty after trimming.
fn extract_pdf_text(&self) -> Option<String> {
    let document = lopdf::Document::load(&self.raw_path)
        .inspect_err(|e| tracing::debug!("Failed to load PDF {:?}: {}", self.raw_path, e))
        .ok()?;

    let pages: Vec<u32> = document.get_pages().keys().copied().collect();
    if pages.is_empty() {
        return None;
    }

    let raw = document
        .extract_text(&pages)
        .inspect_err(|e| {
            tracing::debug!("Failed to extract PDF text from {:?}: {}", self.raw_path, e)
        })
        .ok()?;

    let trimmed = raw.trim();
    if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_string())
    }
}
/// Reads the file at `raw_path` as UTF-8 and returns its trimmed contents.
///
/// Returns `None` when the file cannot be read (logged at debug level) or is empty after
/// trimming.
fn extract_plain_text(&self) -> Option<String> {
    let contents = match std::fs::read_to_string(&self.raw_path) {
        Ok(contents) => contents,
        Err(e) => {
            tracing::debug!("Failed to read text file {:?}: {}", self.raw_path, e);
            return None;
        }
    };
    Some(contents.trim())
        .filter(|trimmed| !trimmed.is_empty())
        .map(str::to_owned)
}
}
/// Metadata for a non-markdown file: file-system details plus kind-specific media details.
///
/// All optional fields are `None` until the corresponding `populate_*` pass has run.
#[derive(Clone, Default, Serialize)]
pub struct StaticFileMetadata {
/// Path of the file the metadata describes.
path: PathBuf,
/// Creation time in whole seconds since the Unix epoch.
created: Option<u64>,
/// Last-modification time in whole seconds since the Unix epoch.
modified: Option<u64>,
/// Size of the file in bytes.
file_size_bytes: Option<u64>,
/// Kind of file (chosen from the extension) and its kind-specific details.
kind: StaticFileKind,
}
/// Kind of a static file together with the details known for that kind.
///
/// Serialized as an internally tagged object whose `type` field is the lowercase variant name.
#[derive(Clone, Default, Serialize)]
#[serde(tag = "type", rename_all = "lowercase")]
enum StaticFileKind {
/// PDF document.
Pdf {
description: Option<String>,
title: Option<String>,
author: Option<String>,
subject: Option<String>,
num_pages: Option<usize>,
},
/// Raster image.
Image {
width: Option<u32>,
height: Option<u32>,
},
/// Video file.
Video {
width: Option<u32>,
height: Option<u32>,
duration: Option<String>,
title: Option<String>,
genre: Option<String>,
album: Option<String>,
},
/// Audio file.
Audio {
duration: Option<String>,
title: Option<String>,
},
/// Plain-text file.
Text,
/// Anything not recognised by extension; this is the default.
#[default]
Other,
}
impl StaticFileMetadata {
/// Builds metadata for `file` with only the path and kind set.
///
/// The kind is chosen from the file extension, compared ASCII-case-insensitively. Files with
/// no extension, an empty extension (e.g. `notes.`) or an unrecognised one are
/// [`StaticFileKind::Other`]. No file-system access happens here; all optional fields are
/// `None`.
pub fn empty<P: Into<std::path::PathBuf>>(file: P) -> Self {
    let file: PathBuf = file.into();
    let extension = file
        .extension()
        .map(|ext| ext.to_string_lossy().to_ascii_lowercase());

    let kind = match extension.as_deref() {
        Some("pdf") => StaticFileKind::Pdf {
            description: None,
            title: None,
            author: None,
            subject: None,
            num_pages: None,
        },
        Some("jpg" | "jpeg" | "png" | "webp" | "gif" | "bmp" | "tif" | "tiff") => {
            StaticFileKind::Image {
                width: None,
                height: None,
            }
        }
        // An empty extension is deliberately absent from this list: it says nothing about
        // the content and must not be treated as audio.
        Some(
            "aiff" | "mp3" | "aac" | "m4a" | "ogg" | "oga" | "opus" | "wma" | "flac" | "wav"
            | "aif",
        ) => StaticFileKind::Audio {
            duration: None,
            title: None,
        },
        Some(
            "mp4" | "m4v" | "mov" | "webm" | "flv" | "mpg" | "mpeg" | "avi" | "3gp" | "wmv",
        ) => StaticFileKind::Video {
            width: None,
            height: None,
            duration: None,
            title: None,
            genre: None,
            album: None,
        },
        Some("txt" | "css" | "vtt" | "srt" | "toml" | "json" | "js" | "ts") => {
            StaticFileKind::Text
        }
        _ => StaticFileKind::Other,
    };

    Self {
        path: file,
        kind,
        ..Default::default()
    }
}
pub fn populate_basic(self) -> Self {
let mut me = self;
let file_details_start = Instant::now();
let (filesize, created, modified) = match file_details_from_path(&me.path).ok() {
Some((fs, c, m)) => (Some(fs), Some(c), Some(m)),
_ => (None, None, None),
};
tracing::debug!(
"populate file_details for {:?}: {:?}",
me.path,
file_details_start.elapsed()
);
me.file_size_bytes = filesize;
me.created = created;
me.modified = modified;
me
}
/// Fills in the kind-specific details (PDF info, image/video dimensions, durations, titles,
/// video genre/album) by probing the file. Only compiled with the `media-metadata` feature.
///
/// A failed PDF probe keeps the existing details and logs at debug level; a failed media
/// probe for image/audio/video resets that kind's details to `None`. `Text` and `Other` are
/// returned unchanged. The elapsed time is logged at debug level.
#[cfg(feature = "media-metadata")]
pub fn populate_media(self) -> Self {
let mut me = self;
let media_start = Instant::now();
// Captured up front for the log line, because `me.kind` is replaced below.
let kind_name = match &me.kind {
StaticFileKind::Pdf { .. } => "pdf",
StaticFileKind::Image { .. } => "image",
StaticFileKind::Video { .. } => "video",
StaticFileKind::Audio { .. } => "audio",
StaticFileKind::Text => "text",
StaticFileKind::Other => "other",
};
me.kind = match me.kind {
StaticFileKind::Pdf { .. } => match crate::pdf_metadata::probe_pdf(&me.path) {
Ok(meta) => StaticFileKind::Pdf {
title: meta.title,
author: meta.author,
subject: meta.subject,
description: None,
num_pages: Some(meta.num_pages as usize),
},
Err(e) => {
tracing::debug!("Failed to extract PDF metadata from {:?}: {}", me.path, e);
me.kind
}
},
StaticFileKind::Image { .. } => {
let metadata = metadata::media_file::MediaFileMetadata::new(&me.path).ok();
StaticFileKind::Image {
width: metadata.as_ref().and_then(|m| m.width),
height: metadata.as_ref().and_then(|m| m.height),
}
}
StaticFileKind::Audio { .. } => {
let metadata = metadata::media_file::MediaFileMetadata::new(&me.path).ok();
StaticFileKind::Audio {
duration: metadata.as_ref().and_then(|m| m.duration.clone()),
title: metadata.as_ref().and_then(|m| m.title.clone()),
}
}
StaticFileKind::Video { .. } => {
let metadata = metadata::media_file::MediaFileMetadata::new(&me.path).ok();
// Genre and album come from the container tags, matched case-insensitively by key.
let genre = metadata.as_ref().and_then(|m| {
m.tags
.iter()
.find(|(k, _)| k.eq_ignore_ascii_case("genre"))
.map(|(_, v)| v.clone())
});
let album = metadata.as_ref().and_then(|m| {
m.tags
.iter()
.find(|(k, _)| k.eq_ignore_ascii_case("album"))
.map(|(_, v)| v.clone())
});
StaticFileKind::Video {
width: metadata.as_ref().and_then(|m| m.width),
height: metadata.as_ref().and_then(|m| m.height),
duration: metadata.as_ref().and_then(|m| m.duration.clone()),
title: metadata.as_ref().and_then(|m| m.title.clone()),
genre,
album,
}
}
_ => me.kind,
};
tracing::debug!(
"populate media metadata ({}) for {:?}: {:?}",
kind_name,
me.path,
media_start.elapsed()
);
me
}
/// Runs `populate_basic` and, when the `media-metadata` feature is enabled, `populate_media`.
pub fn populate(self) -> Self {
let me = self.populate_basic();
// Shadowing keeps the final `me` valid whether or not the feature is compiled in.
#[cfg(feature = "media-metadata")]
let me = me.populate_media();
me
}
/// Builds fully populated metadata for `file`: classifies it with `empty`, then runs
/// `populate`.
pub fn from<P: Into<std::path::PathBuf>>(file: P) -> Self {
    Self::empty(file).populate()
}
}
impl Repo {
/// Creates an empty repo using the directories, extensions, ignore rules, index file name and
/// tag sources from `c`.
pub fn init_from_config(c: &Config) -> Self {
    let root_dir = c.root_dir.clone();
    let static_folder = c.static_folder.clone();
    let index_file = c.index_file.clone();
    Self::init(
        root_dir,
        static_folder,
        &c.markdown_extensions[..],
        &c.ignore_dirs[..],
        &c.ignore_globs[..],
        index_file,
        &c.tag_sources[..],
    )
}
/// Creates an empty repo rooted at `root_dir`; nothing is scanned yet.
///
/// Each entry of `ignore_globs` is compiled once here. A pattern that fails to compile is
/// reported with a warning and left out of the compiled set. All maps start empty and all
/// progress flags start as `false`.
pub fn init<S: Into<String>, P: Into<std::path::PathBuf>>(
    root_dir: P,
    static_folder: S,
    markdown_extensions: &[String],
    ignore_dirs: &[String],
    ignore_globs: &[String],
    index_file: S,
    tag_sources: &[TagSource],
) -> Self {
    let mut compiled_ignore_globs: Vec<glob::Pattern> = Vec::new();
    for pat in ignore_globs {
        match glob::Pattern::new(pat) {
            Ok(compiled) => compiled_ignore_globs.push(compiled),
            Err(e) => tracing::warn!("Invalid ignore glob pattern '{}': {}", pat, e),
        }
    }

    Self {
        root_dir: root_dir.into(),
        static_folder: static_folder.into(),
        markdown_extensions: markdown_extensions.to_vec(),
        ignore_dirs: ignore_dirs.to_vec(),
        ignore_globs: ignore_globs.to_vec(),
        compiled_ignore_globs,
        index_file: index_file.into(),
        scanned_folders: HashSet::new(),
        queued_folders: HashMap::new(),
        markdown_files: MarkdownFiles(HashMap::new()),
        other_files: OtherFiles(HashMap::new()),
        tag_index: Arc::new(TagIndex::new()),
        tag_sources: tag_sources.to_vec(),
        text_extracted: Arc::new(AtomicBool::new(false)),
        media_populated: Arc::new(AtomicBool::new(false)),
        scan_complete: Arc::new(AtomicBool::new(false)),
        scan_notify: Arc::new(tokio::sync::Notify::new()),
        media_notify: Arc::new(tokio::sync::Notify::new()),
    }
}
/// Scans a single directory level of `relative_folder_path` (relative to `root_dir`).
///
/// The folder is skipped if its canonical path was already scanned. Otherwise its direct
/// children are visited (symlinks followed, ignored entries filtered out):
/// - sub-directories are added to `queued_folders` rather than recursed into,
/// - files with a markdown extension become `MarkdownInfo` entries; their frontmatter is then
///   read in parallel and any configured tag fields are added to the tag index,
/// - every other file becomes an `OtherFileInfo` with empty (unpopulated) metadata.
///
/// # Errors
/// Returns an error when the folder path cannot be canonicalized. Directory entries that
/// cannot be read are skipped silently; markdown files whose file details cannot be read are
/// skipped with a warning.
pub fn scan_folder<P: AsRef<Path>>(
&self,
relative_folder_path: &P,
) -> Result<(), Box<dyn std::error::Error>> {
let relative_folder_path_ref = relative_folder_path.as_ref();
let start_folder = self
.root_dir
.join(relative_folder_path_ref)
.canonicalize()?;
if self.scanned_folders.pin().contains(&start_folder) {
return Ok(());
}
tracing::debug!("Scanning folder: {:?}", relative_folder_path_ref);
self.scanned_folders.pin().insert(start_folder.clone());
let walkdir_start = Instant::now();
// Depth is pinned to exactly 1: only the folder's direct children are visited.
let dir_walker = WalkDir::new(start_folder.clone())
.follow_links(true)
.min_depth(1)
.max_depth(1)
.into_iter()
.filter_entry(|e| {
!should_ignore_compiled(e.path(), &self.ignore_dirs, &self.compiled_ignore_globs)
});
// Collected locally first; the shared maps are only written after the walk.
let mut markdown = std::collections::HashMap::new();
let mut other = std::collections::HashMap::new();
for entry in dir_walker.filter_map(|e| e.ok()) {
let path = entry.path();
let extension = path.extension().and_then(|x| x.to_str()).unwrap_or("");
if path.is_dir() {
// Falls back to the full path when no relative path can be computed.
let relative_entry =
pathdiff::diff_paths(path, &self.root_dir).unwrap_or(path.to_path_buf());
self.queued_folders
.pin()
.insert(path.to_path_buf(), relative_entry);
} else if is_markdown_extension(extension, &self.markdown_extensions) {
if let Ok((_filesize, created, modified)) = file_details_from_path(path) {
let url = build_markdown_url_path(path, &self.root_dir, &self.index_file);
let mdfile = MarkdownInfo {
raw_path: path.to_path_buf(),
url_path: url,
created,
modified,
frontmatter: None,
};
markdown.insert(path.to_path_buf(), mdfile);
} else {
tracing::warn!("Couldn't process markdown file at {:?}", path);
}
} else {
let url = build_static_url_path(path, &self.root_dir, &self.static_folder);
let other_file = OtherFileInfo {
raw_path: path.to_path_buf(),
url_path: url,
metadata: StaticFileMetadata::empty(path),
extracted_text: None,
};
other.insert(path.to_path_buf(), other_file);
}
}
tracing::debug!(
"scan_folder WalkDir for {:?}: {} markdown, {} other files in {:?}",
relative_folder_path_ref,
markdown.len(),
other.len(),
walkdir_start.elapsed()
);
let frontmatter_start = Instant::now();
// Frontmatter is read per file in parallel; each page is indexed under every tag it has.
markdown
.into_par_iter()
.for_each(|(mdfile, mddetails): (PathBuf, MarkdownInfo)| {
let metadata = crate::markdown::extract_metadata_from_file(&mdfile).ok();
let details = if let Some(ref frontmatter) = metadata {
let title = get_page_title(frontmatter, &mddetails.raw_path);
let description = frontmatter
.get("description")
.and_then(|v| v.as_str())
.map(|s| s.to_string());
for tag_source in &self.tag_sources {
if let Some(tag_value_json) = frontmatter.get(&tag_source.field) {
for tag_value in extract_tag_values(tag_value_json) {
let page = if let Some(ref desc) = description {
TaggedPage::with_description(
&mddetails.url_path,
&title,
desc,
&tag_value,
)
} else {
TaggedPage::new(&mddetails.url_path, &title, &tag_value)
};
self.tag_index.add_page(&tag_source.field, &tag_value, page);
}
}
}
MarkdownInfo {
frontmatter: metadata,
..mddetails
}
} else {
// No readable frontmatter: keep the entry with `frontmatter: None`.
mddetails
};
self.markdown_files.pin().insert(mdfile, details);
});
tracing::debug!(
"scan_folder frontmatter extraction for {:?}: {:?}",
relative_folder_path_ref,
frontmatter_start.elapsed()
);
let static_insert_start = Instant::now();
for (file, other_file) in other {
self.other_files.pin().insert(file, other_file);
}
tracing::debug!(
"scan_folder static file registration for {:?}: {:?}",
relative_folder_path_ref,
static_insert_start.elapsed()
);
Ok(())
}
/// Scans the whole tree under `root_dir` except the static folder, then logs a summary.
///
/// The static folder is temporarily recorded as already scanned so that `scan_folder` skips
/// it; that marker is removed again at the end so `scan_static_folder` can process it later.
/// Queued sub-folders are drained in batches, each batch scanned in parallel; a folder that
/// fails to scan is logged and does not abort the run.
///
/// # Errors
/// Returns an error only when scanning the root folder itself fails.
///
/// # Panics
/// Panics if the folder queue is not empty immediately after being cleared.
pub fn scan_all(&self) -> Result<(), Box<dyn std::error::Error>> {
let scan_all_start = Instant::now();
// `None` when the static folder does not exist or cannot be canonicalized.
let static_deferred = self
.root_dir
.join(&self.static_folder)
.canonicalize()
.ok()
.inspect(|p| {
self.scanned_folders.pin().insert(p.clone());
});
self.scan_folder(&PathBuf::from("."))?;
while !self.queued_folders.is_empty() {
// Snapshot the queue, clear it, then scan the snapshot; scans may queue more folders.
let vec_folders: Vec<_> = self
.queued_folders
.pin()
.iter()
.map(|(_, relative)| relative.clone())
.collect();
self.queued_folders.pin().clear();
assert!(self.queued_folders.is_empty());
tracing::debug!("Parallel batch: {:?}", &vec_folders);
vec_folders.into_par_iter().for_each(|rel_path| {
self.scan_folder(&rel_path).unwrap_or_else(|e| {
tracing::error!("Failed to scan folder {:?}: {e}", &rel_path)
}) });
}
// Un-mark the static folder so a later static scan is not skipped.
if let Some(ref sp) = static_deferred {
self.scanned_folders.pin().remove(sp);
}
let markdown_count = self.markdown_files.len();
let other_count = self.other_files.len();
let other_pin = self.other_files.pin();
let mut pdf_count: usize = 0;
let mut image_count: usize = 0;
let mut video_count: usize = 0;
let mut audio_count: usize = 0;
let mut text_count: usize = 0;
let mut misc_count: usize = 0;
for (_, info) in other_pin.iter() {
match info.filetype() {
"pdf" => pdf_count += 1,
"image" => image_count += 1,
"video" => video_count += 1,
"audio" => audio_count += 1,
"text" => text_count += 1,
_ => misc_count += 1,
}
}
tracing::info!(
"scan_all completed in {:?}: {} markdown files, {} other files (pdf={}, image={}, video={}, audio={}, text={}, other={})",
scan_all_start.elapsed(),
markdown_count,
other_count,
pdf_count,
image_count,
video_count,
audio_count,
text_count,
misc_count,
);
Ok(())
}
/// Scans the static folder and everything beneath it.
///
/// Does nothing when the static folder is not an existing directory. Sub-folders queued by
/// each scan are processed in parallel batches until the queue is empty; a folder that fails
/// to scan is logged and skipped.
///
/// # Errors
/// Returns an error only when scanning the static folder itself fails.
pub fn scan_static_folder(&self) -> Result<(), Box<dyn std::error::Error>> {
    let started = Instant::now();
    if !self.root_dir.join(&self.static_folder).is_dir() {
        return Ok(());
    }
    self.scan_folder(&PathBuf::from(&self.static_folder))?;

    loop {
        if self.queued_folders.is_empty() {
            break;
        }
        // Take a snapshot of the queue and empty it before scanning the snapshot.
        let batch: Vec<PathBuf> = {
            let queued = self.queued_folders.pin();
            let snapshot = queued.iter().map(|(_, rel)| rel.clone()).collect();
            queued.clear();
            snapshot
        };
        batch.into_par_iter().for_each(|rel_path| {
            if let Err(e) = self.scan_folder(&rel_path) {
                tracing::error!("Failed to scan folder {:?}: {e}", &rel_path)
            }
        });
    }

    let total_other = self.other_files.len();
    tracing::info!(
        "scan_static_folder completed in {:?}: {} other files total",
        started.elapsed(),
        total_other,
    );
    Ok(())
}
/// Records that scanning has finished and wakes every task waiting in `wait_for_scan`.
pub fn mark_scan_complete(&self) {
// The flag is stored before notifying so woken waiters observe it as set.
self.scan_complete.store(true, Ordering::SeqCst);
self.scan_notify.notify_waiters();
}
/// Fills in size and timestamps for every non-markdown file that has no recorded size yet.
///
/// The affected keys are collected first, then processed in parallel: each entry is looked
/// up again, its metadata is re-populated from the file system, and the updated entry is
/// written back. Entries removed in the meantime are skipped.
pub fn populate_basic_metadata(&self) {
    let started = Instant::now();

    let pending: Vec<PathBuf> = {
        let files = self.other_files.pin();
        let keys = files
            .iter()
            .filter_map(|(path, info)| {
                info.metadata
                    .file_size_bytes
                    .is_none()
                    .then(|| path.clone())
            })
            .collect();
        keys
    };
    let count = pending.len();

    pending.into_par_iter().for_each(|key| {
        let refreshed = {
            let files = self.other_files.pin();
            let next = files.get(&key).map(|info| OtherFileInfo {
                metadata: info.metadata.clone().populate_basic(),
                ..info.clone()
            });
            next
        };
        if let Some(updated) = refreshed {
            self.other_files.pin().insert(key, updated);
        }
    });

    tracing::info!(
        "populate_basic_metadata completed for {} files in {:?}",
        count,
        started.elapsed()
    );
}
/// Returns `true` once `mark_scan_complete` has been called.
pub fn is_scan_complete(&self) -> bool {
self.scan_complete.load(Ordering::SeqCst)
}
/// Waits until the scan has been marked complete; returns immediately if it already is.
pub async fn wait_for_scan(&self) {
    loop {
        // Register interest *before* checking the flag. `notify_waiters` stores no permit,
        // so a notification sent between the check and the creation of the `Notified`
        // future would otherwise be lost and this task could wait forever.
        let notified = self.scan_notify.notified();
        if self.is_scan_complete() {
            return;
        }
        notified.await;
    }
}
/// Returns the `media_populated` flag, which `populate_media_metadata` sets when it starts.
pub fn is_media_populated(&self) -> bool {
self.media_populated.load(Ordering::Acquire)
}
/// Wakes every task currently waiting in `wait_for_media`. Does not change any flag.
pub fn notify_media_populated(&self) {
self.media_notify.notify_waiters();
}
/// Waits until the media-populated flag is set; returns immediately if it already is.
///
/// NOTE(review): the flag is set when `populate_media_metadata` *starts*, so this can return
/// while that pass is still running — confirm callers are fine with that.
pub async fn wait_for_media(&self) {
    loop {
        // Register interest *before* checking the flag. `notify_waiters` stores no permit,
        // so a notification sent between the check and the creation of the `Notified`
        // future would otherwise be lost and this task could wait forever.
        let notified = self.media_notify.notified();
        if self.is_media_populated() {
            return;
        }
        notified.await;
    }
}
/// Serializes the repo to a JSON string (only the fields not marked `#[serde(skip)]`).
///
/// # Errors
/// Returns the `serde_json` error if serialization fails.
pub fn to_json(&self) -> serde_json::Result<String> {
serde_json::to_string(self)
}
/// Populates kind-specific media metadata for every non-markdown file, at most once.
///
/// The `media_populated` flag is set on entry; any call that finds it already set returns
/// immediately. With the `media-metadata` feature the entries are snapshotted, probed in
/// parallel and written back; without it the snapshot is discarded.
///
/// NOTE(review): the flag is set before the work is done and this method does not call
/// `notify_media_populated` itself — presumably the caller does; confirm.
pub fn populate_media_metadata(&self) {
if self.media_populated.swap(true, Ordering::SeqCst) {
return; }
let start = Instant::now();
let pin = self.other_files.pin();
let entries: Vec<_> = pin.iter().map(|(k, v)| (k.clone(), v.clone())).collect();
drop(pin);
#[cfg(feature = "media-metadata")]
{
entries.into_par_iter().for_each(|(key, info)| {
let updated = OtherFileInfo {
metadata: info.metadata.populate_media(),
..info
};
self.other_files.pin().insert(key, updated);
});
}
// Keeps `entries` "used" when the feature is compiled out.
#[cfg(not(feature = "media-metadata"))]
let _ = entries;
tracing::info!("populate_media_metadata completed in {:?}", start.elapsed());
}
pub fn ensure_text_extracted(&self) {
if self.text_extracted.swap(true, Ordering::SeqCst) {
return; }
let start = Instant::now();
let pin = self.other_files.pin();
let entries: Vec<_> = pin
.iter()
.filter(|(_, info)| info.is_searchable() && info.extracted_text.is_none())
.map(|(k, v)| (k.clone(), v.clone()))
.collect();
drop(pin);
entries.into_par_iter().for_each(|(key, mut info)| {
info.extracted_text = info.extract_text();
self.other_files.pin().insert(key, info);
});
tracing::info!("ensure_text_extracted completed in {:?}", start.elapsed());
}
/// Empties the scanned/queued folder sets, both file maps and the tag index, and resets the
/// text-extraction and media-population flags so those passes can run again.
///
/// NOTE(review): `scan_complete` is not reset here — confirm that is intentional.
pub fn clear(&self) {
self.scanned_folders.pin().clear();
self.markdown_files.pin().clear();
self.other_files.pin().clear();
self.queued_folders.pin().clear();
self.tag_index.clear();
self.text_extracted.store(false, Ordering::SeqCst);
self.media_populated.store(false, Ordering::SeqCst);
}
/// Updates the index for a single file after a file-system change.
///
/// - `Deleted`: removes the entry from the markdown or other-file map.
/// - `Created`: (re)builds the entry from disk and inserts it; for markdown files the page is
///   also added to the tag index for each configured tag field.
/// - `Modified`: (re)builds the entry from disk and inserts it, replacing any previous one.
///
/// Markdown files whose file details cannot be read are silently left untouched. Other files
/// get fresh basic metadata and lose any previously extracted text.
///
/// NOTE(review): the tag index is not updated on `Modified` or `Deleted` — presumably the
/// caller follows up with `rebuild_tag_index`; confirm.
pub fn invalidate_file(&self, abs_path: &Path, event: &crate::watcher::ChangeEventType) {
let extension = abs_path.extension().and_then(|x| x.to_str()).unwrap_or("");
let is_markdown = is_markdown_extension(extension, &self.markdown_extensions);
match event {
crate::watcher::ChangeEventType::Deleted => {
if is_markdown {
self.markdown_files.pin().remove(abs_path);
} else {
self.other_files.pin().remove(abs_path);
}
}
crate::watcher::ChangeEventType::Created => {
if is_markdown {
if let Ok((_filesize, created, modified)) = file_details_from_path(abs_path) {
let url =
build_markdown_url_path(abs_path, &self.root_dir, &self.index_file);
let frontmatter =
crate::markdown::extract_metadata_from_file(abs_path).ok();
// Index the new page under every tag value found in its frontmatter.
if let Some(ref fm) = frontmatter {
let title = get_page_title(fm, abs_path);
let description = fm
.get("description")
.and_then(|v| v.as_str())
.map(|s| s.to_string());
for tag_source in &self.tag_sources {
if let Some(tag_value_json) = fm.get(&tag_source.field) {
for tag_value in extract_tag_values(tag_value_json) {
let page = if let Some(ref desc) = description {
TaggedPage::with_description(
&url, &title, desc, &tag_value,
)
} else {
TaggedPage::new(&url, &title, &tag_value)
};
self.tag_index.add_page(
&tag_source.field,
&tag_value,
page,
);
}
}
}
}
let info = MarkdownInfo {
raw_path: abs_path.to_path_buf(),
url_path: url,
created,
modified,
frontmatter,
};
self.markdown_files
.pin()
.insert(abs_path.to_path_buf(), info);
}
} else {
let url = build_static_url_path(abs_path, &self.root_dir, &self.static_folder);
let info = OtherFileInfo {
raw_path: abs_path.to_path_buf(),
url_path: url,
metadata: StaticFileMetadata::empty(abs_path).populate_basic(),
extracted_text: None,
};
self.other_files.pin().insert(abs_path.to_path_buf(), info);
}
}
crate::watcher::ChangeEventType::Modified => {
if is_markdown {
if let Ok((_filesize, created, modified)) = file_details_from_path(abs_path) {
let url =
build_markdown_url_path(abs_path, &self.root_dir, &self.index_file);
let frontmatter =
crate::markdown::extract_metadata_from_file(abs_path).ok();
let info = MarkdownInfo {
raw_path: abs_path.to_path_buf(),
url_path: url,
created,
modified,
frontmatter,
};
self.markdown_files
.pin()
.insert(abs_path.to_path_buf(), info);
}
} else {
// Same handling as `Created` for non-markdown files.
let url = build_static_url_path(abs_path, &self.root_dir, &self.static_folder);
let info = OtherFileInfo {
raw_path: abs_path.to_path_buf(),
url_path: url,
metadata: StaticFileMetadata::empty(abs_path).populate_basic(),
extracted_text: None,
};
self.other_files.pin().insert(abs_path.to_path_buf(), info);
}
}
}
}
/// Returns the configured tag sources (the frontmatter fields that are indexed as tags).
pub fn tag_sources(&self) -> &[TagSource] {
&self.tag_sources
}
pub fn rebuild_tag_index(&self) {
self.tag_index.clear();
let pin = self.markdown_files.pin();
for (_, info) in pin.iter() {
if let Some(ref fm) = info.frontmatter {
let title = get_page_title(fm, &info.raw_path);
let description = fm
.get("description")
.and_then(|v| v.as_str())
.map(|s| s.to_string());
for tag_source in &self.tag_sources {
if let Some(tag_value_json) = fm.get(&tag_source.field) {
for tag_value in extract_tag_values(tag_value_json) {
let page = if let Some(ref desc) = description {
TaggedPage::with_description(
&info.url_path,
&title,
desc,
&tag_value,
)
} else {
TaggedPage::new(&info.url_path, &title, &tag_value)
};
self.tag_index.add_page(&tag_source.field, &tag_value, page);
}
}
}
}
}
}
}
/// Reads the size and timestamps of the file at `path`.
///
/// Returns `(size_in_bytes, created_secs, modified_secs)`, with both timestamps in whole
/// seconds since the Unix epoch. When the platform or file system cannot report a creation
/// time, the modification time is used for `created_secs` instead of failing the whole call.
///
/// # Errors
/// Returns an error when the file's metadata or modification time cannot be read, or when a
/// reported timestamp lies before the Unix epoch.
pub fn file_details_from_path<P: AsRef<Path>>(
    path: P,
) -> Result<(u64, u64, u64), Box<dyn std::error::Error>> {
    let metadata = std::fs::metadata(path.as_ref())?;
    let file_size = metadata.len();
    let modified_secs = metadata.modified()?.duration_since(UNIX_EPOCH)?.as_secs();
    // `created()` is documented to fail where birth time is unavailable; that should not
    // make an otherwise readable file unusable.
    let created_secs = match metadata.created() {
        Ok(created) => created.duration_since(UNIX_EPOCH)?.as_secs(),
        Err(_) => modified_secs,
    };
    Ok((file_size, created_secs, modified_secs))
}
/// Decides whether `path` should be skipped.
///
/// A path is ignored when its file name starts with a dot, when it is an existing directory
/// whose name is listed in `ignore_dirs`, or when it matches any pattern in `ignore_globs`.
/// Patterns that fail to compile never match.
pub fn should_ignore(path: &Path, ignore_dirs: &[String], ignore_globs: &[String]) -> bool {
    let file_name = match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name,
        None => "",
    };

    let hidden = file_name.starts_with('.');
    if hidden || (path.is_dir() && ignore_dirs.iter().any(|dir| dir.as_str() == file_name)) {
        return true;
    }

    for raw in ignore_globs {
        if let Ok(pattern) = glob::Pattern::new(raw) {
            if pattern.matches_path(path) {
                return true;
            }
        }
    }
    false
}
/// Same decision as `should_ignore`, but with the glob patterns already compiled.
///
/// A path is ignored when its file name starts with a dot, when it is an existing directory
/// whose name is listed in `ignore_dirs`, or when it matches any of `compiled_patterns`.
fn should_ignore_compiled(
    path: &Path,
    ignore_dirs: &[String],
    compiled_patterns: &[glob::Pattern],
) -> bool {
    let file_name = match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name,
        None => "",
    };

    let hidden = file_name.starts_with('.');
    if hidden || (path.is_dir() && ignore_dirs.iter().any(|dir| dir.as_str() == file_name)) {
        return true;
    }

    for pattern in compiled_patterns {
        if pattern.matches_path(path) {
            return true;
        }
    }
    false
}
pub fn build_markdown_url_path(path: &Path, root_dir: &Path, index_file: &str) -> String {
let mut url = pathdiff::diff_paths(path, root_dir)
.map(|p| p.to_string_lossy().to_string())
.unwrap_or_default();
if !url.starts_with('/') {
url.insert(0, '/');
}
if url.ends_with(index_file) {
url.truncate(url.len() - index_file.len());
}
if let Some(dot_pos) = url.rfind('.')
&& !url[dot_pos..].contains('/')
{
url.truncate(dot_pos);
url.push('/');
}
url
}
/// Builds the URL path of a static file from its location relative to `root_dir`.
///
/// When the relative path starts with the `static_folder` component(s), that prefix is
/// removed (`static/image.png` -> `/image.png`); otherwise the relative path is used as is
/// (`assets/image.png` -> `/assets/image.png`). Only a leading, whole-component match is
/// stripped, so names that merely contain the folder name (e.g. `static_files/`) are kept
/// intact. The result always starts with `/`.
pub fn build_static_url_path(path: &Path, root_dir: &Path, static_folder: &str) -> String {
    let relative = pathdiff::diff_paths(path, root_dir).unwrap_or_default();
    // `Path::strip_prefix` compares whole components, unlike a textual replace.
    let served = relative.strip_prefix(static_folder).unwrap_or(&relative);
    let mut url = served.to_string_lossy().into_owned();
    if !url.starts_with('/') {
        url.insert(0, '/');
    }
    url
}
/// Returns `true` when `extension` exactly equals (case-sensitively) one of
/// `markdown_extensions`.
pub fn is_markdown_extension(extension: &str, markdown_extensions: &[String]) -> bool {
    for known in markdown_extensions {
        if known.as_str() == extension {
            return true;
        }
    }
    false
}
/// Splits a comma-separated tag string into trimmed, non-empty tag values.
pub fn parse_tag_values(values: &str) -> impl Iterator<Item = String> + '_ {
    values.split(',').filter_map(|piece| {
        let tag = piece.trim();
        if tag.is_empty() {
            None
        } else {
            Some(tag.to_string())
        }
    })
}
/// Extracts tag values from a frontmatter JSON value.
///
/// A string is split on commas via `parse_tag_values`; an array contributes each of its
/// string elements unchanged (non-string elements are skipped); any other value yields no
/// tags.
pub fn extract_tag_values(value: &serde_json::Value) -> Vec<String> {
    if let Some(text) = value.as_str() {
        return parse_tag_values(text).collect();
    }
    let Some(items) = value.as_array() else {
        return Vec::new();
    };
    items
        .iter()
        .filter_map(|item| item.as_str().map(str::to_owned))
        .collect()
}
/// Picks a display title for a page.
///
/// Uses the frontmatter's string `title` when present, otherwise the file stem of `path`,
/// and `"Untitled"` when the path has no usable file stem.
fn get_page_title(frontmatter: &crate::markdown::SimpleMetadata, path: &Path) -> String {
    if let Some(title) = frontmatter.get("title").and_then(|v| v.as_str()) {
        return title.to_string();
    }
    match path.file_stem().and_then(|stem| stem.to_str()) {
        Some(stem) => stem.to_string(),
        None => "Untitled".to_string(),
    }
}
// Unit tests for the free helper functions: ignore rules, URL building, markdown extension
// matching, tag parsing and page-title selection.
#[cfg(test)]
mod tests {
use super::*;
// --- should_ignore ---
#[test]
fn test_should_ignore_hidden_file() {
let path = Path::new(".hidden");
assert!(should_ignore(path, &[], &[]));
}
#[test]
fn test_should_ignore_hidden_dir() {
let path = Path::new(".git");
assert!(should_ignore(path, &[], &[]));
}
#[test]
fn test_should_ignore_normal_file() {
let path = Path::new("readme.md");
assert!(!should_ignore(path, &[], &[]));
}
#[test]
fn test_should_ignore_glob_pattern() {
let path = Path::new("test.log");
let globs = vec!["*.log".to_string()];
assert!(should_ignore(path, &[], &globs));
}
#[test]
fn test_should_ignore_glob_no_match() {
let path = Path::new("test.md");
let globs = vec!["*.log".to_string()];
assert!(!should_ignore(path, &[], &globs));
}
// --- build_markdown_url_path ---
#[test]
fn test_build_markdown_url_path_simple() {
let root = Path::new("/root");
let path = Path::new("/root/readme.md");
assert_eq!(build_markdown_url_path(path, root, "index.md"), "/readme/");
}
#[test]
fn test_build_markdown_url_path_nested() {
let root = Path::new("/root");
let path = Path::new("/root/docs/guide.md");
assert_eq!(
build_markdown_url_path(path, root, "index.md"),
"/docs/guide/"
);
}
#[test]
fn test_build_markdown_url_path_index() {
let root = Path::new("/root");
let path = Path::new("/root/docs/index.md");
assert_eq!(build_markdown_url_path(path, root, "index.md"), "/docs/");
}
#[test]
fn test_build_markdown_url_path_root_index() {
let root = Path::new("/root");
let path = Path::new("/root/index.md");
assert_eq!(build_markdown_url_path(path, root, "index.md"), "/");
}
// --- build_static_url_path ---
#[test]
fn test_build_static_url_path_in_static() {
let root = Path::new("/root");
let path = Path::new("/root/static/image.png");
assert_eq!(build_static_url_path(path, root, "static"), "/image.png");
}
#[test]
fn test_build_static_url_path_not_in_static() {
let root = Path::new("/root");
let path = Path::new("/root/assets/image.png");
assert_eq!(
build_static_url_path(path, root, "static"),
"/assets/image.png"
);
}
// --- is_markdown_extension ---
#[test]
fn test_is_markdown_extension_true() {
let extensions = vec!["md".to_string(), "markdown".to_string()];
assert!(is_markdown_extension("md", &extensions));
assert!(is_markdown_extension("markdown", &extensions));
}
#[test]
fn test_is_markdown_extension_false() {
let extensions = vec!["md".to_string()];
assert!(!is_markdown_extension("txt", &extensions));
assert!(!is_markdown_extension("html", &extensions));
}
// --- parse_tag_values ---
#[test]
fn test_parse_tag_values_basic() {
let tags: Vec<String> = parse_tag_values("rust, programming, web dev").collect();
assert_eq!(tags, vec!["rust", "programming", "web dev"]);
}
#[test]
fn test_parse_tag_values_single() {
let tags: Vec<String> = parse_tag_values("rust").collect();
assert_eq!(tags, vec!["rust"]);
}
#[test]
fn test_parse_tag_values_whitespace() {
let tags: Vec<String> = parse_tag_values(" rust , python ").collect();
assert_eq!(tags, vec!["rust", "python"]);
}
#[test]
fn test_parse_tag_values_empty() {
let tags: Vec<String> = parse_tag_values("").collect();
assert!(tags.is_empty());
}
#[test]
fn test_parse_tag_values_empty_between() {
let tags: Vec<String> = parse_tag_values("rust,,python").collect();
assert_eq!(tags, vec!["rust", "python"]);
}
// --- get_page_title ---
#[test]
fn test_get_page_title_from_frontmatter() {
let mut frontmatter = std::collections::HashMap::new();
frontmatter.insert(
"title".to_string(),
serde_json::Value::String("My Page Title".to_string()),
);
let path = Path::new("/docs/readme.md");
assert_eq!(get_page_title(&frontmatter, path), "My Page Title");
}
#[test]
fn test_get_page_title_from_filename() {
let frontmatter = std::collections::HashMap::new();
let path = Path::new("/docs/rust-guide.md");
assert_eq!(get_page_title(&frontmatter, path), "rust-guide");
}
#[test]
fn test_get_page_title_fallback() {
let frontmatter = std::collections::HashMap::new();
// The root path has no file stem, so the fallback title is used.
let path = Path::new("/");
assert_eq!(get_page_title(&frontmatter, path), "Untitled");
}
// --- extract_tag_values ---
#[test]
fn test_extract_tag_values_from_array() {
let val = serde_json::json!(["rust", "python"]);
let tags = extract_tag_values(&val);
assert_eq!(tags, vec!["rust", "python"]);
}
#[test]
fn test_extract_tag_values_from_comma_string() {
let val = serde_json::json!("rust, python");
let tags = extract_tag_values(&val);
assert_eq!(tags, vec!["rust", "python"]);
}
#[test]
fn test_extract_tag_values_from_single_string() {
let val = serde_json::json!("rust");
let tags = extract_tag_values(&val);
assert_eq!(tags, vec!["rust"]);
}
#[test]
fn test_extract_tag_values_from_number() {
let val = serde_json::json!(42);
let tags = extract_tag_values(&val);
assert!(tags.is_empty());
}
}
// Property-based tests (proptest) for the ignore rules, extension matching and URL builders.
#[cfg(test)]
mod proptests {
use super::*;
use proptest::prelude::*;
// Generates 1-20 character names made of ASCII letters, digits, `_` and `-`.
fn valid_name_strategy() -> impl Strategy<Value = String> {
"[a-zA-Z0-9_-]{1,20}"
}
// Generates 1-10 character lowercase extensions.
fn extension_strategy() -> impl Strategy<Value = String> {
"[a-z]{1,10}"
}
proptest! {
// Calling `should_ignore` twice with the same inputs gives the same answer.
#[test]
fn prop_should_ignore_deterministic(
name in valid_name_strategy(),
ignore_dirs in proptest::collection::vec(valid_name_strategy(), 0..3),
ignore_globs in proptest::collection::vec("[*][.][a-z]{1,5}", 0..3),
) {
let path = Path::new(&name);
let result1 = should_ignore(path, &ignore_dirs, &ignore_globs);
let result2 = should_ignore(path, &ignore_dirs, &ignore_globs);
prop_assert_eq!(result1, result2);
}
// Any name starting with a dot is ignored.
#[test]
fn prop_hidden_files_always_ignored(name in "[.][a-zA-Z0-9]{1,15}") {
let path = Path::new(&name);
prop_assert!(should_ignore(path, &[], &[]));
}
// Plain alphanumeric names are not ignored when no rules are configured.
#[test]
fn prop_normal_files_not_ignored(name in "[a-zA-Z][a-zA-Z0-9]{0,15}") {
let path = Path::new(&name);
prop_assert!(!should_ignore(path, &[], &[]));
}
// Calling `is_markdown_extension` twice with the same inputs gives the same answer.
#[test]
fn prop_is_markdown_extension_deterministic(
ext in extension_strategy(),
extensions in proptest::collection::vec(extension_strategy(), 1..5)
) {
let result1 = is_markdown_extension(&ext, &extensions);
let result2 = is_markdown_extension(&ext, &extensions);
prop_assert_eq!(result1, result2);
}
// An extension taken from the list is always recognised.
#[test]
fn prop_extension_in_list_returns_true(
extensions in proptest::collection::vec(extension_strategy(), 1..5)
) {
if let Some(ext) = extensions.first() {
prop_assert!(is_markdown_extension(ext, &extensions));
}
}
// Markdown URLs always start with `/`.
#[test]
fn prop_markdown_url_starts_with_slash(
subpath in proptest::collection::vec(valid_name_strategy(), 1..4),
filename in valid_name_strategy(),
) {
let root = PathBuf::from("/root");
let mut full_path = root.clone();
for component in &subpath {
full_path.push(component);
}
full_path.push(format!("{}.md", filename));
let url = build_markdown_url_path(&full_path, &root, "index.md");
prop_assert!(url.starts_with('/'), "URL should start with /: {}", url);
}
// Markdown URLs always end with `/`.
#[test]
fn prop_markdown_url_ends_with_slash(
subpath in proptest::collection::vec(valid_name_strategy(), 0..4),
filename in valid_name_strategy(),
) {
let root = PathBuf::from("/root");
let mut full_path = root.clone();
for component in &subpath {
full_path.push(component);
}
full_path.push(format!("{}.md", filename));
let url = build_markdown_url_path(&full_path, &root, "index.md");
prop_assert!(url.ends_with('/'), "URL should end with /: {}", url);
}
// Static URLs always start with `/`.
#[test]
fn prop_static_url_starts_with_slash(
subpath in proptest::collection::vec(valid_name_strategy(), 0..4),
filename in valid_name_strategy(),
ext in extension_strategy(),
) {
let root = PathBuf::from("/root");
let mut full_path = root.clone();
for component in &subpath {
full_path.push(component);
}
full_path.push(format!("{}.{}", filename, ext));
let url = build_static_url_path(&full_path, &root, "static");
prop_assert!(url.starts_with('/'), "URL should start with /: {}", url);
}
// Markdown URLs never contain an empty path segment.
#[test]
fn prop_no_double_slashes_in_markdown_url(
subpath in proptest::collection::vec(valid_name_strategy(), 0..4),
filename in valid_name_strategy(),
) {
let root = PathBuf::from("/root");
let mut full_path = root.clone();
for component in &subpath {
full_path.push(component);
}
full_path.push(format!("{}.md", filename));
let url = build_markdown_url_path(&full_path, &root, "index.md");
prop_assert!(!url.contains("//"), "URL should not contain //: {}", url);
}
}
}