obsidian_parser/vault/mod.rs
1//! Obsidian vault parsing and analysis
2//!
3//! Provides functionality for working with entire Obsidian vaults (collections of notes)
4//!
5//! # Performance Recommendations
//! **Prefer [`ObFileOnDisk`] over [`ObFileInMemory`](crate::prelude::ObFileInMemory) for large vaults** - it uses significantly less memory
7//! by reading files on-demand rather than loading everything into memory upfront.
8//!
9//! # Examples
10//! ## Basic vault analysis
11//! ```no_run
12//! use obsidian_parser::prelude::*;
13//!
14//! // Open a vault using default properties (HashMap)
15//! let vault = Vault::open_default("/path/to/vault").unwrap();
16//!
17//! // Check for duplicate note names (important for graph operations)
18//! if vault.check_unique_note_name() {
19//! println!("All note names are unique");
20//! } else {
21//! println!("Duplicate note names found!");
22//! }
23//!
24//! // Access parsed files
25//! for file in &vault.files {
26//! println!("Note: {:?}", file.path());
27//! }
28//! ```
29//!
30//! ## Using custom properties
31//! ```no_run
32//! use obsidian_parser::prelude::*;
33//! use serde::Deserialize;
34//!
35//! #[derive(Clone, Default, Deserialize)]
36//! struct NoteProperties {
37//! created: String,
38//! tags: Vec<String>,
39//! priority: u8,
40//! }
41//!
42//! let vault: Vault<NoteProperties> = Vault::open("/path/to/vault").unwrap();
43//!
44//! // Access custom properties
45//! for file in &vault.files {
46//! let properties = file.properties().unwrap();
47//!
48//! println!(
49//! "Note created at {} with tags: {:?}",
50//! properties.created,
51//! properties.tags
52//! );
53//! }
54//! ```
55//!
56//! ## Building knowledge graphs (requires petgraph feature)
57//! ```no_run
58//! #[cfg(feature = "petgraph")]
59//! {
60//! use obsidian_parser::prelude::*;
61//! use petgraph::dot::{Dot, Config};
62//!
63//! let vault = Vault::open_default("/path/to/vault").unwrap();
64//!
65//! // Build directed graph
66//! let graph = vault.get_digraph();
67//! println!("Graph visualization:\n{:?}",
68//! Dot::with_config(&graph, &[Config::EdgeNoLabel])
69//! );
70//!
71//! // Analyze connectivity
72//! let components = petgraph::algo::connected_components(&graph);
73//! println!("Found {} connected components in knowledge base", components);
74//! }
75//! ```
76//!
//! ## Using a custom [`ObFile`] implementation (example with [`ObFileInMemory`](crate::prelude::ObFileInMemory))
78//! ```no_run
79//! use obsidian_parser::prelude::*;
80//! use serde::Deserialize;
81//!
82//! #[derive(Clone, Default, Deserialize)]
83//! struct NoteProperties {
84//! created: String,
85//! tags: Vec<String>,
86//! priority: u8,
87//! }
88//!
89//! let vault: Vault<NoteProperties, ObFileInMemory<NoteProperties>> = Vault::open("/path/to/vault").unwrap();
90//! ```
91
92#[cfg(feature = "petgraph")]
93pub mod vault_petgraph;
94
95#[cfg(test)]
96mod vault_test;
97
98use crate::obfile::ObFile;
99use crate::{error::Error, prelude::ObFileOnDisk};
100use serde::de::DeserializeOwned;
101use std::collections::HashSet;
102use std::{
103 collections::HashMap,
104 marker::PhantomData,
105 path::{Path, PathBuf},
106};
107use walkdir::{DirEntry, WalkDir};
108
109fn is_hidden(entry: &DirEntry) -> bool {
110 entry
111 .file_name()
112 .to_str()
113 .is_some_and(|s| s.starts_with('.'))
114}
115
/// Represents an entire Obsidian vault
///
/// Holds the notes of the vault together with the path of the vault root. Uses [`ObFileOnDisk`]
/// as the file representation by default, which is optimized for memory efficiency in large vaults.
///
/// # Type Parameters
/// - `T`: Type for frontmatter properties
/// - `F`: File representation type (defaults to [`ObFileOnDisk`] over `T`)
#[derive(Debug, Default, PartialEq, Eq, Clone)]
pub struct Vault<T, F = ObFileOnDisk<T>>
where
    T: DeserializeOwned + Clone,
    F: ObFile<T> + Send,
{
    /// All files in the vault
    ///
    /// [`Vault::open`] fills this with every Markdown file that was parsed successfully.
    pub files: Vec<F>,

    /// Path to vault root directory
    pub path: PathBuf,

    /// Zero-sized marker that ties the vault to its properties type `T`,
    /// which is not stored directly in any other field
    pub phantom: PhantomData<T>,
}
139
140fn check_vault(path: impl AsRef<Path>) -> Result<(), Error> {
141 let path_buf = path.as_ref().to_path_buf();
142
143 if !path_buf.is_dir() {
144 #[cfg(feature = "logging")]
145 log::error!("Path is not directory: {}", path_buf.display());
146
147 return Err(Error::IsNotDir(path_buf));
148 }
149
150 Ok(())
151}
152
153fn get_files_for_parse<T: FromIterator<DirEntry>>(path: impl AsRef<Path>) -> T {
154 WalkDir::new(path)
155 .min_depth(1)
156 .into_iter()
157 .filter_entry(|x| !is_hidden(x))
158 .filter_map(Result::ok)
159 .filter(|x| {
160 x.path()
161 .extension()
162 .is_some_and(|ext| ext.eq_ignore_ascii_case("md"))
163 })
164 .collect()
165}
166
167impl<T, F> Vault<T, F>
168where
169 T: DeserializeOwned + Clone,
170 F: ObFile<T> + Send,
171{
172 #[cfg(feature = "rayon")]
173 fn parse_files<L>(files: &[DirEntry], f: L) -> Vec<F>
174 where
175 L: Fn(&DirEntry) -> Option<F> + Sync + Send,
176 {
177 use rayon::prelude::*;
178
179 files.into_par_iter().filter_map(f).collect()
180 }
181
182 #[cfg(not(feature = "rayon"))]
183 fn parse_files<L>(files: &[DirEntry], f: L) -> Vec<F>
184 where
185 L: Fn(&DirEntry) -> Option<F>,
186 {
187 files.into_iter().filter_map(f).collect()
188 }
189
190 /// Opens and parses an Obsidian vault
191 ///
192 /// Recursively scans the directory for Markdown files (.md) and parses them.
193 /// Uses [`ObFileOnDisk`] by default which is more memory efficient than [`ObFileInMemory`](crate::prelude::ObFileInMemory).
194 ///
195 /// # Arguments
196 /// * `path` - Path to the vault directory
197 ///
198 /// # Errors
199 /// Returns `Error` if:
200 /// - Path doesn't exist or isn't a directory
201 ///
202 /// Files that fail parsing are skipped
203 ///
204 /// # Memory Considerations
205 /// For vaults with 1000+ notes, prefer [`ObFileOnDisk`] (default) over [`ObFileInMemory`](crate::prelude::ObFileInMemory) as it:
206 /// 1. Uses 90%+ less memory upfront
207 /// 2. Only loads file content when accessed
208 /// 3. Scales better for large knowledge bases
209 pub fn open<P: AsRef<Path>>(path: P) -> Result<Self, Error> {
210 let path_buf = path.as_ref().to_path_buf();
211
212 #[cfg(feature = "logging")]
213 log::debug!("Opening vault at: {}", path_buf.display());
214
215 check_vault(&path)?;
216 let files_for_parse: Vec<_> = get_files_for_parse(&path);
217
218 #[cfg(feature = "logging")]
219 log::debug!("Found {} markdown files to parse", files_for_parse.len());
220
221 #[allow(unused_variables)]
222 let files = Self::parse_files(&files_for_parse, |file| match F::from_file(file.path()) {
223 Ok(file) => Some(file),
224 Err(e) => {
225 #[cfg(feature = "logging")]
226 log::warn!("Failed to parse {}: {}", file.path().display(), e);
227
228 None
229 }
230 });
231
232 #[cfg(feature = "logging")]
233 log::info!("Parsed {} files", files.len());
234
235 Ok(Self {
236 files,
237 path: path_buf,
238 phantom: PhantomData,
239 })
240 }
241
242 /// Returns duplicated note name
243 ///
244 /// # Other
245 /// See [`check_unique_note_name`](Vault::check_unique_note_name)
246 #[must_use]
247 pub fn get_duplicates_notes(&self) -> Vec<String> {
248 #[cfg(feature = "logging")]
249 log::debug!(
250 "Get duplicates notes in {} ({} files)",
251 self.path.display(),
252 self.files.len()
253 );
254
255 let mut seens_notes = HashSet::new();
256 let mut duplicated_notes = Vec::new();
257
258 #[allow(
259 clippy::missing_panics_doc,
260 clippy::unwrap_used,
261 reason = "In any case, we will have a path to the files"
262 )]
263 for name_note in self.files.iter().map(|x| x.note_name().unwrap()) {
264 if !seens_notes.insert(name_note.clone()) {
265 #[cfg(feature = "logging")]
266 log::trace!("Found duplicate: {name_note}");
267
268 duplicated_notes.push(name_note);
269 }
270 }
271
272 #[cfg(feature = "logging")]
273 if !duplicated_notes.is_empty() {
274 log::warn!("Found {} duplicate filenames", duplicated_notes.len());
275 }
276
277 duplicated_notes
278 }
279
280 /// Checks if all note filenames in the vault are unique
281 ///
282 /// **Critical for graph operations** where notes are identified by name.
283 /// Always run this before calling [`get_digraph`](Vault::get_digraph) or [`get_ungraph`](Vault::get_ungraph).
284 ///
285 /// # Returns
286 /// `true` if all filenames are unique, `false` otherwise
287 ///
288 /// # Performance
289 /// Operates in O(n) time - safe for large vaults
290 ///
291 /// # Other
292 /// See [`get_duplicates_notes`](Vault::get_duplicates_notes)
293 #[must_use]
294 pub fn check_unique_note_name(&self) -> bool {
295 self.get_duplicates_notes().is_empty()
296 }
297}
298
299#[allow(clippy::implicit_hasher)]
300impl Vault<HashMap<String, serde_yml::Value>, ObFileOnDisk> {
301 /// Opens vault using default properties ([`HashMap`]) and [`ObFileOnDisk`] storage
302 ///
303 /// Recommended for most use cases due to its memory efficiency
304 ///
305 /// # Errors
306 /// Returns `Error` if:
307 /// - Path doesn't exist or isn't a directory
308 pub fn open_default<P: AsRef<Path>>(path: P) -> Result<Self, Error> {
309 Self::open(path)
310 }
311}
312
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{test_utils::init_test_logger, vault::vault_test::create_test_vault};
    use std::fs::File;

    /// Opening the fixture vault yields one parsed file per fixture note and keeps the root path.
    #[test]
    fn open() {
        init_test_logger();
        let (vault_path, vault_files) = create_test_vault().unwrap();
        let vault = Vault::open_default(vault_path.path()).unwrap();

        assert_eq!(vault.files.len(), vault_files.len());
        assert_eq!(vault.path, vault_path.path());
    }

    /// Opening a regular file instead of a directory must fail with `Error::IsNotDir`.
    ///
    /// The error is asserted explicitly rather than through `#[should_panic]`, so a failure
    /// while building the fixture (or in the precondition check) cannot make this test pass.
    #[test]
    fn open_not_dir() {
        init_test_logger();
        let (vault_path, _) = create_test_vault().unwrap();
        let path_to_file = vault_path.path().join("main.md");
        assert!(path_to_file.is_file());

        let result = Vault::open_default(&path_to_file);
        assert!(matches!(result, Err(Error::IsNotDir(_))));
    }

    /// Files without the `.md` extension are ignored when opening a vault.
    #[test]
    fn open_with_extra_files() {
        init_test_logger();
        let (vault_path, vault_files) = create_test_vault().unwrap();
        File::create(vault_path.path().join("extra_file.not_md")).unwrap();

        let vault = Vault::open_default(vault_path.path()).unwrap();

        assert_eq!(vault.files.len(), vault_files.len());
        assert_eq!(vault.path, vault_path.path());
    }

    /// The fixture vault has unique names; pushing a copy of a note introduces a duplicate.
    #[test]
    fn check_unique_note_name() {
        init_test_logger();
        let (vault_path, _) = create_test_vault().unwrap();

        let mut vault = Vault::open_default(vault_path.path()).unwrap();
        assert!(vault.check_unique_note_name());

        vault.files.push(vault.files.first().unwrap().clone());
        assert!(!vault.check_unique_note_name());
    }
}
363}