obsidian_parser/vault/
mod.rs

1//! Obsidian vault parsing and analysis
2//!
3//! Provides functionality for working with entire Obsidian vaults (collections of notes)
4//!
5//! # Performance Recommendations
6//! **Prefer [`ObFileOnDisk`] over [`ObFileInMemory`](crate::prelude::ObFileInMemory) for large vaults** - it uses significantly less memory
7//! by reading files on-demand rather than loading everything into memory upfront.
8//!
9//! # Examples
10//! ## Basic vault analysis
11//! ```no_run
12//! use obsidian_parser::prelude::*;
13//!
14//! // Open a vault using default properties (HashMap)
15//! let vault = Vault::open_default("/path/to/vault").unwrap();
16//!
17//! // Check for duplicate note names (important for graph operations)
18//! if vault.check_unique_note_name() {
19//!     println!("All note names are unique");
20//! } else {
21//!     println!("Duplicate note names found!");
22//! }
23//!
24//! // Access parsed files
25//! for file in &vault.files {
26//!     println!("Note: {:?}", file.path());
27//! }
28//! ```
29//!
30//! ## Using custom properties
31//! ```no_run
32//! use obsidian_parser::prelude::*;
33//! use serde::Deserialize;
34//!
35//! #[derive(Clone, Default, Deserialize)]
36//! struct NoteProperties {
37//!     created: String,
38//!     tags: Vec<String>,
39//!     priority: u8,
40//! }
41//!
42//! let vault: Vault<NoteProperties> = Vault::open("/path/to/vault").unwrap();
43//!
44//! // Access custom properties
45//! for file in &vault.files {
46//!     let properties = file.properties().unwrap();
47//!
48//!     println!(
49//!         "Note created at {} with tags: {:?}",
50//!         properties.created,
51//!         properties.tags
52//!     );
53//! }
54//! ```
55//!
56//! ## Building knowledge graphs (requires petgraph feature)
57//! ```no_run
58//! #[cfg(feature = "petgraph")]
59//! {
60//!     use obsidian_parser::prelude::*;
61//!     use petgraph::dot::{Dot, Config};
62//!
63//!     let vault = Vault::open_default("/path/to/vault").unwrap();
64//!     
65//!     // Build directed graph
66//!     let graph = vault.get_digraph();
67//!     println!("Graph visualization:\n{:?}",
68//!         Dot::with_config(&graph, &[Config::EdgeNoLabel])
69//!     );
70//!     
71//!     // Analyze connectivity
72//!     let components = petgraph::algo::connected_components(&graph);
73//!     println!("Found {} connected components in knowledge base", components);
74//! }
75//! ```
76//!
77//! ## Using a custom [`ObFile`] (example with [`ObFileInMemory`](crate::prelude::ObFileInMemory))
78//! ```no_run
79//! use obsidian_parser::prelude::*;
80//! use serde::Deserialize;
81//!
82//! #[derive(Clone, Default, Deserialize)]
83//! struct NoteProperties {
84//!     created: String,
85//!     tags: Vec<String>,
86//!     priority: u8,
87//! }
88//!
89//! let vault: Vault<NoteProperties, ObFileInMemory<NoteProperties>> = Vault::open("/path/to/vault").unwrap();
90//! ```
91
92#[cfg(feature = "petgraph")]
93pub mod vault_petgraph;
94
95#[cfg(test)]
96mod vault_test;
97
98use crate::obfile::ObFile;
99use crate::{error::Error, prelude::ObFileOnDisk};
100use serde::de::DeserializeOwned;
101use std::collections::HashSet;
102use std::{
103    collections::HashMap,
104    marker::PhantomData,
105    path::{Path, PathBuf},
106};
107use walkdir::{DirEntry, WalkDir};
108
109fn is_hidden(entry: &DirEntry) -> bool {
110    entry
111        .file_name()
112        .to_str()
113        .is_some_and(|s| s.starts_with('.'))
114}
115
116/// Represents an entire Obsidian vault
117///
118/// Contains all parsed notes and metadata about the vault. Uses [`ObFileOnDisk`] by default
119/// which is optimized for memory efficiency in large vaults.
120///
121/// # Type Parameters
122/// - `T`: Type for frontmatter properties
123/// - `F`: File representation type
124#[derive(Debug, Default, PartialEq, Eq, Clone)]
125pub struct Vault<T, F = ObFileOnDisk<T>>
126where
127    T: DeserializeOwned + Clone,
128    F: ObFile<T> + Send,
129{
130    /// All files in the vault
131    pub files: Vec<F>,
132
133    /// Path to vault root directory
134    pub path: PathBuf,
135
136    /// Phantom data
137    pub phantom: PhantomData<T>,
138}
139
140fn check_vault(path: impl AsRef<Path>) -> Result<(), Error> {
141    let path_buf = path.as_ref().to_path_buf();
142
143    if !path_buf.is_dir() {
144        #[cfg(feature = "logging")]
145        log::error!("Path is not directory: {}", path_buf.display());
146
147        return Err(Error::IsNotDir(path_buf));
148    }
149
150    Ok(())
151}
152
153fn get_files_for_parse<T: FromIterator<DirEntry>>(path: impl AsRef<Path>) -> T {
154    WalkDir::new(path)
155        .min_depth(1)
156        .into_iter()
157        .filter_entry(|x| !is_hidden(x))
158        .filter_map(Result::ok)
159        .filter(|x| {
160            x.path()
161                .extension()
162                .is_some_and(|ext| ext.eq_ignore_ascii_case("md"))
163        })
164        .collect()
165}
166
167impl<T, F> Vault<T, F>
168where
169    T: DeserializeOwned + Clone,
170    F: ObFile<T> + Send,
171{
172    #[cfg(feature = "rayon")]
173    fn parse_files<L>(files: &[DirEntry], f: L) -> Vec<F>
174    where
175        L: Fn(&DirEntry) -> Option<F> + Sync + Send,
176    {
177        use rayon::prelude::*;
178
179        files.into_par_iter().filter_map(f).collect()
180    }
181
182    #[cfg(not(feature = "rayon"))]
183    fn parse_files<L>(files: &[DirEntry], f: L) -> Vec<F>
184    where
185        L: Fn(&DirEntry) -> Option<F>,
186    {
187        files.into_iter().filter_map(f).collect()
188    }
189
190    /// Opens and parses an Obsidian vault
191    ///
192    /// Recursively scans the directory for Markdown files (.md) and parses them.
193    /// Uses [`ObFileOnDisk`] by default which is more memory efficient than [`ObFileInMemory`](crate::prelude::ObFileInMemory).
194    ///
195    /// # Arguments
196    /// * `path` - Path to the vault directory
197    ///
198    /// # Errors
199    /// Returns `Error` if:
200    /// - Path doesn't exist or isn't a directory
201    ///
202    /// Files that fail parsing are skipped
203    ///
204    /// # Memory Considerations
205    /// For vaults with 1000+ notes, prefer [`ObFileOnDisk`] (default) over [`ObFileInMemory`](crate::prelude::ObFileInMemory) as it:
206    /// 1. Uses 90%+ less memory upfront
207    /// 2. Only loads file content when accessed
208    /// 3. Scales better for large knowledge bases
209    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self, Error> {
210        let path_buf = path.as_ref().to_path_buf();
211
212        #[cfg(feature = "logging")]
213        log::debug!("Opening vault at: {}", path_buf.display());
214
215        check_vault(&path)?;
216        let files_for_parse: Vec<_> = get_files_for_parse(&path);
217
218        #[cfg(feature = "logging")]
219        log::debug!("Found {} markdown files to parse", files_for_parse.len());
220
221        #[allow(unused_variables)]
222        let files = Self::parse_files(&files_for_parse, |file| match F::from_file(file.path()) {
223            Ok(file) => Some(file),
224            Err(e) => {
225                #[cfg(feature = "logging")]
226                log::warn!("Failed to parse {}: {}", file.path().display(), e);
227
228                None
229            }
230        });
231
232        #[cfg(feature = "logging")]
233        log::info!("Parsed {} files", files.len());
234
235        Ok(Self {
236            files,
237            path: path_buf,
238            phantom: PhantomData,
239        })
240    }
241
242    /// Returns duplicated note name
243    ///
244    /// # Other
245    /// See [`check_unique_note_name`](Vault::check_unique_note_name)
246    #[must_use]
247    pub fn get_duplicates_notes(&self) -> Vec<String> {
248        #[cfg(feature = "logging")]
249        log::debug!(
250            "Get duplicates notes in {} ({} files)",
251            self.path.display(),
252            self.files.len()
253        );
254
255        let mut seens_notes = HashSet::new();
256        let mut duplicated_notes = Vec::new();
257
258        #[allow(
259            clippy::missing_panics_doc,
260            clippy::unwrap_used,
261            reason = "In any case, we will have a path to the files"
262        )]
263        for name_note in self.files.iter().map(|x| x.note_name().unwrap()) {
264            if !seens_notes.insert(name_note.clone()) {
265                #[cfg(feature = "logging")]
266                log::trace!("Found duplicate: {name_note}");
267
268                duplicated_notes.push(name_note);
269            }
270        }
271
272        #[cfg(feature = "logging")]
273        if !duplicated_notes.is_empty() {
274            log::warn!("Found {} duplicate filenames", duplicated_notes.len());
275        }
276
277        duplicated_notes
278    }
279
280    /// Checks if all note filenames in the vault are unique
281    ///
282    /// **Critical for graph operations** where notes are identified by name.
283    /// Always run this before calling [`get_digraph`](Vault::get_digraph) or [`get_ungraph`](Vault::get_ungraph).
284    ///
285    /// # Returns
286    /// `true` if all filenames are unique, `false` otherwise
287    ///
288    /// # Performance
289    /// Operates in O(n) time - safe for large vaults
290    ///
291    /// # Other
292    /// See [`get_duplicates_notes`](Vault::get_duplicates_notes)
293    #[must_use]
294    pub fn check_unique_note_name(&self) -> bool {
295        self.get_duplicates_notes().is_empty()
296    }
297}
298
299#[allow(clippy::implicit_hasher)]
300impl Vault<HashMap<String, serde_yml::Value>, ObFileOnDisk> {
301    /// Opens vault using default properties ([`HashMap`]) and [`ObFileOnDisk`] storage
302    ///
303    /// Recommended for most use cases due to its memory efficiency
304    ///
305    /// # Errors
306    /// Returns `Error` if:
307    /// - Path doesn't exist or isn't a directory
308    pub fn open_default<P: AsRef<Path>>(path: P) -> Result<Self, Error> {
309        Self::open(path)
310    }
311}
312
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{test_utils::init_test_logger, vault::vault_test::create_test_vault};
    use std::fs::File;

    /// Opening the fixture vault yields one parsed file per fixture note.
    #[test]
    fn open() {
        init_test_logger();
        let (vault_path, vault_files) = create_test_vault().unwrap();
        let vault = Vault::open_default(vault_path.path()).unwrap();

        assert_eq!(vault.files.len(), vault_files.len());
        assert_eq!(vault.path, vault_path.path());
    }

    /// Opening a regular file instead of a directory is rejected with
    /// `Error::IsNotDir`.
    ///
    /// The error is asserted explicitly rather than via `#[should_panic]`, so
    /// that a panic while building the fixture cannot make the test pass.
    #[test]
    fn open_not_dir() {
        init_test_logger();
        let (vault_path, _) = create_test_vault().unwrap();
        let path_to_file = vault_path.path().join("main.md");
        assert!(path_to_file.is_file());

        let result = Vault::open_default(&path_to_file);
        assert!(matches!(
            &result,
            Err(Error::IsNotDir(path)) if *path == path_to_file
        ));
    }

    /// Files without the `.md` extension are ignored when opening a vault.
    #[test]
    fn open_with_extra_files() {
        init_test_logger();
        let (vault_path, vault_files) = create_test_vault().unwrap();
        File::create(vault_path.path().join("extra_file.not_md")).unwrap();

        let vault = Vault::open_default(vault_path.path()).unwrap();

        assert_eq!(vault.files.len(), vault_files.len());
        assert_eq!(vault.path, vault_path.path());
    }

    /// The fixture vault has unique note names; pushing a copy of an existing
    /// note makes the uniqueness check fail.
    #[test]
    fn check_unique_note_name() {
        init_test_logger();
        let (vault_path, _) = create_test_vault().unwrap();

        let mut vault = Vault::open_default(vault_path.path()).unwrap();
        assert!(vault.check_unique_note_name());

        let duplicate = vault.files.first().unwrap().clone();
        vault.files.push(duplicate);
        assert!(!vault.check_unique_note_name());
    }
}
363}