obsidian_parser/vault/
vault_duplicates.rs

//! Detection of duplicated notes in a vault
2
3use std::collections::HashSet;
4
5use super::Vault;
6use crate::note::Note;
7
8impl<N> Vault<N>
9where
10    N: Note,
11{
12    /// Returns duplicated note name
13    ///
14    /// # Performance
15    /// Operates in O(n log n) time for large vaults
16    ///
17    /// # Other
18    /// See [`have_unique_note_by_name`](Vault::have_duplicates_notes_by_name)
19    #[must_use]
20    #[cfg_attr(feature = "tracing", tracing::instrument(skip(self), fields(path = %self.path.display(), count_notes = %self.notes.len())))]
21    pub fn get_duplicates_notes_by_name(&self) -> Vec<&N> {
22        #[cfg(feature = "tracing")]
23        tracing::debug!("Get duplicates notes by name...");
24
25        let mut duplicated_notes = Vec::new();
26        let mut viewed = HashSet::new();
27        for note in self.notes() {
28            if let Some(note_name) = note.note_name() {
29                let already_have = !viewed.insert(note_name);
30
31                if already_have {
32                    duplicated_notes.push(note);
33                }
34            }
35        }
36
37        #[cfg(feature = "tracing")]
38        tracing::debug!("Found {} duplicated notes", duplicated_notes.len());
39
40        duplicated_notes
41    }
42
43    /// Checks if all note name in the vault are unique
44    ///
45    /// # Returns
46    /// `true` if all note name are unique, `false` otherwise
47    ///
48    /// # Performance
49    /// Operates in O(n) time for large vaults
50    ///
51    /// # Other
52    /// See [`get_duplicates_notes_by_name`](Vault::get_duplicates_notes_by_name)
53    #[must_use]
54    pub fn have_duplicates_notes_by_name(&self) -> bool {
55        !self.get_duplicates_notes_by_name().is_empty()
56    }
57
58    /// Get duplicates by content
59    #[cfg(feature = "digest")]
60    #[cfg_attr(docsrs, doc(cfg(feature = "digest")))]
61    #[cfg_attr(feature = "tracing", tracing::instrument(skip(self), fields(path = %self.path.display(), count_notes = %self.notes.len())))]
62    pub fn get_duplicates_notes_by_content<D>(&self) -> Result<Vec<&N>, N::Error>
63    where
64        D: digest::Digest,
65    {
66        #[cfg(feature = "tracing")]
67        tracing::debug!("Get duplicates notes by content");
68
69        let hashed = {
70            let mut hashed = Vec::with_capacity(self.count_notes());
71            for i in 0..self.count_notes() {
72                let content = self.notes()[i].content()?;
73                let hash = D::digest(content.as_bytes());
74
75                hashed.push(hash);
76            }
77
78            hashed
79        };
80
81        let mut duplicated_notes = Vec::new();
82        let mut viewed = HashSet::new();
83        for (note, hash_content) in self.notes().iter().zip(hashed) {
84            let already_have = !viewed.insert(hash_content);
85
86            if already_have {
87                duplicated_notes.push(note);
88            }
89        }
90
91        #[cfg(feature = "tracing")]
92        tracing::debug!("Found {} duplicated notes", duplicated_notes.len());
93
94        Ok(duplicated_notes)
95    }
96
97    /// Check have duplicates notes by content
98    #[cfg(feature = "digest")]
99    #[cfg_attr(docsrs, doc(cfg(feature = "digest")))]
100    pub fn have_duplicates_notes_by_content<D>(&self) -> Result<bool, N::Error>
101    where
102        D: digest::Digest,
103    {
104        Ok(!self.get_duplicates_notes_by_content::<D>()?.is_empty())
105    }
106}
107
#[cfg(test)]
mod tests {
    use crate::{
        note::{Note, NoteFromFile},
        prelude::{IteratorVaultBuilder, NoteInMemory, VaultBuilder, VaultOptions},
        vault::Vault,
    };
    use serde::de::DeserializeOwned;
    use std::{fs::File, io::Write};
    use tempfile::TempDir;

    /// Builds a vault from the notes found under `temp_dir`, hidden entries included.
    fn build_vault<F>(temp_dir: &TempDir) -> Vault<F>
    where
        F: NoteFromFile,
        F::Error: From<std::io::Error>,
        F::Properties: DeserializeOwned,
    {
        let options = VaultOptions::new(temp_dir);

        VaultBuilder::new(&options)
            .include_hidden(true)
            .into_iter()
            .map(Result::unwrap)
            .build_vault(&options)
    }

    /// Creates a temporary vault holding two notes that share both their file
    /// name (`file.md`) and their content: one at the root, one in `folder/`.
    ///
    /// The `TempDir` is returned alongside the vault so the caller keeps the
    /// directory alive for the duration of the test.
    fn create_vault_with_duplicate_files<F>() -> (Vault<F>, TempDir)
    where
        F: NoteFromFile,
        F::Error: From<std::io::Error>,
        F::Properties: DeserializeOwned,
    {
        let temp_dir = TempDir::new().unwrap();

        let mut root_note = File::create(temp_dir.path().join("file.md")).unwrap();
        root_note.write_all(b"same text").unwrap();

        let nested_dir = temp_dir.path().join("folder");
        std::fs::create_dir(&nested_dir).unwrap();
        let mut nested_note = File::create(nested_dir.join("file.md")).unwrap();
        nested_note.write_all(b"same text").unwrap();

        let vault = build_vault(&temp_dir);

        (vault, temp_dir)
    }

    /// Creates a temporary vault holding a single, empty note (`file.md`).
    ///
    /// The `TempDir` is returned alongside the vault so the caller keeps the
    /// directory alive for the duration of the test.
    fn create_vault_without_duplicate_files<F>() -> (Vault<F>, TempDir)
    where
        F: NoteFromFile,
        F::Error: From<std::io::Error>,
        F::Properties: DeserializeOwned,
    {
        let temp_dir = TempDir::new().unwrap();

        File::create(temp_dir.path().join("file.md")).unwrap();

        let vault = build_vault(&temp_dir);

        (vault, temp_dir)
    }

    #[cfg_attr(feature = "tracing", tracing_test::traced_test)]
    #[test]
    fn with_duplicates_notes_by_name() {
        let (vault, _path) = create_vault_with_duplicate_files::<NoteInMemory>();

        let duplicated_notes: Vec<_> = vault
            .get_duplicates_notes_by_name()
            .into_iter()
            .map(|note| note.note_name().unwrap())
            .collect();

        assert_eq!(duplicated_notes, ["file".to_string()]);
        assert!(vault.have_duplicates_notes_by_name());
    }

    #[cfg_attr(feature = "tracing", tracing_test::traced_test)]
    #[test]
    fn without_duplicates_notes_by_name() {
        let (vault, _path) = create_vault_without_duplicate_files::<NoteInMemory>();

        let duplicated_notes: Vec<_> = vault
            .get_duplicates_notes_by_name()
            .into_iter()
            .map(|note| note.note_name().unwrap())
            .collect();

        assert!(duplicated_notes.is_empty());
        assert!(!vault.have_duplicates_notes_by_name());
    }

    #[cfg_attr(feature = "tracing", tracing_test::traced_test)]
    #[test]
    #[cfg(feature = "digest")]
    fn with_duplicates_notes_by_content() {
        let (vault, _path) = create_vault_with_duplicate_files::<NoteInMemory>();

        let duplicated_notes: Vec<_> = vault
            .get_duplicates_notes_by_content::<sha2::Sha256>()
            .unwrap()
            .into_iter()
            .map(|note| note.note_name().unwrap())
            .collect();

        assert_eq!(duplicated_notes, ["file".to_string()]);

        assert!(
            vault
                .have_duplicates_notes_by_content::<sha2::Sha256>()
                .unwrap()
        );
    }

    #[cfg_attr(feature = "tracing", tracing_test::traced_test)]
    #[test]
    #[cfg(feature = "digest")]
    fn without_duplicates_notes_by_content() {
        let (vault, _path) = create_vault_without_duplicate_files::<NoteInMemory>();

        let duplicated_notes: Vec<_> = vault
            .get_duplicates_notes_by_content::<sha2::Sha256>()
            .unwrap()
            .into_iter()
            .map(|note| note.note_name().unwrap())
            .collect();

        assert!(duplicated_notes.is_empty());
        assert!(
            !vault
                .have_duplicates_notes_by_content::<sha2::Sha256>()
                .unwrap()
        );
    }
}