Skip to main content

ankit_engine/
deduplicate.rs

//! Duplicate note detection and removal.
//!
//! This module provides workflows for finding and removing duplicate notes
//! based on a key field.
//!
//! # Example
//!
//! ```no_run
//! use ankit_engine::Engine;
//! use ankit_engine::deduplicate::{DedupeQuery, KeepStrategy};
//!
//! # async fn example() -> ankit_engine::Result<()> {
//! let engine = Engine::new();
//!
//! // Find duplicates based on the "Front" field
//! let query = DedupeQuery {
//!     search: "deck:Japanese".to_string(),
//!     key_field: "Front".to_string(),
//!     keep: KeepStrategy::First,
//! };
//!
//! let groups = engine.deduplicate().find_duplicates(&query).await?;
//! println!("Found {} duplicate groups", groups.len());
//!
//! // Remove duplicates (keeps the first, deletes the rest)
//! let report = engine.deduplicate().remove_duplicates(&query).await?;
//! println!("Deleted {} duplicate notes", report.deleted);
//! # Ok(())
//! # }
//! ```
31
32use crate::Result;
33use ankit::AnkiClient;
34use serde::Serialize;
35use std::collections::HashMap;
36
/// Strategy for which duplicate to keep.
///
/// The strategy is applied separately to every group of notes that share a
/// key value: the note the strategy ranks first is kept, and all other notes
/// in the group are reported as duplicates.
///
/// The type is a plain fieldless enum, so it is `Copy` and can be compared
/// with `==` or used as a hash-map key.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub enum KeepStrategy {
    /// Keep the first note found (by note ID, oldest).
    #[default]
    First,
    /// Keep the last note found (by note ID, newest).
    Last,
    /// Keep the note with the most non-empty fields.
    MostContent,
    /// Keep the note with the most tags.
    MostTags,
}
50
51/// Query parameters for finding duplicates.
52#[derive(Debug, Clone)]
53pub struct DedupeQuery {
54    /// Anki search query to filter notes.
55    pub search: String,
56    /// Field name to use as the duplicate key.
57    pub key_field: String,
58    /// Strategy for which duplicate to keep.
59    pub keep: KeepStrategy,
60}
61
62/// A group of duplicate notes.
63#[derive(Debug, Clone, Serialize)]
64pub struct DuplicateGroup {
65    /// The key value that these notes share.
66    pub key_value: String,
67    /// The note ID that will be kept.
68    pub keep_note_id: i64,
69    /// The note IDs that are duplicates (to be deleted).
70    pub duplicate_note_ids: Vec<i64>,
71}
72
/// Per-note facts needed to rank the notes inside one duplicate group.
#[derive(Debug, Clone)]
struct NoteForDedupe {
    /// Anki note ID.
    note_id: i64,
    /// How many of the note's fields are non-empty after trimming whitespace.
    non_empty_count: usize,
    /// How many tags the note carries.
    tag_count: usize,
}
80
81/// Report from a deduplication operation.
82#[derive(Debug, Clone, Default, Serialize)]
83pub struct DedupeReport {
84    /// Number of duplicate groups found.
85    pub groups_found: usize,
86    /// Number of notes deleted.
87    pub deleted: usize,
88    /// Number of notes kept (one per group).
89    pub kept: usize,
90    /// Details about deleted notes per key.
91    pub details: Vec<DuplicateGroup>,
92}
93
94/// Deduplication workflow engine.
95#[derive(Debug)]
96pub struct DeduplicateEngine<'a> {
97    client: &'a AnkiClient,
98}
99
100impl<'a> DeduplicateEngine<'a> {
101    pub(crate) fn new(client: &'a AnkiClient) -> Self {
102        Self { client }
103    }
104
105    /// Find groups of duplicate notes.
106    ///
107    /// Notes are considered duplicates if they have the same value in the key field.
108    /// Returns groups where each group has the note to keep and notes to delete.
109    ///
110    /// # Arguments
111    ///
112    /// * `query` - Query parameters specifying search filter, key field, and keep strategy
113    ///
114    /// # Example
115    ///
116    /// ```no_run
117    /// # use ankit_engine::Engine;
118    /// # use ankit_engine::deduplicate::{DedupeQuery, KeepStrategy};
119    /// # async fn example() -> ankit_engine::Result<()> {
120    /// let engine = Engine::new();
121    ///
122    /// let query = DedupeQuery {
123    ///     search: "deck:Vocabulary".to_string(),
124    ///     key_field: "Word".to_string(),
125    ///     keep: KeepStrategy::MostContent,
126    /// };
127    ///
128    /// let groups = engine.deduplicate().find_duplicates(&query).await?;
129    /// for group in &groups {
130    ///     println!("'{}': keep {}, delete {:?}",
131    ///         group.key_value, group.keep_note_id, group.duplicate_note_ids);
132    /// }
133    /// # Ok(())
134    /// # }
135    /// ```
136    pub async fn find_duplicates(&self, query: &DedupeQuery) -> Result<Vec<DuplicateGroup>> {
137        let note_ids = self.client.notes().find(&query.search).await?;
138
139        if note_ids.is_empty() {
140            return Ok(Vec::new());
141        }
142
143        let note_infos = self.client.notes().info(&note_ids).await?;
144
145        // Group notes by key field value
146        let mut groups: HashMap<String, Vec<NoteForDedupe>> = HashMap::new();
147
148        for info in note_infos {
149            // Get the key field value
150            let key_value = info
151                .fields
152                .get(&query.key_field)
153                .map(|f| normalize_key(&f.value))
154                .unwrap_or_default();
155
156            // Skip notes with empty key
157            if key_value.is_empty() {
158                continue;
159            }
160
161            // Count non-empty fields
162            let non_empty_count = info
163                .fields
164                .values()
165                .filter(|f| !f.value.trim().is_empty())
166                .count();
167
168            groups.entry(key_value).or_default().push(NoteForDedupe {
169                note_id: info.note_id,
170                non_empty_count,
171                tag_count: info.tags.len(),
172            });
173        }
174
175        // Convert to DuplicateGroups (only groups with more than one note)
176        let mut result = Vec::new();
177
178        for (key, mut notes) in groups {
179            if notes.len() <= 1 {
180                continue;
181            }
182
183            // Sort notes based on keep strategy
184            match query.keep {
185                KeepStrategy::First => {
186                    notes.sort_by_key(|n| n.note_id);
187                }
188                KeepStrategy::Last => {
189                    notes.sort_by_key(|n| std::cmp::Reverse(n.note_id));
190                }
191                KeepStrategy::MostContent => {
192                    // Sort by non-empty count descending, then by note_id ascending for ties
193                    notes.sort_by(|a, b| {
194                        b.non_empty_count
195                            .cmp(&a.non_empty_count)
196                            .then_with(|| a.note_id.cmp(&b.note_id))
197                    });
198                }
199                KeepStrategy::MostTags => {
200                    // Sort by tag count descending, then by note_id ascending for ties
201                    notes.sort_by(|a, b| {
202                        b.tag_count
203                            .cmp(&a.tag_count)
204                            .then_with(|| a.note_id.cmp(&b.note_id))
205                    });
206                }
207            }
208
209            let keep_note_id = notes[0].note_id;
210            let duplicate_note_ids: Vec<i64> = notes[1..].iter().map(|n| n.note_id).collect();
211
212            result.push(DuplicateGroup {
213                key_value: key,
214                keep_note_id,
215                duplicate_note_ids,
216            });
217        }
218
219        // Sort by key for consistent output
220        result.sort_by(|a, b| a.key_value.cmp(&b.key_value));
221
222        Ok(result)
223    }
224
225    /// Preview deduplication without making changes.
226    ///
227    /// Returns the same information as `find_duplicates` but formatted as a report.
228    pub async fn preview(&self, query: &DedupeQuery) -> Result<DedupeReport> {
229        let groups = self.find_duplicates(query).await?;
230
231        let deleted: usize = groups.iter().map(|g| g.duplicate_note_ids.len()).sum();
232
233        Ok(DedupeReport {
234            groups_found: groups.len(),
235            deleted,
236            kept: groups.len(),
237            details: groups,
238        })
239    }
240
241    /// Remove duplicate notes.
242    ///
243    /// Keeps one note per duplicate group (based on keep strategy) and deletes the rest.
244    ///
245    /// # Arguments
246    ///
247    /// * `query` - Query parameters specifying search filter, key field, and keep strategy
248    ///
249    /// # Example
250    ///
251    /// ```no_run
252    /// # use ankit_engine::Engine;
253    /// # use ankit_engine::deduplicate::{DedupeQuery, KeepStrategy};
254    /// # async fn example() -> ankit_engine::Result<()> {
255    /// let engine = Engine::new();
256    ///
257    /// let query = DedupeQuery {
258    ///     search: "deck:Vocabulary tag:imported".to_string(),
259    ///     key_field: "Word".to_string(),
260    ///     keep: KeepStrategy::MostContent,
261    /// };
262    ///
263    /// let report = engine.deduplicate().remove_duplicates(&query).await?;
264    /// println!("Deleted {} duplicates, kept {}", report.deleted, report.kept);
265    /// # Ok(())
266    /// # }
267    /// ```
268    pub async fn remove_duplicates(&self, query: &DedupeQuery) -> Result<DedupeReport> {
269        let groups = self.find_duplicates(query).await?;
270
271        if groups.is_empty() {
272            return Ok(DedupeReport::default());
273        }
274
275        // Collect all note IDs to delete
276        let to_delete: Vec<i64> = groups
277            .iter()
278            .flat_map(|g| g.duplicate_note_ids.iter().copied())
279            .collect();
280
281        let deleted_count = to_delete.len();
282        let kept_count = groups.len();
283
284        // Delete the duplicates
285        if !to_delete.is_empty() {
286            self.client.notes().delete(&to_delete).await?;
287        }
288
289        Ok(DedupeReport {
290            groups_found: groups.len(),
291            deleted: deleted_count,
292            kept: kept_count,
293            details: groups,
294        })
295    }
296
297    /// Delete specific duplicate notes.
298    ///
299    /// Use this after reviewing the results from `find_duplicates` to selectively
300    /// delete duplicates.
301    ///
302    /// # Arguments
303    ///
304    /// * `note_ids` - Note IDs to delete
305    pub async fn delete_notes(&self, note_ids: &[i64]) -> Result<usize> {
306        if note_ids.is_empty() {
307            return Ok(0);
308        }
309
310        self.client.notes().delete(note_ids).await?;
311        Ok(note_ids.len())
312    }
313}
314
/// Normalize a key value for comparison.
///
/// Strips HTML, collapses whitespace, and converts to lowercase, so that two
/// field values that differ only in markup, spacing, or letter case produce
/// the same key.
///
/// Details:
///
/// * Everything between `<` and `>` is dropped. No separator is inserted in
///   place of a tag, so `"Text<br>More"` becomes `"textmore"`.
/// * Outside of tags, common HTML character references are decoded:
///   `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`, `&nbsp;` and numeric
///   references such as `&#39;` or `&#x27;`. Unrecognized references are left
///   untouched. Decoded characters are treated as text, never as markup.
/// * Runs of whitespace (including non-breaking spaces) collapse to a single
///   space, and leading/trailing whitespace is removed.
fn normalize_key(value: &str) -> String {
    let mut stripped = String::with_capacity(value.len());
    let mut in_tag = false;
    let mut rest = value;

    // Single pass: a decoded `&lt;` is pushed straight to the output, so it can
    // never be mistaken for the start of a tag, and `&amp;lt;` decodes only once.
    while let Some(ch) = rest.chars().next() {
        match ch {
            '<' => in_tag = true,
            '>' => in_tag = false,
            _ if in_tag => {}
            '&' => match decode_entity(rest) {
                Some((decoded, consumed)) => {
                    stripped.push(decoded);
                    rest = &rest[consumed..];
                    continue;
                }
                // Not a reference we know: keep the ampersand as plain text.
                None => stripped.push('&'),
            },
            _ => stripped.push(ch),
        }
        rest = &rest[ch.len_utf8()..];
    }

    // Collapse whitespace and trim
    stripped
        .split_whitespace()
        .collect::<Vec<_>>()
        .join(" ")
        .to_lowercase()
}

/// Decode one HTML character reference at the start of `input`.
///
/// `input` must begin with `&`. On success returns the decoded character and
/// the number of bytes the reference occupies (including `&` and `;`).
/// Returns `None` when `input` does not start with a supported reference.
fn decode_entity(input: &str) -> Option<(char, usize)> {
    // Longest supported name is `#x10FFFF`; bounding the search keeps the
    // scan linear even for text full of stray ampersands.
    const MAX_NAME_LEN: usize = 10;

    let body = input.strip_prefix('&')?;
    // `;` is ASCII, so its byte position is always a valid char boundary.
    let end = body
        .bytes()
        .take(MAX_NAME_LEN + 1)
        .position(|byte| byte == b';')?;
    let name = &body[..end];

    let decoded = match name {
        "amp" => '&',
        "lt" => '<',
        "gt" => '>',
        "quot" => '"',
        "apos" => '\'',
        // Whitespace is collapsed afterwards, so a plain space is equivalent.
        "nbsp" => ' ',
        _ => {
            let number = name.strip_prefix('#')?;
            let (radix, digits) = match number
                .strip_prefix('x')
                .or_else(|| number.strip_prefix('X'))
            {
                Some(hex) => (16, hex),
                None => (10, number),
            };
            // Reject empty or signed input that the integer parser would accept.
            if digits.is_empty() || !digits.bytes().all(|b| char::from(b).is_digit(radix)) {
                return None;
            }
            let code = u32::from_str_radix(digits, radix).ok()?;
            char::from_u32(code).filter(|c| *c != '\0')?
        }
    };

    Some((decoded, end + 2))
}
339
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that every `(input, expected)` pair normalizes as expected.
    fn assert_normalizes(cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(normalize_key(input), expected, "input: {:?}", input);
        }
    }

    /// Assert that `json` contains every fragment in `fragments`.
    fn assert_json_contains(json: &str, fragments: &[&str]) {
        for fragment in fragments {
            assert!(
                json.contains(fragment),
                "missing {:?} in {}",
                fragment,
                json
            );
        }
    }

    #[test]
    fn test_normalize_key() {
        assert_normalizes(&[
            ("hello", "hello"),
            ("Hello World", "hello world"),
            ("  hello   world  ", "hello world"),
            ("<b>hello</b>", "hello"),
            ("<div>Hello <span>World</span></div>", "hello world"),
        ]);
    }

    #[test]
    fn test_normalize_key_empty() {
        assert_normalizes(&[("", ""), ("   ", ""), ("<>", "")]);
    }

    #[test]
    fn test_normalize_key_html_attributes() {
        assert_normalizes(&[
            ("<a href=\"url\">Link</a>", "link"),
            ("<div class=\"foo\" id=\"bar\">Content</div>", "content"),
        ]);
    }

    #[test]
    fn test_normalize_key_unclosed_tags() {
        // Tags are removed without leaving a separator behind.
        assert_normalizes(&[("<p>Unclosed", "unclosed"), ("Text<br>More", "textmore")]);
    }

    #[test]
    fn test_normalize_key_newlines() {
        assert_normalizes(&[
            ("hello\nworld", "hello world"),
            ("hello\r\nworld", "hello world"),
            ("hello\tworld", "hello world"),
        ]);
    }

    #[test]
    fn test_keep_strategy_default() {
        assert!(matches!(KeepStrategy::default(), KeepStrategy::First));
    }

    #[test]
    fn test_dedupe_query_construction() {
        let DedupeQuery {
            search,
            key_field,
            keep,
        } = DedupeQuery {
            search: "deck:Test".to_string(),
            key_field: "Front".to_string(),
            keep: KeepStrategy::MostContent,
        };

        assert_eq!(search, "deck:Test");
        assert_eq!(key_field, "Front");
        assert!(matches!(keep, KeepStrategy::MostContent));
    }

    #[test]
    fn test_duplicate_group_construction() {
        let group = DuplicateGroup {
            key_value: "hello".to_string(),
            keep_note_id: 1000,
            duplicate_note_ids: vec![1001, 1002, 1003],
        };

        assert_eq!(group.key_value, "hello");
        assert_eq!(group.keep_note_id, 1000);
        assert_eq!(group.duplicate_note_ids.len(), 3);
        assert!(group.duplicate_note_ids.iter().any(|&id| id == 1001));
    }

    #[test]
    fn test_duplicate_group_serialization() {
        let group = DuplicateGroup {
            key_value: "test".to_string(),
            keep_note_id: 123,
            duplicate_note_ids: vec![456, 789],
        };

        let json = serde_json::to_string(&group).unwrap();
        assert_json_contains(
            &json,
            &[
                "\"key_value\":\"test\"",
                "\"keep_note_id\":123",
                "\"duplicate_note_ids\":[456,789]",
            ],
        );
    }

    #[test]
    fn test_dedupe_report_default() {
        let DedupeReport {
            groups_found,
            deleted,
            kept,
            details,
        } = DedupeReport::default();

        assert_eq!(groups_found, 0);
        assert_eq!(deleted, 0);
        assert_eq!(kept, 0);
        assert!(details.is_empty());
    }

    #[test]
    fn test_dedupe_report_construction() {
        let report = DedupeReport {
            groups_found: 1,
            deleted: 2,
            kept: 1,
            details: vec![DuplicateGroup {
                key_value: "word".to_string(),
                keep_note_id: 100,
                duplicate_note_ids: vec![101, 102],
            }],
        };

        assert_eq!(report.groups_found, 1);
        assert_eq!(report.deleted, 2);
        assert_eq!(report.kept, 1);
        assert_eq!(report.details.len(), 1);
    }

    #[test]
    fn test_dedupe_report_serialization() {
        let report = DedupeReport {
            groups_found: 2,
            deleted: 5,
            kept: 2,
            details: Vec::new(),
        };

        let json = serde_json::to_string(&report).unwrap();
        assert_json_contains(
            &json,
            &["\"groups_found\":2", "\"deleted\":5", "\"kept\":2"],
        );
    }

    #[test]
    fn test_note_for_dedupe_construction() {
        let NoteForDedupe {
            note_id,
            non_empty_count,
            tag_count,
        } = NoteForDedupe {
            note_id: 12345,
            non_empty_count: 3,
            tag_count: 2,
        };

        assert_eq!(note_id, 12345);
        assert_eq!(non_empty_count, 3);
        assert_eq!(tag_count, 2);
    }

    #[test]
    fn test_keep_strategy_variants() {
        // Verify all variants can be constructed
        let _all = [
            KeepStrategy::First,
            KeepStrategy::Last,
            KeepStrategy::MostContent,
            KeepStrategy::MostTags,
        ];
    }
}