1use crate::Result;
33use ankit::AnkiClient;
34use serde::Serialize;
35use std::collections::HashMap;
36
/// Strategy for choosing which note survives within a group of duplicates.
///
/// The engine sorts each duplicate group according to the strategy and keeps
/// the note that ends up first; every other note in the group is reported as
/// a duplicate.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub enum KeepStrategy {
    /// Keep the note with the lowest note ID (the default).
    #[default]
    First,
    /// Keep the note with the highest note ID.
    Last,
    /// Keep the note with the most non-blank fields; ties go to the lowest
    /// note ID.
    MostContent,
    /// Keep the note with the most tags; ties go to the lowest note ID.
    MostTags,
}
50
51#[derive(Debug, Clone)]
53pub struct DedupeQuery {
54 pub search: String,
56 pub key_field: String,
58 pub keep: KeepStrategy,
60}
61
62#[derive(Debug, Clone, Serialize)]
64pub struct DuplicateGroup {
65 pub key_value: String,
67 pub keep_note_id: i64,
69 pub duplicate_note_ids: Vec<i64>,
71}
72
/// Per-note facts the engine needs to rank notes inside a duplicate group.
#[derive(Debug, Clone)]
struct NoteForDedupe {
    /// ID of the note.
    note_id: i64,
    /// Number of fields whose value is not blank after trimming.
    non_empty_count: usize,
    /// Number of tags attached to the note.
    tag_count: usize,
}
80
81#[derive(Debug, Clone, Default, Serialize)]
83pub struct DedupeReport {
84 pub groups_found: usize,
86 pub deleted: usize,
88 pub kept: usize,
90 pub details: Vec<DuplicateGroup>,
92}
93
94#[derive(Debug)]
96pub struct DeduplicateEngine<'a> {
97 client: &'a AnkiClient,
98}
99
100impl<'a> DeduplicateEngine<'a> {
101 pub(crate) fn new(client: &'a AnkiClient) -> Self {
102 Self { client }
103 }
104
105 pub async fn find_duplicates(&self, query: &DedupeQuery) -> Result<Vec<DuplicateGroup>> {
137 let note_ids = self.client.notes().find(&query.search).await?;
138
139 if note_ids.is_empty() {
140 return Ok(Vec::new());
141 }
142
143 let note_infos = self.client.notes().info(¬e_ids).await?;
144
145 let mut groups: HashMap<String, Vec<NoteForDedupe>> = HashMap::new();
147
148 for info in note_infos {
149 let key_value = info
151 .fields
152 .get(&query.key_field)
153 .map(|f| normalize_key(&f.value))
154 .unwrap_or_default();
155
156 if key_value.is_empty() {
158 continue;
159 }
160
161 let non_empty_count = info
163 .fields
164 .values()
165 .filter(|f| !f.value.trim().is_empty())
166 .count();
167
168 groups.entry(key_value).or_default().push(NoteForDedupe {
169 note_id: info.note_id,
170 non_empty_count,
171 tag_count: info.tags.len(),
172 });
173 }
174
175 let mut result = Vec::new();
177
178 for (key, mut notes) in groups {
179 if notes.len() <= 1 {
180 continue;
181 }
182
183 match query.keep {
185 KeepStrategy::First => {
186 notes.sort_by_key(|n| n.note_id);
187 }
188 KeepStrategy::Last => {
189 notes.sort_by_key(|n| std::cmp::Reverse(n.note_id));
190 }
191 KeepStrategy::MostContent => {
192 notes.sort_by(|a, b| {
194 b.non_empty_count
195 .cmp(&a.non_empty_count)
196 .then_with(|| a.note_id.cmp(&b.note_id))
197 });
198 }
199 KeepStrategy::MostTags => {
200 notes.sort_by(|a, b| {
202 b.tag_count
203 .cmp(&a.tag_count)
204 .then_with(|| a.note_id.cmp(&b.note_id))
205 });
206 }
207 }
208
209 let keep_note_id = notes[0].note_id;
210 let duplicate_note_ids: Vec<i64> = notes[1..].iter().map(|n| n.note_id).collect();
211
212 result.push(DuplicateGroup {
213 key_value: key,
214 keep_note_id,
215 duplicate_note_ids,
216 });
217 }
218
219 result.sort_by(|a, b| a.key_value.cmp(&b.key_value));
221
222 Ok(result)
223 }
224
225 pub async fn preview(&self, query: &DedupeQuery) -> Result<DedupeReport> {
229 let groups = self.find_duplicates(query).await?;
230
231 let deleted: usize = groups.iter().map(|g| g.duplicate_note_ids.len()).sum();
232
233 Ok(DedupeReport {
234 groups_found: groups.len(),
235 deleted,
236 kept: groups.len(),
237 details: groups,
238 })
239 }
240
241 pub async fn remove_duplicates(&self, query: &DedupeQuery) -> Result<DedupeReport> {
269 let groups = self.find_duplicates(query).await?;
270
271 if groups.is_empty() {
272 return Ok(DedupeReport::default());
273 }
274
275 let to_delete: Vec<i64> = groups
277 .iter()
278 .flat_map(|g| g.duplicate_note_ids.iter().copied())
279 .collect();
280
281 let deleted_count = to_delete.len();
282 let kept_count = groups.len();
283
284 if !to_delete.is_empty() {
286 self.client.notes().delete(&to_delete).await?;
287 }
288
289 Ok(DedupeReport {
290 groups_found: groups.len(),
291 deleted: deleted_count,
292 kept: kept_count,
293 details: groups,
294 })
295 }
296
297 pub async fn delete_notes(&self, note_ids: &[i64]) -> Result<usize> {
306 if note_ids.is_empty() {
307 return Ok(0);
308 }
309
310 self.client.notes().delete(note_ids).await?;
311 Ok(note_ids.len())
312 }
313}
314
/// Normalizes a field value so that equivalent keys compare equal.
///
/// Everything from a `<` up to the next `>` is removed (an unclosed `<`
/// swallows the rest of the input and a stray `>` is dropped). Runs of
/// whitespace are then collapsed to single spaces, leading and trailing
/// whitespace is removed, and the result is lowercased.
fn normalize_key(value: &str) -> String {
    let mut inside_tag = false;

    // Keep only the characters that sit outside angle-bracket markup.
    let visible: String = value
        .chars()
        .filter(|&ch| match ch {
            '<' => {
                inside_tag = true;
                false
            }
            '>' => {
                inside_tag = false;
                false
            }
            _ => !inside_tag,
        })
        .collect();

    let words: Vec<&str> = visible.split_whitespace().collect();
    words.join(" ").to_lowercase()
}
339
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every `(input, expected)` pair normalizes as expected.
    fn assert_normalizes(cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(normalize_key(input), expected, "input: {:?}", input);
        }
    }

    /// Asserts that `json` contains every fragment in `fragments`.
    fn assert_json_contains(json: &str, fragments: &[&str]) {
        for &fragment in fragments {
            assert!(json.contains(fragment), "missing {:?} in {}", fragment, json);
        }
    }

    #[test]
    fn test_normalize_key() {
        assert_normalizes(&[
            ("hello", "hello"),
            ("Hello World", "hello world"),
            (" hello world ", "hello world"),
            ("<b>hello</b>", "hello"),
            ("<div>Hello <span>World</span></div>", "hello world"),
        ]);
    }

    #[test]
    fn test_normalize_key_empty() {
        assert_normalizes(&[("", ""), (" ", ""), ("<>", "")]);
    }

    #[test]
    fn test_normalize_key_html_attributes() {
        assert_normalizes(&[
            ("<a href=\"url\">Link</a>", "link"),
            ("<div class=\"foo\" id=\"bar\">Content</div>", "content"),
        ]);
    }

    #[test]
    fn test_normalize_key_unclosed_tags() {
        assert_normalizes(&[("<p>Unclosed", "unclosed"), ("Text<br>More", "textmore")]);
    }

    #[test]
    fn test_normalize_key_newlines() {
        assert_normalizes(&[
            ("hello\nworld", "hello world"),
            ("hello\r\nworld", "hello world"),
            ("hello\tworld", "hello world"),
        ]);
    }

    #[test]
    fn test_keep_strategy_default() {
        assert!(matches!(KeepStrategy::default(), KeepStrategy::First));
    }

    #[test]
    fn test_dedupe_query_construction() {
        let DedupeQuery {
            search,
            key_field,
            keep,
        } = DedupeQuery {
            search: "deck:Test".to_string(),
            key_field: "Front".to_string(),
            keep: KeepStrategy::MostContent,
        };

        assert_eq!(search, "deck:Test");
        assert_eq!(key_field, "Front");
        assert!(matches!(keep, KeepStrategy::MostContent));
    }

    #[test]
    fn test_duplicate_group_construction() {
        let group = DuplicateGroup {
            key_value: "hello".to_string(),
            keep_note_id: 1000,
            duplicate_note_ids: vec![1001, 1002, 1003],
        };

        assert_eq!(group.key_value, "hello");
        assert_eq!(group.keep_note_id, 1000);
        assert_eq!(group.duplicate_note_ids.len(), 3);
        assert!(group.duplicate_note_ids.contains(&1001));
    }

    #[test]
    fn test_duplicate_group_serialization() {
        let json = serde_json::to_string(&DuplicateGroup {
            key_value: "test".to_string(),
            keep_note_id: 123,
            duplicate_note_ids: vec![456, 789],
        })
        .unwrap();

        assert_json_contains(
            &json,
            &[
                "\"key_value\":\"test\"",
                "\"keep_note_id\":123",
                "\"duplicate_note_ids\":[456,789]",
            ],
        );
    }

    #[test]
    fn test_dedupe_report_default() {
        let report = DedupeReport::default();

        assert_eq!(
            (report.groups_found, report.deleted, report.kept),
            (0, 0, 0)
        );
        assert!(report.details.is_empty());
    }

    #[test]
    fn test_dedupe_report_construction() {
        let report = DedupeReport {
            groups_found: 1,
            deleted: 2,
            kept: 1,
            details: vec![DuplicateGroup {
                key_value: "word".to_string(),
                keep_note_id: 100,
                duplicate_note_ids: vec![101, 102],
            }],
        };

        assert_eq!(report.groups_found, 1);
        assert_eq!(report.deleted, 2);
        assert_eq!(report.kept, 1);
        assert_eq!(report.details.len(), 1);
    }

    #[test]
    fn test_dedupe_report_serialization() {
        let json = serde_json::to_string(&DedupeReport {
            groups_found: 2,
            deleted: 5,
            kept: 2,
            details: vec![],
        })
        .unwrap();

        assert_json_contains(
            &json,
            &["\"groups_found\":2", "\"deleted\":5", "\"kept\":2"],
        );
    }

    #[test]
    fn test_note_for_dedupe_construction() {
        let NoteForDedupe {
            note_id,
            non_empty_count,
            tag_count,
        } = NoteForDedupe {
            note_id: 12345,
            non_empty_count: 3,
            tag_count: 2,
        };

        assert_eq!(note_id, 12345);
        assert_eq!(non_empty_count, 3);
        assert_eq!(tag_count, 2);
    }

    #[test]
    fn test_keep_strategy_variants() {
        // Constructing every variant is the whole check: it fails to compile
        // if one is renamed or removed.
        let _variants = [
            KeepStrategy::First,
            KeepStrategy::Last,
            KeepStrategy::MostContent,
            KeepStrategy::MostTags,
        ];
    }
}