Skip to main content

ankit_engine/
media.rs

1//! Media audit and cleanup operations.
2//!
3//! This module provides workflows for auditing media files
4//! and cleaning up orphaned or missing references.
5
6use crate::Result;
7use ankit::AnkiClient;
8use serde::Serialize;
9use std::collections::HashSet;
10
/// Result of a media audit.
///
/// Produced by [`MediaEngine::audit`]. The struct derives `Default`, and the
/// audit starts from the default value and fills fields in as it goes.
#[derive(Debug, Clone, Default, Serialize)]
pub struct MediaAudit {
    /// Total number of media files returned by the media listing.
    pub total_files: usize,
    /// Total size of media files in bytes.
    ///
    /// NOTE(review): nothing in this module assigns this field, so it appears
    /// to stay at its default of `0` — confirm before relying on it.
    pub total_size_bytes: u64,
    /// Media files not referenced by any note.
    ///
    /// This is the list [`MediaEngine::cleanup_orphaned`] deletes from.
    pub orphaned: Vec<String>,
    /// Media references found in note fields whose file is not in the media listing.
    pub missing: Vec<MissingMedia>,
    /// Media file counts broken down by type.
    pub by_type: MediaByType,
}
25
/// A media reference whose target file does not exist.
///
/// Collected in [`MediaAudit::missing`].
#[derive(Debug, Clone, Serialize)]
pub struct MissingMedia {
    /// The note ID referencing this media.
    pub note_id: i64,
    /// The missing filename, as written in the note field.
    pub filename: String,
}
34
/// Media file counts by type.
///
/// Files are classified by the audit from their lowercased filename
/// extension; anything that matches none of the lists below counts as `other`.
#[derive(Debug, Clone, Default, Serialize)]
pub struct MediaByType {
    /// Number of image files (`.jpg`, `.jpeg`, `.png`, `.gif`, `.webp`, `.svg`).
    pub images: usize,
    /// Number of audio files (`.mp3`, `.wav`, `.ogg`, `.m4a`, `.flac`).
    pub audio: usize,
    /// Number of video files (`.mp4`, `.webm`, `.mkv`, `.avi`).
    pub video: usize,
    /// Number of files with any other (or no) extension.
    pub other: usize,
}
47
/// Result of a cleanup operation.
///
/// Returned by [`MediaEngine::cleanup_orphaned`].
#[derive(Debug, Clone, Default, Serialize)]
pub struct CleanupReport {
    /// Number of files deleted.
    ///
    /// For a dry run this is the number of files that *would* be deleted.
    pub files_deleted: usize,
    /// Bytes freed.
    ///
    /// NOTE(review): nothing in this module assigns this field, so it appears
    /// to stay at its default of `0` — confirm before relying on it.
    pub bytes_freed: u64,
    /// Files whose delete request returned an error.
    pub failed: Vec<String>,
}
58
/// Media workflow engine.
///
/// A thin wrapper around a borrowed [`AnkiClient`]; every operation in this
/// module issues its requests through that client.
#[derive(Debug)]
pub struct MediaEngine<'a> {
    /// Client used for all media and note requests.
    client: &'a AnkiClient,
}
64
65impl<'a> MediaEngine<'a> {
66    pub(crate) fn new(client: &'a AnkiClient) -> Self {
67        Self { client }
68    }
69
70    /// Audit media files in the collection.
71    ///
72    /// Identifies orphaned files (not referenced by notes) and
73    /// missing references (notes referencing non-existent files).
74    ///
75    /// # Example
76    ///
77    /// ```no_run
78    /// # use ankit_engine::Engine;
79    /// # async fn example() -> ankit_engine::Result<()> {
80    /// let engine = Engine::new();
81    /// let audit = engine.media().audit().await?;
82    /// println!("Found {} orphaned files", audit.orphaned.len());
83    /// println!("Found {} missing references", audit.missing.len());
84    /// # Ok(())
85    /// # }
86    /// ```
87    pub async fn audit(&self) -> Result<MediaAudit> {
88        // Get all media files
89        let all_files = self.client.media().list("*").await?;
90
91        let mut audit = MediaAudit {
92            total_files: all_files.len(),
93            ..Default::default()
94        };
95
96        // Categorize by type
97        for file in &all_files {
98            let lower = file.to_lowercase();
99            if lower.ends_with(".jpg")
100                || lower.ends_with(".jpeg")
101                || lower.ends_with(".png")
102                || lower.ends_with(".gif")
103                || lower.ends_with(".webp")
104                || lower.ends_with(".svg")
105            {
106                audit.by_type.images += 1;
107            } else if lower.ends_with(".mp3")
108                || lower.ends_with(".wav")
109                || lower.ends_with(".ogg")
110                || lower.ends_with(".m4a")
111                || lower.ends_with(".flac")
112            {
113                audit.by_type.audio += 1;
114            } else if lower.ends_with(".mp4")
115                || lower.ends_with(".webm")
116                || lower.ends_with(".mkv")
117                || lower.ends_with(".avi")
118            {
119                audit.by_type.video += 1;
120            } else {
121                audit.by_type.other += 1;
122            }
123        }
124
125        // Get all notes and check for media references
126        let all_notes = self.client.notes().find("*").await?;
127
128        if all_notes.is_empty() {
129            // No notes, all media is orphaned
130            audit.orphaned = all_files;
131            return Ok(audit);
132        }
133
134        // Get note info in batches
135        let mut referenced_files: HashSet<String> = HashSet::new();
136        let batch_size = 100;
137
138        for chunk in all_notes.chunks(batch_size) {
139            let infos = self.client.notes().info(chunk).await?;
140            for info in infos {
141                for field in info.fields.values() {
142                    // Extract media references from field content
143                    // Matches [sound:filename] and <img src="filename">
144                    for filename in extract_media_references(&field.value) {
145                        referenced_files.insert(filename);
146                    }
147                }
148            }
149        }
150
151        // Find orphaned files
152        let file_set: HashSet<_> = all_files.iter().cloned().collect();
153        audit.orphaned = all_files
154            .iter()
155            .filter(|f| !referenced_files.contains(*f))
156            .cloned()
157            .collect();
158
159        // Find missing references
160        for filename in &referenced_files {
161            if !file_set.contains(filename) {
162                // Find which note references this
163                // For now, just record the filename without the note ID
164                audit.missing.push(MissingMedia {
165                    note_id: 0, // Would need to track this during extraction
166                    filename: filename.clone(),
167                });
168            }
169        }
170
171        Ok(audit)
172    }
173
174    /// Delete orphaned media files.
175    ///
176    /// # Arguments
177    ///
178    /// * `dry_run` - If true, only report what would be deleted
179    ///
180    /// # Example
181    ///
182    /// ```no_run
183    /// # use ankit_engine::Engine;
184    /// # async fn example() -> ankit_engine::Result<()> {
185    /// let engine = Engine::new();
186    ///
187    /// // Preview what would be deleted
188    /// let preview = engine.media().cleanup_orphaned(true).await?;
189    /// println!("Would delete {} files", preview.files_deleted);
190    ///
191    /// // Actually delete
192    /// let report = engine.media().cleanup_orphaned(false).await?;
193    /// # Ok(())
194    /// # }
195    /// ```
196    pub async fn cleanup_orphaned(&self, dry_run: bool) -> Result<CleanupReport> {
197        let audit = self.audit().await?;
198
199        if dry_run || audit.orphaned.is_empty() {
200            return Ok(CleanupReport {
201                files_deleted: audit.orphaned.len(),
202                ..Default::default()
203            });
204        }
205
206        let mut report = CleanupReport::default();
207
208        for filename in audit.orphaned {
209            match self.client.media().delete(&filename).await {
210                Ok(_) => report.files_deleted += 1,
211                Err(_) => report.failed.push(filename),
212            }
213        }
214
215        Ok(report)
216    }
217
218    /// List media files matching a pattern.
219    ///
220    /// # Arguments
221    ///
222    /// * `pattern` - Glob pattern (e.g., "*.mp3", "image_*")
223    ///
224    /// # Example
225    ///
226    /// ```no_run
227    /// # use ankit_engine::Engine;
228    /// # async fn example() -> ankit_engine::Result<()> {
229    /// let engine = Engine::new();
230    /// let audio_files = engine.media().list("*.mp3").await?;
231    /// # Ok(())
232    /// # }
233    /// ```
234    pub async fn list(&self, pattern: &str) -> Result<Vec<String>> {
235        Ok(self.client.media().list(pattern).await?)
236    }
237}
238
239/// Extract media filenames from HTML field content.
240fn extract_media_references(html: &str) -> Vec<String> {
241    let mut files = Vec::new();
242
243    // Match [sound:filename]
244    let sound_pattern = regex_lite::Regex::new(r"\[sound:([^\]]+)\]").unwrap();
245    for cap in sound_pattern.captures_iter(html) {
246        if let Some(m) = cap.get(1) {
247            files.push(m.as_str().to_string());
248        }
249    }
250
251    // Match <img src="filename">
252    let img_pattern = regex_lite::Regex::new(r#"<img[^>]+src="([^"]+)"[^>]*>"#).unwrap();
253    for cap in img_pattern.captures_iter(html) {
254        if let Some(m) = cap.get(1) {
255            let src = m.as_str();
256            // Skip external URLs
257            if !src.starts_with("http://") && !src.starts_with("https://") {
258                files.push(src.to_string());
259            }
260        }
261    }
262
263    files
264}