// engine/scanner.rs

1use crate::error::ScanError;
2use format::sniff_file;
3use hash::{
4    hash_file_md5_hex, hash_file_sha256_hex, hash_file_xxhash64_hex, partial_digest, PartialOptions,
5};
6use metadata::{
7    read_audio_tags, read_docx_core, read_exif, read_pdf_info, read_video_ffprobe, AudioMeta,
8    DocxMeta, ExifMeta, FfprobeOptions, PdfMeta, VideoMeta,
9};
10use output::{
11    utc_from_system_time, write_record_line, AudioInfo, DocxCore, ExifInfo, FileRecord, FormatInfo,
12    HashesInfo, IdentityInfo, MetaInfo, PdfInfo, VideoInfo,
13};
14use phash::phash_u64;
15use std::fs::Metadata;
16use std::io::Write;
17use std::path::{Path, PathBuf};
18use walker::{walk_roots_fn, FilterOptions, WalkMode};
19
20/// Thin handle around [`ScanOptions`] for callers that prefer an object API.
21#[derive(Debug, Clone)]
22pub struct Scanner {
23    pub options: ScanOptions,
24}
25
26impl Scanner {
27    pub fn new(options: ScanOptions) -> Self {
28        Self { options }
29    }
30
31    pub fn scan<W: Write>(&self, writer: &mut W) -> Result<(), ScanError> {
32        scan(&self.options, writer)
33    }
34}
35
36#[derive(Debug, Clone)]
37pub enum XxhashMode {
38    Off,
39    Full,
40    /// Head/tail digest for large-file tiering (same hex field as full xxhash on `FileRecord`).
41    Partial(PartialOptions),
42}
43
44#[derive(Debug, Clone)]
45pub struct ScanOptions {
46    pub roots: Vec<PathBuf>,
47    pub filter: FilterOptions,
48    pub walk_mode: WalkMode,
49    pub xxhash: XxhashMode,
50    pub md5: bool,
51    pub sha256: bool,
52    pub sniff_format: bool,
53    pub metadata_static: bool,
54    pub video_ffprobe: bool,
55    pub phash: bool,
56}
57
58impl Default for ScanOptions {
59    fn default() -> Self {
60        Self {
61            roots: Vec::new(),
62            filter: FilterOptions::default(),
63            walk_mode: WalkMode::Standard,
64            xxhash: XxhashMode::Off,
65            md5: false,
66            sha256: false,
67            sniff_format: false,
68            metadata_static: false,
69            video_ffprobe: false,
70            phash: false,
71        }
72    }
73}
74
/// Snapshot passed to the progress callback of [`scan_with_callbacks`], once per record.
#[derive(Debug, Clone, Default)]
pub struct ScanProgress {
    /// Number of records delivered so far, including the current one.
    pub files_scanned: u64,
    /// Sum of the recorded sizes of all records delivered so far, including the current one.
    pub bytes_scanned: u64,
    /// Path string of the record currently being delivered.
    pub current_path: String,
}
81
82/// Walk `opts.roots` and write one NDJSON line per file to `writer`.
83pub fn scan<W: Write>(opts: &ScanOptions, writer: &mut W) -> Result<(), ScanError> {
84    let mut should_stop = || false;
85    let mut on_progress = |_p: ScanProgress| {};
86    let mut on_record = |rec: FileRecord| -> Result<(), ScanError> {
87        write_record_line(writer, &rec).map_err(ScanError::Io)
88    };
89    let _ = scan_with_callbacks(opts, &mut should_stop, &mut on_progress, &mut on_record)?;
90    Ok(())
91}
92
93pub fn scan_with_callbacks<ShouldStop, OnProgress, OnRecord>(
94    opts: &ScanOptions,
95    should_stop: &mut ShouldStop,
96    on_progress: &mut OnProgress,
97    on_record: &mut OnRecord,
98) -> Result<(u64, u64), ScanError>
99where
100    ShouldStop: FnMut() -> bool,
101    OnProgress: FnMut(ScanProgress),
102    OnRecord: FnMut(FileRecord) -> Result<(), ScanError>,
103{
104    let (tx, rx) = std::sync::mpsc::channel();
105    walk_roots_fn(&opts.roots, &opts.filter, opts.walk_mode, {
106        let tx = tx.clone();
107        let opts = opts.clone();
108        move |path| {
109            let rec = build_record(&path, &opts);
110            let _ = tx.send(rec);
111        }
112    })?;
113    drop(tx);
114    let mut files_scanned: u64 = 0;
115    let mut bytes_scanned: u64 = 0;
116    for rec in rx {
117        if should_stop() {
118            break;
119        }
120        files_scanned += 1;
121        bytes_scanned += rec.identity.size;
122        on_progress(ScanProgress {
123            files_scanned,
124            bytes_scanned,
125            current_path: rec.identity.path.clone(),
126        });
127        on_record(rec)?;
128    }
129    Ok((files_scanned, bytes_scanned))
130}
131
132fn apply_fs_timestamps(r: &mut FileRecord, meta: &Metadata) {
133    if let Ok(t) = meta.modified() {
134        r.identity.modified_at = utc_from_system_time(t);
135    }
136    if let Ok(t) = meta.accessed() {
137        r.identity.accessed_at = utc_from_system_time(t);
138    }
139    if let Ok(t) = meta.created() {
140        r.identity.created_at = utc_from_system_time(t);
141    }
142    apply_file_identity(r, meta);
143}
144
145fn build_record(path: &Path, opts: &ScanOptions) -> FileRecord {
146    let mut r = FileRecord {
147        schema_version: 1,
148        identity: IdentityInfo {
149            path: path.display().to_string(),
150            size: 0,
151            ..Default::default()
152        },
153        hashes: Some(HashesInfo::default()),
154        format: Some(FormatInfo::default()),
155        meta: Some(MetaInfo::default()),
156        error: None,
157    };
158
159    let meta = match std::fs::metadata(path) {
160        Ok(m) => m,
161        Err(e) => {
162            r.error = Some(e.to_string());
163            return r;
164        }
165    };
166    r.identity.size = meta.len();
167    apply_fs_timestamps(&mut r, &meta);
168
169    if opts.sniff_format {
170        match sniff_file(path) {
171            Ok(s) => {
172                let fmt = r.format.get_or_insert_with(FormatInfo::default);
173                fmt.mime = Some(s.media_type);
174                fmt.kind = Some(s.format_short_name);
175                fmt.extension_match = Some(s.extension_matches);
176            }
177            Err(e) => {
178                r.error = Some(format!("format: {e}"));
179            }
180        }
181    }
182
183    match &opts.xxhash {
184        XxhashMode::Off => {
185            if let Some(h) = r.hashes.as_mut() {
186                h.mode = Some("off".to_string());
187            }
188        }
189        XxhashMode::Full => match hash_file_xxhash64_hex(path) {
190            Ok(h) => {
191                let hashes = r.hashes.get_or_insert_with(HashesInfo::default);
192                hashes.xxhash64 = Some(h);
193                hashes.mode = Some("full".to_string());
194            }
195            Err(e) => r.error = Some(format!("xxhash: {e}")),
196        },
197        XxhashMode::Partial(po) => match partial_digest(path, po) {
198            Ok(h) => {
199                let hashes = r.hashes.get_or_insert_with(HashesInfo::default);
200                hashes.xxhash64 = Some(h);
201                hashes.mode = Some("partial".to_string());
202            }
203            Err(e) => r.error = Some(format!("xxhash partial: {e}")),
204        },
205    }
206
207    if opts.sha256 {
208        match hash_file_sha256_hex(path) {
209            Ok(h) => {
210                let hashes = r.hashes.get_or_insert_with(HashesInfo::default);
211                hashes.sha256 = Some(h);
212            }
213            Err(e) => r.error = Some(format!("sha256: {e}")),
214        }
215    }
216    if opts.md5 {
217        match hash_file_md5_hex(path) {
218            Ok(h) => {
219                let hashes = r.hashes.get_or_insert_with(HashesInfo::default);
220                hashes.md5 = Some(h);
221            }
222            Err(e) => r.error = Some(format!("md5: {e}")),
223        }
224    }
225
226    if opts.metadata_static {
227        if is_image_path(path) {
228            if let Some(e) = read_exif(path) {
229                r.meta.get_or_insert_with(MetaInfo::default).exif = Some(map_exif(e));
230            }
231        }
232        if is_audio_path(path) {
233            if let Some(a) = read_audio_tags(path) {
234                r.meta.get_or_insert_with(MetaInfo::default).audio = Some(map_audio(a));
235            }
236        }
237        if path.extension().and_then(|e| e.to_str()) == Some("pdf") {
238            if let Some(p) = read_pdf_info(path) {
239                r.meta.get_or_insert_with(MetaInfo::default).pdf = Some(map_pdf(p));
240            }
241        }
242        if path.extension().and_then(|e| e.to_str()) == Some("docx") {
243            if let Some(d) = read_docx_core(path) {
244                r.meta.get_or_insert_with(MetaInfo::default).docx = Some(map_docx(d));
245            }
246        }
247    }
248
249    if opts.video_ffprobe && is_video_path(path) {
250        if let Some(v) = read_video_ffprobe(path, &FfprobeOptions::default()) {
251            r.meta.get_or_insert_with(MetaInfo::default).video = Some(map_video(v));
252        }
253    }
254
255    if opts.phash && is_image_path(path) {
256        match phash_u64(path) {
257            Ok(h) => r.meta.get_or_insert_with(MetaInfo::default).phash = Some(h),
258            Err(e) => {
259                if r.error.is_none() {
260                    r.error = Some(format!("phash: {e}"));
261                }
262            }
263        }
264    }
265
266    if matches!(
267        r.hashes.as_ref(),
268        Some(h) if h.xxhash64.is_none() && h.md5.is_none() && h.sha256.is_none() && h.mode.is_none()
269    ) {
270        r.hashes = None;
271    }
272    if matches!(
273        r.format.as_ref(),
274        Some(f) if f.kind.is_none() && f.mime.is_none() && f.extension_match.is_none() && f.confidence.is_none()
275    ) {
276        r.format = None;
277    }
278    if matches!(
279        r.meta.as_ref(),
280        Some(m)
281            if m.phash.is_none()
282                && m.exif.is_none()
283                && m.audio.is_none()
284                && m.video.is_none()
285                && m.pdf.is_none()
286                && m.docx.is_none()
287    ) {
288        r.meta = None;
289    }
290
291    r
292}
293
294#[cfg(unix)]
295fn apply_file_identity(r: &mut FileRecord, meta: &Metadata) {
296    #[cfg(unix)]
297    {
298        use std::os::unix::fs::MetadataExt;
299        r.identity.inode = Some(meta.ino());
300        r.identity.device_id = Some(meta.dev());
301    }
302}
303
304#[cfg(not(unix))]
305fn apply_file_identity(_r: &mut FileRecord, _meta: &Metadata) {}
306
307fn map_exif(m: ExifMeta) -> ExifInfo {
308    ExifInfo {
309        camera: m.camera,
310        lens: m.lens,
311        gps_lat: m.gps_lat,
312        gps_lon: m.gps_lon,
313        extra: m.extra,
314    }
315}
316
317fn map_audio(m: AudioMeta) -> AudioInfo {
318    AudioInfo {
319        artist: m.artist,
320        title: m.title,
321        album: m.album,
322    }
323}
324
325fn map_pdf(m: PdfMeta) -> PdfInfo {
326    PdfInfo {
327        page_count: m.page_count,
328        author: m.author,
329        title: m.title,
330    }
331}
332
333fn map_docx(m: DocxMeta) -> DocxCore {
334    DocxCore {
335        creator: m.creator,
336        last_modified_by: m.last_modified_by,
337        revision: m.revision,
338    }
339}
340
341fn map_video(m: VideoMeta) -> VideoInfo {
342    VideoInfo {
343        codec_name: m.codec_name,
344        width: m.width,
345        height: m.height,
346        duration_secs: m.duration_secs,
347    }
348}
349
/// True when `p` has one of the recognised image extensions, ignoring ASCII case.
///
/// Paths without an extension, or whose extension is not valid UTF-8, are not images.
fn is_image_path(p: &Path) -> bool {
    const IMAGE_EXTENSIONS: [&str; 7] = ["jpg", "jpeg", "png", "webp", "tif", "tiff", "heic"];

    match p.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => IMAGE_EXTENSIONS
            .iter()
            .any(|known| ext.eq_ignore_ascii_case(known)),
        None => false,
    }
}
365
/// True when `p` has one of the recognised audio extensions, ignoring ASCII case.
///
/// Paths without an extension, or whose extension is not valid UTF-8, are not audio.
fn is_audio_path(p: &Path) -> bool {
    const AUDIO_EXTENSIONS: [&str; 6] = ["mp3", "flac", "m4a", "ogg", "opus", "wav"];

    match p.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => AUDIO_EXTENSIONS
            .iter()
            .any(|known| ext.eq_ignore_ascii_case(known)),
        None => false,
    }
}
375
/// True when `p` has one of the recognised video extensions, ignoring ASCII case.
///
/// Paths without an extension, or whose extension is not valid UTF-8, are not video.
fn is_video_path(p: &Path) -> bool {
    const VIDEO_EXTENSIONS: [&str; 5] = ["mp4", "mkv", "mov", "webm", "avi"];

    match p.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => VIDEO_EXTENSIONS
            .iter()
            .any(|known| ext.eq_ignore_ascii_case(known)),
        None => false,
    }
}
385
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::path::PathBuf;
    use tempfile::{tempdir, NamedTempFile};

    /// A freshly created temp file must yield a modification time, and that time must
    /// show up in the serialised record.
    #[test]
    fn apply_fs_timestamps_sets_modified_at_for_temp_file() {
        let tmp = NamedTempFile::new().unwrap();
        let fs_meta = fs::metadata(tmp.path()).unwrap();

        let mut record = FileRecord {
            schema_version: 1,
            ..Default::default()
        };
        record.identity.path = "x".to_string();
        record.identity.size = 0;

        apply_fs_timestamps(&mut record, &fs_meta);

        assert!(
            record.identity.modified_at.is_some(),
            "mtime should be available for a normal file"
        );
        let json = serde_json::to_string(&record).unwrap();
        assert!(json.contains("modified_at"));
        assert!(json.contains('Z'));
    }

    /// The walk mode must not influence the record produced for a given file.
    #[test]
    fn standard_and_full_modes_produce_same_record_for_same_file() {
        let dir = tempdir().expect("temp dir");
        let target = dir.path().join("a.txt");
        fs::write(&target, "hello").expect("write file");

        // Scan the temp dir in the given mode and return the record for `target`.
        let record_for = |mode: WalkMode| -> FileRecord {
            let opts = ScanOptions {
                roots: vec![PathBuf::from(dir.path())],
                walk_mode: mode,
                ..Default::default()
            };
            let mut collected: Vec<FileRecord> = Vec::new();
            let mut should_stop = || false;
            let mut on_progress = |_p: ScanProgress| {};
            let mut on_record = |rec: FileRecord| -> Result<(), ScanError> {
                collected.push(rec);
                Ok(())
            };
            let _totals =
                scan_with_callbacks(&opts, &mut should_stop, &mut on_progress, &mut on_record)
                    .expect("scan callbacks");

            let wanted = target.display().to_string();
            collected
                .into_iter()
                .find(|rec| rec.identity.path == wanted)
                .expect("record for target file")
        };

        let standard_json =
            serde_json::to_string(&record_for(WalkMode::Standard)).expect("std json");
        let full_json = serde_json::to_string(&record_for(WalkMode::Full)).expect("full json");
        assert_eq!(standard_json, full_json);
    }
}