1use crate::error::ScanError;
2use format::sniff_file;
3use hash::{
4 hash_file_md5_hex, hash_file_sha256_hex, hash_file_xxhash64_hex, partial_digest, PartialOptions,
5};
6use metadata::{
7 read_audio_tags, read_docx_core, read_exif, read_pdf_info, read_video_ffprobe, AudioMeta,
8 DocxMeta, ExifMeta, FfprobeOptions, PdfMeta, VideoMeta,
9};
10use output::{
11 utc_from_system_time, write_record_line, AudioInfo, DocxCore, ExifInfo, FileRecord, FormatInfo,
12 HashesInfo, IdentityInfo, MetaInfo, PdfInfo, VideoInfo,
13};
14use phash::phash_u64;
15use std::fs::Metadata;
16use std::io::Write;
17use std::path::{Path, PathBuf};
18use walker::{walk_roots_fn, FilterOptions, WalkMode};
19
/// Thin wrapper that pairs a [`ScanOptions`] value with the free function [`scan`].
#[derive(Debug, Clone)]
pub struct Scanner {
    /// Options applied by every [`Scanner::scan`] call.
    pub options: ScanOptions,
}
25
26impl Scanner {
27 pub fn new(options: ScanOptions) -> Self {
28 Self { options }
29 }
30
31 pub fn scan<W: Write>(&self, writer: &mut W) -> Result<(), ScanError> {
32 scan(&self.options, writer)
33 }
34}
35
/// Selects whether, and how, the xxhash64 digest of each file is computed.
#[derive(Debug, Clone)]
pub enum XxhashMode {
    /// No xxhash64 digest is computed; the record's hash mode is reported as `"off"`.
    Off,
    /// The digest comes from `hash_file_xxhash64_hex`; the mode is reported as `"full"`.
    Full,
    /// The digest comes from `partial_digest` with the given options; the mode is
    /// reported as `"partial"`.
    Partial(PartialOptions),
}
43
/// Configuration for a scan: what to walk and which per-file stages to run.
#[derive(Debug, Clone)]
pub struct ScanOptions {
    /// Root paths handed to the walker.
    pub roots: Vec<PathBuf>,
    /// Filter options handed to the walker.
    pub filter: FilterOptions,
    /// Walk mode handed to the walker.
    pub walk_mode: WalkMode,
    /// How the xxhash64 digest is computed, if at all.
    pub xxhash: XxhashMode,
    /// Compute an MD5 digest for each file.
    pub md5: bool,
    /// Compute a SHA-256 digest for each file.
    pub sha256: bool,
    /// Sniff each file's format (MIME type, kind, extension match).
    pub sniff_format: bool,
    /// Read EXIF, audio-tag, PDF and DOCX metadata, selected by file extension.
    pub metadata_static: bool,
    /// Probe files with a video extension via `read_video_ffprobe`.
    pub video_ffprobe: bool,
    /// Compute a perceptual hash for files with an image extension.
    pub phash: bool,
}
57
58impl Default for ScanOptions {
59 fn default() -> Self {
60 Self {
61 roots: Vec::new(),
62 filter: FilterOptions::default(),
63 walk_mode: WalkMode::Standard,
64 xxhash: XxhashMode::Off,
65 md5: false,
66 sha256: false,
67 sniff_format: false,
68 metadata_static: false,
69 video_ffprobe: false,
70 phash: false,
71 }
72 }
73}
74
/// Progress snapshot passed to the `on_progress` callback of [`scan_with_callbacks`].
#[derive(Debug, Clone, Default)]
pub struct ScanProgress {
    /// Number of records processed so far, including the current one.
    pub files_scanned: u64,
    /// Sum of the recorded sizes of the records processed so far.
    pub bytes_scanned: u64,
    /// Path of the record that is about to be delivered to the record callback.
    pub current_path: String,
}
81
82pub fn scan<W: Write>(opts: &ScanOptions, writer: &mut W) -> Result<(), ScanError> {
84 let mut should_stop = || false;
85 let mut on_progress = |_p: ScanProgress| {};
86 let mut on_record = |rec: FileRecord| -> Result<(), ScanError> {
87 write_record_line(writer, &rec).map_err(ScanError::Io)
88 };
89 let _ = scan_with_callbacks(opts, &mut should_stop, &mut on_progress, &mut on_record)?;
90 Ok(())
91}
92
93pub fn scan_with_callbacks<ShouldStop, OnProgress, OnRecord>(
94 opts: &ScanOptions,
95 should_stop: &mut ShouldStop,
96 on_progress: &mut OnProgress,
97 on_record: &mut OnRecord,
98) -> Result<(u64, u64), ScanError>
99where
100 ShouldStop: FnMut() -> bool,
101 OnProgress: FnMut(ScanProgress),
102 OnRecord: FnMut(FileRecord) -> Result<(), ScanError>,
103{
104 let (tx, rx) = std::sync::mpsc::channel();
105 walk_roots_fn(&opts.roots, &opts.filter, opts.walk_mode, {
106 let tx = tx.clone();
107 let opts = opts.clone();
108 move |path| {
109 let rec = build_record(&path, &opts);
110 let _ = tx.send(rec);
111 }
112 })?;
113 drop(tx);
114 let mut files_scanned: u64 = 0;
115 let mut bytes_scanned: u64 = 0;
116 for rec in rx {
117 if should_stop() {
118 break;
119 }
120 files_scanned += 1;
121 bytes_scanned += rec.identity.size;
122 on_progress(ScanProgress {
123 files_scanned,
124 bytes_scanned,
125 current_path: rec.identity.path.clone(),
126 });
127 on_record(rec)?;
128 }
129 Ok((files_scanned, bytes_scanned))
130}
131
132fn apply_fs_timestamps(r: &mut FileRecord, meta: &Metadata) {
133 if let Ok(t) = meta.modified() {
134 r.identity.modified_at = utc_from_system_time(t);
135 }
136 if let Ok(t) = meta.accessed() {
137 r.identity.accessed_at = utc_from_system_time(t);
138 }
139 if let Ok(t) = meta.created() {
140 r.identity.created_at = utc_from_system_time(t);
141 }
142 apply_file_identity(r, meta);
143}
144
145fn build_record(path: &Path, opts: &ScanOptions) -> FileRecord {
146 let mut r = FileRecord {
147 schema_version: 1,
148 identity: IdentityInfo {
149 path: path.display().to_string(),
150 size: 0,
151 ..Default::default()
152 },
153 hashes: Some(HashesInfo::default()),
154 format: Some(FormatInfo::default()),
155 meta: Some(MetaInfo::default()),
156 error: None,
157 };
158
159 let meta = match std::fs::metadata(path) {
160 Ok(m) => m,
161 Err(e) => {
162 r.error = Some(e.to_string());
163 return r;
164 }
165 };
166 r.identity.size = meta.len();
167 apply_fs_timestamps(&mut r, &meta);
168
169 if opts.sniff_format {
170 match sniff_file(path) {
171 Ok(s) => {
172 let fmt = r.format.get_or_insert_with(FormatInfo::default);
173 fmt.mime = Some(s.media_type);
174 fmt.kind = Some(s.format_short_name);
175 fmt.extension_match = Some(s.extension_matches);
176 }
177 Err(e) => {
178 r.error = Some(format!("format: {e}"));
179 }
180 }
181 }
182
183 match &opts.xxhash {
184 XxhashMode::Off => {
185 if let Some(h) = r.hashes.as_mut() {
186 h.mode = Some("off".to_string());
187 }
188 }
189 XxhashMode::Full => match hash_file_xxhash64_hex(path) {
190 Ok(h) => {
191 let hashes = r.hashes.get_or_insert_with(HashesInfo::default);
192 hashes.xxhash64 = Some(h);
193 hashes.mode = Some("full".to_string());
194 }
195 Err(e) => r.error = Some(format!("xxhash: {e}")),
196 },
197 XxhashMode::Partial(po) => match partial_digest(path, po) {
198 Ok(h) => {
199 let hashes = r.hashes.get_or_insert_with(HashesInfo::default);
200 hashes.xxhash64 = Some(h);
201 hashes.mode = Some("partial".to_string());
202 }
203 Err(e) => r.error = Some(format!("xxhash partial: {e}")),
204 },
205 }
206
207 if opts.sha256 {
208 match hash_file_sha256_hex(path) {
209 Ok(h) => {
210 let hashes = r.hashes.get_or_insert_with(HashesInfo::default);
211 hashes.sha256 = Some(h);
212 }
213 Err(e) => r.error = Some(format!("sha256: {e}")),
214 }
215 }
216 if opts.md5 {
217 match hash_file_md5_hex(path) {
218 Ok(h) => {
219 let hashes = r.hashes.get_or_insert_with(HashesInfo::default);
220 hashes.md5 = Some(h);
221 }
222 Err(e) => r.error = Some(format!("md5: {e}")),
223 }
224 }
225
226 if opts.metadata_static {
227 if is_image_path(path) {
228 if let Some(e) = read_exif(path) {
229 r.meta.get_or_insert_with(MetaInfo::default).exif = Some(map_exif(e));
230 }
231 }
232 if is_audio_path(path) {
233 if let Some(a) = read_audio_tags(path) {
234 r.meta.get_or_insert_with(MetaInfo::default).audio = Some(map_audio(a));
235 }
236 }
237 if path.extension().and_then(|e| e.to_str()) == Some("pdf") {
238 if let Some(p) = read_pdf_info(path) {
239 r.meta.get_or_insert_with(MetaInfo::default).pdf = Some(map_pdf(p));
240 }
241 }
242 if path.extension().and_then(|e| e.to_str()) == Some("docx") {
243 if let Some(d) = read_docx_core(path) {
244 r.meta.get_or_insert_with(MetaInfo::default).docx = Some(map_docx(d));
245 }
246 }
247 }
248
249 if opts.video_ffprobe && is_video_path(path) {
250 if let Some(v) = read_video_ffprobe(path, &FfprobeOptions::default()) {
251 r.meta.get_or_insert_with(MetaInfo::default).video = Some(map_video(v));
252 }
253 }
254
255 if opts.phash && is_image_path(path) {
256 match phash_u64(path) {
257 Ok(h) => r.meta.get_or_insert_with(MetaInfo::default).phash = Some(h),
258 Err(e) => {
259 if r.error.is_none() {
260 r.error = Some(format!("phash: {e}"));
261 }
262 }
263 }
264 }
265
266 if matches!(
267 r.hashes.as_ref(),
268 Some(h) if h.xxhash64.is_none() && h.md5.is_none() && h.sha256.is_none() && h.mode.is_none()
269 ) {
270 r.hashes = None;
271 }
272 if matches!(
273 r.format.as_ref(),
274 Some(f) if f.kind.is_none() && f.mime.is_none() && f.extension_match.is_none() && f.confidence.is_none()
275 ) {
276 r.format = None;
277 }
278 if matches!(
279 r.meta.as_ref(),
280 Some(m)
281 if m.phash.is_none()
282 && m.exif.is_none()
283 && m.audio.is_none()
284 && m.video.is_none()
285 && m.pdf.is_none()
286 && m.docx.is_none()
287 ) {
288 r.meta = None;
289 }
290
291 r
292}
293
294#[cfg(unix)]
295fn apply_file_identity(r: &mut FileRecord, meta: &Metadata) {
296 #[cfg(unix)]
297 {
298 use std::os::unix::fs::MetadataExt;
299 r.identity.inode = Some(meta.ino());
300 r.identity.device_id = Some(meta.dev());
301 }
302}
303
304#[cfg(not(unix))]
305fn apply_file_identity(_r: &mut FileRecord, _meta: &Metadata) {}
306
307fn map_exif(m: ExifMeta) -> ExifInfo {
308 ExifInfo {
309 camera: m.camera,
310 lens: m.lens,
311 gps_lat: m.gps_lat,
312 gps_lon: m.gps_lon,
313 extra: m.extra,
314 }
315}
316
317fn map_audio(m: AudioMeta) -> AudioInfo {
318 AudioInfo {
319 artist: m.artist,
320 title: m.title,
321 album: m.album,
322 }
323}
324
325fn map_pdf(m: PdfMeta) -> PdfInfo {
326 PdfInfo {
327 page_count: m.page_count,
328 author: m.author,
329 title: m.title,
330 }
331}
332
333fn map_docx(m: DocxMeta) -> DocxCore {
334 DocxCore {
335 creator: m.creator,
336 last_modified_by: m.last_modified_by,
337 revision: m.revision,
338 }
339}
340
341fn map_video(m: VideoMeta) -> VideoInfo {
342 VideoInfo {
343 codec_name: m.codec_name,
344 width: m.width,
345 height: m.height,
346 duration_secs: m.duration_secs,
347 }
348}
349
/// Returns `true` when the extension of `p` is one of the recognised image
/// extensions, compared without regard to ASCII case.
///
/// Paths with no extension, or with an extension that is not valid UTF-8,
/// yield `false`.
fn is_image_path(p: &Path) -> bool {
    const IMAGE_EXTENSIONS: [&str; 7] = ["jpg", "jpeg", "png", "webp", "tif", "tiff", "heic"];

    match p.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => IMAGE_EXTENSIONS
            .iter()
            .any(|known| ext.eq_ignore_ascii_case(known)),
        None => false,
    }
}
365
/// Returns `true` when the extension of `p` is one of the recognised audio
/// extensions, compared without regard to ASCII case.
///
/// Paths with no extension, or with an extension that is not valid UTF-8,
/// yield `false`.
fn is_audio_path(p: &Path) -> bool {
    const AUDIO_EXTENSIONS: [&str; 6] = ["mp3", "flac", "m4a", "ogg", "opus", "wav"];

    match p.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => AUDIO_EXTENSIONS
            .iter()
            .any(|known| ext.eq_ignore_ascii_case(known)),
        None => false,
    }
}
375
/// Returns `true` when the extension of `p` is one of the recognised video
/// extensions, compared without regard to ASCII case.
///
/// Paths with no extension, or with an extension that is not valid UTF-8,
/// yield `false`.
fn is_video_path(p: &Path) -> bool {
    const VIDEO_EXTENSIONS: [&str; 5] = ["mp4", "mkv", "mov", "webm", "avi"];

    match p.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => VIDEO_EXTENSIONS
            .iter()
            .any(|known| ext.eq_ignore_ascii_case(known)),
        None => false,
    }
}
385
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::path::PathBuf;
    use tempfile::tempdir;
    use tempfile::NamedTempFile;

    /// `apply_fs_timestamps` must populate `modified_at` for an ordinary
    /// temporary file, and the field must survive JSON serialization.
    #[test]
    fn apply_fs_timestamps_sets_modified_at_for_temp_file() {
        let f = NamedTempFile::new().unwrap();
        let meta = fs::metadata(f.path()).unwrap();
        let mut r = FileRecord {
            schema_version: 1,
            ..Default::default()
        };
        r.identity.path = "x".to_string();
        r.identity.size = 0;
        apply_fs_timestamps(&mut r, &meta);
        assert!(
            r.identity.modified_at.is_some(),
            "mtime should be available for a normal file"
        );
        let line = serde_json::to_string(&r).unwrap();
        assert!(line.contains("modified_at"));
        // Presumably the 'Z' is the UTC suffix of the serialized timestamp;
        // confirm against the serializer if this assertion is ever tightened.
        assert!(line.contains('Z'));
    }

    /// Scanning the same directory in `Standard` and `Full` walk modes must
    /// yield byte-identical JSON for the same file.
    #[test]
    fn standard_and_full_modes_produce_same_record_for_same_file() {
        let dir = tempdir().expect("temp dir");
        let p = dir.path().join("a.txt");
        fs::write(&p, "hello").expect("write file");

        // Runs one scan in the given mode and returns the record for `p`.
        let collect = |mode: WalkMode| -> FileRecord {
            let opts = ScanOptions {
                roots: vec![PathBuf::from(dir.path())],
                walk_mode: mode,
                ..Default::default()
            };
            let mut out: Vec<FileRecord> = Vec::new();
            let mut should_stop = || false;
            let mut on_progress = |_p: ScanProgress| {};
            let mut on_record = |r: FileRecord| -> Result<(), ScanError> {
                out.push(r);
                Ok(())
            };
            let _ = scan_with_callbacks(&opts, &mut should_stop, &mut on_progress, &mut on_record)
                .expect("scan callbacks");
            out.into_iter()
                .find(|r| r.identity.path == p.display().to_string())
                .expect("record for target file")
        };

        let standard = collect(WalkMode::Standard);
        let full = collect(WalkMode::Full);
        let standard_json = serde_json::to_string(&standard).expect("std json");
        let full_json = serde_json::to_string(&full).expect("full json");
        assert_eq!(standard_json, full_json);
    }
}