Skip to main content

provenant/utils/
file.rs

1use std::collections::BTreeSet;
2use std::fs;
3use std::io::{BufReader, Cursor, Read};
4use std::panic::{AssertUnwindSafe, catch_unwind};
5use std::path::Path;
6
7use chrono::{TimeZone, Utc};
8use flate2::read::ZlibDecoder;
9use glob::Pattern;
10use image::{ImageDecoder, ImageFormat, ImageReader};
11use quick_xml::events::Event;
12use quick_xml::reader::Reader as XmlReader;
13
/// Describes how the text returned by `extract_text_for_detection` was obtained.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtractedTextKind {
    /// No text was produced; always paired with an empty string.
    None,
    /// The raw bytes were decoded via `decode_bytes_to_string`.
    Decoded,
    /// Text was extracted from a `.pdf` file.
    Pdf,
    /// Printable string runs were pulled out of a `.dll` / `.exe` file.
    BinaryStrings,
    /// Text was assembled from EXIF / XMP metadata of an image file.
    ImageMetadata,
}
22
/// Upper bound on the number of metadata lines `values_to_text` will emit.
const MAX_IMAGE_METADATA_VALUES: usize = 64;
/// Upper bound on the total size (values plus newline separators) of the
/// text `values_to_text` will emit.
const MAX_IMAGE_METADATA_TEXT_BYTES: usize = 32 * 1024;
25
26/// Get the creation date of a file or directory as an RFC3339 string.
27pub fn get_creation_date(metadata: &fs::Metadata) -> Option<String> {
28    metadata.created().ok().map(|time: std::time::SystemTime| {
29        let seconds_since_epoch = time
30            .duration_since(std::time::UNIX_EPOCH)
31            .unwrap()
32            .as_secs() as i64;
33
34        Utc.timestamp_opt(seconds_since_epoch, 0)
35            .single()
36            .unwrap_or_else(Utc::now)
37            .to_rfc3339()
38    })
39}
40
41/// Check if a path should be excluded based on a list of glob patterns.
42pub fn is_path_excluded(path: &Path, exclude_patterns: &[Pattern]) -> bool {
43    let path_str = path.to_string_lossy();
44    let file_name = path
45        .file_name()
46        .map(|name| name.to_string_lossy())
47        .unwrap_or_default();
48
49    for pattern in exclude_patterns {
50        // Match against full path
51        if pattern.matches(&path_str) {
52            return true;
53        }
54
55        // Match against just the file/directory name
56        if pattern.matches(&file_name) {
57            return true;
58        }
59    }
60
61    false
62}
63
/// Turn a byte buffer into a `String`, preferring UTF-8 and falling back to Latin-1.
///
/// Valid UTF-8 is returned unchanged. Otherwise each byte is mapped to the
/// Unicode code point of the same value (ISO-8859-1), which can represent any
/// byte sequence. This mirrors Python ScanCode's `UnicodeDammit` usage, which
/// auto-detects the encoding with Latin-1 as the last resort.
///
/// Non-UTF-8 input in which more than 10% of the bytes are control characters
/// (0x00-0x08, 0x0E-0x1F) is considered binary and yields an empty string.
pub fn decode_bytes_to_string(bytes: &[u8]) -> String {
    if let Ok(text) = std::str::from_utf8(bytes) {
        return text.to_owned();
    }

    // Tab, LF, VT, FF and CR (0x09-0x0D) are ordinary text and not counted.
    let is_suspicious_control = |byte: u8| byte <= 0x08 || (0x0E..=0x1F).contains(&byte);
    let suspicious = bytes
        .iter()
        .copied()
        .filter(|&byte| is_suspicious_control(byte))
        .count();

    if suspicious > bytes.len() / 10 {
        String::new()
    } else {
        bytes.iter().copied().map(char::from).collect()
    }
}
86
87pub fn extract_text_for_detection(path: &Path, bytes: &[u8]) -> (String, ExtractedTextKind) {
88    let ext = path
89        .extension()
90        .and_then(|e| e.to_str())
91        .map(|s| s.to_ascii_lowercase());
92
93    if matches!(ext.as_deref(), Some("pdf")) {
94        let text = extract_pdf_text(bytes);
95        return if text.is_empty() {
96            (String::new(), ExtractedTextKind::None)
97        } else {
98            (text, ExtractedTextKind::Pdf)
99        };
100    }
101
102    if let Some(format) = supported_image_metadata_format(ext.as_deref()) {
103        let text = extract_image_metadata_text(bytes, format);
104        return if text.is_empty() {
105            if is_supported_image_container(bytes, format) {
106                (String::new(), ExtractedTextKind::None)
107            } else {
108                let decoded = decode_bytes_to_string(bytes);
109                if decoded.is_empty() {
110                    (String::new(), ExtractedTextKind::None)
111                } else {
112                    (decoded, ExtractedTextKind::Decoded)
113                }
114            }
115        } else {
116            (text, ExtractedTextKind::ImageMetadata)
117        };
118    }
119
120    let decoded = decode_bytes_to_string(bytes);
121    if !decoded.is_empty() {
122        return (decoded, ExtractedTextKind::Decoded);
123    }
124
125    if matches!(ext.as_deref(), Some("dll") | Some("exe")) {
126        let text = extract_printable_strings(bytes);
127        return if text.is_empty() {
128            (String::new(), ExtractedTextKind::None)
129        } else {
130            (text, ExtractedTextKind::BinaryStrings)
131        };
132    }
133
134    (String::new(), ExtractedTextKind::None)
135}
136
137fn supported_image_metadata_format(ext: Option<&str>) -> Option<ImageFormat> {
138    match ext? {
139        "jpg" | "jpeg" => Some(ImageFormat::Jpeg),
140        "png" => Some(ImageFormat::Png),
141        "tif" | "tiff" => Some(ImageFormat::Tiff),
142        "webp" => Some(ImageFormat::WebP),
143        _ => None,
144    }
145}
146
147fn is_supported_image_container(bytes: &[u8], format: ImageFormat) -> bool {
148    match format {
149        ImageFormat::Png => bytes.starts_with(b"\x89PNG\r\n\x1a\n"),
150        ImageFormat::Jpeg => bytes.starts_with(&[0xff, 0xd8, 0xff]),
151        ImageFormat::Tiff => bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a"),
152        ImageFormat::WebP => {
153            bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP"
154        }
155        _ => false,
156    }
157}
158
159fn extract_image_metadata_text(bytes: &[u8], format: ImageFormat) -> String {
160    let mut values = Vec::new();
161    values.extend(extract_exif_metadata_values(bytes));
162    values.extend(extract_xmp_metadata_values(bytes, format));
163    values_to_text(values)
164}
165
166fn extract_exif_metadata_values(bytes: &[u8]) -> Vec<String> {
167    let mut cursor = BufReader::new(Cursor::new(bytes));
168    let exif = match exif::Reader::new().read_from_container(&mut cursor) {
169        Ok(exif) => exif,
170        Err(_) => return Vec::new(),
171    };
172
173    let mut values = Vec::new();
174    for field in exif.fields() {
175        let rendered = match field.tag {
176            exif::Tag::ImageDescription | exif::Tag::Copyright | exif::Tag::UserComment => {
177                Some(field.display_value().with_unit(&exif).to_string())
178            }
179            exif::Tag::Artist => Some(format!(
180                "Author: {}",
181                field.display_value().with_unit(&exif)
182            )),
183            _ => None,
184        };
185
186        if let Some(rendered) = rendered {
187            values.push(rendered);
188        }
189    }
190
191    values
192}
193
194fn extract_xmp_metadata_values(bytes: &[u8], format: ImageFormat) -> Vec<String> {
195    let xmp = match extract_raw_xmp_packet(bytes, format) {
196        Some(xmp) => xmp,
197        None => return Vec::new(),
198    };
199
200    parse_xmp_values(&xmp)
201}
202
203fn extract_raw_xmp_packet(bytes: &[u8], format: ImageFormat) -> Option<Vec<u8>> {
204    let reader = ImageReader::with_format(BufReader::new(Cursor::new(bytes)), format);
205    if let Ok(mut decoder) = reader.into_decoder()
206        && let Ok(Some(xmp)) = decoder.xmp_metadata()
207    {
208        return Some(xmp);
209    }
210
211    match format {
212        ImageFormat::Png => extract_png_xmp_packet(bytes),
213        _ => None,
214    }
215}
216
217fn extract_png_xmp_packet(bytes: &[u8]) -> Option<Vec<u8>> {
218    const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";
219
220    if bytes.len() < PNG_SIGNATURE.len() || &bytes[..PNG_SIGNATURE.len()] != PNG_SIGNATURE {
221        return None;
222    }
223
224    let mut offset = PNG_SIGNATURE.len();
225    while offset + 12 <= bytes.len() {
226        let length = u32::from_be_bytes([
227            bytes[offset],
228            bytes[offset + 1],
229            bytes[offset + 2],
230            bytes[offset + 3],
231        ]) as usize;
232        let chunk_start = offset + 8;
233        let chunk_end = chunk_start + length;
234        if chunk_end + 4 > bytes.len() {
235            return None;
236        }
237
238        let chunk_type = &bytes[offset + 4..offset + 8];
239        if chunk_type == b"iTXt" {
240            let data = &bytes[chunk_start..chunk_end];
241            if let Some(xmp) = parse_png_itxt_xmp(data) {
242                return Some(xmp);
243            }
244        }
245
246        offset = chunk_end + 4;
247    }
248
249    None
250}
251
252fn parse_png_itxt_xmp(data: &[u8]) -> Option<Vec<u8>> {
253    const XMP_KEYWORD: &[u8] = b"XML:com.adobe.xmp";
254
255    let keyword_end = data.iter().position(|&b| b == 0)?;
256    if &data[..keyword_end] != XMP_KEYWORD {
257        return None;
258    }
259
260    let mut cursor = keyword_end + 1;
261    let compression_flag = *data.get(cursor)?;
262    cursor += 1;
263    let compression_method = *data.get(cursor)?;
264    cursor += 1;
265    if compression_flag > 1 || (compression_flag == 1 && compression_method != 0) {
266        return None;
267    }
268
269    let language_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
270    cursor = language_end + 1;
271
272    let translated_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
273    cursor = translated_end + 1;
274
275    let text_bytes = &data[cursor..];
276    if compression_flag == 1 {
277        let mut decoder = ZlibDecoder::new(text_bytes);
278        let mut decoded = Vec::new();
279        decoder.read_to_end(&mut decoded).ok()?;
280        Some(decoded)
281    } else {
282        Some(text_bytes.to_vec())
283    }
284}
285
286fn parse_xmp_values(xmp: &[u8]) -> Vec<String> {
287    let mut reader = XmlReader::from_reader(xmp);
288    reader.config_mut().trim_text(true);
289
290    let mut buf = Vec::new();
291    let mut stack: Vec<String> = Vec::new();
292    let mut values = Vec::new();
293
294    loop {
295        match reader.read_event_into(&mut buf) {
296            Ok(Event::Start(e)) => {
297                stack.push(local_xml_name(e.name().as_ref()));
298            }
299            Ok(Event::End(_)) => {
300                stack.pop();
301            }
302            Ok(Event::Empty(_)) => {}
303            Ok(Event::Text(text)) => {
304                if let Some(field) = stack
305                    .iter()
306                    .rev()
307                    .find_map(|name| allowed_xmp_field(name.as_str()))
308                    && let Ok(decoded) = text.decode()
309                {
310                    let decoded = decoded.into_owned();
311                    if !decoded.trim().is_empty() {
312                        values.push(format_xmp_value(field, &decoded));
313                    }
314                }
315            }
316            Ok(Event::CData(text)) => {
317                if let Some(field) = stack
318                    .iter()
319                    .rev()
320                    .find_map(|name| allowed_xmp_field(name.as_str()))
321                    && let Ok(decoded) = text.decode()
322                {
323                    let decoded = decoded.into_owned();
324                    if !decoded.trim().is_empty() {
325                        values.push(format_xmp_value(field, &decoded));
326                    }
327                }
328            }
329            Ok(Event::Eof) | Err(_) => break,
330            _ => {}
331        }
332        buf.clear();
333    }
334
335    values
336}
337
/// Strip the namespace prefix from a qualified XML name, returning the part
/// after the last `:` (or the whole name when there is no colon).
///
/// Names that are not valid UTF-8 produce an empty string.
fn local_xml_name(name: &[u8]) -> String {
    let Ok(qualified) = std::str::from_utf8(name) else {
        return String::new();
    };

    match qualified.rfind(':') {
        Some(colon) => qualified[colon + 1..].to_owned(),
        None => qualified.to_owned(),
    }
}
342
/// Look up the internal field key for an XMP element's local name.
///
/// Only the allow-listed elements are recognised (the match is
/// case-sensitive); anything else yields `None`.
fn allowed_xmp_field(name: &str) -> Option<&'static str> {
    // (XML local name, internal field key)
    const ALLOWED_FIELDS: [(&str, &str); 7] = [
        ("creator", "creator"),
        ("rights", "rights"),
        ("description", "description"),
        ("title", "title"),
        ("subject", "subject"),
        ("UsageTerms", "usage_terms"),
        ("WebStatement", "web_statement"),
    ];

    ALLOWED_FIELDS
        .iter()
        .find(|(xml_name, _)| *xml_name == name)
        .map(|&(_, field)| field)
}
355
/// Render an XMP value for output: creator values get an `Author: ` prefix,
/// every other field is passed through unchanged.
fn format_xmp_value(field: &str, value: &str) -> String {
    if field == "creator" {
        format!("Author: {value}")
    } else {
        value.to_owned()
    }
}
362
363fn values_to_text(values: Vec<String>) -> String {
364    let mut seen = BTreeSet::new();
365    let mut lines = Vec::new();
366    let mut total_bytes = 0usize;
367
368    for value in values {
369        if lines.len() >= MAX_IMAGE_METADATA_VALUES {
370            break;
371        }
372
373        let normalized = normalize_metadata_value(&value);
374        if normalized.is_empty() || !seen.insert(normalized.clone()) {
375            continue;
376        }
377
378        let added_bytes = normalized.len() + usize::from(!lines.is_empty());
379        if total_bytes + added_bytes > MAX_IMAGE_METADATA_TEXT_BYTES {
380            break;
381        }
382
383        total_bytes += added_bytes;
384        lines.push(normalized);
385    }
386
387    lines.join("\n")
388}
389
/// Clean up a metadata value: NUL characters are removed outright, every run
/// of whitespace becomes a single space, and leading/trailing whitespace is
/// dropped.
fn normalize_metadata_value(value: &str) -> String {
    let without_nuls = value.replace('\0', "");

    let mut normalized = String::with_capacity(without_nuls.len());
    for word in without_nuls.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
401
402fn extract_pdf_text(bytes: &[u8]) -> String {
403    if bytes.len() < 5 || &bytes[..5] != b"%PDF-" {
404        return String::new();
405    }
406
407    let extracted = catch_unwind(AssertUnwindSafe(|| {
408        pdf_extract::extract_text_from_mem_by_pages(bytes)
409    }));
410    match extracted {
411        Ok(Ok(pages)) => {
412            let Some(text) = pages.into_iter().next() else {
413                return String::new();
414            };
415            let normalized = text.replace(['\r', '\u{0c}'], "\n");
416            if normalized.trim().is_empty() {
417                String::new()
418            } else {
419                normalized
420            }
421        }
422        Ok(Err(_)) | Err(_) => String::new(),
423    }
424}
425
/// Extract human-readable strings from binary data, in the spirit of the
/// Unix `strings` tool.
///
/// Three passes are made over `bytes`, each appending its findings to the
/// output as newline-separated lines:
/// 1. runs of printable ASCII bytes (0x20-0x7E);
/// 2. runs of UTF-16LE code units (printable ASCII byte followed by a zero
///    byte) starting at even offsets;
/// 3. the same UTF-16LE scan starting at odd offsets.
///
/// Only runs of at least four characters are kept. Scanning stops early once
/// the output reaches roughly 2 MB; the limit is checked after each emitted
/// run, so the result may exceed it by the length of that run.
///
/// The odd-offset pass uses the same (character, zero) byte order as the
/// even-offset pass. Swapping the roles there would make both passes look for
/// characters at even indices, emitting even-aligned strings twice and never
/// finding odd-aligned ones.
pub fn extract_printable_strings(bytes: &[u8]) -> String {
    const MIN_LEN: usize = 4;
    const MAX_OUTPUT_BYTES: usize = 2_000_000;

    fn is_printable_ascii(b: u8) -> bool {
        matches!(b, 0x20..=0x7E)
    }

    /// Append the pending run to `out` if it is long enough, then reset it.
    fn flush_run(out: &mut String, run: &mut Vec<u8>) {
        if run.len() >= MIN_LEN {
            if !out.is_empty() {
                out.push('\n');
            }
            out.push_str(&String::from_utf8_lossy(run));
        }
        run.clear();
    }

    let mut out = String::new();
    let mut run: Vec<u8> = Vec::new();

    // Pass 1: single-byte ASCII strings.
    for &b in bytes {
        if is_printable_ascii(b) {
            run.push(b);
        } else {
            flush_run(&mut out, &mut run);
            if out.len() >= MAX_OUTPUT_BYTES {
                return out;
            }
        }
    }
    flush_run(&mut out, &mut run);
    if out.len() >= MAX_OUTPUT_BYTES {
        return out;
    }

    // Passes 2 and 3: UTF-16LE strings at even, then odd, byte alignment.
    for start in 0..2 {
        run.clear();
        let aligned = bytes.get(start..).unwrap_or(&[]);
        for pair in aligned.chunks_exact(2) {
            let (ch, high) = (pair[0], pair[1]);
            if is_printable_ascii(ch) && high == 0 {
                run.push(ch);
            } else {
                flush_run(&mut out, &mut run);
                if out.len() >= MAX_OUTPUT_BYTES {
                    return out;
                }
            }
        }
        flush_run(&mut out, &mut run);
        if out.len() >= MAX_OUTPUT_BYTES {
            return out;
        }
    }

    out
}