Skip to main content

provenant/utils/
file.rs

1use std::collections::BTreeSet;
2use std::fs;
3use std::io::{BufReader, Cursor, Read};
4use std::panic::{AssertUnwindSafe, catch_unwind};
5use std::path::Path;
6
7use chrono::{TimeZone, Utc};
8use flate2::read::ZlibDecoder;
9use glob::Pattern;
10use image::{ImageDecoder, ImageFormat, ImageReader};
11use quick_xml::events::Event;
12use quick_xml::reader::Reader as XmlReader;
13
/// Identifies which extraction strategy produced the text handed to detection.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ExtractedTextKind {
    /// Nothing usable was extracted; the accompanying text is empty.
    None,
    /// The raw bytes were decoded directly into a string.
    Decoded,
    /// The text came from the PDF text extractor.
    Pdf,
    /// The text is a list of printable strings scraped from binary content.
    BinaryStrings,
    /// The text was assembled from image metadata fields.
    ImageMetadata,
}
22
/// Maximum number of distinct metadata values kept when building image metadata text.
const MAX_IMAGE_METADATA_VALUES: usize = 64;
/// Maximum size in bytes (32 KiB) of the joined image metadata text, separators included.
const MAX_IMAGE_METADATA_TEXT_BYTES: usize = 32 << 10;
25
26/// Get the creation date of a file or directory as an RFC3339 string.
27pub fn get_creation_date(metadata: &fs::Metadata) -> Option<String> {
28    metadata.created().ok().map(|time: std::time::SystemTime| {
29        let seconds_since_epoch = time
30            .duration_since(std::time::UNIX_EPOCH)
31            .unwrap()
32            .as_secs() as i64;
33
34        Utc.timestamp_opt(seconds_since_epoch, 0)
35            .single()
36            .unwrap_or_else(Utc::now)
37            .to_rfc3339()
38    })
39}
40
41/// Check if a path should be excluded based on a list of glob patterns.
42pub fn is_path_excluded(path: &Path, exclude_patterns: &[Pattern]) -> bool {
43    let path_str = path.to_string_lossy();
44    let file_name = path
45        .file_name()
46        .map(|name| name.to_string_lossy())
47        .unwrap_or_default();
48
49    for pattern in exclude_patterns {
50        // Match against full path
51        if pattern.matches(&path_str) {
52            return true;
53        }
54
55        // Match against just the file/directory name
56        if pattern.matches(&file_name) {
57            return true;
58        }
59    }
60
61    false
62}
63
/// Decode a byte buffer into a `String`: UTF-8 when valid, Latin-1 otherwise.
///
/// Latin-1 (ISO-8859-1) maps every byte 0x00-0xFF onto U+0000-U+00FF, so the
/// fallback never fails. To avoid turning binary data into garbage text, the
/// fallback is skipped (and an empty string returned) when more than a tenth
/// of the bytes are control characters in 0x00-0x08 or 0x0E-0x1F.
pub fn decode_bytes_to_string(bytes: &[u8]) -> String {
    if let Ok(text) = std::str::from_utf8(bytes) {
        return text.to_owned();
    }

    // Tab, LF, VT, FF and CR (0x09-0x0D) are deliberately not counted.
    let control_bytes = bytes
        .iter()
        .copied()
        .filter(|b| matches!(b, 0x00..=0x08 | 0x0E..=0x1F))
        .count();
    if control_bytes > bytes.len() / 10 {
        return String::new();
    }

    bytes.iter().copied().map(char::from).collect()
}
86
87pub fn extract_text_for_detection(path: &Path, bytes: &[u8]) -> (String, ExtractedTextKind) {
88    let ext = path
89        .extension()
90        .and_then(|e| e.to_str())
91        .map(|s| s.to_ascii_lowercase());
92
93    if matches!(ext.as_deref(), Some("pdf")) {
94        let text = extract_pdf_text(bytes);
95        return if text.is_empty() {
96            (String::new(), ExtractedTextKind::None)
97        } else {
98            (text, ExtractedTextKind::Pdf)
99        };
100    }
101
102    if let Some(format) = supported_image_metadata_format(ext.as_deref()) {
103        let text = extract_image_metadata_text(bytes, format);
104        return if text.is_empty() {
105            if is_supported_image_container(bytes, format) {
106                (String::new(), ExtractedTextKind::None)
107            } else {
108                let decoded = decode_bytes_to_string(bytes);
109                if decoded.is_empty() {
110                    (String::new(), ExtractedTextKind::None)
111                } else {
112                    (decoded, ExtractedTextKind::Decoded)
113                }
114            }
115        } else {
116            (text, ExtractedTextKind::ImageMetadata)
117        };
118    }
119
120    let decoded = decode_bytes_to_string(bytes);
121    if !decoded.is_empty() {
122        return (decoded, ExtractedTextKind::Decoded);
123    }
124
125    if matches!(ext.as_deref(), Some("jar")) && is_zip_archive(bytes) {
126        return (String::new(), ExtractedTextKind::None);
127    }
128
129    // Skip string extraction for PDFs - they have their own text extraction above
130    // and we don't want to extract strings from PDF binary content
131    if matches!(ext.as_deref(), Some("pdf")) {
132        return (String::new(), ExtractedTextKind::None);
133    }
134
135    let text = extract_printable_strings(bytes);
136    if text.is_empty() {
137        (String::new(), ExtractedTextKind::None)
138    } else {
139        (text, ExtractedTextKind::BinaryStrings)
140    }
141}
142
143fn supported_image_metadata_format(ext: Option<&str>) -> Option<ImageFormat> {
144    match ext? {
145        "jpg" | "jpeg" => Some(ImageFormat::Jpeg),
146        "png" => Some(ImageFormat::Png),
147        "tif" | "tiff" => Some(ImageFormat::Tiff),
148        "webp" => Some(ImageFormat::WebP),
149        _ => None,
150    }
151}
152
153fn is_supported_image_container(bytes: &[u8], format: ImageFormat) -> bool {
154    match format {
155        ImageFormat::Png => bytes.starts_with(b"\x89PNG\r\n\x1a\n"),
156        ImageFormat::Jpeg => bytes.starts_with(&[0xff, 0xd8, 0xff]),
157        ImageFormat::Tiff => bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a"),
158        ImageFormat::WebP => {
159            bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP"
160        }
161        _ => false,
162    }
163}
164
165fn extract_image_metadata_text(bytes: &[u8], format: ImageFormat) -> String {
166    let mut values = Vec::new();
167    values.extend(extract_exif_metadata_values(bytes));
168    values.extend(extract_xmp_metadata_values(bytes, format));
169    values_to_text(values)
170}
171
172fn extract_exif_metadata_values(bytes: &[u8]) -> Vec<String> {
173    let mut cursor = BufReader::new(Cursor::new(bytes));
174    let exif = match exif::Reader::new().read_from_container(&mut cursor) {
175        Ok(exif) => exif,
176        Err(_) => return Vec::new(),
177    };
178
179    let mut values = Vec::new();
180    for field in exif.fields() {
181        let rendered = match field.tag {
182            exif::Tag::ImageDescription | exif::Tag::Copyright | exif::Tag::UserComment => {
183                Some(field.display_value().with_unit(&exif).to_string())
184            }
185            exif::Tag::Artist => Some(format!(
186                "Author: {}",
187                field.display_value().with_unit(&exif)
188            )),
189            _ => None,
190        };
191
192        if let Some(rendered) = rendered {
193            values.push(rendered);
194        }
195    }
196
197    values
198}
199
200fn extract_xmp_metadata_values(bytes: &[u8], format: ImageFormat) -> Vec<String> {
201    let xmp = match extract_raw_xmp_packet(bytes, format) {
202        Some(xmp) => xmp,
203        None => return Vec::new(),
204    };
205
206    parse_xmp_values(&xmp)
207}
208
209fn extract_raw_xmp_packet(bytes: &[u8], format: ImageFormat) -> Option<Vec<u8>> {
210    let reader = ImageReader::with_format(BufReader::new(Cursor::new(bytes)), format);
211    if let Ok(mut decoder) = reader.into_decoder()
212        && let Ok(Some(xmp)) = decoder.xmp_metadata()
213    {
214        return Some(xmp);
215    }
216
217    match format {
218        ImageFormat::Png => extract_png_xmp_packet(bytes),
219        _ => None,
220    }
221}
222
223fn extract_png_xmp_packet(bytes: &[u8]) -> Option<Vec<u8>> {
224    const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";
225
226    if bytes.len() < PNG_SIGNATURE.len() || &bytes[..PNG_SIGNATURE.len()] != PNG_SIGNATURE {
227        return None;
228    }
229
230    let mut offset = PNG_SIGNATURE.len();
231    while offset + 12 <= bytes.len() {
232        let length = u32::from_be_bytes([
233            bytes[offset],
234            bytes[offset + 1],
235            bytes[offset + 2],
236            bytes[offset + 3],
237        ]) as usize;
238        let chunk_start = offset + 8;
239        let chunk_end = chunk_start + length;
240        if chunk_end + 4 > bytes.len() {
241            return None;
242        }
243
244        let chunk_type = &bytes[offset + 4..offset + 8];
245        if chunk_type == b"iTXt" {
246            let data = &bytes[chunk_start..chunk_end];
247            if let Some(xmp) = parse_png_itxt_xmp(data) {
248                return Some(xmp);
249            }
250        }
251
252        offset = chunk_end + 4;
253    }
254
255    None
256}
257
258fn parse_png_itxt_xmp(data: &[u8]) -> Option<Vec<u8>> {
259    const XMP_KEYWORD: &[u8] = b"XML:com.adobe.xmp";
260
261    let keyword_end = data.iter().position(|&b| b == 0)?;
262    if &data[..keyword_end] != XMP_KEYWORD {
263        return None;
264    }
265
266    let mut cursor = keyword_end + 1;
267    let compression_flag = *data.get(cursor)?;
268    cursor += 1;
269    let compression_method = *data.get(cursor)?;
270    cursor += 1;
271    if compression_flag > 1 || (compression_flag == 1 && compression_method != 0) {
272        return None;
273    }
274
275    let language_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
276    cursor = language_end + 1;
277
278    let translated_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
279    cursor = translated_end + 1;
280
281    let text_bytes = &data[cursor..];
282    if compression_flag == 1 {
283        let mut decoder = ZlibDecoder::new(text_bytes);
284        let mut decoded = Vec::new();
285        decoder.read_to_end(&mut decoded).ok()?;
286        Some(decoded)
287    } else {
288        Some(text_bytes.to_vec())
289    }
290}
291
292fn parse_xmp_values(xmp: &[u8]) -> Vec<String> {
293    let mut reader = XmlReader::from_reader(xmp);
294    reader.config_mut().trim_text(true);
295
296    let mut buf = Vec::new();
297    let mut stack: Vec<String> = Vec::new();
298    let mut values = Vec::new();
299
300    loop {
301        match reader.read_event_into(&mut buf) {
302            Ok(Event::Start(e)) => {
303                stack.push(local_xml_name(e.name().as_ref()));
304            }
305            Ok(Event::End(_)) => {
306                stack.pop();
307            }
308            Ok(Event::Empty(_)) => {}
309            Ok(Event::Text(text)) => {
310                if let Some(field) = stack
311                    .iter()
312                    .rev()
313                    .find_map(|name| allowed_xmp_field(name.as_str()))
314                    && let Ok(decoded) = text.decode()
315                {
316                    let decoded = decoded.into_owned();
317                    if !decoded.trim().is_empty() {
318                        values.push(format_xmp_value(field, &decoded));
319                    }
320                }
321            }
322            Ok(Event::CData(text)) => {
323                if let Some(field) = stack
324                    .iter()
325                    .rev()
326                    .find_map(|name| allowed_xmp_field(name.as_str()))
327                    && let Ok(decoded) = text.decode()
328                {
329                    let decoded = decoded.into_owned();
330                    if !decoded.trim().is_empty() {
331                        values.push(format_xmp_value(field, &decoded));
332                    }
333                }
334            }
335            Ok(Event::Eof) | Err(_) => break,
336            _ => {}
337        }
338        buf.clear();
339    }
340
341    values
342}
343
/// Strip the namespace prefix from a qualified XML name.
///
/// Everything up to and including the last `:` is dropped. Names that are not
/// valid UTF-8 become the empty string.
fn local_xml_name(name: &[u8]) -> String {
    let qualified = std::str::from_utf8(name).unwrap_or_default();
    match qualified.rsplit_once(':') {
        Some((_prefix, local)) => local.to_owned(),
        None => qualified.to_owned(),
    }
}
348
/// Look up the canonical field name for an XMP element that may be extracted.
///
/// The match is case-sensitive on the element's local name; elements that are
/// not on the allow-list yield `None`.
fn allowed_xmp_field(name: &str) -> Option<&'static str> {
    /// (element local name, canonical field name)
    const ALLOWED_FIELDS: [(&str, &str); 7] = [
        ("creator", "creator"),
        ("rights", "rights"),
        ("description", "description"),
        ("title", "title"),
        ("subject", "subject"),
        ("UsageTerms", "usage_terms"),
        ("WebStatement", "web_statement"),
    ];

    ALLOWED_FIELDS
        .iter()
        .find(|(element, _)| *element == name)
        .map(|&(_, field)| field)
}
361
/// Render an XMP value for its field: creators get an `Author: ` prefix,
/// every other field is returned unchanged.
fn format_xmp_value(field: &str, value: &str) -> String {
    if field == "creator" {
        format!("Author: {value}")
    } else {
        value.to_owned()
    }
}
368
369fn values_to_text(values: Vec<String>) -> String {
370    let mut seen = BTreeSet::new();
371    let mut lines = Vec::new();
372    let mut total_bytes = 0usize;
373
374    for value in values {
375        if lines.len() >= MAX_IMAGE_METADATA_VALUES {
376            break;
377        }
378
379        let normalized = normalize_metadata_value(&value);
380        if normalized.is_empty() || !seen.insert(normalized.clone()) {
381            continue;
382        }
383
384        let added_bytes = normalized.len() + usize::from(!lines.is_empty());
385        if total_bytes + added_bytes > MAX_IMAGE_METADATA_TEXT_BYTES {
386            break;
387        }
388
389        total_bytes += added_bytes;
390        lines.push(normalized);
391    }
392
393    lines.join("\n")
394}
395
/// Clean up a metadata value: NUL characters are removed, then every run of
/// whitespace collapses to a single space and leading/trailing whitespace is
/// dropped.
fn normalize_metadata_value(value: &str) -> String {
    let without_nuls = value.replace('\0', "");

    let mut normalized = String::with_capacity(without_nuls.len());
    for word in without_nuls.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
407
408fn extract_pdf_text(bytes: &[u8]) -> String {
409    if bytes.len() < 5 || &bytes[..5] != b"%PDF-" {
410        return String::new();
411    }
412
413    let extracted = catch_unwind(AssertUnwindSafe(|| {
414        pdf_extract::extract_text_from_mem_by_pages(bytes)
415    }));
416    match extracted {
417        Ok(Ok(pages)) => {
418            let Some(text) = pages.into_iter().next() else {
419                return String::new();
420            };
421            let normalized = text.replace(['\r', '\u{0c}'], "\n");
422            if normalized.trim().is_empty() {
423                String::new()
424            } else {
425                normalized
426            }
427        }
428        Ok(Err(_)) | Err(_) => String::new(),
429    }
430}
431
/// Report whether `bytes` start with one of the recognised ZIP signatures
/// (`PK\x03\x04`, `PK\x05\x06` or `PK\x07\x08`).
fn is_zip_archive(bytes: &[u8]) -> bool {
    const ZIP_SIGNATURES: [&[u8]; 3] = [b"PK\x03\x04", b"PK\x05\x06", b"PK\x07\x08"];

    ZIP_SIGNATURES
        .iter()
        .any(|signature| bytes.starts_with(signature))
}
437
/// Scrape human-readable strings out of arbitrary binary data.
///
/// Three scans are performed and their results appended in this order, one
/// string per line:
///
/// 1. runs of printable ASCII bytes (0x20-0x7E);
/// 2. runs of 16-bit little-endian units (printable ASCII byte followed by a
///    zero byte) starting at byte offset 0;
/// 3. the same 16-bit scan starting at byte offset 1, so wide strings that sit
///    on an odd offset are found as well.
///
/// Runs shorter than four characters are discarded. Extraction stops early
/// once the output has reached roughly 2 MB; the limit is only checked after a
/// run has been appended, so the result may exceed it by the length of the
/// final run.
pub fn extract_printable_strings(bytes: &[u8]) -> String {
    const MIN_LEN: usize = 4;
    const MAX_OUTPUT_BYTES: usize = 2_000_000;

    fn is_printable_ascii(b: u8) -> bool {
        matches!(b, 0x20..=0x7E)
    }

    /// Appends `run` to `out` as a new line when it is long enough, then
    /// resets `run` for the next string.
    fn flush_run(out: &mut String, run: &mut Vec<u8>) {
        if run.len() >= MIN_LEN {
            if !out.is_empty() {
                out.push('\n');
            }
            // `run` only ever holds printable ASCII, so byte -> char is exact.
            out.extend(run.iter().map(|&b| char::from(b)));
        }
        run.clear();
    }

    let mut out = String::new();
    let mut run: Vec<u8> = Vec::new();

    for &b in bytes {
        if is_printable_ascii(b) {
            run.push(b);
        } else {
            flush_run(&mut out, &mut run);
            if out.len() >= MAX_OUTPUT_BYTES {
                return out;
            }
        }
    }
    flush_run(&mut out, &mut run);
    if out.len() >= MAX_OUTPUT_BYTES {
        return out;
    }

    // Wide strings: the character byte comes first and the zero byte second at
    // BOTH alignments. Swapping the two for the odd alignment would merely
    // re-scan the even-aligned strings (minus their first character) and never
    // look at odd-aligned ones.
    for start in 0..2 {
        let aligned = bytes.get(start..).unwrap_or_default();
        for unit in aligned.chunks_exact(2) {
            let (ch, high) = (unit[0], unit[1]);
            if is_printable_ascii(ch) && high == 0 {
                run.push(ch);
            } else {
                flush_run(&mut out, &mut run);
                if out.len() >= MAX_OUTPUT_BYTES {
                    return out;
                }
            }
        }
        flush_run(&mut out, &mut run);
        if out.len() >= MAX_OUTPUT_BYTES {
            return out;
        }
    }

    out
}
499
#[cfg(test)]
mod tests {
    use std::path::Path;

    use super::{ExtractedTextKind, extract_text_for_detection};

    /// The jar fixture must produce no text and be reported as `None`.
    #[test]
    fn test_extract_text_for_detection_skips_jar_archives() {
        let fixture = Path::new(
            "testdata/license-golden/datadriven/lic1/do-not_detect-licenses-in-archive.jar",
        );
        let contents = std::fs::read(fixture).expect("failed to read jar fixture");

        let (extracted, how) = extract_text_for_detection(fixture, &contents);

        assert_eq!(how, ExtractedTextKind::None);
        assert!(extracted.is_empty());
    }
}