1use std::collections::BTreeSet;
2use std::fs;
3use std::io::{BufReader, Cursor, Read};
4use std::panic::{AssertUnwindSafe, catch_unwind};
5use std::path::Path;
6
7use chrono::{TimeZone, Utc};
8use flate2::read::ZlibDecoder;
9use glob::Pattern;
10use image::{ImageDecoder, ImageFormat, ImageReader};
11use quick_xml::events::Event;
12use quick_xml::reader::Reader as XmlReader;
13
/// Identifies which extraction strategy produced the text returned by
/// `extract_text_for_detection`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtractedTextKind {
    /// No text was extracted; the accompanying string is empty.
    None,
    /// The raw bytes were turned into text by `decode_bytes_to_string`.
    Decoded,
    /// The text was produced by `extract_pdf_text`.
    Pdf,
    /// The text is the printable runs found by `extract_printable_strings`.
    BinaryStrings,
    /// The text was assembled from image metadata by
    /// `extract_image_metadata_text`.
    ImageMetadata,
}
22
/// Upper bound on the number of metadata lines `values_to_text` will emit.
const MAX_IMAGE_METADATA_VALUES: usize = 64;
/// Upper bound, in bytes, on the text `values_to_text` will emit; the newline
/// separators between lines count towards this budget.
const MAX_IMAGE_METADATA_TEXT_BYTES: usize = 32 * 1024;
25
26pub fn get_creation_date(metadata: &fs::Metadata) -> Option<String> {
28 metadata.created().ok().map(|time: std::time::SystemTime| {
29 let seconds_since_epoch = time
30 .duration_since(std::time::UNIX_EPOCH)
31 .unwrap()
32 .as_secs() as i64;
33
34 Utc.timestamp_opt(seconds_since_epoch, 0)
35 .single()
36 .unwrap_or_else(Utc::now)
37 .to_rfc3339()
38 })
39}
40
41pub fn is_path_excluded(path: &Path, exclude_patterns: &[Pattern]) -> bool {
43 let path_str = path.to_string_lossy();
44 let file_name = path
45 .file_name()
46 .map(|name| name.to_string_lossy())
47 .unwrap_or_default();
48
49 for pattern in exclude_patterns {
50 if pattern.matches(&path_str) {
52 return true;
53 }
54
55 if pattern.matches(&file_name) {
57 return true;
58 }
59 }
60
61 false
62}
63
/// Converts raw bytes into text.
///
/// * Valid UTF-8 is returned unchanged.
/// * Otherwise, if more than `len / 10` (integer division) of the bytes are
///   control bytes other than TAB, LF, VT, FF and CR, the input is treated as
///   binary and an empty string is returned.
/// * Otherwise every byte is mapped to the Unicode code point with the same
///   numeric value.
pub fn decode_bytes_to_string(bytes: &[u8]) -> String {
    if let Ok(text) = std::str::from_utf8(bytes) {
        return text.to_owned();
    }

    // 0x09..=0x0D (TAB, LF, VT, FF, CR) are deliberately not counted.
    let mut control_total = 0usize;
    for &byte in bytes {
        if matches!(byte, 0x00..=0x08 | 0x0E..=0x1F) {
            control_total += 1;
        }
    }

    if control_total > bytes.len() / 10 {
        String::new()
    } else {
        bytes.iter().copied().map(char::from).collect()
    }
}
86
87pub fn extract_text_for_detection(path: &Path, bytes: &[u8]) -> (String, ExtractedTextKind) {
88 let ext = path
89 .extension()
90 .and_then(|e| e.to_str())
91 .map(|s| s.to_ascii_lowercase());
92
93 if matches!(ext.as_deref(), Some("pdf")) {
94 let text = extract_pdf_text(bytes);
95 return if text.is_empty() {
96 (String::new(), ExtractedTextKind::None)
97 } else {
98 (text, ExtractedTextKind::Pdf)
99 };
100 }
101
102 if let Some(format) = supported_image_metadata_format(ext.as_deref()) {
103 let text = extract_image_metadata_text(bytes, format);
104 return if text.is_empty() {
105 if is_supported_image_container(bytes, format) {
106 (String::new(), ExtractedTextKind::None)
107 } else {
108 let decoded = decode_bytes_to_string(bytes);
109 if decoded.is_empty() {
110 (String::new(), ExtractedTextKind::None)
111 } else {
112 (decoded, ExtractedTextKind::Decoded)
113 }
114 }
115 } else {
116 (text, ExtractedTextKind::ImageMetadata)
117 };
118 }
119
120 let decoded = decode_bytes_to_string(bytes);
121 if !decoded.is_empty() {
122 return (decoded, ExtractedTextKind::Decoded);
123 }
124
125 if matches!(ext.as_deref(), Some("jar")) && is_zip_archive(bytes) {
126 return (String::new(), ExtractedTextKind::None);
127 }
128
129 if matches!(ext.as_deref(), Some("pdf")) {
132 return (String::new(), ExtractedTextKind::None);
133 }
134
135 let text = extract_printable_strings(bytes);
136 if text.is_empty() {
137 (String::new(), ExtractedTextKind::None)
138 } else {
139 (text, ExtractedTextKind::BinaryStrings)
140 }
141}
142
143fn supported_image_metadata_format(ext: Option<&str>) -> Option<ImageFormat> {
144 match ext? {
145 "jpg" | "jpeg" => Some(ImageFormat::Jpeg),
146 "png" => Some(ImageFormat::Png),
147 "tif" | "tiff" => Some(ImageFormat::Tiff),
148 "webp" => Some(ImageFormat::WebP),
149 _ => None,
150 }
151}
152
153fn is_supported_image_container(bytes: &[u8], format: ImageFormat) -> bool {
154 match format {
155 ImageFormat::Png => bytes.starts_with(b"\x89PNG\r\n\x1a\n"),
156 ImageFormat::Jpeg => bytes.starts_with(&[0xff, 0xd8, 0xff]),
157 ImageFormat::Tiff => bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a"),
158 ImageFormat::WebP => {
159 bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP"
160 }
161 _ => false,
162 }
163}
164
165fn extract_image_metadata_text(bytes: &[u8], format: ImageFormat) -> String {
166 let mut values = Vec::new();
167 values.extend(extract_exif_metadata_values(bytes));
168 values.extend(extract_xmp_metadata_values(bytes, format));
169 values_to_text(values)
170}
171
172fn extract_exif_metadata_values(bytes: &[u8]) -> Vec<String> {
173 let mut cursor = BufReader::new(Cursor::new(bytes));
174 let exif = match exif::Reader::new().read_from_container(&mut cursor) {
175 Ok(exif) => exif,
176 Err(_) => return Vec::new(),
177 };
178
179 let mut values = Vec::new();
180 for field in exif.fields() {
181 let rendered = match field.tag {
182 exif::Tag::ImageDescription | exif::Tag::Copyright | exif::Tag::UserComment => {
183 Some(field.display_value().with_unit(&exif).to_string())
184 }
185 exif::Tag::Artist => Some(format!(
186 "Author: {}",
187 field.display_value().with_unit(&exif)
188 )),
189 _ => None,
190 };
191
192 if let Some(rendered) = rendered {
193 values.push(rendered);
194 }
195 }
196
197 values
198}
199
200fn extract_xmp_metadata_values(bytes: &[u8], format: ImageFormat) -> Vec<String> {
201 let xmp = match extract_raw_xmp_packet(bytes, format) {
202 Some(xmp) => xmp,
203 None => return Vec::new(),
204 };
205
206 parse_xmp_values(&xmp)
207}
208
209fn extract_raw_xmp_packet(bytes: &[u8], format: ImageFormat) -> Option<Vec<u8>> {
210 let reader = ImageReader::with_format(BufReader::new(Cursor::new(bytes)), format);
211 if let Ok(mut decoder) = reader.into_decoder()
212 && let Ok(Some(xmp)) = decoder.xmp_metadata()
213 {
214 return Some(xmp);
215 }
216
217 match format {
218 ImageFormat::Png => extract_png_xmp_packet(bytes),
219 _ => None,
220 }
221}
222
223fn extract_png_xmp_packet(bytes: &[u8]) -> Option<Vec<u8>> {
224 const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";
225
226 if bytes.len() < PNG_SIGNATURE.len() || &bytes[..PNG_SIGNATURE.len()] != PNG_SIGNATURE {
227 return None;
228 }
229
230 let mut offset = PNG_SIGNATURE.len();
231 while offset + 12 <= bytes.len() {
232 let length = u32::from_be_bytes([
233 bytes[offset],
234 bytes[offset + 1],
235 bytes[offset + 2],
236 bytes[offset + 3],
237 ]) as usize;
238 let chunk_start = offset + 8;
239 let chunk_end = chunk_start + length;
240 if chunk_end + 4 > bytes.len() {
241 return None;
242 }
243
244 let chunk_type = &bytes[offset + 4..offset + 8];
245 if chunk_type == b"iTXt" {
246 let data = &bytes[chunk_start..chunk_end];
247 if let Some(xmp) = parse_png_itxt_xmp(data) {
248 return Some(xmp);
249 }
250 }
251
252 offset = chunk_end + 4;
253 }
254
255 None
256}
257
258fn parse_png_itxt_xmp(data: &[u8]) -> Option<Vec<u8>> {
259 const XMP_KEYWORD: &[u8] = b"XML:com.adobe.xmp";
260
261 let keyword_end = data.iter().position(|&b| b == 0)?;
262 if &data[..keyword_end] != XMP_KEYWORD {
263 return None;
264 }
265
266 let mut cursor = keyword_end + 1;
267 let compression_flag = *data.get(cursor)?;
268 cursor += 1;
269 let compression_method = *data.get(cursor)?;
270 cursor += 1;
271 if compression_flag > 1 || (compression_flag == 1 && compression_method != 0) {
272 return None;
273 }
274
275 let language_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
276 cursor = language_end + 1;
277
278 let translated_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
279 cursor = translated_end + 1;
280
281 let text_bytes = &data[cursor..];
282 if compression_flag == 1 {
283 let mut decoder = ZlibDecoder::new(text_bytes);
284 let mut decoded = Vec::new();
285 decoder.read_to_end(&mut decoded).ok()?;
286 Some(decoded)
287 } else {
288 Some(text_bytes.to_vec())
289 }
290}
291
292fn parse_xmp_values(xmp: &[u8]) -> Vec<String> {
293 let mut reader = XmlReader::from_reader(xmp);
294 reader.config_mut().trim_text(true);
295
296 let mut buf = Vec::new();
297 let mut stack: Vec<String> = Vec::new();
298 let mut values = Vec::new();
299
300 loop {
301 match reader.read_event_into(&mut buf) {
302 Ok(Event::Start(e)) => {
303 stack.push(local_xml_name(e.name().as_ref()));
304 }
305 Ok(Event::End(_)) => {
306 stack.pop();
307 }
308 Ok(Event::Empty(_)) => {}
309 Ok(Event::Text(text)) => {
310 if let Some(field) = stack
311 .iter()
312 .rev()
313 .find_map(|name| allowed_xmp_field(name.as_str()))
314 && let Ok(decoded) = text.decode()
315 {
316 let decoded = decoded.into_owned();
317 if !decoded.trim().is_empty() {
318 values.push(format_xmp_value(field, &decoded));
319 }
320 }
321 }
322 Ok(Event::CData(text)) => {
323 if let Some(field) = stack
324 .iter()
325 .rev()
326 .find_map(|name| allowed_xmp_field(name.as_str()))
327 && let Ok(decoded) = text.decode()
328 {
329 let decoded = decoded.into_owned();
330 if !decoded.trim().is_empty() {
331 values.push(format_xmp_value(field, &decoded));
332 }
333 }
334 }
335 Ok(Event::Eof) | Err(_) => break,
336 _ => {}
337 }
338 buf.clear();
339 }
340
341 values
342}
343
/// Returns the part of an XML qualified name after its last `:` (the whole
/// name when there is no `:`). Names that are not valid UTF-8 yield an empty
/// string.
fn local_xml_name(name: &[u8]) -> String {
    let qualified = std::str::from_utf8(name).unwrap_or_default();
    let local = match qualified.rsplit_once(':') {
        Some((_prefix, local)) => local,
        None => qualified,
    };
    local.to_string()
}
348
/// Maps the local name of an XMP element to the internal key of the metadata
/// field it carries, or `None` when the element is not collected. Matching is
/// case-sensitive.
fn allowed_xmp_field(name: &str) -> Option<&'static str> {
    // (XML local name, internal field key)
    const ALLOWED_FIELDS: [(&str, &str); 7] = [
        ("creator", "creator"),
        ("rights", "rights"),
        ("description", "description"),
        ("title", "title"),
        ("subject", "subject"),
        ("UsageTerms", "usage_terms"),
        ("WebStatement", "web_statement"),
    ];

    ALLOWED_FIELDS
        .iter()
        .find(|(xml_name, _)| *xml_name == name)
        .map(|&(_, field_key)| field_key)
}
361
/// Renders an XMP value for output: `creator` values are prefixed with
/// `"Author: "`, every other field is returned as-is.
fn format_xmp_value(field: &str, value: &str) -> String {
    if field == "creator" {
        format!("Author: {value}")
    } else {
        value.to_string()
    }
}
368
369fn values_to_text(values: Vec<String>) -> String {
370 let mut seen = BTreeSet::new();
371 let mut lines = Vec::new();
372 let mut total_bytes = 0usize;
373
374 for value in values {
375 if lines.len() >= MAX_IMAGE_METADATA_VALUES {
376 break;
377 }
378
379 let normalized = normalize_metadata_value(&value);
380 if normalized.is_empty() || !seen.insert(normalized.clone()) {
381 continue;
382 }
383
384 let added_bytes = normalized.len() + usize::from(!lines.is_empty());
385 if total_bytes + added_bytes > MAX_IMAGE_METADATA_TEXT_BYTES {
386 break;
387 }
388
389 total_bytes += added_bytes;
390 lines.push(normalized);
391 }
392
393 lines.join("\n")
394}
395
/// Normalizes a metadata value: NUL characters are removed, leading and
/// trailing whitespace is dropped, and every internal run of whitespace is
/// collapsed to a single space.
fn normalize_metadata_value(value: &str) -> String {
    // NULs are removed before splitting, so "a\0b" becomes "ab".
    let without_nuls = value.replace('\0', "");

    let mut normalized = String::with_capacity(without_nuls.len());
    for word in without_nuls.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
407
408fn extract_pdf_text(bytes: &[u8]) -> String {
409 if bytes.len() < 5 || &bytes[..5] != b"%PDF-" {
410 return String::new();
411 }
412
413 let extracted = catch_unwind(AssertUnwindSafe(|| {
414 pdf_extract::extract_text_from_mem_by_pages(bytes)
415 }));
416 match extracted {
417 Ok(Ok(pages)) => {
418 let Some(text) = pages.into_iter().next() else {
419 return String::new();
420 };
421 let normalized = text.replace(['\r', '\u{0c}'], "\n");
422 if normalized.trim().is_empty() {
423 String::new()
424 } else {
425 normalized
426 }
427 }
428 Ok(Err(_)) | Err(_) => String::new(),
429 }
430}
431
/// Reports whether `bytes` start with one of the three recognised 4-byte
/// `PK` ZIP signatures.
fn is_zip_archive(bytes: &[u8]) -> bool {
    const ZIP_SIGNATURES: [&[u8]; 3] = [b"PK\x03\x04", b"PK\x05\x06", b"PK\x07\x08"];

    ZIP_SIGNATURES.iter().any(|magic| bytes.starts_with(magic))
}
437
/// Extracts human-readable strings from binary data.
///
/// Two kinds of runs are collected, each at least four characters long and
/// separated by `\n` in the output:
///
/// 1. runs of consecutive printable ASCII bytes (`0x20..=0x7E`);
/// 2. "wide" runs in which every printable ASCII byte is followed by a zero
///    byte (ASCII text stored as UTF-16LE), searched at both even and odd
///    byte offsets.
///
/// The size limit is soft: it is checked after each run is appended, so
/// collection stops once the output has reached 2,000,000 bytes and the final
/// run may carry it past that figure.
pub fn extract_printable_strings(bytes: &[u8]) -> String {
    const MIN_LEN: usize = 4;
    const MAX_OUTPUT_BYTES: usize = 2_000_000;

    fn is_printable_ascii(b: u8) -> bool {
        matches!(b, 0x20..=0x7E)
    }

    /// Appends `run` to `out` when it is long enough, then resets `run`.
    fn flush_run(out: &mut String, run: &mut Vec<u8>) {
        if run.len() >= MIN_LEN {
            if !out.is_empty() {
                out.push('\n');
            }
            out.push_str(&String::from_utf8_lossy(run));
        }
        run.clear();
    }

    let mut out = String::new();
    let mut run: Vec<u8> = Vec::new();

    // Pass 1: plain single-byte strings.
    for &b in bytes {
        if is_printable_ascii(b) {
            run.push(b);
        } else {
            flush_run(&mut out, &mut run);
            if out.len() >= MAX_OUTPUT_BYTES {
                return out;
            }
        }
    }
    flush_run(&mut out, &mut run);
    if out.len() >= MAX_OUTPUT_BYTES {
        return out;
    }

    // Pass 2: wide strings. The same (printable, 0x00) pair test is applied
    // at both alignments. Testing the reversed pair at the odd alignment
    // would look for the very same byte pattern as the even alignment, so
    // aligned strings would be reported twice and strings at odd offsets
    // would never be found.
    for alignment in 0..2 {
        run.clear();
        let aligned = bytes.get(alignment..).unwrap_or_default();
        for unit in aligned.chunks_exact(2) {
            if is_printable_ascii(unit[0]) && unit[1] == 0 {
                run.push(unit[0]);
            } else {
                flush_run(&mut out, &mut run);
                if out.len() >= MAX_OUTPUT_BYTES {
                    return out;
                }
            }
        }
        flush_run(&mut out, &mut run);
        if out.len() >= MAX_OUTPUT_BYTES {
            return out;
        }
    }

    out
}
499
#[cfg(test)]
mod tests {
    use std::path::Path;

    use super::{ExtractedTextKind, extract_text_for_detection};

    /// The `.jar` fixture must produce no text and be reported as
    /// `ExtractedTextKind::None`.
    #[test]
    fn test_extract_text_for_detection_skips_jar_archives() {
        // The fixture path is relative, so the test depends on the working
        // directory it is run from.
        let path = Path::new(
            "testdata/license-golden/datadriven/lic1/do-not_detect-licenses-in-archive.jar",
        );
        let bytes = std::fs::read(path).expect("failed to read jar fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert!(text.is_empty());
        assert_eq!(kind, ExtractedTextKind::None);
    }
}
518}