1use std::collections::BTreeSet;
2use std::fs;
3use std::io::{BufReader, Cursor, Read};
4use std::panic::{AssertUnwindSafe, catch_unwind};
5use std::path::Path;
6
7use chrono::{TimeZone, Utc};
8use flate2::read::ZlibDecoder;
9use glob::Pattern;
10use image::{ImageDecoder, ImageFormat, ImageReader};
11use quick_xml::events::Event;
12use quick_xml::reader::Reader as XmlReader;
13
/// Identifies which extraction strategy produced the text returned by
/// `extract_text_for_detection`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtractedTextKind {
    /// No text could be extracted; the accompanying string is empty.
    None,
    /// The raw bytes were decoded directly via `decode_bytes_to_string`.
    Decoded,
    /// Text was extracted from a PDF document.
    Pdf,
    /// Printable string runs were pulled out of a `.dll` / `.exe` binary.
    BinaryStrings,
    /// Text was assembled from EXIF / XMP metadata embedded in an image.
    ImageMetadata,
}
22
/// Upper bound on the number of distinct metadata lines kept by `values_to_text`.
const MAX_IMAGE_METADATA_VALUES: usize = 64;
/// Upper bound on the total size (including newline separators) of the text
/// produced by `values_to_text`.
const MAX_IMAGE_METADATA_TEXT_BYTES: usize = 32 * 1024;
25
26pub fn get_creation_date(metadata: &fs::Metadata) -> Option<String> {
28 metadata.created().ok().map(|time: std::time::SystemTime| {
29 let seconds_since_epoch = time
30 .duration_since(std::time::UNIX_EPOCH)
31 .unwrap()
32 .as_secs() as i64;
33
34 Utc.timestamp_opt(seconds_since_epoch, 0)
35 .single()
36 .unwrap_or_else(Utc::now)
37 .to_rfc3339()
38 })
39}
40
41pub fn is_path_excluded(path: &Path, exclude_patterns: &[Pattern]) -> bool {
43 let path_str = path.to_string_lossy();
44 let file_name = path
45 .file_name()
46 .map(|name| name.to_string_lossy())
47 .unwrap_or_default();
48
49 for pattern in exclude_patterns {
50 if pattern.matches(&path_str) {
52 return true;
53 }
54
55 if pattern.matches(&file_name) {
57 return true;
58 }
59 }
60
61 false
62}
63
/// Decodes raw bytes into text.
///
/// Valid UTF-8 is returned unchanged. Otherwise the input is treated as
/// single-byte text (each byte becomes the code point of the same value),
/// unless more than a tenth of the bytes are control characters other than
/// `0x09..=0x0D`, in which case the data is considered binary and an empty
/// string is returned.
pub fn decode_bytes_to_string(bytes: &[u8]) -> String {
    if let Ok(text) = std::str::from_utf8(bytes) {
        return text.to_owned();
    }

    // Control bytes below 0x20, excluding tab / LF / VT / FF / CR.
    let control_count = bytes
        .iter()
        .filter(|&&byte| byte < 0x20 && !(0x09..=0x0D).contains(&byte))
        .count();

    if control_count > bytes.len() / 10 {
        String::new()
    } else {
        bytes.iter().copied().map(char::from).collect()
    }
}
86
87pub fn extract_text_for_detection(path: &Path, bytes: &[u8]) -> (String, ExtractedTextKind) {
88 let ext = path
89 .extension()
90 .and_then(|e| e.to_str())
91 .map(|s| s.to_ascii_lowercase());
92
93 if matches!(ext.as_deref(), Some("pdf")) {
94 let text = extract_pdf_text(bytes);
95 return if text.is_empty() {
96 (String::new(), ExtractedTextKind::None)
97 } else {
98 (text, ExtractedTextKind::Pdf)
99 };
100 }
101
102 if let Some(format) = supported_image_metadata_format(ext.as_deref()) {
103 let text = extract_image_metadata_text(bytes, format);
104 return if text.is_empty() {
105 if is_supported_image_container(bytes, format) {
106 (String::new(), ExtractedTextKind::None)
107 } else {
108 let decoded = decode_bytes_to_string(bytes);
109 if decoded.is_empty() {
110 (String::new(), ExtractedTextKind::None)
111 } else {
112 (decoded, ExtractedTextKind::Decoded)
113 }
114 }
115 } else {
116 (text, ExtractedTextKind::ImageMetadata)
117 };
118 }
119
120 let decoded = decode_bytes_to_string(bytes);
121 if !decoded.is_empty() {
122 return (decoded, ExtractedTextKind::Decoded);
123 }
124
125 if matches!(ext.as_deref(), Some("dll") | Some("exe")) {
126 let text = extract_printable_strings(bytes);
127 return if text.is_empty() {
128 (String::new(), ExtractedTextKind::None)
129 } else {
130 (text, ExtractedTextKind::BinaryStrings)
131 };
132 }
133
134 (String::new(), ExtractedTextKind::None)
135}
136
137fn supported_image_metadata_format(ext: Option<&str>) -> Option<ImageFormat> {
138 match ext? {
139 "jpg" | "jpeg" => Some(ImageFormat::Jpeg),
140 "png" => Some(ImageFormat::Png),
141 "tif" | "tiff" => Some(ImageFormat::Tiff),
142 "webp" => Some(ImageFormat::WebP),
143 _ => None,
144 }
145}
146
147fn is_supported_image_container(bytes: &[u8], format: ImageFormat) -> bool {
148 match format {
149 ImageFormat::Png => bytes.starts_with(b"\x89PNG\r\n\x1a\n"),
150 ImageFormat::Jpeg => bytes.starts_with(&[0xff, 0xd8, 0xff]),
151 ImageFormat::Tiff => bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a"),
152 ImageFormat::WebP => {
153 bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP"
154 }
155 _ => false,
156 }
157}
158
159fn extract_image_metadata_text(bytes: &[u8], format: ImageFormat) -> String {
160 let mut values = Vec::new();
161 values.extend(extract_exif_metadata_values(bytes));
162 values.extend(extract_xmp_metadata_values(bytes, format));
163 values_to_text(values)
164}
165
166fn extract_exif_metadata_values(bytes: &[u8]) -> Vec<String> {
167 let mut cursor = BufReader::new(Cursor::new(bytes));
168 let exif = match exif::Reader::new().read_from_container(&mut cursor) {
169 Ok(exif) => exif,
170 Err(_) => return Vec::new(),
171 };
172
173 let mut values = Vec::new();
174 for field in exif.fields() {
175 let rendered = match field.tag {
176 exif::Tag::ImageDescription | exif::Tag::Copyright | exif::Tag::UserComment => {
177 Some(field.display_value().with_unit(&exif).to_string())
178 }
179 exif::Tag::Artist => Some(format!(
180 "Author: {}",
181 field.display_value().with_unit(&exif)
182 )),
183 _ => None,
184 };
185
186 if let Some(rendered) = rendered {
187 values.push(rendered);
188 }
189 }
190
191 values
192}
193
194fn extract_xmp_metadata_values(bytes: &[u8], format: ImageFormat) -> Vec<String> {
195 let xmp = match extract_raw_xmp_packet(bytes, format) {
196 Some(xmp) => xmp,
197 None => return Vec::new(),
198 };
199
200 parse_xmp_values(&xmp)
201}
202
203fn extract_raw_xmp_packet(bytes: &[u8], format: ImageFormat) -> Option<Vec<u8>> {
204 let reader = ImageReader::with_format(BufReader::new(Cursor::new(bytes)), format);
205 if let Ok(mut decoder) = reader.into_decoder()
206 && let Ok(Some(xmp)) = decoder.xmp_metadata()
207 {
208 return Some(xmp);
209 }
210
211 match format {
212 ImageFormat::Png => extract_png_xmp_packet(bytes),
213 _ => None,
214 }
215}
216
217fn extract_png_xmp_packet(bytes: &[u8]) -> Option<Vec<u8>> {
218 const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";
219
220 if bytes.len() < PNG_SIGNATURE.len() || &bytes[..PNG_SIGNATURE.len()] != PNG_SIGNATURE {
221 return None;
222 }
223
224 let mut offset = PNG_SIGNATURE.len();
225 while offset + 12 <= bytes.len() {
226 let length = u32::from_be_bytes([
227 bytes[offset],
228 bytes[offset + 1],
229 bytes[offset + 2],
230 bytes[offset + 3],
231 ]) as usize;
232 let chunk_start = offset + 8;
233 let chunk_end = chunk_start + length;
234 if chunk_end + 4 > bytes.len() {
235 return None;
236 }
237
238 let chunk_type = &bytes[offset + 4..offset + 8];
239 if chunk_type == b"iTXt" {
240 let data = &bytes[chunk_start..chunk_end];
241 if let Some(xmp) = parse_png_itxt_xmp(data) {
242 return Some(xmp);
243 }
244 }
245
246 offset = chunk_end + 4;
247 }
248
249 None
250}
251
252fn parse_png_itxt_xmp(data: &[u8]) -> Option<Vec<u8>> {
253 const XMP_KEYWORD: &[u8] = b"XML:com.adobe.xmp";
254
255 let keyword_end = data.iter().position(|&b| b == 0)?;
256 if &data[..keyword_end] != XMP_KEYWORD {
257 return None;
258 }
259
260 let mut cursor = keyword_end + 1;
261 let compression_flag = *data.get(cursor)?;
262 cursor += 1;
263 let compression_method = *data.get(cursor)?;
264 cursor += 1;
265 if compression_flag > 1 || (compression_flag == 1 && compression_method != 0) {
266 return None;
267 }
268
269 let language_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
270 cursor = language_end + 1;
271
272 let translated_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
273 cursor = translated_end + 1;
274
275 let text_bytes = &data[cursor..];
276 if compression_flag == 1 {
277 let mut decoder = ZlibDecoder::new(text_bytes);
278 let mut decoded = Vec::new();
279 decoder.read_to_end(&mut decoded).ok()?;
280 Some(decoded)
281 } else {
282 Some(text_bytes.to_vec())
283 }
284}
285
286fn parse_xmp_values(xmp: &[u8]) -> Vec<String> {
287 let mut reader = XmlReader::from_reader(xmp);
288 reader.config_mut().trim_text(true);
289
290 let mut buf = Vec::new();
291 let mut stack: Vec<String> = Vec::new();
292 let mut values = Vec::new();
293
294 loop {
295 match reader.read_event_into(&mut buf) {
296 Ok(Event::Start(e)) => {
297 stack.push(local_xml_name(e.name().as_ref()));
298 }
299 Ok(Event::End(_)) => {
300 stack.pop();
301 }
302 Ok(Event::Empty(_)) => {}
303 Ok(Event::Text(text)) => {
304 if let Some(field) = stack
305 .iter()
306 .rev()
307 .find_map(|name| allowed_xmp_field(name.as_str()))
308 && let Ok(decoded) = text.decode()
309 {
310 let decoded = decoded.into_owned();
311 if !decoded.trim().is_empty() {
312 values.push(format_xmp_value(field, &decoded));
313 }
314 }
315 }
316 Ok(Event::CData(text)) => {
317 if let Some(field) = stack
318 .iter()
319 .rev()
320 .find_map(|name| allowed_xmp_field(name.as_str()))
321 && let Ok(decoded) = text.decode()
322 {
323 let decoded = decoded.into_owned();
324 if !decoded.trim().is_empty() {
325 values.push(format_xmp_value(field, &decoded));
326 }
327 }
328 }
329 Ok(Event::Eof) | Err(_) => break,
330 _ => {}
331 }
332 buf.clear();
333 }
334
335 values
336}
337
/// Returns the part of a qualified XML name after its last `:`.
///
/// Names without a colon are returned whole; names that are not valid UTF-8
/// yield an empty string.
fn local_xml_name(name: &[u8]) -> String {
    let qualified = std::str::from_utf8(name).unwrap_or("");
    match qualified.rsplit_once(':') {
        Some((_prefix, local)) => local.to_owned(),
        None => qualified.to_owned(),
    }
}
342
/// Maps an XMP element's local name to the internal field key for the
/// elements worth extracting; returns `None` for every other name.
///
/// Matching is case-sensitive.
fn allowed_xmp_field(name: &str) -> Option<&'static str> {
    /// (element local name, field key)
    const FIELDS: [(&str, &str); 7] = [
        ("creator", "creator"),
        ("rights", "rights"),
        ("description", "description"),
        ("title", "title"),
        ("subject", "subject"),
        ("UsageTerms", "usage_terms"),
        ("WebStatement", "web_statement"),
    ];

    FIELDS
        .iter()
        .find(|(element, _)| *element == name)
        .map(|&(_, key)| key)
}
355
/// Renders an XMP value for output: `creator` values get an `Author: ` prefix,
/// all other fields are emitted unchanged.
fn format_xmp_value(field: &str, value: &str) -> String {
    if field == "creator" {
        format!("Author: {value}")
    } else {
        value.to_owned()
    }
}
362
363fn values_to_text(values: Vec<String>) -> String {
364 let mut seen = BTreeSet::new();
365 let mut lines = Vec::new();
366 let mut total_bytes = 0usize;
367
368 for value in values {
369 if lines.len() >= MAX_IMAGE_METADATA_VALUES {
370 break;
371 }
372
373 let normalized = normalize_metadata_value(&value);
374 if normalized.is_empty() || !seen.insert(normalized.clone()) {
375 continue;
376 }
377
378 let added_bytes = normalized.len() + usize::from(!lines.is_empty());
379 if total_bytes + added_bytes > MAX_IMAGE_METADATA_TEXT_BYTES {
380 break;
381 }
382
383 total_bytes += added_bytes;
384 lines.push(normalized);
385 }
386
387 lines.join("\n")
388}
389
/// Cleans a metadata value: NUL characters are removed, then every run of
/// whitespace collapses to a single space and leading/trailing whitespace is
/// dropped.
fn normalize_metadata_value(value: &str) -> String {
    // NULs are removed before splitting, so text on either side of one is
    // joined rather than separated.
    let without_nuls = value.replace('\0', "");

    let mut normalized = String::with_capacity(without_nuls.len());
    for word in without_nuls.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
401
402fn extract_pdf_text(bytes: &[u8]) -> String {
403 if bytes.len() < 5 || &bytes[..5] != b"%PDF-" {
404 return String::new();
405 }
406
407 let extracted = catch_unwind(AssertUnwindSafe(|| {
408 pdf_extract::extract_text_from_mem_by_pages(bytes)
409 }));
410 match extracted {
411 Ok(Ok(pages)) => {
412 let Some(text) = pages.into_iter().next() else {
413 return String::new();
414 };
415 let normalized = text.replace(['\r', '\u{0c}'], "\n");
416 if normalized.trim().is_empty() {
417 String::new()
418 } else {
419 normalized
420 }
421 }
422 Ok(Err(_)) | Err(_) => String::new(),
423 }
424}
425
/// Extracts runs of printable ASCII (`0x20..=0x7E`) from binary data, one run
/// per line.
///
/// Three passes are made over the input, each keeping runs of at least four
/// characters:
/// 1. single-byte runs;
/// 2. two-byte pairs starting at offset 0 whose first byte is printable and
///    whose second byte is zero;
/// 3. two-byte pairs starting at offset 1 whose first byte is zero and whose
///    second byte is printable.
///
/// Once the output reaches 2,000,000 bytes, extraction stops at the next run
/// boundary, so the result may exceed that size by the length of one run.
///
/// NOTE(review): pass 3 matches the same bytes as pass 2 shifted by one
/// character, so even-aligned UTF-16LE text tends to be emitted twice (the
/// second copy possibly missing its first character), while odd-aligned
/// UTF-16LE and even-aligned UTF-16BE are never matched. Confirm whether that
/// is the intended coverage.
pub fn extract_printable_strings(bytes: &[u8]) -> String {
    const MIN_LEN: usize = 4;
    const MAX_OUTPUT_BYTES: usize = 2_000_000;

    /// Output text plus the run currently being accumulated.
    struct Collector {
        text: String,
        pending: String,
    }

    impl Collector {
        /// Ends the pending run, appending it to the output when long enough.
        /// Returns `true` once the output has reached the size cap.
        fn end_run(&mut self) -> bool {
            if self.pending.len() >= MIN_LEN {
                if !self.text.is_empty() {
                    self.text.push('\n');
                }
                self.text.push_str(&self.pending);
            }
            self.pending.clear();
            self.text.len() >= MAX_OUTPUT_BYTES
        }
    }

    let printable = |byte: u8| (0x20..=0x7E).contains(&byte);
    let mut collector = Collector {
        text: String::new(),
        pending: String::new(),
    };

    // Pass 1: single-byte strings.
    for &byte in bytes {
        if printable(byte) {
            collector.pending.push(char::from(byte));
        } else if collector.end_run() {
            return collector.text;
        }
    }
    if collector.end_run() {
        return collector.text;
    }

    // Passes 2 and 3: printable byte paired with a zero byte.
    for start in 0..=1usize {
        let tail = bytes.get(start..).unwrap_or_default();
        for pair in tail.chunks_exact(2) {
            let (candidate, padding) = if start == 0 {
                (pair[0], pair[1])
            } else {
                (pair[1], pair[0])
            };

            if padding == 0 && printable(candidate) {
                collector.pending.push(char::from(candidate));
            } else if collector.end_run() {
                return collector.text;
            }
        }
        if collector.end_run() {
            return collector.text;
        }
    }

    collector.text
}