1use std::borrow::Cow;
5use std::collections::BTreeSet;
6use std::fs;
7use std::io::{BufReader, Cursor, Read};
8use std::panic::{AssertUnwindSafe, catch_unwind};
9use std::path::Path;
10
11use chrono::{TimeZone, Utc};
12use file_format::{FileFormat, Kind as FileFormatKind};
13use flate2::read::ZlibDecoder;
14use glob::Pattern;
15use image::{ImageDecoder, ImageFormat, ImageReader};
16use mime_guess::from_path;
17use quick_xml::events::Event;
18use quick_xml::reader::Reader as XmlReader;
19
20use crate::parsers::windows_executable::extract_windows_executable_metadata_text;
21use crate::utils::font::extract_font_metadata_text;
22use crate::utils::language::detect_language;
23
/// Identifies which extraction strategy produced the text returned by the
/// `extract_text_for_detection*` functions.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtractedTextKind {
    /// No text was extracted; the accompanying string is empty.
    None,
    /// Text obtained by decoding the content as text (also used for RTF input).
    Decoded,
    /// Text returned by the font metadata extractor.
    FontMetadata,
    /// Text returned by the PDF extractor.
    Pdf,
    /// Printable strings pulled out of otherwise binary content.
    BinaryStrings,
    /// Text returned by the image metadata extractor.
    ImageMetadata,
    /// Only Windows executable metadata text was available.
    WindowsExecutableMetadata,
}
34
/// Result of `classify_file_info`: the detected MIME type, a human-readable
/// file type label, and a set of boolean category flags.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileInfoClassification {
    /// MIME type as computed by `detect_mime_type`.
    pub mime_type: String,
    /// Human-readable description as computed by `detect_file_type`.
    pub file_type: String,
    /// Detected language; only populated when `is_source` is true.
    pub programming_language: Option<String>,
    /// Whether the content was classified as binary.
    pub is_binary: bool,
    /// Always the negation of `is_binary`.
    pub is_text: bool,
    /// Whether the file was classified as an archive/compressed/package file.
    pub is_archive: bool,
    /// Whether the file was classified as image, audio, or video content.
    pub is_media: bool,
    /// Whether the file was classified as source code.
    pub is_source: bool,
    /// Whether the file was classified as a script.
    pub is_script: bool,
}
47
// NOTE(review): the two image-metadata limits are not used in the visible part
// of this file; presumably they cap the image metadata extractor's output.
const MAX_IMAGE_METADATA_VALUES: usize = 64;
const MAX_IMAGE_METADATA_TEXT_BYTES: usize = 32 * 1024;
// Content counts as binary when more than `len / DIVISOR` bytes are control
// characters (see `has_binary_control_chars`).
const BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR: usize = 10;
// NOTE(review): not used in the visible part of this file; presumably the size
// threshold for the "large opaque binary" handling — confirm against its user.
const LARGE_OPAQUE_BINARY_SKIP_BYTES: usize = 512 * 1024;
// Inputs larger than this are never parsed as JSON by `has_valid_json_text`.
const JSON_VALIDATION_MAX_BYTES: usize = 4 * 1024 * 1024;
// Lowercase extensions that `is_plain_text` treats as plain text (never source).
const PLAIN_TEXT_EXTENSIONS: &[&str] = &[
    "rst", "rest", "md", "txt", "log", "json", "xml", "yaml", "yml", "toml", "ini",
];
// Lowercase extensions that `detect_is_binary` treats as binary unless the
// sniffed format is textual.
const BINARY_EXTENSIONS: &[&str] = &[
    "pyc", "pyo", "pgm", "pbm", "ppm", "mp3", "mp4", "mpeg", "mpg", "emf",
];
// Lowercase extensions that make a non-text file count as an archive in
// `detect_is_archive`.
const ARCHIVE_EXTENSIONS: &[&str] = &[
    "zip", "jar", "war", "ear", "tar", "gz", "tgz", "bz2", "xz", "7z", "rar", "apk", "deb", "rpm",
    "whl", "crate", "egg", "gem", "nupkg", "sqs", "squashfs",
];
63
64pub fn get_creation_date(metadata: &fs::Metadata) -> Option<String> {
66 metadata.modified().ok().map(|time: std::time::SystemTime| {
67 let seconds_since_epoch = time
68 .duration_since(std::time::UNIX_EPOCH)
69 .unwrap()
70 .as_secs() as i64;
71
72 Utc.timestamp_opt(seconds_since_epoch, 0)
73 .single()
74 .unwrap_or_else(Utc::now)
75 .format("%Y-%m-%d")
76 .to_string()
77 })
78}
79
80pub fn is_path_excluded(path: &Path, exclude_patterns: &[Pattern]) -> bool {
82 let path_str = path.to_string_lossy();
83 let file_name = path
84 .file_name()
85 .map(|name| name.to_string_lossy())
86 .unwrap_or_default();
87
88 for pattern in exclude_patterns {
89 if pattern.matches(&path_str) {
91 return true;
92 }
93
94 if pattern.matches(&file_name) {
96 return true;
97 }
98 }
99
100 false
101}
102
103pub fn decode_bytes_to_string(bytes: &[u8]) -> String {
109 match String::from_utf8(bytes.to_vec()) {
110 Ok(s) => s,
111 Err(e) => {
112 let bytes = e.into_bytes();
113 if has_binary_control_chars(&bytes) {
114 return String::new();
115 }
116 bytes.iter().map(|&b| b as char).collect()
117 }
118 }
119}
120
121pub fn extract_text_for_detection(path: &Path, bytes: &[u8]) -> (String, ExtractedTextKind) {
122 let (text, kind, _) = extract_text_for_detection_with_diagnostics(path, bytes);
123 (text, kind)
124}
125
126pub(crate) fn augment_license_detection_text<'a>(path: &Path, text: &'a str) -> Cow<'a, str> {
127 let Some(extension) = path.extension().and_then(|ext| ext.to_str()) else {
128 return Cow::Borrowed(text);
129 };
130 if !matches!(
131 extension.to_ascii_lowercase().as_str(),
132 "md" | "markdown" | "html" | "htm"
133 ) {
134 return Cow::Borrowed(text);
135 }
136
137 let mut hints = Vec::new();
138 if text.contains("CC BY 4.0") || text.contains("creativecommons.org/licenses/by/4.0") {
139 hints.push("Creative Commons Attribution 4.0 International License".to_string());
140 }
141 if text.contains("Apache License (Version 2.0)") || text.contains("Apache License, Version 2.0")
142 {
143 hints.push(
144 "Licensed under the Apache License, Version 2.0. http://www.apache.org/licenses/LICENSE-2.0"
145 .to_string(),
146 );
147 }
148
149 hints.extend(extract_shields_license_badge_hints(text));
150
151 if hints.is_empty() {
152 Cow::Borrowed(text)
153 } else {
154 let mut augmented =
155 String::with_capacity(text.len() + hints.iter().map(String::len).sum::<usize>() + 8);
156 augmented.push_str(text);
157 augmented.push_str("\n\n");
158 for (index, hint) in hints.into_iter().enumerate() {
159 if index > 0 {
160 augmented.push('\n');
161 }
162 augmented.push_str(&hint);
163 }
164 Cow::Owned(augmented)
165 }
166}
167
/// Collects license hints from shields.io static license badges in `text`.
///
/// Looks for `img.shields.io/badge/license-<label>-<color>.svg` URLs, drops
/// the trailing color segment, decodes `%20` to a space and `_` to `-`, and
/// maps the label through `canonical_shields_license_hint`. The result is
/// sorted and de-duplicated.
///
/// A badge URL ends at the first whitespace character, closing
/// bracket/parenthesis, quote, `?` (query string), or `#` (fragment), so
/// badges such as `...-blue.svg?style=flat` are still recognised.
fn extract_shields_license_badge_hints(text: &str) -> Vec<String> {
    const NEEDLE: &str = "img.shields.io/badge/license-";

    let mut hints = Vec::new();
    let mut rest = text;

    while let Some(index) = rest.find(NEEDLE) {
        let suffix = &rest[index + NEEDLE.len()..];
        let end = suffix
            .find(|ch: char| {
                ch.is_whitespace() || matches!(ch, ')' | ']' | '"' | '\'' | '?' | '#')
            })
            .unwrap_or(suffix.len());
        // Always resume scanning after this badge, whether or not it parses.
        rest = &suffix[end..];

        let Some(badge) = suffix[..end].strip_suffix(".svg") else {
            continue;
        };

        let mut segments: Vec<_> = badge
            .split('-')
            .filter(|segment| !segment.is_empty())
            .collect();
        if segments.len() < 2 {
            continue;
        }
        // The last segment is the badge color, not part of the license label.
        segments.pop();

        let candidate = segments.join("-").replace("%20", " ").replace('_', "-");
        if !candidate.is_empty() {
            hints.push(canonical_shields_license_hint(&candidate));
        }
    }

    hints.sort();
    hints.dedup();
    hints
}

/// Maps a badge label to a license phrase; unknown labels get a ` License`
/// suffix appended.
fn canonical_shields_license_hint(candidate: &str) -> String {
    match candidate.trim() {
        "MIT" => "The MIT License".to_string(),
        "Apache-2.0" | "Apache 2.0" => "Apache License 2.0".to_string(),
        other => format!("{other} License"),
    }
}
214
/// Extracts text for detection from `bytes`, returning the text, the strategy
/// that produced it, and an optional diagnostic message.
///
/// Strategies are tried in a fixed order: RTF, PDF, image metadata, font
/// metadata, then Windows executable metadata combined with either decoded
/// text or printable strings. The diagnostic (third element) is only ever
/// populated on the PDF path, and only when no PDF text was produced.
pub(crate) fn extract_text_for_detection_with_diagnostics(
    path: &Path,
    bytes: &[u8],
) -> (String, ExtractedTextKind, Option<String>) {
    let ext = path
        .extension()
        .and_then(|e| e.to_str())
        .map(|s| s.to_ascii_lowercase());
    let detected_format = detect_file_format(bytes);

    // RTF: recognised by extension or magic; whitespace-only output counts as no text.
    if looks_like_rtf(bytes, ext.as_deref()) {
        let text = extract_rtf_text(bytes);
        return if text.trim().is_empty() {
            (String::new(), ExtractedTextKind::None, None)
        } else {
            (text, ExtractedTextKind::Decoded, None)
        };
    }

    // PDF: the extractor's error is surfaced only when it yielded no text.
    if looks_like_pdf(bytes) || detected_format.short_name() == Some("PDF") {
        let (text, scan_error) = extract_pdf_text(path, bytes);
        return if text.is_empty() {
            (String::new(), ExtractedTextKind::None, scan_error)
        } else {
            (text, ExtractedTextKind::Pdf, None)
        };
    }

    // Images: prefer embedded metadata. If there is none and the bytes are not
    // actually a supported image container, fall back to plain decoding.
    if let Some(format) = supported_image_metadata_format(ext.as_deref(), detected_format) {
        let text = extract_image_metadata_text(bytes, format);
        return if text.is_empty() {
            if is_supported_image_container(bytes, format) {
                (String::new(), ExtractedTextKind::None, None)
            } else {
                let decoded = decode_bytes_to_string(bytes);
                if decoded.is_empty() {
                    (String::new(), ExtractedTextKind::None, None)
                } else {
                    (decoded, ExtractedTextKind::Decoded, None)
                }
            }
        } else {
            (text, ExtractedTextKind::ImageMetadata, None)
        };
    }

    if let Some(text) = extract_font_metadata_text(path, bytes) {
        return (text, ExtractedTextKind::FontMetadata, None);
    }

    let windows_executable_metadata_text = extract_windows_executable_metadata_text(bytes);
    // Content with Windows executable metadata is never treated as a large opaque binary.
    let large_opaque_binary = windows_executable_metadata_text.is_none()
        && is_large_opaque_binary_candidate(bytes, detected_format);

    if should_skip_large_opaque_binary_text_extraction(path, bytes, detected_format) {
        return windows_metadata_or_empty_result(windows_executable_metadata_text);
    }

    // NOTE(review): this path drops any Windows metadata text that was found —
    // confirm that is intended.
    if should_skip_binary_string_extraction(path, bytes, detected_format) {
        return (String::new(), ExtractedTextKind::None, None);
    }

    // Full decoding is attempted only when the content is not a large opaque binary.
    if !large_opaque_binary {
        let decoded = decode_bytes_to_string(bytes);
        if !decoded.is_empty() {
            let combined =
                combine_extracted_text_fragments(windows_executable_metadata_text, decoded);
            return (combined, ExtractedTextKind::Decoded, None);
        }
    }

    // Last resort: printable strings, sampled for large opaque binaries.
    let text = if large_opaque_binary {
        extract_sampled_printable_strings(bytes)
    } else {
        extract_printable_strings(bytes)
    };
    if text.is_empty() {
        windows_metadata_or_empty_result(windows_executable_metadata_text)
    } else {
        (
            combine_extracted_text_fragments(windows_executable_metadata_text, text),
            ExtractedTextKind::BinaryStrings,
            None,
        )
    }
}
301
/// Joins an optional leading fragment and a trailing fragment with a newline.
///
/// An absent or empty `prefix` yields `suffix` unchanged; an empty `suffix`
/// yields the prefix alone.
fn combine_extracted_text_fragments(prefix: Option<String>, suffix: String) -> String {
    let Some(mut head) = prefix.filter(|fragment| !fragment.is_empty()) else {
        return suffix;
    };

    if !suffix.is_empty() {
        head.push('\n');
        head.push_str(&suffix);
    }
    head
}
309
310fn windows_metadata_or_empty_result(
311 windows_executable_metadata_text: Option<String>,
312) -> (String, ExtractedTextKind, Option<String>) {
313 if let Some(metadata_text) = windows_executable_metadata_text {
314 (
315 metadata_text,
316 ExtractedTextKind::WindowsExecutableMetadata,
317 None,
318 )
319 } else {
320 (String::new(), ExtractedTextKind::None, None)
321 }
322}
323
324pub fn classify_file_info(path: &Path, bytes: &[u8]) -> FileInfoClassification {
325 let detected_format = detect_file_format(bytes);
326 let detected_language = detect_language(path, bytes);
327 let is_binary = detect_is_binary(path, bytes, detected_format, detected_language.as_deref());
328 let is_text = !is_binary;
329 let mime_type = detect_mime_type(path, bytes, detected_format, detected_language.as_deref());
330 let is_archive = detect_is_archive(path, bytes, &mime_type, is_text, detected_format);
331 let is_media = detect_is_media(path, bytes, &mime_type, detected_format);
332 let is_script = detect_is_script(path, bytes, detected_language.as_deref(), is_text);
333 let is_source = detect_is_source(path, detected_language.as_deref(), is_text, is_script);
334 let programming_language = is_source.then(|| detected_language.clone()).flatten();
335 let file_type = detect_file_type(
336 path,
337 bytes,
338 detected_format,
339 &mime_type,
340 programming_language.as_deref(),
341 is_binary,
342 is_text,
343 is_archive,
344 is_media,
345 is_script,
346 );
347
348 FileInfoClassification {
349 mime_type,
350 file_type,
351 programming_language,
352 is_binary,
353 is_text,
354 is_archive,
355 is_media,
356 is_source,
357 is_script,
358 }
359}
360
361fn detect_file_format(bytes: &[u8]) -> FileFormat {
362 FileFormat::from_reader(Cursor::new(bytes)).unwrap_or(FileFormat::ArbitraryBinaryData)
363}
364
/// Returns `true` when `bytes` is valid UTF-8 (including the empty slice).
fn is_utf8_text(bytes: &[u8]) -> bool {
    let validation = std::str::from_utf8(bytes);
    validation.is_ok()
}
368
/// Decodes BOM-prefixed UTF-16 text.
///
/// Returns `None` when the input has an odd length, lacks a UTF-16 byte-order
/// mark, has no payload after the mark, or contains invalid UTF-16.
fn decode_utf16_bom_text(bytes: &[u8]) -> Option<String> {
    if bytes.len() % 2 != 0 {
        return None;
    }

    // The byte-order mark selects how each 2-byte pair becomes a code unit.
    let to_code_unit: fn([u8; 2]) -> u16 = match bytes.get(..2)? {
        [0xFF, 0xFE] => u16::from_le_bytes,
        [0xFE, 0xFF] => u16::from_be_bytes,
        _ => return None,
    };

    let payload = &bytes[2..];
    if payload.is_empty() {
        return None;
    }

    let code_units = payload
        .chunks_exact(2)
        .map(|pair| to_code_unit([pair[0], pair[1]]));
    std::char::decode_utf16(code_units)
        .collect::<Result<String, _>>()
        .ok()
}
399
400fn has_binary_control_chars(bytes: &[u8]) -> bool {
401 let control_count = bytes
402 .iter()
403 .filter(|&&b| b < 0x09 || (b > 0x0D && b < 0x20))
404 .count();
405 control_count > bytes.len() / BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR
406}
407
408fn has_decodable_text(bytes: &[u8]) -> bool {
409 bytes.is_empty()
410 || is_utf8_text(bytes)
411 || decode_utf16_bom_text(bytes).is_some()
412 || !has_binary_control_chars(bytes)
413}
414
415fn looks_like_textual_bytes(bytes: &[u8]) -> bool {
416 if bytes.is_empty() || is_utf8_text(bytes) {
417 return true;
418 }
419 if let Some(decoded) = decode_utf16_bom_text(bytes) {
420 return decoded
421 .chars()
422 .any(|ch| !ch.is_control() || matches!(ch, '\n' | '\r' | '\t'));
423 }
424
425 let printable_count = bytes
426 .iter()
427 .filter(|&&b| matches!(b, b'\n' | b'\r' | b'\t') || (0x20..=0x7e).contains(&b))
428 .count();
429 printable_count * 2 >= bytes.len()
430}
431
/// Returns `true` for media types that denote textual content: any `text/*`
/// type, JSON, XML, and types with a `+json` or `+xml` suffix.
fn is_textual_media_type(media_type: &str) -> bool {
    if media_type.starts_with("text/") {
        return true;
    }

    match media_type {
        "application/json" | "application/xml" | "text/xml" => true,
        other => other.ends_with("+json") || other.ends_with("+xml"),
    }
}
441
442fn is_textual_format(detected_format: FileFormat) -> bool {
443 matches!(detected_format, FileFormat::Empty | FileFormat::PlainText)
444 || is_textual_media_type(detected_format.media_type())
445}
446
447fn is_known_binary_format(detected_format: FileFormat) -> bool {
448 !matches!(detected_format, FileFormat::ArbitraryBinaryData)
449 && !is_textual_format(detected_format)
450}
451
452pub fn detect_mime_type(
453 path: &Path,
454 bytes: &[u8],
455 detected_format: FileFormat,
456 programming_language: Option<&str>,
457) -> String {
458 if bytes.is_empty() {
459 return "inode/x-empty".to_string();
460 }
461
462 if lower_extension(path).as_deref() == Some("json") {
463 if let Some(is_binary) = json_binary_override(bytes) {
464 if is_binary {
465 return "application/octet-stream".to_string();
466 }
467 if has_valid_json_text(bytes) {
468 return "application/json".to_string();
469 }
470 return "text/plain".to_string();
471 }
472 if has_valid_json_text(bytes) {
473 return "application/json".to_string();
474 }
475 if has_decodable_text(bytes) && looks_like_textual_bytes(bytes) {
476 return "text/plain".to_string();
477 }
478 return "application/octet-stream".to_string();
479 }
480
481 if is_zip_archive(bytes) {
482 return detect_zip_like_mime(path);
483 }
484
485 if looks_like_deb(bytes, path) {
486 return "application/vnd.debian.binary-package".to_string();
487 }
488
489 if looks_like_rpm(bytes, path) {
490 return "application/x-rpm".to_string();
491 }
492
493 let guessed_mime = from_path(path)
494 .first_or_octet_stream()
495 .essence_str()
496 .to_string();
497
498 let mime_type = match detected_format {
499 FileFormat::Empty => "inode/x-empty".to_string(),
500 FileFormat::PlainText => {
501 if guessed_mime == "application/octet-stream" || guessed_mime.starts_with("video/") {
502 "text/plain".to_string()
503 } else {
504 guessed_mime.clone()
505 }
506 }
507 _ => {
508 let detected_mime = detected_format.media_type();
509 if detected_mime == "application/octet-stream"
510 && guessed_mime != "application/octet-stream"
511 {
512 guessed_mime.clone()
513 } else {
514 detected_mime.to_string()
515 }
516 }
517 };
518
519 normalize_mime_type(path, bytes, programming_language, &mime_type)
520}
521
522fn normalize_mime_type(
523 path: &Path,
524 bytes: &[u8],
525 programming_language: Option<&str>,
526 mime_type: &str,
527) -> String {
528 if should_prefer_text_mime(path, bytes, programming_language, mime_type) {
529 return "text/plain".to_string();
530 }
531
532 mime_type.to_string()
533}
534
535fn should_prefer_text_mime(
536 path: &Path,
537 bytes: &[u8],
538 programming_language: Option<&str>,
539 mime_type: &str,
540) -> bool {
541 has_decodable_text(bytes)
542 && looks_like_textual_bytes(bytes)
543 && is_textual_source_candidate(path, programming_language)
544 && (mime_type.starts_with("video/") || mime_type == "application/octet-stream")
545}
546
547fn has_valid_json_text(bytes: &[u8]) -> bool {
548 if bytes.len() > JSON_VALIDATION_MAX_BYTES {
549 return false;
550 }
551
552 serde_json::from_slice::<serde_json::Value>(bytes).is_ok()
553 || decode_utf16_bom_text(bytes)
554 .and_then(|text| serde_json::from_str::<serde_json::Value>(&text).ok())
555 .is_some()
556}
557
/// Returns `true` for content of at least 8 bytes that is framed as `["…"]`
/// and contains neither NUL nor 0xFF bytes.
fn is_wrapped_invalid_json_string_text(bytes: &[u8]) -> bool {
    let framed = bytes.len() >= 8 && bytes.starts_with(b"[\"") && bytes.ends_with(b"\"]");
    framed && !bytes.iter().any(|&byte| byte == 0 || byte == 0xFF)
}
565
566fn json_binary_override(bytes: &[u8]) -> Option<bool> {
567 if has_valid_json_text(bytes) || decode_utf16_bom_text(bytes).is_some() {
568 return Some(false);
569 }
570
571 if bytes.contains(&0) {
572 return Some(true);
573 }
574
575 if bytes.contains(&0xFF) && (bytes.len() <= 5 || bytes.len() > 1024) {
576 return Some(true);
577 }
578
579 if is_wrapped_invalid_json_string_text(bytes) {
580 return Some(false);
581 }
582
583 None
584}
585
586fn detect_is_binary(
587 path: &Path,
588 bytes: &[u8],
589 detected_format: FileFormat,
590 programming_language: Option<&str>,
591) -> bool {
592 if lower_extension(path).as_deref() == Some("json")
593 && let Some(is_binary) = json_binary_override(bytes)
594 {
595 return is_binary;
596 }
597
598 if is_textual_format(detected_format) {
599 return false;
600 }
601
602 if lower_extension(path)
603 .as_deref()
604 .is_some_and(|ext| BINARY_EXTENSIONS.contains(&ext))
605 {
606 return true;
607 }
608
609 if should_treat_binary_bytes_as_text(path, bytes, programming_language) {
610 return false;
611 }
612
613 has_binary_control_chars(bytes)
614 || is_known_binary_format(detected_format)
615 || (matches!(detected_format, FileFormat::ArbitraryBinaryData)
616 && !looks_like_textual_bytes(bytes))
617}
618
619fn should_treat_binary_bytes_as_text(
620 path: &Path,
621 bytes: &[u8],
622 programming_language: Option<&str>,
623) -> bool {
624 has_decodable_text(bytes)
625 && looks_like_textual_bytes(bytes)
626 && (bytes.starts_with(b"#!") || is_textual_source_candidate(path, programming_language))
627}
628
629fn detect_is_archive(
630 path: &Path,
631 bytes: &[u8],
632 mime_type: &str,
633 is_text: bool,
634 detected_format: FileFormat,
635) -> bool {
636 if is_text {
637 return false;
638 }
639
640 lower_extension(path)
641 .as_deref()
642 .is_some_and(|ext| ARCHIVE_EXTENSIONS.contains(&ext))
643 || matches!(
644 detected_format.kind(),
645 FileFormatKind::Archive | FileFormatKind::Compressed | FileFormatKind::Package
646 )
647 || is_zip_archive(bytes)
648 || looks_like_gzip(bytes)
649 || looks_like_bzip2(bytes)
650 || looks_like_xz(bytes)
651 || looks_like_deb(bytes, path)
652 || looks_like_rpm(bytes, path)
653 || looks_like_squashfs(bytes, path)
654 || mime_type.contains("zip")
655 || mime_type.contains("compressed")
656 || mime_type.contains("tar")
657 || mime_type.contains("x-rpm")
658 || mime_type.contains("debian")
659}
660
661fn detect_is_media(
662 path: &Path,
663 bytes: &[u8],
664 mime_type: &str,
665 detected_format: FileFormat,
666) -> bool {
667 media_mime_from_content(bytes).is_some()
668 || matches!(
669 detected_format.kind(),
670 FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
671 )
672 || mime_type.starts_with("image/")
673 || mime_type.starts_with("audio/")
674 || mime_type.starts_with("video/")
675 || (mime_type == "application/octet-stream"
676 && lower_extension(path).as_deref() == Some("tga")
677 && !has_binary_control_chars(bytes))
678}
679
680fn detect_is_script(
681 path: &Path,
682 bytes: &[u8],
683 programming_language: Option<&str>,
684 is_text: bool,
685) -> bool {
686 if !is_text || is_makefile(path) {
687 return false;
688 }
689
690 bytes.starts_with(b"#!")
691 || lower_extension(path).as_deref().is_some_and(|ext| {
692 matches!(
693 ext,
694 "sh" | "bash" | "zsh" | "fish" | "ksh" | "ps1" | "psm1" | "psd1" | "awk"
695 )
696 })
697 || matches!(
698 programming_language,
699 Some(
700 "Shell"
701 | "Bash"
702 | "Zsh"
703 | "Fish"
704 | "Ksh"
705 | "Python"
706 | "Ruby"
707 | "Perl"
708 | "PHP"
709 | "PowerShell"
710 | "Awk"
711 )
712 )
713}
714
715fn detect_is_source(
716 path: &Path,
717 programming_language: Option<&str>,
718 is_text: bool,
719 is_script: bool,
720) -> bool {
721 if !is_text || is_plain_text(path) || is_makefile(path) || is_source_map(path) {
722 return false;
723 }
724
725 if is_c_like_source(path) || is_java_like_source(path) {
726 return true;
727 }
728
729 programming_language.is_some() || is_script
730}
731
732#[allow(clippy::too_many_arguments)]
733fn detect_file_type(
734 path: &Path,
735 bytes: &[u8],
736 detected_format: FileFormat,
737 mime_type: &str,
738 programming_language: Option<&str>,
739 is_binary: bool,
740 is_text: bool,
741 is_archive: bool,
742 is_media: bool,
743 is_script: bool,
744) -> String {
745 if bytes.is_empty() {
746 return "empty".to_string();
747 }
748
749 if looks_like_pdf(bytes) {
750 return "PDF document".to_string();
751 }
752
753 if let Some(file_type) = media_file_type_from_content(bytes) {
754 return file_type.to_string();
755 }
756
757 if is_archive {
758 return archive_file_type(path, bytes, detected_format);
759 }
760
761 if is_script {
762 return script_file_type(programming_language, bytes);
763 }
764
765 if is_text {
766 if lower_extension(path).as_deref() == Some("json") {
767 if has_valid_json_text(bytes) {
768 return "JSON text data".to_string();
769 }
770 return text_file_type(bytes);
771 }
772 if lower_extension(path).as_deref() == Some("xml") {
773 return "XML text data".to_string();
774 }
775 if matches!(lower_extension(path).as_deref(), Some("yaml" | "yml")) {
776 return "YAML text data".to_string();
777 }
778 if lower_extension(path).as_deref() == Some("toml") {
779 return "TOML text data".to_string();
780 }
781 if matches!(
782 lower_extension(path).as_deref(),
783 Some("ini" | "cfg" | "conf")
784 ) {
785 return "INI text data".to_string();
786 }
787 if matches!(lower_file_name(path).as_str(), ".gitmodules" | ".gitconfig") {
788 return "Git configuration text".to_string();
789 }
790 if matches!(lower_extension(path).as_deref(), Some("md" | "markdown")) {
791 return text_file_type(bytes);
792 }
793 if programming_language.is_some() && !is_media {
794 return source_file_type(programming_language, bytes);
795 }
796 return text_file_type(bytes);
797 }
798
799 if let Some(file_type) = format_based_file_type(detected_format) {
800 return file_type;
801 }
802
803 if is_binary && mime_type == "application/octet-stream" {
804 return "data".to_string();
805 }
806
807 mime_type.to_string()
808}
809
810fn is_textual_source_candidate(path: &Path, programming_language: Option<&str>) -> bool {
811 if matches!(programming_language, Some(language) if is_source_like_language(language)) {
812 return true;
813 }
814
815 if matches!(
816 lower_file_name(path).as_str(),
817 "dockerfile"
818 | "containerfile"
819 | "containerfile.core"
820 | "apkbuild"
821 | "podfile"
822 | "jamfile"
823 | "jamroot"
824 | "meson.build"
825 | "build"
826 | "workspace"
827 | "buck"
828 | "default.nix"
829 | "flake.nix"
830 | "shell.nix"
831 ) {
832 return true;
833 }
834
835 path.extension()
836 .and_then(|ext| ext.to_str())
837 .is_some_and(|ext| {
838 matches!(
839 ext.to_ascii_lowercase().as_str(),
840 "rs" | "py"
841 | "js"
842 | "mjs"
843 | "cjs"
844 | "jsx"
845 | "ts"
846 | "mts"
847 | "cts"
848 | "tsx"
849 | "c"
850 | "cpp"
851 | "cc"
852 | "cxx"
853 | "h"
854 | "hpp"
855 | "m"
856 | "mm"
857 | "s"
858 | "asm"
859 | "java"
860 | "go"
861 | "rb"
862 | "php"
863 | "pl"
864 | "swift"
865 | "sh"
866 | "bash"
867 | "zsh"
868 | "fish"
869 | "ksh"
870 | "ps1"
871 | "psm1"
872 | "psd1"
873 | "awk"
874 | "kt"
875 | "kts"
876 | "dart"
877 | "scala"
878 | "groovy"
879 | "gradle"
880 | "gvy"
881 | "gy"
882 | "gsh"
883 | "cs"
884 | "fs"
885 | "fsx"
886 | "r"
887 | "lua"
888 | "jl"
889 | "ex"
890 | "exs"
891 | "clj"
892 | "cljs"
893 | "cljc"
894 | "hs"
895 | "erl"
896 | "nix"
897 | "zig"
898 | "bzl"
899 | "bazel"
900 | "star"
901 | "sky"
902 | "ml"
903 | "mli"
904 | "tex"
905 )
906 })
907}
908
/// Returns `true` when `language` (case-sensitive) is one of the language
/// names treated as textual source.
fn is_source_like_language(language: &str) -> bool {
    const SOURCE_LIKE_LANGUAGES: &[&str] = &[
        "Rust",
        "Python",
        "JavaScript",
        "TypeScript",
        "JavaScript/TypeScript",
        "C",
        "C++",
        "Objective-C",
        "Objective-C++",
        "GAS",
        "Java",
        "Go",
        "Ruby",
        "PHP",
        "Perl",
        "Swift",
        "Shell",
        "PowerShell",
        "Awk",
        "Kotlin",
        "Dart",
        "Scala",
        "C#",
        "F#",
        "R",
        "Lua",
        "Julia",
        "Elixir",
        "Clojure",
        "Haskell",
        "Erlang",
        "Groovy",
        "Nix",
        "Zig",
        "Starlark",
        "OCaml",
        "Meson",
        "TeX",
        "Dockerfile",
        "Makefile",
        "Jamfile",
    ];

    SOURCE_LIKE_LANGUAGES.contains(&language)
}
955
/// Returns the path's extension as UTF-8, or `None` when it is absent or not
/// valid UTF-8.
fn extension(path: &Path) -> Option<&str> {
    path.extension()?.to_str()
}
959
960fn lower_extension(path: &Path) -> Option<String> {
961 extension(path).map(|ext| ext.to_ascii_lowercase())
962}
963
/// Returns the final path component in ASCII lowercase, or an empty string
/// when it is absent or not valid UTF-8.
fn lower_file_name(path: &Path) -> String {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name.to_ascii_lowercase(),
        None => String::new(),
    }
}
970
971fn is_plain_text(path: &Path) -> bool {
972 lower_extension(path)
973 .as_deref()
974 .is_some_and(|ext| PLAIN_TEXT_EXTENSIONS.contains(&ext))
975}
976
977fn is_makefile(path: &Path) -> bool {
978 matches!(lower_file_name(path).as_str(), "makefile" | "makefile.inc")
979}
980
/// Returns `true` when the path ends in `.js.map` or `.css.map`
/// (case-insensitive).
fn is_source_map(path: &Path) -> bool {
    let lowered = path.to_string_lossy().to_ascii_lowercase();
    [".js.map", ".css.map"]
        .iter()
        .any(|&suffix| lowered.ends_with(suffix))
}
985
986fn is_c_like_source(path: &Path) -> bool {
987 lower_extension(path).as_deref().is_some_and(|ext| {
988 matches!(
989 ext,
990 "c" | "cc"
991 | "cp"
992 | "cpp"
993 | "cxx"
994 | "c++"
995 | "h"
996 | "hh"
997 | "hpp"
998 | "hxx"
999 | "h++"
1000 | "i"
1001 | "ii"
1002 | "m"
1003 | "s"
1004 | "asm"
1005 )
1006 })
1007}
1008
1009fn is_java_like_source(path: &Path) -> bool {
1010 lower_extension(path)
1011 .as_deref()
1012 .is_some_and(|ext| matches!(ext, "java" | "aj" | "jad" | "ajt"))
1013}
1014
1015fn format_based_file_type(detected_format: FileFormat) -> Option<String> {
1016 match detected_format {
1017 FileFormat::ArbitraryBinaryData | FileFormat::Empty | FileFormat::PlainText => None,
1018 format if format.short_name() == Some("PDF") => Some("PDF document".to_string()),
1019 format => Some(match format.kind() {
1020 FileFormatKind::Image => short_name_or_name(&format, "image data"),
1021 FileFormatKind::Audio => short_name_or_name(&format, "audio data"),
1022 FileFormatKind::Video => short_name_or_name(&format, "video data"),
1023 _ => format.name().to_string(),
1024 }),
1025 }
1026}
1027
1028fn short_name_or_name(format: &FileFormat, suffix: &str) -> String {
1029 format
1030 .short_name()
1031 .map(|short_name| format!("{short_name} {suffix}"))
1032 .unwrap_or_else(|| format!("{} {suffix}", format.name()))
1033}
1034
1035fn detect_zip_like_mime(path: &Path) -> String {
1036 match extension(path).map(|ext| ext.to_ascii_lowercase()) {
1037 Some(ext) if ext == "apk" => "application/vnd.android.package-archive".to_string(),
1038 Some(ext) if matches!(ext.as_str(), "jar" | "war" | "ear") => {
1039 "application/java-archive".to_string()
1040 }
1041 _ => "application/zip".to_string(),
1042 }
1043}
1044
/// Identifies PNG, JPEG, TIFF, and WebP content by magic number and returns
/// the matching MIME type.
fn media_mime_from_content(bytes: &[u8]) -> Option<&'static str> {
    match bytes {
        [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n', ..] => Some("image/png"),
        [0xff, 0xd8, 0xff, ..] => Some("image/jpeg"),
        [b'I', b'I', 0x2a, 0x00, ..] | [b'M', b'M', 0x00, 0x2a, ..] => Some("image/tiff"),
        // RIFF container whose form type (bytes 8..12) is WEBP.
        [b'R', b'I', b'F', b'F', _, _, _, _, b'W', b'E', b'B', b'P', ..] => Some("image/webp"),
        _ => None,
    }
}
1058
/// Identifies PNG, JPEG, TIFF, and WebP content by magic number and returns
/// a human-readable file type label.
fn media_file_type_from_content(bytes: &[u8]) -> Option<&'static str> {
    match bytes {
        [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n', ..] => Some("PNG image data"),
        [0xff, 0xd8, 0xff, ..] => Some("JPEG image data"),
        [b'I', b'I', 0x2a, 0x00, ..] | [b'M', b'M', 0x00, 0x2a, ..] => Some("TIFF image data"),
        // RIFF container whose form type (bytes 8..12) is WEBP.
        [b'R', b'I', b'F', b'F', _, _, _, _, b'W', b'E', b'B', b'P', ..] => {
            Some("WebP image data")
        }
        _ => None,
    }
}
1072
/// Returns `true` when the content starts with the `%PDF-` header.
fn looks_like_pdf(bytes: &[u8]) -> bool {
    matches!(bytes, [b'%', b'P', b'D', b'F', b'-', ..])
}
1076
/// Returns `true` when the (already lowercased) extension is `rtf` or the
/// content starts with the `{\rtf` header.
fn looks_like_rtf(bytes: &[u8], ext: Option<&str>) -> bool {
    if matches!(ext, Some("rtf")) {
        return true;
    }
    bytes.starts_with(b"{\\rtf")
}
1080
/// Extracts the readable text from an RTF document.
///
/// This is a lightweight single-pass scanner, not a full RTF parser:
/// - group braces are dropped, but group contents are kept;
/// - `\\`, `\{`, `\}` emit the escaped character;
/// - `\'hh` emits the character whose code point is the hex byte `hh`;
/// - a handful of control words (`\par`, `\line`, `\tab`, dashes, bullet,
///   quotes, `\uN`) are translated and all others are discarded;
/// - carriage returns and form feeds become newlines, and trailing whitespace
///   is trimmed from every line.
///
/// A `-` that follows a control word counts as the sign of a numeric
/// parameter only when a digit follows it; otherwise it is ordinary text and
/// is kept (e.g. `\tab-b` yields a tab followed by `-b`).
fn extract_rtf_text(bytes: &[u8]) -> String {
    let text = String::from_utf8_lossy(bytes);
    let chars: Vec<char> = text.chars().collect();
    let mut output = String::new();
    let mut index = 0usize;

    while index < chars.len() {
        match chars[index] {
            '{' | '}' => {
                index += 1;
            }
            '\\' => {
                index += 1;
                if index >= chars.len() {
                    break;
                }

                match chars[index] {
                    '\\' | '{' | '}' => {
                        output.push(chars[index]);
                        index += 1;
                    }
                    '\'' => {
                        // `\'hh`: two hex digits naming a single byte.
                        if index + 2 < chars.len() {
                            let hex: String = [chars[index + 1], chars[index + 2]].iter().collect();
                            if let Ok(value) = u8::from_str_radix(&hex, 16) {
                                output.push(char::from(value));
                                index += 3;
                                continue;
                            }
                        }
                        index += 1;
                    }
                    control if control.is_ascii_alphabetic() => {
                        let start = index;
                        while index < chars.len() && chars[index].is_ascii_alphabetic() {
                            index += 1;
                        }
                        let control_word: String = chars[start..index].iter().collect();

                        // Optional numeric parameter: an optional `-` followed by
                        // at least one digit. A lone `-` is left in the text stream.
                        let number_start = index;
                        let has_sign = index < chars.len() && chars[index] == '-';
                        let digits_start = if has_sign { index + 1 } else { index };
                        if digits_start < chars.len() && chars[digits_start].is_ascii_digit() {
                            index = digits_start;
                            while index < chars.len() && chars[index].is_ascii_digit() {
                                index += 1;
                            }
                        }
                        let parameter: String = chars[number_start..index].iter().collect();

                        // A single space after a control word is a delimiter, not text.
                        if index < chars.len() && chars[index] == ' ' {
                            index += 1;
                        }

                        match control_word.as_str() {
                            "par" | "line" => output.push('\n'),
                            "tab" => output.push('\t'),
                            "emdash" => output.push('—'),
                            "endash" => output.push('–'),
                            "bullet" => output.push('•'),
                            "lquote" | "rquote" => output.push('\''),
                            "ldblquote" | "rdblquote" => output.push('"'),
                            "u" => {
                                if let Ok(codepoint) = parameter.parse::<i32>() {
                                    // `\uN` parameters are signed 16-bit values;
                                    // negative ones wrap around by 65536.
                                    let normalized = if codepoint < 0 {
                                        codepoint + 65_536
                                    } else {
                                        codepoint
                                    };
                                    if let Some(ch) =
                                        u32::try_from(normalized).ok().and_then(char::from_u32)
                                    {
                                        output.push(ch);
                                    }
                                }

                                // Skip the single fallback character that follows `\uN`.
                                if index < chars.len()
                                    && !matches!(chars[index], '\\' | '{' | '}' | '\n' | '\r')
                                {
                                    index += 1;
                                }
                            }
                            _ => {}
                        }
                    }
                    _ => {
                        index += 1;
                    }
                }
            }
            ch => {
                output.push(ch);
                index += 1;
            }
        }
    }

    output
        .replace(['\r', '\u{0c}'], "\n")
        .lines()
        .map(str::trim_end)
        .collect::<Vec<_>>()
        .join("\n")
}
1187
/// Returns `true` when the content starts with the gzip magic bytes `1f 8b`.
fn looks_like_gzip(bytes: &[u8]) -> bool {
    matches!(bytes, [0x1f, 0x8b, ..])
}
1191
/// Returns `true` when the content starts with the bzip2 magic `BZh`.
fn looks_like_bzip2(bytes: &[u8]) -> bool {
    matches!(bytes, [b'B', b'Z', b'h', ..])
}
1195
/// Returns `true` when the content starts with the XZ magic `fd 37 7a 58 5a 00`.
fn looks_like_xz(bytes: &[u8]) -> bool {
    matches!(bytes, [0xfd, b'7', b'z', b'X', b'Z', 0x00, ..])
}
1199
1200fn looks_like_deb(bytes: &[u8], path: &Path) -> bool {
1201 lower_extension(path).as_deref() == Some("deb") && bytes.starts_with(b"!<arch>\n")
1202}
1203
1204fn looks_like_rpm(bytes: &[u8], path: &Path) -> bool {
1205 lower_extension(path).as_deref() == Some("rpm") && bytes.starts_with(&[0xed, 0xab, 0xee, 0xdb])
1206}
1207
1208fn looks_like_squashfs(bytes: &[u8], path: &Path) -> bool {
1209 lower_extension(path)
1210 .as_deref()
1211 .is_some_and(|ext| matches!(ext, "sqs" | "squashfs"))
1212 && (bytes.starts_with(&[0x68, 0x73, 0x71, 0x73])
1213 || bytes.starts_with(&[0x73, 0x71, 0x73, 0x68]))
1214}
1215
1216fn archive_file_type(path: &Path, bytes: &[u8], detected_format: FileFormat) -> String {
1217 if looks_like_deb(bytes, path) {
1218 "debian binary package (format 2.0)".to_string()
1219 } else if looks_like_rpm(bytes, path) {
1220 "RPM package".to_string()
1221 } else if looks_like_squashfs(bytes, path) {
1222 "Squashfs filesystem".to_string()
1223 } else if looks_like_gzip(bytes) {
1224 "gzip compressed data".to_string()
1225 } else if looks_like_bzip2(bytes) {
1226 "bzip2 compressed data".to_string()
1227 } else if looks_like_xz(bytes) {
1228 "XZ compressed data".to_string()
1229 } else if is_zip_archive(bytes) {
1230 "Zip archive data".to_string()
1231 } else if lower_extension(path).as_deref() == Some("gem") {
1232 "POSIX tar archive".to_string()
1233 } else if let Some(file_type) = format_based_file_type(detected_format) {
1234 file_type
1235 } else {
1236 "archive data".to_string()
1237 }
1238}
1239
1240fn script_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1241 let suffix = text_executable_label(bytes);
1242
1243 match programming_language {
1244 Some("Python") => format!("python script, {suffix}"),
1245 Some("Ruby") => format!("ruby script, {suffix}"),
1246 Some("Perl") => format!("perl script, {suffix}"),
1247 Some("PHP") => format!("php script, {suffix}"),
1248 Some("Shell") => format!("shell script, {suffix}"),
1249 Some("Bash") => format!("bash script, {suffix}"),
1250 Some("Zsh") => format!("zsh script, {suffix}"),
1251 Some("Fish") => format!("fish script, {suffix}"),
1252 Some("Ksh") => format!("ksh script, {suffix}"),
1253 Some("JavaScript") => format!("javascript script, {suffix}"),
1254 Some("TypeScript") => format!("typescript script, {suffix}"),
1255 Some("PowerShell") => format!("powershell script, {suffix}"),
1256 Some("Awk") => format!("awk script, {suffix}"),
1257 _ => format!("script, {suffix}"),
1258 }
1259}
1260
1261fn source_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1262 let suffix = text_label(bytes);
1263 match programming_language {
1264 Some("C") => format!("C source, {suffix}"),
1265 Some("C++") => format!("C++ source, {suffix}"),
1266 Some("Java") => format!("Java source, {suffix}"),
1267 Some("C#") => format!("C# source, {suffix}"),
1268 Some("F#") => format!("F# source, {suffix}"),
1269 Some("Go") => format!("Go source, {suffix}"),
1270 Some("Rust") => format!("Rust source, {suffix}"),
1271 Some("Starlark") => format!("Starlark source, {suffix}"),
1272 Some("CMake") => format!("CMake source, {suffix}"),
1273 Some("Meson") => format!("Meson source, {suffix}"),
1274 Some("Nix") => format!("Nix source, {suffix}"),
1275 Some("Groovy") => format!("Groovy source, {suffix}"),
1276 Some("Makefile") => format!("Makefile source, {suffix}"),
1277 Some("Dockerfile") => format!("Dockerfile source, {suffix}"),
1278 Some("Jamfile") => format!("Jamfile source, {suffix}"),
1279 Some("Batchfile") => format!("Batchfile source, {suffix}"),
1280 Some(language) => format!("{language} source, {suffix}"),
1281 None => text_file_type(bytes),
1282 }
1283}
1284
1285fn text_file_type(bytes: &[u8]) -> String {
1286 text_label(bytes).to_string()
1287}
1288
/// Classifies text content by two properties: whether it is valid UTF-8 and
/// whether it contains at least one `\n` byte.
fn text_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.iter().any(|&byte| byte == b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text",
        (true, false) => "UTF-8 Unicode text, with no line terminators",
        (false, true) => "text",
        (false, false) => "text, with no line terminators",
    }
}
1302
/// Classifies executable text content by two properties: whether it is valid
/// UTF-8 and whether it contains at least one `\n` byte.
fn text_executable_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.iter().any(|&byte| byte == b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text executable",
        (true, false) => "UTF-8 Unicode text executable, with no line terminators",
        (false, true) => "text executable",
        (false, false) => "text executable, with no line terminators",
    }
}
1316
1317fn supported_image_metadata_format(
1318 ext: Option<&str>,
1319 detected_format: FileFormat,
1320) -> Option<ImageFormat> {
1321 match ext {
1322 Some("jpg" | "jpeg") => Some(ImageFormat::Jpeg),
1323 Some("png") => Some(ImageFormat::Png),
1324 Some("tif" | "tiff") => Some(ImageFormat::Tiff),
1325 Some("webp") => Some(ImageFormat::WebP),
1326 _ => match detected_format.media_type() {
1327 "image/jpeg" => Some(ImageFormat::Jpeg),
1328 "image/png" => Some(ImageFormat::Png),
1329 "image/tiff" => Some(ImageFormat::Tiff),
1330 "image/webp" => Some(ImageFormat::WebP),
1331 _ => None,
1332 },
1333 }
1334}
1335
1336fn should_skip_binary_string_extraction(
1337 path: &Path,
1338 bytes: &[u8],
1339 detected_format: FileFormat,
1340) -> bool {
1341 matches!(lower_extension(path).as_deref(), Some("pdf"))
1342 || supported_image_metadata_format(lower_extension(path).as_deref(), detected_format)
1343 .is_some()
1344 || (matches!(
1345 detected_format.kind(),
1346 FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
1347 ) && !is_textual_format(detected_format))
1348 || media_mime_from_content(bytes).is_some()
1349 || is_zip_archive(bytes)
1350 || looks_like_gzip(bytes)
1351 || looks_like_bzip2(bytes)
1352 || looks_like_xz(bytes)
1353 || looks_like_deb(bytes, path)
1354 || looks_like_rpm(bytes, path)
1355 || looks_like_squashfs(bytes, path)
1356}
1357
1358fn should_skip_large_opaque_binary_text_extraction(
1359 _path: &Path,
1360 bytes: &[u8],
1361 detected_format: FileFormat,
1362) -> bool {
1363 is_large_opaque_binary_candidate(bytes, detected_format)
1364 && !sample_has_promising_printable_strings(bytes)
1365}
1366
1367fn is_large_opaque_binary_candidate(bytes: &[u8], detected_format: FileFormat) -> bool {
1368 bytes.len() >= LARGE_OPAQUE_BINARY_SKIP_BYTES
1369 && !is_textual_format(detected_format)
1370 && !matches!(
1371 detected_format.kind(),
1372 FileFormatKind::Archive
1373 | FileFormatKind::Compressed
1374 | FileFormatKind::Package
1375 | FileFormatKind::Audio
1376 | FileFormatKind::Image
1377 | FileFormatKind::Video
1378 )
1379}
1380
/// Computes the `(start, end)` byte ranges sampled from a buffer of `len` bytes.
///
/// Up to three windows of at most 64 KiB are produced, in this order: the
/// head, a window centred on the midpoint (only when `len` exceeds two
/// windows), and the tail (only when `len` exceeds one window). Empty and
/// duplicate ranges are dropped.
fn sampled_printable_window_ranges(len: usize) -> Vec<(usize, usize)> {
    const WINDOW_BYTES: usize = 64 * 1024;

    let head = Some((0, len.min(WINDOW_BYTES)));
    let middle = (len > WINDOW_BYTES * 2).then(|| {
        let start = len / 2 - WINDOW_BYTES / 2;
        (start, (start + WINDOW_BYTES).min(len))
    });
    let tail = (len > WINDOW_BYTES).then(|| (len - WINDOW_BYTES, len));

    let mut windows: Vec<(usize, usize)> = Vec::with_capacity(3);
    for candidate in [head, middle, tail].into_iter().flatten() {
        let (start, end) = candidate;
        if start < end && !windows.contains(&candidate) {
            windows.push(candidate);
        }
    }
    windows
}
1403
1404fn sample_has_promising_printable_strings(bytes: &[u8]) -> bool {
1405 let mut structured_signal_seen = false;
1406 let promising_license_windows = sampled_printable_window_ranges(bytes.len())
1407 .into_iter()
1408 .filter(|&(start, end)| {
1409 let window = &bytes[start..end];
1410 if has_strong_structured_text_signal(window) {
1411 structured_signal_seen = true;
1412 }
1413 has_license_or_notice_signal(window)
1414 })
1415 .count();
1416
1417 structured_signal_seen || promising_license_windows >= 2
1418}
1419
1420fn extract_sampled_printable_strings(bytes: &[u8]) -> String {
1421 let mut combined_lines = BTreeSet::new();
1422
1423 for (start, end) in sampled_printable_window_ranges(bytes.len()) {
1424 let window_text = extract_printable_strings(&bytes[start..end]);
1425 for line in window_text
1426 .lines()
1427 .map(str::trim)
1428 .filter(|line| !line.is_empty())
1429 {
1430 combined_lines.insert(line.to_string());
1431 }
1432 }
1433
1434 combined_lines.into_iter().collect::<Vec<_>>().join("\n")
1435}
1436
1437fn has_license_or_notice_signal(bytes: &[u8]) -> bool {
1438 let strings = extract_printable_strings(bytes);
1439 if strings.is_empty() {
1440 return false;
1441 }
1442
1443 let lower = strings.to_ascii_lowercase();
1444 [
1445 "copyright",
1446 "license",
1447 "licensed under",
1448 "all rights reserved",
1449 "permission is hereby granted",
1450 "redistribution and use",
1451 "spdx-license-identifier",
1452 ]
1453 .iter()
1454 .any(|marker| lower.contains(marker))
1455}
1456
1457fn has_strong_structured_text_signal(bytes: &[u8]) -> bool {
1458 let strings = extract_printable_strings(bytes);
1459 if strings.is_empty() {
1460 return false;
1461 }
1462
1463 let email_markers = strings.matches('@').count();
1464 let url_markers = strings.matches("http://").count() + strings.matches("https://").count();
1465
1466 email_markers + url_markers >= 3
1467}
1468
1469fn is_supported_image_container(bytes: &[u8], format: ImageFormat) -> bool {
1470 match format {
1471 ImageFormat::Png => bytes.starts_with(b"\x89PNG\r\n\x1a\n"),
1472 ImageFormat::Jpeg => bytes.starts_with(&[0xff, 0xd8, 0xff]),
1473 ImageFormat::Tiff => bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a"),
1474 ImageFormat::WebP => {
1475 bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP"
1476 }
1477 _ => false,
1478 }
1479}
1480
1481fn extract_image_metadata_text(bytes: &[u8], format: ImageFormat) -> String {
1482 let mut values = Vec::new();
1483 values.extend(extract_exif_metadata_values(bytes));
1484 values.extend(extract_xmp_metadata_values(bytes, format));
1485 values_to_text(values)
1486}
1487
1488fn extract_exif_metadata_values(bytes: &[u8]) -> Vec<String> {
1489 let mut cursor = BufReader::new(Cursor::new(bytes));
1490 let exif = match exif::Reader::new().read_from_container(&mut cursor) {
1491 Ok(exif) => exif,
1492 Err(_) => return Vec::new(),
1493 };
1494
1495 let mut values = Vec::new();
1496 for field in exif.fields() {
1497 let rendered = match field.tag {
1498 exif::Tag::ImageDescription | exif::Tag::Copyright | exif::Tag::UserComment => {
1499 Some(field.display_value().with_unit(&exif).to_string())
1500 }
1501 exif::Tag::Artist => Some(format!(
1502 "Author: {}",
1503 field.display_value().with_unit(&exif)
1504 )),
1505 _ => None,
1506 };
1507
1508 if let Some(rendered) = rendered {
1509 values.push(rendered);
1510 }
1511 }
1512
1513 values
1514}
1515
1516fn extract_xmp_metadata_values(bytes: &[u8], format: ImageFormat) -> Vec<String> {
1517 let xmp = match extract_raw_xmp_packet(bytes, format) {
1518 Some(xmp) => xmp,
1519 None => return Vec::new(),
1520 };
1521
1522 parse_xmp_values(&xmp)
1523}
1524
1525fn extract_raw_xmp_packet(bytes: &[u8], format: ImageFormat) -> Option<Vec<u8>> {
1526 let reader = ImageReader::with_format(BufReader::new(Cursor::new(bytes)), format);
1527 if let Ok(mut decoder) = reader.into_decoder()
1528 && let Ok(Some(xmp)) = decoder.xmp_metadata()
1529 {
1530 return Some(xmp);
1531 }
1532
1533 match format {
1534 ImageFormat::Png => extract_png_xmp_packet(bytes),
1535 _ => None,
1536 }
1537}
1538
1539fn extract_png_xmp_packet(bytes: &[u8]) -> Option<Vec<u8>> {
1540 const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";
1541
1542 if bytes.len() < PNG_SIGNATURE.len() || &bytes[..PNG_SIGNATURE.len()] != PNG_SIGNATURE {
1543 return None;
1544 }
1545
1546 let mut offset = PNG_SIGNATURE.len();
1547 while offset + 12 <= bytes.len() {
1548 let length = u32::from_be_bytes([
1549 bytes[offset],
1550 bytes[offset + 1],
1551 bytes[offset + 2],
1552 bytes[offset + 3],
1553 ]) as usize;
1554 let chunk_start = offset + 8;
1555 let chunk_end = chunk_start + length;
1556 if chunk_end + 4 > bytes.len() {
1557 return None;
1558 }
1559
1560 let chunk_type = &bytes[offset + 4..offset + 8];
1561 if chunk_type == b"iTXt" {
1562 let data = &bytes[chunk_start..chunk_end];
1563 if let Some(xmp) = parse_png_itxt_xmp(data) {
1564 return Some(xmp);
1565 }
1566 }
1567
1568 offset = chunk_end + 4;
1569 }
1570
1571 None
1572}
1573
1574fn parse_png_itxt_xmp(data: &[u8]) -> Option<Vec<u8>> {
1575 const XMP_KEYWORD: &[u8] = b"XML:com.adobe.xmp";
1576
1577 let keyword_end = data.iter().position(|&b| b == 0)?;
1578 if &data[..keyword_end] != XMP_KEYWORD {
1579 return None;
1580 }
1581
1582 let mut cursor = keyword_end + 1;
1583 let compression_flag = *data.get(cursor)?;
1584 cursor += 1;
1585 let compression_method = *data.get(cursor)?;
1586 cursor += 1;
1587 if compression_flag > 1 || (compression_flag == 1 && compression_method != 0) {
1588 return None;
1589 }
1590
1591 let language_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1592 cursor = language_end + 1;
1593
1594 let translated_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1595 cursor = translated_end + 1;
1596
1597 let text_bytes = &data[cursor..];
1598 if compression_flag == 1 {
1599 let mut decoder = ZlibDecoder::new(text_bytes);
1600 let mut decoded = Vec::new();
1601 decoder.read_to_end(&mut decoded).ok()?;
1602 Some(decoded)
1603 } else {
1604 Some(text_bytes.to_vec())
1605 }
1606}
1607
1608fn parse_xmp_values(xmp: &[u8]) -> Vec<String> {
1609 let mut reader = XmlReader::from_reader(xmp);
1610 reader.config_mut().trim_text(true);
1611
1612 let mut buf = Vec::new();
1613 let mut stack: Vec<String> = Vec::new();
1614 let mut values = Vec::new();
1615
1616 loop {
1617 match reader.read_event_into(&mut buf) {
1618 Ok(Event::Start(e)) => {
1619 stack.push(local_xml_name(e.name().as_ref()));
1620 }
1621 Ok(Event::End(_)) => {
1622 stack.pop();
1623 }
1624 Ok(Event::Empty(_)) => {}
1625 Ok(Event::Text(text)) => {
1626 if let Some(field) = stack
1627 .iter()
1628 .rev()
1629 .find_map(|name| allowed_xmp_field(name.as_str()))
1630 && let Ok(decoded) = text.decode()
1631 {
1632 let decoded = decoded.into_owned();
1633 if !decoded.trim().is_empty() {
1634 values.push(format_xmp_value(field, &decoded));
1635 }
1636 }
1637 }
1638 Ok(Event::CData(text)) => {
1639 if let Some(field) = stack
1640 .iter()
1641 .rev()
1642 .find_map(|name| allowed_xmp_field(name.as_str()))
1643 && let Ok(decoded) = text.decode()
1644 {
1645 let decoded = decoded.into_owned();
1646 if !decoded.trim().is_empty() {
1647 values.push(format_xmp_value(field, &decoded));
1648 }
1649 }
1650 }
1651 Ok(Event::Eof) | Err(_) => break,
1652 _ => {}
1653 }
1654 buf.clear();
1655 }
1656
1657 values
1658}
1659
/// Returns the part of an XML element name after its last `:`; the whole
/// name when there is no colon. Names that are not valid UTF-8 become the
/// empty string.
fn local_xml_name(name: &[u8]) -> String {
    let qualified = std::str::from_utf8(name).unwrap_or_default();
    let local = match qualified.rfind(':') {
        Some(colon) => &qualified[colon + 1..],
        None => qualified,
    };
    local.to_owned()
}
1664
/// Maps an XML local element name to the internal field label used for XMP
/// extraction, or `None` when the element is not one of the extracted fields.
/// Matching is case-sensitive.
fn allowed_xmp_field(name: &str) -> Option<&'static str> {
    const FIELD_LABELS: [(&str, &str); 7] = [
        ("creator", "creator"),
        ("rights", "rights"),
        ("description", "description"),
        ("title", "title"),
        ("subject", "subject"),
        ("UsageTerms", "usage_terms"),
        ("WebStatement", "web_statement"),
    ];

    FIELD_LABELS
        .iter()
        .find(|(element, _)| *element == name)
        .map(|&(_, label)| label)
}
1677
/// Renders an XMP value for output: `creator` values gain an `"Author: "`
/// prefix, every other field is returned unchanged.
fn format_xmp_value(field: &str, value: &str) -> String {
    if field == "creator" {
        format!("Author: {value}")
    } else {
        value.to_owned()
    }
}
1684
1685fn values_to_text(values: Vec<String>) -> String {
1686 let mut seen = BTreeSet::new();
1687 let mut lines = Vec::new();
1688 let mut total_bytes = 0usize;
1689
1690 for value in values {
1691 if lines.len() >= MAX_IMAGE_METADATA_VALUES {
1692 break;
1693 }
1694
1695 let normalized = normalize_metadata_value(&value);
1696 if normalized.is_empty() || !seen.insert(normalized.clone()) {
1697 continue;
1698 }
1699
1700 let added_bytes = normalized.len() + usize::from(!lines.is_empty());
1701 if total_bytes + added_bytes > MAX_IMAGE_METADATA_TEXT_BYTES {
1702 break;
1703 }
1704
1705 total_bytes += added_bytes;
1706 lines.push(normalized);
1707 }
1708
1709 lines.join("\n")
1710}
1711
/// Cleans a metadata value: NUL characters are removed, then every run of
/// whitespace collapses to a single space with no leading or trailing space.
fn normalize_metadata_value(value: &str) -> String {
    let without_nuls = value.replace('\0', "");

    let mut normalized = String::with_capacity(without_nuls.len());
    for word in without_nuls.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
1723
1724fn extract_pdf_text(path: &Path, bytes: &[u8]) -> (String, Option<String>) {
1725 if bytes.len() < 5 || &bytes[..5] != b"%PDF-" {
1726 return (String::new(), None);
1727 }
1728
1729 let mut failures = Vec::new();
1730 let mut saw_success = false;
1731
1732 let extracted = catch_unwind(AssertUnwindSafe(
1733 || -> Result<String, Box<dyn std::error::Error>> {
1734 let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1735 extract_first_pdf_page_text(&mut document)
1736 },
1737 ));
1738 match extracted {
1739 Ok(Ok(text)) => {
1740 saw_success = true;
1741 if let Some(normalized) = normalize_pdf_text(text) {
1742 return (normalized, None);
1743 }
1744 }
1745 Ok(Err(err)) => failures.push(format!("from-bytes first-page: {err}")),
1746 Err(payload) => failures.push(format!(
1747 "from-bytes first-page panic: {}",
1748 panic_payload_to_string(payload.as_ref())
1749 )),
1750 }
1751
1752 let extracted = catch_unwind(AssertUnwindSafe(
1753 || -> Result<String, Box<dyn std::error::Error>> {
1754 let mut document = pdf_oxide::document::PdfDocument::open(path)?;
1755 extract_pdf_text_from_document(&mut document)
1756 },
1757 ));
1758 match extracted {
1759 Ok(Ok(text)) => {
1760 saw_success = true;
1761 if let Some(normalized) = normalize_pdf_text(text) {
1762 return (normalized, None);
1763 }
1764 }
1765 Ok(Err(err)) => failures.push(format!("open full-document: {err}")),
1766 Err(payload) => failures.push(format!(
1767 "open full-document panic: {}",
1768 panic_payload_to_string(payload.as_ref())
1769 )),
1770 }
1771
1772 let extracted = catch_unwind(AssertUnwindSafe(
1773 || -> Result<String, Box<dyn std::error::Error>> {
1774 let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1775 extract_pdf_text_from_document(&mut document)
1776 },
1777 ));
1778 match extracted {
1779 Ok(Ok(text)) => {
1780 saw_success = true;
1781 if let Some(normalized) = normalize_pdf_text(text) {
1782 return (normalized, None);
1783 }
1784 }
1785 Ok(Err(err)) => failures.push(format!("from-bytes full-document: {err}")),
1786 Err(payload) => failures.push(format!(
1787 "from-bytes full-document panic: {}",
1788 panic_payload_to_string(payload.as_ref())
1789 )),
1790 }
1791
1792 if saw_success || is_non_actionable_pdf_failure(&failures) {
1793 (String::new(), None)
1794 } else {
1795 (
1796 String::new(),
1797 Some(format!(
1798 "PDF text extraction failed after {} attempts: {}",
1799 failures.len(),
1800 failures.join("; ")
1801 )),
1802 )
1803 }
1804}
1805
/// Returns `true` when `failures` is non-empty and every message contains one
/// of the known substrings for failures that are deliberately not reported
/// (password/encryption problems and invalid cross-reference tables).
fn is_non_actionable_pdf_failure(failures: &[String]) -> bool {
    const NON_ACTIONABLE_MARKERS: [&str; 5] = [
        "requires a password",
        "Encrypt dictionary missing /O",
        "Encrypt dictionary missing /U",
        "security handler cannot be found",
        "Invalid cross-reference table",
    ];

    if failures.is_empty() {
        return false;
    }

    failures.iter().all(|failure| {
        NON_ACTIONABLE_MARKERS
            .iter()
            .any(|&marker| failure.contains(marker))
    })
}
1816
/// Renders a panic payload as text. `&str` and `String` payloads are returned
/// verbatim; any other payload type becomes `"unknown panic payload"`.
fn panic_payload_to_string(payload: &(dyn std::any::Any + Send)) -> String {
    payload
        .downcast_ref::<&str>()
        .map(|message| (*message).to_string())
        .or_else(|| payload.downcast_ref::<String>().cloned())
        .unwrap_or_else(|| "unknown panic payload".to_string())
}
1826
1827fn extract_first_pdf_page_text(
1828 document: &mut pdf_oxide::document::PdfDocument,
1829) -> Result<String, Box<dyn std::error::Error>> {
1830 if document.page_count()? == 0 {
1831 return Ok(String::new());
1832 }
1833
1834 let extracted_text = document.extract_text(0)?;
1835 let markdown_text =
1836 document.to_markdown(0, &pdf_oxide::converters::ConversionOptions::default())?;
1837 if pdf_markdown_heading_lines(&markdown_text).is_empty() {
1838 return Ok(extracted_text);
1839 }
1840
1841 let pipeline_text =
1842 document.to_plain_text(0, &pdf_oxide::converters::ConversionOptions::default())?;
1843
1844 Ok(merge_pdf_first_page_text(
1845 &extracted_text,
1846 &markdown_text,
1847 &pipeline_text,
1848 ))
1849}
1850
1851fn extract_pdf_text_from_document(
1852 document: &mut pdf_oxide::document::PdfDocument,
1853) -> Result<String, Box<dyn std::error::Error>> {
1854 Ok(document.to_plain_text_all(&pdf_oxide::converters::ConversionOptions::default())?)
1855}
1856
/// Converts carriage returns and form feeds to `\n`, returning `None` when
/// the result contains nothing but whitespace.
fn normalize_pdf_text(text: String) -> Option<String> {
    let unified: String = text
        .chars()
        .map(|ch| match ch {
            '\r' | '\u{0c}' => '\n',
            other => other,
        })
        .collect();

    if unified.trim().is_empty() {
        None
    } else {
        Some(unified)
    }
}
1861
1862fn merge_pdf_first_page_text(
1863 _extracted_text: &str,
1864 markdown_text: &str,
1865 pipeline_text: &str,
1866) -> String {
1867 let pipeline = pipeline_text.trim();
1868 if pipeline.is_empty() {
1869 return String::new();
1870 }
1871
1872 let prefix = pdf_first_page_heading_prefix(markdown_text);
1873 let Some(prefix) = prefix else {
1874 return pipeline_text.to_string();
1875 };
1876
1877 if pdf_text_contains_heading_prefix(pipeline, &prefix) {
1878 pipeline_text.to_string()
1879 } else {
1880 format!("{prefix}\n\n{pipeline}")
1881 }
1882}
1883
1884fn pdf_text_contains_heading_prefix(text: &str, prefix: &str) -> bool {
1885 normalize_pdf_heading_comparison_text(text)
1886 .contains(&normalize_pdf_heading_comparison_text(prefix))
1887}
1888
/// Produces a comparison key for heading text: whitespace-separated words are
/// ASCII-lowercased and rejoined with single spaces.
fn normalize_pdf_heading_comparison_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(&word.to_ascii_lowercase());
    }
    normalized
}
1895
1896fn pdf_first_page_heading_prefix(markdown_text: &str) -> Option<String> {
1897 let mut lines = Vec::new();
1898
1899 for line in pdf_markdown_heading_lines(markdown_text) {
1900 push_unique_line(&mut lines, line);
1901 }
1902
1903 (!lines.is_empty()).then(|| lines.join("\n"))
1904}
1905
1906fn pdf_markdown_heading_lines(text: &str) -> Vec<String> {
1907 text.lines()
1908 .map(str::trim)
1909 .filter_map(|line| line.strip_prefix('#').map(str::trim_start))
1910 .map(|line| line.trim_matches('#').trim())
1911 .filter(|line| !line.is_empty())
1912 .filter(|line| !looks_like_numbered_section_heading(line))
1913 .take(4)
1914 .map(ToOwned::to_owned)
1915 .collect()
1916}
1917
/// Appends `line` to `lines` unless an equal entry is already present.
fn push_unique_line(lines: &mut Vec<String>, line: String) {
    if lines.contains(&line) {
        return;
    }
    lines.push(line);
}
1923
/// Reports whether `line` starts like a numbered section heading: one or more
/// ASCII digits immediately followed by a `.` (for example `1. Scope`,
/// `10. Definitions` or `1.2 Grant`).
///
/// Section numbers of any length are recognised, so headings from the tenth
/// section onward are classified the same way as single-digit ones.
fn looks_like_numbered_section_heading(line: &str) -> bool {
    let after_digits = line.trim_start_matches(|ch: char| ch.is_ascii_digit());

    // At least one digit must have been consumed before the dot.
    after_digits.len() < line.len() && after_digits.starts_with('.')
}
1936
/// Reports whether `bytes` begins with one of the three `PK` signatures
/// `PK\x03\x04`, `PK\x05\x06` or `PK\x07\x08`.
fn is_zip_archive(bytes: &[u8]) -> bool {
    matches!(
        bytes,
        [b'P', b'K', 0x03, 0x04, ..] | [b'P', b'K', 0x05, 0x06, ..] | [b'P', b'K', 0x07, 0x08, ..]
    )
}
1942
1943pub fn extract_printable_strings(bytes: &[u8]) -> String {
1944 const MIN_LEN: usize = 4;
1945 const MIN_OUTPUT_BYTES: usize = 2_000_000;
1946 const MAX_OUTPUT_BYTES_CAP: usize = 16_000_000;
1947
1948 let max_output_bytes = bytes.len().clamp(MIN_OUTPUT_BYTES, MAX_OUTPUT_BYTES_CAP);
1949
1950 fn is_printable_ascii(b: u8) -> bool {
1951 matches!(b, 0x20..=0x7E)
1952 }
1953
1954 let mut out = String::new();
1955 let mut run: Vec<u8> = Vec::new();
1956
1957 let flush_run = |out: &mut String, run: &mut Vec<u8>| {
1958 if run.len() >= MIN_LEN {
1959 if !out.is_empty() {
1960 out.push('\n');
1961 }
1962 out.push_str(&String::from_utf8_lossy(run));
1963 }
1964 run.clear();
1965 };
1966
1967 for &b in bytes {
1968 if is_printable_ascii(b) {
1969 run.push(b);
1970 } else {
1971 flush_run(&mut out, &mut run);
1972 if out.len() >= max_output_bytes {
1973 return out;
1974 }
1975 }
1976 }
1977 flush_run(&mut out, &mut run);
1978 if out.len() >= max_output_bytes {
1979 return out;
1980 }
1981
1982 for start in 0..=1 {
1983 run.clear();
1984 let mut i = start;
1985 while i + 1 < bytes.len() {
1986 let b0 = bytes[i];
1987 let b1 = bytes[i + 1];
1988 let (ch, zero) = if start == 0 { (b0, b1) } else { (b1, b0) };
1989 if is_printable_ascii(ch) && zero == 0 {
1990 run.push(ch);
1991 } else {
1992 flush_run(&mut out, &mut run);
1993 if out.len() >= max_output_bytes {
1994 return out;
1995 }
1996 }
1997 i += 2;
1998 }
1999 flush_run(&mut out, &mut run);
2000 if out.len() >= max_output_bytes {
2001 return out;
2002 }
2003 }
2004
2005 out
2006}
2007
2008#[cfg(test)]
2009mod tests {
2010 use std::path::Path;
2011
2012 use super::{
2013 ExtractedTextKind, LARGE_OPAQUE_BINARY_SKIP_BYTES, classify_file_info,
2014 extract_printable_strings, extract_text_for_detection,
2015 extract_text_for_detection_with_diagnostics, is_non_actionable_pdf_failure,
2016 normalize_mime_type, normalize_pdf_heading_comparison_text,
2017 windows_metadata_or_empty_result,
2018 };
2019
2020 #[test]
2021 fn test_extract_text_for_detection_skips_jar_archives() {
2022 let path = Path::new(
2023 "testdata/license-golden/datadriven/lic1/do-not_detect-licenses-in-archive.jar",
2024 );
2025 let bytes = std::fs::read(path).expect("failed to read jar fixture");
2026
2027 let (text, kind) = extract_text_for_detection(path, &bytes);
2028
2029 assert!(text.is_empty());
2030 assert_eq!(kind, ExtractedTextKind::None);
2031 }
2032
2033 #[test]
2034 fn test_extract_text_for_detection_reads_pdf_fixture_text() {
2035 let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
2036 let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2037
2038 let (text, kind) = extract_text_for_detection(path, &bytes);
2039
2040 assert_eq!(kind, ExtractedTextKind::Pdf);
2041 assert!(text.contains("Redistribution and use in source and binary forms"));
2042 }
2043
2044 #[test]
2045 fn test_extract_text_for_detection_prefers_first_pdf_page_before_full_document() {
2046 let path =
2047 Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
2048 let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2049
2050 let (text, kind) = extract_text_for_detection(path, &bytes);
2051
2052 assert_eq!(kind, ExtractedTextKind::Pdf);
2053 assert!(text.contains("SUN INDUSTRY STANDARDS SOURCE LICENSE"));
2054 assert!(!text.contains("DISCLAIMER OF WARRANTY"));
2055 }
2056
2057 #[test]
2058 fn test_extract_text_for_detection_does_not_duplicate_pdf_heading_prefix() {
2059 let path =
2060 Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
2061 let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2062
2063 let (text, kind) = extract_text_for_detection(path, &bytes);
2064
2065 assert_eq!(kind, ExtractedTextKind::Pdf);
2066
2067 let normalized = normalize_pdf_heading_comparison_text(&text);
2068 let heading =
2069 normalize_pdf_heading_comparison_text("SUN INDUSTRY STANDARDS SOURCE LICENSE");
2070 assert_eq!(normalized.matches(&heading).count(), 1);
2071 }
2072
2073 #[test]
2074 fn test_extract_text_for_detection_reads_pdf_fixture_without_pdf_extension() {
2075 let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
2076 let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2077
2078 let (text, kind) = extract_text_for_detection(Path::new("renamed.bin"), &bytes);
2079
2080 assert_eq!(kind, ExtractedTextKind::Pdf);
2081 assert!(text.contains("Redistribution and use in source and binary forms"));
2082 }
2083
2084 #[test]
2085 fn test_extract_text_for_detection_reports_terminal_pdf_failure() {
2086 let malformed = b"%PDF-1.7\nthis is not a valid pdf object graph\n";
2087
2088 let (text, kind, scan_error) =
2089 extract_text_for_detection_with_diagnostics(Path::new("broken.pdf"), malformed);
2090
2091 assert!(text.is_empty());
2092 assert_eq!(kind, ExtractedTextKind::None);
2093 let scan_error = scan_error.expect("terminal pdf failure should be surfaced");
2094 assert!(scan_error.contains("PDF text extraction failed after"));
2095 }
2096
2097 #[test]
2098 fn test_extract_text_for_detection_skips_large_opaque_binary_blobs() {
2099 let bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2100
2101 let (text, kind) = extract_text_for_detection(Path::new("model.bin"), &bytes);
2102
2103 assert!(text.is_empty());
2104 assert_eq!(kind, ExtractedTextKind::None);
2105 }
2106
2107 #[test]
2108 fn test_extract_text_for_detection_keeps_large_binaries_with_promising_strings() {
2109 let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2110 let text = b"Copyright 2026 Example Project!!!";
2111 bytes[..text.len()].copy_from_slice(text);
2112 let second_offset = LARGE_OPAQUE_BINARY_SKIP_BYTES / 2;
2113 bytes[second_offset..second_offset + text.len()].copy_from_slice(text);
2114
2115 let (text, kind) = extract_text_for_detection(Path::new("weights.bin"), &bytes);
2116
2117 assert_ne!(kind, ExtractedTextKind::None);
2118 assert!(text.contains("Copyright 2026 Example Project"));
2119 }
2120
2121 #[test]
2122 fn test_extract_text_for_detection_skips_large_binary_with_unstructured_runs() {
2123 let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2124 let noise = b"(c) $1234567890ABCDEF[]{}--==++";
2125 bytes[..noise.len()].copy_from_slice(noise);
2126 let second_offset = LARGE_OPAQUE_BINARY_SKIP_BYTES / 2;
2127 bytes[second_offset..second_offset + noise.len()].copy_from_slice(noise);
2128
2129 let (text, kind) = extract_text_for_detection(Path::new("tensor.bin"), &bytes);
2130
2131 assert!(text.is_empty());
2132 assert_eq!(kind, ExtractedTextKind::None);
2133 }
2134
2135 #[test]
2136 fn test_extract_text_for_detection_uses_windows_executable_metadata() {
2137 let path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
2138 let bytes = std::fs::read(path).expect("read PE fixture");
2139
2140 let (text, kind) = extract_text_for_detection(path, &bytes);
2141
2142 assert_eq!(kind, ExtractedTextKind::BinaryStrings);
2143 assert!(text.contains("License: This program is free software"));
2144 assert!(text.contains("LegalCopyright:"));
2145 }
2146
2147 #[test]
2148 fn test_extract_text_for_detection_keeps_windows_metadata_for_large_pe_without_sampled_signal()
2149 {
2150 let path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
2151 let mut bytes = std::fs::read(path).expect("read PE fixture");
2152 bytes.resize(LARGE_OPAQUE_BINARY_SKIP_BYTES + 8, 0);
2153
2154 let (text, kind) = extract_text_for_detection(path, &bytes);
2155
2156 assert_ne!(kind, ExtractedTextKind::None);
2157 assert!(!text.trim().is_empty());
2158 }
2159
2160 #[test]
2161 fn test_windows_metadata_or_empty_result_preserves_metadata() {
2162 let (text, kind, scan_error) =
2163 windows_metadata_or_empty_result(Some("LegalCopyright: Example Corp".to_string()));
2164
2165 assert_eq!(kind, ExtractedTextKind::WindowsExecutableMetadata);
2166 assert_eq!(text, "LegalCopyright: Example Corp");
2167 assert!(scan_error.is_none());
2168 }
2169
2170 #[test]
2171 fn test_extract_text_for_detection_skips_large_binary_with_single_isolated_string_run() {
2172 let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2173 let text = b"Copyright 2026 Example Project!!!";
2174 bytes[..text.len()].copy_from_slice(text);
2175
2176 let (text, kind) = extract_text_for_detection(Path::new("opaque.bin"), &bytes);
2177
2178 assert!(text.is_empty());
2179 assert_eq!(kind, ExtractedTextKind::None);
2180 }
2181
2182 #[test]
2183 fn test_extract_text_for_detection_keeps_large_binary_with_single_contact_rich_window() {
2184 let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2185 let text = b"Andreas Schneider <asn@redhat.com> Rob Crittenden (rcritten@redhat.com) Mr. Sam <sam@email-scan.com> https://publicsuffix.org/ http://tukaani.org/xz/";
2186 bytes[..text.len()].copy_from_slice(text);
2187
2188 let (text, kind) = extract_text_for_detection(Path::new("rootfs.bin"), &bytes);
2189
2190 assert_ne!(kind, ExtractedTextKind::None);
2191 assert!(text.contains("asn@redhat.com"));
2192 assert!(text.contains("https://publicsuffix.org/"));
2193 }
2194
/// Password, cross-reference and encryption-handler failures are reported as
/// non-actionable, while an unrelated parser failure is not.
#[test]
fn test_non_actionable_pdf_failures_are_suppressed() {
    let password_required = [
        "from-bytes first-page: PDF is encrypted and requires a password",
        "open full-document: PDF is encrypted and requires a password",
    ]
    .map(String::from);
    assert!(is_non_actionable_pdf_failure(&password_required));

    let broken_xref = [
        "from-bytes first-page: Invalid cross-reference table",
        "open full-document: Invalid cross-reference table",
    ]
    .map(String::from);
    assert!(is_non_actionable_pdf_failure(&broken_xref));

    let broken_encryption = [
        "from-bytes first-page: Invalid PDF: Encrypt dictionary missing /O",
        "open full-document: Invalid PDF: security handler cannot be found",
    ]
    .map(String::from);
    assert!(is_non_actionable_pdf_failure(&broken_encryption));

    let unrelated = ["from-bytes first-page: some other parser failure"].map(String::from);
    assert!(!is_non_actionable_pdf_failure(&unrelated));
}
2213
/// Zip-magic payloads named `.whl` and `.crate` produce no extracted text.
#[test]
fn test_extract_text_for_detection_skips_zip_like_archives() {
    let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00\x08\x00artifact";

    // Run both extractions up front, in the same order as the assertions below.
    let [(whl_text, whl_kind), (crate_text, crate_kind)] = ["demo.whl", "demo.crate"]
        .map(|file_name| extract_text_for_detection(Path::new(file_name), zip_bytes));

    assert!(whl_text.is_empty());
    assert_eq!(whl_kind, ExtractedTextKind::None);
    assert!(crate_text.is_empty());
    assert_eq!(crate_kind, ExtractedTextKind::None);
}
2227
/// The `.lib` fixture on disk must yield some extracted text that contains
/// its copyright statement.
#[test]
fn test_extract_text_for_detection_keeps_binary_strings_for_lib_fixtures() {
    let fixture =
        Path::new("testdata/copyright-golden/copyrights/copyright_php_lib-php_embed_lib.lib");
    let contents = std::fs::read(fixture).expect("failed to read lib fixture");

    let (extracted, extracted_kind) = extract_text_for_detection(fixture, &contents);

    assert_ne!(extracted_kind, ExtractedTextKind::None);
    assert!(extracted.contains("Copyright nexB and others (c) 2012"));
}
2239
/// The TTF fixture is reported as font metadata and its text mentions the
/// license description and the Open Font License.
#[test]
fn test_extract_text_for_detection_reads_font_metadata() {
    let fixture = Path::new("testdata/font-fixtures/Lato-Bold.ttf");
    let contents = std::fs::read(fixture).expect("failed to read font fixture");

    let (text, extracted_kind) = extract_text_for_detection(fixture, &contents);

    assert_eq!(extracted_kind, ExtractedTextKind::FontMetadata);
    assert!(text.contains("License Description:"), "{text}");

    // Either the spelled-out name or the abbreviation is accepted.
    let mentions_ofl = text.contains("Open Font License") || text.contains("OFL");
    assert!(mentions_ofl, "{text}");
}
2254
/// 525,000 NUL-separated `abcd` runs must yield more than 2,000,000 bytes of
/// output, and the output must end with the final run.
#[test]
fn test_extract_printable_strings_scales_cap_for_medium_binary_files() {
    let input = b"abcd\0".repeat(525_000);

    let strings = extract_printable_strings(&input);

    let extracted_len = strings.len();
    assert!(
        extracted_len > 2_000_000,
        "unexpected truncation at {}",
        extracted_len
    );
    assert!(strings.ends_with("abcd"));
}
2268
/// The SVG fixture is reported as decoded text and keeps its Creative Commons
/// public-domain URL.
#[test]
fn test_extract_text_for_detection_decodes_svg_fixture_text() {
    let fixture = Path::new(
        "testdata/license-golden/datadriven/external/fossology-tests/Public-domain/biohazard.svg",
    );
    let contents = std::fs::read(fixture).expect("failed to read svg fixture");

    let (extracted, extracted_kind) = extract_text_for_detection(fixture, &contents);

    assert_eq!(extracted_kind, ExtractedTextKind::Decoded);
    assert!(extracted.contains("creativecommons.org/licenses/publicdomain"));
}
2281
/// The RTF fixture is reported as decoded text containing the expected LGPL
/// phrases.
#[test]
fn test_extract_text_for_detection_decodes_rtf_fixture_text() {
    let fixture = Path::new(
        "testdata/license-golden/datadriven/external/fossology-tests/LGPL/License.rtf",
    );
    let contents = std::fs::read(fixture).expect("failed to read rtf fixture");

    let (extracted, extracted_kind) = extract_text_for_detection(fixture, &contents);

    assert_eq!(extracted_kind, ExtractedTextKind::Decoded);
    assert!(extracted.contains("GNU Lesser General Public"));
    assert!(extracted.contains("version"));
    assert!(extracted.contains("2.1 of the License"));
}
2296
/// Textual TypeScript content turns a `video/mp2t` guess into `text/plain`.
#[test]
fn test_normalize_mime_type_prefers_text_for_textual_video_guess() {
    let source_path = Path::new("main.ts");

    let normalized = normalize_mime_type(
        source_path,
        b"export const answer = 42;\n",
        Some("TypeScript"),
        "video/mp2t",
    );

    assert_eq!(normalized, "text/plain");
}
2309
/// Textual JavaScript content turns an `application/octet-stream` guess into
/// `text/plain`.
#[test]
fn test_normalize_mime_type_prefers_text_for_octet_stream_source_guess() {
    let source_path = Path::new("main.js");

    let normalized = normalize_mime_type(
        source_path,
        b"console.log('hello');\n",
        Some("JavaScript"),
        "application/octet-stream",
    );

    assert_eq!(normalized, "text/plain");
}
2322
/// A `video/mp2t` guess is kept when the bytes are not textual, even though
/// the language hint says TypeScript.
#[test]
fn test_normalize_mime_type_preserves_binary_video_guess() {
    let binary_payload: [u8; 8] = [0, 159, 146, 150, 0, 1, 2, 3];

    let normalized = normalize_mime_type(
        Path::new("main.ts"),
        &binary_payload,
        Some("TypeScript"),
        "video/mp2t",
    );

    assert_eq!(normalized, "video/mp2t");
}
2335
/// A four-byte binary payload keeps its `application/octet-stream` guess.
#[test]
fn test_normalize_mime_type_preserves_short_binary_octet_stream_guess() {
    let binary_payload: [u8; 4] = [0, 159, 146, 150];

    let normalized = normalize_mime_type(
        Path::new("main.ts"),
        &binary_payload,
        Some("TypeScript"),
        "application/octet-stream",
    );

    assert_eq!(normalized, "application/octet-stream");
}
2348
/// A zero-length `.txt` file is classified as empty text with no language.
#[test]
fn test_classify_file_info_marks_empty_files_as_text_not_source() {
    let no_contents: &[u8] = b"";

    let info = classify_file_info(Path::new("test.txt"), no_contents);

    assert_eq!(info.mime_type, "inode/x-empty");
    assert_eq!(info.file_type, "empty");
    assert!(!info.is_binary);
    assert!(info.is_text);
    assert!(!info.is_source);
    assert_eq!(info.programming_language, None);
}
2360
/// Valid JSON is text with a JSON MIME type, but is neither source code nor
/// assigned a programming language.
#[test]
fn test_classify_file_info_keeps_json_out_of_programming_language() {
    let manifest = br#"{"name":"demo"}"#;

    let info = classify_file_info(Path::new("package.json"), manifest);

    assert_eq!(info.mime_type, "application/json");
    assert_eq!(info.file_type, "JSON text data");
    assert!(info.is_text);
    assert!(!info.is_source);
    assert_eq!(info.programming_language, None);
}
2371
/// Textual content that fails to parse as JSON is classified as plain text
/// despite the `.json` extension.
#[test]
fn test_classify_file_info_does_not_label_invalid_json_text_as_json() {
    let malformed = b"{ definitely not json\n";

    let info = classify_file_info(Path::new("broken.json"), malformed);

    assert_eq!(info.mime_type, "text/plain");
    assert_eq!(info.file_type, "UTF-8 Unicode text");
    assert!(info.is_text);
    assert!(!info.is_binary);
}
2382
/// Binary bytes behind a `.json` extension are classified as opaque data.
#[test]
fn test_classify_file_info_does_not_label_binary_json_garbage_as_json() {
    let garbage: [u8; 4] = [0xff, 0x00, 0x01, 0x02];

    let info = classify_file_info(Path::new("broken.json"), &garbage);

    assert_eq!(info.mime_type, "application/octet-stream");
    assert_eq!(info.file_type, "data");
    assert!(info.is_binary);
    assert!(!info.is_text);
}
2393
/// A UTF-16 payload with a leading `FF FE` byte-order mark that encodes a
/// JSON array is classified as JSON text.
#[test]
fn test_classify_file_info_treats_valid_utf16_json_with_bom_as_text() {
    // BOM followed by `["é"]`, two bytes per character, low byte first.
    let utf16_payload: [u8; 12] = [
        0xFF, 0xFE, 0x5B, 0x00, 0x22, 0x00, 0xE9, 0x00, 0x22, 0x00, 0x5D, 0x00,
    ];

    let info = classify_file_info(Path::new("utf16.json"), &utf16_payload);

    assert!(!info.is_binary);
    assert!(info.is_text);
    assert_eq!(info.mime_type, "application/json");
    assert_eq!(info.file_type, "JSON text data");
}
2408
/// A bare `true` literal in a `.json` file is classified as JSON text.
#[test]
fn test_classify_file_info_treats_small_valid_json_literals_as_text() {
    let literal = b"true";

    let info = classify_file_info(Path::new("true.json"), literal);

    assert!(!info.is_binary);
    assert!(info.is_text);
    assert_eq!(info.mime_type, "application/json");
    assert_eq!(info.file_type, "JSON text data");
}
2418
/// A JSON-shaped payload whose string body contains an invalid UTF-8 byte is
/// classified as plain text rather than JSON or binary.
#[test]
fn test_classify_file_info_treats_json_wrapped_invalid_utf8_sequences_as_text() {
    // `["` + two valid multi-byte sequences + a stray 0xFA + `"]`.
    let wrapped_payload: [u8; 10] = [0x5B, 0x22, 0xE6, 0x97, 0xA5, 0xD1, 0x88, 0xFA, 0x22, 0x5D];

    let info = classify_file_info(Path::new("wrapped.json"), &wrapped_payload);

    assert!(!info.is_binary);
    assert!(info.is_text);
    assert_eq!(info.mime_type, "text/plain");
    assert_eq!(info.file_type, "text, with no line terminators");
}
2431
/// A JSON-shaped payload whose only string byte is `0xFF` is classified as
/// binary data.
#[test]
fn test_classify_file_info_keeps_lone_ff_json_byte_binary() {
    // `["` + 0xFF + `"]`.
    let lone_ff_payload: [u8; 5] = [0x5B, 0x22, 0xFF, 0x22, 0x5D];

    let info = classify_file_info(Path::new("lone-ff.json"), &lone_ff_payload);

    assert!(info.is_binary);
    assert!(!info.is_text);
    assert_eq!(info.mime_type, "application/octet-stream");
    assert_eq!(info.file_type, "data");
}
2442
/// A `.json` payload that opens with high bytes and a run of NULs is
/// classified as binary.
#[test]
fn test_classify_file_info_keeps_nul_heavy_crash_json_binary() {
    let crash_payload: [u8; 11] = [
        0xFE, 0x90, 0x00, 0x00, 0x00, 0x93, 0x5B, 0x5B, 0x32, 0x38, 0x36,
    ];

    let info = classify_file_info(Path::new("crash.json"), &crash_payload);

    assert!(info.is_binary);
    assert!(!info.is_text);
    assert_eq!(info.mime_type, "application/octet-stream");
}
2456
/// A file named `Dockerfile` is labelled with the Dockerfile language and is
/// source, but not a script.
#[test]
fn test_classify_file_info_treats_dockerfile_as_source() {
    let info = classify_file_info(Path::new("Dockerfile"), b"FROM scratch\n");

    let detected_language = info.programming_language.as_deref();
    assert_eq!(detected_language, Some("Dockerfile"));
    assert!(info.is_source);
    assert!(!info.is_script);
    assert_eq!(info.file_type, "Dockerfile source, UTF-8 Unicode text");
}
2472
/// A `Makefile` is plain text: no language, not source and not a script.
#[test]
fn test_classify_file_info_treats_makefile_as_text_not_source() {
    let recipe = b"all:\n\techo hi\n";

    let info = classify_file_info(Path::new("Makefile"), recipe);

    assert_eq!(info.programming_language, None);
    assert!(info.is_text);
    assert!(!info.is_source);
    assert!(!info.is_script);
    assert_eq!(info.file_type, "UTF-8 Unicode text");
}
2483
/// Zip-magic payloads named `.egg` and `.nupkg` are classified as Zip
/// archives.
#[test]
fn test_classify_file_info_marks_supported_package_archives() {
    let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00";

    // Classify both packages up front, in the same order as the assertions.
    let [egg, nupkg] = ["demo.egg", "demo.nupkg"]
        .map(|file_name| classify_file_info(Path::new(file_name), zip_bytes));

    assert!(egg.is_archive);
    assert_eq!(egg.mime_type, "application/zip");
    assert_eq!(egg.file_type, "Zip archive data");
    assert!(nupkg.is_archive);
    assert_eq!(nupkg.mime_type, "application/zip");
    assert_eq!(nupkg.file_type, "Zip archive data");
}
2498
/// A PNG-signature payload is binary media, and is not text, an archive or
/// source.
#[test]
fn test_classify_file_info_marks_png_as_binary_media() {
    let png_header = b"\x89PNG\r\n\x1a\n\x00\x00\x00\x0dIHDR";

    let info = classify_file_info(Path::new("logo.png"), png_header);

    assert_eq!(info.mime_type, "image/png");
    assert_eq!(info.file_type, "PNG image data");
    assert!(info.is_binary);
    assert!(!info.is_text);
    assert!(info.is_media);
    assert!(!info.is_archive);
    assert!(!info.is_source);
}
2513
/// A `%PDF-` payload is a binary PDF document, and is not text, an archive or
/// media.
#[test]
fn test_classify_file_info_marks_pdf_as_binary_document() {
    let pdf_header = b"%PDF-1.7\n1 0 obj\n<< /Type /Catalog >>\n";

    let info = classify_file_info(Path::new("report.pdf"), pdf_header);

    assert_eq!(info.mime_type, "application/pdf");
    assert_eq!(info.file_type, "PDF document");
    assert!(info.is_binary);
    assert!(!info.is_text);
    assert!(!info.is_archive);
    assert!(!info.is_media);
}
2527
/// An arbitrary binary blob is binary, with no language and no source flag.
#[test]
fn test_classify_file_info_marks_binary_blobs_as_binary() {
    let blob: [u8; 10] = [0, 159, 146, 150, 0, 1, 2, 3, 4, 5];

    let info = classify_file_info(Path::new("blob.bin"), &blob);

    assert!(info.is_binary);
    assert!(!info.is_text);
    assert!(!info.is_source);
    assert_eq!(info.programming_language, None);
}
2538
/// YAML is labelled as YAML text data, with no language and no source flag.
#[test]
fn test_classify_file_info_treats_yaml_as_text_not_source() {
    let document = b"key: value\n";

    let info = classify_file_info(Path::new("config.yaml"), document);

    assert_eq!(info.programming_language, None);
    assert!(info.is_text);
    assert!(!info.is_source);
    assert_eq!(info.file_type, "YAML text data");
}
2548
/// Gradle, Nix and CMake files are classified as source in their languages,
/// while `.gitmodules` is Git configuration text with no language.
#[test]
fn test_classify_file_info_classifies_common_build_manifests() {
    let gradle_info = classify_file_info(Path::new("build.gradle"), b"plugins { id 'java' }\n");
    let nix_info = classify_file_info(Path::new("flake.nix"), b"{ inputs, ... }: {}\n");
    let cmake_info = classify_file_info(
        Path::new("toolchain.cmake"),
        b"set(CMAKE_CXX_STANDARD 20)\n",
    );
    let gitmodules_info = classify_file_info(
        Path::new(".gitmodules"),
        b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
    );

    // Gradle build script.
    assert_eq!(gradle_info.programming_language.as_deref(), Some("Groovy"));
    assert!(gradle_info.is_source);
    assert_eq!(gradle_info.mime_type, "text/plain");
    assert_eq!(gradle_info.file_type, "Groovy source, UTF-8 Unicode text");

    // Nix flake.
    assert_eq!(nix_info.programming_language.as_deref(), Some("Nix"));
    assert!(nix_info.is_source);
    assert_eq!(nix_info.mime_type, "text/plain");
    assert_eq!(nix_info.file_type, "Nix source, UTF-8 Unicode text");

    // CMake toolchain file.
    assert_eq!(cmake_info.programming_language.as_deref(), Some("CMake"));
    assert!(cmake_info.is_source);
    assert_eq!(cmake_info.file_type, "CMake source, UTF-8 Unicode text");

    // Git submodule configuration.
    assert_eq!(gitmodules_info.programming_language, None);
    assert!(gitmodules_info.is_text);
    assert!(!gitmodules_info.is_source);
    assert_eq!(gitmodules_info.file_type, "Git configuration text");
}
2581
/// A `.hpp` file is C++ source, whereas a `.ipp` file is plain text with no
/// language.
#[test]
fn test_classify_file_info_labels_cpp_headers_and_ipp_separately() {
    let hpp_info = classify_file_info(
        Path::new("include/demo.hpp"),
        b"#pragma once\nclass Demo {};\n",
    );
    let ipp_info = classify_file_info(
        Path::new("include/detail/demo.ipp"),
        b"template <class T> void parse() {}\n",
    );

    // `.hpp` header.
    assert_eq!(hpp_info.programming_language.as_deref(), Some("C++"));
    assert!(hpp_info.is_source);
    assert!(!hpp_info.is_script);
    assert_eq!(hpp_info.file_type, "C++ source, UTF-8 Unicode text");

    // `.ipp` implementation include.
    assert_eq!(ipp_info.programming_language, None);
    assert!(!ipp_info.is_source);
    assert!(!ipp_info.is_script);
    assert_eq!(ipp_info.file_type, "UTF-8 Unicode text");
}
2603
/// An extensionless file with a `bash` shebang is labelled as a Bash script.
#[test]
fn test_classify_file_info_preserves_specific_shell_family_labels() {
    let script = b"#!/usr/bin/env bash\necho hi\n";

    let bash_info = classify_file_info(Path::new("bin/run"), script);

    assert_eq!(bash_info.programming_language.as_deref(), Some("Bash"));
    assert!(bash_info.is_script);
    assert_eq!(bash_info.file_type, "bash script, UTF-8 Unicode text executable");
}
2612
/// A file named `Jamfile` is source in the Jamfile language, not a script.
#[test]
fn test_classify_file_info_marks_jamfile_as_source() {
    let rules = b"lib boost_json ;\n";

    let jam_info = classify_file_info(Path::new("Jamfile"), rules);

    assert_eq!(jam_info.programming_language.as_deref(), Some("Jamfile"));
    assert!(jam_info.is_source);
    assert!(!jam_info.is_script);
    assert_eq!(jam_info.file_type, "Jamfile source, UTF-8 Unicode text");
}
2622
/// An extensionless file with a `node` shebang is labelled as a JavaScript
/// script.
#[test]
fn test_classify_file_info_labels_javascript_shebang_scripts() {
    let script = b"#!/usr/bin/env node\nconsole.log('hello');\n";

    let info = classify_file_info(Path::new("bin/run"), script);

    let detected_language = info.programming_language.as_deref();
    assert_eq!(detected_language, Some("JavaScript"));
    assert!(info.is_script);
    assert_eq!(
        info.file_type,
        "javascript script, UTF-8 Unicode text executable"
    );
}
2640
/// A Python script containing a raw `0xE9` byte is still a Python script, and
/// its file type omits the UTF-8 wording.
#[test]
fn test_classify_file_info_uses_non_utf8_text_labels_for_latin1_scripts() {
    let script = b"# coding: latin-1\nprint(\"caf\xe9\")\n";

    let info = classify_file_info(Path::new("script.py"), script);

    let detected_language = info.programming_language.as_deref();
    assert_eq!(detected_language, Some("Python"));
    assert!(info.is_script);
    assert_eq!(info.file_type, "python script, text executable");
}
2655
/// A `.tga` file with textual content is flagged as media and as text, not
/// binary.
#[test]
fn test_classify_file_info_treats_textual_tga_as_media() {
    let textual_contents = b"not really a tga\n";

    let info = classify_file_info(Path::new("texture.tga"), textual_contents);

    assert!(info.is_media);
    assert!(info.is_text);
    assert!(!info.is_binary);
}
2664
/// High-bit bytes behind a `.ts` extension are binary, with no language and
/// no source flag.
#[test]
fn test_classify_file_info_keeps_binaryish_source_extension_out_of_text_path() {
    let high_bytes: [u8; 6] = [0x80, 0x81, 0x82, 0x83, 0x84, 0x85];

    let info = classify_file_info(Path::new("main.ts"), &high_bytes);

    assert!(info.is_binary);
    assert!(!info.is_text);
    assert!(!info.is_source);
    assert_eq!(info.programming_language, None);
}
2675
/// A tiny GIF payload yields no extracted text.
#[test]
fn test_extract_text_for_detection_skips_unsupported_image_formats() {
    let gif_payload = b"GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff,\x00\x00\x00\x00\x01\x00\x01\x00\x00\x02\x02D\x01\x00;";

    let (extracted, extracted_kind) =
        extract_text_for_detection(Path::new("tiny.gif"), gif_payload);

    assert!(extracted.is_empty());
    assert_eq!(extracted_kind, ExtractedTextKind::None);
}
2685
/// Table-driven check of the language, `is_source` and `is_script` results
/// for a shebang script, a Dockerfile, JSON, YAML and a Makefile.
#[test]
fn test_classify_file_info_preserves_language_detection_precedence_matrix() {
    /// One row of the matrix: an input file and the classification expected for it.
    struct Expectation {
        path: &'static str,
        contents: &'static [u8],
        language: Option<&'static str>,
        is_source: bool,
        is_script: bool,
    }

    let matrix = [
        Expectation {
            path: "bin/run",
            contents: b"#!/usr/bin/env node\nconsole.log('hello');\n",
            language: Some("JavaScript"),
            is_source: true,
            is_script: true,
        },
        Expectation {
            path: "Dockerfile",
            contents: b"FROM scratch\n",
            language: Some("Dockerfile"),
            is_source: true,
            is_script: false,
        },
        Expectation {
            path: "package.json",
            contents: br#"{"name":"demo"}"#,
            language: None,
            is_source: false,
            is_script: false,
        },
        Expectation {
            path: "config.yaml",
            contents: b"key: value\n",
            language: None,
            is_source: false,
            is_script: false,
        },
        Expectation {
            path: "Makefile",
            contents: b"all:\n\techo hi\n",
            language: None,
            is_source: false,
            is_script: false,
        },
    ];

    for expected in &matrix {
        let path = Path::new(expected.path);
        let info = classify_file_info(path, expected.contents);

        assert_eq!(
            info.programming_language.as_deref(),
            expected.language,
            "unexpected language for {}",
            path.display()
        );
        assert_eq!(
            info.is_source,
            expected.is_source,
            "unexpected is_source for {}",
            path.display()
        );
        assert_eq!(
            info.is_script,
            expected.is_script,
            "unexpected is_script for {}",
            path.display()
        );
    }
}
2749}