1use std::borrow::Cow;
2use std::collections::BTreeSet;
3use std::fs;
4use std::io::{BufReader, Cursor, Read};
5use std::panic::{AssertUnwindSafe, catch_unwind};
6use std::path::Path;
7
8use chrono::{TimeZone, Utc};
9use file_format::{FileFormat, Kind as FileFormatKind};
10use flate2::read::ZlibDecoder;
11use glob::Pattern;
12use image::{ImageDecoder, ImageFormat, ImageReader};
13use mime_guess::from_path;
14use quick_xml::events::Event;
15use quick_xml::reader::Reader as XmlReader;
16
17use crate::parsers::windows_executable::extract_windows_executable_metadata_text;
18use crate::utils::font::extract_font_metadata_text;
19use crate::utils::language::detect_language;
20
/// Identifies which extraction strategy produced the text handed to detection.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtractedTextKind {
    /// No usable text was extracted; the accompanying string is empty.
    None,
    /// The bytes were decoded as text (RTF text extraction also reports this kind).
    Decoded,
    /// Text came from font metadata.
    FontMetadata,
    /// Text came from PDF extraction.
    Pdf,
    /// Text is a collection of printable strings pulled out of binary content.
    BinaryStrings,
    /// Text came from image metadata.
    ImageMetadata,
    /// Only Windows executable metadata was available.
    WindowsExecutableMetadata,
}
31
/// Classification facts computed for a single file by `classify_file_info`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileInfoClassification {
    /// Detected MIME type, e.g. `text/plain` or `application/zip`.
    pub mime_type: String,
    /// Human-readable description of the content, e.g. `PNG image data`.
    pub file_type: String,
    /// Detected programming language; only populated when `is_source` is true.
    pub programming_language: Option<String>,
    /// True when the content is treated as binary.
    pub is_binary: bool,
    /// True when the content is treated as text (the negation of `is_binary`).
    pub is_text: bool,
    /// True when the file is an archive, compressed stream, or package.
    pub is_archive: bool,
    /// True when the file is an image, audio, or video file.
    pub is_media: bool,
    /// True when the file is considered source code.
    pub is_source: bool,
    /// True when the file is considered a script.
    pub is_script: bool,
}
44
// NOTE(review): the two image-metadata limits are not referenced in the visible
// part of this file; presumably they cap the number of metadata values and the
// total metadata text size — confirm at their use sites.
const MAX_IMAGE_METADATA_VALUES: usize = 64;
const MAX_IMAGE_METADATA_TEXT_BYTES: usize = 32 * 1024;
/// Content counts as binary when its control bytes exceed `len / DIVISOR`.
const BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR: usize = 10;
// NOTE(review): presumably the size threshold for treating an opaque binary as
// "large" — its use is outside the visible part of this file; confirm.
const LARGE_OPAQUE_BINARY_SKIP_BYTES: usize = 512 * 1024;
/// Inputs larger than this are never parsed as JSON during validation.
const JSON_VALIDATION_MAX_BYTES: usize = 4 * 1024 * 1024;
/// Lowercase extensions whose files are never classified as source code.
const PLAIN_TEXT_EXTENSIONS: &[&str] = &[
    "rst", "rest", "md", "txt", "log", "json", "xml", "yaml", "yml", "toml", "ini",
];
/// Lowercase extensions classified as binary unless the detected format is textual.
const BINARY_EXTENSIONS: &[&str] = &[
    "pyc", "pyo", "pgm", "pbm", "ppm", "mp3", "mp4", "mpeg", "mpg", "emf",
];
/// Lowercase extensions that mark a non-text file as an archive.
const ARCHIVE_EXTENSIONS: &[&str] = &[
    "zip", "jar", "war", "ear", "tar", "gz", "tgz", "bz2", "xz", "7z", "rar", "apk", "deb", "rpm",
    "whl", "crate", "egg", "gem", "nupkg", "sqs", "squashfs",
];
60
61pub fn get_creation_date(metadata: &fs::Metadata) -> Option<String> {
63 metadata.modified().ok().map(|time: std::time::SystemTime| {
64 let seconds_since_epoch = time
65 .duration_since(std::time::UNIX_EPOCH)
66 .unwrap()
67 .as_secs() as i64;
68
69 Utc.timestamp_opt(seconds_since_epoch, 0)
70 .single()
71 .unwrap_or_else(Utc::now)
72 .format("%Y-%m-%d")
73 .to_string()
74 })
75}
76
77pub fn is_path_excluded(path: &Path, exclude_patterns: &[Pattern]) -> bool {
79 let path_str = path.to_string_lossy();
80 let file_name = path
81 .file_name()
82 .map(|name| name.to_string_lossy())
83 .unwrap_or_default();
84
85 for pattern in exclude_patterns {
86 if pattern.matches(&path_str) {
88 return true;
89 }
90
91 if pattern.matches(&file_name) {
93 return true;
94 }
95 }
96
97 false
98}
99
100pub fn decode_bytes_to_string(bytes: &[u8]) -> String {
106 match String::from_utf8(bytes.to_vec()) {
107 Ok(s) => s,
108 Err(e) => {
109 let bytes = e.into_bytes();
110 if has_binary_control_chars(&bytes) {
111 return String::new();
112 }
113 bytes.iter().map(|&b| b as char).collect()
114 }
115 }
116}
117
118pub fn extract_text_for_detection(path: &Path, bytes: &[u8]) -> (String, ExtractedTextKind) {
119 let (text, kind, _) = extract_text_for_detection_with_diagnostics(path, bytes);
120 (text, kind)
121}
122
123pub(crate) fn augment_license_detection_text<'a>(path: &Path, text: &'a str) -> Cow<'a, str> {
124 let Some(extension) = path.extension().and_then(|ext| ext.to_str()) else {
125 return Cow::Borrowed(text);
126 };
127 if !matches!(
128 extension.to_ascii_lowercase().as_str(),
129 "md" | "markdown" | "html" | "htm"
130 ) {
131 return Cow::Borrowed(text);
132 }
133
134 let mut hints = Vec::new();
135 if text.contains("CC BY 4.0") || text.contains("creativecommons.org/licenses/by/4.0") {
136 hints.push("Creative Commons Attribution 4.0 International License".to_string());
137 }
138 if text.contains("Apache License (Version 2.0)") || text.contains("Apache License, Version 2.0")
139 {
140 hints.push(
141 "Licensed under the Apache License, Version 2.0. http://www.apache.org/licenses/LICENSE-2.0"
142 .to_string(),
143 );
144 }
145
146 hints.extend(extract_shields_license_badge_hints(text));
147
148 if hints.is_empty() {
149 Cow::Borrowed(text)
150 } else {
151 let mut augmented =
152 String::with_capacity(text.len() + hints.iter().map(String::len).sum::<usize>() + 8);
153 augmented.push_str(text);
154 augmented.push_str("\n\n");
155 for (index, hint) in hints.into_iter().enumerate() {
156 if index > 0 {
157 augmented.push('\n');
158 }
159 augmented.push_str(&hint);
160 }
161 Cow::Owned(augmented)
162 }
163}
164
165fn extract_shields_license_badge_hints(text: &str) -> Vec<String> {
166 let mut hints = Vec::new();
167 let mut rest = text;
168 let needle = "img.shields.io/badge/license-";
169
170 while let Some(index) = rest.find(needle) {
171 let start = index + needle.len();
172 let suffix = &rest[start..];
173 let end = suffix
174 .find([')', ']', '"', '\'', ' ', '\n'])
175 .unwrap_or(suffix.len());
176 let badge = &suffix[..end];
177 let Some(badge) = badge.strip_suffix(".svg") else {
178 rest = &suffix[end..];
179 continue;
180 };
181
182 let mut segments: Vec<_> = badge
183 .split('-')
184 .filter(|segment| !segment.is_empty())
185 .collect();
186 if segments.len() < 2 {
187 rest = &suffix[end..];
188 continue;
189 }
190 segments.pop();
191 let candidate = segments.join("-").replace("%20", " ").replace('_', "-");
192 if !candidate.is_empty() {
193 hints.push(canonical_shields_license_hint(&candidate));
194 }
195
196 rest = &suffix[end..];
197 }
198
199 hints.sort();
200 hints.dedup();
201 hints
202}
203
/// Turns a badge label into a license phrase.
///
/// `MIT` and the two Apache 2.0 spellings get fixed phrases; any other label
/// is trimmed and suffixed with ` License`.
fn canonical_shields_license_hint(candidate: &str) -> String {
    let label = candidate.trim();
    if label == "MIT" {
        "The MIT License".to_string()
    } else if label == "Apache-2.0" || label == "Apache 2.0" {
        "Apache License 2.0".to_string()
    } else {
        format!("{label} License")
    }
}
211
212pub(crate) fn extract_text_for_detection_with_diagnostics(
213 path: &Path,
214 bytes: &[u8],
215) -> (String, ExtractedTextKind, Option<String>) {
216 let ext = path
217 .extension()
218 .and_then(|e| e.to_str())
219 .map(|s| s.to_ascii_lowercase());
220 let detected_format = detect_file_format(bytes);
221
222 if looks_like_rtf(bytes, ext.as_deref()) {
223 let text = extract_rtf_text(bytes);
224 return if text.trim().is_empty() {
225 (String::new(), ExtractedTextKind::None, None)
226 } else {
227 (text, ExtractedTextKind::Decoded, None)
228 };
229 }
230
231 if looks_like_pdf(bytes) || detected_format.short_name() == Some("PDF") {
232 let (text, scan_error) = extract_pdf_text(path, bytes);
233 return if text.is_empty() {
234 (String::new(), ExtractedTextKind::None, scan_error)
235 } else {
236 (text, ExtractedTextKind::Pdf, None)
237 };
238 }
239
240 if let Some(format) = supported_image_metadata_format(ext.as_deref(), detected_format) {
241 let text = extract_image_metadata_text(bytes, format);
242 return if text.is_empty() {
243 if is_supported_image_container(bytes, format) {
244 (String::new(), ExtractedTextKind::None, None)
245 } else {
246 let decoded = decode_bytes_to_string(bytes);
247 if decoded.is_empty() {
248 (String::new(), ExtractedTextKind::None, None)
249 } else {
250 (decoded, ExtractedTextKind::Decoded, None)
251 }
252 }
253 } else {
254 (text, ExtractedTextKind::ImageMetadata, None)
255 };
256 }
257
258 if let Some(text) = extract_font_metadata_text(path, bytes) {
259 return (text, ExtractedTextKind::FontMetadata, None);
260 }
261
262 let windows_executable_metadata_text = extract_windows_executable_metadata_text(bytes);
263 let large_opaque_binary = windows_executable_metadata_text.is_none()
264 && is_large_opaque_binary_candidate(bytes, detected_format);
265
266 if should_skip_large_opaque_binary_text_extraction(path, bytes, detected_format) {
267 return windows_metadata_or_empty_result(windows_executable_metadata_text);
268 }
269
270 if should_skip_binary_string_extraction(path, bytes, detected_format) {
271 return (String::new(), ExtractedTextKind::None, None);
272 }
273
274 if !large_opaque_binary {
275 let decoded = decode_bytes_to_string(bytes);
276 if !decoded.is_empty() {
277 let combined =
278 combine_extracted_text_fragments(windows_executable_metadata_text, decoded);
279 return (combined, ExtractedTextKind::Decoded, None);
280 }
281 }
282
283 let text = if large_opaque_binary {
284 extract_sampled_printable_strings(bytes)
285 } else {
286 extract_printable_strings(bytes)
287 };
288 if text.is_empty() {
289 windows_metadata_or_empty_result(windows_executable_metadata_text)
290 } else {
291 (
292 combine_extracted_text_fragments(windows_executable_metadata_text, text),
293 ExtractedTextKind::BinaryStrings,
294 None,
295 )
296 }
297}
298
/// Joins an optional prefix and a suffix with a newline.
///
/// A missing or empty prefix yields the suffix alone; an empty suffix yields
/// the prefix alone, so no stray newline is ever produced.
fn combine_extracted_text_fragments(prefix: Option<String>, suffix: String) -> String {
    let head = prefix.unwrap_or_default();
    if head.is_empty() {
        return suffix;
    }
    if suffix.is_empty() {
        return head;
    }
    format!("{head}\n{suffix}")
}
306
307fn windows_metadata_or_empty_result(
308 windows_executable_metadata_text: Option<String>,
309) -> (String, ExtractedTextKind, Option<String>) {
310 if let Some(metadata_text) = windows_executable_metadata_text {
311 (
312 metadata_text,
313 ExtractedTextKind::WindowsExecutableMetadata,
314 None,
315 )
316 } else {
317 (String::new(), ExtractedTextKind::None, None)
318 }
319}
320
321pub fn classify_file_info(path: &Path, bytes: &[u8]) -> FileInfoClassification {
322 let detected_format = detect_file_format(bytes);
323 let detected_language = detect_language(path, bytes);
324 let is_binary = detect_is_binary(path, bytes, detected_format, detected_language.as_deref());
325 let is_text = !is_binary;
326 let mime_type = detect_mime_type(path, bytes, detected_format, detected_language.as_deref());
327 let is_archive = detect_is_archive(path, bytes, &mime_type, is_text, detected_format);
328 let is_media = detect_is_media(path, bytes, &mime_type, detected_format);
329 let is_script = detect_is_script(path, bytes, detected_language.as_deref(), is_text);
330 let is_source = detect_is_source(path, detected_language.as_deref(), is_text, is_script);
331 let programming_language = is_source.then(|| detected_language.clone()).flatten();
332 let file_type = detect_file_type(
333 path,
334 bytes,
335 detected_format,
336 &mime_type,
337 programming_language.as_deref(),
338 is_binary,
339 is_text,
340 is_archive,
341 is_media,
342 is_script,
343 );
344
345 FileInfoClassification {
346 mime_type,
347 file_type,
348 programming_language,
349 is_binary,
350 is_text,
351 is_archive,
352 is_media,
353 is_source,
354 is_script,
355 }
356}
357
358fn detect_file_format(bytes: &[u8]) -> FileFormat {
359 FileFormat::from_reader(Cursor::new(bytes)).unwrap_or(FileFormat::ArbitraryBinaryData)
360}
361
/// Returns true when the bytes form valid UTF-8 (an empty slice is valid).
fn is_utf8_text(bytes: &[u8]) -> bool {
    matches!(std::str::from_utf8(bytes), Ok(_))
}
365
/// Decodes UTF-16 text that starts with a byte-order mark.
///
/// Returns `None` when the input has an odd length, lacks a UTF-16 BOM,
/// contains nothing after the BOM, or holds an unpaired surrogate.
fn decode_utf16_bom_text(bytes: &[u8]) -> Option<String> {
    if bytes.len() < 2 || bytes.len() % 2 != 0 {
        return None;
    }

    let (bom, body) = bytes.split_at(2);
    let to_code_unit: fn([u8; 2]) -> u16 = match bom {
        [0xFF, 0xFE] => u16::from_le_bytes,
        [0xFE, 0xFF] => u16::from_be_bytes,
        _ => return None,
    };

    // The total length is even, so the body after the 2-byte BOM is even too;
    // only the empty case needs rejecting.
    if body.is_empty() {
        return None;
    }

    let code_units = body
        .chunks_exact(2)
        .map(|pair| to_code_unit([pair[0], pair[1]]));

    let mut decoded = String::with_capacity(body.len() / 2);
    for unit in std::char::decode_utf16(code_units) {
        decoded.push(unit.ok()?);
    }
    Some(decoded)
}
396
397fn has_binary_control_chars(bytes: &[u8]) -> bool {
398 let control_count = bytes
399 .iter()
400 .filter(|&&b| b < 0x09 || (b > 0x0D && b < 0x20))
401 .count();
402 control_count > bytes.len() / BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR
403}
404
405fn has_decodable_text(bytes: &[u8]) -> bool {
406 bytes.is_empty()
407 || is_utf8_text(bytes)
408 || decode_utf16_bom_text(bytes).is_some()
409 || !has_binary_control_chars(bytes)
410}
411
412fn looks_like_textual_bytes(bytes: &[u8]) -> bool {
413 if bytes.is_empty() || is_utf8_text(bytes) {
414 return true;
415 }
416 if let Some(decoded) = decode_utf16_bom_text(bytes) {
417 return decoded
418 .chars()
419 .any(|ch| !ch.is_control() || matches!(ch, '\n' | '\r' | '\t'));
420 }
421
422 let printable_count = bytes
423 .iter()
424 .filter(|&&b| matches!(b, b'\n' | b'\r' | b'\t') || (0x20..=0x7e).contains(&b))
425 .count();
426 printable_count * 2 >= bytes.len()
427}
428
/// Returns true for media types that denote textual content: `text/*`, plain
/// JSON/XML application types, and structured-syntax `+json` / `+xml` types.
fn is_textual_media_type(media_type: &str) -> bool {
    const EXACT_TEXTUAL_TYPES: [&str; 3] = ["application/json", "application/xml", "text/xml"];
    const TEXTUAL_SUFFIXES: [&str; 2] = ["+json", "+xml"];

    if media_type.starts_with("text/") || EXACT_TEXTUAL_TYPES.contains(&media_type) {
        return true;
    }
    TEXTUAL_SUFFIXES
        .iter()
        .any(|suffix| media_type.ends_with(*suffix))
}
438
439fn is_textual_format(detected_format: FileFormat) -> bool {
440 matches!(detected_format, FileFormat::Empty | FileFormat::PlainText)
441 || is_textual_media_type(detected_format.media_type())
442}
443
444fn is_known_binary_format(detected_format: FileFormat) -> bool {
445 !matches!(detected_format, FileFormat::ArbitraryBinaryData)
446 && !is_textual_format(detected_format)
447}
448
449pub fn detect_mime_type(
450 path: &Path,
451 bytes: &[u8],
452 detected_format: FileFormat,
453 programming_language: Option<&str>,
454) -> String {
455 if bytes.is_empty() {
456 return "inode/x-empty".to_string();
457 }
458
459 if lower_extension(path).as_deref() == Some("json") {
460 if let Some(is_binary) = json_binary_override(bytes) {
461 if is_binary {
462 return "application/octet-stream".to_string();
463 }
464 if has_valid_json_text(bytes) {
465 return "application/json".to_string();
466 }
467 return "text/plain".to_string();
468 }
469 if has_valid_json_text(bytes) {
470 return "application/json".to_string();
471 }
472 if has_decodable_text(bytes) && looks_like_textual_bytes(bytes) {
473 return "text/plain".to_string();
474 }
475 return "application/octet-stream".to_string();
476 }
477
478 if is_zip_archive(bytes) {
479 return detect_zip_like_mime(path);
480 }
481
482 if looks_like_deb(bytes, path) {
483 return "application/vnd.debian.binary-package".to_string();
484 }
485
486 if looks_like_rpm(bytes, path) {
487 return "application/x-rpm".to_string();
488 }
489
490 let guessed_mime = from_path(path)
491 .first_or_octet_stream()
492 .essence_str()
493 .to_string();
494
495 let mime_type = match detected_format {
496 FileFormat::Empty => "inode/x-empty".to_string(),
497 FileFormat::PlainText => {
498 if guessed_mime == "application/octet-stream" || guessed_mime.starts_with("video/") {
499 "text/plain".to_string()
500 } else {
501 guessed_mime.clone()
502 }
503 }
504 _ => {
505 let detected_mime = detected_format.media_type();
506 if detected_mime == "application/octet-stream"
507 && guessed_mime != "application/octet-stream"
508 {
509 guessed_mime.clone()
510 } else {
511 detected_mime.to_string()
512 }
513 }
514 };
515
516 normalize_mime_type(path, bytes, programming_language, &mime_type)
517}
518
519fn normalize_mime_type(
520 path: &Path,
521 bytes: &[u8],
522 programming_language: Option<&str>,
523 mime_type: &str,
524) -> String {
525 if should_prefer_text_mime(path, bytes, programming_language, mime_type) {
526 return "text/plain".to_string();
527 }
528
529 mime_type.to_string()
530}
531
532fn should_prefer_text_mime(
533 path: &Path,
534 bytes: &[u8],
535 programming_language: Option<&str>,
536 mime_type: &str,
537) -> bool {
538 has_decodable_text(bytes)
539 && looks_like_textual_bytes(bytes)
540 && is_textual_source_candidate(path, programming_language)
541 && (mime_type.starts_with("video/") || mime_type == "application/octet-stream")
542}
543
544fn has_valid_json_text(bytes: &[u8]) -> bool {
545 if bytes.len() > JSON_VALIDATION_MAX_BYTES {
546 return false;
547 }
548
549 serde_json::from_slice::<serde_json::Value>(bytes).is_ok()
550 || decode_utf16_bom_text(bytes)
551 .and_then(|text| serde_json::from_str::<serde_json::Value>(&text).ok())
552 .is_some()
553}
554
/// Recognises content of the form `["…"]` that is at least eight bytes long
/// and contains neither NUL nor `0xFF` bytes.
fn is_wrapped_invalid_json_string_text(bytes: &[u8]) -> bool {
    if bytes.len() < 8 {
        return false;
    }
    let has_forbidden_byte = bytes.iter().any(|&byte| byte == 0 || byte == 0xFF);
    !has_forbidden_byte && bytes.starts_with(b"[\"") && bytes.ends_with(b"\"]")
}
562
563fn json_binary_override(bytes: &[u8]) -> Option<bool> {
564 if has_valid_json_text(bytes) || decode_utf16_bom_text(bytes).is_some() {
565 return Some(false);
566 }
567
568 if bytes.contains(&0) {
569 return Some(true);
570 }
571
572 if bytes.contains(&0xFF) && (bytes.len() <= 5 || bytes.len() > 1024) {
573 return Some(true);
574 }
575
576 if is_wrapped_invalid_json_string_text(bytes) {
577 return Some(false);
578 }
579
580 None
581}
582
583fn detect_is_binary(
584 path: &Path,
585 bytes: &[u8],
586 detected_format: FileFormat,
587 programming_language: Option<&str>,
588) -> bool {
589 if lower_extension(path).as_deref() == Some("json")
590 && let Some(is_binary) = json_binary_override(bytes)
591 {
592 return is_binary;
593 }
594
595 if is_textual_format(detected_format) {
596 return false;
597 }
598
599 if lower_extension(path)
600 .as_deref()
601 .is_some_and(|ext| BINARY_EXTENSIONS.contains(&ext))
602 {
603 return true;
604 }
605
606 if should_treat_binary_bytes_as_text(path, bytes, programming_language) {
607 return false;
608 }
609
610 has_binary_control_chars(bytes)
611 || is_known_binary_format(detected_format)
612 || (matches!(detected_format, FileFormat::ArbitraryBinaryData)
613 && !looks_like_textual_bytes(bytes))
614}
615
616fn should_treat_binary_bytes_as_text(
617 path: &Path,
618 bytes: &[u8],
619 programming_language: Option<&str>,
620) -> bool {
621 has_decodable_text(bytes)
622 && looks_like_textual_bytes(bytes)
623 && (bytes.starts_with(b"#!") || is_textual_source_candidate(path, programming_language))
624}
625
626fn detect_is_archive(
627 path: &Path,
628 bytes: &[u8],
629 mime_type: &str,
630 is_text: bool,
631 detected_format: FileFormat,
632) -> bool {
633 if is_text {
634 return false;
635 }
636
637 lower_extension(path)
638 .as_deref()
639 .is_some_and(|ext| ARCHIVE_EXTENSIONS.contains(&ext))
640 || matches!(
641 detected_format.kind(),
642 FileFormatKind::Archive | FileFormatKind::Compressed | FileFormatKind::Package
643 )
644 || is_zip_archive(bytes)
645 || looks_like_gzip(bytes)
646 || looks_like_bzip2(bytes)
647 || looks_like_xz(bytes)
648 || looks_like_deb(bytes, path)
649 || looks_like_rpm(bytes, path)
650 || looks_like_squashfs(bytes, path)
651 || mime_type.contains("zip")
652 || mime_type.contains("compressed")
653 || mime_type.contains("tar")
654 || mime_type.contains("x-rpm")
655 || mime_type.contains("debian")
656}
657
658fn detect_is_media(
659 path: &Path,
660 bytes: &[u8],
661 mime_type: &str,
662 detected_format: FileFormat,
663) -> bool {
664 media_mime_from_content(bytes).is_some()
665 || matches!(
666 detected_format.kind(),
667 FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
668 )
669 || mime_type.starts_with("image/")
670 || mime_type.starts_with("audio/")
671 || mime_type.starts_with("video/")
672 || (mime_type == "application/octet-stream"
673 && lower_extension(path).as_deref() == Some("tga")
674 && !has_binary_control_chars(bytes))
675}
676
677fn detect_is_script(
678 path: &Path,
679 bytes: &[u8],
680 programming_language: Option<&str>,
681 is_text: bool,
682) -> bool {
683 if !is_text || is_makefile(path) {
684 return false;
685 }
686
687 bytes.starts_with(b"#!")
688 || lower_extension(path).as_deref().is_some_and(|ext| {
689 matches!(
690 ext,
691 "sh" | "bash" | "zsh" | "fish" | "ksh" | "ps1" | "psm1" | "psd1" | "awk"
692 )
693 })
694 || matches!(
695 programming_language,
696 Some(
697 "Shell"
698 | "Bash"
699 | "Zsh"
700 | "Fish"
701 | "Ksh"
702 | "Python"
703 | "Ruby"
704 | "Perl"
705 | "PHP"
706 | "PowerShell"
707 | "Awk"
708 )
709 )
710}
711
712fn detect_is_source(
713 path: &Path,
714 programming_language: Option<&str>,
715 is_text: bool,
716 is_script: bool,
717) -> bool {
718 if !is_text || is_plain_text(path) || is_makefile(path) || is_source_map(path) {
719 return false;
720 }
721
722 if is_c_like_source(path) || is_java_like_source(path) {
723 return true;
724 }
725
726 programming_language.is_some() || is_script
727}
728
729#[allow(clippy::too_many_arguments)]
730fn detect_file_type(
731 path: &Path,
732 bytes: &[u8],
733 detected_format: FileFormat,
734 mime_type: &str,
735 programming_language: Option<&str>,
736 is_binary: bool,
737 is_text: bool,
738 is_archive: bool,
739 is_media: bool,
740 is_script: bool,
741) -> String {
742 if bytes.is_empty() {
743 return "empty".to_string();
744 }
745
746 if looks_like_pdf(bytes) {
747 return "PDF document".to_string();
748 }
749
750 if let Some(file_type) = media_file_type_from_content(bytes) {
751 return file_type.to_string();
752 }
753
754 if is_archive {
755 return archive_file_type(path, bytes, detected_format);
756 }
757
758 if is_script {
759 return script_file_type(programming_language, bytes);
760 }
761
762 if is_text {
763 if lower_extension(path).as_deref() == Some("json") {
764 if has_valid_json_text(bytes) {
765 return "JSON text data".to_string();
766 }
767 return text_file_type(bytes);
768 }
769 if lower_extension(path).as_deref() == Some("xml") {
770 return "XML text data".to_string();
771 }
772 if matches!(lower_extension(path).as_deref(), Some("yaml" | "yml")) {
773 return "YAML text data".to_string();
774 }
775 if lower_extension(path).as_deref() == Some("toml") {
776 return "TOML text data".to_string();
777 }
778 if matches!(
779 lower_extension(path).as_deref(),
780 Some("ini" | "cfg" | "conf")
781 ) {
782 return "INI text data".to_string();
783 }
784 if matches!(lower_file_name(path).as_str(), ".gitmodules" | ".gitconfig") {
785 return "Git configuration text".to_string();
786 }
787 if matches!(lower_extension(path).as_deref(), Some("md" | "markdown")) {
788 return text_file_type(bytes);
789 }
790 if programming_language.is_some() && !is_media {
791 return source_file_type(programming_language, bytes);
792 }
793 return text_file_type(bytes);
794 }
795
796 if let Some(file_type) = format_based_file_type(detected_format) {
797 return file_type;
798 }
799
800 if is_binary && mime_type == "application/octet-stream" {
801 return "data".to_string();
802 }
803
804 mime_type.to_string()
805}
806
807fn is_textual_source_candidate(path: &Path, programming_language: Option<&str>) -> bool {
808 if matches!(programming_language, Some(language) if is_source_like_language(language)) {
809 return true;
810 }
811
812 if matches!(
813 lower_file_name(path).as_str(),
814 "dockerfile"
815 | "containerfile"
816 | "containerfile.core"
817 | "apkbuild"
818 | "podfile"
819 | "jamfile"
820 | "jamroot"
821 | "meson.build"
822 | "build"
823 | "workspace"
824 | "buck"
825 | "default.nix"
826 | "flake.nix"
827 | "shell.nix"
828 ) {
829 return true;
830 }
831
832 path.extension()
833 .and_then(|ext| ext.to_str())
834 .is_some_and(|ext| {
835 matches!(
836 ext.to_ascii_lowercase().as_str(),
837 "rs" | "py"
838 | "js"
839 | "mjs"
840 | "cjs"
841 | "jsx"
842 | "ts"
843 | "mts"
844 | "cts"
845 | "tsx"
846 | "c"
847 | "cpp"
848 | "cc"
849 | "cxx"
850 | "h"
851 | "hpp"
852 | "m"
853 | "mm"
854 | "s"
855 | "asm"
856 | "java"
857 | "go"
858 | "rb"
859 | "php"
860 | "pl"
861 | "swift"
862 | "sh"
863 | "bash"
864 | "zsh"
865 | "fish"
866 | "ksh"
867 | "ps1"
868 | "psm1"
869 | "psd1"
870 | "awk"
871 | "kt"
872 | "kts"
873 | "dart"
874 | "scala"
875 | "groovy"
876 | "gradle"
877 | "gvy"
878 | "gy"
879 | "gsh"
880 | "cs"
881 | "fs"
882 | "fsx"
883 | "r"
884 | "lua"
885 | "jl"
886 | "ex"
887 | "exs"
888 | "clj"
889 | "cljs"
890 | "cljc"
891 | "hs"
892 | "erl"
893 | "nix"
894 | "zig"
895 | "bzl"
896 | "bazel"
897 | "star"
898 | "sky"
899 | "ml"
900 | "mli"
901 | "tex"
902 )
903 })
904}
905
/// Returns true when the language name (exact, case-sensitive) is one whose
/// files are treated as textual source.
fn is_source_like_language(language: &str) -> bool {
    const SOURCE_LIKE_LANGUAGES: &[&str] = &[
        "Rust",
        "Python",
        "JavaScript",
        "TypeScript",
        "JavaScript/TypeScript",
        "C",
        "C++",
        "Objective-C",
        "Objective-C++",
        "GAS",
        "Java",
        "Go",
        "Ruby",
        "PHP",
        "Perl",
        "Swift",
        "Shell",
        "PowerShell",
        "Awk",
        "Kotlin",
        "Dart",
        "Scala",
        "C#",
        "F#",
        "R",
        "Lua",
        "Julia",
        "Elixir",
        "Clojure",
        "Haskell",
        "Erlang",
        "Groovy",
        "Nix",
        "Zig",
        "Starlark",
        "OCaml",
        "Meson",
        "TeX",
        "Dockerfile",
        "Makefile",
        "Jamfile",
    ];

    SOURCE_LIKE_LANGUAGES.contains(&language)
}
952
/// Returns the path's extension as UTF-8 text, or `None` when there is no
/// extension or it is not valid UTF-8.
fn extension(path: &Path) -> Option<&str> {
    path.extension()?.to_str()
}
956
957fn lower_extension(path: &Path) -> Option<String> {
958 extension(path).map(|ext| ext.to_ascii_lowercase())
959}
960
/// Returns the final path component in ASCII lowercase, or an empty string
/// when the path has no final component or it is not valid UTF-8.
fn lower_file_name(path: &Path) -> String {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name.to_ascii_lowercase(),
        None => String::new(),
    }
}
967
968fn is_plain_text(path: &Path) -> bool {
969 lower_extension(path)
970 .as_deref()
971 .is_some_and(|ext| PLAIN_TEXT_EXTENSIONS.contains(&ext))
972}
973
974fn is_makefile(path: &Path) -> bool {
975 matches!(lower_file_name(path).as_str(), "makefile" | "makefile.inc")
976}
977
/// Returns true when the path ends in `.js.map` or `.css.map`
/// (case-insensitive).
fn is_source_map(path: &Path) -> bool {
    let lowered = path.to_string_lossy().to_ascii_lowercase();
    [".js.map", ".css.map"]
        .iter()
        .any(|suffix| lowered.ends_with(*suffix))
}
982
983fn is_c_like_source(path: &Path) -> bool {
984 lower_extension(path).as_deref().is_some_and(|ext| {
985 matches!(
986 ext,
987 "c" | "cc"
988 | "cp"
989 | "cpp"
990 | "cxx"
991 | "c++"
992 | "h"
993 | "hh"
994 | "hpp"
995 | "hxx"
996 | "h++"
997 | "i"
998 | "ii"
999 | "m"
1000 | "s"
1001 | "asm"
1002 )
1003 })
1004}
1005
1006fn is_java_like_source(path: &Path) -> bool {
1007 lower_extension(path)
1008 .as_deref()
1009 .is_some_and(|ext| matches!(ext, "java" | "aj" | "jad" | "ajt"))
1010}
1011
1012fn format_based_file_type(detected_format: FileFormat) -> Option<String> {
1013 match detected_format {
1014 FileFormat::ArbitraryBinaryData | FileFormat::Empty | FileFormat::PlainText => None,
1015 format if format.short_name() == Some("PDF") => Some("PDF document".to_string()),
1016 format => Some(match format.kind() {
1017 FileFormatKind::Image => short_name_or_name(&format, "image data"),
1018 FileFormatKind::Audio => short_name_or_name(&format, "audio data"),
1019 FileFormatKind::Video => short_name_or_name(&format, "video data"),
1020 _ => format.name().to_string(),
1021 }),
1022 }
1023}
1024
1025fn short_name_or_name(format: &FileFormat, suffix: &str) -> String {
1026 format
1027 .short_name()
1028 .map(|short_name| format!("{short_name} {suffix}"))
1029 .unwrap_or_else(|| format!("{} {suffix}", format.name()))
1030}
1031
1032fn detect_zip_like_mime(path: &Path) -> String {
1033 match extension(path).map(|ext| ext.to_ascii_lowercase()) {
1034 Some(ext) if ext == "apk" => "application/vnd.android.package-archive".to_string(),
1035 Some(ext) if matches!(ext.as_str(), "jar" | "war" | "ear") => {
1036 "application/java-archive".to_string()
1037 }
1038 _ => "application/zip".to_string(),
1039 }
1040}
1041
/// Identifies PNG, JPEG, TIFF and WebP images from their leading bytes and
/// returns the matching MIME type.
fn media_mime_from_content(bytes: &[u8]) -> Option<&'static str> {
    if bytes.starts_with(b"\x89PNG\r\n\x1a\n") {
        return Some("image/png");
    }
    if bytes.starts_with(&[0xff, 0xd8, 0xff]) {
        return Some("image/jpeg");
    }
    // TIFF has a little-endian ("II") and a big-endian ("MM") header.
    if bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a") {
        return Some("image/tiff");
    }
    // WebP is a RIFF container whose form type sits at bytes 8..12.
    let is_webp = bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]);
    if is_webp {
        return Some("image/webp");
    }
    None
}
1055
/// Identifies PNG, JPEG, TIFF and WebP images from their leading bytes and
/// returns a human-readable description.
fn media_file_type_from_content(bytes: &[u8]) -> Option<&'static str> {
    if bytes.starts_with(b"\x89PNG\r\n\x1a\n") {
        return Some("PNG image data");
    }
    if bytes.starts_with(&[0xff, 0xd8, 0xff]) {
        return Some("JPEG image data");
    }
    // TIFF has a little-endian ("II") and a big-endian ("MM") header.
    if bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a") {
        return Some("TIFF image data");
    }
    // WebP is a RIFF container whose form type sits at bytes 8..12.
    let is_webp = bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]);
    if is_webp {
        return Some("WebP image data");
    }
    None
}
1069
/// Returns true when the content starts with the `%PDF-` header.
fn looks_like_pdf(bytes: &[u8]) -> bool {
    matches!(bytes, [b'%', b'P', b'D', b'F', b'-', ..])
}
1073
/// Returns true when the (already lowercased) extension is `rtf` or the
/// content starts with the `{\rtf` header.
fn looks_like_rtf(bytes: &[u8], ext: Option<&str>) -> bool {
    if matches!(ext, Some("rtf")) {
        return true;
    }
    bytes.starts_with(b"{\\rtf")
}
1077
/// Strips RTF markup and returns the remaining plain text.
///
/// The input is decoded lossily as UTF-8 and scanned character by character:
/// group braces are dropped, escaped `\\`, `\{` and `\}` are kept literally,
/// `\'hh` hex escapes become the character with that code point, and a small
/// set of control words (`\par`, `\line`, `\tab`, dashes, bullets, quotes and
/// `\uN`) is translated. All other control words are discarded. Finally,
/// carriage returns and form feeds become newlines and trailing whitespace is
/// trimmed from every line.
fn extract_rtf_text(bytes: &[u8]) -> String {
    let decoded = String::from_utf8_lossy(bytes);
    let chars: Vec<char> = decoded.chars().collect();
    let total = chars.len();
    let mut plain = String::new();
    let mut pos = 0usize;

    while pos < total {
        let current = chars[pos];
        pos += 1;

        // Group delimiters carry no text of their own.
        if current == '{' || current == '}' {
            continue;
        }
        if current != '\\' {
            plain.push(current);
            continue;
        }

        // `pos` now points at the character following the backslash.
        let Some(&escaped) = chars.get(pos) else {
            break;
        };

        match escaped {
            '\\' | '{' | '}' => {
                plain.push(escaped);
                pos += 1;
            }
            '\'' => {
                // `\'hh`: two hex digits naming a single byte value.
                let byte_value = chars.get(pos + 1..pos + 3).and_then(|digits| {
                    let hex: String = digits.iter().collect();
                    u8::from_str_radix(&hex, 16).ok()
                });
                match byte_value {
                    Some(value) => {
                        plain.push(value as char);
                        pos += 3;
                    }
                    // Malformed escape: skip the quote and keep scanning.
                    None => pos += 1,
                }
            }
            letter if letter.is_ascii_alphabetic() => {
                let word_start = pos;
                while pos < total && chars[pos].is_ascii_alphabetic() {
                    pos += 1;
                }
                let control_word: String = chars[word_start..pos].iter().collect();

                // Optional numeric parameter, possibly negative.
                let parameter_start = pos;
                if pos < total && (chars[pos] == '-' || chars[pos].is_ascii_digit()) {
                    pos += 1;
                    while pos < total && chars[pos].is_ascii_digit() {
                        pos += 1;
                    }
                }
                let parameter: String = chars[parameter_start..pos].iter().collect();

                // A single space after a control word is a delimiter, not text.
                if chars.get(pos) == Some(&' ') {
                    pos += 1;
                }

                match control_word.as_str() {
                    "par" | "line" => plain.push('\n'),
                    "tab" => plain.push('\t'),
                    "emdash" => plain.push('—'),
                    "endash" => plain.push('–'),
                    "bullet" => plain.push('•'),
                    "lquote" | "rquote" => plain.push('\''),
                    "ldblquote" | "rdblquote" => plain.push('"'),
                    "u" => {
                        // `\uN` takes a signed 16-bit value; negatives wrap
                        // around into the upper half of the BMP.
                        let unicode_char = parameter
                            .parse::<i32>()
                            .ok()
                            .map(|codepoint| {
                                if codepoint < 0 {
                                    codepoint + 65_536
                                } else {
                                    codepoint
                                }
                            })
                            .and_then(|codepoint| u32::try_from(codepoint).ok())
                            .and_then(char::from_u32);
                        if let Some(ch) = unicode_char {
                            plain.push(ch);
                        }

                        // Skip the single fallback character that follows.
                        if pos < total && !matches!(chars[pos], '\\' | '{' | '}' | '\n' | '\r') {
                            pos += 1;
                        }
                    }
                    _ => {}
                }
            }
            _ => {
                pos += 1;
            }
        }
    }

    let normalized = plain.replace(['\r', '\u{0c}'], "\n");
    let mut trimmed_lines: Vec<&str> = Vec::new();
    for line in normalized.lines() {
        trimmed_lines.push(line.trim_end());
    }
    trimmed_lines.join("\n")
}
1184
/// Returns true when the content starts with the gzip magic bytes `1F 8B`.
fn looks_like_gzip(bytes: &[u8]) -> bool {
    matches!(bytes, [0x1f, 0x8b, ..])
}
1188
/// Returns true when the content starts with the bzip2 header `BZh`.
fn looks_like_bzip2(bytes: &[u8]) -> bool {
    matches!(bytes, [b'B', b'Z', b'h', ..])
}
1192
/// Returns true when the content starts with the six-byte XZ stream header.
fn looks_like_xz(bytes: &[u8]) -> bool {
    matches!(bytes, [0xfd, b'7', b'z', b'X', b'Z', 0x00, ..])
}
1196
1197fn looks_like_deb(bytes: &[u8], path: &Path) -> bool {
1198 lower_extension(path).as_deref() == Some("deb") && bytes.starts_with(b"!<arch>\n")
1199}
1200
1201fn looks_like_rpm(bytes: &[u8], path: &Path) -> bool {
1202 lower_extension(path).as_deref() == Some("rpm") && bytes.starts_with(&[0xed, 0xab, 0xee, 0xdb])
1203}
1204
1205fn looks_like_squashfs(bytes: &[u8], path: &Path) -> bool {
1206 lower_extension(path)
1207 .as_deref()
1208 .is_some_and(|ext| matches!(ext, "sqs" | "squashfs"))
1209 && (bytes.starts_with(&[0x68, 0x73, 0x71, 0x73])
1210 || bytes.starts_with(&[0x73, 0x71, 0x73, 0x68]))
1211}
1212
1213fn archive_file_type(path: &Path, bytes: &[u8], detected_format: FileFormat) -> String {
1214 if looks_like_deb(bytes, path) {
1215 "debian binary package (format 2.0)".to_string()
1216 } else if looks_like_rpm(bytes, path) {
1217 "RPM package".to_string()
1218 } else if looks_like_squashfs(bytes, path) {
1219 "Squashfs filesystem".to_string()
1220 } else if looks_like_gzip(bytes) {
1221 "gzip compressed data".to_string()
1222 } else if looks_like_bzip2(bytes) {
1223 "bzip2 compressed data".to_string()
1224 } else if looks_like_xz(bytes) {
1225 "XZ compressed data".to_string()
1226 } else if is_zip_archive(bytes) {
1227 "Zip archive data".to_string()
1228 } else if lower_extension(path).as_deref() == Some("gem") {
1229 "POSIX tar archive".to_string()
1230 } else if let Some(file_type) = format_based_file_type(detected_format) {
1231 file_type
1232 } else {
1233 "archive data".to_string()
1234 }
1235}
1236
1237fn script_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1238 let suffix = text_executable_label(bytes);
1239
1240 match programming_language {
1241 Some("Python") => format!("python script, {suffix}"),
1242 Some("Ruby") => format!("ruby script, {suffix}"),
1243 Some("Perl") => format!("perl script, {suffix}"),
1244 Some("PHP") => format!("php script, {suffix}"),
1245 Some("Shell") => format!("shell script, {suffix}"),
1246 Some("Bash") => format!("bash script, {suffix}"),
1247 Some("Zsh") => format!("zsh script, {suffix}"),
1248 Some("Fish") => format!("fish script, {suffix}"),
1249 Some("Ksh") => format!("ksh script, {suffix}"),
1250 Some("JavaScript") => format!("javascript script, {suffix}"),
1251 Some("TypeScript") => format!("typescript script, {suffix}"),
1252 Some("PowerShell") => format!("powershell script, {suffix}"),
1253 Some("Awk") => format!("awk script, {suffix}"),
1254 _ => format!("script, {suffix}"),
1255 }
1256}
1257
1258fn source_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1259 let suffix = text_label(bytes);
1260 match programming_language {
1261 Some("C") => format!("C source, {suffix}"),
1262 Some("C++") => format!("C++ source, {suffix}"),
1263 Some("Java") => format!("Java source, {suffix}"),
1264 Some("C#") => format!("C# source, {suffix}"),
1265 Some("F#") => format!("F# source, {suffix}"),
1266 Some("Go") => format!("Go source, {suffix}"),
1267 Some("Rust") => format!("Rust source, {suffix}"),
1268 Some("Starlark") => format!("Starlark source, {suffix}"),
1269 Some("CMake") => format!("CMake source, {suffix}"),
1270 Some("Meson") => format!("Meson source, {suffix}"),
1271 Some("Nix") => format!("Nix source, {suffix}"),
1272 Some("Groovy") => format!("Groovy source, {suffix}"),
1273 Some("Makefile") => format!("Makefile source, {suffix}"),
1274 Some("Dockerfile") => format!("Dockerfile source, {suffix}"),
1275 Some("Jamfile") => format!("Jamfile source, {suffix}"),
1276 Some("Batchfile") => format!("Batchfile source, {suffix}"),
1277 Some(language) => format!("{language} source, {suffix}"),
1278 None => text_file_type(bytes),
1279 }
1280}
1281
1282fn text_file_type(bytes: &[u8]) -> String {
1283 text_label(bytes).to_string()
1284}
1285
/// Picks a text description from two properties of `bytes`: whether the whole
/// buffer is valid UTF-8, and whether it contains at least one `\n` byte.
fn text_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.iter().any(|&byte| byte == b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text",
        (true, false) => "UTF-8 Unicode text, with no line terminators",
        (false, true) => "text",
        (false, false) => "text, with no line terminators",
    }
}
1299
/// Picks a "text executable" description from two properties of `bytes`: whether
/// the whole buffer is valid UTF-8, and whether it contains at least one `\n` byte.
fn text_executable_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.iter().any(|&byte| byte == b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text executable",
        (true, false) => "UTF-8 Unicode text executable, with no line terminators",
        (false, true) => "text executable",
        (false, false) => "text executable, with no line terminators",
    }
}
1313
1314fn supported_image_metadata_format(
1315 ext: Option<&str>,
1316 detected_format: FileFormat,
1317) -> Option<ImageFormat> {
1318 match ext {
1319 Some("jpg" | "jpeg") => Some(ImageFormat::Jpeg),
1320 Some("png") => Some(ImageFormat::Png),
1321 Some("tif" | "tiff") => Some(ImageFormat::Tiff),
1322 Some("webp") => Some(ImageFormat::WebP),
1323 _ => match detected_format.media_type() {
1324 "image/jpeg" => Some(ImageFormat::Jpeg),
1325 "image/png" => Some(ImageFormat::Png),
1326 "image/tiff" => Some(ImageFormat::Tiff),
1327 "image/webp" => Some(ImageFormat::WebP),
1328 _ => None,
1329 },
1330 }
1331}
1332
1333fn should_skip_binary_string_extraction(
1334 path: &Path,
1335 bytes: &[u8],
1336 detected_format: FileFormat,
1337) -> bool {
1338 matches!(lower_extension(path).as_deref(), Some("pdf"))
1339 || supported_image_metadata_format(lower_extension(path).as_deref(), detected_format)
1340 .is_some()
1341 || (matches!(
1342 detected_format.kind(),
1343 FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
1344 ) && !is_textual_format(detected_format))
1345 || media_mime_from_content(bytes).is_some()
1346 || is_zip_archive(bytes)
1347 || looks_like_gzip(bytes)
1348 || looks_like_bzip2(bytes)
1349 || looks_like_xz(bytes)
1350 || looks_like_deb(bytes, path)
1351 || looks_like_rpm(bytes, path)
1352 || looks_like_squashfs(bytes, path)
1353}
1354
1355fn should_skip_large_opaque_binary_text_extraction(
1356 _path: &Path,
1357 bytes: &[u8],
1358 detected_format: FileFormat,
1359) -> bool {
1360 is_large_opaque_binary_candidate(bytes, detected_format)
1361 && !sample_has_promising_printable_strings(bytes)
1362}
1363
1364fn is_large_opaque_binary_candidate(bytes: &[u8], detected_format: FileFormat) -> bool {
1365 bytes.len() >= LARGE_OPAQUE_BINARY_SKIP_BYTES
1366 && !is_textual_format(detected_format)
1367 && !matches!(
1368 detected_format.kind(),
1369 FileFormatKind::Archive
1370 | FileFormatKind::Compressed
1371 | FileFormatKind::Package
1372 | FileFormatKind::Audio
1373 | FileFormatKind::Image
1374 | FileFormatKind::Video
1375 )
1376}
1377
/// Computes up to three `(start, end)` byte ranges to sample from a buffer of
/// `len` bytes: the first 64 KiB, a 64 KiB window centred on the midpoint (only
/// when `len` exceeds two windows), and the last 64 KiB (only when `len` exceeds
/// one window). Empty and duplicate ranges are omitted; order is head, middle, tail.
fn sampled_printable_window_ranges(len: usize) -> Vec<(usize, usize)> {
    const SAMPLE_WINDOW_BYTES: usize = 64 * 1024;

    let head = Some((0, len.min(SAMPLE_WINDOW_BYTES)));
    let middle = (len > SAMPLE_WINDOW_BYTES * 2).then(|| {
        let start = len / 2 - SAMPLE_WINDOW_BYTES / 2;
        (start, (start + SAMPLE_WINDOW_BYTES).min(len))
    });
    let tail = (len > SAMPLE_WINDOW_BYTES).then(|| (len - SAMPLE_WINDOW_BYTES, len));

    let mut windows: Vec<(usize, usize)> = Vec::new();
    for window in head.into_iter().chain(middle).chain(tail) {
        let is_non_empty = window.0 < window.1;
        if is_non_empty && !windows.contains(&window) {
            windows.push(window);
        }
    }

    windows
}
1400
1401fn sample_has_promising_printable_strings(bytes: &[u8]) -> bool {
1402 let mut structured_signal_seen = false;
1403 let promising_license_windows = sampled_printable_window_ranges(bytes.len())
1404 .into_iter()
1405 .filter(|&(start, end)| {
1406 let window = &bytes[start..end];
1407 if has_strong_structured_text_signal(window) {
1408 structured_signal_seen = true;
1409 }
1410 has_license_or_notice_signal(window)
1411 })
1412 .count();
1413
1414 structured_signal_seen || promising_license_windows >= 2
1415}
1416
1417fn extract_sampled_printable_strings(bytes: &[u8]) -> String {
1418 let mut combined_lines = BTreeSet::new();
1419
1420 for (start, end) in sampled_printable_window_ranges(bytes.len()) {
1421 let window_text = extract_printable_strings(&bytes[start..end]);
1422 for line in window_text
1423 .lines()
1424 .map(str::trim)
1425 .filter(|line| !line.is_empty())
1426 {
1427 combined_lines.insert(line.to_string());
1428 }
1429 }
1430
1431 combined_lines.into_iter().collect::<Vec<_>>().join("\n")
1432}
1433
1434fn has_license_or_notice_signal(bytes: &[u8]) -> bool {
1435 let strings = extract_printable_strings(bytes);
1436 if strings.is_empty() {
1437 return false;
1438 }
1439
1440 let lower = strings.to_ascii_lowercase();
1441 [
1442 "copyright",
1443 "license",
1444 "licensed under",
1445 "all rights reserved",
1446 "permission is hereby granted",
1447 "redistribution and use",
1448 "spdx-license-identifier",
1449 ]
1450 .iter()
1451 .any(|marker| lower.contains(marker))
1452}
1453
1454fn has_strong_structured_text_signal(bytes: &[u8]) -> bool {
1455 let strings = extract_printable_strings(bytes);
1456 if strings.is_empty() {
1457 return false;
1458 }
1459
1460 let email_markers = strings.matches('@').count();
1461 let url_markers = strings.matches("http://").count() + strings.matches("https://").count();
1462
1463 email_markers + url_markers >= 3
1464}
1465
1466fn is_supported_image_container(bytes: &[u8], format: ImageFormat) -> bool {
1467 match format {
1468 ImageFormat::Png => bytes.starts_with(b"\x89PNG\r\n\x1a\n"),
1469 ImageFormat::Jpeg => bytes.starts_with(&[0xff, 0xd8, 0xff]),
1470 ImageFormat::Tiff => bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a"),
1471 ImageFormat::WebP => {
1472 bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP"
1473 }
1474 _ => false,
1475 }
1476}
1477
1478fn extract_image_metadata_text(bytes: &[u8], format: ImageFormat) -> String {
1479 let mut values = Vec::new();
1480 values.extend(extract_exif_metadata_values(bytes));
1481 values.extend(extract_xmp_metadata_values(bytes, format));
1482 values_to_text(values)
1483}
1484
1485fn extract_exif_metadata_values(bytes: &[u8]) -> Vec<String> {
1486 let mut cursor = BufReader::new(Cursor::new(bytes));
1487 let exif = match exif::Reader::new().read_from_container(&mut cursor) {
1488 Ok(exif) => exif,
1489 Err(_) => return Vec::new(),
1490 };
1491
1492 let mut values = Vec::new();
1493 for field in exif.fields() {
1494 let rendered = match field.tag {
1495 exif::Tag::ImageDescription | exif::Tag::Copyright | exif::Tag::UserComment => {
1496 Some(field.display_value().with_unit(&exif).to_string())
1497 }
1498 exif::Tag::Artist => Some(format!(
1499 "Author: {}",
1500 field.display_value().with_unit(&exif)
1501 )),
1502 _ => None,
1503 };
1504
1505 if let Some(rendered) = rendered {
1506 values.push(rendered);
1507 }
1508 }
1509
1510 values
1511}
1512
1513fn extract_xmp_metadata_values(bytes: &[u8], format: ImageFormat) -> Vec<String> {
1514 let xmp = match extract_raw_xmp_packet(bytes, format) {
1515 Some(xmp) => xmp,
1516 None => return Vec::new(),
1517 };
1518
1519 parse_xmp_values(&xmp)
1520}
1521
1522fn extract_raw_xmp_packet(bytes: &[u8], format: ImageFormat) -> Option<Vec<u8>> {
1523 let reader = ImageReader::with_format(BufReader::new(Cursor::new(bytes)), format);
1524 if let Ok(mut decoder) = reader.into_decoder()
1525 && let Ok(Some(xmp)) = decoder.xmp_metadata()
1526 {
1527 return Some(xmp);
1528 }
1529
1530 match format {
1531 ImageFormat::Png => extract_png_xmp_packet(bytes),
1532 _ => None,
1533 }
1534}
1535
1536fn extract_png_xmp_packet(bytes: &[u8]) -> Option<Vec<u8>> {
1537 const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";
1538
1539 if bytes.len() < PNG_SIGNATURE.len() || &bytes[..PNG_SIGNATURE.len()] != PNG_SIGNATURE {
1540 return None;
1541 }
1542
1543 let mut offset = PNG_SIGNATURE.len();
1544 while offset + 12 <= bytes.len() {
1545 let length = u32::from_be_bytes([
1546 bytes[offset],
1547 bytes[offset + 1],
1548 bytes[offset + 2],
1549 bytes[offset + 3],
1550 ]) as usize;
1551 let chunk_start = offset + 8;
1552 let chunk_end = chunk_start + length;
1553 if chunk_end + 4 > bytes.len() {
1554 return None;
1555 }
1556
1557 let chunk_type = &bytes[offset + 4..offset + 8];
1558 if chunk_type == b"iTXt" {
1559 let data = &bytes[chunk_start..chunk_end];
1560 if let Some(xmp) = parse_png_itxt_xmp(data) {
1561 return Some(xmp);
1562 }
1563 }
1564
1565 offset = chunk_end + 4;
1566 }
1567
1568 None
1569}
1570
1571fn parse_png_itxt_xmp(data: &[u8]) -> Option<Vec<u8>> {
1572 const XMP_KEYWORD: &[u8] = b"XML:com.adobe.xmp";
1573
1574 let keyword_end = data.iter().position(|&b| b == 0)?;
1575 if &data[..keyword_end] != XMP_KEYWORD {
1576 return None;
1577 }
1578
1579 let mut cursor = keyword_end + 1;
1580 let compression_flag = *data.get(cursor)?;
1581 cursor += 1;
1582 let compression_method = *data.get(cursor)?;
1583 cursor += 1;
1584 if compression_flag > 1 || (compression_flag == 1 && compression_method != 0) {
1585 return None;
1586 }
1587
1588 let language_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1589 cursor = language_end + 1;
1590
1591 let translated_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1592 cursor = translated_end + 1;
1593
1594 let text_bytes = &data[cursor..];
1595 if compression_flag == 1 {
1596 let mut decoder = ZlibDecoder::new(text_bytes);
1597 let mut decoded = Vec::new();
1598 decoder.read_to_end(&mut decoded).ok()?;
1599 Some(decoded)
1600 } else {
1601 Some(text_bytes.to_vec())
1602 }
1603}
1604
1605fn parse_xmp_values(xmp: &[u8]) -> Vec<String> {
1606 let mut reader = XmlReader::from_reader(xmp);
1607 reader.config_mut().trim_text(true);
1608
1609 let mut buf = Vec::new();
1610 let mut stack: Vec<String> = Vec::new();
1611 let mut values = Vec::new();
1612
1613 loop {
1614 match reader.read_event_into(&mut buf) {
1615 Ok(Event::Start(e)) => {
1616 stack.push(local_xml_name(e.name().as_ref()));
1617 }
1618 Ok(Event::End(_)) => {
1619 stack.pop();
1620 }
1621 Ok(Event::Empty(_)) => {}
1622 Ok(Event::Text(text)) => {
1623 if let Some(field) = stack
1624 .iter()
1625 .rev()
1626 .find_map(|name| allowed_xmp_field(name.as_str()))
1627 && let Ok(decoded) = text.decode()
1628 {
1629 let decoded = decoded.into_owned();
1630 if !decoded.trim().is_empty() {
1631 values.push(format_xmp_value(field, &decoded));
1632 }
1633 }
1634 }
1635 Ok(Event::CData(text)) => {
1636 if let Some(field) = stack
1637 .iter()
1638 .rev()
1639 .find_map(|name| allowed_xmp_field(name.as_str()))
1640 && let Ok(decoded) = text.decode()
1641 {
1642 let decoded = decoded.into_owned();
1643 if !decoded.trim().is_empty() {
1644 values.push(format_xmp_value(field, &decoded));
1645 }
1646 }
1647 }
1648 Ok(Event::Eof) | Err(_) => break,
1649 _ => {}
1650 }
1651 buf.clear();
1652 }
1653
1654 values
1655}
1656
/// Returns the part of a qualified XML name after its last `:`, or the whole name
/// when it has no `:`. Names that are not valid UTF-8 yield an empty string.
fn local_xml_name(name: &[u8]) -> String {
    let qualified = std::str::from_utf8(name).unwrap_or_default();

    match qualified.rsplit_once(':') {
        Some((_, local)) => local.to_string(),
        None => qualified.to_string(),
    }
}
1661
/// Maps an XMP element's local name to the internal field key it is collected
/// under, or `None` when the element is not collected. Matching is case-sensitive.
fn allowed_xmp_field(name: &str) -> Option<&'static str> {
    const FIELD_KEYS: &[(&str, &str)] = &[
        ("creator", "creator"),
        ("rights", "rights"),
        ("description", "description"),
        ("title", "title"),
        ("subject", "subject"),
        ("UsageTerms", "usage_terms"),
        ("WebStatement", "web_statement"),
    ];

    FIELD_KEYS
        .iter()
        .find(|(element, _)| *element == name)
        .map(|&(_, key)| key)
}
1674
/// Renders an XMP value for output: `creator` values get an `Author: ` prefix,
/// every other field is returned unchanged.
fn format_xmp_value(field: &str, value: &str) -> String {
    if field == "creator" {
        format!("Author: {value}")
    } else {
        value.to_string()
    }
}
1681
1682fn values_to_text(values: Vec<String>) -> String {
1683 let mut seen = BTreeSet::new();
1684 let mut lines = Vec::new();
1685 let mut total_bytes = 0usize;
1686
1687 for value in values {
1688 if lines.len() >= MAX_IMAGE_METADATA_VALUES {
1689 break;
1690 }
1691
1692 let normalized = normalize_metadata_value(&value);
1693 if normalized.is_empty() || !seen.insert(normalized.clone()) {
1694 continue;
1695 }
1696
1697 let added_bytes = normalized.len() + usize::from(!lines.is_empty());
1698 if total_bytes + added_bytes > MAX_IMAGE_METADATA_TEXT_BYTES {
1699 break;
1700 }
1701
1702 total_bytes += added_bytes;
1703 lines.push(normalized);
1704 }
1705
1706 lines.join("\n")
1707}
1708
1709fn normalize_metadata_value(value: &str) -> String {
1710 value
1711 .chars()
1712 .filter(|&ch| ch != '\0')
1713 .collect::<String>()
1714 .split_whitespace()
1715 .collect::<Vec<_>>()
1716 .join(" ")
1717 .trim()
1718 .to_string()
1719}
1720
1721fn extract_pdf_text(path: &Path, bytes: &[u8]) -> (String, Option<String>) {
1722 if bytes.len() < 5 || &bytes[..5] != b"%PDF-" {
1723 return (String::new(), None);
1724 }
1725
1726 let mut failures = Vec::new();
1727 let mut saw_success = false;
1728
1729 let extracted = catch_unwind(AssertUnwindSafe(
1730 || -> Result<String, Box<dyn std::error::Error>> {
1731 let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1732 extract_first_pdf_page_text(&mut document)
1733 },
1734 ));
1735 match extracted {
1736 Ok(Ok(text)) => {
1737 saw_success = true;
1738 if let Some(normalized) = normalize_pdf_text(text) {
1739 return (normalized, None);
1740 }
1741 }
1742 Ok(Err(err)) => failures.push(format!("from-bytes first-page: {err}")),
1743 Err(payload) => failures.push(format!(
1744 "from-bytes first-page panic: {}",
1745 panic_payload_to_string(payload.as_ref())
1746 )),
1747 }
1748
1749 let extracted = catch_unwind(AssertUnwindSafe(
1750 || -> Result<String, Box<dyn std::error::Error>> {
1751 let mut document = pdf_oxide::document::PdfDocument::open(path)?;
1752 extract_pdf_text_from_document(&mut document)
1753 },
1754 ));
1755 match extracted {
1756 Ok(Ok(text)) => {
1757 saw_success = true;
1758 if let Some(normalized) = normalize_pdf_text(text) {
1759 return (normalized, None);
1760 }
1761 }
1762 Ok(Err(err)) => failures.push(format!("open full-document: {err}")),
1763 Err(payload) => failures.push(format!(
1764 "open full-document panic: {}",
1765 panic_payload_to_string(payload.as_ref())
1766 )),
1767 }
1768
1769 let extracted = catch_unwind(AssertUnwindSafe(
1770 || -> Result<String, Box<dyn std::error::Error>> {
1771 let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1772 extract_pdf_text_from_document(&mut document)
1773 },
1774 ));
1775 match extracted {
1776 Ok(Ok(text)) => {
1777 saw_success = true;
1778 if let Some(normalized) = normalize_pdf_text(text) {
1779 return (normalized, None);
1780 }
1781 }
1782 Ok(Err(err)) => failures.push(format!("from-bytes full-document: {err}")),
1783 Err(payload) => failures.push(format!(
1784 "from-bytes full-document panic: {}",
1785 panic_payload_to_string(payload.as_ref())
1786 )),
1787 }
1788
1789 if saw_success || is_non_actionable_pdf_failure(&failures) {
1790 (String::new(), None)
1791 } else {
1792 (
1793 String::new(),
1794 Some(format!(
1795 "PDF text extraction failed after {} attempts: {}",
1796 failures.len(),
1797 failures.join("; ")
1798 )),
1799 )
1800 }
1801}
1802
/// Returns `true` when `failures` is non-empty and every message contains at least
/// one of the known non-actionable markers (password-protected or otherwise
/// unreadable encryption setup, or an invalid cross-reference table).
fn is_non_actionable_pdf_failure(failures: &[String]) -> bool {
    const NON_ACTIONABLE_MARKERS: [&str; 5] = [
        "requires a password",
        "Encrypt dictionary missing /O",
        "Encrypt dictionary missing /U",
        "security handler cannot be found",
        "Invalid cross-reference table",
    ];

    if failures.is_empty() {
        return false;
    }

    failures.iter().all(|failure| {
        NON_ACTIONABLE_MARKERS
            .iter()
            .any(|&marker| failure.contains(marker))
    })
}
1813
/// Renders a panic payload as text: `&str` and `String` payloads are returned as
/// their message; any other payload type yields `"unknown panic payload"`.
fn panic_payload_to_string(payload: &(dyn std::any::Any + Send)) -> String {
    payload
        .downcast_ref::<&str>()
        .map(|message| (*message).to_string())
        .or_else(|| payload.downcast_ref::<String>().cloned())
        .unwrap_or_else(|| "unknown panic payload".to_string())
}
1823
1824fn extract_first_pdf_page_text(
1825 document: &mut pdf_oxide::document::PdfDocument,
1826) -> Result<String, Box<dyn std::error::Error>> {
1827 if document.page_count()? == 0 {
1828 return Ok(String::new());
1829 }
1830
1831 let extracted_text = document.extract_text(0)?;
1832 let markdown_text =
1833 document.to_markdown(0, &pdf_oxide::converters::ConversionOptions::default())?;
1834 if pdf_markdown_heading_lines(&markdown_text).is_empty() {
1835 return Ok(extracted_text);
1836 }
1837
1838 let pipeline_text =
1839 document.to_plain_text(0, &pdf_oxide::converters::ConversionOptions::default())?;
1840
1841 Ok(merge_pdf_first_page_text(
1842 &extracted_text,
1843 &markdown_text,
1844 &pipeline_text,
1845 ))
1846}
1847
1848fn extract_pdf_text_from_document(
1849 document: &mut pdf_oxide::document::PdfDocument,
1850) -> Result<String, Box<dyn std::error::Error>> {
1851 Ok(document.to_plain_text_all(&pdf_oxide::converters::ConversionOptions::default())?)
1852}
1853
/// Replaces every carriage return and form feed in `text` with `\n`. Returns
/// `None` when the result contains only whitespace.
fn normalize_pdf_text(text: String) -> Option<String> {
    let unified: String = text
        .chars()
        .map(|ch| match ch {
            '\r' | '\u{0c}' => '\n',
            other => other,
        })
        .collect();

    if unified.trim().is_empty() {
        None
    } else {
        Some(unified)
    }
}
1858
1859fn merge_pdf_first_page_text(
1860 _extracted_text: &str,
1861 markdown_text: &str,
1862 pipeline_text: &str,
1863) -> String {
1864 let pipeline = pipeline_text.trim();
1865 if pipeline.is_empty() {
1866 return String::new();
1867 }
1868
1869 let prefix = pdf_first_page_heading_prefix(markdown_text);
1870 let Some(prefix) = prefix else {
1871 return pipeline_text.to_string();
1872 };
1873
1874 if pdf_text_contains_heading_prefix(pipeline, &prefix) {
1875 pipeline_text.to_string()
1876 } else {
1877 format!("{prefix}\n\n{pipeline}")
1878 }
1879}
1880
1881fn pdf_text_contains_heading_prefix(text: &str, prefix: &str) -> bool {
1882 normalize_pdf_heading_comparison_text(text)
1883 .contains(&normalize_pdf_heading_comparison_text(prefix))
1884}
1885
/// Produces a comparison form of `text`: ASCII letters lower-cased, every run of
/// whitespace collapsed to one space, and no leading or trailing whitespace.
fn normalize_pdf_heading_comparison_text(text: &str) -> String {
    // ASCII lower-casing never changes whitespace, so it can run before the split.
    let lowered = text.to_ascii_lowercase();
    let words: Vec<&str> = lowered.split_whitespace().collect();
    words.join(" ")
}
1892
1893fn pdf_first_page_heading_prefix(markdown_text: &str) -> Option<String> {
1894 let mut lines = Vec::new();
1895
1896 for line in pdf_markdown_heading_lines(markdown_text) {
1897 push_unique_line(&mut lines, line);
1898 }
1899
1900 (!lines.is_empty()).then(|| lines.join("\n"))
1901}
1902
1903fn pdf_markdown_heading_lines(text: &str) -> Vec<String> {
1904 text.lines()
1905 .map(str::trim)
1906 .filter_map(|line| line.strip_prefix('#').map(str::trim_start))
1907 .map(|line| line.trim_matches('#').trim())
1908 .filter(|line| !line.is_empty())
1909 .filter(|line| !looks_like_numbered_section_heading(line))
1910 .take(4)
1911 .map(ToOwned::to_owned)
1912 .collect()
1913}
1914
/// Appends `line` to `lines` unless an equal entry is already present.
fn push_unique_line(lines: &mut Vec<String>, line: String) {
    if lines.contains(&line) {
        return;
    }
    lines.push(line);
}
1920
/// Returns `true` when `line` starts with one or more ASCII digits immediately
/// followed by a `.` — for example `1. Scope`, `10. Definitions` or `1.2 Terms`.
///
/// Any number of leading digits is accepted, so section numbers of ten and above
/// are recognised just like single-digit ones. Lines that start with a `.`, or
/// whose leading digits are followed by anything other than `.`, do not match.
fn looks_like_numbered_section_heading(line: &str) -> bool {
    let after_digits = line.trim_start_matches(|ch: char| ch.is_ascii_digit());

    // At least one digit must have been consumed before the dot.
    let consumed_digits = after_digits.len() < line.len();
    consumed_digits && after_digits.starts_with('.')
}
1933
/// Returns `true` when `bytes` begins with `PK` followed by one of the byte pairs
/// `03 04`, `05 06` or `07 08`.
fn is_zip_archive(bytes: &[u8]) -> bool {
    matches!(
        bytes,
        [b'P', b'K', 0x03, 0x04, ..] | [b'P', b'K', 0x05, 0x06, ..] | [b'P', b'K', 0x07, 0x08, ..]
    )
}
1939
/// Extracts runs of printable ASCII (bytes `0x20..=0x7E`) of at least four
/// characters from `bytes`, one run per line.
///
/// Three passes append to the same output, in this order:
///
/// 1. single-byte runs: maximal stretches of consecutive printable bytes;
/// 2. byte pairs starting at offset 0, where the first byte is printable and the
///    second is zero;
/// 3. byte pairs starting at offset 1, where the first byte is zero and the second
///    is printable.
///
/// Extraction returns early once the output length reaches a cap equal to the
/// input length clamped to the range 2,000,000..=16,000,000 bytes. The cap is
/// checked after each run is flushed, so the output can exceed it by one run.
///
/// NOTE(review): pass 3 re-reads the same zero/character pairs that pass 2 matches
/// for text aligned at an even offset, minus the first character, so such strings
/// of five or more characters appear twice (e.g. `Hello` then `ello`) — confirm
/// whether that is intended.
pub fn extract_printable_strings(bytes: &[u8]) -> String {
    const MIN_LEN: usize = 4;
    const MIN_OUTPUT_BYTES: usize = 2_000_000;
    const MAX_OUTPUT_BYTES_CAP: usize = 16_000_000;

    fn is_printable_ascii(b: u8) -> bool {
        (0x20..=0x7E).contains(&b)
    }

    /// Appends `run` as a new line of `out` when it is long enough to keep.
    fn append_run(out: &mut String, run: &[u8]) {
        if run.len() < MIN_LEN {
            return;
        }
        if !out.is_empty() {
            out.push('\n');
        }
        out.push_str(&String::from_utf8_lossy(run));
    }

    let output_cap = bytes.len().clamp(MIN_OUTPUT_BYTES, MAX_OUTPUT_BYTES_CAP);
    let mut out = String::new();

    // Pass 1: splitting on non-printable bytes yields exactly the maximal
    // printable runs, including the trailing one.
    for segment in bytes.split(|&b| !is_printable_ascii(b)) {
        append_run(&mut out, segment);
        if out.len() >= output_cap {
            return out;
        }
    }

    // Passes 2 and 3: two-byte units starting at offset 0, then at offset 1.
    for start in 0..=1 {
        let mut run: Vec<u8> = Vec::new();
        let aligned = bytes.get(start..).unwrap_or(&[]);

        for pair in aligned.chunks_exact(2) {
            let (ch, zero) = if start == 0 {
                (pair[0], pair[1])
            } else {
                (pair[1], pair[0])
            };

            if zero == 0 && is_printable_ascii(ch) {
                run.push(ch);
                continue;
            }

            append_run(&mut out, &run);
            run.clear();
            if out.len() >= output_cap {
                return out;
            }
        }

        append_run(&mut out, &run);
        if out.len() >= output_cap {
            return out;
        }
    }

    out
}
2004
2005#[cfg(test)]
2006mod tests {
2007 use std::path::Path;
2008
2009 use super::{
2010 ExtractedTextKind, LARGE_OPAQUE_BINARY_SKIP_BYTES, classify_file_info,
2011 extract_printable_strings, extract_text_for_detection,
2012 extract_text_for_detection_with_diagnostics, is_non_actionable_pdf_failure,
2013 normalize_mime_type, normalize_pdf_heading_comparison_text,
2014 windows_metadata_or_empty_result,
2015 };
2016
2017 #[test]
2018 fn test_extract_text_for_detection_skips_jar_archives() {
2019 let path = Path::new(
2020 "testdata/license-golden/datadriven/lic1/do-not_detect-licenses-in-archive.jar",
2021 );
2022 let bytes = std::fs::read(path).expect("failed to read jar fixture");
2023
2024 let (text, kind) = extract_text_for_detection(path, &bytes);
2025
2026 assert!(text.is_empty());
2027 assert_eq!(kind, ExtractedTextKind::None);
2028 }
2029
2030 #[test]
2031 fn test_extract_text_for_detection_reads_pdf_fixture_text() {
2032 let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
2033 let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2034
2035 let (text, kind) = extract_text_for_detection(path, &bytes);
2036
2037 assert_eq!(kind, ExtractedTextKind::Pdf);
2038 assert!(text.contains("Redistribution and use in source and binary forms"));
2039 }
2040
2041 #[test]
2042 fn test_extract_text_for_detection_prefers_first_pdf_page_before_full_document() {
2043 let path =
2044 Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
2045 let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2046
2047 let (text, kind) = extract_text_for_detection(path, &bytes);
2048
2049 assert_eq!(kind, ExtractedTextKind::Pdf);
2050 assert!(text.contains("SUN INDUSTRY STANDARDS SOURCE LICENSE"));
2051 assert!(!text.contains("DISCLAIMER OF WARRANTY"));
2052 }
2053
2054 #[test]
2055 fn test_extract_text_for_detection_does_not_duplicate_pdf_heading_prefix() {
2056 let path =
2057 Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
2058 let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2059
2060 let (text, kind) = extract_text_for_detection(path, &bytes);
2061
2062 assert_eq!(kind, ExtractedTextKind::Pdf);
2063
2064 let normalized = normalize_pdf_heading_comparison_text(&text);
2065 let heading =
2066 normalize_pdf_heading_comparison_text("SUN INDUSTRY STANDARDS SOURCE LICENSE");
2067 assert_eq!(normalized.matches(&heading).count(), 1);
2068 }
2069
2070 #[test]
2071 fn test_extract_text_for_detection_reads_pdf_fixture_without_pdf_extension() {
2072 let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
2073 let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2074
2075 let (text, kind) = extract_text_for_detection(Path::new("renamed.bin"), &bytes);
2076
2077 assert_eq!(kind, ExtractedTextKind::Pdf);
2078 assert!(text.contains("Redistribution and use in source and binary forms"));
2079 }
2080
2081 #[test]
2082 fn test_extract_text_for_detection_reports_terminal_pdf_failure() {
2083 let malformed = b"%PDF-1.7\nthis is not a valid pdf object graph\n";
2084
2085 let (text, kind, scan_error) =
2086 extract_text_for_detection_with_diagnostics(Path::new("broken.pdf"), malformed);
2087
2088 assert!(text.is_empty());
2089 assert_eq!(kind, ExtractedTextKind::None);
2090 let scan_error = scan_error.expect("terminal pdf failure should be surfaced");
2091 assert!(scan_error.contains("PDF text extraction failed after"));
2092 }
2093
2094 #[test]
2095 fn test_extract_text_for_detection_skips_large_opaque_binary_blobs() {
2096 let bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2097
2098 let (text, kind) = extract_text_for_detection(Path::new("model.bin"), &bytes);
2099
2100 assert!(text.is_empty());
2101 assert_eq!(kind, ExtractedTextKind::None);
2102 }
2103
2104 #[test]
2105 fn test_extract_text_for_detection_keeps_large_binaries_with_promising_strings() {
2106 let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2107 let text = b"Copyright 2026 Example Project!!!";
2108 bytes[..text.len()].copy_from_slice(text);
2109 let second_offset = LARGE_OPAQUE_BINARY_SKIP_BYTES / 2;
2110 bytes[second_offset..second_offset + text.len()].copy_from_slice(text);
2111
2112 let (text, kind) = extract_text_for_detection(Path::new("weights.bin"), &bytes);
2113
2114 assert_ne!(kind, ExtractedTextKind::None);
2115 assert!(text.contains("Copyright 2026 Example Project"));
2116 }
2117
2118 #[test]
2119 fn test_extract_text_for_detection_skips_large_binary_with_unstructured_runs() {
2120 let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2121 let noise = b"(c) $1234567890ABCDEF[]{}--==++";
2122 bytes[..noise.len()].copy_from_slice(noise);
2123 let second_offset = LARGE_OPAQUE_BINARY_SKIP_BYTES / 2;
2124 bytes[second_offset..second_offset + noise.len()].copy_from_slice(noise);
2125
2126 let (text, kind) = extract_text_for_detection(Path::new("tensor.bin"), &bytes);
2127
2128 assert!(text.is_empty());
2129 assert_eq!(kind, ExtractedTextKind::None);
2130 }
2131
2132 #[test]
2133 fn test_extract_text_for_detection_uses_windows_executable_metadata() {
2134 let path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
2135 let bytes = std::fs::read(path).expect("read PE fixture");
2136
2137 let (text, kind) = extract_text_for_detection(path, &bytes);
2138
2139 assert_eq!(kind, ExtractedTextKind::BinaryStrings);
2140 assert!(text.contains("License: This program is free software"));
2141 assert!(text.contains("LegalCopyright:"));
2142 }
2143
2144 #[test]
2145 fn test_extract_text_for_detection_keeps_windows_metadata_for_large_pe_without_sampled_signal()
2146 {
2147 let path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
2148 let mut bytes = std::fs::read(path).expect("read PE fixture");
2149 bytes.resize(LARGE_OPAQUE_BINARY_SKIP_BYTES + 8, 0);
2150
2151 let (text, kind) = extract_text_for_detection(path, &bytes);
2152
2153 assert_ne!(kind, ExtractedTextKind::None);
2154 assert!(!text.trim().is_empty());
2155 }
2156
2157 #[test]
2158 fn test_windows_metadata_or_empty_result_preserves_metadata() {
2159 let (text, kind, scan_error) =
2160 windows_metadata_or_empty_result(Some("LegalCopyright: Example Corp".to_string()));
2161
2162 assert_eq!(kind, ExtractedTextKind::WindowsExecutableMetadata);
2163 assert_eq!(text, "LegalCopyright: Example Corp");
2164 assert!(scan_error.is_none());
2165 }
2166
2167 #[test]
2168 fn test_extract_text_for_detection_skips_large_binary_with_single_isolated_string_run() {
2169 let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2170 let text = b"Copyright 2026 Example Project!!!";
2171 bytes[..text.len()].copy_from_slice(text);
2172
2173 let (text, kind) = extract_text_for_detection(Path::new("opaque.bin"), &bytes);
2174
2175 assert!(text.is_empty());
2176 assert_eq!(kind, ExtractedTextKind::None);
2177 }
2178
2179 #[test]
2180 fn test_extract_text_for_detection_keeps_large_binary_with_single_contact_rich_window() {
2181 let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2182 let text = b"Andreas Schneider <asn@redhat.com> Rob Crittenden (rcritten@redhat.com) Mr. Sam <sam@email-scan.com> https://publicsuffix.org/ http://tukaani.org/xz/";
2183 bytes[..text.len()].copy_from_slice(text);
2184
2185 let (text, kind) = extract_text_for_detection(Path::new("rootfs.bin"), &bytes);
2186
2187 assert_ne!(kind, ExtractedTextKind::None);
2188 assert!(text.contains("asn@redhat.com"));
2189 assert!(text.contains("https://publicsuffix.org/"));
2190 }
2191
/// Encryption- and cross-reference-related PDF failure messages are reported
/// as non-actionable, while an unrelated parser failure is not.
#[test]
fn test_non_actionable_pdf_failures_are_suppressed() {
    // Each row is one (first-page, full-document) pair of failure messages.
    let suppressed_pairs = [
        [
            "from-bytes first-page: PDF is encrypted and requires a password",
            "open full-document: PDF is encrypted and requires a password",
        ],
        [
            "from-bytes first-page: Invalid cross-reference table",
            "open full-document: Invalid cross-reference table",
        ],
        [
            "from-bytes first-page: Invalid PDF: Encrypt dictionary missing /O",
            "open full-document: Invalid PDF: security handler cannot be found",
        ],
    ];

    for pair in suppressed_pairs {
        let failures = pair.map(|message| message.to_string());
        assert!(is_non_actionable_pdf_failure(&failures));
    }

    let unrelated = ["from-bytes first-page: some other parser failure".to_string()];
    assert!(!is_non_actionable_pdf_failure(&unrelated));
}
2210
/// Files with a ZIP local-file-header signature and a `.whl` or `.crate`
/// extension must not yield any extracted text.
#[test]
fn test_extract_text_for_detection_skips_zip_like_archives() {
    let archive_header = b"PK\x03\x04\x14\x00\x00\x00\x08\x00artifact";

    for file_name in ["demo.whl", "demo.crate"] {
        let (extracted, extracted_kind) =
            extract_text_for_detection(Path::new(file_name), archive_header);

        assert!(extracted.is_empty());
        assert_eq!(extracted_kind, ExtractedTextKind::None);
    }
}
2224
/// The `.lib` golden fixture must still surface its embedded copyright
/// statement through text extraction.
#[test]
fn test_extract_text_for_detection_keeps_binary_strings_for_lib_fixtures() {
    let fixture =
        Path::new("testdata/copyright-golden/copyrights/copyright_php_lib-php_embed_lib.lib");
    let contents = std::fs::read(fixture).expect("failed to read lib fixture");

    let (extracted, extracted_kind) = extract_text_for_detection(fixture, &contents);

    assert_ne!(extracted_kind, ExtractedTextKind::None);
    assert!(extracted.contains("Copyright nexB and others (c) 2012"));
}
2236
/// The Lato TTF fixture must be reported as font metadata and mention its
/// license description and the Open Font License.
#[test]
fn test_extract_text_for_detection_reads_font_metadata() {
    let fixture = Path::new("testdata/font-fixtures/Lato-Bold.ttf");
    let contents = std::fs::read(fixture).expect("failed to read font fixture");

    let (extracted, extracted_kind) = extract_text_for_detection(fixture, &contents);

    assert_eq!(extracted_kind, ExtractedTextKind::FontMetadata);
    assert!(extracted.contains("License Description:"), "{extracted}");

    // Either the spelled-out license name or its abbreviation is acceptable.
    let mentions_ofl = extracted.contains("Open Font License") || extracted.contains("OFL");
    assert!(mentions_ofl, "{extracted}");
}
2251
/// 525,000 NUL-terminated `abcd` runs must come back longer than two million
/// bytes and still end with the final run.
#[test]
fn test_extract_printable_strings_scales_cap_for_medium_binary_files() {
    let payload = b"abcd\0".repeat(525_000);

    let extracted = extract_printable_strings(&payload);
    let extracted_len = extracted.len();

    assert!(
        extracted_len > 2_000_000,
        "unexpected truncation at {}",
        extracted_len
    );
    assert!(extracted.ends_with("abcd"));
}
2265
/// The SVG fixture is extracted as decoded text and keeps its public-domain
/// license URL.
#[test]
fn test_extract_text_for_detection_decodes_svg_fixture_text() {
    let fixture = Path::new(
        "testdata/license-golden/datadriven/external/fossology-tests/Public-domain/biohazard.svg",
    );
    let contents = std::fs::read(fixture).expect("failed to read svg fixture");

    let (extracted, extracted_kind) = extract_text_for_detection(fixture, &contents);

    assert_eq!(extracted_kind, ExtractedTextKind::Decoded);
    assert!(extracted.contains("creativecommons.org/licenses/publicdomain"));
}
2278
/// The RTF fixture is extracted as decoded text containing the expected
/// LGPL 2.1 phrases.
#[test]
fn test_extract_text_for_detection_decodes_rtf_fixture_text() {
    let fixture = Path::new(
        "testdata/license-golden/datadriven/external/fossology-tests/LGPL/License.rtf",
    );
    let contents = std::fs::read(fixture).expect("failed to read rtf fixture");

    let (extracted, extracted_kind) = extract_text_for_detection(fixture, &contents);

    assert_eq!(extracted_kind, ExtractedTextKind::Decoded);
    for phrase in ["GNU Lesser General Public", "version", "2.1 of the License"] {
        assert!(extracted.contains(phrase));
    }
}
2293
/// A `.ts` file with textual content and a `video/mp2t` guess is normalized
/// to `text/plain`.
#[test]
fn test_normalize_mime_type_prefers_text_for_textual_video_guess() {
    let normalized = normalize_mime_type(
        Path::new("main.ts"),
        b"export const answer = 42;\n",
        Some("TypeScript"),
        "video/mp2t",
    );

    assert_eq!(normalized, "text/plain");
}
2306
/// A `.js` file with textual content and an `application/octet-stream` guess
/// is normalized to `text/plain`.
#[test]
fn test_normalize_mime_type_prefers_text_for_octet_stream_source_guess() {
    let normalized = normalize_mime_type(
        Path::new("main.js"),
        b"console.log('hello');\n",
        Some("JavaScript"),
        "application/octet-stream",
    );

    assert_eq!(normalized, "text/plain");
}
2319
/// A `.ts` file with non-textual bytes keeps its `video/mp2t` guess.
#[test]
fn test_normalize_mime_type_preserves_binary_video_guess() {
    let raw: &[u8] = &[0, 159, 146, 150, 0, 1, 2, 3];

    let normalized =
        normalize_mime_type(Path::new("main.ts"), raw, Some("TypeScript"), "video/mp2t");

    assert_eq!(normalized, "video/mp2t");
}
2332
/// A four-byte non-textual `.ts` payload keeps its
/// `application/octet-stream` guess.
#[test]
fn test_normalize_mime_type_preserves_short_binary_octet_stream_guess() {
    let raw: &[u8] = &[0, 159, 146, 150];

    let normalized = normalize_mime_type(
        Path::new("main.ts"),
        raw,
        Some("TypeScript"),
        "application/octet-stream",
    );

    assert_eq!(normalized, "application/octet-stream");
}
2345
/// An empty `.txt` file is classified as empty text: not binary, not source,
/// and with no programming language.
#[test]
fn test_classify_file_info_marks_empty_files_as_text_not_source() {
    let info = classify_file_info(Path::new("test.txt"), b"");

    assert_eq!(info.mime_type, "inode/x-empty");
    assert_eq!(info.file_type, "empty");

    assert!(!info.is_binary);
    assert!(info.is_text);
    assert!(!info.is_source);
    assert_eq!(info.programming_language, None);
}
2357
/// Valid JSON is classified as JSON text data without being treated as
/// source code or given a programming language.
#[test]
fn test_classify_file_info_keeps_json_out_of_programming_language() {
    let manifest = br#"{"name":"demo"}"#;

    let info = classify_file_info(Path::new("package.json"), manifest);

    assert_eq!(info.mime_type, "application/json");
    assert_eq!(info.file_type, "JSON text data");
    assert!(info.is_text);
    assert!(!info.is_source);
    assert_eq!(info.programming_language, None);
}
2368
/// Textual content that is not valid JSON falls back to plain UTF-8 text
/// despite the `.json` extension.
#[test]
fn test_classify_file_info_does_not_label_invalid_json_text_as_json() {
    let malformed = b"{ definitely not json\n";

    let info = classify_file_info(Path::new("broken.json"), malformed);

    assert_eq!(info.mime_type, "text/plain");
    assert_eq!(info.file_type, "UTF-8 Unicode text");
    assert!(info.is_text);
    assert!(!info.is_binary);
}
2379
/// Binary garbage with a `.json` extension is classified as opaque binary
/// data, not JSON.
#[test]
fn test_classify_file_info_does_not_label_binary_json_garbage_as_json() {
    let garbage: &[u8] = &[0xff, 0x00, 0x01, 0x02];

    let info = classify_file_info(Path::new("broken.json"), garbage);

    assert_eq!(info.mime_type, "application/octet-stream");
    assert_eq!(info.file_type, "data");
    assert!(info.is_binary);
    assert!(!info.is_text);
}
2390
/// A BOM-prefixed UTF-16 JSON document is classified as JSON text rather
/// than binary.
#[test]
fn test_classify_file_info_treats_valid_utf16_json_with_bom_as_text() {
    // FF FE byte-order mark followed by two-byte units spelling `["é"]`.
    let utf16_json: &[u8] = &[
        0xFF, 0xFE, 0x5B, 0x00, 0x22, 0x00, 0xE9, 0x00, 0x22, 0x00, 0x5D, 0x00,
    ];

    let info = classify_file_info(Path::new("utf16.json"), utf16_json);

    assert!(!info.is_binary);
    assert!(info.is_text);
    assert_eq!(info.mime_type, "application/json");
    assert_eq!(info.file_type, "JSON text data");
}
2405
/// A bare JSON literal (`true`) is still classified as JSON text.
#[test]
fn test_classify_file_info_treats_small_valid_json_literals_as_text() {
    let literal = b"true";

    let info = classify_file_info(Path::new("true.json"), literal);

    assert!(!info.is_binary);
    assert!(info.is_text);
    assert_eq!(info.mime_type, "application/json");
    assert_eq!(info.file_type, "JSON text data");
}
2415
/// Bytes wrapped in `["` ... `"]` that include a stray non-UTF-8 byte are
/// classified as plain text without line terminators, not JSON and not binary.
#[test]
fn test_classify_file_info_treats_json_wrapped_invalid_utf8_sequences_as_text() {
    // `["` + multi-byte sequences + a lone 0xFA + `"]`.
    let wrapped: &[u8] = &[0x5B, 0x22, 0xE6, 0x97, 0xA5, 0xD1, 0x88, 0xFA, 0x22, 0x5D];

    let info = classify_file_info(Path::new("wrapped.json"), wrapped);

    assert!(!info.is_binary);
    assert!(info.is_text);
    assert_eq!(info.mime_type, "text/plain");
    assert_eq!(info.file_type, "text, with no line terminators");
}
2428
/// A JSON-shaped payload whose only string content is a lone 0xFF byte stays
/// classified as binary data.
#[test]
fn test_classify_file_info_keeps_lone_ff_json_byte_binary() {
    let lone_ff: &[u8] = &[0x5B, 0x22, 0xFF, 0x22, 0x5D];

    let info = classify_file_info(Path::new("lone-ff.json"), lone_ff);

    assert!(info.is_binary);
    assert!(!info.is_text);
    assert_eq!(info.mime_type, "application/octet-stream");
    assert_eq!(info.file_type, "data");
}
2439
/// A `.json` payload that opens with high bytes and several NULs stays
/// classified as binary.
#[test]
fn test_classify_file_info_keeps_nul_heavy_crash_json_binary() {
    let crash_input: &[u8] = &[
        0xFE, 0x90, 0x00, 0x00, 0x00, 0x93, 0x5B, 0x5B, 0x32, 0x38, 0x36,
    ];

    let info = classify_file_info(Path::new("crash.json"), crash_input);

    assert!(info.is_binary);
    assert!(!info.is_text);
    assert_eq!(info.mime_type, "application/octet-stream");
}
2453
/// A `Dockerfile` is classified as Dockerfile source and not as a script.
#[test]
fn test_classify_file_info_treats_dockerfile_as_source() {
    let info = classify_file_info(Path::new("Dockerfile"), b"FROM scratch\n");

    let language = info.programming_language.as_deref();
    assert_eq!(language, Some("Dockerfile"));

    assert!(info.is_source);
    assert!(!info.is_script);
    assert_eq!(info.file_type, "Dockerfile source, UTF-8 Unicode text");
}
2469
/// A `Makefile` is classified as plain UTF-8 text with no programming
/// language, and as neither source nor script.
#[test]
fn test_classify_file_info_treats_makefile_as_text_not_source() {
    let recipe = b"all:\n\techo hi\n";

    let info = classify_file_info(Path::new("Makefile"), recipe);

    assert_eq!(info.programming_language, None);
    assert!(info.is_text);
    assert!(!info.is_source);
    assert!(!info.is_script);
    assert_eq!(info.file_type, "UTF-8 Unicode text");
}
2480
/// `.egg` and `.nupkg` files carrying a ZIP signature are classified as ZIP
/// archives.
#[test]
fn test_classify_file_info_marks_supported_package_archives() {
    let zip_signature = b"PK\x03\x04\x14\x00\x00\x00";

    for file_name in ["demo.egg", "demo.nupkg"] {
        let info = classify_file_info(Path::new(file_name), zip_signature);

        assert!(info.is_archive);
        assert_eq!(info.mime_type, "application/zip");
        assert_eq!(info.file_type, "Zip archive data");
    }
}
2495
/// A PNG signature is classified as binary media, and as none of text,
/// archive or source.
#[test]
fn test_classify_file_info_marks_png_as_binary_media() {
    let png_header = b"\x89PNG\r\n\x1a\n\x00\x00\x00\x0dIHDR";

    let info = classify_file_info(Path::new("logo.png"), png_header);

    assert_eq!(info.mime_type, "image/png");
    assert_eq!(info.file_type, "PNG image data");

    assert!(info.is_binary);
    assert!(!info.is_text);
    assert!(info.is_media);
    assert!(!info.is_archive);
    assert!(!info.is_source);
}
2510
/// A PDF header is classified as a binary PDF document, and as none of text,
/// archive or media.
#[test]
fn test_classify_file_info_marks_pdf_as_binary_document() {
    let pdf_header = b"%PDF-1.7\n1 0 obj\n<< /Type /Catalog >>\n";

    let info = classify_file_info(Path::new("report.pdf"), pdf_header);

    assert_eq!(info.mime_type, "application/pdf");
    assert_eq!(info.file_type, "PDF document");

    assert!(info.is_binary);
    assert!(!info.is_text);
    assert!(!info.is_archive);
    assert!(!info.is_media);
}
2524
/// An opaque `.bin` blob is classified as binary with no programming
/// language, and as neither text nor source.
#[test]
fn test_classify_file_info_marks_binary_blobs_as_binary() {
    let blob: &[u8] = &[0, 159, 146, 150, 0, 1, 2, 3, 4, 5];

    let info = classify_file_info(Path::new("blob.bin"), blob);

    assert!(info.is_binary);
    assert!(!info.is_text);
    assert!(!info.is_source);
    assert_eq!(info.programming_language, None);
}
2535
/// YAML is classified as YAML text data with no programming language and is
/// not treated as source.
#[test]
fn test_classify_file_info_treats_yaml_as_text_not_source() {
    let document = b"key: value\n";

    let info = classify_file_info(Path::new("config.yaml"), document);

    assert_eq!(info.programming_language, None);
    assert!(info.is_text);
    assert!(!info.is_source);
    assert_eq!(info.file_type, "YAML text data");
}
2545
/// Gradle, Nix and CMake files are classified as source in their respective
/// languages, while `.gitmodules` is classified as Git configuration text.
#[test]
fn test_classify_file_info_classifies_common_build_manifests() {
    // Gradle build script.
    let gradle_info = classify_file_info(Path::new("build.gradle"), b"plugins { id 'java' }\n");
    assert_eq!(gradle_info.programming_language.as_deref(), Some("Groovy"));
    assert!(gradle_info.is_source);
    assert_eq!(gradle_info.mime_type, "text/plain");
    assert_eq!(gradle_info.file_type, "Groovy source, UTF-8 Unicode text");

    // Nix flake.
    let flake_info = classify_file_info(Path::new("flake.nix"), b"{ inputs, ... }: {}\n");
    assert_eq!(flake_info.programming_language.as_deref(), Some("Nix"));
    assert!(flake_info.is_source);
    assert_eq!(flake_info.mime_type, "text/plain");
    assert_eq!(flake_info.file_type, "Nix source, UTF-8 Unicode text");

    // CMake toolchain file.
    let cmake_info = classify_file_info(
        Path::new("toolchain.cmake"),
        b"set(CMAKE_CXX_STANDARD 20)\n",
    );
    assert_eq!(cmake_info.programming_language.as_deref(), Some("CMake"));
    assert!(cmake_info.is_source);
    assert_eq!(cmake_info.file_type, "CMake source, UTF-8 Unicode text");

    // Git submodule configuration: text, but not source.
    let gitmodules_info = classify_file_info(
        Path::new(".gitmodules"),
        b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
    );
    assert_eq!(gitmodules_info.programming_language, None);
    assert!(gitmodules_info.is_text);
    assert!(!gitmodules_info.is_source);
    assert_eq!(gitmodules_info.file_type, "Git configuration text");
}
2578
/// `.hpp` headers are classified as C++ source, whereas `.ipp` files are
/// left as plain text with no language.
#[test]
fn test_classify_file_info_labels_cpp_headers_and_ipp_separately() {
    let hpp_info = classify_file_info(
        Path::new("include/demo.hpp"),
        b"#pragma once\nclass Demo {};\n",
    );
    assert_eq!(hpp_info.programming_language.as_deref(), Some("C++"));
    assert!(hpp_info.is_source);
    assert!(!hpp_info.is_script);
    assert_eq!(hpp_info.file_type, "C++ source, UTF-8 Unicode text");

    let ipp_info = classify_file_info(
        Path::new("include/detail/demo.ipp"),
        b"template <class T> void parse() {}\n",
    );
    assert_eq!(ipp_info.programming_language, None);
    assert!(!ipp_info.is_source);
    assert!(!ipp_info.is_script);
    assert_eq!(ipp_info.file_type, "UTF-8 Unicode text");
}
2600
/// An extensionless file with a bash shebang is labelled as a Bash script.
#[test]
fn test_classify_file_info_preserves_specific_shell_family_labels() {
    let script = b"#!/usr/bin/env bash\necho hi\n";

    let info = classify_file_info(Path::new("bin/run"), script);

    assert_eq!(info.programming_language.as_deref(), Some("Bash"));
    assert!(info.is_script);
    assert_eq!(info.file_type, "bash script, UTF-8 Unicode text executable");
}
2609
/// A `Jamfile` is classified as Jamfile source and not as a script.
#[test]
fn test_classify_file_info_marks_jamfile_as_source() {
    let rules = b"lib boost_json ;\n";

    let info = classify_file_info(Path::new("Jamfile"), rules);

    assert_eq!(info.programming_language.as_deref(), Some("Jamfile"));
    assert!(info.is_source);
    assert!(!info.is_script);
    assert_eq!(info.file_type, "Jamfile source, UTF-8 Unicode text");
}
2619
/// An extensionless file with a `node` shebang is labelled as a JavaScript
/// script.
#[test]
fn test_classify_file_info_labels_javascript_shebang_scripts() {
    let script = b"#!/usr/bin/env node\nconsole.log('hello');\n";

    let info = classify_file_info(Path::new("bin/run"), script);

    let language = info.programming_language.as_deref();
    assert_eq!(language, Some("JavaScript"));
    assert!(info.is_script);
    assert_eq!(
        info.file_type,
        "javascript script, UTF-8 Unicode text executable"
    );
}
2637
/// A Python script containing a non-UTF-8 (0xE9) byte is still labelled as a
/// Python script, with a file type that omits the UTF-8 wording.
#[test]
fn test_classify_file_info_uses_non_utf8_text_labels_for_latin1_scripts() {
    let script = b"# coding: latin-1\nprint(\"caf\xe9\")\n";

    let info = classify_file_info(Path::new("script.py"), script);

    let language = info.programming_language.as_deref();
    assert_eq!(language, Some("Python"));
    assert!(info.is_script);
    assert_eq!(info.file_type, "python script, text executable");
}
2652
/// A `.tga` file with textual content is flagged as media while also being
/// text rather than binary.
#[test]
fn test_classify_file_info_treats_textual_tga_as_media() {
    let contents = b"not really a tga\n";

    let info = classify_file_info(Path::new("texture.tga"), contents);

    assert!(info.is_media);
    assert!(info.is_text);
    assert!(!info.is_binary);
}
2661
/// High-byte content under a source-like `.ts` extension is classified as
/// binary, with no language and no source flag.
#[test]
fn test_classify_file_info_keeps_binaryish_source_extension_out_of_text_path() {
    let high_bytes: &[u8] = &[0x80, 0x81, 0x82, 0x83, 0x84, 0x85];

    let info = classify_file_info(Path::new("main.ts"), high_bytes);

    assert!(info.is_binary);
    assert!(!info.is_text);
    assert!(!info.is_source);
    assert_eq!(info.programming_language, None);
}
2672
/// A tiny GIF image yields no extracted text.
#[test]
fn test_extract_text_for_detection_skips_unsupported_image_formats() {
    let gif_image = b"GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff,\x00\x00\x00\x00\x01\x00\x01\x00\x00\x02\x02D\x01\x00;";

    let (extracted, extracted_kind) =
        extract_text_for_detection(Path::new("tiny.gif"), gif_image);

    assert!(extracted.is_empty());
    assert_eq!(extracted_kind, ExtractedTextKind::None);
}
2682
/// Table-driven check of the detected language and the `is_source` /
/// `is_script` flags for a shebang script, a Dockerfile, JSON, YAML and a
/// Makefile.
#[test]
fn test_classify_file_info_preserves_language_detection_precedence_matrix() {
    // Columns: path, contents, expected language, expected is_source,
    // expected is_script.
    let matrix = [
        (
            Path::new("bin/run"),
            b"#!/usr/bin/env node\nconsole.log('hello');\n".as_slice(),
            Some("JavaScript"),
            true,
            true,
        ),
        (
            Path::new("Dockerfile"),
            b"FROM scratch\n".as_slice(),
            Some("Dockerfile"),
            true,
            false,
        ),
        (
            Path::new("package.json"),
            br#"{"name":"demo"}"#.as_slice(),
            None,
            false,
            false,
        ),
        (
            Path::new("config.yaml"),
            b"key: value\n".as_slice(),
            None,
            false,
            false,
        ),
        (
            Path::new("Makefile"),
            b"all:\n\techo hi\n".as_slice(),
            None,
            false,
            false,
        ),
    ];

    for &(path, contents, want_language, want_source, want_script) in &matrix {
        let info = classify_file_info(path, contents);

        assert_eq!(
            info.programming_language.as_deref(),
            want_language,
            "unexpected language for {}",
            path.display()
        );
        assert_eq!(
            info.is_source,
            want_source,
            "unexpected is_source for {}",
            path.display()
        );
        assert_eq!(
            info.is_script,
            want_script,
            "unexpected is_script for {}",
            path.display()
        );
    }
}
2746}