1use std::borrow::Cow;
2use std::collections::BTreeSet;
3use std::fs;
4use std::io::{BufReader, Cursor, Read};
5use std::panic::{AssertUnwindSafe, catch_unwind};
6use std::path::Path;
7
8use chrono::{TimeZone, Utc};
9use file_format::{FileFormat, Kind as FileFormatKind};
10use flate2::read::ZlibDecoder;
11use glob::Pattern;
12use image::{ImageDecoder, ImageFormat, ImageReader};
13use mime_guess::from_path;
14use quick_xml::events::Event;
15use quick_xml::reader::Reader as XmlReader;
16
17use crate::parsers::windows_executable::extract_windows_executable_metadata_text;
18use crate::utils::font::extract_font_metadata_text;
19use crate::utils::language::detect_language;
20
/// Identifies which extraction strategy produced the text handed back by the
/// text-extraction entry points in this module.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ExtractedTextKind {
    /// No usable text was produced.
    None,
    /// The bytes were decoded directly into text.
    Decoded,
    /// Text came from the font metadata extractor.
    FontMetadata,
    /// Text came from the PDF extractor.
    Pdf,
    /// Text was assembled from printable runs found in binary data.
    BinaryStrings,
    /// Text came from the image metadata extractor.
    ImageMetadata,
    /// Only Windows executable metadata was available.
    WindowsExecutableMetadata,
}
31
/// Aggregated classification result produced by `classify_file_info`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct FileInfoClassification {
    /// Detected MIME type string.
    pub mime_type: String,
    /// Human-readable file type label.
    pub file_type: String,
    /// Detected language; only populated when the file counts as source.
    pub programming_language: Option<String>,
    /// Whether the content was judged to be binary.
    pub is_binary: bool,
    /// Whether the content was judged to be text (the negation of `is_binary`).
    pub is_text: bool,
    /// Whether the file looks like an archive, compressed stream or package.
    pub is_archive: bool,
    /// Whether the file looks like image, audio or video data.
    pub is_media: bool,
    /// Whether the file counts as source code.
    pub is_source: bool,
    /// Whether the file counts as a script.
    pub is_script: bool,
}
44
// NOTE(review): the two image-metadata limits are not referenced in the visible
// part of this file; presumably they cap the image metadata extractor output.
const MAX_IMAGE_METADATA_VALUES: usize = 64;
const MAX_IMAGE_METADATA_TEXT_BYTES: usize = 32 * 1024;

/// Content is treated as binary once more than `len / DIVISOR` of its bytes
/// are control characters.
const BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR: usize = 10;

/// Minimum size (in bytes) from which a non-textual file is considered a
/// "large opaque binary" candidate.
const LARGE_OPAQUE_BINARY_SKIP_BYTES: usize = 512 * 1024;

/// Lowercase extensions of files that never count as source code.
const PLAIN_TEXT_EXTENSIONS: &[&str] = &[
    "rst", "rest", "md", "txt", "log",
    "json", "xml", "yaml", "yml", "toml", "ini",
];

/// Lowercase extensions that force a binary classification.
const BINARY_EXTENSIONS: &[&str] = &[
    "pyc", "pyo",
    "pgm", "pbm", "ppm",
    "mp3", "mp4", "mpeg", "mpg",
    "emf",
];

/// Lowercase extensions that mark a (non-text) file as an archive.
const ARCHIVE_EXTENSIONS: &[&str] = &[
    "zip", "jar", "war", "ear",
    "tar", "gz", "tgz", "bz2", "xz", "7z", "rar",
    "apk", "deb", "rpm",
    "whl", "crate", "egg", "gem", "nupkg",
    "sqs", "squashfs",
];
59
60pub fn get_creation_date(metadata: &fs::Metadata) -> Option<String> {
62 metadata.modified().ok().map(|time: std::time::SystemTime| {
63 let seconds_since_epoch = time
64 .duration_since(std::time::UNIX_EPOCH)
65 .unwrap()
66 .as_secs() as i64;
67
68 Utc.timestamp_opt(seconds_since_epoch, 0)
69 .single()
70 .unwrap_or_else(Utc::now)
71 .format("%Y-%m-%d")
72 .to_string()
73 })
74}
75
76pub fn is_path_excluded(path: &Path, exclude_patterns: &[Pattern]) -> bool {
78 let path_str = path.to_string_lossy();
79 let file_name = path
80 .file_name()
81 .map(|name| name.to_string_lossy())
82 .unwrap_or_default();
83
84 for pattern in exclude_patterns {
85 if pattern.matches(&path_str) {
87 return true;
88 }
89
90 if pattern.matches(&file_name) {
92 return true;
93 }
94 }
95
96 false
97}
98
99pub fn decode_bytes_to_string(bytes: &[u8]) -> String {
105 match String::from_utf8(bytes.to_vec()) {
106 Ok(s) => s,
107 Err(e) => {
108 let bytes = e.into_bytes();
109 if has_binary_control_chars(&bytes) {
110 return String::new();
111 }
112 bytes.iter().map(|&b| b as char).collect()
113 }
114 }
115}
116
117pub fn extract_text_for_detection(path: &Path, bytes: &[u8]) -> (String, ExtractedTextKind) {
118 let (text, kind, _) = extract_text_for_detection_with_diagnostics(path, bytes);
119 (text, kind)
120}
121
122pub(crate) fn augment_license_detection_text<'a>(path: &Path, text: &'a str) -> Cow<'a, str> {
123 let Some(extension) = path.extension().and_then(|ext| ext.to_str()) else {
124 return Cow::Borrowed(text);
125 };
126 if !matches!(
127 extension.to_ascii_lowercase().as_str(),
128 "md" | "markdown" | "html" | "htm"
129 ) {
130 return Cow::Borrowed(text);
131 }
132
133 let mut hints = Vec::new();
134 if text.contains("CC BY 4.0") || text.contains("creativecommons.org/licenses/by/4.0") {
135 hints.push("Creative Commons Attribution 4.0 International License".to_string());
136 }
137 if text.contains("Apache License (Version 2.0)") || text.contains("Apache License, Version 2.0")
138 {
139 hints.push(
140 "Licensed under the Apache License, Version 2.0. http://www.apache.org/licenses/LICENSE-2.0"
141 .to_string(),
142 );
143 }
144
145 hints.extend(extract_shields_license_badge_hints(text));
146
147 if hints.is_empty() {
148 Cow::Borrowed(text)
149 } else {
150 let mut augmented =
151 String::with_capacity(text.len() + hints.iter().map(String::len).sum::<usize>() + 8);
152 augmented.push_str(text);
153 augmented.push_str("\n\n");
154 for (index, hint) in hints.into_iter().enumerate() {
155 if index > 0 {
156 augmented.push('\n');
157 }
158 augmented.push_str(&hint);
159 }
160 Cow::Owned(augmented)
161 }
162}
163
164fn extract_shields_license_badge_hints(text: &str) -> Vec<String> {
165 let mut hints = Vec::new();
166 let mut rest = text;
167 let needle = "img.shields.io/badge/license-";
168
169 while let Some(index) = rest.find(needle) {
170 let start = index + needle.len();
171 let suffix = &rest[start..];
172 let end = suffix
173 .find([')', ']', '"', '\'', ' ', '\n'])
174 .unwrap_or(suffix.len());
175 let badge = &suffix[..end];
176 let Some(badge) = badge.strip_suffix(".svg") else {
177 rest = &suffix[end..];
178 continue;
179 };
180
181 let mut segments: Vec<_> = badge
182 .split('-')
183 .filter(|segment| !segment.is_empty())
184 .collect();
185 if segments.len() < 2 {
186 rest = &suffix[end..];
187 continue;
188 }
189 segments.pop();
190 let candidate = segments.join("-").replace("%20", " ").replace('_', "-");
191 if !candidate.is_empty() {
192 hints.push(canonical_shields_license_hint(&candidate));
193 }
194
195 rest = &suffix[end..];
196 }
197
198 hints.sort();
199 hints.dedup();
200 hints
201}
202
/// Turns a badge license token into a phrase suitable for license detection.
///
/// `MIT` and the two Apache 2.0 spellings get fixed phrases; every other
/// (trimmed) token simply gets ` License` appended.
fn canonical_shields_license_hint(candidate: &str) -> String {
    let trimmed = candidate.trim();

    if trimmed == "MIT" {
        return String::from("The MIT License");
    }
    if trimmed == "Apache-2.0" || trimmed == "Apache 2.0" {
        return String::from("Apache License 2.0");
    }

    format!("{trimmed} License")
}
210
211pub(crate) fn extract_text_for_detection_with_diagnostics(
212 path: &Path,
213 bytes: &[u8],
214) -> (String, ExtractedTextKind, Option<String>) {
215 let ext = path
216 .extension()
217 .and_then(|e| e.to_str())
218 .map(|s| s.to_ascii_lowercase());
219 let detected_format = detect_file_format(bytes);
220
221 if looks_like_rtf(bytes, ext.as_deref()) {
222 let text = extract_rtf_text(bytes);
223 return if text.trim().is_empty() {
224 (String::new(), ExtractedTextKind::None, None)
225 } else {
226 (text, ExtractedTextKind::Decoded, None)
227 };
228 }
229
230 if looks_like_pdf(bytes) || detected_format.short_name() == Some("PDF") {
231 let (text, scan_error) = extract_pdf_text(path, bytes);
232 return if text.is_empty() {
233 (String::new(), ExtractedTextKind::None, scan_error)
234 } else {
235 (text, ExtractedTextKind::Pdf, None)
236 };
237 }
238
239 if let Some(format) = supported_image_metadata_format(ext.as_deref(), detected_format) {
240 let text = extract_image_metadata_text(bytes, format);
241 return if text.is_empty() {
242 if is_supported_image_container(bytes, format) {
243 (String::new(), ExtractedTextKind::None, None)
244 } else {
245 let decoded = decode_bytes_to_string(bytes);
246 if decoded.is_empty() {
247 (String::new(), ExtractedTextKind::None, None)
248 } else {
249 (decoded, ExtractedTextKind::Decoded, None)
250 }
251 }
252 } else {
253 (text, ExtractedTextKind::ImageMetadata, None)
254 };
255 }
256
257 if let Some(text) = extract_font_metadata_text(path, bytes) {
258 return (text, ExtractedTextKind::FontMetadata, None);
259 }
260
261 let windows_executable_metadata_text = extract_windows_executable_metadata_text(bytes);
262 let large_opaque_binary = windows_executable_metadata_text.is_none()
263 && is_large_opaque_binary_candidate(bytes, detected_format);
264
265 if should_skip_large_opaque_binary_text_extraction(path, bytes, detected_format) {
266 return windows_metadata_or_empty_result(windows_executable_metadata_text);
267 }
268
269 if should_skip_binary_string_extraction(path, bytes, detected_format) {
270 return (String::new(), ExtractedTextKind::None, None);
271 }
272
273 if !large_opaque_binary {
274 let decoded = decode_bytes_to_string(bytes);
275 if !decoded.is_empty() {
276 let combined =
277 combine_extracted_text_fragments(windows_executable_metadata_text, decoded);
278 return (combined, ExtractedTextKind::Decoded, None);
279 }
280 }
281
282 let text = if large_opaque_binary {
283 extract_sampled_printable_strings(bytes)
284 } else {
285 extract_printable_strings(bytes)
286 };
287 if text.is_empty() {
288 windows_metadata_or_empty_result(windows_executable_metadata_text)
289 } else {
290 (
291 combine_extracted_text_fragments(windows_executable_metadata_text, text),
292 ExtractedTextKind::BinaryStrings,
293 None,
294 )
295 }
296}
297
/// Joins an optional leading fragment and a trailing fragment with a newline.
///
/// A missing or empty prefix yields the suffix alone; an empty suffix yields
/// the prefix alone.
fn combine_extracted_text_fragments(prefix: Option<String>, suffix: String) -> String {
    let Some(mut combined) = prefix.filter(|head| !head.is_empty()) else {
        return suffix;
    };

    if !suffix.is_empty() {
        combined.push('\n');
        combined.push_str(&suffix);
    }
    combined
}
305
306fn windows_metadata_or_empty_result(
307 windows_executable_metadata_text: Option<String>,
308) -> (String, ExtractedTextKind, Option<String>) {
309 if let Some(metadata_text) = windows_executable_metadata_text {
310 (
311 metadata_text,
312 ExtractedTextKind::WindowsExecutableMetadata,
313 None,
314 )
315 } else {
316 (String::new(), ExtractedTextKind::None, None)
317 }
318}
319
320pub fn classify_file_info(path: &Path, bytes: &[u8]) -> FileInfoClassification {
321 let detected_format = detect_file_format(bytes);
322 let detected_language = detect_language(path, bytes);
323 let is_binary = detect_is_binary(path, bytes, detected_format, detected_language.as_deref());
324 let is_text = !is_binary;
325 let mime_type = detect_mime_type(path, bytes, detected_format, detected_language.as_deref());
326 let is_archive = detect_is_archive(path, bytes, &mime_type, is_text, detected_format);
327 let is_media = detect_is_media(path, bytes, &mime_type, detected_format);
328 let is_script = detect_is_script(path, bytes, detected_language.as_deref(), is_text);
329 let is_source = detect_is_source(path, detected_language.as_deref(), is_text, is_script);
330 let programming_language = is_source.then(|| detected_language.clone()).flatten();
331 let file_type = detect_file_type(
332 path,
333 bytes,
334 detected_format,
335 &mime_type,
336 programming_language.as_deref(),
337 is_binary,
338 is_text,
339 is_archive,
340 is_media,
341 is_script,
342 );
343
344 FileInfoClassification {
345 mime_type,
346 file_type,
347 programming_language,
348 is_binary,
349 is_text,
350 is_archive,
351 is_media,
352 is_source,
353 is_script,
354 }
355}
356
357fn detect_file_format(bytes: &[u8]) -> FileFormat {
358 FileFormat::from_reader(Cursor::new(bytes)).unwrap_or(FileFormat::ArbitraryBinaryData)
359}
360
/// Returns `true` when the whole buffer is valid UTF-8 (the empty buffer is).
fn is_utf8_text(bytes: &[u8]) -> bool {
    matches!(std::str::from_utf8(bytes), Ok(_))
}
364
365fn has_binary_control_chars(bytes: &[u8]) -> bool {
366 let control_count = bytes
367 .iter()
368 .filter(|&&b| b < 0x09 || (b > 0x0D && b < 0x20))
369 .count();
370 control_count > bytes.len() / BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR
371}
372
373fn has_decodable_text(bytes: &[u8]) -> bool {
374 bytes.is_empty() || is_utf8_text(bytes) || !has_binary_control_chars(bytes)
375}
376
/// Heuristically decides whether a buffer reads as text.
///
/// Empty and valid UTF-8 buffers always qualify. Otherwise at least half of
/// the bytes must be printable ASCII, tab, line feed or carriage return.
fn looks_like_textual_bytes(bytes: &[u8]) -> bool {
    if bytes.is_empty() || std::str::from_utf8(bytes).is_ok() {
        return true;
    }

    let is_printable = |byte: u8| byte == b'\n' || byte == b'\r' || byte == b'\t' || (0x20..=0x7e).contains(&byte);
    let printable = bytes.iter().filter(|&&byte| is_printable(byte)).count();

    bytes.len() <= 2 * printable
}
388
/// Returns `true` for media types that denote textual content: anything under
/// `text/`, JSON and XML, and any `+json` / `+xml` structured-syntax suffix.
fn is_textual_media_type(media_type: &str) -> bool {
    if media_type.starts_with("text/") {
        return true;
    }
    if media_type == "application/json" || media_type == "application/xml" {
        return true;
    }
    ["+json", "+xml"]
        .iter()
        .any(|suffix| media_type.ends_with(suffix))
}
398
399fn is_textual_format(detected_format: FileFormat) -> bool {
400 matches!(detected_format, FileFormat::Empty | FileFormat::PlainText)
401 || is_textual_media_type(detected_format.media_type())
402}
403
404fn is_known_binary_format(detected_format: FileFormat) -> bool {
405 !matches!(detected_format, FileFormat::ArbitraryBinaryData)
406 && !is_textual_format(detected_format)
407}
408
409pub fn detect_mime_type(
410 path: &Path,
411 bytes: &[u8],
412 detected_format: FileFormat,
413 programming_language: Option<&str>,
414) -> String {
415 if bytes.is_empty() {
416 return "inode/x-empty".to_string();
417 }
418
419 if is_zip_archive(bytes) {
420 return detect_zip_like_mime(path);
421 }
422
423 if looks_like_deb(bytes, path) {
424 return "application/vnd.debian.binary-package".to_string();
425 }
426
427 if looks_like_rpm(bytes, path) {
428 return "application/x-rpm".to_string();
429 }
430
431 let guessed_mime = from_path(path)
432 .first_or_octet_stream()
433 .essence_str()
434 .to_string();
435
436 let mime_type = match detected_format {
437 FileFormat::Empty => "inode/x-empty".to_string(),
438 FileFormat::PlainText => {
439 if guessed_mime == "application/octet-stream" || guessed_mime.starts_with("video/") {
440 "text/plain".to_string()
441 } else {
442 guessed_mime.clone()
443 }
444 }
445 _ => {
446 let detected_mime = detected_format.media_type();
447 if detected_mime == "application/octet-stream"
448 && guessed_mime != "application/octet-stream"
449 {
450 guessed_mime.clone()
451 } else {
452 detected_mime.to_string()
453 }
454 }
455 };
456
457 normalize_mime_type(path, bytes, programming_language, &mime_type)
458}
459
460fn normalize_mime_type(
461 path: &Path,
462 bytes: &[u8],
463 programming_language: Option<&str>,
464 mime_type: &str,
465) -> String {
466 if should_prefer_text_mime(path, bytes, programming_language, mime_type) {
467 return "text/plain".to_string();
468 }
469
470 mime_type.to_string()
471}
472
473fn should_prefer_text_mime(
474 path: &Path,
475 bytes: &[u8],
476 programming_language: Option<&str>,
477 mime_type: &str,
478) -> bool {
479 has_decodable_text(bytes)
480 && looks_like_textual_bytes(bytes)
481 && is_textual_source_candidate(path, programming_language)
482 && (mime_type.starts_with("video/") || mime_type == "application/octet-stream")
483}
484
485fn detect_is_binary(
486 path: &Path,
487 bytes: &[u8],
488 detected_format: FileFormat,
489 programming_language: Option<&str>,
490) -> bool {
491 if is_textual_format(detected_format) {
492 return false;
493 }
494
495 if lower_extension(path)
496 .as_deref()
497 .is_some_and(|ext| BINARY_EXTENSIONS.contains(&ext))
498 {
499 return true;
500 }
501
502 if should_treat_binary_bytes_as_text(path, bytes, programming_language) {
503 return false;
504 }
505
506 has_binary_control_chars(bytes)
507 || is_known_binary_format(detected_format)
508 || (matches!(detected_format, FileFormat::ArbitraryBinaryData)
509 && !looks_like_textual_bytes(bytes))
510}
511
512fn should_treat_binary_bytes_as_text(
513 path: &Path,
514 bytes: &[u8],
515 programming_language: Option<&str>,
516) -> bool {
517 has_decodable_text(bytes)
518 && looks_like_textual_bytes(bytes)
519 && (bytes.starts_with(b"#!") || is_textual_source_candidate(path, programming_language))
520}
521
522fn detect_is_archive(
523 path: &Path,
524 bytes: &[u8],
525 mime_type: &str,
526 is_text: bool,
527 detected_format: FileFormat,
528) -> bool {
529 if is_text {
530 return false;
531 }
532
533 lower_extension(path)
534 .as_deref()
535 .is_some_and(|ext| ARCHIVE_EXTENSIONS.contains(&ext))
536 || matches!(
537 detected_format.kind(),
538 FileFormatKind::Archive | FileFormatKind::Compressed | FileFormatKind::Package
539 )
540 || is_zip_archive(bytes)
541 || looks_like_gzip(bytes)
542 || looks_like_bzip2(bytes)
543 || looks_like_xz(bytes)
544 || looks_like_deb(bytes, path)
545 || looks_like_rpm(bytes, path)
546 || looks_like_squashfs(bytes, path)
547 || mime_type.contains("zip")
548 || mime_type.contains("compressed")
549 || mime_type.contains("tar")
550 || mime_type.contains("x-rpm")
551 || mime_type.contains("debian")
552}
553
554fn detect_is_media(
555 path: &Path,
556 bytes: &[u8],
557 mime_type: &str,
558 detected_format: FileFormat,
559) -> bool {
560 media_mime_from_content(bytes).is_some()
561 || matches!(
562 detected_format.kind(),
563 FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
564 )
565 || mime_type.starts_with("image/")
566 || mime_type.starts_with("audio/")
567 || mime_type.starts_with("video/")
568 || (mime_type == "application/octet-stream"
569 && lower_extension(path).as_deref() == Some("tga")
570 && !has_binary_control_chars(bytes))
571}
572
573fn detect_is_script(
574 path: &Path,
575 bytes: &[u8],
576 programming_language: Option<&str>,
577 is_text: bool,
578) -> bool {
579 if !is_text || is_makefile(path) {
580 return false;
581 }
582
583 bytes.starts_with(b"#!")
584 || lower_extension(path).as_deref().is_some_and(|ext| {
585 matches!(
586 ext,
587 "sh" | "bash" | "zsh" | "fish" | "ksh" | "ps1" | "psm1" | "psd1" | "awk"
588 )
589 })
590 || matches!(
591 programming_language,
592 Some("Shell" | "Python" | "Ruby" | "Perl" | "PHP" | "PowerShell" | "Awk")
593 )
594}
595
596fn detect_is_source(
597 path: &Path,
598 programming_language: Option<&str>,
599 is_text: bool,
600 is_script: bool,
601) -> bool {
602 if !is_text || is_plain_text(path) || is_makefile(path) || is_source_map(path) {
603 return false;
604 }
605
606 if is_c_like_source(path) || is_java_like_source(path) {
607 return true;
608 }
609
610 programming_language.is_some() || is_script
611}
612
613#[allow(clippy::too_many_arguments)]
614fn detect_file_type(
615 path: &Path,
616 bytes: &[u8],
617 detected_format: FileFormat,
618 mime_type: &str,
619 programming_language: Option<&str>,
620 is_binary: bool,
621 is_text: bool,
622 is_archive: bool,
623 is_media: bool,
624 is_script: bool,
625) -> String {
626 if bytes.is_empty() {
627 return "empty".to_string();
628 }
629
630 if looks_like_pdf(bytes) {
631 return "PDF document".to_string();
632 }
633
634 if let Some(file_type) = media_file_type_from_content(bytes) {
635 return file_type.to_string();
636 }
637
638 if is_archive {
639 return archive_file_type(path, bytes, detected_format);
640 }
641
642 if is_script {
643 return script_file_type(programming_language, bytes);
644 }
645
646 if is_text {
647 if lower_extension(path).as_deref() == Some("json") {
648 return "JSON text data".to_string();
649 }
650 if lower_extension(path).as_deref() == Some("xml") {
651 return "XML text data".to_string();
652 }
653 if matches!(lower_extension(path).as_deref(), Some("yaml" | "yml")) {
654 return "YAML text data".to_string();
655 }
656 if lower_extension(path).as_deref() == Some("toml") {
657 return "TOML text data".to_string();
658 }
659 if matches!(
660 lower_extension(path).as_deref(),
661 Some("ini" | "cfg" | "conf")
662 ) {
663 return "INI text data".to_string();
664 }
665 if matches!(lower_file_name(path).as_str(), ".gitmodules" | ".gitconfig") {
666 return "Git configuration text".to_string();
667 }
668 if matches!(lower_extension(path).as_deref(), Some("md" | "markdown")) {
669 return text_file_type(bytes);
670 }
671 if programming_language.is_some() && !is_media {
672 return text_file_type(bytes);
673 }
674 return text_file_type(bytes);
675 }
676
677 if let Some(file_type) = format_based_file_type(detected_format) {
678 return file_type;
679 }
680
681 if is_binary && mime_type == "application/octet-stream" {
682 return "data".to_string();
683 }
684
685 mime_type.to_string()
686}
687
688fn is_textual_source_candidate(path: &Path, programming_language: Option<&str>) -> bool {
689 if matches!(programming_language, Some(language) if is_source_like_language(language)) {
690 return true;
691 }
692
693 if matches!(
694 lower_file_name(path).as_str(),
695 "dockerfile"
696 | "containerfile"
697 | "containerfile.core"
698 | "apkbuild"
699 | "podfile"
700 | "meson.build"
701 | "build"
702 | "workspace"
703 | "buck"
704 | "default.nix"
705 | "flake.nix"
706 | "shell.nix"
707 ) {
708 return true;
709 }
710
711 path.extension()
712 .and_then(|ext| ext.to_str())
713 .is_some_and(|ext| {
714 matches!(
715 ext.to_ascii_lowercase().as_str(),
716 "rs" | "py"
717 | "js"
718 | "mjs"
719 | "cjs"
720 | "jsx"
721 | "ts"
722 | "mts"
723 | "cts"
724 | "tsx"
725 | "c"
726 | "cpp"
727 | "cc"
728 | "cxx"
729 | "h"
730 | "hpp"
731 | "m"
732 | "mm"
733 | "s"
734 | "asm"
735 | "java"
736 | "go"
737 | "rb"
738 | "php"
739 | "pl"
740 | "swift"
741 | "sh"
742 | "bash"
743 | "zsh"
744 | "fish"
745 | "ksh"
746 | "ps1"
747 | "psm1"
748 | "psd1"
749 | "awk"
750 | "kt"
751 | "kts"
752 | "dart"
753 | "scala"
754 | "groovy"
755 | "gradle"
756 | "gvy"
757 | "gy"
758 | "gsh"
759 | "cs"
760 | "fs"
761 | "fsx"
762 | "r"
763 | "lua"
764 | "jl"
765 | "ex"
766 | "exs"
767 | "clj"
768 | "cljs"
769 | "cljc"
770 | "hs"
771 | "erl"
772 | "nix"
773 | "zig"
774 | "bzl"
775 | "bazel"
776 | "star"
777 | "sky"
778 | "ml"
779 | "mli"
780 | "tex"
781 )
782 })
783}
784
/// Returns `true` when `language` is one of the language names this module
/// treats as source code. The comparison is exact and case-sensitive.
fn is_source_like_language(language: &str) -> bool {
    const SOURCE_LANGUAGES: &[&str] = &[
        "Rust",
        "Python",
        "JavaScript",
        "TypeScript",
        "JavaScript/TypeScript",
        "C",
        "C++",
        "Objective-C",
        "Objective-C++",
        "GAS",
        "Java",
        "Go",
        "Ruby",
        "PHP",
        "Perl",
        "Swift",
        "Shell",
        "PowerShell",
        "Awk",
        "Kotlin",
        "Dart",
        "Scala",
        "C#",
        "F#",
        "R",
        "Lua",
        "Julia",
        "Elixir",
        "Clojure",
        "Haskell",
        "Erlang",
        "Groovy",
        "Nix",
        "Zig",
        "Starlark",
        "OCaml",
        "Meson",
        "TeX",
        "Dockerfile",
        "Makefile",
    ];

    SOURCE_LANGUAGES.iter().any(|known| *known == language)
}
830
/// Returns the path's extension as `&str`, or `None` when there is no
/// extension or it is not valid UTF-8.
fn extension(path: &Path) -> Option<&str> {
    path.extension()?.to_str()
}
834
/// Returns the path's extension converted to ASCII lowercase, or `None` when
/// there is no extension or it is not valid UTF-8.
fn lower_extension(path: &Path) -> Option<String> {
    let raw = path.extension()?.to_str()?;
    Some(raw.to_ascii_lowercase())
}
838
/// Returns the final path component in ASCII lowercase, or an empty string
/// when the path has no final component or it is not valid UTF-8.
fn lower_file_name(path: &Path) -> String {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name.to_ascii_lowercase(),
        None => String::new(),
    }
}
845
846fn is_plain_text(path: &Path) -> bool {
847 lower_extension(path)
848 .as_deref()
849 .is_some_and(|ext| PLAIN_TEXT_EXTENSIONS.contains(&ext))
850}
851
852fn is_makefile(path: &Path) -> bool {
853 matches!(lower_file_name(path).as_str(), "makefile" | "makefile.inc")
854}
855
/// Returns `true` when the path ends in `.js.map` or `.css.map`
/// (ASCII case-insensitive).
fn is_source_map(path: &Path) -> bool {
    let lowered = path.to_string_lossy().to_ascii_lowercase();
    [".js.map", ".css.map"]
        .iter()
        .any(|suffix| lowered.ends_with(suffix))
}
860
861fn is_c_like_source(path: &Path) -> bool {
862 lower_extension(path).as_deref().is_some_and(|ext| {
863 matches!(
864 ext,
865 "c" | "cc"
866 | "cp"
867 | "cpp"
868 | "cxx"
869 | "c++"
870 | "h"
871 | "hh"
872 | "hpp"
873 | "hxx"
874 | "h++"
875 | "i"
876 | "ii"
877 | "m"
878 | "s"
879 | "asm"
880 )
881 })
882}
883
884fn is_java_like_source(path: &Path) -> bool {
885 lower_extension(path)
886 .as_deref()
887 .is_some_and(|ext| matches!(ext, "java" | "aj" | "jad" | "ajt"))
888}
889
890fn format_based_file_type(detected_format: FileFormat) -> Option<String> {
891 match detected_format {
892 FileFormat::ArbitraryBinaryData | FileFormat::Empty | FileFormat::PlainText => None,
893 format if format.short_name() == Some("PDF") => Some("PDF document".to_string()),
894 format => Some(match format.kind() {
895 FileFormatKind::Image => short_name_or_name(&format, "image data"),
896 FileFormatKind::Audio => short_name_or_name(&format, "audio data"),
897 FileFormatKind::Video => short_name_or_name(&format, "video data"),
898 _ => format.name().to_string(),
899 }),
900 }
901}
902
903fn short_name_or_name(format: &FileFormat, suffix: &str) -> String {
904 format
905 .short_name()
906 .map(|short_name| format!("{short_name} {suffix}"))
907 .unwrap_or_else(|| format!("{} {suffix}", format.name()))
908}
909
/// Picks the MIME type for a ZIP container from its extension: Android
/// packages and Java archives get their dedicated types, everything else is
/// `application/zip`.
fn detect_zip_like_mime(path: &Path) -> String {
    let lowered_ext = path
        .extension()
        .and_then(|ext| ext.to_str())
        .map(|ext| ext.to_ascii_lowercase());

    let mime = match lowered_ext.as_deref() {
        Some("apk") => "application/vnd.android.package-archive",
        Some("jar" | "war" | "ear") => "application/java-archive",
        _ => "application/zip",
    };
    mime.to_string()
}
919
/// Recognizes PNG, JPEG, TIFF (both byte orders) and WebP by their leading
/// magic bytes and returns the matching MIME type.
fn media_mime_from_content(bytes: &[u8]) -> Option<&'static str> {
    match bytes {
        [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n', ..] => Some("image/png"),
        [0xff, 0xd8, 0xff, ..] => Some("image/jpeg"),
        [b'I', b'I', 0x2a, 0x00, ..] | [b'M', b'M', 0x00, 0x2a, ..] => Some("image/tiff"),
        // RIFF container whose form type (bytes 8..12) is WEBP.
        [b'R', b'I', b'F', b'F', _, _, _, _, b'W', b'E', b'B', b'P', ..] => Some("image/webp"),
        _ => None,
    }
}
933
/// Recognizes PNG, JPEG, TIFF (both byte orders) and WebP by their leading
/// magic bytes and returns the matching file type label.
fn media_file_type_from_content(bytes: &[u8]) -> Option<&'static str> {
    match bytes {
        [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n', ..] => Some("PNG image data"),
        [0xff, 0xd8, 0xff, ..] => Some("JPEG image data"),
        [b'I', b'I', 0x2a, 0x00, ..] | [b'M', b'M', 0x00, 0x2a, ..] => Some("TIFF image data"),
        // RIFF container whose form type (bytes 8..12) is WEBP.
        [b'R', b'I', b'F', b'F', _, _, _, _, b'W', b'E', b'B', b'P', ..] => {
            Some("WebP image data")
        }
        _ => None,
    }
}
947
/// Returns `true` when the buffer starts with the `%PDF-` header.
fn looks_like_pdf(bytes: &[u8]) -> bool {
    matches!(bytes, [b'%', b'P', b'D', b'F', b'-', ..])
}
951
/// Returns `true` when the (already lowercased) extension is `rtf` or the
/// buffer starts with the `{\rtf` header.
fn looks_like_rtf(bytes: &[u8], ext: Option<&str>) -> bool {
    if let Some("rtf") = ext {
        return true;
    }
    matches!(bytes, [b'{', b'\\', b'r', b't', b'f', ..])
}
955
/// Extracts readable text from RTF content with a small, tolerant scanner.
///
/// Group braces are dropped, escaped `\\`, `\{`, `\}` are emitted literally,
/// `\'hh` hex escapes are emitted as the scalar value `hh`, a handful of
/// control words are mapped to their characters, and every other control word
/// is discarded. `\uN` emits the Unicode scalar (negative values are offset by
/// 65536) and then skips its single fallback character, which may be either a
/// plain character or a `\'hh` hex escape; without the latter, input such as
/// `\u8217\'92` would emit the fallback byte as an extra character.
///
/// Carriage returns and form feeds become line feeds, and trailing whitespace
/// is stripped from every line of the result.
fn extract_rtf_text(bytes: &[u8]) -> String {
    let text = String::from_utf8_lossy(bytes);
    let chars: Vec<char> = text.chars().collect();
    let mut output = String::new();
    let mut index = 0usize;

    while index < chars.len() {
        match chars[index] {
            '{' | '}' => {
                index += 1;
            }
            '\\' => {
                index += 1;
                if index >= chars.len() {
                    break;
                }

                match chars[index] {
                    '\\' | '{' | '}' => {
                        output.push(chars[index]);
                        index += 1;
                    }
                    '\'' => {
                        if index + 2 < chars.len() {
                            let hex: String = chars[index + 1..=index + 2].iter().collect();
                            if let Ok(value) = u8::from_str_radix(&hex, 16) {
                                output.push(char::from(value));
                                index += 3;
                                continue;
                            }
                        }
                        // Malformed escape: drop the apostrophe and carry on.
                        index += 1;
                    }
                    control if control.is_ascii_alphabetic() => {
                        let word_start = index;
                        while index < chars.len() && chars[index].is_ascii_alphabetic() {
                            index += 1;
                        }
                        let control_word: String = chars[word_start..index].iter().collect();

                        // Optional numeric parameter, possibly negative.
                        let number_start = index;
                        if index < chars.len()
                            && (chars[index] == '-' || chars[index].is_ascii_digit())
                        {
                            index += 1;
                            while index < chars.len() && chars[index].is_ascii_digit() {
                                index += 1;
                            }
                        }
                        let parameter: String = chars[number_start..index].iter().collect();

                        // A single space after a control word is its delimiter.
                        if index < chars.len() && chars[index] == ' ' {
                            index += 1;
                        }

                        match control_word.as_str() {
                            "par" | "line" => output.push('\n'),
                            "tab" => output.push('\t'),
                            "emdash" => output.push('—'),
                            "endash" => output.push('–'),
                            "bullet" => output.push('•'),
                            "lquote" | "rquote" => output.push('\''),
                            "ldblquote" | "rdblquote" => output.push('"'),
                            "u" => {
                                if let Ok(codepoint) = parameter.parse::<i32>() {
                                    let normalized = if codepoint < 0 {
                                        codepoint + 65_536
                                    } else {
                                        codepoint
                                    };
                                    if let Some(ch) =
                                        u32::try_from(normalized).ok().and_then(char::from_u32)
                                    {
                                        output.push(ch);
                                    }
                                }

                                // Skip the fallback representation that follows
                                // the Unicode escape.
                                let hex_fallback = index + 3 < chars.len()
                                    && chars[index] == '\\'
                                    && chars[index + 1] == '\''
                                    && chars[index + 2].is_ascii_hexdigit()
                                    && chars[index + 3].is_ascii_hexdigit();
                                if hex_fallback {
                                    index += 4;
                                } else if index < chars.len()
                                    && !matches!(chars[index], '\\' | '{' | '}' | '\n' | '\r')
                                {
                                    index += 1;
                                }
                            }
                            _ => {}
                        }
                    }
                    _ => {
                        // Unsupported control symbol: ignore it.
                        index += 1;
                    }
                }
            }
            ch => {
                output.push(ch);
                index += 1;
            }
        }
    }

    output
        .replace(['\r', '\u{0c}'], "\n")
        .lines()
        .map(str::trim_end)
        .collect::<Vec<_>>()
        .join("\n")
}
1062
/// Returns `true` when the buffer starts with the gzip magic bytes `1f 8b`.
fn looks_like_gzip(bytes: &[u8]) -> bool {
    matches!(bytes, [0x1f, 0x8b, ..])
}
1066
/// Returns `true` when the buffer starts with the bzip2 header `BZh`.
fn looks_like_bzip2(bytes: &[u8]) -> bool {
    matches!(bytes, [b'B', b'Z', b'h', ..])
}
1070
/// Returns `true` when the buffer starts with the six-byte XZ stream header.
fn looks_like_xz(bytes: &[u8]) -> bool {
    matches!(bytes, [0xfd, b'7', b'z', b'X', b'Z', 0x00, ..])
}
1074
/// Returns `true` when the file has a `.deb` extension (ASCII
/// case-insensitive) and starts with the `!<arch>\n` magic.
fn looks_like_deb(bytes: &[u8], path: &Path) -> bool {
    let has_deb_extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .is_some_and(|ext| ext.eq_ignore_ascii_case("deb"));

    has_deb_extension && bytes.starts_with(b"!<arch>\n")
}
1078
/// Returns `true` when the file has an `.rpm` extension (ASCII
/// case-insensitive) and starts with the magic bytes `ed ab ee db`.
fn looks_like_rpm(bytes: &[u8], path: &Path) -> bool {
    let has_rpm_extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .is_some_and(|ext| ext.eq_ignore_ascii_case("rpm"));

    has_rpm_extension && matches!(bytes, [0xed, 0xab, 0xee, 0xdb, ..])
}
1082
/// Returns `true` when the file has an `.sqs` or `.squashfs` extension (ASCII
/// case-insensitive) and starts with either of the two four-byte magic
/// sequences `hsqs` / `sqsh`.
fn looks_like_squashfs(bytes: &[u8], path: &Path) -> bool {
    let has_squashfs_extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .is_some_and(|ext| {
            ext.eq_ignore_ascii_case("sqs") || ext.eq_ignore_ascii_case("squashfs")
        });

    has_squashfs_extension
        && matches!(
            bytes,
            [0x68, 0x73, 0x71, 0x73, ..] | [0x73, 0x71, 0x73, 0x68, ..]
        )
}
1090
1091fn archive_file_type(path: &Path, bytes: &[u8], detected_format: FileFormat) -> String {
1092 if looks_like_deb(bytes, path) {
1093 "debian binary package (format 2.0)".to_string()
1094 } else if looks_like_rpm(bytes, path) {
1095 "RPM package".to_string()
1096 } else if looks_like_squashfs(bytes, path) {
1097 "Squashfs filesystem".to_string()
1098 } else if looks_like_gzip(bytes) {
1099 "gzip compressed data".to_string()
1100 } else if looks_like_bzip2(bytes) {
1101 "bzip2 compressed data".to_string()
1102 } else if looks_like_xz(bytes) {
1103 "XZ compressed data".to_string()
1104 } else if is_zip_archive(bytes) {
1105 "Zip archive data".to_string()
1106 } else if lower_extension(path).as_deref() == Some("gem") {
1107 "POSIX tar archive".to_string()
1108 } else if let Some(file_type) = format_based_file_type(detected_format) {
1109 file_type
1110 } else {
1111 "archive data".to_string()
1112 }
1113}
1114
1115fn script_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1116 let suffix = text_executable_label(bytes);
1117
1118 match programming_language {
1119 Some("Python") => format!("python script, {suffix}"),
1120 Some("Ruby") => format!("ruby script, {suffix}"),
1121 Some("Perl") => format!("perl script, {suffix}"),
1122 Some("PHP") => format!("php script, {suffix}"),
1123 Some("Shell") => format!("shell script, {suffix}"),
1124 Some("JavaScript") => format!("javascript script, {suffix}"),
1125 Some("TypeScript") => format!("typescript script, {suffix}"),
1126 Some("PowerShell") => format!("powershell script, {suffix}"),
1127 Some("Awk") => format!("awk script, {suffix}"),
1128 _ => format!("script, {suffix}"),
1129 }
1130}
1131
1132fn text_file_type(bytes: &[u8]) -> String {
1133 text_label(bytes).to_string()
1134}
1135
/// Labels text by encoding (valid UTF-8 or not) and by whether it contains at
/// least one line feed.
fn text_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.contains(&b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text",
        (true, false) => "UTF-8 Unicode text, with no line terminators",
        (false, true) => "text",
        (false, false) => "text, with no line terminators",
    }
}
1149
/// Labels executable text by encoding (valid UTF-8 or not) and by whether it
/// contains at least one line feed.
fn text_executable_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.contains(&b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text executable",
        (true, false) => "UTF-8 Unicode text executable, with no line terminators",
        (false, true) => "text executable",
        (false, false) => "text executable, with no line terminators",
    }
}
1163
1164fn supported_image_metadata_format(
1165 ext: Option<&str>,
1166 detected_format: FileFormat,
1167) -> Option<ImageFormat> {
1168 match ext {
1169 Some("jpg" | "jpeg") => Some(ImageFormat::Jpeg),
1170 Some("png") => Some(ImageFormat::Png),
1171 Some("tif" | "tiff") => Some(ImageFormat::Tiff),
1172 Some("webp") => Some(ImageFormat::WebP),
1173 _ => match detected_format.media_type() {
1174 "image/jpeg" => Some(ImageFormat::Jpeg),
1175 "image/png" => Some(ImageFormat::Png),
1176 "image/tiff" => Some(ImageFormat::Tiff),
1177 "image/webp" => Some(ImageFormat::WebP),
1178 _ => None,
1179 },
1180 }
1181}
1182
1183fn should_skip_binary_string_extraction(
1184 path: &Path,
1185 bytes: &[u8],
1186 detected_format: FileFormat,
1187) -> bool {
1188 matches!(lower_extension(path).as_deref(), Some("pdf"))
1189 || supported_image_metadata_format(lower_extension(path).as_deref(), detected_format)
1190 .is_some()
1191 || (matches!(
1192 detected_format.kind(),
1193 FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
1194 ) && !is_textual_format(detected_format))
1195 || media_mime_from_content(bytes).is_some()
1196 || is_zip_archive(bytes)
1197 || looks_like_gzip(bytes)
1198 || looks_like_bzip2(bytes)
1199 || looks_like_xz(bytes)
1200 || looks_like_deb(bytes, path)
1201 || looks_like_rpm(bytes, path)
1202 || looks_like_squashfs(bytes, path)
1203}
1204
1205fn should_skip_large_opaque_binary_text_extraction(
1206 _path: &Path,
1207 bytes: &[u8],
1208 detected_format: FileFormat,
1209) -> bool {
1210 is_large_opaque_binary_candidate(bytes, detected_format)
1211 && !sample_has_promising_printable_strings(bytes)
1212}
1213
1214fn is_large_opaque_binary_candidate(bytes: &[u8], detected_format: FileFormat) -> bool {
1215 bytes.len() >= LARGE_OPAQUE_BINARY_SKIP_BYTES
1216 && !is_textual_format(detected_format)
1217 && !matches!(
1218 detected_format.kind(),
1219 FileFormatKind::Archive
1220 | FileFormatKind::Compressed
1221 | FileFormatKind::Package
1222 | FileFormatKind::Audio
1223 | FileFormatKind::Image
1224 | FileFormatKind::Video
1225 )
1226}
1227
/// Computes the `(start, end)` byte ranges to sample from a buffer of `len` bytes.
///
/// Up to three 64 KiB windows are produced, in this order:
/// 1. the head of the buffer (shorter when `len` is below the window size);
/// 2. a window centred on the midpoint, only when `len` exceeds two windows;
/// 3. the tail of the buffer, only when `len` exceeds one window.
///
/// Empty ranges and exact duplicates are dropped, so `len == 0` yields no ranges.
fn sampled_printable_window_ranges(len: usize) -> Vec<(usize, usize)> {
    const SAMPLE_WINDOW_BYTES: usize = 64 * 1024;

    let head = Some((0, len.min(SAMPLE_WINDOW_BYTES)));
    let middle = if len > SAMPLE_WINDOW_BYTES * 2 {
        let start = len / 2 - SAMPLE_WINDOW_BYTES / 2;
        Some((start, (start + SAMPLE_WINDOW_BYTES).min(len)))
    } else {
        None
    };
    let tail = if len > SAMPLE_WINDOW_BYTES {
        Some((len - SAMPLE_WINDOW_BYTES, len))
    } else {
        None
    };

    let mut windows: Vec<(usize, usize)> = Vec::with_capacity(3);
    for candidate in [head, middle, tail] {
        let Some((start, end)) = candidate else {
            continue;
        };
        if start < end && !windows.contains(&(start, end)) {
            windows.push((start, end));
        }
    }

    windows
}
1250
1251fn sample_has_promising_printable_strings(bytes: &[u8]) -> bool {
1252 let mut structured_signal_seen = false;
1253 let promising_license_windows = sampled_printable_window_ranges(bytes.len())
1254 .into_iter()
1255 .filter(|&(start, end)| {
1256 let window = &bytes[start..end];
1257 if has_strong_structured_text_signal(window) {
1258 structured_signal_seen = true;
1259 }
1260 has_license_or_notice_signal(window)
1261 })
1262 .count();
1263
1264 structured_signal_seen || promising_license_windows >= 2
1265}
1266
1267fn extract_sampled_printable_strings(bytes: &[u8]) -> String {
1268 let mut combined_lines = BTreeSet::new();
1269
1270 for (start, end) in sampled_printable_window_ranges(bytes.len()) {
1271 let window_text = extract_printable_strings(&bytes[start..end]);
1272 for line in window_text
1273 .lines()
1274 .map(str::trim)
1275 .filter(|line| !line.is_empty())
1276 {
1277 combined_lines.insert(line.to_string());
1278 }
1279 }
1280
1281 combined_lines.into_iter().collect::<Vec<_>>().join("\n")
1282}
1283
1284fn has_license_or_notice_signal(bytes: &[u8]) -> bool {
1285 let strings = extract_printable_strings(bytes);
1286 if strings.is_empty() {
1287 return false;
1288 }
1289
1290 let lower = strings.to_ascii_lowercase();
1291 [
1292 "copyright",
1293 "license",
1294 "licensed under",
1295 "all rights reserved",
1296 "permission is hereby granted",
1297 "redistribution and use",
1298 "spdx-license-identifier",
1299 ]
1300 .iter()
1301 .any(|marker| lower.contains(marker))
1302}
1303
1304fn has_strong_structured_text_signal(bytes: &[u8]) -> bool {
1305 let strings = extract_printable_strings(bytes);
1306 if strings.is_empty() {
1307 return false;
1308 }
1309
1310 let email_markers = strings.matches('@').count();
1311 let url_markers = strings.matches("http://").count() + strings.matches("https://").count();
1312
1313 email_markers + url_markers >= 3
1314}
1315
1316fn is_supported_image_container(bytes: &[u8], format: ImageFormat) -> bool {
1317 match format {
1318 ImageFormat::Png => bytes.starts_with(b"\x89PNG\r\n\x1a\n"),
1319 ImageFormat::Jpeg => bytes.starts_with(&[0xff, 0xd8, 0xff]),
1320 ImageFormat::Tiff => bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a"),
1321 ImageFormat::WebP => {
1322 bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP"
1323 }
1324 _ => false,
1325 }
1326}
1327
1328fn extract_image_metadata_text(bytes: &[u8], format: ImageFormat) -> String {
1329 let mut values = Vec::new();
1330 values.extend(extract_exif_metadata_values(bytes));
1331 values.extend(extract_xmp_metadata_values(bytes, format));
1332 values_to_text(values)
1333}
1334
1335fn extract_exif_metadata_values(bytes: &[u8]) -> Vec<String> {
1336 let mut cursor = BufReader::new(Cursor::new(bytes));
1337 let exif = match exif::Reader::new().read_from_container(&mut cursor) {
1338 Ok(exif) => exif,
1339 Err(_) => return Vec::new(),
1340 };
1341
1342 let mut values = Vec::new();
1343 for field in exif.fields() {
1344 let rendered = match field.tag {
1345 exif::Tag::ImageDescription | exif::Tag::Copyright | exif::Tag::UserComment => {
1346 Some(field.display_value().with_unit(&exif).to_string())
1347 }
1348 exif::Tag::Artist => Some(format!(
1349 "Author: {}",
1350 field.display_value().with_unit(&exif)
1351 )),
1352 _ => None,
1353 };
1354
1355 if let Some(rendered) = rendered {
1356 values.push(rendered);
1357 }
1358 }
1359
1360 values
1361}
1362
1363fn extract_xmp_metadata_values(bytes: &[u8], format: ImageFormat) -> Vec<String> {
1364 let xmp = match extract_raw_xmp_packet(bytes, format) {
1365 Some(xmp) => xmp,
1366 None => return Vec::new(),
1367 };
1368
1369 parse_xmp_values(&xmp)
1370}
1371
1372fn extract_raw_xmp_packet(bytes: &[u8], format: ImageFormat) -> Option<Vec<u8>> {
1373 let reader = ImageReader::with_format(BufReader::new(Cursor::new(bytes)), format);
1374 if let Ok(mut decoder) = reader.into_decoder()
1375 && let Ok(Some(xmp)) = decoder.xmp_metadata()
1376 {
1377 return Some(xmp);
1378 }
1379
1380 match format {
1381 ImageFormat::Png => extract_png_xmp_packet(bytes),
1382 _ => None,
1383 }
1384}
1385
1386fn extract_png_xmp_packet(bytes: &[u8]) -> Option<Vec<u8>> {
1387 const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";
1388
1389 if bytes.len() < PNG_SIGNATURE.len() || &bytes[..PNG_SIGNATURE.len()] != PNG_SIGNATURE {
1390 return None;
1391 }
1392
1393 let mut offset = PNG_SIGNATURE.len();
1394 while offset + 12 <= bytes.len() {
1395 let length = u32::from_be_bytes([
1396 bytes[offset],
1397 bytes[offset + 1],
1398 bytes[offset + 2],
1399 bytes[offset + 3],
1400 ]) as usize;
1401 let chunk_start = offset + 8;
1402 let chunk_end = chunk_start + length;
1403 if chunk_end + 4 > bytes.len() {
1404 return None;
1405 }
1406
1407 let chunk_type = &bytes[offset + 4..offset + 8];
1408 if chunk_type == b"iTXt" {
1409 let data = &bytes[chunk_start..chunk_end];
1410 if let Some(xmp) = parse_png_itxt_xmp(data) {
1411 return Some(xmp);
1412 }
1413 }
1414
1415 offset = chunk_end + 4;
1416 }
1417
1418 None
1419}
1420
1421fn parse_png_itxt_xmp(data: &[u8]) -> Option<Vec<u8>> {
1422 const XMP_KEYWORD: &[u8] = b"XML:com.adobe.xmp";
1423
1424 let keyword_end = data.iter().position(|&b| b == 0)?;
1425 if &data[..keyword_end] != XMP_KEYWORD {
1426 return None;
1427 }
1428
1429 let mut cursor = keyword_end + 1;
1430 let compression_flag = *data.get(cursor)?;
1431 cursor += 1;
1432 let compression_method = *data.get(cursor)?;
1433 cursor += 1;
1434 if compression_flag > 1 || (compression_flag == 1 && compression_method != 0) {
1435 return None;
1436 }
1437
1438 let language_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1439 cursor = language_end + 1;
1440
1441 let translated_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1442 cursor = translated_end + 1;
1443
1444 let text_bytes = &data[cursor..];
1445 if compression_flag == 1 {
1446 let mut decoder = ZlibDecoder::new(text_bytes);
1447 let mut decoded = Vec::new();
1448 decoder.read_to_end(&mut decoded).ok()?;
1449 Some(decoded)
1450 } else {
1451 Some(text_bytes.to_vec())
1452 }
1453}
1454
1455fn parse_xmp_values(xmp: &[u8]) -> Vec<String> {
1456 let mut reader = XmlReader::from_reader(xmp);
1457 reader.config_mut().trim_text(true);
1458
1459 let mut buf = Vec::new();
1460 let mut stack: Vec<String> = Vec::new();
1461 let mut values = Vec::new();
1462
1463 loop {
1464 match reader.read_event_into(&mut buf) {
1465 Ok(Event::Start(e)) => {
1466 stack.push(local_xml_name(e.name().as_ref()));
1467 }
1468 Ok(Event::End(_)) => {
1469 stack.pop();
1470 }
1471 Ok(Event::Empty(_)) => {}
1472 Ok(Event::Text(text)) => {
1473 if let Some(field) = stack
1474 .iter()
1475 .rev()
1476 .find_map(|name| allowed_xmp_field(name.as_str()))
1477 && let Ok(decoded) = text.decode()
1478 {
1479 let decoded = decoded.into_owned();
1480 if !decoded.trim().is_empty() {
1481 values.push(format_xmp_value(field, &decoded));
1482 }
1483 }
1484 }
1485 Ok(Event::CData(text)) => {
1486 if let Some(field) = stack
1487 .iter()
1488 .rev()
1489 .find_map(|name| allowed_xmp_field(name.as_str()))
1490 && let Ok(decoded) = text.decode()
1491 {
1492 let decoded = decoded.into_owned();
1493 if !decoded.trim().is_empty() {
1494 values.push(format_xmp_value(field, &decoded));
1495 }
1496 }
1497 }
1498 Ok(Event::Eof) | Err(_) => break,
1499 _ => {}
1500 }
1501 buf.clear();
1502 }
1503
1504 values
1505}
1506
/// Returns the part of an XML element name after its last `:`.
///
/// Names without a colon are returned whole; names that are not valid UTF-8
/// become the empty string.
fn local_xml_name(name: &[u8]) -> String {
    let qualified = std::str::from_utf8(name).unwrap_or_default();
    let local = match qualified.rsplit_once(':') {
        Some((_, local)) => local,
        None => qualified,
    };
    local.to_owned()
}
1511
/// Maps an XMP element local name to the internal field key it is collected
/// under, or `None` when the element is not collected.
///
/// Matching is exact and case-sensitive.
fn allowed_xmp_field(name: &str) -> Option<&'static str> {
    const FIELDS: [(&str, &str); 7] = [
        ("creator", "creator"),
        ("rights", "rights"),
        ("description", "description"),
        ("title", "title"),
        ("subject", "subject"),
        ("UsageTerms", "usage_terms"),
        ("WebStatement", "web_statement"),
    ];

    FIELDS
        .iter()
        .find(|entry| entry.0 == name)
        .map(|entry| entry.1)
}
1524
/// Renders an XMP value for output: `creator` values get an `Author: ` prefix,
/// every other field is returned unchanged.
fn format_xmp_value(field: &str, value: &str) -> String {
    if field == "creator" {
        format!("Author: {value}")
    } else {
        value.to_owned()
    }
}
1531
1532fn values_to_text(values: Vec<String>) -> String {
1533 let mut seen = BTreeSet::new();
1534 let mut lines = Vec::new();
1535 let mut total_bytes = 0usize;
1536
1537 for value in values {
1538 if lines.len() >= MAX_IMAGE_METADATA_VALUES {
1539 break;
1540 }
1541
1542 let normalized = normalize_metadata_value(&value);
1543 if normalized.is_empty() || !seen.insert(normalized.clone()) {
1544 continue;
1545 }
1546
1547 let added_bytes = normalized.len() + usize::from(!lines.is_empty());
1548 if total_bytes + added_bytes > MAX_IMAGE_METADATA_TEXT_BYTES {
1549 break;
1550 }
1551
1552 total_bytes += added_bytes;
1553 lines.push(normalized);
1554 }
1555
1556 lines.join("\n")
1557}
1558
/// Normalizes a metadata value: NUL characters are removed, then every run of
/// whitespace collapses to a single space with no leading or trailing space.
fn normalize_metadata_value(value: &str) -> String {
    let without_nuls = value.replace('\0', "");

    let mut normalized = String::with_capacity(without_nuls.len());
    for word in without_nuls.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }

    normalized
}
1570
1571fn extract_pdf_text(path: &Path, bytes: &[u8]) -> (String, Option<String>) {
1572 if bytes.len() < 5 || &bytes[..5] != b"%PDF-" {
1573 return (String::new(), None);
1574 }
1575
1576 let mut failures = Vec::new();
1577 let mut saw_success = false;
1578
1579 let extracted = catch_unwind(AssertUnwindSafe(
1580 || -> Result<String, Box<dyn std::error::Error>> {
1581 let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1582 extract_first_pdf_page_text(&mut document)
1583 },
1584 ));
1585 match extracted {
1586 Ok(Ok(text)) => {
1587 saw_success = true;
1588 if let Some(normalized) = normalize_pdf_text(text) {
1589 return (normalized, None);
1590 }
1591 }
1592 Ok(Err(err)) => failures.push(format!("from-bytes first-page: {err}")),
1593 Err(payload) => failures.push(format!(
1594 "from-bytes first-page panic: {}",
1595 panic_payload_to_string(payload.as_ref())
1596 )),
1597 }
1598
1599 let extracted = catch_unwind(AssertUnwindSafe(
1600 || -> Result<String, Box<dyn std::error::Error>> {
1601 let mut document = pdf_oxide::document::PdfDocument::open(path)?;
1602 extract_pdf_text_from_document(&mut document)
1603 },
1604 ));
1605 match extracted {
1606 Ok(Ok(text)) => {
1607 saw_success = true;
1608 if let Some(normalized) = normalize_pdf_text(text) {
1609 return (normalized, None);
1610 }
1611 }
1612 Ok(Err(err)) => failures.push(format!("open full-document: {err}")),
1613 Err(payload) => failures.push(format!(
1614 "open full-document panic: {}",
1615 panic_payload_to_string(payload.as_ref())
1616 )),
1617 }
1618
1619 let extracted = catch_unwind(AssertUnwindSafe(
1620 || -> Result<String, Box<dyn std::error::Error>> {
1621 let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1622 extract_pdf_text_from_document(&mut document)
1623 },
1624 ));
1625 match extracted {
1626 Ok(Ok(text)) => {
1627 saw_success = true;
1628 if let Some(normalized) = normalize_pdf_text(text) {
1629 return (normalized, None);
1630 }
1631 }
1632 Ok(Err(err)) => failures.push(format!("from-bytes full-document: {err}")),
1633 Err(payload) => failures.push(format!(
1634 "from-bytes full-document panic: {}",
1635 panic_payload_to_string(payload.as_ref())
1636 )),
1637 }
1638
1639 if saw_success || is_non_actionable_pdf_failure(&failures) {
1640 (String::new(), None)
1641 } else {
1642 (
1643 String::new(),
1644 Some(format!(
1645 "PDF text extraction failed after {} attempts: {}",
1646 failures.len(),
1647 failures.join("; ")
1648 )),
1649 )
1650 }
1651}
1652
/// Returns `true` when there is at least one failure and every failure message
/// contains one of the known non-actionable phrases (password/encryption
/// related messages or an invalid cross-reference table).
fn is_non_actionable_pdf_failure(failures: &[String]) -> bool {
    const NON_ACTIONABLE_MARKERS: [&str; 5] = [
        "requires a password",
        "Encrypt dictionary missing /O",
        "Encrypt dictionary missing /U",
        "security handler cannot be found",
        "Invalid cross-reference table",
    ];

    if failures.is_empty() {
        return false;
    }

    failures.iter().all(|failure| {
        NON_ACTIONABLE_MARKERS
            .iter()
            .any(|marker| failure.contains(*marker))
    })
}
1663
/// Renders a panic payload as text.
///
/// Payloads of type `&str` and `String` are returned as their message; any
/// other payload type becomes `"unknown panic payload"`.
fn panic_payload_to_string(payload: &(dyn std::any::Any + Send)) -> String {
    payload
        .downcast_ref::<&str>()
        .map(|message| (*message).to_string())
        .or_else(|| payload.downcast_ref::<String>().cloned())
        .unwrap_or_else(|| "unknown panic payload".to_string())
}
1673
1674fn extract_first_pdf_page_text(
1675 document: &mut pdf_oxide::document::PdfDocument,
1676) -> Result<String, Box<dyn std::error::Error>> {
1677 if document.page_count()? == 0 {
1678 return Ok(String::new());
1679 }
1680
1681 let extracted_text = document.extract_text(0)?;
1682 let markdown_text =
1683 document.to_markdown(0, &pdf_oxide::converters::ConversionOptions::default())?;
1684 if pdf_markdown_heading_lines(&markdown_text).is_empty() {
1685 return Ok(extracted_text);
1686 }
1687
1688 let pipeline_text =
1689 document.to_plain_text(0, &pdf_oxide::converters::ConversionOptions::default())?;
1690
1691 Ok(merge_pdf_first_page_text(
1692 &extracted_text,
1693 &markdown_text,
1694 &pipeline_text,
1695 ))
1696}
1697
1698fn extract_pdf_text_from_document(
1699 document: &mut pdf_oxide::document::PdfDocument,
1700) -> Result<String, Box<dyn std::error::Error>> {
1701 Ok(document.to_plain_text_all(&pdf_oxide::converters::ConversionOptions::default())?)
1702}
1703
/// Replaces carriage returns and form feeds with `\n`, returning `None` when
/// the result is blank after trimming.
fn normalize_pdf_text(text: String) -> Option<String> {
    let normalized: String = text
        .chars()
        .map(|ch| if matches!(ch, '\r' | '\u{0c}') { '\n' } else { ch })
        .collect();

    if normalized.trim().is_empty() {
        None
    } else {
        Some(normalized)
    }
}
1708
1709fn merge_pdf_first_page_text(
1710 _extracted_text: &str,
1711 markdown_text: &str,
1712 pipeline_text: &str,
1713) -> String {
1714 let pipeline = pipeline_text.trim();
1715 if pipeline.is_empty() {
1716 return String::new();
1717 }
1718
1719 let prefix = pdf_first_page_heading_prefix(markdown_text);
1720 let Some(prefix) = prefix else {
1721 return pipeline_text.to_string();
1722 };
1723
1724 if pdf_text_contains_heading_prefix(pipeline, &prefix) {
1725 pipeline_text.to_string()
1726 } else {
1727 format!("{prefix}\n\n{pipeline}")
1728 }
1729}
1730
1731fn pdf_text_contains_heading_prefix(text: &str, prefix: &str) -> bool {
1732 normalize_pdf_heading_comparison_text(text)
1733 .contains(&normalize_pdf_heading_comparison_text(prefix))
1734}
1735
/// Normalizes text for heading comparison: whitespace runs collapse to a single
/// space and ASCII letters are lower-cased. Non-ASCII characters are left as-is.
fn normalize_pdf_heading_comparison_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());

    for word in text.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.extend(word.chars().map(|ch| ch.to_ascii_lowercase()));
    }

    normalized
}
1742
1743fn pdf_first_page_heading_prefix(markdown_text: &str) -> Option<String> {
1744 let mut lines = Vec::new();
1745
1746 for line in pdf_markdown_heading_lines(markdown_text) {
1747 push_unique_line(&mut lines, line);
1748 }
1749
1750 (!lines.is_empty()).then(|| lines.join("\n"))
1751}
1752
1753fn pdf_markdown_heading_lines(text: &str) -> Vec<String> {
1754 text.lines()
1755 .map(str::trim)
1756 .filter_map(|line| line.strip_prefix('#').map(str::trim_start))
1757 .map(|line| line.trim_matches('#').trim())
1758 .filter(|line| !line.is_empty())
1759 .filter(|line| !looks_like_numbered_section_heading(line))
1760 .take(4)
1761 .map(ToOwned::to_owned)
1762 .collect()
1763}
1764
/// Appends `line` to `lines` unless an equal line is already present.
fn push_unique_line(lines: &mut Vec<String>, line: String) {
    if lines.contains(&line) {
        return;
    }
    lines.push(line);
}
1770
/// Returns `true` when `line` starts with a section number: one or more ASCII
/// digits immediately followed by `.` (for example `1. Intro`, `10. Terms`,
/// `12.3 Scope`).
///
/// Any number of leading digits is accepted, so multi-digit section numbers are
/// recognised as well as single-digit ones. Lines that start with a non-digit,
/// or whose digits are not followed by a dot, return `false`.
fn looks_like_numbered_section_heading(line: &str) -> bool {
    let after_digits = line.trim_start_matches(|ch: char| ch.is_ascii_digit());

    // At least one digit must have been consumed before the dot.
    after_digits.len() < line.len() && after_digits.starts_with('.')
}
1783
/// Returns `true` when `bytes` starts with one of three zip signatures:
/// `PK\x03\x04`, `PK\x05\x06` or `PK\x07\x08`.
fn is_zip_archive(bytes: &[u8]) -> bool {
    matches!(
        bytes,
        [b'P', b'K', 0x03, 0x04, ..] | [b'P', b'K', 0x05, 0x06, ..] | [b'P', b'K', 0x07, 0x08, ..]
    )
}
1789
1790pub fn extract_printable_strings(bytes: &[u8]) -> String {
1791 const MIN_LEN: usize = 4;
1792 const MIN_OUTPUT_BYTES: usize = 2_000_000;
1793 const MAX_OUTPUT_BYTES_CAP: usize = 16_000_000;
1794
1795 let max_output_bytes = bytes.len().clamp(MIN_OUTPUT_BYTES, MAX_OUTPUT_BYTES_CAP);
1796
1797 fn is_printable_ascii(b: u8) -> bool {
1798 matches!(b, 0x20..=0x7E)
1799 }
1800
1801 let mut out = String::new();
1802 let mut run: Vec<u8> = Vec::new();
1803
1804 let flush_run = |out: &mut String, run: &mut Vec<u8>| {
1805 if run.len() >= MIN_LEN {
1806 if !out.is_empty() {
1807 out.push('\n');
1808 }
1809 out.push_str(&String::from_utf8_lossy(run));
1810 }
1811 run.clear();
1812 };
1813
1814 for &b in bytes {
1815 if is_printable_ascii(b) {
1816 run.push(b);
1817 } else {
1818 flush_run(&mut out, &mut run);
1819 if out.len() >= max_output_bytes {
1820 return out;
1821 }
1822 }
1823 }
1824 flush_run(&mut out, &mut run);
1825 if out.len() >= max_output_bytes {
1826 return out;
1827 }
1828
1829 for start in 0..=1 {
1830 run.clear();
1831 let mut i = start;
1832 while i + 1 < bytes.len() {
1833 let b0 = bytes[i];
1834 let b1 = bytes[i + 1];
1835 let (ch, zero) = if start == 0 { (b0, b1) } else { (b1, b0) };
1836 if is_printable_ascii(ch) && zero == 0 {
1837 run.push(ch);
1838 } else {
1839 flush_run(&mut out, &mut run);
1840 if out.len() >= max_output_bytes {
1841 return out;
1842 }
1843 }
1844 i += 2;
1845 }
1846 flush_run(&mut out, &mut run);
1847 if out.len() >= max_output_bytes {
1848 return out;
1849 }
1850 }
1851
1852 out
1853}
1854
1855#[cfg(test)]
1856mod tests {
1857 use std::path::Path;
1858
1859 use super::{
1860 ExtractedTextKind, LARGE_OPAQUE_BINARY_SKIP_BYTES, classify_file_info,
1861 extract_printable_strings, extract_text_for_detection,
1862 extract_text_for_detection_with_diagnostics, is_non_actionable_pdf_failure,
1863 normalize_mime_type, normalize_pdf_heading_comparison_text,
1864 windows_metadata_or_empty_result,
1865 };
1866
1867 #[test]
1868 fn test_extract_text_for_detection_skips_jar_archives() {
1869 let path = Path::new(
1870 "testdata/license-golden/datadriven/lic1/do-not_detect-licenses-in-archive.jar",
1871 );
1872 let bytes = std::fs::read(path).expect("failed to read jar fixture");
1873
1874 let (text, kind) = extract_text_for_detection(path, &bytes);
1875
1876 assert!(text.is_empty());
1877 assert_eq!(kind, ExtractedTextKind::None);
1878 }
1879
1880 #[test]
1881 fn test_extract_text_for_detection_reads_pdf_fixture_text() {
1882 let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
1883 let bytes = std::fs::read(path).expect("failed to read pdf fixture");
1884
1885 let (text, kind) = extract_text_for_detection(path, &bytes);
1886
1887 assert_eq!(kind, ExtractedTextKind::Pdf);
1888 assert!(text.contains("Redistribution and use in source and binary forms"));
1889 }
1890
1891 #[test]
1892 fn test_extract_text_for_detection_prefers_first_pdf_page_before_full_document() {
1893 let path =
1894 Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
1895 let bytes = std::fs::read(path).expect("failed to read pdf fixture");
1896
1897 let (text, kind) = extract_text_for_detection(path, &bytes);
1898
1899 assert_eq!(kind, ExtractedTextKind::Pdf);
1900 assert!(text.contains("SUN INDUSTRY STANDARDS SOURCE LICENSE"));
1901 assert!(!text.contains("DISCLAIMER OF WARRANTY"));
1902 }
1903
1904 #[test]
1905 fn test_extract_text_for_detection_does_not_duplicate_pdf_heading_prefix() {
1906 let path =
1907 Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
1908 let bytes = std::fs::read(path).expect("failed to read pdf fixture");
1909
1910 let (text, kind) = extract_text_for_detection(path, &bytes);
1911
1912 assert_eq!(kind, ExtractedTextKind::Pdf);
1913
1914 let normalized = normalize_pdf_heading_comparison_text(&text);
1915 let heading =
1916 normalize_pdf_heading_comparison_text("SUN INDUSTRY STANDARDS SOURCE LICENSE");
1917 assert_eq!(normalized.matches(&heading).count(), 1);
1918 }
1919
1920 #[test]
1921 fn test_extract_text_for_detection_reads_pdf_fixture_without_pdf_extension() {
1922 let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
1923 let bytes = std::fs::read(path).expect("failed to read pdf fixture");
1924
1925 let (text, kind) = extract_text_for_detection(Path::new("renamed.bin"), &bytes);
1926
1927 assert_eq!(kind, ExtractedTextKind::Pdf);
1928 assert!(text.contains("Redistribution and use in source and binary forms"));
1929 }
1930
1931 #[test]
1932 fn test_extract_text_for_detection_reports_terminal_pdf_failure() {
1933 let malformed = b"%PDF-1.7\nthis is not a valid pdf object graph\n";
1934
1935 let (text, kind, scan_error) =
1936 extract_text_for_detection_with_diagnostics(Path::new("broken.pdf"), malformed);
1937
1938 assert!(text.is_empty());
1939 assert_eq!(kind, ExtractedTextKind::None);
1940 let scan_error = scan_error.expect("terminal pdf failure should be surfaced");
1941 assert!(scan_error.contains("PDF text extraction failed after"));
1942 }
1943
1944 #[test]
1945 fn test_extract_text_for_detection_skips_large_opaque_binary_blobs() {
1946 let bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
1947
1948 let (text, kind) = extract_text_for_detection(Path::new("model.bin"), &bytes);
1949
1950 assert!(text.is_empty());
1951 assert_eq!(kind, ExtractedTextKind::None);
1952 }
1953
1954 #[test]
1955 fn test_extract_text_for_detection_keeps_large_binaries_with_promising_strings() {
1956 let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
1957 let text = b"Copyright 2026 Example Project!!!";
1958 bytes[..text.len()].copy_from_slice(text);
1959 let second_offset = LARGE_OPAQUE_BINARY_SKIP_BYTES / 2;
1960 bytes[second_offset..second_offset + text.len()].copy_from_slice(text);
1961
1962 let (text, kind) = extract_text_for_detection(Path::new("weights.bin"), &bytes);
1963
1964 assert_ne!(kind, ExtractedTextKind::None);
1965 assert!(text.contains("Copyright 2026 Example Project"));
1966 }
1967
1968 #[test]
1969 fn test_extract_text_for_detection_skips_large_binary_with_unstructured_runs() {
1970 let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
1971 let noise = b"(c) $1234567890ABCDEF[]{}--==++";
1972 bytes[..noise.len()].copy_from_slice(noise);
1973 let second_offset = LARGE_OPAQUE_BINARY_SKIP_BYTES / 2;
1974 bytes[second_offset..second_offset + noise.len()].copy_from_slice(noise);
1975
1976 let (text, kind) = extract_text_for_detection(Path::new("tensor.bin"), &bytes);
1977
1978 assert!(text.is_empty());
1979 assert_eq!(kind, ExtractedTextKind::None);
1980 }
1981
1982 #[test]
1983 fn test_extract_text_for_detection_uses_windows_executable_metadata() {
1984 let path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
1985 let bytes = std::fs::read(path).expect("read PE fixture");
1986
1987 let (text, kind) = extract_text_for_detection(path, &bytes);
1988
1989 assert_eq!(kind, ExtractedTextKind::BinaryStrings);
1990 assert!(text.contains("License: This program is free software"));
1991 assert!(text.contains("LegalCopyright:"));
1992 }
1993
1994 #[test]
1995 fn test_extract_text_for_detection_keeps_windows_metadata_for_large_pe_without_sampled_signal()
1996 {
1997 let path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
1998 let mut bytes = std::fs::read(path).expect("read PE fixture");
1999 bytes.resize(LARGE_OPAQUE_BINARY_SKIP_BYTES + 8, 0);
2000
2001 let (text, kind) = extract_text_for_detection(path, &bytes);
2002
2003 assert_ne!(kind, ExtractedTextKind::None);
2004 assert!(!text.trim().is_empty());
2005 }
2006
2007 #[test]
2008 fn test_windows_metadata_or_empty_result_preserves_metadata() {
2009 let (text, kind, scan_error) =
2010 windows_metadata_or_empty_result(Some("LegalCopyright: Example Corp".to_string()));
2011
2012 assert_eq!(kind, ExtractedTextKind::WindowsExecutableMetadata);
2013 assert_eq!(text, "LegalCopyright: Example Corp");
2014 assert!(scan_error.is_none());
2015 }
2016
2017 #[test]
2018 fn test_extract_text_for_detection_skips_large_binary_with_single_isolated_string_run() {
2019 let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2020 let text = b"Copyright 2026 Example Project!!!";
2021 bytes[..text.len()].copy_from_slice(text);
2022
2023 let (text, kind) = extract_text_for_detection(Path::new("opaque.bin"), &bytes);
2024
2025 assert!(text.is_empty());
2026 assert_eq!(kind, ExtractedTextKind::None);
2027 }
2028
2029 #[test]
2030 fn test_extract_text_for_detection_keeps_large_binary_with_single_contact_rich_window() {
2031 let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2032 let text = b"Andreas Schneider <asn@redhat.com> Rob Crittenden (rcritten@redhat.com) Mr. Sam <sam@email-scan.com> https://publicsuffix.org/ http://tukaani.org/xz/";
2033 bytes[..text.len()].copy_from_slice(text);
2034
2035 let (text, kind) = extract_text_for_detection(Path::new("rootfs.bin"), &bytes);
2036
2037 assert_ne!(kind, ExtractedTextKind::None);
2038 assert!(text.contains("asn@redhat.com"));
2039 assert!(text.contains("https://publicsuffix.org/"));
2040 }
2041
2042 #[test]
2043 fn test_non_actionable_pdf_failures_are_suppressed() {
2044 assert!(is_non_actionable_pdf_failure(&[
2045 "from-bytes first-page: PDF is encrypted and requires a password".to_string(),
2046 "open full-document: PDF is encrypted and requires a password".to_string(),
2047 ]));
2048 assert!(is_non_actionable_pdf_failure(&[
2049 "from-bytes first-page: Invalid cross-reference table".to_string(),
2050 "open full-document: Invalid cross-reference table".to_string(),
2051 ]));
2052 assert!(is_non_actionable_pdf_failure(&[
2053 "from-bytes first-page: Invalid PDF: Encrypt dictionary missing /O".to_string(),
2054 "open full-document: Invalid PDF: security handler cannot be found".to_string(),
2055 ]));
2056 assert!(!is_non_actionable_pdf_failure(&[
2057 "from-bytes first-page: some other parser failure".to_string(),
2058 ]));
2059 }
2060
2061 #[test]
2062 fn test_extract_text_for_detection_skips_zip_like_archives() {
2063 let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00\x08\x00artifact";
2064
2065 let (whl_text, whl_kind) = extract_text_for_detection(Path::new("demo.whl"), zip_bytes);
2066 let (crate_text, crate_kind) =
2067 extract_text_for_detection(Path::new("demo.crate"), zip_bytes);
2068
2069 assert!(whl_text.is_empty());
2070 assert_eq!(whl_kind, ExtractedTextKind::None);
2071 assert!(crate_text.is_empty());
2072 assert_eq!(crate_kind, ExtractedTextKind::None);
2073 }
2074
2075 #[test]
2076 fn test_extract_text_for_detection_keeps_binary_strings_for_lib_fixtures() {
2077 let path =
2078 Path::new("testdata/copyright-golden/copyrights/copyright_php_lib-php_embed_lib.lib");
2079 let bytes = std::fs::read(path).expect("failed to read lib fixture");
2080
2081 let (text, kind) = extract_text_for_detection(path, &bytes);
2082
2083 assert_ne!(kind, ExtractedTextKind::None);
2084 assert!(text.contains("Copyright nexB and others (c) 2012"));
2085 }
2086
2087 #[test]
2088 fn test_extract_text_for_detection_reads_font_metadata() {
2089 let path = Path::new("testdata/font-fixtures/Lato-Bold.ttf");
2090 let bytes = std::fs::read(path).expect("failed to read font fixture");
2091
2092 let (text, kind) = extract_text_for_detection(path, &bytes);
2093
2094 assert_eq!(kind, ExtractedTextKind::FontMetadata);
2095 assert!(text.contains("License Description:"), "{text}");
2096 assert!(
2097 text.contains("Open Font License") || text.contains("OFL"),
2098 "{text}"
2099 );
2100 }
2101
2102 #[test]
2103 fn test_extract_printable_strings_scales_cap_for_medium_binary_files() {
2104 let bytes = b"abcd\0".repeat(525_000);
2105
2106 let text = extract_printable_strings(&bytes);
2107
2108 assert!(
2109 text.len() > 2_000_000,
2110 "unexpected truncation at {}",
2111 text.len()
2112 );
2113 assert!(text.ends_with("abcd"));
2114 }
2115
2116 #[test]
2117 fn test_extract_text_for_detection_decodes_svg_fixture_text() {
2118 let path = Path::new(
2119 "testdata/license-golden/datadriven/external/fossology-tests/Public-domain/biohazard.svg",
2120 );
2121 let bytes = std::fs::read(path).expect("failed to read svg fixture");
2122
2123 let (text, kind) = extract_text_for_detection(path, &bytes);
2124
2125 assert_eq!(kind, ExtractedTextKind::Decoded);
2126 assert!(text.contains("creativecommons.org/licenses/publicdomain"));
2127 }
2128
2129 #[test]
2130 fn test_extract_text_for_detection_decodes_rtf_fixture_text() {
2131 let path = Path::new(
2132 "testdata/license-golden/datadriven/external/fossology-tests/LGPL/License.rtf",
2133 );
2134 let bytes = std::fs::read(path).expect("failed to read rtf fixture");
2135
2136 let (text, kind) = extract_text_for_detection(path, &bytes);
2137
2138 assert_eq!(kind, ExtractedTextKind::Decoded);
2139 assert!(text.contains("GNU Lesser General Public"));
2140 assert!(text.contains("version"));
2141 assert!(text.contains("2.1 of the License"));
2142 }
2143
2144 #[test]
2145 fn test_normalize_mime_type_prefers_text_for_textual_video_guess() {
2146 assert_eq!(
2147 normalize_mime_type(
2148 Path::new("main.ts"),
2149 b"export const answer = 42;\n",
2150 Some("TypeScript"),
2151 "video/mp2t",
2152 ),
2153 "text/plain"
2154 );
2155 }
2156
2157 #[test]
2158 fn test_normalize_mime_type_prefers_text_for_octet_stream_source_guess() {
2159 assert_eq!(
2160 normalize_mime_type(
2161 Path::new("main.js"),
2162 b"console.log('hello');\n",
2163 Some("JavaScript"),
2164 "application/octet-stream",
2165 ),
2166 "text/plain"
2167 );
2168 }
2169
2170 #[test]
2171 fn test_normalize_mime_type_preserves_binary_video_guess() {
2172 assert_eq!(
2173 normalize_mime_type(
2174 Path::new("main.ts"),
2175 &[0, 159, 146, 150, 0, 1, 2, 3],
2176 Some("TypeScript"),
2177 "video/mp2t",
2178 ),
2179 "video/mp2t"
2180 );
2181 }
2182
2183 #[test]
2184 fn test_normalize_mime_type_preserves_short_binary_octet_stream_guess() {
2185 assert_eq!(
2186 normalize_mime_type(
2187 Path::new("main.ts"),
2188 &[0, 159, 146, 150],
2189 Some("TypeScript"),
2190 "application/octet-stream",
2191 ),
2192 "application/octet-stream"
2193 );
2194 }
2195
2196 #[test]
2197 fn test_classify_file_info_marks_empty_files_as_text_not_source() {
2198 let classification = classify_file_info(Path::new("test.txt"), b"");
2199
2200 assert_eq!(classification.mime_type, "inode/x-empty");
2201 assert_eq!(classification.file_type, "empty");
2202 assert!(!classification.is_binary);
2203 assert!(classification.is_text);
2204 assert!(!classification.is_source);
2205 assert_eq!(classification.programming_language, None);
2206 }
2207
2208 #[test]
2209 fn test_classify_file_info_keeps_json_out_of_programming_language() {
2210 let classification = classify_file_info(Path::new("package.json"), br#"{"name":"demo"}"#);
2211
2212 assert_eq!(classification.mime_type, "application/json");
2213 assert_eq!(classification.file_type, "JSON text data");
2214 assert!(classification.is_text);
2215 assert!(!classification.is_source);
2216 assert_eq!(classification.programming_language, None);
2217 }
2218
2219 #[test]
2220 fn test_classify_file_info_treats_dockerfile_as_source() {
2221 let classification = classify_file_info(Path::new("Dockerfile"), b"FROM scratch\n");
2222
2223 assert_eq!(
2224 classification.programming_language.as_deref(),
2225 Some("Dockerfile")
2226 );
2227 assert!(classification.is_source);
2228 assert!(!classification.is_script);
2229 assert_eq!(classification.file_type, "UTF-8 Unicode text");
2230 }
2231
2232 #[test]
2233 fn test_classify_file_info_treats_makefile_as_text_not_source() {
2234 let classification = classify_file_info(Path::new("Makefile"), b"all:\n\techo hi\n");
2235
2236 assert_eq!(classification.programming_language, None);
2237 assert!(classification.is_text);
2238 assert!(!classification.is_source);
2239 assert!(!classification.is_script);
2240 assert_eq!(classification.file_type, "UTF-8 Unicode text");
2241 }
2242
2243 #[test]
2244 fn test_classify_file_info_marks_supported_package_archives() {
2245 let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00";
2246
2247 let egg = classify_file_info(Path::new("demo.egg"), zip_bytes);
2248 let nupkg = classify_file_info(Path::new("demo.nupkg"), zip_bytes);
2249
2250 assert!(egg.is_archive);
2251 assert_eq!(egg.mime_type, "application/zip");
2252 assert_eq!(egg.file_type, "Zip archive data");
2253 assert!(nupkg.is_archive);
2254 assert_eq!(nupkg.mime_type, "application/zip");
2255 assert_eq!(nupkg.file_type, "Zip archive data");
2256 }
2257
2258 #[test]
2259 fn test_classify_file_info_marks_png_as_binary_media() {
2260 let png_bytes = b"\x89PNG\r\n\x1a\n\x00\x00\x00\x0dIHDR";
2261
2262 let classification = classify_file_info(Path::new("logo.png"), png_bytes);
2263
2264 assert_eq!(classification.mime_type, "image/png");
2265 assert_eq!(classification.file_type, "PNG image data");
2266 assert!(classification.is_binary);
2267 assert!(!classification.is_text);
2268 assert!(classification.is_media);
2269 assert!(!classification.is_archive);
2270 assert!(!classification.is_source);
2271 }
2272
2273 #[test]
2274 fn test_classify_file_info_marks_pdf_as_binary_document() {
2275 let pdf_bytes = b"%PDF-1.7\n1 0 obj\n<< /Type /Catalog >>\n";
2276
2277 let classification = classify_file_info(Path::new("report.pdf"), pdf_bytes);
2278
2279 assert_eq!(classification.mime_type, "application/pdf");
2280 assert_eq!(classification.file_type, "PDF document");
2281 assert!(classification.is_binary);
2282 assert!(!classification.is_text);
2283 assert!(!classification.is_archive);
2284 assert!(!classification.is_media);
2285 }
2286
2287 #[test]
2288 fn test_classify_file_info_marks_binary_blobs_as_binary() {
2289 let classification =
2290 classify_file_info(Path::new("blob.bin"), &[0, 159, 146, 150, 0, 1, 2, 3, 4, 5]);
2291
2292 assert!(classification.is_binary);
2293 assert!(!classification.is_text);
2294 assert!(!classification.is_source);
2295 assert_eq!(classification.programming_language, None);
2296 }
2297
2298 #[test]
2299 fn test_classify_file_info_treats_yaml_as_text_not_source() {
2300 let classification = classify_file_info(Path::new("config.yaml"), b"key: value\n");
2301
2302 assert_eq!(classification.programming_language, None);
2303 assert!(classification.is_text);
2304 assert!(!classification.is_source);
2305 assert_eq!(classification.file_type, "YAML text data");
2306 }
2307
2308 #[test]
2309 fn test_classify_file_info_classifies_common_build_manifests() {
2310 let gradle = classify_file_info(Path::new("build.gradle"), b"plugins { id 'java' }\n");
2311 let flake = classify_file_info(Path::new("flake.nix"), b"{ inputs, ... }: {}\n");
2312 let gitmodules = classify_file_info(
2313 Path::new(".gitmodules"),
2314 b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
2315 );
2316
2317 assert_eq!(gradle.programming_language.as_deref(), Some("Groovy"));
2318 assert!(gradle.is_source);
2319 assert_eq!(gradle.mime_type, "text/plain");
2320
2321 assert_eq!(flake.programming_language.as_deref(), Some("Nix"));
2322 assert!(flake.is_source);
2323 assert_eq!(flake.mime_type, "text/plain");
2324
2325 assert_eq!(gitmodules.programming_language, None);
2326 assert!(gitmodules.is_text);
2327 assert!(!gitmodules.is_source);
2328 assert_eq!(gitmodules.file_type, "Git configuration text");
2329 }
2330
2331 #[test]
2332 fn test_classify_file_info_labels_javascript_shebang_scripts() {
2333 let classification = classify_file_info(
2334 Path::new("bin/run"),
2335 b"#!/usr/bin/env node\nconsole.log('hello');\n",
2336 );
2337
2338 assert_eq!(
2339 classification.programming_language.as_deref(),
2340 Some("JavaScript")
2341 );
2342 assert!(classification.is_script);
2343 assert_eq!(
2344 classification.file_type,
2345 "javascript script, UTF-8 Unicode text executable"
2346 );
2347 }
2348
2349 #[test]
2350 fn test_classify_file_info_uses_non_utf8_text_labels_for_latin1_scripts() {
2351 let classification = classify_file_info(
2352 Path::new("script.py"),
2353 b"# coding: latin-1\nprint(\"caf\xe9\")\n",
2354 );
2355
2356 assert_eq!(
2357 classification.programming_language.as_deref(),
2358 Some("Python")
2359 );
2360 assert!(classification.is_script);
2361 assert_eq!(classification.file_type, "python script, text executable");
2362 }
2363
2364 #[test]
2365 fn test_classify_file_info_treats_textual_tga_as_media() {
2366 let classification = classify_file_info(Path::new("texture.tga"), b"not really a tga\n");
2367
2368 assert!(classification.is_media);
2369 assert!(classification.is_text);
2370 assert!(!classification.is_binary);
2371 }
2372
2373 #[test]
2374 fn test_classify_file_info_keeps_binaryish_source_extension_out_of_text_path() {
2375 let classification =
2376 classify_file_info(Path::new("main.ts"), &[0x80, 0x81, 0x82, 0x83, 0x84, 0x85]);
2377
2378 assert!(classification.is_binary);
2379 assert!(!classification.is_text);
2380 assert!(!classification.is_source);
2381 assert_eq!(classification.programming_language, None);
2382 }
2383
2384 #[test]
2385 fn test_extract_text_for_detection_skips_unsupported_image_formats() {
2386 let gif_bytes = b"GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff,\x00\x00\x00\x00\x01\x00\x01\x00\x00\x02\x02D\x01\x00;";
2387
2388 let (text, kind) = extract_text_for_detection(Path::new("tiny.gif"), gif_bytes);
2389
2390 assert!(text.is_empty());
2391 assert_eq!(kind, ExtractedTextKind::None);
2392 }
2393
2394 #[test]
2395 fn test_classify_file_info_preserves_language_detection_precedence_matrix() {
2396 let cases = [
2397 (
2398 Path::new("bin/run"),
2399 b"#!/usr/bin/env node\nconsole.log('hello');\n".as_slice(),
2400 Some("JavaScript"),
2401 true,
2402 true,
2403 ),
2404 (
2405 Path::new("Dockerfile"),
2406 b"FROM scratch\n".as_slice(),
2407 Some("Dockerfile"),
2408 true,
2409 false,
2410 ),
2411 (
2412 Path::new("package.json"),
2413 br#"{"name":"demo"}"#.as_slice(),
2414 None,
2415 false,
2416 false,
2417 ),
2418 (
2419 Path::new("config.yaml"),
2420 b"key: value\n".as_slice(),
2421 None,
2422 false,
2423 false,
2424 ),
2425 (
2426 Path::new("Makefile"),
2427 b"all:\n\techo hi\n".as_slice(),
2428 None,
2429 false,
2430 false,
2431 ),
2432 ];
2433
2434 for (path, bytes, expected_language, expected_is_source, expected_is_script) in cases {
2435 let classification = classify_file_info(path, bytes);
2436
2437 assert_eq!(
2438 classification.programming_language.as_deref(),
2439 expected_language,
2440 "unexpected language for {}",
2441 path.display()
2442 );
2443 assert_eq!(
2444 classification.is_source,
2445 expected_is_source,
2446 "unexpected is_source for {}",
2447 path.display()
2448 );
2449 assert_eq!(
2450 classification.is_script,
2451 expected_is_script,
2452 "unexpected is_script for {}",
2453 path.display()
2454 );
2455 }
2456 }
2457}