1use std::borrow::Cow;
2use std::collections::BTreeSet;
3use std::fs;
4use std::io::{BufReader, Cursor, Read};
5use std::panic::{AssertUnwindSafe, catch_unwind};
6use std::path::Path;
7
8use chrono::{TimeZone, Utc};
9use file_format::{FileFormat, Kind as FileFormatKind};
10use flate2::read::ZlibDecoder;
11use glob::Pattern;
12use image::{ImageDecoder, ImageFormat, ImageReader};
13use mime_guess::from_path;
14use quick_xml::events::Event;
15use quick_xml::reader::Reader as XmlReader;
16
17use crate::parsers::windows_executable::extract_windows_executable_metadata_text;
18use crate::utils::font::extract_font_metadata_text;
19use crate::utils::language::detect_language;
20
/// Identifies which extraction strategy produced the text returned for a file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtractedTextKind {
    /// No usable text was extracted.
    None,
    /// Text obtained by decoding the file content directly (also used for RTF).
    Decoded,
    /// Text returned by the font metadata extractor.
    FontMetadata,
    /// Text extracted from a PDF document.
    Pdf,
    /// Printable strings pulled out of otherwise undecodable content.
    BinaryStrings,
    /// Text returned by the image metadata extractor.
    ImageMetadata,
    /// Only Windows executable metadata text was available.
    WindowsExecutableMetadata,
}
31
/// Result of classifying a file from its path and content.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileInfoClassification {
    /// Detected MIME type, e.g. `text/plain` or `application/zip`.
    pub mime_type: String,
    /// Human-readable description such as `"PDF document"` or `"data"`.
    pub file_type: String,
    /// Detected language; only populated when `is_source` is true.
    pub programming_language: Option<String>,
    /// Whether the content is treated as binary.
    pub is_binary: bool,
    /// Always the negation of `is_binary`.
    pub is_text: bool,
    /// Whether the file is an archive, compressed stream or package.
    pub is_archive: bool,
    /// Whether the file is an image, audio or video file.
    pub is_media: bool,
    /// Whether the file is considered source code.
    pub is_source: bool,
    /// Whether the file is considered a script.
    pub is_script: bool,
}
44
// NOTE(review): the two image-metadata limits are consumed outside this
// excerpt; presumably they cap the number and total size of extracted
// metadata values -- confirm against the image metadata extractor.
const MAX_IMAGE_METADATA_VALUES: usize = 64;
const MAX_IMAGE_METADATA_TEXT_BYTES: usize = 32 * 1024;
/// Content counts as binary when more than `len / DIVISOR` of its bytes are
/// control characters.
const BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR: usize = 10;
/// Minimum size at which unrecognised binary content may be skipped entirely
/// instead of being mined for text.
const LARGE_OPAQUE_BINARY_SKIP_BYTES: usize = 512 * 1024;
/// Extensions (lowercase) that are text but never classified as source code.
const PLAIN_TEXT_EXTENSIONS: &[&str] = &[
    "rst", "rest", "md", "txt", "log", "json", "xml", "yaml", "yml", "toml", "ini",
];
/// Extensions (lowercase) that force a binary classification unless the
/// detected format is textual.
const BINARY_EXTENSIONS: &[&str] = &[
    "pyc", "pyo", "pgm", "pbm", "ppm", "mp3", "mp4", "mpeg", "mpg", "emf",
];
/// Extensions (lowercase) that mark a non-text file as an archive.
const ARCHIVE_EXTENSIONS: &[&str] = &[
    "zip", "jar", "war", "ear", "tar", "gz", "tgz", "bz2", "xz", "7z", "rar", "apk", "deb", "rpm",
    "whl", "crate", "egg", "gem", "nupkg", "sqs", "squashfs",
];
59
60pub fn get_creation_date(metadata: &fs::Metadata) -> Option<String> {
62 metadata.modified().ok().map(|time: std::time::SystemTime| {
63 let seconds_since_epoch = time
64 .duration_since(std::time::UNIX_EPOCH)
65 .unwrap()
66 .as_secs() as i64;
67
68 Utc.timestamp_opt(seconds_since_epoch, 0)
69 .single()
70 .unwrap_or_else(Utc::now)
71 .format("%Y-%m-%d")
72 .to_string()
73 })
74}
75
76pub fn is_path_excluded(path: &Path, exclude_patterns: &[Pattern]) -> bool {
78 let path_str = path.to_string_lossy();
79 let file_name = path
80 .file_name()
81 .map(|name| name.to_string_lossy())
82 .unwrap_or_default();
83
84 for pattern in exclude_patterns {
85 if pattern.matches(&path_str) {
87 return true;
88 }
89
90 if pattern.matches(&file_name) {
92 return true;
93 }
94 }
95
96 false
97}
98
99pub fn decode_bytes_to_string(bytes: &[u8]) -> String {
105 match String::from_utf8(bytes.to_vec()) {
106 Ok(s) => s,
107 Err(e) => {
108 let bytes = e.into_bytes();
109 if has_binary_control_chars(&bytes) {
110 return String::new();
111 }
112 bytes.iter().map(|&b| b as char).collect()
113 }
114 }
115}
116
117pub fn extract_text_for_detection(path: &Path, bytes: &[u8]) -> (String, ExtractedTextKind) {
118 let (text, kind, _) = extract_text_for_detection_with_diagnostics(path, bytes);
119 (text, kind)
120}
121
122pub(crate) fn augment_license_detection_text<'a>(path: &Path, text: &'a str) -> Cow<'a, str> {
123 let Some(extension) = path.extension().and_then(|ext| ext.to_str()) else {
124 return Cow::Borrowed(text);
125 };
126 if !matches!(
127 extension.to_ascii_lowercase().as_str(),
128 "md" | "markdown" | "html" | "htm"
129 ) {
130 return Cow::Borrowed(text);
131 }
132
133 let mut hints = Vec::new();
134 if text.contains("CC BY 4.0") || text.contains("creativecommons.org/licenses/by/4.0") {
135 hints.push("Creative Commons Attribution 4.0 International License".to_string());
136 }
137 if text.contains("Apache License (Version 2.0)") || text.contains("Apache License, Version 2.0")
138 {
139 hints.push(
140 "Licensed under the Apache License, Version 2.0. http://www.apache.org/licenses/LICENSE-2.0"
141 .to_string(),
142 );
143 }
144
145 hints.extend(extract_shields_license_badge_hints(text));
146
147 if hints.is_empty() {
148 Cow::Borrowed(text)
149 } else {
150 let mut augmented =
151 String::with_capacity(text.len() + hints.iter().map(String::len).sum::<usize>() + 8);
152 augmented.push_str(text);
153 augmented.push_str("\n\n");
154 for (index, hint) in hints.into_iter().enumerate() {
155 if index > 0 {
156 augmented.push('\n');
157 }
158 augmented.push_str(&hint);
159 }
160 Cow::Owned(augmented)
161 }
162}
163
/// Collects license hints from shields.io static badges of the form
/// `img.shields.io/badge/license-<NAME>-<color>.svg`.
///
/// A query string or fragment after `.svg` (for example `?style=flat-square`)
/// is ignored. The trailing `-<color>` segment is dropped, `%20` becomes a
/// space and `_` becomes `-`. The returned hints are sorted and deduplicated.
fn extract_shields_license_badge_hints(text: &str) -> Vec<String> {
    const NEEDLE: &str = "img.shields.io/badge/license-";
    // Characters that cannot be part of the badge URL in Markdown/HTML.
    const URL_TERMINATORS: [char; 9] = [')', ']', '"', '\'', ' ', '\n', '\r', '\t', '>'];

    let mut hints = Vec::new();
    let mut rest = text;

    while let Some(index) = rest.find(NEEDLE) {
        let suffix = &rest[index + NEEDLE.len()..];
        let end = suffix.find(URL_TERMINATORS).unwrap_or(suffix.len());
        // Always resume scanning after this URL, whether or not it yields a hint.
        rest = &suffix[end..];

        // Badge URLs frequently carry styling parameters after the file name.
        let url_tail = &suffix[..end];
        let file_name = url_tail.split(['?', '#']).next().unwrap_or(url_tail);
        let Some(badge) = file_name.strip_suffix(".svg") else {
            continue;
        };

        let mut segments: Vec<&str> = badge
            .split('-')
            .filter(|segment| !segment.is_empty())
            .collect();
        // At least a license name and a colour are required.
        if segments.len() < 2 {
            continue;
        }
        segments.pop();

        let candidate = segments.join("-").replace("%20", " ").replace('_', "-");
        if !candidate.is_empty() {
            hints.push(canonical_shields_license_hint(&candidate));
        }
    }

    hints.sort();
    hints.dedup();
    hints
}

/// Maps a badge license label to a phrase the license detector recognises.
///
/// Unknown labels are passed through with a ` License` suffix.
fn canonical_shields_license_hint(candidate: &str) -> String {
    match candidate.trim() {
        "MIT" => "The MIT License".to_string(),
        "Apache-2.0" | "Apache 2.0" => "Apache License 2.0".to_string(),
        other => format!("{other} License"),
    }
}
210
/// Extracts the text used for detection from a file's content.
///
/// Returns the extracted text, the strategy that produced it, and an optional
/// diagnostic message (only set when PDF extraction yields no text).
///
/// Strategies are tried in a fixed order and the first applicable one wins:
/// RTF, PDF, image metadata, font metadata, then plain decoding and finally
/// printable-string extraction for binary content.
pub(crate) fn extract_text_for_detection_with_diagnostics(
    path: &Path,
    bytes: &[u8],
) -> (String, ExtractedTextKind, Option<String>) {
    let ext = path
        .extension()
        .and_then(|e| e.to_str())
        .map(|s| s.to_ascii_lowercase());
    let detected_format = detect_file_format(bytes);

    // RTF is reported as `Decoded` because the markup is stripped to plain text.
    if looks_like_rtf(bytes, ext.as_deref()) {
        let text = extract_rtf_text(bytes);
        return if text.trim().is_empty() {
            (String::new(), ExtractedTextKind::None, None)
        } else {
            (text, ExtractedTextKind::Decoded, None)
        };
    }

    // The PDF extractor's error is surfaced only when no text was produced.
    if looks_like_pdf(bytes) || detected_format.short_name() == Some("PDF") {
        let (text, scan_error) = extract_pdf_text(path, bytes);
        return if text.is_empty() {
            (String::new(), ExtractedTextKind::None, scan_error)
        } else {
            (text, ExtractedTextKind::Pdf, None)
        };
    }

    if let Some(format) = supported_image_metadata_format(ext.as_deref(), detected_format) {
        let text = extract_image_metadata_text(bytes, format);
        return if text.is_empty() {
            if is_supported_image_container(bytes, format) {
                (String::new(), ExtractedTextKind::None, None)
            } else {
                // The extension claimed an image but the content is not a valid
                // container; fall back to treating the bytes as text.
                let decoded = decode_bytes_to_string(bytes);
                if decoded.is_empty() {
                    (String::new(), ExtractedTextKind::None, None)
                } else {
                    (decoded, ExtractedTextKind::Decoded, None)
                }
            }
        } else {
            (text, ExtractedTextKind::ImageMetadata, None)
        };
    }

    if let Some(text) = extract_font_metadata_text(path, bytes) {
        return (text, ExtractedTextKind::FontMetadata, None);
    }

    // Gathered before the skip checks; it is discarded if either check returns early.
    let windows_executable_metadata_text = extract_windows_executable_metadata_text(bytes);

    if should_skip_large_opaque_binary_text_extraction(path, bytes, detected_format) {
        return (String::new(), ExtractedTextKind::None, None);
    }

    if should_skip_binary_string_extraction(path, bytes, detected_format) {
        return (String::new(), ExtractedTextKind::None, None);
    }

    // Decodable content: executable metadata (if any) is prepended to the text.
    let decoded = decode_bytes_to_string(bytes);
    if !decoded.is_empty() {
        let combined = combine_extracted_text_fragments(windows_executable_metadata_text, decoded);
        return (combined, ExtractedTextKind::Decoded, None);
    }

    // Last resort for binary content: printable strings.
    let text = extract_printable_strings(bytes);
    if text.is_empty() {
        if let Some(metadata_text) = windows_executable_metadata_text {
            (
                metadata_text,
                ExtractedTextKind::WindowsExecutableMetadata,
                None,
            )
        } else {
            (String::new(), ExtractedTextKind::None, None)
        }
    } else {
        (
            combine_extracted_text_fragments(windows_executable_metadata_text, text),
            ExtractedTextKind::BinaryStrings,
            None,
        )
    }
}
296
/// Joins an optional leading fragment and a trailing fragment with a newline.
///
/// An absent or empty prefix yields the suffix alone; an empty suffix yields
/// the prefix alone.
fn combine_extracted_text_fragments(prefix: Option<String>, suffix: String) -> String {
    let Some(leading) = prefix.filter(|fragment| !fragment.is_empty()) else {
        return suffix;
    };

    if suffix.is_empty() {
        leading
    } else {
        format!("{leading}\n{suffix}")
    }
}
304
305pub fn classify_file_info(path: &Path, bytes: &[u8]) -> FileInfoClassification {
306 let detected_format = detect_file_format(bytes);
307 let detected_language = detect_language(path, bytes);
308 let is_binary = detect_is_binary(path, bytes, detected_format, detected_language.as_deref());
309 let is_text = !is_binary;
310 let mime_type = detect_mime_type(path, bytes, detected_format, detected_language.as_deref());
311 let is_archive = detect_is_archive(path, bytes, &mime_type, is_text, detected_format);
312 let is_media = detect_is_media(path, bytes, &mime_type, detected_format);
313 let is_script = detect_is_script(path, bytes, detected_language.as_deref(), is_text);
314 let is_source = detect_is_source(path, detected_language.as_deref(), is_text, is_script);
315 let programming_language = is_source.then(|| detected_language.clone()).flatten();
316 let file_type = detect_file_type(
317 path,
318 bytes,
319 detected_format,
320 &mime_type,
321 programming_language.as_deref(),
322 is_binary,
323 is_text,
324 is_archive,
325 is_media,
326 is_script,
327 );
328
329 FileInfoClassification {
330 mime_type,
331 file_type,
332 programming_language,
333 is_binary,
334 is_text,
335 is_archive,
336 is_media,
337 is_source,
338 is_script,
339 }
340}
341
342fn detect_file_format(bytes: &[u8]) -> FileFormat {
343 FileFormat::from_reader(Cursor::new(bytes)).unwrap_or(FileFormat::ArbitraryBinaryData)
344}
345
/// Returns `true` when the whole slice is valid UTF-8 (including when empty).
fn is_utf8_text(bytes: &[u8]) -> bool {
    matches!(std::str::from_utf8(bytes), Ok(_))
}
349
350fn has_binary_control_chars(bytes: &[u8]) -> bool {
351 let control_count = bytes
352 .iter()
353 .filter(|&&b| b < 0x09 || (b > 0x0D && b < 0x20))
354 .count();
355 control_count > bytes.len() / BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR
356}
357
358fn has_decodable_text(bytes: &[u8]) -> bool {
359 bytes.is_empty() || is_utf8_text(bytes) || !has_binary_control_chars(bytes)
360}
361
362fn looks_like_textual_bytes(bytes: &[u8]) -> bool {
363 if bytes.is_empty() || is_utf8_text(bytes) {
364 return true;
365 }
366
367 let printable_count = bytes
368 .iter()
369 .filter(|&&b| matches!(b, b'\n' | b'\r' | b'\t') || (0x20..=0x7e).contains(&b))
370 .count();
371 printable_count * 2 >= bytes.len()
372}
373
/// Returns `true` for media types that denote textual content: anything under
/// `text/`, JSON and XML, and structured-syntax `+json` / `+xml` types.
fn is_textual_media_type(media_type: &str) -> bool {
    const TEXTUAL_TYPES: &[&str] = &["application/json", "application/xml", "text/xml"];
    const TEXTUAL_SUFFIXES: &[&str] = &["+json", "+xml"];

    if media_type.starts_with("text/") || TEXTUAL_TYPES.contains(&media_type) {
        return true;
    }
    TEXTUAL_SUFFIXES
        .iter()
        .any(|suffix| media_type.ends_with(*suffix))
}
383
384fn is_textual_format(detected_format: FileFormat) -> bool {
385 matches!(detected_format, FileFormat::Empty | FileFormat::PlainText)
386 || is_textual_media_type(detected_format.media_type())
387}
388
389fn is_known_binary_format(detected_format: FileFormat) -> bool {
390 !matches!(detected_format, FileFormat::ArbitraryBinaryData)
391 && !is_textual_format(detected_format)
392}
393
394pub fn detect_mime_type(
395 path: &Path,
396 bytes: &[u8],
397 detected_format: FileFormat,
398 programming_language: Option<&str>,
399) -> String {
400 if bytes.is_empty() {
401 return "inode/x-empty".to_string();
402 }
403
404 if is_zip_archive(bytes) {
405 return detect_zip_like_mime(path);
406 }
407
408 if looks_like_deb(bytes, path) {
409 return "application/vnd.debian.binary-package".to_string();
410 }
411
412 if looks_like_rpm(bytes, path) {
413 return "application/x-rpm".to_string();
414 }
415
416 let guessed_mime = from_path(path)
417 .first_or_octet_stream()
418 .essence_str()
419 .to_string();
420
421 let mime_type = match detected_format {
422 FileFormat::Empty => "inode/x-empty".to_string(),
423 FileFormat::PlainText => {
424 if guessed_mime == "application/octet-stream" || guessed_mime.starts_with("video/") {
425 "text/plain".to_string()
426 } else {
427 guessed_mime.clone()
428 }
429 }
430 _ => {
431 let detected_mime = detected_format.media_type();
432 if detected_mime == "application/octet-stream"
433 && guessed_mime != "application/octet-stream"
434 {
435 guessed_mime.clone()
436 } else {
437 detected_mime.to_string()
438 }
439 }
440 };
441
442 normalize_mime_type(path, bytes, programming_language, &mime_type)
443}
444
445fn normalize_mime_type(
446 path: &Path,
447 bytes: &[u8],
448 programming_language: Option<&str>,
449 mime_type: &str,
450) -> String {
451 if should_prefer_text_mime(path, bytes, programming_language, mime_type) {
452 return "text/plain".to_string();
453 }
454
455 mime_type.to_string()
456}
457
458fn should_prefer_text_mime(
459 path: &Path,
460 bytes: &[u8],
461 programming_language: Option<&str>,
462 mime_type: &str,
463) -> bool {
464 has_decodable_text(bytes)
465 && looks_like_textual_bytes(bytes)
466 && is_textual_source_candidate(path, programming_language)
467 && (mime_type.starts_with("video/") || mime_type == "application/octet-stream")
468}
469
470fn detect_is_binary(
471 path: &Path,
472 bytes: &[u8],
473 detected_format: FileFormat,
474 programming_language: Option<&str>,
475) -> bool {
476 if is_textual_format(detected_format) {
477 return false;
478 }
479
480 if lower_extension(path)
481 .as_deref()
482 .is_some_and(|ext| BINARY_EXTENSIONS.contains(&ext))
483 {
484 return true;
485 }
486
487 if should_treat_binary_bytes_as_text(path, bytes, programming_language) {
488 return false;
489 }
490
491 has_binary_control_chars(bytes)
492 || is_known_binary_format(detected_format)
493 || (matches!(detected_format, FileFormat::ArbitraryBinaryData)
494 && !looks_like_textual_bytes(bytes))
495}
496
497fn should_treat_binary_bytes_as_text(
498 path: &Path,
499 bytes: &[u8],
500 programming_language: Option<&str>,
501) -> bool {
502 has_decodable_text(bytes)
503 && looks_like_textual_bytes(bytes)
504 && (bytes.starts_with(b"#!") || is_textual_source_candidate(path, programming_language))
505}
506
507fn detect_is_archive(
508 path: &Path,
509 bytes: &[u8],
510 mime_type: &str,
511 is_text: bool,
512 detected_format: FileFormat,
513) -> bool {
514 if is_text {
515 return false;
516 }
517
518 lower_extension(path)
519 .as_deref()
520 .is_some_and(|ext| ARCHIVE_EXTENSIONS.contains(&ext))
521 || matches!(
522 detected_format.kind(),
523 FileFormatKind::Archive | FileFormatKind::Compressed | FileFormatKind::Package
524 )
525 || is_zip_archive(bytes)
526 || looks_like_gzip(bytes)
527 || looks_like_bzip2(bytes)
528 || looks_like_xz(bytes)
529 || looks_like_deb(bytes, path)
530 || looks_like_rpm(bytes, path)
531 || looks_like_squashfs(bytes, path)
532 || mime_type.contains("zip")
533 || mime_type.contains("compressed")
534 || mime_type.contains("tar")
535 || mime_type.contains("x-rpm")
536 || mime_type.contains("debian")
537}
538
539fn detect_is_media(
540 path: &Path,
541 bytes: &[u8],
542 mime_type: &str,
543 detected_format: FileFormat,
544) -> bool {
545 media_mime_from_content(bytes).is_some()
546 || matches!(
547 detected_format.kind(),
548 FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
549 )
550 || mime_type.starts_with("image/")
551 || mime_type.starts_with("audio/")
552 || mime_type.starts_with("video/")
553 || (mime_type == "application/octet-stream"
554 && lower_extension(path).as_deref() == Some("tga")
555 && !has_binary_control_chars(bytes))
556}
557
558fn detect_is_script(
559 path: &Path,
560 bytes: &[u8],
561 programming_language: Option<&str>,
562 is_text: bool,
563) -> bool {
564 if !is_text || is_makefile(path) {
565 return false;
566 }
567
568 bytes.starts_with(b"#!")
569 || lower_extension(path).as_deref().is_some_and(|ext| {
570 matches!(
571 ext,
572 "sh" | "bash" | "zsh" | "fish" | "ksh" | "ps1" | "psm1" | "psd1" | "awk"
573 )
574 })
575 || matches!(
576 programming_language,
577 Some("Shell" | "Python" | "Ruby" | "Perl" | "PHP" | "PowerShell" | "Awk")
578 )
579}
580
581fn detect_is_source(
582 path: &Path,
583 programming_language: Option<&str>,
584 is_text: bool,
585 is_script: bool,
586) -> bool {
587 if !is_text || is_plain_text(path) || is_makefile(path) || is_source_map(path) {
588 return false;
589 }
590
591 if is_c_like_source(path) || is_java_like_source(path) {
592 return true;
593 }
594
595 programming_language.is_some() || is_script
596}
597
598#[allow(clippy::too_many_arguments)]
599fn detect_file_type(
600 path: &Path,
601 bytes: &[u8],
602 detected_format: FileFormat,
603 mime_type: &str,
604 programming_language: Option<&str>,
605 is_binary: bool,
606 is_text: bool,
607 is_archive: bool,
608 is_media: bool,
609 is_script: bool,
610) -> String {
611 if bytes.is_empty() {
612 return "empty".to_string();
613 }
614
615 if looks_like_pdf(bytes) {
616 return "PDF document".to_string();
617 }
618
619 if let Some(file_type) = media_file_type_from_content(bytes) {
620 return file_type.to_string();
621 }
622
623 if is_archive {
624 return archive_file_type(path, bytes, detected_format);
625 }
626
627 if is_script {
628 return script_file_type(programming_language, bytes);
629 }
630
631 if is_text {
632 if lower_extension(path).as_deref() == Some("json") {
633 return "JSON text data".to_string();
634 }
635 if lower_extension(path).as_deref() == Some("xml") {
636 return "XML text data".to_string();
637 }
638 if matches!(lower_extension(path).as_deref(), Some("yaml" | "yml")) {
639 return "YAML text data".to_string();
640 }
641 if lower_extension(path).as_deref() == Some("toml") {
642 return "TOML text data".to_string();
643 }
644 if matches!(
645 lower_extension(path).as_deref(),
646 Some("ini" | "cfg" | "conf")
647 ) {
648 return "INI text data".to_string();
649 }
650 if matches!(lower_file_name(path).as_str(), ".gitmodules" | ".gitconfig") {
651 return "Git configuration text".to_string();
652 }
653 if matches!(lower_extension(path).as_deref(), Some("md" | "markdown")) {
654 return text_file_type(bytes);
655 }
656 if programming_language.is_some() && !is_media {
657 return text_file_type(bytes);
658 }
659 return text_file_type(bytes);
660 }
661
662 if let Some(file_type) = format_based_file_type(detected_format) {
663 return file_type;
664 }
665
666 if is_binary && mime_type == "application/octet-stream" {
667 return "data".to_string();
668 }
669
670 mime_type.to_string()
671}
672
673fn is_textual_source_candidate(path: &Path, programming_language: Option<&str>) -> bool {
674 if matches!(programming_language, Some(language) if is_source_like_language(language)) {
675 return true;
676 }
677
678 if matches!(
679 lower_file_name(path).as_str(),
680 "dockerfile"
681 | "containerfile"
682 | "containerfile.core"
683 | "apkbuild"
684 | "podfile"
685 | "meson.build"
686 | "build"
687 | "workspace"
688 | "buck"
689 | "default.nix"
690 | "flake.nix"
691 | "shell.nix"
692 ) {
693 return true;
694 }
695
696 path.extension()
697 .and_then(|ext| ext.to_str())
698 .is_some_and(|ext| {
699 matches!(
700 ext.to_ascii_lowercase().as_str(),
701 "rs" | "py"
702 | "js"
703 | "mjs"
704 | "cjs"
705 | "jsx"
706 | "ts"
707 | "mts"
708 | "cts"
709 | "tsx"
710 | "c"
711 | "cpp"
712 | "cc"
713 | "cxx"
714 | "h"
715 | "hpp"
716 | "m"
717 | "mm"
718 | "s"
719 | "asm"
720 | "java"
721 | "go"
722 | "rb"
723 | "php"
724 | "pl"
725 | "swift"
726 | "sh"
727 | "bash"
728 | "zsh"
729 | "fish"
730 | "ksh"
731 | "ps1"
732 | "psm1"
733 | "psd1"
734 | "awk"
735 | "kt"
736 | "kts"
737 | "dart"
738 | "scala"
739 | "groovy"
740 | "gradle"
741 | "gvy"
742 | "gy"
743 | "gsh"
744 | "cs"
745 | "fs"
746 | "fsx"
747 | "r"
748 | "lua"
749 | "jl"
750 | "ex"
751 | "exs"
752 | "clj"
753 | "cljs"
754 | "cljc"
755 | "hs"
756 | "erl"
757 | "nix"
758 | "zig"
759 | "bzl"
760 | "bazel"
761 | "star"
762 | "sky"
763 | "ml"
764 | "mli"
765 | "tex"
766 )
767 })
768}
769
/// Returns `true` when `language` (matched case-sensitively) names a language
/// whose files are textual source code.
fn is_source_like_language(language: &str) -> bool {
    const SOURCE_LIKE_LANGUAGES: &[&str] = &[
        "Rust",
        "Python",
        "JavaScript",
        "TypeScript",
        "JavaScript/TypeScript",
        "C",
        "C++",
        "Objective-C",
        "Objective-C++",
        "GAS",
        "Java",
        "Go",
        "Ruby",
        "PHP",
        "Perl",
        "Swift",
        "Shell",
        "PowerShell",
        "Awk",
        "Kotlin",
        "Dart",
        "Scala",
        "C#",
        "F#",
        "R",
        "Lua",
        "Julia",
        "Elixir",
        "Clojure",
        "Haskell",
        "Erlang",
        "Groovy",
        "Nix",
        "Zig",
        "Starlark",
        "OCaml",
        "Meson",
        "TeX",
        "Dockerfile",
        "Makefile",
    ];

    SOURCE_LIKE_LANGUAGES.contains(&language)
}
815
/// Returns the path's extension when it has one and it is valid UTF-8.
fn extension(path: &Path) -> Option<&str> {
    let raw = path.extension()?;
    raw.to_str()
}
819
820fn lower_extension(path: &Path) -> Option<String> {
821 extension(path).map(|ext| ext.to_ascii_lowercase())
822}
823
/// Returns the final path component in ASCII lowercase, or an empty string
/// when there is none or it is not valid UTF-8.
fn lower_file_name(path: &Path) -> String {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name.to_ascii_lowercase(),
        None => String::new(),
    }
}
830
831fn is_plain_text(path: &Path) -> bool {
832 lower_extension(path)
833 .as_deref()
834 .is_some_and(|ext| PLAIN_TEXT_EXTENSIONS.contains(&ext))
835}
836
837fn is_makefile(path: &Path) -> bool {
838 matches!(lower_file_name(path).as_str(), "makefile" | "makefile.inc")
839}
840
/// Returns `true` for JavaScript and CSS source maps (`*.js.map`,
/// `*.css.map`), compared case-insensitively.
fn is_source_map(path: &Path) -> bool {
    let lowered = path.to_string_lossy().to_ascii_lowercase();
    [".js.map", ".css.map"]
        .iter()
        .any(|suffix| lowered.ends_with(*suffix))
}
845
846fn is_c_like_source(path: &Path) -> bool {
847 lower_extension(path).as_deref().is_some_and(|ext| {
848 matches!(
849 ext,
850 "c" | "cc"
851 | "cp"
852 | "cpp"
853 | "cxx"
854 | "c++"
855 | "h"
856 | "hh"
857 | "hpp"
858 | "hxx"
859 | "h++"
860 | "i"
861 | "ii"
862 | "m"
863 | "s"
864 | "asm"
865 )
866 })
867}
868
869fn is_java_like_source(path: &Path) -> bool {
870 lower_extension(path)
871 .as_deref()
872 .is_some_and(|ext| matches!(ext, "java" | "aj" | "jad" | "ajt"))
873}
874
875fn format_based_file_type(detected_format: FileFormat) -> Option<String> {
876 match detected_format {
877 FileFormat::ArbitraryBinaryData | FileFormat::Empty | FileFormat::PlainText => None,
878 format if format.short_name() == Some("PDF") => Some("PDF document".to_string()),
879 format => Some(match format.kind() {
880 FileFormatKind::Image => short_name_or_name(&format, "image data"),
881 FileFormatKind::Audio => short_name_or_name(&format, "audio data"),
882 FileFormatKind::Video => short_name_or_name(&format, "video data"),
883 _ => format.name().to_string(),
884 }),
885 }
886}
887
888fn short_name_or_name(format: &FileFormat, suffix: &str) -> String {
889 format
890 .short_name()
891 .map(|short_name| format!("{short_name} {suffix}"))
892 .unwrap_or_else(|| format!("{} {suffix}", format.name()))
893}
894
895fn detect_zip_like_mime(path: &Path) -> String {
896 match extension(path).map(|ext| ext.to_ascii_lowercase()) {
897 Some(ext) if ext == "apk" => "application/vnd.android.package-archive".to_string(),
898 Some(ext) if matches!(ext.as_str(), "jar" | "war" | "ear") => {
899 "application/java-archive".to_string()
900 }
901 _ => "application/zip".to_string(),
902 }
903}
904
/// Recognises PNG, JPEG, TIFF and WebP content by magic bytes and returns the
/// matching MIME type.
fn media_mime_from_content(bytes: &[u8]) -> Option<&'static str> {
    const PNG_MAGIC: &[u8] = b"\x89PNG\r\n\x1a\n";
    const JPEG_MAGIC: &[u8] = &[0xff, 0xd8, 0xff];
    const TIFF_LITTLE_ENDIAN: &[u8] = b"II\x2a\x00";
    const TIFF_BIG_ENDIAN: &[u8] = b"MM\x00\x2a";

    if bytes.starts_with(PNG_MAGIC) {
        return Some("image/png");
    }
    if bytes.starts_with(JPEG_MAGIC) {
        return Some("image/jpeg");
    }
    if bytes.starts_with(TIFF_LITTLE_ENDIAN) || bytes.starts_with(TIFF_BIG_ENDIAN) {
        return Some("image/tiff");
    }

    // WebP is a RIFF container whose form type at offset 8 is "WEBP".
    let is_webp = bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]);
    is_webp.then_some("image/webp")
}
918
/// Recognises PNG, JPEG, TIFF and WebP content by magic bytes and returns a
/// human-readable description.
fn media_file_type_from_content(bytes: &[u8]) -> Option<&'static str> {
    const PNG_MAGIC: &[u8] = b"\x89PNG\r\n\x1a\n";
    const JPEG_MAGIC: &[u8] = &[0xff, 0xd8, 0xff];
    const TIFF_LITTLE_ENDIAN: &[u8] = b"II\x2a\x00";
    const TIFF_BIG_ENDIAN: &[u8] = b"MM\x00\x2a";

    if bytes.starts_with(PNG_MAGIC) {
        return Some("PNG image data");
    }
    if bytes.starts_with(JPEG_MAGIC) {
        return Some("JPEG image data");
    }
    if bytes.starts_with(TIFF_LITTLE_ENDIAN) || bytes.starts_with(TIFF_BIG_ENDIAN) {
        return Some("TIFF image data");
    }

    // WebP is a RIFF container whose form type at offset 8 is "WEBP".
    let is_webp = bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]);
    is_webp.then_some("WebP image data")
}
932
/// Returns `true` when the content starts with the `%PDF-` header.
fn looks_like_pdf(bytes: &[u8]) -> bool {
    matches!(bytes, [b'%', b'P', b'D', b'F', b'-', ..])
}
936
/// Returns `true` when the (already lowercased) extension is `rtf` or the
/// content starts with the `{\rtf` header.
fn looks_like_rtf(bytes: &[u8], ext: Option<&str>) -> bool {
    matches!(ext, Some("rtf")) || bytes.starts_with(br"{\rtf")
}
940
/// Strips RTF markup and returns the remaining plain text.
///
/// This is a lightweight scanner, not a full RTF parser:
/// - group braces are dropped, and the escaped symbols `\\`, `\{`, `\}` are
///   emitted literally;
/// - `\'hh` escapes are emitted as the code point with that byte value;
/// - `\uN` escapes are emitted as the corresponding character, and the single
///   fallback that follows (a plain character or a `\'hh` escape) is skipped;
/// - a handful of control words (`\par`, `\line`, `\tab`, dashes, bullets and
///   quotes) are translated, and all other control words are discarded.
///
/// Finally CR and form-feed become newlines and trailing whitespace is
/// trimmed from every line.
fn extract_rtf_text(bytes: &[u8]) -> String {
    /// Decodes the two hex digits that follow the `'` at `quote_index`.
    fn hex_escape_value(chars: &[char], quote_index: usize) -> Option<u8> {
        let digits = chars.get(quote_index + 1..quote_index + 3)?;
        // `from_str_radix` also accepts a leading sign, which is not valid here.
        if !digits.iter().all(char::is_ascii_hexdigit) {
            return None;
        }
        let hex: String = digits.iter().collect();
        u8::from_str_radix(&hex, 16).ok()
    }

    let text = String::from_utf8_lossy(bytes);
    let chars: Vec<char> = text.chars().collect();
    let mut output = String::new();
    let mut index = 0usize;

    while index < chars.len() {
        match chars[index] {
            '{' | '}' => {
                index += 1;
            }
            '\\' => {
                index += 1;
                let Some(&symbol) = chars.get(index) else {
                    break;
                };

                match symbol {
                    '\\' | '{' | '}' => {
                        output.push(symbol);
                        index += 1;
                    }
                    '\'' => match hex_escape_value(&chars, index) {
                        Some(value) => {
                            output.push(char::from(value));
                            index += 3;
                        }
                        // Malformed escape: drop the quote, keep what follows as text.
                        None => index += 1,
                    },
                    control if control.is_ascii_alphabetic() => {
                        let start = index;
                        while index < chars.len() && chars[index].is_ascii_alphabetic() {
                            index += 1;
                        }
                        let control_word: String = chars[start..index].iter().collect();

                        // Optional numeric parameter. A '-' is a sign only when a
                        // digit follows; otherwise it is ordinary text.
                        let number_start = index;
                        let has_sign = chars.get(index) == Some(&'-')
                            && chars.get(index + 1).is_some_and(char::is_ascii_digit);
                        if has_sign {
                            index += 1;
                        }
                        while index < chars.len() && chars[index].is_ascii_digit() {
                            index += 1;
                        }
                        let parameter: String = chars[number_start..index].iter().collect();

                        // A single space after a control word is a delimiter, not text.
                        if chars.get(index) == Some(&' ') {
                            index += 1;
                        }

                        match control_word.as_str() {
                            "par" | "line" => output.push('\n'),
                            "tab" => output.push('\t'),
                            "emdash" => output.push('—'),
                            "endash" => output.push('–'),
                            "bullet" => output.push('•'),
                            "lquote" | "rquote" => output.push('\''),
                            "ldblquote" | "rdblquote" => output.push('"'),
                            "u" => {
                                // Values above 32767 are written as negative numbers.
                                let decoded = parameter
                                    .parse::<i32>()
                                    .ok()
                                    .map(|code| if code < 0 { code + 65_536 } else { code })
                                    .and_then(|code| u32::try_from(code).ok())
                                    .and_then(char::from_u32);
                                if let Some(ch) = decoded {
                                    output.push(ch);
                                }

                                // Skip the one fallback representation that follows.
                                let fallback_is_hex_escape = chars.get(index) == Some(&'\\')
                                    && chars.get(index + 1) == Some(&'\'')
                                    && hex_escape_value(&chars, index + 1).is_some();
                                if fallback_is_hex_escape {
                                    // Backslash, quote and two hex digits.
                                    index += 4;
                                } else if index < chars.len()
                                    && !matches!(chars[index], '\\' | '{' | '}' | '\n' | '\r')
                                {
                                    index += 1;
                                }
                            }
                            _ => {}
                        }
                    }
                    _ => {
                        // Other control symbols (for example `\~` or `\*`) are dropped.
                        index += 1;
                    }
                }
            }
            ch => {
                output.push(ch);
                index += 1;
            }
        }
    }

    output
        .replace(['\r', '\u{0c}'], "\n")
        .lines()
        .map(str::trim_end)
        .collect::<Vec<_>>()
        .join("\n")
}
1047
/// Returns `true` when the content starts with the gzip magic bytes `1f 8b`.
fn looks_like_gzip(bytes: &[u8]) -> bool {
    matches!(bytes, [0x1f, 0x8b, ..])
}
1051
/// Returns `true` when the content starts with the bzip2 header `BZh`.
fn looks_like_bzip2(bytes: &[u8]) -> bool {
    matches!(bytes, [b'B', b'Z', b'h', ..])
}
1055
/// Returns `true` when the content starts with the six-byte XZ magic
/// `fd 37 7a 58 5a 00`.
fn looks_like_xz(bytes: &[u8]) -> bool {
    matches!(bytes, [0xfd, b'7', b'z', b'X', b'Z', 0x00, ..])
}
1059
1060fn looks_like_deb(bytes: &[u8], path: &Path) -> bool {
1061 lower_extension(path).as_deref() == Some("deb") && bytes.starts_with(b"!<arch>\n")
1062}
1063
1064fn looks_like_rpm(bytes: &[u8], path: &Path) -> bool {
1065 lower_extension(path).as_deref() == Some("rpm") && bytes.starts_with(&[0xed, 0xab, 0xee, 0xdb])
1066}
1067
1068fn looks_like_squashfs(bytes: &[u8], path: &Path) -> bool {
1069 lower_extension(path)
1070 .as_deref()
1071 .is_some_and(|ext| matches!(ext, "sqs" | "squashfs"))
1072 && (bytes.starts_with(&[0x68, 0x73, 0x71, 0x73])
1073 || bytes.starts_with(&[0x73, 0x71, 0x73, 0x68]))
1074}
1075
1076fn archive_file_type(path: &Path, bytes: &[u8], detected_format: FileFormat) -> String {
1077 if looks_like_deb(bytes, path) {
1078 "debian binary package (format 2.0)".to_string()
1079 } else if looks_like_rpm(bytes, path) {
1080 "RPM package".to_string()
1081 } else if looks_like_squashfs(bytes, path) {
1082 "Squashfs filesystem".to_string()
1083 } else if looks_like_gzip(bytes) {
1084 "gzip compressed data".to_string()
1085 } else if looks_like_bzip2(bytes) {
1086 "bzip2 compressed data".to_string()
1087 } else if looks_like_xz(bytes) {
1088 "XZ compressed data".to_string()
1089 } else if is_zip_archive(bytes) {
1090 "Zip archive data".to_string()
1091 } else if lower_extension(path).as_deref() == Some("gem") {
1092 "POSIX tar archive".to_string()
1093 } else if let Some(file_type) = format_based_file_type(detected_format) {
1094 file_type
1095 } else {
1096 "archive data".to_string()
1097 }
1098}
1099
1100fn script_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1101 let suffix = text_executable_label(bytes);
1102
1103 match programming_language {
1104 Some("Python") => format!("python script, {suffix}"),
1105 Some("Ruby") => format!("ruby script, {suffix}"),
1106 Some("Perl") => format!("perl script, {suffix}"),
1107 Some("PHP") => format!("php script, {suffix}"),
1108 Some("Shell") => format!("shell script, {suffix}"),
1109 Some("JavaScript") => format!("javascript script, {suffix}"),
1110 Some("TypeScript") => format!("typescript script, {suffix}"),
1111 Some("PowerShell") => format!("powershell script, {suffix}"),
1112 Some("Awk") => format!("awk script, {suffix}"),
1113 _ => format!("script, {suffix}"),
1114 }
1115}
1116
1117fn text_file_type(bytes: &[u8]) -> String {
1118 text_label(bytes).to_string()
1119}
1120
/// Describes text content by encoding (valid UTF-8 or not) and by whether it
/// contains at least one line feed.
fn text_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_line_feed = bytes.contains(&b'\n');

    match (is_utf8, has_line_feed) {
        (true, true) => "UTF-8 Unicode text",
        (true, false) => "UTF-8 Unicode text, with no line terminators",
        (false, true) => "text",
        (false, false) => "text, with no line terminators",
    }
}
1134
/// Describes executable text content by encoding (valid UTF-8 or not) and by
/// whether it contains at least one line feed.
fn text_executable_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_line_feed = bytes.contains(&b'\n');

    match (is_utf8, has_line_feed) {
        (true, true) => "UTF-8 Unicode text executable",
        (true, false) => "UTF-8 Unicode text executable, with no line terminators",
        (false, true) => "text executable",
        (false, false) => "text executable, with no line terminators",
    }
}
1148
1149fn supported_image_metadata_format(
1150 ext: Option<&str>,
1151 detected_format: FileFormat,
1152) -> Option<ImageFormat> {
1153 match ext {
1154 Some("jpg" | "jpeg") => Some(ImageFormat::Jpeg),
1155 Some("png") => Some(ImageFormat::Png),
1156 Some("tif" | "tiff") => Some(ImageFormat::Tiff),
1157 Some("webp") => Some(ImageFormat::WebP),
1158 _ => match detected_format.media_type() {
1159 "image/jpeg" => Some(ImageFormat::Jpeg),
1160 "image/png" => Some(ImageFormat::Png),
1161 "image/tiff" => Some(ImageFormat::Tiff),
1162 "image/webp" => Some(ImageFormat::WebP),
1163 _ => None,
1164 },
1165 }
1166}
1167
1168fn should_skip_binary_string_extraction(
1169 path: &Path,
1170 bytes: &[u8],
1171 detected_format: FileFormat,
1172) -> bool {
1173 matches!(lower_extension(path).as_deref(), Some("pdf"))
1174 || supported_image_metadata_format(lower_extension(path).as_deref(), detected_format)
1175 .is_some()
1176 || (matches!(
1177 detected_format.kind(),
1178 FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
1179 ) && !is_textual_format(detected_format))
1180 || media_mime_from_content(bytes).is_some()
1181 || is_zip_archive(bytes)
1182 || looks_like_gzip(bytes)
1183 || looks_like_bzip2(bytes)
1184 || looks_like_xz(bytes)
1185 || looks_like_deb(bytes, path)
1186 || looks_like_rpm(bytes, path)
1187 || looks_like_squashfs(bytes, path)
1188}
1189
1190fn should_skip_large_opaque_binary_text_extraction(
1191 _path: &Path,
1192 bytes: &[u8],
1193 detected_format: FileFormat,
1194) -> bool {
1195 if bytes.len() < LARGE_OPAQUE_BINARY_SKIP_BYTES {
1196 return false;
1197 }
1198
1199 if !matches!(detected_format, FileFormat::ArbitraryBinaryData) {
1200 return false;
1201 }
1202
1203 if !has_binary_control_chars(bytes) {
1204 return false;
1205 }
1206
1207 !sample_has_promising_printable_strings(bytes)
1208}
1209
1210fn sample_has_promising_printable_strings(bytes: &[u8]) -> bool {
1211 const SAMPLE_WINDOW_BYTES: usize = 64 * 1024;
1212 const MIN_PROMISING_RUN: usize = 16;
1213 const MIN_PROMISING_WINDOWS: usize = 2;
1214
1215 let len = bytes.len();
1216 let mut windows = Vec::new();
1217 windows.push(&bytes[..bytes.len().min(SAMPLE_WINDOW_BYTES)]);
1218 if len > SAMPLE_WINDOW_BYTES * 2 {
1219 let mid_start = len / 2 - SAMPLE_WINDOW_BYTES / 2;
1220 let mid_end = (mid_start + SAMPLE_WINDOW_BYTES).min(len);
1221 windows.push(&bytes[mid_start..mid_end]);
1222 }
1223 if len > SAMPLE_WINDOW_BYTES {
1224 windows.push(&bytes[len - SAMPLE_WINDOW_BYTES..]);
1225 }
1226
1227 let promising_windows = windows
1228 .iter()
1229 .filter(|window| has_promising_printable_run(window, MIN_PROMISING_RUN))
1230 .count();
1231
1232 promising_windows >= MIN_PROMISING_WINDOWS
1233 || windows
1234 .iter()
1235 .any(|window| has_strong_structured_text_signal(window))
1236}
1237
1238fn has_strong_structured_text_signal(bytes: &[u8]) -> bool {
1239 let strings = extract_printable_strings(bytes);
1240 if strings.is_empty() {
1241 return false;
1242 }
1243
1244 let email_markers = strings.matches('@').count();
1245 let url_markers = strings.matches("http://").count() + strings.matches("https://").count();
1246
1247 email_markers + url_markers >= 3
1248}
1249
1250fn has_promising_printable_run(bytes: &[u8], min_run: usize) -> bool {
1251 longest_printable_ascii_run(bytes) >= min_run
1252 || longest_utf16le_printable_ascii_run(bytes) >= min_run
1253 || longest_utf16be_printable_ascii_run(bytes) >= min_run
1254}
1255
/// Length of the longest contiguous run of printable ASCII bytes
/// (`0x20..=0x7E`) in `bytes`; `0` when there is none.
fn longest_printable_ascii_run(bytes: &[u8]) -> usize {
    // Splitting on every non-printable byte leaves exactly the printable runs.
    bytes
        .split(|byte| !matches!(byte, 0x20..=0x7E))
        .map(<[u8]>::len)
        .max()
        .unwrap_or(0)
}
1269
1270fn longest_utf16le_printable_ascii_run(bytes: &[u8]) -> usize {
1271 longest_utf16_printable_ascii_run(bytes, true)
1272}
1273
1274fn longest_utf16be_printable_ascii_run(bytes: &[u8]) -> usize {
1275 longest_utf16_printable_ascii_run(bytes, false)
1276}
1277
/// Length, in characters, of the longest run of printable ASCII encoded as
/// 16-bit code units in `bytes`.
///
/// The buffer is read as consecutive two-byte units starting at offset 0; a
/// trailing odd byte is ignored. A unit counts when its character byte is in
/// `0x20..=0x7E` and its other byte is zero:
///
/// * `little_endian == true`: the character byte comes first (`ch 00`);
/// * `little_endian == false`: the zero byte comes first (`00 ch`).
///
/// The big-endian mode previously started at offset 1 *and* swapped the byte
/// roles. Those two adjustments cancelled out, so it rescanned the
/// little-endian layout shifted by one unit and never matched `00 ch` text
/// that begins at the start of the buffer. Between them the two modes now
/// cover both byte layouts, which also catches text placed at an odd offset
/// (it shows up in the opposite mode, at worst one character shorter).
fn longest_utf16_printable_ascii_run(bytes: &[u8], little_endian: bool) -> usize {
    let mut best = 0;
    let mut current = 0;

    for unit in bytes.chunks_exact(2) {
        let (ch, zero) = if little_endian {
            (unit[0], unit[1])
        } else {
            (unit[1], unit[0])
        };

        if matches!(ch, 0x20..=0x7E) && zero == 0 {
            current += 1;
            best = best.max(current);
        } else {
            current = 0;
        }
    }

    best
}
1299
1300fn is_supported_image_container(bytes: &[u8], format: ImageFormat) -> bool {
1301 match format {
1302 ImageFormat::Png => bytes.starts_with(b"\x89PNG\r\n\x1a\n"),
1303 ImageFormat::Jpeg => bytes.starts_with(&[0xff, 0xd8, 0xff]),
1304 ImageFormat::Tiff => bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a"),
1305 ImageFormat::WebP => {
1306 bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP"
1307 }
1308 _ => false,
1309 }
1310}
1311
1312fn extract_image_metadata_text(bytes: &[u8], format: ImageFormat) -> String {
1313 let mut values = Vec::new();
1314 values.extend(extract_exif_metadata_values(bytes));
1315 values.extend(extract_xmp_metadata_values(bytes, format));
1316 values_to_text(values)
1317}
1318
1319fn extract_exif_metadata_values(bytes: &[u8]) -> Vec<String> {
1320 let mut cursor = BufReader::new(Cursor::new(bytes));
1321 let exif = match exif::Reader::new().read_from_container(&mut cursor) {
1322 Ok(exif) => exif,
1323 Err(_) => return Vec::new(),
1324 };
1325
1326 let mut values = Vec::new();
1327 for field in exif.fields() {
1328 let rendered = match field.tag {
1329 exif::Tag::ImageDescription | exif::Tag::Copyright | exif::Tag::UserComment => {
1330 Some(field.display_value().with_unit(&exif).to_string())
1331 }
1332 exif::Tag::Artist => Some(format!(
1333 "Author: {}",
1334 field.display_value().with_unit(&exif)
1335 )),
1336 _ => None,
1337 };
1338
1339 if let Some(rendered) = rendered {
1340 values.push(rendered);
1341 }
1342 }
1343
1344 values
1345}
1346
1347fn extract_xmp_metadata_values(bytes: &[u8], format: ImageFormat) -> Vec<String> {
1348 let xmp = match extract_raw_xmp_packet(bytes, format) {
1349 Some(xmp) => xmp,
1350 None => return Vec::new(),
1351 };
1352
1353 parse_xmp_values(&xmp)
1354}
1355
1356fn extract_raw_xmp_packet(bytes: &[u8], format: ImageFormat) -> Option<Vec<u8>> {
1357 let reader = ImageReader::with_format(BufReader::new(Cursor::new(bytes)), format);
1358 if let Ok(mut decoder) = reader.into_decoder()
1359 && let Ok(Some(xmp)) = decoder.xmp_metadata()
1360 {
1361 return Some(xmp);
1362 }
1363
1364 match format {
1365 ImageFormat::Png => extract_png_xmp_packet(bytes),
1366 _ => None,
1367 }
1368}
1369
1370fn extract_png_xmp_packet(bytes: &[u8]) -> Option<Vec<u8>> {
1371 const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";
1372
1373 if bytes.len() < PNG_SIGNATURE.len() || &bytes[..PNG_SIGNATURE.len()] != PNG_SIGNATURE {
1374 return None;
1375 }
1376
1377 let mut offset = PNG_SIGNATURE.len();
1378 while offset + 12 <= bytes.len() {
1379 let length = u32::from_be_bytes([
1380 bytes[offset],
1381 bytes[offset + 1],
1382 bytes[offset + 2],
1383 bytes[offset + 3],
1384 ]) as usize;
1385 let chunk_start = offset + 8;
1386 let chunk_end = chunk_start + length;
1387 if chunk_end + 4 > bytes.len() {
1388 return None;
1389 }
1390
1391 let chunk_type = &bytes[offset + 4..offset + 8];
1392 if chunk_type == b"iTXt" {
1393 let data = &bytes[chunk_start..chunk_end];
1394 if let Some(xmp) = parse_png_itxt_xmp(data) {
1395 return Some(xmp);
1396 }
1397 }
1398
1399 offset = chunk_end + 4;
1400 }
1401
1402 None
1403}
1404
1405fn parse_png_itxt_xmp(data: &[u8]) -> Option<Vec<u8>> {
1406 const XMP_KEYWORD: &[u8] = b"XML:com.adobe.xmp";
1407
1408 let keyword_end = data.iter().position(|&b| b == 0)?;
1409 if &data[..keyword_end] != XMP_KEYWORD {
1410 return None;
1411 }
1412
1413 let mut cursor = keyword_end + 1;
1414 let compression_flag = *data.get(cursor)?;
1415 cursor += 1;
1416 let compression_method = *data.get(cursor)?;
1417 cursor += 1;
1418 if compression_flag > 1 || (compression_flag == 1 && compression_method != 0) {
1419 return None;
1420 }
1421
1422 let language_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1423 cursor = language_end + 1;
1424
1425 let translated_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1426 cursor = translated_end + 1;
1427
1428 let text_bytes = &data[cursor..];
1429 if compression_flag == 1 {
1430 let mut decoder = ZlibDecoder::new(text_bytes);
1431 let mut decoded = Vec::new();
1432 decoder.read_to_end(&mut decoded).ok()?;
1433 Some(decoded)
1434 } else {
1435 Some(text_bytes.to_vec())
1436 }
1437}
1438
1439fn parse_xmp_values(xmp: &[u8]) -> Vec<String> {
1440 let mut reader = XmlReader::from_reader(xmp);
1441 reader.config_mut().trim_text(true);
1442
1443 let mut buf = Vec::new();
1444 let mut stack: Vec<String> = Vec::new();
1445 let mut values = Vec::new();
1446
1447 loop {
1448 match reader.read_event_into(&mut buf) {
1449 Ok(Event::Start(e)) => {
1450 stack.push(local_xml_name(e.name().as_ref()));
1451 }
1452 Ok(Event::End(_)) => {
1453 stack.pop();
1454 }
1455 Ok(Event::Empty(_)) => {}
1456 Ok(Event::Text(text)) => {
1457 if let Some(field) = stack
1458 .iter()
1459 .rev()
1460 .find_map(|name| allowed_xmp_field(name.as_str()))
1461 && let Ok(decoded) = text.decode()
1462 {
1463 let decoded = decoded.into_owned();
1464 if !decoded.trim().is_empty() {
1465 values.push(format_xmp_value(field, &decoded));
1466 }
1467 }
1468 }
1469 Ok(Event::CData(text)) => {
1470 if let Some(field) = stack
1471 .iter()
1472 .rev()
1473 .find_map(|name| allowed_xmp_field(name.as_str()))
1474 && let Ok(decoded) = text.decode()
1475 {
1476 let decoded = decoded.into_owned();
1477 if !decoded.trim().is_empty() {
1478 values.push(format_xmp_value(field, &decoded));
1479 }
1480 }
1481 }
1482 Ok(Event::Eof) | Err(_) => break,
1483 _ => {}
1484 }
1485 buf.clear();
1486 }
1487
1488 values
1489}
1490
/// Returns the local part of a possibly prefixed XML name: everything after
/// the last `:`. Names that are not valid UTF-8 become the empty string.
fn local_xml_name(name: &[u8]) -> String {
    let Ok(qualified) = std::str::from_utf8(name) else {
        return String::new();
    };

    match qualified.rfind(':') {
        Some(colon) => qualified[colon + 1..].to_owned(),
        None => qualified.to_owned(),
    }
}
1495
/// Maps an XMP element's local name to the internal field label, or `None`
/// when the element is not one whose text is collected. Matching is
/// case-sensitive.
fn allowed_xmp_field(name: &str) -> Option<&'static str> {
    const FIELDS: [(&str, &str); 7] = [
        ("creator", "creator"),
        ("rights", "rights"),
        ("description", "description"),
        ("title", "title"),
        ("subject", "subject"),
        ("UsageTerms", "usage_terms"),
        ("WebStatement", "web_statement"),
    ];

    FIELDS
        .iter()
        .find(|(element, _)| *element == name)
        .map(|&(_, label)| label)
}
1508
/// Renders an XMP value for output: `creator` values get an `Author: `
/// prefix, every other field is returned unchanged.
fn format_xmp_value(field: &str, value: &str) -> String {
    if field == "creator" {
        format!("Author: {value}")
    } else {
        value.to_owned()
    }
}
1515
1516fn values_to_text(values: Vec<String>) -> String {
1517 let mut seen = BTreeSet::new();
1518 let mut lines = Vec::new();
1519 let mut total_bytes = 0usize;
1520
1521 for value in values {
1522 if lines.len() >= MAX_IMAGE_METADATA_VALUES {
1523 break;
1524 }
1525
1526 let normalized = normalize_metadata_value(&value);
1527 if normalized.is_empty() || !seen.insert(normalized.clone()) {
1528 continue;
1529 }
1530
1531 let added_bytes = normalized.len() + usize::from(!lines.is_empty());
1532 if total_bytes + added_bytes > MAX_IMAGE_METADATA_TEXT_BYTES {
1533 break;
1534 }
1535
1536 total_bytes += added_bytes;
1537 lines.push(normalized);
1538 }
1539
1540 lines.join("\n")
1541}
1542
/// Normalizes a metadata value: NUL characters are removed, then every run of
/// whitespace collapses to a single space and leading/trailing whitespace is
/// dropped.
fn normalize_metadata_value(value: &str) -> String {
    // NULs are removed before splitting, so text on both sides of a NUL is
    // joined rather than separated.
    let without_nuls = value.replace('\0', "");

    let mut normalized = String::with_capacity(without_nuls.len());
    for token in without_nuls.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(token);
    }
    normalized
}
1554
1555fn extract_pdf_text(path: &Path, bytes: &[u8]) -> (String, Option<String>) {
1556 if bytes.len() < 5 || &bytes[..5] != b"%PDF-" {
1557 return (String::new(), None);
1558 }
1559
1560 let mut failures = Vec::new();
1561 let mut saw_success = false;
1562
1563 let extracted = catch_unwind(AssertUnwindSafe(
1564 || -> Result<String, Box<dyn std::error::Error>> {
1565 let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1566 extract_first_pdf_page_text(&mut document)
1567 },
1568 ));
1569 match extracted {
1570 Ok(Ok(text)) => {
1571 saw_success = true;
1572 if let Some(normalized) = normalize_pdf_text(text) {
1573 return (normalized, None);
1574 }
1575 }
1576 Ok(Err(err)) => failures.push(format!("from-bytes first-page: {err}")),
1577 Err(payload) => failures.push(format!(
1578 "from-bytes first-page panic: {}",
1579 panic_payload_to_string(payload.as_ref())
1580 )),
1581 }
1582
1583 let extracted = catch_unwind(AssertUnwindSafe(
1584 || -> Result<String, Box<dyn std::error::Error>> {
1585 let mut document = pdf_oxide::document::PdfDocument::open(path)?;
1586 extract_pdf_text_from_document(&mut document)
1587 },
1588 ));
1589 match extracted {
1590 Ok(Ok(text)) => {
1591 saw_success = true;
1592 if let Some(normalized) = normalize_pdf_text(text) {
1593 return (normalized, None);
1594 }
1595 }
1596 Ok(Err(err)) => failures.push(format!("open full-document: {err}")),
1597 Err(payload) => failures.push(format!(
1598 "open full-document panic: {}",
1599 panic_payload_to_string(payload.as_ref())
1600 )),
1601 }
1602
1603 let extracted = catch_unwind(AssertUnwindSafe(
1604 || -> Result<String, Box<dyn std::error::Error>> {
1605 let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1606 extract_pdf_text_from_document(&mut document)
1607 },
1608 ));
1609 match extracted {
1610 Ok(Ok(text)) => {
1611 saw_success = true;
1612 if let Some(normalized) = normalize_pdf_text(text) {
1613 return (normalized, None);
1614 }
1615 }
1616 Ok(Err(err)) => failures.push(format!("from-bytes full-document: {err}")),
1617 Err(payload) => failures.push(format!(
1618 "from-bytes full-document panic: {}",
1619 panic_payload_to_string(payload.as_ref())
1620 )),
1621 }
1622
1623 if saw_success || is_non_actionable_pdf_failure(&failures) {
1624 (String::new(), None)
1625 } else {
1626 (
1627 String::new(),
1628 Some(format!(
1629 "PDF text extraction failed after {} attempts: {}",
1630 failures.len(),
1631 failures.join("; ")
1632 )),
1633 )
1634 }
1635}
1636
/// Returns `true` when there is at least one failure and every failure
/// message mentions either a password requirement or an invalid
/// cross-reference table.
fn is_non_actionable_pdf_failure(failures: &[String]) -> bool {
    const NON_ACTIONABLE_MARKERS: [&str; 2] =
        ["requires a password", "Invalid cross-reference table"];

    if failures.is_empty() {
        return false;
    }

    failures.iter().all(|failure| {
        NON_ACTIONABLE_MARKERS
            .iter()
            .any(|marker| failure.contains(*marker))
    })
}
1644
/// Turns a caught panic payload into a message string.
///
/// `&str` and `String` payloads are returned as-is; any other payload type
/// becomes `"unknown panic payload"`.
fn panic_payload_to_string(payload: &(dyn std::any::Any + Send)) -> String {
    payload
        .downcast_ref::<&str>()
        .map(|message| (*message).to_owned())
        .or_else(|| payload.downcast_ref::<String>().cloned())
        .unwrap_or_else(|| "unknown panic payload".to_owned())
}
1654
1655fn extract_first_pdf_page_text(
1656 document: &mut pdf_oxide::document::PdfDocument,
1657) -> Result<String, Box<dyn std::error::Error>> {
1658 if document.page_count()? == 0 {
1659 return Ok(String::new());
1660 }
1661
1662 let extracted_text = document.extract_text(0)?;
1663 let markdown_text =
1664 document.to_markdown(0, &pdf_oxide::converters::ConversionOptions::default())?;
1665 if pdf_markdown_heading_lines(&markdown_text).is_empty() {
1666 return Ok(extracted_text);
1667 }
1668
1669 let pipeline_text =
1670 document.to_plain_text(0, &pdf_oxide::converters::ConversionOptions::default())?;
1671
1672 Ok(merge_pdf_first_page_text(
1673 &extracted_text,
1674 &markdown_text,
1675 &pipeline_text,
1676 ))
1677}
1678
1679fn extract_pdf_text_from_document(
1680 document: &mut pdf_oxide::document::PdfDocument,
1681) -> Result<String, Box<dyn std::error::Error>> {
1682 Ok(document.to_plain_text_all(&pdf_oxide::converters::ConversionOptions::default())?)
1683}
1684
/// Replaces every carriage return and form feed with a newline and returns
/// the result, or `None` when nothing but whitespace remains.
fn normalize_pdf_text(text: String) -> Option<String> {
    let normalized: String = text
        .chars()
        .map(|ch| match ch {
            '\r' | '\u{0c}' => '\n',
            other => other,
        })
        .collect();

    if normalized.trim().is_empty() {
        None
    } else {
        Some(normalized)
    }
}
1689
1690fn merge_pdf_first_page_text(
1691 _extracted_text: &str,
1692 markdown_text: &str,
1693 pipeline_text: &str,
1694) -> String {
1695 let pipeline = pipeline_text.trim();
1696 if pipeline.is_empty() {
1697 return String::new();
1698 }
1699
1700 let prefix = pdf_first_page_heading_prefix(markdown_text);
1701 let Some(prefix) = prefix else {
1702 return pipeline_text.to_string();
1703 };
1704
1705 if pdf_text_contains_heading_prefix(pipeline, &prefix) {
1706 pipeline_text.to_string()
1707 } else {
1708 format!("{prefix}\n\n{pipeline}")
1709 }
1710}
1711
1712fn pdf_text_contains_heading_prefix(text: &str, prefix: &str) -> bool {
1713 normalize_pdf_heading_comparison_text(text)
1714 .contains(&normalize_pdf_heading_comparison_text(prefix))
1715}
1716
/// Produces a comparison key for heading text: whitespace runs collapse to a
/// single space, surrounding whitespace is dropped, and ASCII letters are
/// lowercased (non-ASCII characters are left as they are).
fn normalize_pdf_heading_comparison_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());

    for word in text.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.extend(word.chars().map(|ch| ch.to_ascii_lowercase()));
    }

    normalized
}
1723
1724fn pdf_first_page_heading_prefix(markdown_text: &str) -> Option<String> {
1725 let mut lines = Vec::new();
1726
1727 for line in pdf_markdown_heading_lines(markdown_text) {
1728 push_unique_line(&mut lines, line);
1729 }
1730
1731 (!lines.is_empty()).then(|| lines.join("\n"))
1732}
1733
1734fn pdf_markdown_heading_lines(text: &str) -> Vec<String> {
1735 text.lines()
1736 .map(str::trim)
1737 .filter_map(|line| line.strip_prefix('#').map(str::trim_start))
1738 .map(|line| line.trim_matches('#').trim())
1739 .filter(|line| !line.is_empty())
1740 .filter(|line| !looks_like_numbered_section_heading(line))
1741 .take(4)
1742 .map(ToOwned::to_owned)
1743 .collect()
1744}
1745
/// Appends `line` to `lines` unless an equal entry is already present.
fn push_unique_line(lines: &mut Vec<String>, line: String) {
    if lines.contains(&line) {
        return;
    }
    lines.push(line);
}
1751
/// Reports whether `line` starts like a numbered section heading: one or more
/// ASCII digits immediately followed by a `.` (for example `1. Scope`,
/// `10. Termination` or `1.2 Definitions`).
///
/// Only a single leading digit used to be accepted, so sections numbered 10
/// and above were not recognised.
fn looks_like_numbered_section_heading(line: &str) -> bool {
    let after_digits = line.trim_start_matches(|ch: char| ch.is_ascii_digit());

    // At least one digit must have been consumed before the dot.
    let consumed_digits = after_digits.len() < line.len();
    consumed_digits && after_digits.starts_with('.')
}
1764
/// Reports whether `bytes` starts with one of the three `PK` signatures
/// checked here: `PK\x03\x04`, `PK\x05\x06` or `PK\x07\x08`.
fn is_zip_archive(bytes: &[u8]) -> bool {
    matches!(
        bytes,
        [b'P', b'K', 0x03, 0x04, ..] | [b'P', b'K', 0x05, 0x06, ..] | [b'P', b'K', 0x07, 0x08, ..]
    )
}
1770
/// Extracts printable strings from binary data, one string per line.
///
/// Three passes run over `bytes`, each appending to the same output:
///
/// 1. runs of single-byte printable ASCII (`0x20..=0x7E`);
/// 2. runs of 16-bit units laid out as `ch 00` (UTF-16LE);
/// 3. runs of 16-bit units laid out as `00 ch` (UTF-16BE).
///
/// Only runs of at least four characters are kept. Both wide passes read
/// two-byte units from offset 0 and ignore a trailing odd byte; wide text
/// that sits at an odd offset is picked up by the opposite pass, at worst
/// missing one boundary character.
///
/// Output is capped at the input length clamped to 2,000,000..=16,000,000
/// bytes. The cap is checked after each flushed run, so the result may
/// overshoot it by the length of the final run.
///
/// The second wide pass previously started at offset 1 *and* swapped the
/// byte roles, which made it rescan the `ch 00` layout one unit later. That
/// emitted every little-endian string a second time without its first
/// character and never extracted `00 ch` text.
pub fn extract_printable_strings(bytes: &[u8]) -> String {
    const MIN_LEN: usize = 4;
    const MIN_OUTPUT_BYTES: usize = 2_000_000;
    const MAX_OUTPUT_BYTES_CAP: usize = 16_000_000;

    fn is_printable_ascii(b: u8) -> bool {
        matches!(b, 0x20..=0x7E)
    }

    let max_output_bytes = bytes.len().clamp(MIN_OUTPUT_BYTES, MAX_OUTPUT_BYTES_CAP);

    let mut out = String::new();
    let mut run: Vec<u8> = Vec::new();

    // Appends the pending run as a new line if it is long enough, then resets it.
    let flush_run = |out: &mut String, run: &mut Vec<u8>| {
        if run.len() >= MIN_LEN {
            if !out.is_empty() {
                out.push('\n');
            }
            // `run` only ever holds printable ASCII, so a byte-to-char map is exact.
            out.extend(run.iter().copied().map(char::from));
        }
        run.clear();
    };

    // Pass 1: single-byte ASCII.
    for &b in bytes {
        if is_printable_ascii(b) {
            run.push(b);
        } else {
            flush_run(&mut out, &mut run);
            if out.len() >= max_output_bytes {
                return out;
            }
        }
    }
    flush_run(&mut out, &mut run);
    if out.len() >= max_output_bytes {
        return out;
    }

    // Passes 2 and 3: 16-bit units, little-endian first, then big-endian.
    for little_endian in [true, false] {
        for unit in bytes.chunks_exact(2) {
            let (ch, zero) = if little_endian {
                (unit[0], unit[1])
            } else {
                (unit[1], unit[0])
            };

            if is_printable_ascii(ch) && zero == 0 {
                run.push(ch);
            } else {
                flush_run(&mut out, &mut run);
                if out.len() >= max_output_bytes {
                    return out;
                }
            }
        }
        flush_run(&mut out, &mut run);
        if out.len() >= max_output_bytes {
            return out;
        }
    }

    out
}
1835
1836#[cfg(test)]
1837mod tests {
1838 use std::path::Path;
1839
1840 use super::{
1841 ExtractedTextKind, LARGE_OPAQUE_BINARY_SKIP_BYTES, classify_file_info,
1842 extract_printable_strings, extract_text_for_detection,
1843 extract_text_for_detection_with_diagnostics, is_non_actionable_pdf_failure,
1844 normalize_mime_type, normalize_pdf_heading_comparison_text,
1845 };
1846
1847 #[test]
1848 fn test_extract_text_for_detection_skips_jar_archives() {
1849 let path = Path::new(
1850 "testdata/license-golden/datadriven/lic1/do-not_detect-licenses-in-archive.jar",
1851 );
1852 let bytes = std::fs::read(path).expect("failed to read jar fixture");
1853
1854 let (text, kind) = extract_text_for_detection(path, &bytes);
1855
1856 assert!(text.is_empty());
1857 assert_eq!(kind, ExtractedTextKind::None);
1858 }
1859
1860 #[test]
1861 fn test_extract_text_for_detection_reads_pdf_fixture_text() {
1862 let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
1863 let bytes = std::fs::read(path).expect("failed to read pdf fixture");
1864
1865 let (text, kind) = extract_text_for_detection(path, &bytes);
1866
1867 assert_eq!(kind, ExtractedTextKind::Pdf);
1868 assert!(text.contains("Redistribution and use in source and binary forms"));
1869 }
1870
1871 #[test]
1872 fn test_extract_text_for_detection_prefers_first_pdf_page_before_full_document() {
1873 let path =
1874 Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
1875 let bytes = std::fs::read(path).expect("failed to read pdf fixture");
1876
1877 let (text, kind) = extract_text_for_detection(path, &bytes);
1878
1879 assert_eq!(kind, ExtractedTextKind::Pdf);
1880 assert!(text.contains("SUN INDUSTRY STANDARDS SOURCE LICENSE"));
1881 assert!(!text.contains("DISCLAIMER OF WARRANTY"));
1882 }
1883
1884 #[test]
1885 fn test_extract_text_for_detection_does_not_duplicate_pdf_heading_prefix() {
1886 let path =
1887 Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
1888 let bytes = std::fs::read(path).expect("failed to read pdf fixture");
1889
1890 let (text, kind) = extract_text_for_detection(path, &bytes);
1891
1892 assert_eq!(kind, ExtractedTextKind::Pdf);
1893
1894 let normalized = normalize_pdf_heading_comparison_text(&text);
1895 let heading =
1896 normalize_pdf_heading_comparison_text("SUN INDUSTRY STANDARDS SOURCE LICENSE");
1897 assert_eq!(normalized.matches(&heading).count(), 1);
1898 }
1899
1900 #[test]
1901 fn test_extract_text_for_detection_reads_pdf_fixture_without_pdf_extension() {
1902 let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
1903 let bytes = std::fs::read(path).expect("failed to read pdf fixture");
1904
1905 let (text, kind) = extract_text_for_detection(Path::new("renamed.bin"), &bytes);
1906
1907 assert_eq!(kind, ExtractedTextKind::Pdf);
1908 assert!(text.contains("Redistribution and use in source and binary forms"));
1909 }
1910
1911 #[test]
1912 fn test_extract_text_for_detection_reports_terminal_pdf_failure() {
1913 let malformed = b"%PDF-1.7\nthis is not a valid pdf object graph\n";
1914
1915 let (text, kind, scan_error) =
1916 extract_text_for_detection_with_diagnostics(Path::new("broken.pdf"), malformed);
1917
1918 assert!(text.is_empty());
1919 assert_eq!(kind, ExtractedTextKind::None);
1920 let scan_error = scan_error.expect("terminal pdf failure should be surfaced");
1921 assert!(scan_error.contains("PDF text extraction failed after"));
1922 }
1923
1924 #[test]
1925 fn test_extract_text_for_detection_skips_large_opaque_binary_blobs() {
1926 let bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
1927
1928 let (text, kind) = extract_text_for_detection(Path::new("model.bin"), &bytes);
1929
1930 assert!(text.is_empty());
1931 assert_eq!(kind, ExtractedTextKind::None);
1932 }
1933
1934 #[test]
1935 fn test_extract_text_for_detection_keeps_large_binaries_with_promising_strings() {
1936 let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
1937 let text = b"Copyright 2026 Example Project!!!";
1938 bytes[..text.len()].copy_from_slice(text);
1939 let second_offset = LARGE_OPAQUE_BINARY_SKIP_BYTES / 2;
1940 bytes[second_offset..second_offset + text.len()].copy_from_slice(text);
1941
1942 let (text, kind) = extract_text_for_detection(Path::new("weights.bin"), &bytes);
1943
1944 assert_ne!(kind, ExtractedTextKind::None);
1945 assert!(text.contains("Copyright 2026 Example Project"));
1946 }
1947
1948 #[test]
1949 fn test_extract_text_for_detection_uses_windows_executable_metadata() {
1950 let path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
1951 let bytes = std::fs::read(path).expect("read PE fixture");
1952
1953 let (text, kind) = extract_text_for_detection(path, &bytes);
1954
1955 assert_eq!(kind, ExtractedTextKind::BinaryStrings);
1956 assert!(text.contains("License: This program is free software"));
1957 assert!(text.contains("LegalCopyright:"));
1958 }
1959
1960 #[test]
1961 fn test_extract_text_for_detection_skips_large_binary_with_single_isolated_string_run() {
1962 let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
1963 let text = b"Copyright 2026 Example Project!!!";
1964 bytes[..text.len()].copy_from_slice(text);
1965
1966 let (text, kind) = extract_text_for_detection(Path::new("opaque.bin"), &bytes);
1967
1968 assert!(text.is_empty());
1969 assert_eq!(kind, ExtractedTextKind::None);
1970 }
1971
1972 #[test]
1973 fn test_non_actionable_pdf_failures_are_suppressed() {
1974 assert!(is_non_actionable_pdf_failure(&[
1975 "from-bytes first-page: PDF is encrypted and requires a password".to_string(),
1976 "open full-document: PDF is encrypted and requires a password".to_string(),
1977 ]));
1978 assert!(is_non_actionable_pdf_failure(&[
1979 "from-bytes first-page: Invalid cross-reference table".to_string(),
1980 "open full-document: Invalid cross-reference table".to_string(),
1981 ]));
1982 assert!(!is_non_actionable_pdf_failure(&[
1983 "from-bytes first-page: some other parser failure".to_string(),
1984 ]));
1985 }
1986
1987 #[test]
1988 fn test_extract_text_for_detection_skips_zip_like_archives() {
1989 let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00\x08\x00artifact";
1990
1991 let (whl_text, whl_kind) = extract_text_for_detection(Path::new("demo.whl"), zip_bytes);
1992 let (crate_text, crate_kind) =
1993 extract_text_for_detection(Path::new("demo.crate"), zip_bytes);
1994
1995 assert!(whl_text.is_empty());
1996 assert_eq!(whl_kind, ExtractedTextKind::None);
1997 assert!(crate_text.is_empty());
1998 assert_eq!(crate_kind, ExtractedTextKind::None);
1999 }
2000
2001 #[test]
2002 fn test_extract_text_for_detection_keeps_binary_strings_for_lib_fixtures() {
2003 let path =
2004 Path::new("testdata/copyright-golden/copyrights/copyright_php_lib-php_embed_lib.lib");
2005 let bytes = std::fs::read(path).expect("failed to read lib fixture");
2006
2007 let (text, kind) = extract_text_for_detection(path, &bytes);
2008
2009 assert_ne!(kind, ExtractedTextKind::None);
2010 assert!(text.contains("Copyright nexB and others (c) 2012"));
2011 }
2012
2013 #[test]
2014 fn test_extract_text_for_detection_reads_font_metadata() {
2015 let path = Path::new("testdata/font-fixtures/Lato-Bold.ttf");
2016 let bytes = std::fs::read(path).expect("failed to read font fixture");
2017
2018 let (text, kind) = extract_text_for_detection(path, &bytes);
2019
2020 assert_eq!(kind, ExtractedTextKind::FontMetadata);
2021 assert!(text.contains("License Description:"), "{text}");
2022 assert!(
2023 text.contains("Open Font License") || text.contains("OFL"),
2024 "{text}"
2025 );
2026 }
2027
2028 #[test]
2029 fn test_extract_printable_strings_scales_cap_for_medium_binary_files() {
2030 let bytes = b"abcd\0".repeat(525_000);
2031
2032 let text = extract_printable_strings(&bytes);
2033
2034 assert!(
2035 text.len() > 2_000_000,
2036 "unexpected truncation at {}",
2037 text.len()
2038 );
2039 assert!(text.ends_with("abcd"));
2040 }
2041
2042 #[test]
2043 fn test_extract_text_for_detection_decodes_svg_fixture_text() {
2044 let path = Path::new(
2045 "testdata/license-golden/datadriven/external/fossology-tests/Public-domain/biohazard.svg",
2046 );
2047 let bytes = std::fs::read(path).expect("failed to read svg fixture");
2048
2049 let (text, kind) = extract_text_for_detection(path, &bytes);
2050
2051 assert_eq!(kind, ExtractedTextKind::Decoded);
2052 assert!(text.contains("creativecommons.org/licenses/publicdomain"));
2053 }
2054
2055 #[test]
2056 fn test_extract_text_for_detection_decodes_rtf_fixture_text() {
2057 let path = Path::new(
2058 "testdata/license-golden/datadriven/external/fossology-tests/LGPL/License.rtf",
2059 );
2060 let bytes = std::fs::read(path).expect("failed to read rtf fixture");
2061
2062 let (text, kind) = extract_text_for_detection(path, &bytes);
2063
2064 assert_eq!(kind, ExtractedTextKind::Decoded);
2065 assert!(text.contains("GNU Lesser General Public"));
2066 assert!(text.contains("version"));
2067 assert!(text.contains("2.1 of the License"));
2068 }
2069
2070 #[test]
2071 fn test_normalize_mime_type_prefers_text_for_textual_video_guess() {
2072 assert_eq!(
2073 normalize_mime_type(
2074 Path::new("main.ts"),
2075 b"export const answer = 42;\n",
2076 Some("TypeScript"),
2077 "video/mp2t",
2078 ),
2079 "text/plain"
2080 );
2081 }
2082
2083 #[test]
2084 fn test_normalize_mime_type_prefers_text_for_octet_stream_source_guess() {
2085 assert_eq!(
2086 normalize_mime_type(
2087 Path::new("main.js"),
2088 b"console.log('hello');\n",
2089 Some("JavaScript"),
2090 "application/octet-stream",
2091 ),
2092 "text/plain"
2093 );
2094 }
2095
2096 #[test]
2097 fn test_normalize_mime_type_preserves_binary_video_guess() {
2098 assert_eq!(
2099 normalize_mime_type(
2100 Path::new("main.ts"),
2101 &[0, 159, 146, 150, 0, 1, 2, 3],
2102 Some("TypeScript"),
2103 "video/mp2t",
2104 ),
2105 "video/mp2t"
2106 );
2107 }
2108
2109 #[test]
2110 fn test_normalize_mime_type_preserves_short_binary_octet_stream_guess() {
2111 assert_eq!(
2112 normalize_mime_type(
2113 Path::new("main.ts"),
2114 &[0, 159, 146, 150],
2115 Some("TypeScript"),
2116 "application/octet-stream",
2117 ),
2118 "application/octet-stream"
2119 );
2120 }
2121
2122 #[test]
2123 fn test_classify_file_info_marks_empty_files_as_text_not_source() {
2124 let classification = classify_file_info(Path::new("test.txt"), b"");
2125
2126 assert_eq!(classification.mime_type, "inode/x-empty");
2127 assert_eq!(classification.file_type, "empty");
2128 assert!(!classification.is_binary);
2129 assert!(classification.is_text);
2130 assert!(!classification.is_source);
2131 assert_eq!(classification.programming_language, None);
2132 }
2133
2134 #[test]
2135 fn test_classify_file_info_keeps_json_out_of_programming_language() {
2136 let classification = classify_file_info(Path::new("package.json"), br#"{"name":"demo"}"#);
2137
2138 assert_eq!(classification.mime_type, "application/json");
2139 assert_eq!(classification.file_type, "JSON text data");
2140 assert!(classification.is_text);
2141 assert!(!classification.is_source);
2142 assert_eq!(classification.programming_language, None);
2143 }
2144
2145 #[test]
2146 fn test_classify_file_info_treats_dockerfile_as_source() {
2147 let classification = classify_file_info(Path::new("Dockerfile"), b"FROM scratch\n");
2148
2149 assert_eq!(
2150 classification.programming_language.as_deref(),
2151 Some("Dockerfile")
2152 );
2153 assert!(classification.is_source);
2154 assert!(!classification.is_script);
2155 assert_eq!(classification.file_type, "UTF-8 Unicode text");
2156 }
2157
2158 #[test]
2159 fn test_classify_file_info_treats_makefile_as_text_not_source() {
2160 let classification = classify_file_info(Path::new("Makefile"), b"all:\n\techo hi\n");
2161
2162 assert_eq!(classification.programming_language, None);
2163 assert!(classification.is_text);
2164 assert!(!classification.is_source);
2165 assert!(!classification.is_script);
2166 assert_eq!(classification.file_type, "UTF-8 Unicode text");
2167 }
2168
2169 #[test]
2170 fn test_classify_file_info_marks_supported_package_archives() {
2171 let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00";
2172
2173 let egg = classify_file_info(Path::new("demo.egg"), zip_bytes);
2174 let nupkg = classify_file_info(Path::new("demo.nupkg"), zip_bytes);
2175
2176 assert!(egg.is_archive);
2177 assert_eq!(egg.mime_type, "application/zip");
2178 assert_eq!(egg.file_type, "Zip archive data");
2179 assert!(nupkg.is_archive);
2180 assert_eq!(nupkg.mime_type, "application/zip");
2181 assert_eq!(nupkg.file_type, "Zip archive data");
2182 }
2183
2184 #[test]
2185 fn test_classify_file_info_marks_png_as_binary_media() {
2186 let png_bytes = b"\x89PNG\r\n\x1a\n\x00\x00\x00\x0dIHDR";
2187
2188 let classification = classify_file_info(Path::new("logo.png"), png_bytes);
2189
2190 assert_eq!(classification.mime_type, "image/png");
2191 assert_eq!(classification.file_type, "PNG image data");
2192 assert!(classification.is_binary);
2193 assert!(!classification.is_text);
2194 assert!(classification.is_media);
2195 assert!(!classification.is_archive);
2196 assert!(!classification.is_source);
2197 }
2198
/// Content beginning with a `%PDF-` header under a `.pdf` name must be
/// reported as a binary PDF document that is not text, archive or media.
#[test]
fn test_classify_file_info_marks_pdf_as_binary_document() {
    let pdf_prefix = b"%PDF-1.7\n1 0 obj\n<< /Type /Catalog >>\n";
    let info = classify_file_info(Path::new("report.pdf"), pdf_prefix);

    assert_eq!(info.file_type, "PDF document");
    assert_eq!(info.mime_type, "application/pdf");

    assert!(info.is_binary);
    assert!(!info.is_media);
    assert!(!info.is_archive);
    assert!(!info.is_text);
}
2212
/// An opaque byte blob containing NUL bytes must classify as binary, with
/// no language detected and neither the text nor the source flag set.
#[test]
fn test_classify_file_info_marks_binary_blobs_as_binary() {
    let blob: [u8; 10] = [0, 159, 146, 150, 0, 1, 2, 3, 4, 5];

    let info = classify_file_info(Path::new("blob.bin"), &blob);

    assert!(info.programming_language.is_none());
    assert!(info.is_binary);
    assert!(!info.is_source);
    assert!(!info.is_text);
}
2223
/// A `.yaml` file is text labelled "YAML text data"; it must not be given a
/// programming language or be counted as source.
#[test]
fn test_classify_file_info_treats_yaml_as_text_not_source() {
    let yaml_path = Path::new("config.yaml");
    let info = classify_file_info(yaml_path, b"key: value\n");

    assert_eq!(info.file_type, "YAML text data");
    assert!(info.is_text);
    assert!(!info.is_source);
    assert!(info.programming_language.is_none());
}
2233
/// Covers three manifest-style files: `build.gradle` (Groovy source),
/// `flake.nix` (Nix source) and `.gitmodules` (Git configuration text that
/// is not source).
#[test]
fn test_classify_file_info_classifies_common_build_manifests() {
    // Small adapter so each case reads as "name, contents".
    let classify = |name: &str, contents: &[u8]| classify_file_info(Path::new(name), contents);

    // Gradle build script.
    let gradle = classify("build.gradle", b"plugins { id 'java' }\n");
    assert_eq!(gradle.programming_language.as_deref(), Some("Groovy"));
    assert!(gradle.is_source);
    assert_eq!(gradle.mime_type, "text/plain");

    // Nix flake.
    let flake = classify("flake.nix", b"{ inputs, ... }: {}\n");
    assert_eq!(flake.programming_language.as_deref(), Some("Nix"));
    assert!(flake.is_source);
    assert_eq!(flake.mime_type, "text/plain");

    // Git submodule configuration.
    let gitmodules = classify(
        ".gitmodules",
        b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
    );
    assert!(gitmodules.programming_language.is_none());
    assert!(gitmodules.is_text);
    assert!(!gitmodules.is_source);
    assert_eq!(gitmodules.file_type, "Git configuration text");
}
2256
/// An extensionless file whose first line is a `node` shebang must be
/// labelled as a JavaScript script with the executable-text type string.
#[test]
fn test_classify_file_info_labels_javascript_shebang_scripts() {
    let script_path = Path::new("bin/run");
    let script_body = b"#!/usr/bin/env node\nconsole.log('hello');\n";

    let info = classify_file_info(script_path, script_body);
    let language = info.programming_language.as_deref();

    assert_eq!(language, Some("JavaScript"));
    assert!(info.is_script);
    assert_eq!(
        info.file_type,
        "javascript script, UTF-8 Unicode text executable"
    );
}
2274
/// A `.py` file containing a lone `0xE9` byte (not valid UTF-8) must still
/// be recognised as a Python script, with a type label that omits the
/// "UTF-8 Unicode" wording.
#[test]
fn test_classify_file_info_uses_non_utf8_text_labels_for_latin1_scripts() {
    let latin1_source = b"# coding: latin-1\nprint(\"caf\xe9\")\n";

    let info = classify_file_info(Path::new("script.py"), latin1_source);
    let language = info.programming_language.as_deref();

    assert_eq!(language, Some("Python"));
    assert!(info.is_script);
    assert_eq!(info.file_type, "python script, text executable");
}
2289
/// A `.tga` file whose bytes are plain ASCII text is expected to be flagged
/// as media and as text at the same time, and not as binary.
#[test]
fn test_classify_file_info_treats_textual_tga_as_media() {
    let fake_tga = b"not really a tga\n";
    let info = classify_file_info(Path::new("texture.tga"), fake_tga);

    assert!(info.is_text);
    assert!(info.is_media);
    assert!(!info.is_binary);
}
2298
/// A source-looking extension (`.ts`) must not override binary-looking
/// content: the result is binary, with no language and no source/text flags.
#[test]
fn test_classify_file_info_keeps_binaryish_source_extension_out_of_text_path() {
    let high_bytes: [u8; 6] = [0x80, 0x81, 0x82, 0x83, 0x84, 0x85];

    let info = classify_file_info(Path::new("main.ts"), &high_bytes);

    assert!(info.programming_language.is_none());
    assert!(info.is_binary);
    assert!(!info.is_source);
    assert!(!info.is_text);
}
2309
/// For a tiny GIF, `extract_text_for_detection` is expected to return empty
/// text together with `ExtractedTextKind::None`.
#[test]
fn test_extract_text_for_detection_skips_unsupported_image_formats() {
    let tiny_gif = b"GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff,\x00\x00\x00\x00\x01\x00\x01\x00\x00\x02\x02D\x01\x00;";
    let gif_path = Path::new("tiny.gif");

    let (extracted_text, extracted_kind) = extract_text_for_detection(gif_path, tiny_gif);

    assert!(extracted_text.is_empty());
    assert_eq!(extracted_kind, ExtractedTextKind::None);
}
2319
/// Table-driven check of language detection across a shebang script, a
/// `Dockerfile`, and three data/build files that must not be given a language.
///
/// Each row is: file name, file contents, expected language, expected
/// `is_source`, expected `is_script`.
#[test]
fn test_classify_file_info_preserves_language_detection_precedence_matrix() {
    let matrix = [
        (
            "bin/run",
            b"#!/usr/bin/env node\nconsole.log('hello');\n".as_slice(),
            Some("JavaScript"),
            true,
            true,
        ),
        (
            "Dockerfile",
            b"FROM scratch\n".as_slice(),
            Some("Dockerfile"),
            true,
            false,
        ),
        (
            "package.json",
            br#"{"name":"demo"}"#.as_slice(),
            None,
            false,
            false,
        ),
        (
            "config.yaml",
            b"key: value\n".as_slice(),
            None,
            false,
            false,
        ),
        (
            "Makefile",
            b"all:\n\techo hi\n".as_slice(),
            None,
            false,
            false,
        ),
    ];

    for (file_name, contents, want_language, want_source, want_script) in matrix {
        let path = Path::new(file_name);
        let info = classify_file_info(path, contents);

        // Every assertion names the offending path so a failing row is easy to spot.
        assert_eq!(
            info.programming_language.as_deref(),
            want_language,
            "unexpected language for {}",
            path.display()
        );
        assert_eq!(
            info.is_source,
            want_source,
            "unexpected is_source for {}",
            path.display()
        );
        assert_eq!(
            info.is_script,
            want_script,
            "unexpected is_script for {}",
            path.display()
        );
    }
}
2383}