1use std::collections::BTreeSet;
2use std::fs;
3use std::io::{BufReader, Cursor, Read};
4use std::panic::{AssertUnwindSafe, catch_unwind};
5use std::path::Path;
6
7use chrono::{TimeZone, Utc};
8use content_inspector::{ContentType, inspect};
9use file_format::{FileFormat, Kind as FileFormatKind};
10use flate2::read::ZlibDecoder;
11use glob::Pattern;
12use image::{ImageDecoder, ImageFormat, ImageReader};
13use mime_guess::from_path;
14use quick_xml::events::Event;
15use quick_xml::reader::Reader as XmlReader;
16
17use crate::utils::language::detect_language;
18
/// Describes how the text returned by `extract_text_for_detection` was obtained.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtractedTextKind {
    /// Nothing usable was extracted; the accompanying string is empty.
    None,
    /// The raw bytes were turned into a string by `decode_bytes_to_string`.
    Decoded,
    /// The text came out of a PDF document.
    Pdf,
    /// The text is a collection of printable strings found in undecodable bytes.
    BinaryStrings,
    /// The text was assembled from image metadata values.
    ImageMetadata,
}
27
/// Result of `classify_file_info`: the MIME type, a human-readable type label
/// and a set of boolean category flags for one file.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileInfoClassification {
    /// MIME type as computed by `detect_mime_type`.
    pub mime_type: String,
    /// Human-readable description as computed by `detect_file_type`.
    pub file_type: String,
    /// Detected language; only populated when `is_source` is true.
    pub programming_language: Option<String>,
    /// True when the content was judged to be binary.
    pub is_binary: bool,
    /// Always the negation of `is_binary`.
    pub is_text: bool,
    /// True for archives, compressed streams and packages (never for text).
    pub is_archive: bool,
    /// True for image, audio or video content.
    pub is_media: bool,
    /// True for textual source code.
    pub is_source: bool,
    /// True for textual scripts (shebang, script extension or script language).
    pub is_script: bool,
}
40
/// Upper bound on the number of metadata lines kept by `values_to_text`.
const MAX_IMAGE_METADATA_VALUES: usize = 64;

/// Upper bound on the total size (in bytes, separators included) of the text
/// produced by `values_to_text`.
const MAX_IMAGE_METADATA_TEXT_BYTES: usize = 32 * 1024;

/// Lower-case extensions that `is_plain_text` treats as plain text / data
/// rather than source code.
const PLAIN_TEXT_EXTENSIONS: &[&str] = &[
    "rst", "rest", "md", "txt", "log", // prose and logs
    "json", "xml", "yaml", "yml", "toml", "ini", // structured data / config
];

/// Lower-case extensions that `detect_is_binary` always classifies as binary.
const BINARY_EXTENSIONS: &[&str] = &[
    "pyc", "pyo", // compiled Python
    "pgm", "pbm", "ppm", // Netpbm images
    "mp3", "mp4", "mpeg", "mpg", // audio / video
    "emf",
];

/// Lower-case extensions that `detect_is_archive` accepts as archives,
/// compressed files or packages.
const ARCHIVE_EXTENSIONS: &[&str] = &[
    "zip", "jar", "war", "ear", "tar", "gz", "tgz", "bz2", "xz", "7z", "rar", // archives
    "apk", "deb", "rpm", "whl", "crate", "egg", "gem", "nupkg", // packages
    "sqs", "squashfs", // filesystem images
];
53
54pub fn get_creation_date(metadata: &fs::Metadata) -> Option<String> {
56 metadata.modified().ok().map(|time: std::time::SystemTime| {
57 let seconds_since_epoch = time
58 .duration_since(std::time::UNIX_EPOCH)
59 .unwrap()
60 .as_secs() as i64;
61
62 Utc.timestamp_opt(seconds_since_epoch, 0)
63 .single()
64 .unwrap_or_else(Utc::now)
65 .format("%Y-%m-%d")
66 .to_string()
67 })
68}
69
70pub fn is_path_excluded(path: &Path, exclude_patterns: &[Pattern]) -> bool {
72 let path_str = path.to_string_lossy();
73 let file_name = path
74 .file_name()
75 .map(|name| name.to_string_lossy())
76 .unwrap_or_default();
77
78 for pattern in exclude_patterns {
79 if pattern.matches(&path_str) {
81 return true;
82 }
83
84 if pattern.matches(&file_name) {
86 return true;
87 }
88 }
89
90 false
91}
92
/// Decodes `bytes` into a `String`, returning an empty string when the input
/// looks like binary data.
///
/// Valid UTF-8 is returned unchanged. Otherwise every byte is mapped to the
/// Unicode code point with the same value, unless more than a tenth of the
/// bytes are control characters other than tab, LF, VT, FF and CR, in which
/// case the result is empty.
pub fn decode_bytes_to_string(bytes: &[u8]) -> String {
    if let Ok(text) = std::str::from_utf8(bytes) {
        return text.to_owned();
    }

    // Control bytes outside 0x09..=0x0D are a strong hint of binary content.
    let is_suspicious_control = |byte: u8| byte < 0x09 || (0x0E..0x20).contains(&byte);
    let suspicious = bytes
        .iter()
        .copied()
        .filter(|&byte| is_suspicious_control(byte))
        .count();
    if suspicious > bytes.len() / 10 {
        return String::new();
    }

    bytes.iter().copied().map(char::from).collect()
}
115
116pub fn extract_text_for_detection(path: &Path, bytes: &[u8]) -> (String, ExtractedTextKind) {
117 let ext = path
118 .extension()
119 .and_then(|e| e.to_str())
120 .map(|s| s.to_ascii_lowercase());
121 let detected_format = detect_file_format(bytes);
122
123 if looks_like_pdf(bytes) || detected_format.short_name() == Some("PDF") {
124 let text = extract_pdf_text(path, bytes);
125 return if text.is_empty() {
126 (String::new(), ExtractedTextKind::None)
127 } else {
128 (text, ExtractedTextKind::Pdf)
129 };
130 }
131
132 if let Some(format) = supported_image_metadata_format(ext.as_deref(), detected_format) {
133 let text = extract_image_metadata_text(bytes, format);
134 return if text.is_empty() {
135 if is_supported_image_container(bytes, format) {
136 (String::new(), ExtractedTextKind::None)
137 } else {
138 let decoded = decode_bytes_to_string(bytes);
139 if decoded.is_empty() {
140 (String::new(), ExtractedTextKind::None)
141 } else {
142 (decoded, ExtractedTextKind::Decoded)
143 }
144 }
145 } else {
146 (text, ExtractedTextKind::ImageMetadata)
147 };
148 }
149
150 if should_skip_binary_string_extraction(path, bytes, detected_format) {
151 return (String::new(), ExtractedTextKind::None);
152 }
153
154 let decoded = decode_bytes_to_string(bytes);
155 if !decoded.is_empty() {
156 return (decoded, ExtractedTextKind::Decoded);
157 }
158
159 let text = extract_printable_strings(bytes);
160 if text.is_empty() {
161 (String::new(), ExtractedTextKind::None)
162 } else {
163 (text, ExtractedTextKind::BinaryStrings)
164 }
165}
166
167pub fn classify_file_info(path: &Path, bytes: &[u8]) -> FileInfoClassification {
168 let detected_format = detect_file_format(bytes);
169 let detected_language = detect_language(path, bytes);
170 let is_binary = detect_is_binary(path, bytes, detected_format, detected_language.as_deref());
171 let is_text = !is_binary;
172 let mime_type = detect_mime_type(path, bytes, detected_format, detected_language.as_deref());
173 let is_archive = detect_is_archive(path, bytes, &mime_type, is_text, detected_format);
174 let is_media = detect_is_media(path, bytes, &mime_type, detected_format);
175 let is_script = detect_is_script(path, bytes, detected_language.as_deref(), is_text);
176 let is_source = detect_is_source(path, detected_language.as_deref(), is_text, is_script);
177 let programming_language = is_source.then(|| detected_language.clone()).flatten();
178 let file_type = detect_file_type(
179 path,
180 bytes,
181 detected_format,
182 &mime_type,
183 programming_language.as_deref(),
184 is_binary,
185 is_text,
186 is_archive,
187 is_media,
188 is_script,
189 );
190
191 FileInfoClassification {
192 mime_type,
193 file_type,
194 programming_language,
195 is_binary,
196 is_text,
197 is_archive,
198 is_media,
199 is_source,
200 is_script,
201 }
202}
203
204fn detect_file_format(bytes: &[u8]) -> FileFormat {
205 FileFormat::from_reader(Cursor::new(bytes)).unwrap_or(FileFormat::ArbitraryBinaryData)
206}
207
208pub fn detect_mime_type(
209 path: &Path,
210 bytes: &[u8],
211 detected_format: FileFormat,
212 programming_language: Option<&str>,
213) -> String {
214 if bytes.is_empty() {
215 return "inode/x-empty".to_string();
216 }
217
218 if is_zip_archive(bytes) {
219 return detect_zip_like_mime(path);
220 }
221
222 if looks_like_deb(bytes, path) {
223 return "application/vnd.debian.binary-package".to_string();
224 }
225
226 if looks_like_rpm(bytes, path) {
227 return "application/x-rpm".to_string();
228 }
229
230 let guessed_mime = from_path(path)
231 .first_or_octet_stream()
232 .essence_str()
233 .to_string();
234
235 let mime_type = match detected_format {
236 FileFormat::Empty => "inode/x-empty".to_string(),
237 FileFormat::PlainText => {
238 if guessed_mime == "application/octet-stream" || guessed_mime.starts_with("video/") {
239 "text/plain".to_string()
240 } else {
241 guessed_mime.clone()
242 }
243 }
244 _ => {
245 let detected_mime = detected_format.media_type();
246 if detected_mime == "application/octet-stream"
247 && guessed_mime != "application/octet-stream"
248 {
249 guessed_mime.clone()
250 } else {
251 detected_mime.to_string()
252 }
253 }
254 };
255
256 normalize_mime_type(path, bytes, programming_language, &mime_type)
257}
258
259fn is_utf8_text(content_type: ContentType) -> bool {
260 matches!(content_type, ContentType::UTF_8 | ContentType::UTF_8_BOM)
261}
262
263fn normalize_mime_type(
264 path: &Path,
265 bytes: &[u8],
266 programming_language: Option<&str>,
267 mime_type: &str,
268) -> String {
269 if should_prefer_text_mime(path, bytes, programming_language, mime_type) {
270 return "text/plain".to_string();
271 }
272
273 mime_type.to_string()
274}
275
276fn should_prefer_text_mime(
277 path: &Path,
278 bytes: &[u8],
279 programming_language: Option<&str>,
280 mime_type: &str,
281) -> bool {
282 (is_utf8_text(inspect(bytes)) || !decode_bytes_to_string(bytes).is_empty())
283 && is_textual_source_candidate(path, programming_language)
284 && (mime_type.starts_with("video/") || mime_type == "application/octet-stream")
285}
286
287fn detect_is_binary(
288 path: &Path,
289 bytes: &[u8],
290 detected_format: FileFormat,
291 programming_language: Option<&str>,
292) -> bool {
293 if matches!(detected_format, FileFormat::Empty | FileFormat::PlainText) {
294 return false;
295 }
296
297 lower_extension(path)
298 .as_deref()
299 .is_some_and(|ext| BINARY_EXTENSIONS.contains(&ext))
300 || (!bytes.is_empty()
301 && matches!(inspect(bytes), ContentType::BINARY)
302 && !should_treat_binary_bytes_as_text(path, bytes, programming_language))
303}
304
305fn should_treat_binary_bytes_as_text(
306 path: &Path,
307 bytes: &[u8],
308 programming_language: Option<&str>,
309) -> bool {
310 !decode_bytes_to_string(bytes).is_empty()
311 && (bytes.starts_with(b"#!") || is_textual_source_candidate(path, programming_language))
312}
313
314fn detect_is_archive(
315 path: &Path,
316 bytes: &[u8],
317 mime_type: &str,
318 is_text: bool,
319 detected_format: FileFormat,
320) -> bool {
321 if is_text {
322 return false;
323 }
324
325 lower_extension(path)
326 .as_deref()
327 .is_some_and(|ext| ARCHIVE_EXTENSIONS.contains(&ext))
328 || matches!(
329 detected_format.kind(),
330 FileFormatKind::Archive | FileFormatKind::Compressed | FileFormatKind::Package
331 )
332 || is_zip_archive(bytes)
333 || looks_like_gzip(bytes)
334 || looks_like_bzip2(bytes)
335 || looks_like_xz(bytes)
336 || looks_like_deb(bytes, path)
337 || looks_like_rpm(bytes, path)
338 || looks_like_squashfs(bytes, path)
339 || mime_type.contains("zip")
340 || mime_type.contains("compressed")
341 || mime_type.contains("tar")
342 || mime_type.contains("x-rpm")
343 || mime_type.contains("debian")
344}
345
346fn detect_is_media(
347 path: &Path,
348 bytes: &[u8],
349 mime_type: &str,
350 detected_format: FileFormat,
351) -> bool {
352 media_mime_from_content(bytes).is_some()
353 || matches!(
354 detected_format.kind(),
355 FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
356 )
357 || mime_type.starts_with("image/")
358 || mime_type.starts_with("audio/")
359 || mime_type.starts_with("video/")
360 || (mime_type == "application/octet-stream"
361 && lower_extension(path).as_deref() == Some("tga")
362 && !matches!(inspect(bytes), ContentType::BINARY))
363}
364
365fn detect_is_script(
366 path: &Path,
367 bytes: &[u8],
368 programming_language: Option<&str>,
369 is_text: bool,
370) -> bool {
371 if !is_text || is_makefile(path) {
372 return false;
373 }
374
375 bytes.starts_with(b"#!")
376 || lower_extension(path).as_deref().is_some_and(|ext| {
377 matches!(
378 ext,
379 "sh" | "bash" | "zsh" | "fish" | "ksh" | "ps1" | "psm1" | "psd1" | "awk"
380 )
381 })
382 || matches!(
383 programming_language,
384 Some("Shell" | "Python" | "Ruby" | "Perl" | "PHP" | "PowerShell" | "Awk")
385 )
386}
387
388fn detect_is_source(
389 path: &Path,
390 programming_language: Option<&str>,
391 is_text: bool,
392 is_script: bool,
393) -> bool {
394 if !is_text || is_plain_text(path) || is_makefile(path) || is_source_map(path) {
395 return false;
396 }
397
398 if is_c_like_source(path) || is_java_like_source(path) {
399 return true;
400 }
401
402 programming_language.is_some() || is_script
403}
404
405#[allow(clippy::too_many_arguments)]
406fn detect_file_type(
407 path: &Path,
408 bytes: &[u8],
409 detected_format: FileFormat,
410 mime_type: &str,
411 programming_language: Option<&str>,
412 is_binary: bool,
413 is_text: bool,
414 is_archive: bool,
415 is_media: bool,
416 is_script: bool,
417) -> String {
418 if bytes.is_empty() {
419 return "empty".to_string();
420 }
421
422 if looks_like_pdf(bytes) {
423 return "PDF document".to_string();
424 }
425
426 if let Some(file_type) = media_file_type_from_content(bytes) {
427 return file_type.to_string();
428 }
429
430 if is_archive {
431 return archive_file_type(path, bytes, detected_format);
432 }
433
434 if is_script {
435 return script_file_type(programming_language, bytes);
436 }
437
438 if is_text {
439 if lower_extension(path).as_deref() == Some("json") {
440 return "JSON text data".to_string();
441 }
442 if lower_extension(path).as_deref() == Some("xml") {
443 return "XML text data".to_string();
444 }
445 if matches!(lower_extension(path).as_deref(), Some("yaml" | "yml")) {
446 return "YAML text data".to_string();
447 }
448 if lower_extension(path).as_deref() == Some("toml") {
449 return "TOML text data".to_string();
450 }
451 if matches!(
452 lower_extension(path).as_deref(),
453 Some("ini" | "cfg" | "conf")
454 ) {
455 return "INI text data".to_string();
456 }
457 if matches!(lower_file_name(path).as_str(), ".gitmodules" | ".gitconfig") {
458 return "Git configuration text".to_string();
459 }
460 if matches!(lower_extension(path).as_deref(), Some("md" | "markdown")) {
461 return text_file_type(bytes);
462 }
463 if programming_language.is_some() && !is_media {
464 return text_file_type(bytes);
465 }
466 return text_file_type(bytes);
467 }
468
469 if let Some(file_type) = format_based_file_type(detected_format) {
470 return file_type;
471 }
472
473 if is_binary && mime_type == "application/octet-stream" {
474 return "data".to_string();
475 }
476
477 mime_type.to_string()
478}
479
480fn is_textual_source_candidate(path: &Path, programming_language: Option<&str>) -> bool {
481 if matches!(programming_language, Some(language) if is_source_like_language(language)) {
482 return true;
483 }
484
485 if matches!(
486 lower_file_name(path).as_str(),
487 "dockerfile"
488 | "containerfile"
489 | "containerfile.core"
490 | "apkbuild"
491 | "podfile"
492 | "meson.build"
493 | "build"
494 | "workspace"
495 | "buck"
496 | "default.nix"
497 | "flake.nix"
498 | "shell.nix"
499 ) {
500 return true;
501 }
502
503 path.extension()
504 .and_then(|ext| ext.to_str())
505 .is_some_and(|ext| {
506 matches!(
507 ext.to_ascii_lowercase().as_str(),
508 "rs" | "py"
509 | "js"
510 | "mjs"
511 | "cjs"
512 | "jsx"
513 | "ts"
514 | "mts"
515 | "cts"
516 | "tsx"
517 | "c"
518 | "cpp"
519 | "cc"
520 | "cxx"
521 | "h"
522 | "hpp"
523 | "m"
524 | "mm"
525 | "s"
526 | "asm"
527 | "java"
528 | "go"
529 | "rb"
530 | "php"
531 | "pl"
532 | "swift"
533 | "sh"
534 | "bash"
535 | "zsh"
536 | "fish"
537 | "ksh"
538 | "ps1"
539 | "psm1"
540 | "psd1"
541 | "awk"
542 | "kt"
543 | "kts"
544 | "dart"
545 | "scala"
546 | "groovy"
547 | "gradle"
548 | "gvy"
549 | "gy"
550 | "gsh"
551 | "cs"
552 | "fs"
553 | "fsx"
554 | "r"
555 | "lua"
556 | "jl"
557 | "ex"
558 | "exs"
559 | "clj"
560 | "cljs"
561 | "cljc"
562 | "hs"
563 | "erl"
564 | "nix"
565 | "zig"
566 | "bzl"
567 | "bazel"
568 | "star"
569 | "sky"
570 | "ml"
571 | "mli"
572 | "tex"
573 )
574 })
575}
576
/// Reports whether `language` is one of the language names this module
/// treats as source-like. The comparison is exact and case-sensitive.
fn is_source_like_language(language: &str) -> bool {
    const SOURCE_LIKE_LANGUAGES: &[&str] = &[
        "Rust",
        "Python",
        "JavaScript",
        "TypeScript",
        "JavaScript/TypeScript",
        "C",
        "C++",
        "Objective-C",
        "Objective-C++",
        "GAS",
        "Java",
        "Go",
        "Ruby",
        "PHP",
        "Perl",
        "Swift",
        "Shell",
        "PowerShell",
        "Awk",
        "Kotlin",
        "Dart",
        "Scala",
        "C#",
        "F#",
        "R",
        "Lua",
        "Julia",
        "Elixir",
        "Clojure",
        "Haskell",
        "Erlang",
        "Groovy",
        "Nix",
        "Zig",
        "Starlark",
        "OCaml",
        "Meson",
        "TeX",
        "Dockerfile",
        "Makefile",
    ];

    SOURCE_LIKE_LANGUAGES.contains(&language)
}
622
/// Returns the path's extension when it has one and it is valid UTF-8.
fn extension(path: &Path) -> Option<&str> {
    let raw = path.extension()?;
    raw.to_str()
}
626
/// Returns the path's UTF-8 extension converted to ASCII lower case.
fn lower_extension(path: &Path) -> Option<String> {
    let ext = path.extension()?.to_str()?;
    Some(ext.to_ascii_lowercase())
}
630
/// Returns the final path component in ASCII lower case, or an empty string
/// when the path has no final component or it is not valid UTF-8.
fn lower_file_name(path: &Path) -> String {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name.to_ascii_lowercase(),
        None => String::new(),
    }
}
637
638fn is_plain_text(path: &Path) -> bool {
639 lower_extension(path)
640 .as_deref()
641 .is_some_and(|ext| PLAIN_TEXT_EXTENSIONS.contains(&ext))
642}
643
644fn is_makefile(path: &Path) -> bool {
645 matches!(lower_file_name(path).as_str(), "makefile" | "makefile.inc")
646}
647
/// True when the path ends in `.js.map` or `.css.map`, ignoring ASCII case.
fn is_source_map(path: &Path) -> bool {
    let lowered = path.to_string_lossy().to_ascii_lowercase();
    [".js.map", ".css.map"]
        .iter()
        .any(|&suffix| lowered.ends_with(suffix))
}
652
653fn is_c_like_source(path: &Path) -> bool {
654 lower_extension(path).as_deref().is_some_and(|ext| {
655 matches!(
656 ext,
657 "c" | "cc"
658 | "cp"
659 | "cpp"
660 | "cxx"
661 | "c++"
662 | "h"
663 | "hh"
664 | "hpp"
665 | "hxx"
666 | "h++"
667 | "i"
668 | "ii"
669 | "m"
670 | "s"
671 | "asm"
672 )
673 })
674}
675
676fn is_java_like_source(path: &Path) -> bool {
677 lower_extension(path)
678 .as_deref()
679 .is_some_and(|ext| matches!(ext, "java" | "aj" | "jad" | "ajt"))
680}
681
682fn format_based_file_type(detected_format: FileFormat) -> Option<String> {
683 match detected_format {
684 FileFormat::ArbitraryBinaryData | FileFormat::Empty | FileFormat::PlainText => None,
685 format if format.short_name() == Some("PDF") => Some("PDF document".to_string()),
686 format => Some(match format.kind() {
687 FileFormatKind::Image => short_name_or_name(&format, "image data"),
688 FileFormatKind::Audio => short_name_or_name(&format, "audio data"),
689 FileFormatKind::Video => short_name_or_name(&format, "video data"),
690 _ => format.name().to_string(),
691 }),
692 }
693}
694
695fn short_name_or_name(format: &FileFormat, suffix: &str) -> String {
696 format
697 .short_name()
698 .map(|short_name| format!("{short_name} {suffix}"))
699 .unwrap_or_else(|| format!("{} {suffix}", format.name()))
700}
701
702fn detect_zip_like_mime(path: &Path) -> String {
703 match extension(path).map(|ext| ext.to_ascii_lowercase()) {
704 Some(ext) if ext == "apk" => "application/vnd.android.package-archive".to_string(),
705 Some(ext) if matches!(ext.as_str(), "jar" | "war" | "ear") => {
706 "application/java-archive".to_string()
707 }
708 _ => "application/zip".to_string(),
709 }
710}
711
/// Recognises PNG, JPEG, TIFF and WebP content by magic number and returns
/// the matching MIME type.
fn media_mime_from_content(bytes: &[u8]) -> Option<&'static str> {
    if bytes.starts_with(b"\x89PNG\r\n\x1a\n") {
        return Some("image/png");
    }
    if bytes.starts_with(&[0xff, 0xd8, 0xff]) {
        return Some("image/jpeg");
    }
    // TIFF comes in little-endian ("II") and big-endian ("MM") flavours.
    if bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a") {
        return Some("image/tiff");
    }
    // WebP is a RIFF container whose form type (bytes 8..12) is "WEBP".
    if bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(b"WEBP".as_slice()) {
        return Some("image/webp");
    }

    None
}
725
/// Recognises PNG, JPEG, TIFF and WebP content by magic number and returns
/// the matching human-readable type label.
fn media_file_type_from_content(bytes: &[u8]) -> Option<&'static str> {
    if bytes.starts_with(b"\x89PNG\r\n\x1a\n") {
        return Some("PNG image data");
    }
    if bytes.starts_with(&[0xff, 0xd8, 0xff]) {
        return Some("JPEG image data");
    }
    // TIFF comes in little-endian ("II") and big-endian ("MM") flavours.
    if bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a") {
        return Some("TIFF image data");
    }
    // WebP is a RIFF container whose form type (bytes 8..12) is "WEBP".
    if bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(b"WEBP".as_slice()) {
        return Some("WebP image data");
    }

    None
}
739
/// True when the bytes begin with the `%PDF-` header.
fn looks_like_pdf(bytes: &[u8]) -> bool {
    matches!(bytes, [b'%', b'P', b'D', b'F', b'-', ..])
}
743
/// True when the bytes begin with the two-byte gzip magic number.
fn looks_like_gzip(bytes: &[u8]) -> bool {
    matches!(bytes, [0x1f, 0x8b, ..])
}
747
/// True when the bytes begin with the `BZh` bzip2 header.
fn looks_like_bzip2(bytes: &[u8]) -> bool {
    matches!(bytes, [b'B', b'Z', b'h', ..])
}
751
/// True when the bytes begin with the six-byte XZ stream header magic.
fn looks_like_xz(bytes: &[u8]) -> bool {
    const XZ_MAGIC: [u8; 6] = [0xfd, b'7', b'z', b'X', b'Z', 0x00];
    bytes.starts_with(&XZ_MAGIC)
}
755
756fn looks_like_deb(bytes: &[u8], path: &Path) -> bool {
757 lower_extension(path).as_deref() == Some("deb") && bytes.starts_with(b"!<arch>\n")
758}
759
760fn looks_like_rpm(bytes: &[u8], path: &Path) -> bool {
761 lower_extension(path).as_deref() == Some("rpm") && bytes.starts_with(&[0xed, 0xab, 0xee, 0xdb])
762}
763
764fn looks_like_squashfs(bytes: &[u8], path: &Path) -> bool {
765 lower_extension(path)
766 .as_deref()
767 .is_some_and(|ext| matches!(ext, "sqs" | "squashfs"))
768 && (bytes.starts_with(&[0x68, 0x73, 0x71, 0x73])
769 || bytes.starts_with(&[0x73, 0x71, 0x73, 0x68]))
770}
771
772fn archive_file_type(path: &Path, bytes: &[u8], detected_format: FileFormat) -> String {
773 if looks_like_deb(bytes, path) {
774 "debian binary package (format 2.0)".to_string()
775 } else if looks_like_rpm(bytes, path) {
776 "RPM package".to_string()
777 } else if looks_like_squashfs(bytes, path) {
778 "Squashfs filesystem".to_string()
779 } else if looks_like_gzip(bytes) {
780 "gzip compressed data".to_string()
781 } else if looks_like_bzip2(bytes) {
782 "bzip2 compressed data".to_string()
783 } else if looks_like_xz(bytes) {
784 "XZ compressed data".to_string()
785 } else if is_zip_archive(bytes) {
786 "Zip archive data".to_string()
787 } else if lower_extension(path).as_deref() == Some("gem") {
788 "POSIX tar archive".to_string()
789 } else if let Some(file_type) = format_based_file_type(detected_format) {
790 file_type
791 } else {
792 "archive data".to_string()
793 }
794}
795
796fn script_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
797 let suffix = text_executable_label(bytes);
798
799 match programming_language {
800 Some("Python") => format!("python script, {suffix}"),
801 Some("Ruby") => format!("ruby script, {suffix}"),
802 Some("Perl") => format!("perl script, {suffix}"),
803 Some("PHP") => format!("php script, {suffix}"),
804 Some("Shell") => format!("shell script, {suffix}"),
805 Some("JavaScript") => format!("javascript script, {suffix}"),
806 Some("TypeScript") => format!("typescript script, {suffix}"),
807 Some("PowerShell") => format!("powershell script, {suffix}"),
808 Some("Awk") => format!("awk script, {suffix}"),
809 _ => format!("script, {suffix}"),
810 }
811}
812
813fn text_file_type(bytes: &[u8]) -> String {
814 text_label(bytes).to_string()
815}
816
/// Describes text content by encoding (valid UTF-8 or not) and by whether it
/// contains at least one line feed.
fn text_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_line_feed = bytes.contains(&b'\n');

    match (is_utf8, has_line_feed) {
        (true, true) => "UTF-8 Unicode text",
        (true, false) => "UTF-8 Unicode text, with no line terminators",
        (false, true) => "text",
        (false, false) => "text, with no line terminators",
    }
}
830
/// Describes executable text content by encoding (valid UTF-8 or not) and by
/// whether it contains at least one line feed.
fn text_executable_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_line_feed = bytes.contains(&b'\n');

    match (is_utf8, has_line_feed) {
        (true, true) => "UTF-8 Unicode text executable",
        (true, false) => "UTF-8 Unicode text executable, with no line terminators",
        (false, true) => "text executable",
        (false, false) => "text executable, with no line terminators",
    }
}
844
845fn supported_image_metadata_format(
846 ext: Option<&str>,
847 detected_format: FileFormat,
848) -> Option<ImageFormat> {
849 match ext {
850 Some("jpg" | "jpeg") => Some(ImageFormat::Jpeg),
851 Some("png") => Some(ImageFormat::Png),
852 Some("tif" | "tiff") => Some(ImageFormat::Tiff),
853 Some("webp") => Some(ImageFormat::WebP),
854 _ => match detected_format.media_type() {
855 "image/jpeg" => Some(ImageFormat::Jpeg),
856 "image/png" => Some(ImageFormat::Png),
857 "image/tiff" => Some(ImageFormat::Tiff),
858 "image/webp" => Some(ImageFormat::WebP),
859 _ => None,
860 },
861 }
862}
863
864fn should_skip_binary_string_extraction(
865 path: &Path,
866 bytes: &[u8],
867 detected_format: FileFormat,
868) -> bool {
869 matches!(lower_extension(path).as_deref(), Some("pdf"))
870 || supported_image_metadata_format(lower_extension(path).as_deref(), detected_format)
871 .is_some()
872 || media_mime_from_content(bytes).is_some()
873 || is_zip_archive(bytes)
874 || looks_like_gzip(bytes)
875 || looks_like_bzip2(bytes)
876 || looks_like_xz(bytes)
877 || looks_like_deb(bytes, path)
878 || looks_like_rpm(bytes, path)
879 || looks_like_squashfs(bytes, path)
880}
881
882fn is_supported_image_container(bytes: &[u8], format: ImageFormat) -> bool {
883 match format {
884 ImageFormat::Png => bytes.starts_with(b"\x89PNG\r\n\x1a\n"),
885 ImageFormat::Jpeg => bytes.starts_with(&[0xff, 0xd8, 0xff]),
886 ImageFormat::Tiff => bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a"),
887 ImageFormat::WebP => {
888 bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP"
889 }
890 _ => false,
891 }
892}
893
894fn extract_image_metadata_text(bytes: &[u8], format: ImageFormat) -> String {
895 let mut values = Vec::new();
896 values.extend(extract_exif_metadata_values(bytes));
897 values.extend(extract_xmp_metadata_values(bytes, format));
898 values_to_text(values)
899}
900
901fn extract_exif_metadata_values(bytes: &[u8]) -> Vec<String> {
902 let mut cursor = BufReader::new(Cursor::new(bytes));
903 let exif = match exif::Reader::new().read_from_container(&mut cursor) {
904 Ok(exif) => exif,
905 Err(_) => return Vec::new(),
906 };
907
908 let mut values = Vec::new();
909 for field in exif.fields() {
910 let rendered = match field.tag {
911 exif::Tag::ImageDescription | exif::Tag::Copyright | exif::Tag::UserComment => {
912 Some(field.display_value().with_unit(&exif).to_string())
913 }
914 exif::Tag::Artist => Some(format!(
915 "Author: {}",
916 field.display_value().with_unit(&exif)
917 )),
918 _ => None,
919 };
920
921 if let Some(rendered) = rendered {
922 values.push(rendered);
923 }
924 }
925
926 values
927}
928
929fn extract_xmp_metadata_values(bytes: &[u8], format: ImageFormat) -> Vec<String> {
930 let xmp = match extract_raw_xmp_packet(bytes, format) {
931 Some(xmp) => xmp,
932 None => return Vec::new(),
933 };
934
935 parse_xmp_values(&xmp)
936}
937
938fn extract_raw_xmp_packet(bytes: &[u8], format: ImageFormat) -> Option<Vec<u8>> {
939 let reader = ImageReader::with_format(BufReader::new(Cursor::new(bytes)), format);
940 if let Ok(mut decoder) = reader.into_decoder()
941 && let Ok(Some(xmp)) = decoder.xmp_metadata()
942 {
943 return Some(xmp);
944 }
945
946 match format {
947 ImageFormat::Png => extract_png_xmp_packet(bytes),
948 _ => None,
949 }
950}
951
952fn extract_png_xmp_packet(bytes: &[u8]) -> Option<Vec<u8>> {
953 const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";
954
955 if bytes.len() < PNG_SIGNATURE.len() || &bytes[..PNG_SIGNATURE.len()] != PNG_SIGNATURE {
956 return None;
957 }
958
959 let mut offset = PNG_SIGNATURE.len();
960 while offset + 12 <= bytes.len() {
961 let length = u32::from_be_bytes([
962 bytes[offset],
963 bytes[offset + 1],
964 bytes[offset + 2],
965 bytes[offset + 3],
966 ]) as usize;
967 let chunk_start = offset + 8;
968 let chunk_end = chunk_start + length;
969 if chunk_end + 4 > bytes.len() {
970 return None;
971 }
972
973 let chunk_type = &bytes[offset + 4..offset + 8];
974 if chunk_type == b"iTXt" {
975 let data = &bytes[chunk_start..chunk_end];
976 if let Some(xmp) = parse_png_itxt_xmp(data) {
977 return Some(xmp);
978 }
979 }
980
981 offset = chunk_end + 4;
982 }
983
984 None
985}
986
987fn parse_png_itxt_xmp(data: &[u8]) -> Option<Vec<u8>> {
988 const XMP_KEYWORD: &[u8] = b"XML:com.adobe.xmp";
989
990 let keyword_end = data.iter().position(|&b| b == 0)?;
991 if &data[..keyword_end] != XMP_KEYWORD {
992 return None;
993 }
994
995 let mut cursor = keyword_end + 1;
996 let compression_flag = *data.get(cursor)?;
997 cursor += 1;
998 let compression_method = *data.get(cursor)?;
999 cursor += 1;
1000 if compression_flag > 1 || (compression_flag == 1 && compression_method != 0) {
1001 return None;
1002 }
1003
1004 let language_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1005 cursor = language_end + 1;
1006
1007 let translated_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1008 cursor = translated_end + 1;
1009
1010 let text_bytes = &data[cursor..];
1011 if compression_flag == 1 {
1012 let mut decoder = ZlibDecoder::new(text_bytes);
1013 let mut decoded = Vec::new();
1014 decoder.read_to_end(&mut decoded).ok()?;
1015 Some(decoded)
1016 } else {
1017 Some(text_bytes.to_vec())
1018 }
1019}
1020
1021fn parse_xmp_values(xmp: &[u8]) -> Vec<String> {
1022 let mut reader = XmlReader::from_reader(xmp);
1023 reader.config_mut().trim_text(true);
1024
1025 let mut buf = Vec::new();
1026 let mut stack: Vec<String> = Vec::new();
1027 let mut values = Vec::new();
1028
1029 loop {
1030 match reader.read_event_into(&mut buf) {
1031 Ok(Event::Start(e)) => {
1032 stack.push(local_xml_name(e.name().as_ref()));
1033 }
1034 Ok(Event::End(_)) => {
1035 stack.pop();
1036 }
1037 Ok(Event::Empty(_)) => {}
1038 Ok(Event::Text(text)) => {
1039 if let Some(field) = stack
1040 .iter()
1041 .rev()
1042 .find_map(|name| allowed_xmp_field(name.as_str()))
1043 && let Ok(decoded) = text.decode()
1044 {
1045 let decoded = decoded.into_owned();
1046 if !decoded.trim().is_empty() {
1047 values.push(format_xmp_value(field, &decoded));
1048 }
1049 }
1050 }
1051 Ok(Event::CData(text)) => {
1052 if let Some(field) = stack
1053 .iter()
1054 .rev()
1055 .find_map(|name| allowed_xmp_field(name.as_str()))
1056 && let Ok(decoded) = text.decode()
1057 {
1058 let decoded = decoded.into_owned();
1059 if !decoded.trim().is_empty() {
1060 values.push(format_xmp_value(field, &decoded));
1061 }
1062 }
1063 }
1064 Ok(Event::Eof) | Err(_) => break,
1065 _ => {}
1066 }
1067 buf.clear();
1068 }
1069
1070 values
1071}
1072
/// Returns the part of a qualified XML name after the last `:`.
///
/// Names without a colon are returned whole; names that are not valid UTF-8
/// become the empty string.
fn local_xml_name(name: &[u8]) -> String {
    let qualified = std::str::from_utf8(name).unwrap_or_default();
    let local = match qualified.rfind(':') {
        Some(colon) => &qualified[colon + 1..],
        None => qualified,
    };

    local.to_string()
}
1077
/// Maps the local name of an XMP element to the internal field identifier,
/// or `None` when the element is not one of the collected fields. Matching
/// is case-sensitive.
fn allowed_xmp_field(name: &str) -> Option<&'static str> {
    const FIELDS: [(&str, &str); 7] = [
        ("creator", "creator"),
        ("rights", "rights"),
        ("description", "description"),
        ("title", "title"),
        ("subject", "subject"),
        ("UsageTerms", "usage_terms"),
        ("WebStatement", "web_statement"),
    ];

    FIELDS
        .iter()
        .find(|(xml_name, _)| *xml_name == name)
        .map(|&(_, field)| field)
}
1090
/// Renders an XMP value for output: creator values get an `Author: ` prefix,
/// every other field is passed through unchanged.
fn format_xmp_value(field: &str, value: &str) -> String {
    if field == "creator" {
        format!("Author: {value}")
    } else {
        value.to_string()
    }
}
1097
1098fn values_to_text(values: Vec<String>) -> String {
1099 let mut seen = BTreeSet::new();
1100 let mut lines = Vec::new();
1101 let mut total_bytes = 0usize;
1102
1103 for value in values {
1104 if lines.len() >= MAX_IMAGE_METADATA_VALUES {
1105 break;
1106 }
1107
1108 let normalized = normalize_metadata_value(&value);
1109 if normalized.is_empty() || !seen.insert(normalized.clone()) {
1110 continue;
1111 }
1112
1113 let added_bytes = normalized.len() + usize::from(!lines.is_empty());
1114 if total_bytes + added_bytes > MAX_IMAGE_METADATA_TEXT_BYTES {
1115 break;
1116 }
1117
1118 total_bytes += added_bytes;
1119 lines.push(normalized);
1120 }
1121
1122 lines.join("\n")
1123}
1124
/// Cleans a metadata value: NUL characters are removed, runs of whitespace
/// collapse into single spaces, and leading/trailing whitespace is dropped.
fn normalize_metadata_value(value: &str) -> String {
    let without_nuls = value.replace('\0', "");
    let words: Vec<&str> = without_nuls.split_whitespace().collect();

    words.join(" ")
}
1136
1137fn extract_pdf_text(path: &Path, bytes: &[u8]) -> String {
1138 if bytes.len() < 5 || &bytes[..5] != b"%PDF-" {
1139 return String::new();
1140 }
1141
1142 let extracted = catch_unwind(AssertUnwindSafe(
1143 || -> Result<String, Box<dyn std::error::Error>> {
1144 let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1145 extract_first_pdf_page_text(&mut document)
1146 },
1147 ));
1148 if let Ok(Ok(text)) = extracted
1149 && let Some(normalized) = normalize_pdf_text(text)
1150 {
1151 return normalized;
1152 }
1153
1154 let extracted = catch_unwind(AssertUnwindSafe(
1155 || -> Result<String, Box<dyn std::error::Error>> {
1156 let mut document = pdf_oxide::document::PdfDocument::open(path)?;
1157 extract_pdf_text_from_document(&mut document)
1158 },
1159 ));
1160 if let Ok(Ok(text)) = extracted
1161 && let Some(normalized) = normalize_pdf_text(text)
1162 {
1163 return normalized;
1164 }
1165
1166 let extracted = catch_unwind(AssertUnwindSafe(
1167 || -> Result<String, Box<dyn std::error::Error>> {
1168 let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1169 extract_pdf_text_from_document(&mut document)
1170 },
1171 ));
1172 if let Ok(Ok(text)) = extracted
1173 && let Some(normalized) = normalize_pdf_text(text)
1174 {
1175 return normalized;
1176 }
1177
1178 String::new()
1179}
1180
1181fn extract_first_pdf_page_text(
1182 document: &mut pdf_oxide::document::PdfDocument,
1183) -> Result<String, Box<dyn std::error::Error>> {
1184 if document.page_count()? == 0 {
1185 return Ok(String::new());
1186 }
1187
1188 let extracted_text = document.extract_text(0)?;
1189 let markdown_text =
1190 document.to_markdown(0, &pdf_oxide::converters::ConversionOptions::default())?;
1191 if pdf_markdown_heading_lines(&markdown_text).is_empty() {
1192 return Ok(extracted_text);
1193 }
1194
1195 let pipeline_text =
1196 document.to_plain_text(0, &pdf_oxide::converters::ConversionOptions::default())?;
1197
1198 Ok(merge_pdf_first_page_text(
1199 &extracted_text,
1200 &markdown_text,
1201 &pipeline_text,
1202 ))
1203}
1204
1205fn extract_pdf_text_from_document(
1206 document: &mut pdf_oxide::document::PdfDocument,
1207) -> Result<String, Box<dyn std::error::Error>> {
1208 Ok(document.to_plain_text_all(&pdf_oxide::converters::ConversionOptions::default())?)
1209}
1210
/// Turns carriage returns and form feeds into newlines.
///
/// Returns `None` when the result contains nothing but whitespace; otherwise
/// returns the converted text without trimming it.
fn normalize_pdf_text(text: String) -> Option<String> {
    let converted: String = text
        .chars()
        .map(|ch| if matches!(ch, '\r' | '\u{0c}') { '\n' } else { ch })
        .collect();

    if converted.trim().is_empty() {
        None
    } else {
        Some(converted)
    }
}
1215
1216fn merge_pdf_first_page_text(
1217 _extracted_text: &str,
1218 markdown_text: &str,
1219 pipeline_text: &str,
1220) -> String {
1221 let pipeline = pipeline_text.trim();
1222 if pipeline.is_empty() {
1223 return String::new();
1224 }
1225
1226 let prefix = pdf_first_page_heading_prefix(markdown_text);
1227 let Some(prefix) = prefix else {
1228 return pipeline_text.to_string();
1229 };
1230
1231 if pipeline.contains(&prefix) {
1232 pipeline_text.to_string()
1233 } else {
1234 format!("{prefix}\n\n{pipeline}")
1235 }
1236}
1237
1238fn pdf_first_page_heading_prefix(markdown_text: &str) -> Option<String> {
1239 let mut lines = Vec::new();
1240
1241 for line in pdf_markdown_heading_lines(markdown_text) {
1242 push_unique_line(&mut lines, line);
1243 }
1244
1245 (!lines.is_empty()).then(|| lines.join("\n"))
1246}
1247
1248fn pdf_markdown_heading_lines(text: &str) -> Vec<String> {
1249 text.lines()
1250 .map(str::trim)
1251 .filter_map(|line| line.strip_prefix('#').map(str::trim_start))
1252 .map(|line| line.trim_matches('#').trim())
1253 .filter(|line| !line.is_empty())
1254 .filter(|line| !looks_like_numbered_section_heading(line))
1255 .take(4)
1256 .map(ToOwned::to_owned)
1257 .collect()
1258}
1259
/// Appends `line` to `lines` unless an identical entry is already present.
fn push_unique_line(lines: &mut Vec<String>, line: String) {
    if lines.contains(&line) {
        return;
    }

    lines.push(line);
}
1265
/// Reports whether a heading starts with a section number such as `1.`,
/// `12.` or `3.4`.
///
/// A heading qualifies when it begins with one or more ASCII digits that are
/// immediately followed by a `.`. Numbers of any length are accepted, so
/// `10. Termination` is classified the same way as `9. Termination`.
fn looks_like_numbered_section_heading(line: &str) -> bool {
    let after_digits = line.trim_start_matches(|ch: char| ch.is_ascii_digit());

    // Something must actually have been stripped; otherwise a line such as
    // ".5 Half" would pass on the dot alone.
    let has_leading_digits = after_digits.len() < line.len();

    has_leading_digits && after_digits.starts_with('.')
}
1278
/// Reports whether `bytes` begins with one of three ZIP `PK` signatures:
/// `PK\x03\x04`, `PK\x05\x06` or `PK\x07\x08`.
fn is_zip_archive(bytes: &[u8]) -> bool {
    const ZIP_SIGNATURES: [&[u8]; 3] = [b"PK\x03\x04", b"PK\x05\x06", b"PK\x07\x08"];

    ZIP_SIGNATURES
        .iter()
        .any(|&signature| bytes.starts_with(signature))
}
1284
/// Extracts human-readable strings from arbitrary binary data, in the spirit
/// of the `strings` utility.
///
/// Two kinds of runs are collected, each needing at least four consecutive
/// printable ASCII characters (`0x20..=0x7E`):
///
/// 1. plain single-byte ASCII runs;
/// 2. "wide" runs in which every printable byte is followed by a zero byte
///    (UTF-16LE-style text). The data is scanned once per byte alignment so
///    that wide strings starting on an odd offset are found as well, and each
///    wide string is reported only once.
///
/// Runs are joined with `\n`: ASCII runs first, then wide runs at even
/// offsets, then wide runs at odd offsets. Extraction stops once the output
/// reaches 2,000,000 bytes; the limit is only checked after a run has been
/// appended, so the result can exceed it by the length of the last run.
pub fn extract_printable_strings(bytes: &[u8]) -> String {
    const MIN_LEN: usize = 4;
    const MAX_OUTPUT_BYTES: usize = 2_000_000;

    fn is_printable_ascii(b: u8) -> bool {
        matches!(b, 0x20..=0x7E)
    }

    let mut out = String::new();
    let mut run: Vec<u8> = Vec::new();

    // Appends the pending run to `out` when it is long enough, then resets it.
    let flush_run = |out: &mut String, run: &mut Vec<u8>| {
        if run.len() >= MIN_LEN {
            if !out.is_empty() {
                out.push('\n');
            }
            out.push_str(&String::from_utf8_lossy(run));
        }
        run.clear();
    };

    // Pass 1: single-byte ASCII runs.
    for &b in bytes {
        if is_printable_ascii(b) {
            run.push(b);
        } else {
            flush_run(&mut out, &mut run);
            if out.len() >= MAX_OUTPUT_BYTES {
                return out;
            }
        }
    }
    flush_run(&mut out, &mut run);
    if out.len() >= MAX_OUTPUT_BYTES {
        return out;
    }

    // Pass 2: wide runs (printable byte followed by a zero byte), once per
    // alignment. The pair layout is the same for both alignments; only the
    // starting offset differs. Swapping the pair for the odd offset would make
    // both iterations test the same byte parity, which duplicates even-aligned
    // strings and misses odd-aligned ones.
    for start in 0..=1 {
        run.clear();

        // `get` keeps an empty input from panicking when `start` is 1.
        let aligned = bytes.get(start..).unwrap_or_default();
        for pair in aligned.chunks_exact(2) {
            let (ch, zero) = (pair[0], pair[1]);
            if is_printable_ascii(ch) && zero == 0 {
                run.push(ch);
            } else {
                flush_run(&mut out, &mut run);
                if out.len() >= MAX_OUTPUT_BYTES {
                    return out;
                }
            }
        }

        flush_run(&mut out, &mut run);
        if out.len() >= MAX_OUTPUT_BYTES {
            return out;
        }
    }

    out
}
1346
#[cfg(test)]
mod tests {
    use std::path::Path;

    use super::{
        ExtractedTextKind, classify_file_info, extract_text_for_detection, normalize_mime_type,
    };

    /// A JAR fixture must yield no text and the `None` extraction kind.
    #[test]
    fn test_extract_text_for_detection_skips_jar_archives() {
        let jar_path = Path::new(
            "testdata/license-golden/datadriven/lic1/do-not_detect-licenses-in-archive.jar",
        );
        let jar_bytes = std::fs::read(jar_path).expect("failed to read jar fixture");

        let (extracted, extracted_kind) = extract_text_for_detection(jar_path, &jar_bytes);

        assert!(extracted.is_empty());
        assert_eq!(extracted_kind, ExtractedTextKind::None);
    }

    /// A PDF fixture is reported as `Pdf` and contains the expected licence text.
    #[test]
    fn test_extract_text_for_detection_reads_pdf_fixture_text() {
        let pdf_path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
        let pdf_bytes = std::fs::read(pdf_path).expect("failed to read pdf fixture");

        let (extracted, extracted_kind) = extract_text_for_detection(pdf_path, &pdf_bytes);

        assert_eq!(extracted_kind, ExtractedTextKind::Pdf);
        assert!(extracted.contains("Redistribution and use in source and binary forms"));
    }

    /// The extracted text contains the title but not the "DISCLAIMER OF
    /// WARRANTY" phrase, i.e. it does not cover the whole document.
    #[test]
    fn test_extract_text_for_detection_prefers_first_pdf_page_before_full_document() {
        let pdf_path =
            Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
        let pdf_bytes = std::fs::read(pdf_path).expect("failed to read pdf fixture");

        let (extracted, extracted_kind) = extract_text_for_detection(pdf_path, &pdf_bytes);

        assert_eq!(extracted_kind, ExtractedTextKind::Pdf);
        assert!(extracted.contains("SUN INDUSTRY STANDARDS SOURCE LICENSE"));
        assert!(!extracted.contains("DISCLAIMER OF WARRANTY"));
    }

    /// PDF content is still recognised when the path has a non-PDF extension.
    #[test]
    fn test_extract_text_for_detection_reads_pdf_fixture_without_pdf_extension() {
        let fixture_path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
        let pdf_bytes = std::fs::read(fixture_path).expect("failed to read pdf fixture");
        let renamed_path = Path::new("renamed.bin");

        let (extracted, extracted_kind) = extract_text_for_detection(renamed_path, &pdf_bytes);

        assert_eq!(extracted_kind, ExtractedTextKind::Pdf);
        assert!(extracted.contains("Redistribution and use in source and binary forms"));
    }

    /// ZIP-signed `.whl` and `.crate` inputs yield no text.
    #[test]
    fn test_extract_text_for_detection_skips_zip_like_archives() {
        let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00\x08\x00artifact";

        for archive_name in ["demo.whl", "demo.crate"] {
            let (extracted, extracted_kind) =
                extract_text_for_detection(Path::new(archive_name), zip_bytes);

            assert!(extracted.is_empty());
            assert_eq!(extracted_kind, ExtractedTextKind::None);
        }
    }

    /// A `.lib` fixture still yields text containing its copyright notice.
    #[test]
    fn test_extract_text_for_detection_keeps_binary_strings_for_lib_fixtures() {
        let lib_path =
            Path::new("testdata/copyright-golden/copyrights/copyright_php_lib-php_embed_lib.lib");
        let lib_bytes = std::fs::read(lib_path).expect("failed to read lib fixture");

        let (extracted, extracted_kind) = extract_text_for_detection(lib_path, &lib_bytes);

        assert_ne!(extracted_kind, ExtractedTextKind::None);
        assert!(extracted.contains("Copyright nexB and others (c) 2012"));
    }

    /// An SVG fixture is decoded as text.
    #[test]
    fn test_extract_text_for_detection_decodes_svg_fixture_text() {
        let svg_path = Path::new(
            "testdata/license-golden/datadriven/external/fossology-tests/Public-domain/biohazard.svg",
        );
        let svg_bytes = std::fs::read(svg_path).expect("failed to read svg fixture");

        let (extracted, extracted_kind) = extract_text_for_detection(svg_path, &svg_bytes);

        assert_eq!(extracted_kind, ExtractedTextKind::Decoded);
        assert!(extracted.contains("creativecommons.org/licenses/publicdomain"));
    }

    /// Textual `.ts` content overrides a `video/mp2t` guess.
    #[test]
    fn test_normalize_mime_type_prefers_text_for_textual_video_guess() {
        let normalized = normalize_mime_type(
            Path::new("main.ts"),
            b"export const answer = 42;\n",
            Some("TypeScript"),
            "video/mp2t",
        );

        assert_eq!(normalized, "text/plain");
    }

    /// Textual source content overrides an `application/octet-stream` guess.
    #[test]
    fn test_normalize_mime_type_prefers_text_for_octet_stream_source_guess() {
        let normalized = normalize_mime_type(
            Path::new("main.js"),
            b"console.log('hello');\n",
            Some("JavaScript"),
            "application/octet-stream",
        );

        assert_eq!(normalized, "text/plain");
    }

    /// Binary `.ts` content keeps the `video/mp2t` guess.
    #[test]
    fn test_normalize_mime_type_preserves_binary_video_guess() {
        let binary_payload = [0, 159, 146, 150, 0, 1, 2, 3];

        let normalized = normalize_mime_type(
            Path::new("main.ts"),
            &binary_payload,
            Some("TypeScript"),
            "video/mp2t",
        );

        assert_eq!(normalized, "video/mp2t");
    }

    /// A short binary payload keeps the `application/octet-stream` guess.
    #[test]
    fn test_normalize_mime_type_preserves_short_binary_octet_stream_guess() {
        let binary_payload = [0, 159, 146, 150];

        let normalized = normalize_mime_type(
            Path::new("main.ts"),
            &binary_payload,
            Some("TypeScript"),
            "application/octet-stream",
        );

        assert_eq!(normalized, "application/octet-stream");
    }

    /// Empty files are text, not binary and not source.
    #[test]
    fn test_classify_file_info_marks_empty_files_as_text_not_source() {
        let info = classify_file_info(Path::new("test.txt"), b"");

        assert_eq!(info.mime_type, "inode/x-empty");
        assert_eq!(info.file_type, "empty");
        assert!(!info.is_binary);
        assert!(info.is_text);
        assert!(!info.is_source);
        assert_eq!(info.programming_language, None);
    }

    /// JSON is text data without a programming language.
    #[test]
    fn test_classify_file_info_keeps_json_out_of_programming_language() {
        let info = classify_file_info(Path::new("package.json"), br#"{"name":"demo"}"#);

        assert_eq!(info.mime_type, "application/json");
        assert_eq!(info.file_type, "JSON text data");
        assert!(info.is_text);
        assert!(!info.is_source);
        assert_eq!(info.programming_language, None);
    }

    /// A Dockerfile is source, but not a script.
    #[test]
    fn test_classify_file_info_treats_dockerfile_as_source() {
        let info = classify_file_info(Path::new("Dockerfile"), b"FROM scratch\n");

        assert_eq!(info.programming_language.as_deref(), Some("Dockerfile"));
        assert!(info.is_source);
        assert!(!info.is_script);
        assert_eq!(info.file_type, "UTF-8 Unicode text");
    }

    /// A Makefile is plain text: no language, not source, not a script.
    #[test]
    fn test_classify_file_info_treats_makefile_as_text_not_source() {
        let info = classify_file_info(Path::new("Makefile"), b"all:\n\techo hi\n");

        assert_eq!(info.programming_language, None);
        assert!(info.is_text);
        assert!(!info.is_source);
        assert!(!info.is_script);
        assert_eq!(info.file_type, "UTF-8 Unicode text");
    }

    /// ZIP-signed `.egg` and `.nupkg` files are archives.
    #[test]
    fn test_classify_file_info_marks_supported_package_archives() {
        let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00";

        for package_name in ["demo.egg", "demo.nupkg"] {
            let info = classify_file_info(Path::new(package_name), zip_bytes);

            assert!(info.is_archive);
            assert_eq!(info.mime_type, "application/zip");
            assert_eq!(info.file_type, "Zip archive data");
        }
    }

    /// A PNG header is binary media, not text, archive or source.
    #[test]
    fn test_classify_file_info_marks_png_as_binary_media() {
        let png_header = b"\x89PNG\r\n\x1a\n\x00\x00\x00\x0dIHDR";

        let info = classify_file_info(Path::new("logo.png"), png_header);

        assert_eq!(info.mime_type, "image/png");
        assert_eq!(info.file_type, "PNG image data");
        assert!(info.is_binary);
        assert!(!info.is_text);
        assert!(info.is_media);
        assert!(!info.is_archive);
        assert!(!info.is_source);
    }

    /// An opaque byte blob is binary with no language.
    #[test]
    fn test_classify_file_info_marks_binary_blobs_as_binary() {
        let blob = [0, 159, 146, 150, 0, 1, 2, 3, 4, 5];

        let info = classify_file_info(Path::new("blob.bin"), &blob);

        assert!(info.is_binary);
        assert!(!info.is_text);
        assert!(!info.is_source);
        assert_eq!(info.programming_language, None);
    }

    /// YAML is text data, not source.
    #[test]
    fn test_classify_file_info_treats_yaml_as_text_not_source() {
        let info = classify_file_info(Path::new("config.yaml"), b"key: value\n");

        assert_eq!(info.programming_language, None);
        assert!(info.is_text);
        assert!(!info.is_source);
        assert_eq!(info.file_type, "YAML text data");
    }

    /// Gradle and Nix manifests are source; `.gitmodules` is configuration text.
    #[test]
    fn test_classify_file_info_classifies_common_build_manifests() {
        let gradle_info =
            classify_file_info(Path::new("build.gradle"), b"plugins { id 'java' }\n");
        let flake_info = classify_file_info(Path::new("flake.nix"), b"{ inputs, ... }: {}\n");
        let gitmodules_info = classify_file_info(
            Path::new(".gitmodules"),
            b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
        );

        assert_eq!(gradle_info.programming_language.as_deref(), Some("Groovy"));
        assert!(gradle_info.is_source);
        assert_eq!(gradle_info.mime_type, "text/plain");

        assert_eq!(flake_info.programming_language.as_deref(), Some("Nix"));
        assert!(flake_info.is_source);
        assert_eq!(flake_info.mime_type, "text/plain");

        assert_eq!(gitmodules_info.programming_language, None);
        assert!(gitmodules_info.is_text);
        assert!(!gitmodules_info.is_source);
        assert_eq!(gitmodules_info.file_type, "Git configuration text");
    }

    /// An extension-less file with a `node` shebang is a JavaScript script.
    #[test]
    fn test_classify_file_info_labels_javascript_shebang_scripts() {
        let script_body = b"#!/usr/bin/env node\nconsole.log('hello');\n";

        let info = classify_file_info(Path::new("bin/run"), script_body);

        assert_eq!(info.programming_language.as_deref(), Some("JavaScript"));
        assert!(info.is_script);
        assert_eq!(
            info.file_type,
            "javascript script, UTF-8 Unicode text executable"
        );
    }

    /// A Latin-1 encoded Python script gets the non-UTF-8 file type label.
    #[test]
    fn test_classify_file_info_uses_non_utf8_text_labels_for_latin1_scripts() {
        let script_body = b"# coding: latin-1\nprint(\"caf\xe9\")\n";

        let info = classify_file_info(Path::new("script.py"), script_body);

        assert_eq!(info.programming_language.as_deref(), Some("Python"));
        assert!(info.is_script);
        assert_eq!(info.file_type, "python script, text executable");
    }

    /// Table-driven check of language, `is_source` and `is_script` across
    /// shebang scripts, well-known file names and data formats.
    #[test]
    fn test_classify_file_info_preserves_language_detection_precedence_matrix() {
        struct Case {
            path: &'static str,
            bytes: &'static [u8],
            language: Option<&'static str>,
            is_source: bool,
            is_script: bool,
        }

        let cases = [
            Case {
                path: "bin/run",
                bytes: b"#!/usr/bin/env node\nconsole.log('hello');\n",
                language: Some("JavaScript"),
                is_source: true,
                is_script: true,
            },
            Case {
                path: "Dockerfile",
                bytes: b"FROM scratch\n",
                language: Some("Dockerfile"),
                is_source: true,
                is_script: false,
            },
            Case {
                path: "package.json",
                bytes: br#"{"name":"demo"}"#,
                language: None,
                is_source: false,
                is_script: false,
            },
            Case {
                path: "config.yaml",
                bytes: b"key: value\n",
                language: None,
                is_source: false,
                is_script: false,
            },
            Case {
                path: "Makefile",
                bytes: b"all:\n\techo hi\n",
                language: None,
                is_source: false,
                is_script: false,
            },
        ];

        for case in cases {
            let path = Path::new(case.path);
            let info = classify_file_info(path, case.bytes);

            assert_eq!(
                info.programming_language.as_deref(),
                case.language,
                "unexpected language for {}",
                path.display()
            );
            assert_eq!(
                info.is_source,
                case.is_source,
                "unexpected is_source for {}",
                path.display()
            );
            assert_eq!(
                info.is_script,
                case.is_script,
                "unexpected is_script for {}",
                path.display()
            );
        }
    }
}