1use std::borrow::Cow;
5use std::collections::BTreeSet;
6use std::fs;
7use std::io::{BufReader, Cursor, Read};
8use std::panic::{AssertUnwindSafe, catch_unwind};
9use std::path::Path;
10
11use chrono::{TimeZone, Utc};
12use file_format::{FileFormat, Kind as FileFormatKind};
13use flate2::read::ZlibDecoder;
14use glob::Pattern;
15use image::{ImageDecoder, ImageFormat, ImageReader};
16use mime_guess::from_path;
17use quick_xml::events::Event;
18use quick_xml::reader::Reader as XmlReader;
19
20use crate::parsers::windows_executable::extract_windows_executable_metadata_text;
21use crate::utils::font::extract_font_metadata_text;
22use crate::utils::language::detect_language;
23
/// Identifies which extraction strategy produced the text handed to detection.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtractedTextKind {
    /// No usable text was produced.
    None,
    /// The text was decoded from the file contents (this also covers RTF).
    Decoded,
    /// The text starts with metadata read from a font file.
    FontMetadata,
    /// The text was extracted from a PDF document.
    Pdf,
    /// The text consists of printable strings pulled out of binary data.
    BinaryStrings,
    /// The text was read from image metadata.
    ImageMetadata,
    /// The text is metadata read from a Windows executable.
    WindowsExecutableMetadata,
}
34
/// Summary of everything `classify_file_info` determines about a file.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileInfoClassification {
    /// Media (MIME) type chosen for the file.
    pub mime_type: String,
    /// Human-readable description of the file type.
    pub file_type: String,
    /// Detected programming language; only populated when `is_source` is true.
    pub programming_language: Option<String>,
    /// Whether the content is treated as binary.
    pub is_binary: bool,
    /// Whether the content is treated as text (the negation of `is_binary`).
    pub is_text: bool,
    /// Whether the file is an archive, compressed file or package.
    pub is_archive: bool,
    /// Whether the file is an image, audio or video file.
    pub is_media: bool,
    /// Whether the file is source code.
    pub is_source: bool,
    /// Whether the file is a script.
    pub is_script: bool,
}
47
/// Upper bound on image metadata values.
// NOTE(review): not referenced in this part of the file; presumably caps how many values are kept — confirm.
const MAX_IMAGE_METADATA_VALUES: usize = 64;

/// Upper bound, in bytes, on image metadata text (32 KiB).
// NOTE(review): not referenced in this part of the file — confirm where it is enforced.
const MAX_IMAGE_METADATA_TEXT_BYTES: usize = 32 * 1024;

/// Content counts as binary once control bytes exceed `len / this divisor`.
const BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR: usize = 10;

/// Size threshold (512 KiB) related to skipping large opaque binaries.
// NOTE(review): the consuming helper is outside this part of the file — confirm exact semantics.
const LARGE_OPAQUE_BINARY_SKIP_BYTES: usize = 512 * 1024;

/// Inputs larger than this (4 MiB) are never run through the JSON parser.
const JSON_VALIDATION_MAX_BYTES: usize = 4 * 1024 * 1024;

/// Lower-case extensions treated as plain text rather than source code.
const PLAIN_TEXT_EXTENSIONS: &[&str] = &[
    "rst", "rest", "md", "txt", "log", "json", "xml", "yaml", "yml", "toml", "ini",
];

/// Lower-case extensions that force a binary classification.
const BINARY_EXTENSIONS: &[&str] = &[
    "pyc", "pyo", "pgm", "pbm", "ppm", "mp3", "mp4", "mpeg", "mpg", "emf",
];

/// Lower-case extensions that mark a non-text file as an archive.
const ARCHIVE_EXTENSIONS: &[&str] = &[
    "zip", "jar", "war", "ear", "tar", "gz", "tgz", "bz2", "xz", "7z", "rar", "apk", "deb", "rpm",
    "whl", "crate", "egg", "gem", "nupkg", "sqs", "squashfs",
];
63
64pub fn get_creation_date(metadata: &fs::Metadata) -> Option<String> {
66 metadata.modified().ok().map(|time: std::time::SystemTime| {
67 let seconds_since_epoch = time
68 .duration_since(std::time::UNIX_EPOCH)
69 .unwrap()
70 .as_secs() as i64;
71
72 Utc.timestamp_opt(seconds_since_epoch, 0)
73 .single()
74 .unwrap_or_else(Utc::now)
75 .format("%Y-%m-%d")
76 .to_string()
77 })
78}
79
80pub fn is_path_excluded(path: &Path, exclude_patterns: &[Pattern]) -> bool {
82 let path_str = path.to_string_lossy();
83 let file_name = path
84 .file_name()
85 .map(|name| name.to_string_lossy())
86 .unwrap_or_default();
87
88 for pattern in exclude_patterns {
89 if pattern.matches(&path_str) {
91 return true;
92 }
93
94 if pattern.matches(&file_name) {
96 return true;
97 }
98 }
99
100 false
101}
102
103pub fn decode_bytes_to_string(bytes: &[u8]) -> String {
109 match String::from_utf8(bytes.to_vec()) {
110 Ok(s) => s,
111 Err(e) => {
112 let bytes = e.into_bytes();
113 if has_binary_control_chars(&bytes) {
114 return String::new();
115 }
116 bytes.iter().map(|&b| b as char).collect()
117 }
118 }
119}
120
121pub fn extract_text_for_detection(path: &Path, bytes: &[u8]) -> (String, ExtractedTextKind) {
122 let (text, kind, _) = extract_text_for_detection_with_diagnostics(path, bytes);
123 (text, kind)
124}
125
126pub(crate) fn augment_license_detection_text<'a>(path: &Path, text: &'a str) -> Cow<'a, str> {
127 let Some(extension) = path.extension().and_then(|ext| ext.to_str()) else {
128 return Cow::Borrowed(text);
129 };
130 if !matches!(
131 extension.to_ascii_lowercase().as_str(),
132 "md" | "markdown" | "html" | "htm"
133 ) {
134 return Cow::Borrowed(text);
135 }
136
137 let mut hints = Vec::new();
138 if text.contains("CC BY 4.0") || text.contains("creativecommons.org/licenses/by/4.0") {
139 hints.push("Creative Commons Attribution 4.0 International License".to_string());
140 }
141 if text.contains("Apache License (Version 2.0)") || text.contains("Apache License, Version 2.0")
142 {
143 hints.push(
144 "Licensed under the Apache License, Version 2.0. http://www.apache.org/licenses/LICENSE-2.0"
145 .to_string(),
146 );
147 }
148
149 hints.extend(extract_shields_license_badge_hints(text));
150
151 if hints.is_empty() {
152 Cow::Borrowed(text)
153 } else {
154 let mut augmented =
155 String::with_capacity(text.len() + hints.iter().map(String::len).sum::<usize>() + 8);
156 augmented.push_str(text);
157 augmented.push_str("\n\n");
158 for (index, hint) in hints.into_iter().enumerate() {
159 if index > 0 {
160 augmented.push('\n');
161 }
162 augmented.push_str(&hint);
163 }
164 Cow::Owned(augmented)
165 }
166}
167
168fn extract_shields_license_badge_hints(text: &str) -> Vec<String> {
169 let mut hints = Vec::new();
170 let mut rest = text;
171 let needle = "img.shields.io/badge/license-";
172
173 while let Some(index) = rest.find(needle) {
174 let start = index + needle.len();
175 let suffix = &rest[start..];
176 let end = suffix
177 .find([')', ']', '"', '\'', ' ', '\n'])
178 .unwrap_or(suffix.len());
179 let badge = &suffix[..end];
180 let Some(badge) = badge.strip_suffix(".svg") else {
181 rest = &suffix[end..];
182 continue;
183 };
184
185 let mut segments: Vec<_> = badge
186 .split('-')
187 .filter(|segment| !segment.is_empty())
188 .collect();
189 if segments.len() < 2 {
190 rest = &suffix[end..];
191 continue;
192 }
193 segments.pop();
194 let candidate = segments.join("-").replace("%20", " ").replace('_', "-");
195 if !candidate.is_empty() {
196 hints.push(canonical_shields_license_hint(&candidate));
197 }
198
199 rest = &suffix[end..];
200 }
201
202 hints.sort();
203 hints.dedup();
204 hints
205}
206
/// Turns a badge license name into a phrase suitable for license detection.
///
/// Surrounding whitespace is ignored. `MIT` and the two Apache 2.0 spellings
/// map to fixed phrases; any other name gets ` License` appended.
fn canonical_shields_license_hint(candidate: &str) -> String {
    let name = candidate.trim();
    if name == "MIT" {
        "The MIT License".to_string()
    } else if name == "Apache-2.0" || name == "Apache 2.0" {
        "Apache License 2.0".to_string()
    } else {
        format!("{name} License")
    }
}
214
215pub(crate) fn extract_text_for_detection_with_diagnostics(
216 path: &Path,
217 bytes: &[u8],
218) -> (String, ExtractedTextKind, Option<String>) {
219 let ext = path
220 .extension()
221 .and_then(|e| e.to_str())
222 .map(|s| s.to_ascii_lowercase());
223 let detected_format = detect_file_format(bytes);
224
225 if looks_like_rtf(bytes, ext.as_deref()) {
226 let text = extract_rtf_text(bytes);
227 return if text.trim().is_empty() {
228 (String::new(), ExtractedTextKind::None, None)
229 } else {
230 (text, ExtractedTextKind::Decoded, None)
231 };
232 }
233
234 if looks_like_pdf(bytes) || detected_format.short_name() == Some("PDF") {
235 let (text, scan_error) = extract_pdf_text(path, bytes);
236 return if text.is_empty() {
237 (String::new(), ExtractedTextKind::None, scan_error)
238 } else {
239 (text, ExtractedTextKind::Pdf, None)
240 };
241 }
242
243 if let Some(format) = supported_image_metadata_format(ext.as_deref(), detected_format) {
244 let text = extract_image_metadata_text(bytes, format);
245 return if text.is_empty() {
246 if is_supported_image_container(bytes, format) {
247 (String::new(), ExtractedTextKind::None, None)
248 } else {
249 let decoded = decode_bytes_to_string(bytes);
250 if decoded.is_empty() {
251 (String::new(), ExtractedTextKind::None, None)
252 } else {
253 (decoded, ExtractedTextKind::Decoded, None)
254 }
255 }
256 } else {
257 (text, ExtractedTextKind::ImageMetadata, None)
258 };
259 }
260
261 if let Some(text) = extract_font_metadata_text(path, bytes) {
262 let strings = extract_printable_strings(bytes);
263 let combined = if strings.is_empty() {
264 text
265 } else {
266 combine_extracted_text_fragments(Some(text), strings)
267 };
268 return (combined, ExtractedTextKind::FontMetadata, None);
269 }
270
271 let windows_executable_metadata_text = extract_windows_executable_metadata_text(bytes);
272 let large_opaque_binary = windows_executable_metadata_text.is_none()
273 && is_large_opaque_binary_candidate(bytes, detected_format);
274
275 if should_skip_large_opaque_binary_text_extraction(path, bytes, detected_format) {
276 return windows_metadata_or_empty_result(windows_executable_metadata_text);
277 }
278
279 if should_skip_binary_string_extraction(path, bytes, detected_format) {
280 return (String::new(), ExtractedTextKind::None, None);
281 }
282
283 if !large_opaque_binary {
284 let decoded = decode_bytes_to_string(bytes);
285 if !decoded.is_empty() {
286 let combined =
287 combine_extracted_text_fragments(windows_executable_metadata_text, decoded);
288 return (combined, ExtractedTextKind::Decoded, None);
289 }
290 }
291
292 let text = if large_opaque_binary {
293 extract_sampled_printable_strings(bytes)
294 } else {
295 extract_printable_strings(bytes)
296 };
297 if text.is_empty() {
298 windows_metadata_or_empty_result(windows_executable_metadata_text)
299 } else {
300 (
301 combine_extracted_text_fragments(windows_executable_metadata_text, text),
302 ExtractedTextKind::BinaryStrings,
303 None,
304 )
305 }
306}
307
/// Joins an optional prefix and a suffix with a single newline.
///
/// A missing or empty prefix yields the suffix alone; an empty suffix yields
/// the prefix alone. No separator is added unless both parts are non-empty.
fn combine_extracted_text_fragments(prefix: Option<String>, suffix: String) -> String {
    let Some(head) = prefix.filter(|head| !head.is_empty()) else {
        return suffix;
    };

    if suffix.is_empty() {
        head
    } else {
        format!("{head}\n{suffix}")
    }
}
315
316fn windows_metadata_or_empty_result(
317 windows_executable_metadata_text: Option<String>,
318) -> (String, ExtractedTextKind, Option<String>) {
319 if let Some(metadata_text) = windows_executable_metadata_text {
320 (
321 metadata_text,
322 ExtractedTextKind::WindowsExecutableMetadata,
323 None,
324 )
325 } else {
326 (String::new(), ExtractedTextKind::None, None)
327 }
328}
329
330pub fn classify_file_info(path: &Path, bytes: &[u8]) -> FileInfoClassification {
331 let detected_format = detect_file_format(bytes);
332 let detected_language = detect_language(path, bytes);
333 let is_binary = detect_is_binary(path, bytes, detected_format, detected_language.as_deref());
334 let is_text = !is_binary;
335 let mime_type = detect_mime_type(path, bytes, detected_format, detected_language.as_deref());
336 let is_archive = detect_is_archive(path, bytes, &mime_type, is_text, detected_format);
337 let is_media = detect_is_media(path, bytes, &mime_type, detected_format);
338 let is_script = detect_is_script(path, bytes, detected_language.as_deref(), is_text);
339 let is_source = detect_is_source(path, detected_language.as_deref(), is_text, is_script);
340 let programming_language = is_source.then(|| detected_language.clone()).flatten();
341 let file_type = detect_file_type(
342 path,
343 bytes,
344 detected_format,
345 &mime_type,
346 programming_language.as_deref(),
347 is_binary,
348 is_text,
349 is_archive,
350 is_media,
351 is_script,
352 );
353
354 FileInfoClassification {
355 mime_type,
356 file_type,
357 programming_language,
358 is_binary,
359 is_text,
360 is_archive,
361 is_media,
362 is_source,
363 is_script,
364 }
365}
366
367fn detect_file_format(bytes: &[u8]) -> FileFormat {
368 FileFormat::from_reader(Cursor::new(bytes)).unwrap_or(FileFormat::ArbitraryBinaryData)
369}
370
/// Returns `true` when `bytes` is well-formed UTF-8 (the empty slice counts).
fn is_utf8_text(bytes: &[u8]) -> bool {
    let validation = std::str::from_utf8(bytes);
    validation.is_ok()
}
374
/// Decodes UTF-16 text that starts with a byte-order mark.
///
/// Returns `None` when the input has an odd length, lacks a little- or
/// big-endian BOM, has no content after the BOM, or contains an unpaired
/// surrogate. The BOM itself is not part of the returned string.
fn decode_utf16_bom_text(bytes: &[u8]) -> Option<String> {
    // UTF-16 code units are two bytes wide; an odd length can never be valid.
    if bytes.len() % 2 != 0 {
        return None;
    }

    let little_endian = match bytes {
        [0xFF, 0xFE, ..] => true,
        [0xFE, 0xFF, ..] => false,
        _ => return None,
    };

    let payload = &bytes[2..];
    if payload.is_empty() {
        return None;
    }

    let code_units = payload.chunks_exact(2).map(|pair| {
        let raw = [pair[0], pair[1]];
        if little_endian {
            u16::from_le_bytes(raw)
        } else {
            u16::from_be_bytes(raw)
        }
    });

    std::char::decode_utf16(code_units)
        .collect::<Result<String, _>>()
        .ok()
}
405
406fn has_binary_control_chars(bytes: &[u8]) -> bool {
407 let control_count = bytes
408 .iter()
409 .filter(|&&b| b < 0x09 || (b > 0x0D && b < 0x20))
410 .count();
411 control_count > bytes.len() / BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR
412}
413
414fn has_decodable_text(bytes: &[u8]) -> bool {
415 bytes.is_empty()
416 || is_utf8_text(bytes)
417 || decode_utf16_bom_text(bytes).is_some()
418 || !has_binary_control_chars(bytes)
419}
420
421fn looks_like_textual_bytes(bytes: &[u8]) -> bool {
422 if bytes.is_empty() || is_utf8_text(bytes) {
423 return true;
424 }
425 if let Some(decoded) = decode_utf16_bom_text(bytes) {
426 return decoded
427 .chars()
428 .any(|ch| !ch.is_control() || matches!(ch, '\n' | '\r' | '\t'));
429 }
430
431 let printable_count = bytes
432 .iter()
433 .filter(|&&b| matches!(b, b'\n' | b'\r' | b'\t') || (0x20..=0x7e).contains(&b))
434 .count();
435 printable_count * 2 >= bytes.len()
436}
437
/// Returns `true` for media types whose payload is text.
///
/// Covers everything under `text/`, plain JSON and XML, and any structured
/// syntax type ending in `+json` or `+xml`.
fn is_textual_media_type(media_type: &str) -> bool {
    if media_type.starts_with("text/") {
        return true;
    }
    if matches!(
        media_type,
        "application/json" | "application/xml" | "text/xml"
    ) {
        return true;
    }
    ["+json", "+xml"]
        .iter()
        .any(|suffix| media_type.ends_with(*suffix))
}
447
448fn is_textual_format(detected_format: FileFormat) -> bool {
449 matches!(detected_format, FileFormat::Empty | FileFormat::PlainText)
450 || is_textual_media_type(detected_format.media_type())
451}
452
453fn is_known_binary_format(detected_format: FileFormat) -> bool {
454 !matches!(detected_format, FileFormat::ArbitraryBinaryData)
455 && !is_textual_format(detected_format)
456}
457
458pub fn detect_mime_type(
459 path: &Path,
460 bytes: &[u8],
461 detected_format: FileFormat,
462 programming_language: Option<&str>,
463) -> String {
464 if bytes.is_empty() {
465 return "inode/x-empty".to_string();
466 }
467
468 if lower_extension(path).as_deref() == Some("json") {
469 if let Some(is_binary) = json_binary_override(bytes) {
470 if is_binary {
471 return "application/octet-stream".to_string();
472 }
473 if has_valid_json_text(bytes) {
474 return "application/json".to_string();
475 }
476 return "text/plain".to_string();
477 }
478 if has_valid_json_text(bytes) {
479 return "application/json".to_string();
480 }
481 if has_decodable_text(bytes) && looks_like_textual_bytes(bytes) {
482 return "text/plain".to_string();
483 }
484 return "application/octet-stream".to_string();
485 }
486
487 if is_zip_archive(bytes) {
488 return detect_zip_like_mime(path);
489 }
490
491 if looks_like_deb(bytes, path) {
492 return "application/vnd.debian.binary-package".to_string();
493 }
494
495 if looks_like_rpm(bytes, path) {
496 return "application/x-rpm".to_string();
497 }
498
499 let guessed_mime = from_path(path)
500 .first_or_octet_stream()
501 .essence_str()
502 .to_string();
503
504 let mime_type = match detected_format {
505 FileFormat::Empty => "inode/x-empty".to_string(),
506 FileFormat::PlainText => {
507 if guessed_mime == "application/octet-stream" || guessed_mime.starts_with("video/") {
508 "text/plain".to_string()
509 } else {
510 guessed_mime.clone()
511 }
512 }
513 _ => {
514 let detected_mime = detected_format.media_type();
515 if detected_mime == "application/octet-stream"
516 && guessed_mime != "application/octet-stream"
517 {
518 guessed_mime.clone()
519 } else {
520 detected_mime.to_string()
521 }
522 }
523 };
524
525 normalize_mime_type(path, bytes, programming_language, &mime_type)
526}
527
528fn normalize_mime_type(
529 path: &Path,
530 bytes: &[u8],
531 programming_language: Option<&str>,
532 mime_type: &str,
533) -> String {
534 if should_prefer_text_mime(path, bytes, programming_language, mime_type) {
535 return "text/plain".to_string();
536 }
537
538 mime_type.to_string()
539}
540
541fn should_prefer_text_mime(
542 path: &Path,
543 bytes: &[u8],
544 programming_language: Option<&str>,
545 mime_type: &str,
546) -> bool {
547 has_decodable_text(bytes)
548 && looks_like_textual_bytes(bytes)
549 && is_textual_source_candidate(path, programming_language)
550 && (mime_type.starts_with("video/") || mime_type == "application/octet-stream")
551}
552
553fn has_valid_json_text(bytes: &[u8]) -> bool {
554 if bytes.len() > JSON_VALIDATION_MAX_BYTES {
555 return false;
556 }
557
558 serde_json::from_slice::<serde_json::Value>(bytes).is_ok()
559 || decode_utf16_bom_text(bytes)
560 .and_then(|text| serde_json::from_str::<serde_json::Value>(&text).ok())
561 .is_some()
562}
563
/// Recognises content shaped like a single-string JSON array (`["…"]`).
///
/// Requires at least eight bytes, the `["` / `"]` wrapper, and no `0x00` or
/// `0xFF` bytes anywhere. The content is not checked for being valid JSON.
fn is_wrapped_invalid_json_string_text(bytes: &[u8]) -> bool {
    if bytes.len() < 8 {
        return false;
    }

    let wrapped = bytes.starts_with(b"[\"") && bytes.ends_with(b"\"]");
    wrapped && !bytes.iter().any(|&byte| byte == 0 || byte == 0xFF)
}
571
572fn json_binary_override(bytes: &[u8]) -> Option<bool> {
573 if has_valid_json_text(bytes) || decode_utf16_bom_text(bytes).is_some() {
574 return Some(false);
575 }
576
577 if bytes.contains(&0) {
578 return Some(true);
579 }
580
581 if bytes.contains(&0xFF) && (bytes.len() <= 5 || bytes.len() > 1024) {
582 return Some(true);
583 }
584
585 if is_wrapped_invalid_json_string_text(bytes) {
586 return Some(false);
587 }
588
589 None
590}
591
592fn detect_is_binary(
593 path: &Path,
594 bytes: &[u8],
595 detected_format: FileFormat,
596 programming_language: Option<&str>,
597) -> bool {
598 if lower_extension(path).as_deref() == Some("json")
599 && let Some(is_binary) = json_binary_override(bytes)
600 {
601 return is_binary;
602 }
603
604 if is_textual_format(detected_format) {
605 return false;
606 }
607
608 if lower_extension(path)
609 .as_deref()
610 .is_some_and(|ext| BINARY_EXTENSIONS.contains(&ext))
611 {
612 return true;
613 }
614
615 if should_treat_binary_bytes_as_text(path, bytes, programming_language) {
616 return false;
617 }
618
619 has_binary_control_chars(bytes)
620 || is_known_binary_format(detected_format)
621 || (matches!(detected_format, FileFormat::ArbitraryBinaryData)
622 && !looks_like_textual_bytes(bytes))
623}
624
625fn should_treat_binary_bytes_as_text(
626 path: &Path,
627 bytes: &[u8],
628 programming_language: Option<&str>,
629) -> bool {
630 has_decodable_text(bytes)
631 && looks_like_textual_bytes(bytes)
632 && (bytes.starts_with(b"#!") || is_textual_source_candidate(path, programming_language))
633}
634
635fn detect_is_archive(
636 path: &Path,
637 bytes: &[u8],
638 mime_type: &str,
639 is_text: bool,
640 detected_format: FileFormat,
641) -> bool {
642 if is_text {
643 return false;
644 }
645
646 lower_extension(path)
647 .as_deref()
648 .is_some_and(|ext| ARCHIVE_EXTENSIONS.contains(&ext))
649 || matches!(
650 detected_format.kind(),
651 FileFormatKind::Archive | FileFormatKind::Compressed | FileFormatKind::Package
652 )
653 || is_zip_archive(bytes)
654 || looks_like_gzip(bytes)
655 || looks_like_bzip2(bytes)
656 || looks_like_xz(bytes)
657 || looks_like_deb(bytes, path)
658 || looks_like_rpm(bytes, path)
659 || looks_like_squashfs(bytes, path)
660 || mime_type.contains("zip")
661 || mime_type.contains("compressed")
662 || mime_type.contains("tar")
663 || mime_type.contains("x-rpm")
664 || mime_type.contains("debian")
665}
666
667fn detect_is_media(
668 path: &Path,
669 bytes: &[u8],
670 mime_type: &str,
671 detected_format: FileFormat,
672) -> bool {
673 media_mime_from_content(bytes).is_some()
674 || matches!(
675 detected_format.kind(),
676 FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
677 )
678 || mime_type.starts_with("image/")
679 || mime_type.starts_with("audio/")
680 || mime_type.starts_with("video/")
681 || (mime_type == "application/octet-stream"
682 && lower_extension(path).as_deref() == Some("tga")
683 && !has_binary_control_chars(bytes))
684}
685
686fn detect_is_script(
687 path: &Path,
688 bytes: &[u8],
689 programming_language: Option<&str>,
690 is_text: bool,
691) -> bool {
692 if !is_text || is_makefile(path) {
693 return false;
694 }
695
696 bytes.starts_with(b"#!")
697 || lower_extension(path).as_deref().is_some_and(|ext| {
698 matches!(
699 ext,
700 "sh" | "bash" | "zsh" | "fish" | "ksh" | "ps1" | "psm1" | "psd1" | "awk"
701 )
702 })
703 || matches!(
704 programming_language,
705 Some(
706 "Shell"
707 | "Bash"
708 | "Zsh"
709 | "Fish"
710 | "Ksh"
711 | "Python"
712 | "Ruby"
713 | "Perl"
714 | "PHP"
715 | "PowerShell"
716 | "Awk"
717 )
718 )
719}
720
721fn detect_is_source(
722 path: &Path,
723 programming_language: Option<&str>,
724 is_text: bool,
725 is_script: bool,
726) -> bool {
727 if !is_text || is_plain_text(path) || is_makefile(path) || is_source_map(path) {
728 return false;
729 }
730
731 if is_c_like_source(path) || is_java_like_source(path) {
732 return true;
733 }
734
735 programming_language.is_some() || is_script
736}
737
738#[allow(clippy::too_many_arguments)]
739fn detect_file_type(
740 path: &Path,
741 bytes: &[u8],
742 detected_format: FileFormat,
743 mime_type: &str,
744 programming_language: Option<&str>,
745 is_binary: bool,
746 is_text: bool,
747 is_archive: bool,
748 is_media: bool,
749 is_script: bool,
750) -> String {
751 if bytes.is_empty() {
752 return "empty".to_string();
753 }
754
755 if looks_like_pdf(bytes) {
756 return "PDF document".to_string();
757 }
758
759 if let Some(file_type) = media_file_type_from_content(bytes) {
760 return file_type.to_string();
761 }
762
763 if is_archive {
764 return archive_file_type(path, bytes, detected_format);
765 }
766
767 if is_script {
768 return script_file_type(programming_language, bytes);
769 }
770
771 if is_text {
772 if lower_extension(path).as_deref() == Some("json") {
773 if has_valid_json_text(bytes) {
774 return "JSON text data".to_string();
775 }
776 return text_file_type(bytes);
777 }
778 if lower_extension(path).as_deref() == Some("xml") {
779 return "XML text data".to_string();
780 }
781 if matches!(lower_extension(path).as_deref(), Some("yaml" | "yml")) {
782 return "YAML text data".to_string();
783 }
784 if lower_extension(path).as_deref() == Some("toml") {
785 return "TOML text data".to_string();
786 }
787 if matches!(
788 lower_extension(path).as_deref(),
789 Some("ini" | "cfg" | "conf")
790 ) {
791 return "INI text data".to_string();
792 }
793 if matches!(lower_file_name(path).as_str(), ".gitmodules" | ".gitconfig") {
794 return "Git configuration text".to_string();
795 }
796 if matches!(lower_extension(path).as_deref(), Some("md" | "markdown")) {
797 return text_file_type(bytes);
798 }
799 if programming_language.is_some() && !is_media {
800 return source_file_type(programming_language, bytes);
801 }
802 return text_file_type(bytes);
803 }
804
805 if let Some(file_type) = format_based_file_type(detected_format) {
806 return file_type;
807 }
808
809 if is_binary && mime_type == "application/octet-stream" {
810 return "data".to_string();
811 }
812
813 mime_type.to_string()
814}
815
816fn is_textual_source_candidate(path: &Path, programming_language: Option<&str>) -> bool {
817 if matches!(programming_language, Some(language) if is_source_like_language(language)) {
818 return true;
819 }
820
821 if matches!(
822 lower_file_name(path).as_str(),
823 "dockerfile"
824 | "containerfile"
825 | "containerfile.core"
826 | "apkbuild"
827 | "podfile"
828 | "jamfile"
829 | "jamroot"
830 | "meson.build"
831 | "build"
832 | "workspace"
833 | "buck"
834 | "default.nix"
835 | "flake.nix"
836 | "shell.nix"
837 ) {
838 return true;
839 }
840
841 path.extension()
842 .and_then(|ext| ext.to_str())
843 .is_some_and(|ext| {
844 matches!(
845 ext.to_ascii_lowercase().as_str(),
846 "rs" | "py"
847 | "js"
848 | "mjs"
849 | "cjs"
850 | "jsx"
851 | "ts"
852 | "mts"
853 | "cts"
854 | "tsx"
855 | "c"
856 | "cpp"
857 | "cc"
858 | "cxx"
859 | "h"
860 | "hpp"
861 | "m"
862 | "mm"
863 | "s"
864 | "asm"
865 | "java"
866 | "go"
867 | "rb"
868 | "php"
869 | "pl"
870 | "swift"
871 | "sh"
872 | "bash"
873 | "zsh"
874 | "fish"
875 | "ksh"
876 | "ps1"
877 | "psm1"
878 | "psd1"
879 | "awk"
880 | "kt"
881 | "kts"
882 | "dart"
883 | "scala"
884 | "groovy"
885 | "gradle"
886 | "gvy"
887 | "gy"
888 | "gsh"
889 | "cs"
890 | "fs"
891 | "fsx"
892 | "r"
893 | "lua"
894 | "jl"
895 | "ex"
896 | "exs"
897 | "clj"
898 | "cljs"
899 | "cljc"
900 | "hs"
901 | "erl"
902 | "nix"
903 | "zig"
904 | "bzl"
905 | "bazel"
906 | "star"
907 | "sky"
908 | "ml"
909 | "mli"
910 | "tex"
911 )
912 })
913}
914
/// Returns `true` when `language` is one of the language names treated as
/// source code. The comparison is exact and case-sensitive.
fn is_source_like_language(language: &str) -> bool {
    const SOURCE_LANGUAGES: &[&str] = &[
        "Rust",
        "Python",
        "JavaScript",
        "TypeScript",
        "JavaScript/TypeScript",
        "C",
        "C++",
        "Objective-C",
        "Objective-C++",
        "GAS",
        "Java",
        "Go",
        "Ruby",
        "PHP",
        "Perl",
        "Swift",
        "Shell",
        "PowerShell",
        "Awk",
        "Kotlin",
        "Dart",
        "Scala",
        "C#",
        "F#",
        "R",
        "Lua",
        "Julia",
        "Elixir",
        "Clojure",
        "Haskell",
        "Erlang",
        "Groovy",
        "Nix",
        "Zig",
        "Starlark",
        "OCaml",
        "Meson",
        "TeX",
        "Dockerfile",
        "Makefile",
        "Jamfile",
    ];

    SOURCE_LANGUAGES.contains(&language)
}
961
/// Returns the path's extension as `&str`, or `None` when there is no
/// extension or it is not valid UTF-8.
fn extension(path: &Path) -> Option<&str> {
    let raw = path.extension()?;
    raw.to_str()
}
965
966fn lower_extension(path: &Path) -> Option<String> {
967 extension(path).map(|ext| ext.to_ascii_lowercase())
968}
969
/// Returns the final path component lower-cased (ASCII only).
///
/// Yields an empty string when the path has no final component or the name is
/// not valid UTF-8.
fn lower_file_name(path: &Path) -> String {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name.to_ascii_lowercase(),
        None => String::new(),
    }
}
976
977fn is_plain_text(path: &Path) -> bool {
978 lower_extension(path)
979 .as_deref()
980 .is_some_and(|ext| PLAIN_TEXT_EXTENSIONS.contains(&ext))
981}
982
983fn is_makefile(path: &Path) -> bool {
984 matches!(lower_file_name(path).as_str(), "makefile" | "makefile.inc")
985}
986
/// Returns `true` for JavaScript and CSS source maps, i.e. paths ending in
/// `.js.map` or `.css.map` (case-insensitive).
fn is_source_map(path: &Path) -> bool {
    let lowered = path.to_string_lossy().to_ascii_lowercase();
    [".js.map", ".css.map"]
        .iter()
        .any(|suffix| lowered.ends_with(*suffix))
}
991
992fn is_c_like_source(path: &Path) -> bool {
993 lower_extension(path).as_deref().is_some_and(|ext| {
994 matches!(
995 ext,
996 "c" | "cc"
997 | "cp"
998 | "cpp"
999 | "cxx"
1000 | "c++"
1001 | "h"
1002 | "hh"
1003 | "hpp"
1004 | "hxx"
1005 | "h++"
1006 | "i"
1007 | "ii"
1008 | "m"
1009 | "s"
1010 | "asm"
1011 )
1012 })
1013}
1014
1015fn is_java_like_source(path: &Path) -> bool {
1016 lower_extension(path)
1017 .as_deref()
1018 .is_some_and(|ext| matches!(ext, "java" | "aj" | "jad" | "ajt"))
1019}
1020
1021fn format_based_file_type(detected_format: FileFormat) -> Option<String> {
1022 match detected_format {
1023 FileFormat::ArbitraryBinaryData | FileFormat::Empty | FileFormat::PlainText => None,
1024 format if format.short_name() == Some("PDF") => Some("PDF document".to_string()),
1025 format => Some(match format.kind() {
1026 FileFormatKind::Image => short_name_or_name(&format, "image data"),
1027 FileFormatKind::Audio => short_name_or_name(&format, "audio data"),
1028 FileFormatKind::Video => short_name_or_name(&format, "video data"),
1029 _ => format.name().to_string(),
1030 }),
1031 }
1032}
1033
1034fn short_name_or_name(format: &FileFormat, suffix: &str) -> String {
1035 format
1036 .short_name()
1037 .map(|short_name| format!("{short_name} {suffix}"))
1038 .unwrap_or_else(|| format!("{} {suffix}", format.name()))
1039}
1040
1041fn detect_zip_like_mime(path: &Path) -> String {
1042 match extension(path).map(|ext| ext.to_ascii_lowercase()) {
1043 Some(ext) if ext == "apk" => "application/vnd.android.package-archive".to_string(),
1044 Some(ext) if matches!(ext.as_str(), "jar" | "war" | "ear") => {
1045 "application/java-archive".to_string()
1046 }
1047 _ => "application/zip".to_string(),
1048 }
1049}
1050
/// Identifies PNG, JPEG, TIFF and WebP images by their leading bytes and
/// returns the matching MIME type, or `None` for anything else.
fn media_mime_from_content(bytes: &[u8]) -> Option<&'static str> {
    if bytes.starts_with(b"\x89PNG\r\n\x1a\n") {
        return Some("image/png");
    }
    if bytes.starts_with(&[0xff, 0xd8, 0xff]) {
        return Some("image/jpeg");
    }

    // TIFF comes in a little-endian ("II") and a big-endian ("MM") flavour.
    let is_tiff = bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a");
    if is_tiff {
        return Some("image/tiff");
    }

    // WebP is a RIFF container whose form type at offset 8 is "WEBP".
    let is_webp = bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]);
    is_webp.then_some("image/webp")
}
1064
/// Identifies PNG, JPEG, TIFF and WebP images by their leading bytes and
/// returns a human-readable description, or `None` for anything else.
fn media_file_type_from_content(bytes: &[u8]) -> Option<&'static str> {
    if bytes.starts_with(b"\x89PNG\r\n\x1a\n") {
        return Some("PNG image data");
    }
    if bytes.starts_with(&[0xff, 0xd8, 0xff]) {
        return Some("JPEG image data");
    }

    // TIFF comes in a little-endian ("II") and a big-endian ("MM") flavour.
    let is_tiff = bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a");
    if is_tiff {
        return Some("TIFF image data");
    }

    // WebP is a RIFF container whose form type at offset 8 is "WEBP".
    let is_webp = bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]);
    is_webp.then_some("WebP image data")
}
1078
/// Returns `true` when the content begins with the `%PDF-` header.
fn looks_like_pdf(bytes: &[u8]) -> bool {
    matches!(bytes, [b'%', b'P', b'D', b'F', b'-', ..])
}
1082
/// Returns `true` when the file is RTF, judged by an `rtf` extension (expected
/// already lower-cased) or by a leading `{\rtf` signature.
fn looks_like_rtf(bytes: &[u8], ext: Option<&str>) -> bool {
    matches!(ext, Some("rtf")) || bytes.starts_with(b"{\\rtf")
}
1086
1087fn extract_rtf_text(bytes: &[u8]) -> String {
1088 let text = String::from_utf8_lossy(bytes);
1089 let chars: Vec<char> = text.chars().collect();
1090 let mut output = String::new();
1091 let mut index = 0usize;
1092
1093 while index < chars.len() {
1094 match chars[index] {
1095 '{' | '}' => {
1096 index += 1;
1097 }
1098 '\\' => {
1099 index += 1;
1100 if index >= chars.len() {
1101 break;
1102 }
1103
1104 match chars[index] {
1105 '\\' | '{' | '}' => {
1106 output.push(chars[index]);
1107 index += 1;
1108 }
1109 '\'' => {
1110 if index + 2 < chars.len() {
1111 let hex = [chars[index + 1], chars[index + 2]];
1112 let hex: String = hex.iter().collect();
1113 if let Ok(value) = u8::from_str_radix(&hex, 16) {
1114 output.push(value as char);
1115 index += 3;
1116 continue;
1117 }
1118 }
1119 index += 1;
1120 }
1121 control if control.is_ascii_alphabetic() => {
1122 let start = index;
1123 while index < chars.len() && chars[index].is_ascii_alphabetic() {
1124 index += 1;
1125 }
1126 let control_word: String = chars[start..index].iter().collect();
1127
1128 let number_start = index;
1129 if index < chars.len()
1130 && (chars[index] == '-' || chars[index].is_ascii_digit())
1131 {
1132 index += 1;
1133 while index < chars.len() && chars[index].is_ascii_digit() {
1134 index += 1;
1135 }
1136 }
1137 let parameter: String = chars[number_start..index].iter().collect();
1138
1139 if index < chars.len() && chars[index] == ' ' {
1140 index += 1;
1141 }
1142
1143 match control_word.as_str() {
1144 "par" | "line" => output.push('\n'),
1145 "tab" => output.push('\t'),
1146 "emdash" => output.push('—'),
1147 "endash" => output.push('–'),
1148 "bullet" => output.push('•'),
1149 "lquote" | "rquote" => output.push('\''),
1150 "ldblquote" | "rdblquote" => output.push('"'),
1151 "u" => {
1152 if let Ok(codepoint) = parameter.parse::<i32>() {
1153 let normalized = if codepoint < 0 {
1154 codepoint + 65_536
1155 } else {
1156 codepoint
1157 };
1158 if let Ok(normalized) = u32::try_from(normalized)
1159 && let Some(ch) = char::from_u32(normalized)
1160 {
1161 output.push(ch);
1162 }
1163 }
1164
1165 if index < chars.len()
1166 && !matches!(chars[index], '\\' | '{' | '}' | '\n' | '\r')
1167 {
1168 index += 1;
1169 }
1170 }
1171 _ => {}
1172 }
1173 }
1174 _ => {
1175 index += 1;
1176 }
1177 }
1178 }
1179 ch => {
1180 output.push(ch);
1181 index += 1;
1182 }
1183 }
1184 }
1185
1186 output
1187 .replace(['\r', '\u{0c}'], "\n")
1188 .lines()
1189 .map(str::trim_end)
1190 .collect::<Vec<_>>()
1191 .join("\n")
1192}
1193
/// Returns `true` when the content begins with the gzip magic bytes `1F 8B`.
fn looks_like_gzip(bytes: &[u8]) -> bool {
    matches!(bytes, [0x1f, 0x8b, ..])
}
1197
/// Returns `true` when the content begins with the bzip2 signature `BZh`.
fn looks_like_bzip2(bytes: &[u8]) -> bool {
    matches!(bytes, [b'B', b'Z', b'h', ..])
}
1201
/// Returns `true` when the content begins with the six-byte XZ signature
/// `FD 37 7A 58 5A 00`.
fn looks_like_xz(bytes: &[u8]) -> bool {
    matches!(bytes, [0xfd, b'7', b'z', b'X', b'Z', 0x00, ..])
}
1205
1206fn looks_like_deb(bytes: &[u8], path: &Path) -> bool {
1207 lower_extension(path).as_deref() == Some("deb") && bytes.starts_with(b"!<arch>\n")
1208}
1209
1210fn looks_like_rpm(bytes: &[u8], path: &Path) -> bool {
1211 lower_extension(path).as_deref() == Some("rpm") && bytes.starts_with(&[0xed, 0xab, 0xee, 0xdb])
1212}
1213
1214fn looks_like_squashfs(bytes: &[u8], path: &Path) -> bool {
1215 lower_extension(path)
1216 .as_deref()
1217 .is_some_and(|ext| matches!(ext, "sqs" | "squashfs"))
1218 && (bytes.starts_with(&[0x68, 0x73, 0x71, 0x73])
1219 || bytes.starts_with(&[0x73, 0x71, 0x73, 0x68]))
1220}
1221
1222fn archive_file_type(path: &Path, bytes: &[u8], detected_format: FileFormat) -> String {
1223 if looks_like_deb(bytes, path) {
1224 "debian binary package (format 2.0)".to_string()
1225 } else if looks_like_rpm(bytes, path) {
1226 "RPM package".to_string()
1227 } else if looks_like_squashfs(bytes, path) {
1228 "Squashfs filesystem".to_string()
1229 } else if looks_like_gzip(bytes) {
1230 "gzip compressed data".to_string()
1231 } else if looks_like_bzip2(bytes) {
1232 "bzip2 compressed data".to_string()
1233 } else if looks_like_xz(bytes) {
1234 "XZ compressed data".to_string()
1235 } else if is_zip_archive(bytes) {
1236 "Zip archive data".to_string()
1237 } else if lower_extension(path).as_deref() == Some("gem") {
1238 "POSIX tar archive".to_string()
1239 } else if let Some(file_type) = format_based_file_type(detected_format) {
1240 file_type
1241 } else {
1242 "archive data".to_string()
1243 }
1244}
1245
1246fn script_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1247 let suffix = text_executable_label(bytes);
1248
1249 match programming_language {
1250 Some("Python") => format!("python script, {suffix}"),
1251 Some("Ruby") => format!("ruby script, {suffix}"),
1252 Some("Perl") => format!("perl script, {suffix}"),
1253 Some("PHP") => format!("php script, {suffix}"),
1254 Some("Shell") => format!("shell script, {suffix}"),
1255 Some("Bash") => format!("bash script, {suffix}"),
1256 Some("Zsh") => format!("zsh script, {suffix}"),
1257 Some("Fish") => format!("fish script, {suffix}"),
1258 Some("Ksh") => format!("ksh script, {suffix}"),
1259 Some("JavaScript") => format!("javascript script, {suffix}"),
1260 Some("TypeScript") => format!("typescript script, {suffix}"),
1261 Some("PowerShell") => format!("powershell script, {suffix}"),
1262 Some("Awk") => format!("awk script, {suffix}"),
1263 _ => format!("script, {suffix}"),
1264 }
1265}
1266
1267fn source_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1268 let suffix = text_label(bytes);
1269 match programming_language {
1270 Some("C") => format!("C source, {suffix}"),
1271 Some("C++") => format!("C++ source, {suffix}"),
1272 Some("Java") => format!("Java source, {suffix}"),
1273 Some("C#") => format!("C# source, {suffix}"),
1274 Some("F#") => format!("F# source, {suffix}"),
1275 Some("Go") => format!("Go source, {suffix}"),
1276 Some("Rust") => format!("Rust source, {suffix}"),
1277 Some("Starlark") => format!("Starlark source, {suffix}"),
1278 Some("CMake") => format!("CMake source, {suffix}"),
1279 Some("Meson") => format!("Meson source, {suffix}"),
1280 Some("Nix") => format!("Nix source, {suffix}"),
1281 Some("Groovy") => format!("Groovy source, {suffix}"),
1282 Some("Makefile") => format!("Makefile source, {suffix}"),
1283 Some("Dockerfile") => format!("Dockerfile source, {suffix}"),
1284 Some("Jamfile") => format!("Jamfile source, {suffix}"),
1285 Some("Batchfile") => format!("Batchfile source, {suffix}"),
1286 Some(language) => format!("{language} source, {suffix}"),
1287 None => text_file_type(bytes),
1288 }
1289}
1290
1291fn text_file_type(bytes: &[u8]) -> String {
1292 text_label(bytes).to_string()
1293}
1294
/// Describes text content by two properties: whether `bytes` is valid UTF-8
/// and whether it contains at least one `\n` byte.
fn text_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.contains(&b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text",
        (true, false) => "UTF-8 Unicode text, with no line terminators",
        (false, true) => "text",
        (false, false) => "text, with no line terminators",
    }
}
1308
/// Describes executable text content by two properties: whether `bytes` is
/// valid UTF-8 and whether it contains at least one `\n` byte.
fn text_executable_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.contains(&b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text executable",
        (true, false) => "UTF-8 Unicode text executable, with no line terminators",
        (false, true) => "text executable",
        (false, false) => "text executable, with no line terminators",
    }
}
1322
1323fn supported_image_metadata_format(
1324 ext: Option<&str>,
1325 detected_format: FileFormat,
1326) -> Option<ImageFormat> {
1327 match ext {
1328 Some("jpg" | "jpeg") => Some(ImageFormat::Jpeg),
1329 Some("png") => Some(ImageFormat::Png),
1330 Some("tif" | "tiff") => Some(ImageFormat::Tiff),
1331 Some("webp") => Some(ImageFormat::WebP),
1332 _ => match detected_format.media_type() {
1333 "image/jpeg" => Some(ImageFormat::Jpeg),
1334 "image/png" => Some(ImageFormat::Png),
1335 "image/tiff" => Some(ImageFormat::Tiff),
1336 "image/webp" => Some(ImageFormat::WebP),
1337 _ => None,
1338 },
1339 }
1340}
1341
1342fn should_skip_binary_string_extraction(
1343 path: &Path,
1344 bytes: &[u8],
1345 detected_format: FileFormat,
1346) -> bool {
1347 matches!(lower_extension(path).as_deref(), Some("pdf"))
1348 || supported_image_metadata_format(lower_extension(path).as_deref(), detected_format)
1349 .is_some()
1350 || (matches!(
1351 detected_format.kind(),
1352 FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
1353 ) && !is_textual_format(detected_format))
1354 || media_mime_from_content(bytes).is_some()
1355 || is_zip_archive(bytes)
1356 || looks_like_gzip(bytes)
1357 || looks_like_bzip2(bytes)
1358 || looks_like_xz(bytes)
1359 || looks_like_deb(bytes, path)
1360 || looks_like_rpm(bytes, path)
1361 || looks_like_squashfs(bytes, path)
1362}
1363
1364fn should_skip_large_opaque_binary_text_extraction(
1365 _path: &Path,
1366 bytes: &[u8],
1367 detected_format: FileFormat,
1368) -> bool {
1369 is_large_opaque_binary_candidate(bytes, detected_format)
1370 && !sample_has_promising_printable_strings(bytes)
1371}
1372
1373fn is_large_opaque_binary_candidate(bytes: &[u8], detected_format: FileFormat) -> bool {
1374 bytes.len() >= LARGE_OPAQUE_BINARY_SKIP_BYTES
1375 && !is_textual_format(detected_format)
1376 && !matches!(
1377 detected_format.kind(),
1378 FileFormatKind::Archive
1379 | FileFormatKind::Compressed
1380 | FileFormatKind::Package
1381 | FileFormatKind::Audio
1382 | FileFormatKind::Image
1383 | FileFormatKind::Video
1384 )
1385}
1386
/// Computes up to three `(start, end)` byte ranges of at most 64 KiB each that
/// sample the head, middle and tail of a buffer of length `len`.
///
/// The middle window is included only when `len` exceeds two windows, the tail
/// window only when `len` exceeds one. Empty and duplicate ranges are dropped.
fn sampled_printable_window_ranges(len: usize) -> Vec<(usize, usize)> {
    const SAMPLE_WINDOW_BYTES: usize = 64 * 1024;

    let head = (0, len.min(SAMPLE_WINDOW_BYTES));
    let middle = (len > SAMPLE_WINDOW_BYTES * 2).then(|| {
        let start = len / 2 - SAMPLE_WINDOW_BYTES / 2;
        (start, (start + SAMPLE_WINDOW_BYTES).min(len))
    });
    let tail = (len > SAMPLE_WINDOW_BYTES).then(|| (len - SAMPLE_WINDOW_BYTES, len));

    let mut ranges: Vec<(usize, usize)> = Vec::new();
    for window in std::iter::once(head).chain(middle).chain(tail) {
        let (start, end) = window;
        if start < end && !ranges.contains(&window) {
            ranges.push(window);
        }
    }

    ranges
}
1409
1410fn sample_has_promising_printable_strings(bytes: &[u8]) -> bool {
1411 let mut structured_signal_seen = false;
1412 let promising_license_windows = sampled_printable_window_ranges(bytes.len())
1413 .into_iter()
1414 .filter(|&(start, end)| {
1415 let window = &bytes[start..end];
1416 if has_strong_structured_text_signal(window) {
1417 structured_signal_seen = true;
1418 }
1419 has_license_or_notice_signal(window)
1420 })
1421 .count();
1422
1423 structured_signal_seen || promising_license_windows >= 2
1424}
1425
1426fn extract_sampled_printable_strings(bytes: &[u8]) -> String {
1427 let mut combined_lines = BTreeSet::new();
1428
1429 for (start, end) in sampled_printable_window_ranges(bytes.len()) {
1430 let window_text = extract_printable_strings(&bytes[start..end]);
1431 for line in window_text
1432 .lines()
1433 .map(str::trim)
1434 .filter(|line| !line.is_empty())
1435 {
1436 combined_lines.insert(line.to_string());
1437 }
1438 }
1439
1440 combined_lines.into_iter().collect::<Vec<_>>().join("\n")
1441}
1442
1443fn has_license_or_notice_signal(bytes: &[u8]) -> bool {
1444 let strings = extract_printable_strings(bytes);
1445 if strings.is_empty() {
1446 return false;
1447 }
1448
1449 let lower = strings.to_ascii_lowercase();
1450 [
1451 "copyright",
1452 "license",
1453 "licensed under",
1454 "all rights reserved",
1455 "permission is hereby granted",
1456 "redistribution and use",
1457 "spdx-license-identifier",
1458 ]
1459 .iter()
1460 .any(|marker| lower.contains(marker))
1461}
1462
1463fn has_strong_structured_text_signal(bytes: &[u8]) -> bool {
1464 let strings = extract_printable_strings(bytes);
1465 if strings.is_empty() {
1466 return false;
1467 }
1468
1469 let email_markers = strings.matches('@').count();
1470 let url_markers = strings.matches("http://").count() + strings.matches("https://").count();
1471
1472 email_markers + url_markers >= 3
1473}
1474
1475fn is_supported_image_container(bytes: &[u8], format: ImageFormat) -> bool {
1476 match format {
1477 ImageFormat::Png => bytes.starts_with(b"\x89PNG\r\n\x1a\n"),
1478 ImageFormat::Jpeg => bytes.starts_with(&[0xff, 0xd8, 0xff]),
1479 ImageFormat::Tiff => bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a"),
1480 ImageFormat::WebP => {
1481 bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP"
1482 }
1483 _ => false,
1484 }
1485}
1486
1487fn extract_image_metadata_text(bytes: &[u8], format: ImageFormat) -> String {
1488 let mut values = Vec::new();
1489 values.extend(extract_exif_metadata_values(bytes));
1490 values.extend(extract_xmp_metadata_values(bytes, format));
1491 values_to_text(values)
1492}
1493
1494fn extract_exif_metadata_values(bytes: &[u8]) -> Vec<String> {
1495 let mut cursor = BufReader::new(Cursor::new(bytes));
1496 let exif = match exif::Reader::new().read_from_container(&mut cursor) {
1497 Ok(exif) => exif,
1498 Err(_) => return Vec::new(),
1499 };
1500
1501 let mut values = Vec::new();
1502 for field in exif.fields() {
1503 let rendered = match field.tag {
1504 exif::Tag::ImageDescription | exif::Tag::Copyright | exif::Tag::UserComment => {
1505 Some(field.display_value().with_unit(&exif).to_string())
1506 }
1507 exif::Tag::Artist => Some(format!(
1508 "Author: {}",
1509 field.display_value().with_unit(&exif)
1510 )),
1511 _ => None,
1512 };
1513
1514 if let Some(rendered) = rendered {
1515 values.push(rendered);
1516 }
1517 }
1518
1519 values
1520}
1521
1522fn extract_xmp_metadata_values(bytes: &[u8], format: ImageFormat) -> Vec<String> {
1523 let xmp = match extract_raw_xmp_packet(bytes, format) {
1524 Some(xmp) => xmp,
1525 None => return Vec::new(),
1526 };
1527
1528 parse_xmp_values(&xmp)
1529}
1530
1531fn extract_raw_xmp_packet(bytes: &[u8], format: ImageFormat) -> Option<Vec<u8>> {
1532 let reader = ImageReader::with_format(BufReader::new(Cursor::new(bytes)), format);
1533 if let Ok(mut decoder) = reader.into_decoder()
1534 && let Ok(Some(xmp)) = decoder.xmp_metadata()
1535 {
1536 return Some(xmp);
1537 }
1538
1539 match format {
1540 ImageFormat::Png => extract_png_xmp_packet(bytes),
1541 _ => None,
1542 }
1543}
1544
1545fn extract_png_xmp_packet(bytes: &[u8]) -> Option<Vec<u8>> {
1546 const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";
1547
1548 if bytes.len() < PNG_SIGNATURE.len() || &bytes[..PNG_SIGNATURE.len()] != PNG_SIGNATURE {
1549 return None;
1550 }
1551
1552 let mut offset = PNG_SIGNATURE.len();
1553 while offset + 12 <= bytes.len() {
1554 let length = u32::from_be_bytes([
1555 bytes[offset],
1556 bytes[offset + 1],
1557 bytes[offset + 2],
1558 bytes[offset + 3],
1559 ]) as usize;
1560 let chunk_start = offset + 8;
1561 let chunk_end = chunk_start + length;
1562 if chunk_end + 4 > bytes.len() {
1563 return None;
1564 }
1565
1566 let chunk_type = &bytes[offset + 4..offset + 8];
1567 if chunk_type == b"iTXt" {
1568 let data = &bytes[chunk_start..chunk_end];
1569 if let Some(xmp) = parse_png_itxt_xmp(data) {
1570 return Some(xmp);
1571 }
1572 }
1573
1574 offset = chunk_end + 4;
1575 }
1576
1577 None
1578}
1579
1580fn parse_png_itxt_xmp(data: &[u8]) -> Option<Vec<u8>> {
1581 const XMP_KEYWORD: &[u8] = b"XML:com.adobe.xmp";
1582
1583 let keyword_end = data.iter().position(|&b| b == 0)?;
1584 if &data[..keyword_end] != XMP_KEYWORD {
1585 return None;
1586 }
1587
1588 let mut cursor = keyword_end + 1;
1589 let compression_flag = *data.get(cursor)?;
1590 cursor += 1;
1591 let compression_method = *data.get(cursor)?;
1592 cursor += 1;
1593 if compression_flag > 1 || (compression_flag == 1 && compression_method != 0) {
1594 return None;
1595 }
1596
1597 let language_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1598 cursor = language_end + 1;
1599
1600 let translated_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1601 cursor = translated_end + 1;
1602
1603 let text_bytes = &data[cursor..];
1604 if compression_flag == 1 {
1605 let mut decoder = ZlibDecoder::new(text_bytes);
1606 let mut decoded = Vec::new();
1607 decoder.read_to_end(&mut decoded).ok()?;
1608 Some(decoded)
1609 } else {
1610 Some(text_bytes.to_vec())
1611 }
1612}
1613
1614fn parse_xmp_values(xmp: &[u8]) -> Vec<String> {
1615 let mut reader = XmlReader::from_reader(xmp);
1616 reader.config_mut().trim_text(true);
1617
1618 let mut buf = Vec::new();
1619 let mut stack: Vec<String> = Vec::new();
1620 let mut values = Vec::new();
1621
1622 loop {
1623 match reader.read_event_into(&mut buf) {
1624 Ok(Event::Start(e)) => {
1625 stack.push(local_xml_name(e.name().as_ref()));
1626 }
1627 Ok(Event::End(_)) => {
1628 stack.pop();
1629 }
1630 Ok(Event::Empty(_)) => {}
1631 Ok(Event::Text(text)) => {
1632 if let Some(field) = stack
1633 .iter()
1634 .rev()
1635 .find_map(|name| allowed_xmp_field(name.as_str()))
1636 && let Ok(decoded) = text.decode()
1637 {
1638 let decoded = decoded.into_owned();
1639 if !decoded.trim().is_empty() {
1640 values.push(format_xmp_value(field, &decoded));
1641 }
1642 }
1643 }
1644 Ok(Event::CData(text)) => {
1645 if let Some(field) = stack
1646 .iter()
1647 .rev()
1648 .find_map(|name| allowed_xmp_field(name.as_str()))
1649 && let Ok(decoded) = text.decode()
1650 {
1651 let decoded = decoded.into_owned();
1652 if !decoded.trim().is_empty() {
1653 values.push(format_xmp_value(field, &decoded));
1654 }
1655 }
1656 }
1657 Ok(Event::Eof) | Err(_) => break,
1658 _ => {}
1659 }
1660 buf.clear();
1661 }
1662
1663 values
1664}
1665
/// Returns the part of a qualified XML name after its last `:` (the whole name
/// when there is no colon). Names that are not valid UTF-8 become `""`.
fn local_xml_name(name: &[u8]) -> String {
    let qualified = std::str::from_utf8(name).unwrap_or_default();
    let local = match qualified.rfind(':') {
        Some(colon) => &qualified[colon + 1..],
        None => qualified,
    };
    local.to_owned()
}
1670
/// Maps an XML local element name to the internal field key used for XMP
/// extraction, or `None` when the element is not one that gets extracted.
/// Matching is case-sensitive.
fn allowed_xmp_field(name: &str) -> Option<&'static str> {
    const FIELDS: [(&str, &str); 7] = [
        ("creator", "creator"),
        ("rights", "rights"),
        ("description", "description"),
        ("title", "title"),
        ("subject", "subject"),
        ("UsageTerms", "usage_terms"),
        ("WebStatement", "web_statement"),
    ];

    FIELDS
        .iter()
        .find(|(element, _)| *element == name)
        .map(|&(_, field)| field)
}
1683
/// Renders an XMP value for output: `creator` values get an `"Author: "`
/// prefix, every other field is returned unchanged.
fn format_xmp_value(field: &str, value: &str) -> String {
    if field == "creator" {
        format!("Author: {value}")
    } else {
        value.to_owned()
    }
}
1690
1691fn values_to_text(values: Vec<String>) -> String {
1692 let mut seen = BTreeSet::new();
1693 let mut lines = Vec::new();
1694 let mut total_bytes = 0usize;
1695
1696 for value in values {
1697 if lines.len() >= MAX_IMAGE_METADATA_VALUES {
1698 break;
1699 }
1700
1701 let normalized = normalize_metadata_value(&value);
1702 if normalized.is_empty() || !seen.insert(normalized.clone()) {
1703 continue;
1704 }
1705
1706 let added_bytes = normalized.len() + usize::from(!lines.is_empty());
1707 if total_bytes + added_bytes > MAX_IMAGE_METADATA_TEXT_BYTES {
1708 break;
1709 }
1710
1711 total_bytes += added_bytes;
1712 lines.push(normalized);
1713 }
1714
1715 lines.join("\n")
1716}
1717
/// Removes NUL characters from `value`, then collapses every run of whitespace
/// into a single space and drops leading and trailing whitespace.
fn normalize_metadata_value(value: &str) -> String {
    // NULs are removed before splitting, so text on either side of a NUL fuses.
    let without_nuls = value.replace('\0', "");

    let mut normalized = String::with_capacity(without_nuls.len());
    for word in without_nuls.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }

    normalized
}
1729
1730fn extract_pdf_text(path: &Path, bytes: &[u8]) -> (String, Option<String>) {
1731 if bytes.len() < 5 || &bytes[..5] != b"%PDF-" {
1732 return (String::new(), None);
1733 }
1734
1735 let mut failures = Vec::new();
1736 let mut saw_success = false;
1737
1738 let extracted = catch_unwind(AssertUnwindSafe(
1739 || -> Result<String, Box<dyn std::error::Error>> {
1740 let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1741 extract_first_pdf_page_text(&mut document)
1742 },
1743 ));
1744 match extracted {
1745 Ok(Ok(text)) => {
1746 saw_success = true;
1747 if let Some(normalized) = normalize_pdf_text(text) {
1748 return (normalized, None);
1749 }
1750 }
1751 Ok(Err(err)) => failures.push(format!("from-bytes first-page: {err}")),
1752 Err(payload) => failures.push(format!(
1753 "from-bytes first-page panic: {}",
1754 panic_payload_to_string(payload.as_ref())
1755 )),
1756 }
1757
1758 let extracted = catch_unwind(AssertUnwindSafe(
1759 || -> Result<String, Box<dyn std::error::Error>> {
1760 let mut document = pdf_oxide::document::PdfDocument::open(path)?;
1761 extract_pdf_text_from_document(&mut document)
1762 },
1763 ));
1764 match extracted {
1765 Ok(Ok(text)) => {
1766 saw_success = true;
1767 if let Some(normalized) = normalize_pdf_text(text) {
1768 return (normalized, None);
1769 }
1770 }
1771 Ok(Err(err)) => failures.push(format!("open full-document: {err}")),
1772 Err(payload) => failures.push(format!(
1773 "open full-document panic: {}",
1774 panic_payload_to_string(payload.as_ref())
1775 )),
1776 }
1777
1778 let extracted = catch_unwind(AssertUnwindSafe(
1779 || -> Result<String, Box<dyn std::error::Error>> {
1780 let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1781 extract_pdf_text_from_document(&mut document)
1782 },
1783 ));
1784 match extracted {
1785 Ok(Ok(text)) => {
1786 saw_success = true;
1787 if let Some(normalized) = normalize_pdf_text(text) {
1788 return (normalized, None);
1789 }
1790 }
1791 Ok(Err(err)) => failures.push(format!("from-bytes full-document: {err}")),
1792 Err(payload) => failures.push(format!(
1793 "from-bytes full-document panic: {}",
1794 panic_payload_to_string(payload.as_ref())
1795 )),
1796 }
1797
1798 if saw_success || is_non_actionable_pdf_failure(&failures) {
1799 (String::new(), None)
1800 } else {
1801 (
1802 String::new(),
1803 Some(format!(
1804 "PDF text extraction failed after {} attempts: {}",
1805 failures.len(),
1806 failures.join("; ")
1807 )),
1808 )
1809 }
1810}
1811
/// Returns `true` when `failures` is non-empty and every message contains one
/// of the known failure phrases (password-protected, broken encryption
/// dictionary, missing security handler, invalid cross-reference table).
fn is_non_actionable_pdf_failure(failures: &[String]) -> bool {
    const NON_ACTIONABLE_MARKERS: [&str; 5] = [
        "requires a password",
        "Encrypt dictionary missing /O",
        "Encrypt dictionary missing /U",
        "security handler cannot be found",
        "Invalid cross-reference table",
    ];

    if failures.is_empty() {
        return false;
    }

    failures.iter().all(|failure| {
        NON_ACTIONABLE_MARKERS
            .iter()
            .any(|marker| failure.contains(marker))
    })
}
1822
/// Converts a panic payload to a message: `&str` and `String` payloads are
/// returned as-is, anything else becomes `"unknown panic payload"`.
fn panic_payload_to_string(payload: &(dyn std::any::Any + Send)) -> String {
    payload
        .downcast_ref::<&str>()
        .map(|message| (*message).to_owned())
        .or_else(|| payload.downcast_ref::<String>().cloned())
        .unwrap_or_else(|| "unknown panic payload".to_owned())
}
1832
1833fn extract_first_pdf_page_text(
1834 document: &mut pdf_oxide::document::PdfDocument,
1835) -> Result<String, Box<dyn std::error::Error>> {
1836 if document.page_count()? == 0 {
1837 return Ok(String::new());
1838 }
1839
1840 let extracted_text = document.extract_text(0)?;
1841 let markdown_text =
1842 document.to_markdown(0, &pdf_oxide::converters::ConversionOptions::default())?;
1843 if pdf_markdown_heading_lines(&markdown_text).is_empty() {
1844 return Ok(extracted_text);
1845 }
1846
1847 let pipeline_text =
1848 document.to_plain_text(0, &pdf_oxide::converters::ConversionOptions::default())?;
1849
1850 Ok(merge_pdf_first_page_text(
1851 &extracted_text,
1852 &markdown_text,
1853 &pipeline_text,
1854 ))
1855}
1856
1857fn extract_pdf_text_from_document(
1858 document: &mut pdf_oxide::document::PdfDocument,
1859) -> Result<String, Box<dyn std::error::Error>> {
1860 Ok(document.to_plain_text_all(&pdf_oxide::converters::ConversionOptions::default())?)
1861}
1862
/// Replaces every carriage return and form feed in `text` with `\n`, returning
/// `None` when the result is empty or whitespace-only.
fn normalize_pdf_text(text: String) -> Option<String> {
    let normalized: String = text
        .chars()
        .map(|ch| if matches!(ch, '\r' | '\u{0c}') { '\n' } else { ch })
        .collect();

    if normalized.trim().is_empty() {
        None
    } else {
        Some(normalized)
    }
}
1867
1868fn merge_pdf_first_page_text(
1869 _extracted_text: &str,
1870 markdown_text: &str,
1871 pipeline_text: &str,
1872) -> String {
1873 let pipeline = pipeline_text.trim();
1874 if pipeline.is_empty() {
1875 return String::new();
1876 }
1877
1878 let prefix = pdf_first_page_heading_prefix(markdown_text);
1879 let Some(prefix) = prefix else {
1880 return pipeline_text.to_string();
1881 };
1882
1883 if pdf_text_contains_heading_prefix(pipeline, &prefix) {
1884 pipeline_text.to_string()
1885 } else {
1886 format!("{prefix}\n\n{pipeline}")
1887 }
1888}
1889
1890fn pdf_text_contains_heading_prefix(text: &str, prefix: &str) -> bool {
1891 normalize_pdf_heading_comparison_text(text)
1892 .contains(&normalize_pdf_heading_comparison_text(prefix))
1893}
1894
/// Normalizes text for heading comparison: ASCII letters are lower-cased and
/// every whitespace run (including leading/trailing) collapses to one space.
fn normalize_pdf_heading_comparison_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());

    for word in text.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(&word.to_ascii_lowercase());
    }

    normalized
}
1901
1902fn pdf_first_page_heading_prefix(markdown_text: &str) -> Option<String> {
1903 let mut lines = Vec::new();
1904
1905 for line in pdf_markdown_heading_lines(markdown_text) {
1906 push_unique_line(&mut lines, line);
1907 }
1908
1909 (!lines.is_empty()).then(|| lines.join("\n"))
1910}
1911
1912fn pdf_markdown_heading_lines(text: &str) -> Vec<String> {
1913 text.lines()
1914 .map(str::trim)
1915 .filter_map(|line| line.strip_prefix('#').map(str::trim_start))
1916 .map(|line| line.trim_matches('#').trim())
1917 .filter(|line| !line.is_empty())
1918 .filter(|line| !looks_like_numbered_section_heading(line))
1919 .take(4)
1920 .map(ToOwned::to_owned)
1921 .collect()
1922}
1923
/// Appends `line` to `lines` unless an equal entry is already present.
fn push_unique_line(lines: &mut Vec<String>, line: String) {
    if lines.contains(&line) {
        return;
    }
    lines.push(line);
}
1929
/// Returns `true` when `line` begins with a section number: one or more ASCII
/// digits immediately followed by `.` (for example `"1. Scope"`, `"1.2 Terms"`
/// or `"10. Disclaimer"`).
///
/// Multi-digit numbers are accepted so that section 10 and beyond are treated
/// the same way as sections 1 through 9.
fn looks_like_numbered_section_heading(line: &str) -> bool {
    let after_number = line.trim_start_matches(|ch: char| ch.is_ascii_digit());
    let has_leading_digits = after_number.len() < line.len();

    has_leading_digits && after_number.starts_with('.')
}
1942
/// Returns `true` when `bytes` starts with `PK` followed by one of the byte
/// pairs `03 04`, `05 06` or `07 08`.
fn is_zip_archive(bytes: &[u8]) -> bool {
    matches!(
        bytes,
        [b'P', b'K', 0x03, 0x04, ..] | [b'P', b'K', 0x05, 0x06, ..] | [b'P', b'K', 0x07, 0x08, ..]
    )
}
1948
1949pub fn extract_printable_strings(bytes: &[u8]) -> String {
1950 const MIN_LEN: usize = 4;
1951 const MIN_OUTPUT_BYTES: usize = 2_000_000;
1952 const MAX_OUTPUT_BYTES_CAP: usize = 16_000_000;
1953
1954 let max_output_bytes = bytes.len().clamp(MIN_OUTPUT_BYTES, MAX_OUTPUT_BYTES_CAP);
1955
1956 fn is_printable_ascii(b: u8) -> bool {
1957 matches!(b, 0x20..=0x7E)
1958 }
1959
1960 let mut out = String::new();
1961 let mut run: Vec<u8> = Vec::new();
1962
1963 let flush_run = |out: &mut String, run: &mut Vec<u8>| {
1964 if run.len() >= MIN_LEN {
1965 if !out.is_empty() {
1966 out.push('\n');
1967 }
1968 out.push_str(&String::from_utf8_lossy(run));
1969 }
1970 run.clear();
1971 };
1972
1973 for &b in bytes {
1974 if is_printable_ascii(b) {
1975 run.push(b);
1976 } else {
1977 flush_run(&mut out, &mut run);
1978 if out.len() >= max_output_bytes {
1979 return out;
1980 }
1981 }
1982 }
1983 flush_run(&mut out, &mut run);
1984 if out.len() >= max_output_bytes {
1985 return out;
1986 }
1987
1988 for start in 0..=1 {
1989 run.clear();
1990 let mut i = start;
1991 while i + 1 < bytes.len() {
1992 let b0 = bytes[i];
1993 let b1 = bytes[i + 1];
1994 let (ch, zero) = if start == 0 { (b0, b1) } else { (b1, b0) };
1995 if is_printable_ascii(ch) && zero == 0 {
1996 run.push(ch);
1997 } else {
1998 flush_run(&mut out, &mut run);
1999 if out.len() >= max_output_bytes {
2000 return out;
2001 }
2002 }
2003 i += 2;
2004 }
2005 flush_run(&mut out, &mut run);
2006 if out.len() >= max_output_bytes {
2007 return out;
2008 }
2009 }
2010
2011 out
2012}
2013
2014#[cfg(test)]
2015mod tests {
2016 use std::path::Path;
2017
2018 use super::{
2019 ExtractedTextKind, LARGE_OPAQUE_BINARY_SKIP_BYTES, classify_file_info,
2020 extract_printable_strings, extract_text_for_detection,
2021 extract_text_for_detection_with_diagnostics, is_non_actionable_pdf_failure,
2022 normalize_mime_type, normalize_pdf_heading_comparison_text,
2023 windows_metadata_or_empty_result,
2024 };
2025
2026 #[test]
2027 fn test_extract_text_for_detection_skips_jar_archives() {
2028 let path = Path::new(
2029 "testdata/license-golden/datadriven/lic1/do-not_detect-licenses-in-archive.jar",
2030 );
2031 let bytes = std::fs::read(path).expect("failed to read jar fixture");
2032
2033 let (text, kind) = extract_text_for_detection(path, &bytes);
2034
2035 assert!(text.is_empty());
2036 assert_eq!(kind, ExtractedTextKind::None);
2037 }
2038
2039 #[test]
2040 fn test_extract_text_for_detection_reads_pdf_fixture_text() {
2041 let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
2042 let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2043
2044 let (text, kind) = extract_text_for_detection(path, &bytes);
2045
2046 assert_eq!(kind, ExtractedTextKind::Pdf);
2047 assert!(text.contains("Redistribution and use in source and binary forms"));
2048 }
2049
2050 #[test]
2051 fn test_extract_text_for_detection_prefers_first_pdf_page_before_full_document() {
2052 let path =
2053 Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
2054 let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2055
2056 let (text, kind) = extract_text_for_detection(path, &bytes);
2057
2058 assert_eq!(kind, ExtractedTextKind::Pdf);
2059 assert!(text.contains("SUN INDUSTRY STANDARDS SOURCE LICENSE"));
2060 assert!(!text.contains("DISCLAIMER OF WARRANTY"));
2061 }
2062
2063 #[test]
2064 fn test_extract_text_for_detection_does_not_duplicate_pdf_heading_prefix() {
2065 let path =
2066 Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
2067 let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2068
2069 let (text, kind) = extract_text_for_detection(path, &bytes);
2070
2071 assert_eq!(kind, ExtractedTextKind::Pdf);
2072
2073 let normalized = normalize_pdf_heading_comparison_text(&text);
2074 let heading =
2075 normalize_pdf_heading_comparison_text("SUN INDUSTRY STANDARDS SOURCE LICENSE");
2076 assert_eq!(normalized.matches(&heading).count(), 1);
2077 }
2078
2079 #[test]
2080 fn test_extract_text_for_detection_reads_pdf_fixture_without_pdf_extension() {
2081 let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
2082 let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2083
2084 let (text, kind) = extract_text_for_detection(Path::new("renamed.bin"), &bytes);
2085
2086 assert_eq!(kind, ExtractedTextKind::Pdf);
2087 assert!(text.contains("Redistribution and use in source and binary forms"));
2088 }
2089
2090 #[test]
2091 fn test_extract_text_for_detection_reports_terminal_pdf_failure() {
2092 let malformed = b"%PDF-1.7\nthis is not a valid pdf object graph\n";
2093
2094 let (text, kind, scan_error) =
2095 extract_text_for_detection_with_diagnostics(Path::new("broken.pdf"), malformed);
2096
2097 assert!(text.is_empty());
2098 assert_eq!(kind, ExtractedTextKind::None);
2099 let scan_error = scan_error.expect("terminal pdf failure should be surfaced");
2100 assert!(scan_error.contains("PDF text extraction failed after"));
2101 }
2102
2103 #[test]
2104 fn test_extract_text_for_detection_skips_large_opaque_binary_blobs() {
2105 let bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2106
2107 let (text, kind) = extract_text_for_detection(Path::new("model.bin"), &bytes);
2108
2109 assert!(text.is_empty());
2110 assert_eq!(kind, ExtractedTextKind::None);
2111 }
2112
2113 #[test]
2114 fn test_extract_text_for_detection_keeps_large_binaries_with_promising_strings() {
2115 let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2116 let text = b"Copyright 2026 Example Project!!!";
2117 bytes[..text.len()].copy_from_slice(text);
2118 let second_offset = LARGE_OPAQUE_BINARY_SKIP_BYTES / 2;
2119 bytes[second_offset..second_offset + text.len()].copy_from_slice(text);
2120
2121 let (text, kind) = extract_text_for_detection(Path::new("weights.bin"), &bytes);
2122
2123 assert_ne!(kind, ExtractedTextKind::None);
2124 assert!(text.contains("Copyright 2026 Example Project"));
2125 }
2126
2127 #[test]
2128 fn test_extract_text_for_detection_skips_large_binary_with_unstructured_runs() {
2129 let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2130 let noise = b"(c) $1234567890ABCDEF[]{}--==++";
2131 bytes[..noise.len()].copy_from_slice(noise);
2132 let second_offset = LARGE_OPAQUE_BINARY_SKIP_BYTES / 2;
2133 bytes[second_offset..second_offset + noise.len()].copy_from_slice(noise);
2134
2135 let (text, kind) = extract_text_for_detection(Path::new("tensor.bin"), &bytes);
2136
2137 assert!(text.is_empty());
2138 assert_eq!(kind, ExtractedTextKind::None);
2139 }
2140
2141 #[test]
2142 fn test_extract_text_for_detection_uses_windows_executable_metadata() {
2143 let path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
2144 let bytes = std::fs::read(path).expect("read PE fixture");
2145
2146 let (text, kind) = extract_text_for_detection(path, &bytes);
2147
2148 assert_eq!(kind, ExtractedTextKind::BinaryStrings);
2149 assert!(text.contains("License: This program is free software"));
2150 assert!(text.contains("LegalCopyright:"));
2151 }
2152
2153 #[test]
2154 fn test_extract_text_for_detection_keeps_windows_metadata_for_large_pe_without_sampled_signal()
2155 {
2156 let path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
2157 let mut bytes = std::fs::read(path).expect("read PE fixture");
2158 bytes.resize(LARGE_OPAQUE_BINARY_SKIP_BYTES + 8, 0);
2159
2160 let (text, kind) = extract_text_for_detection(path, &bytes);
2161
2162 assert_ne!(kind, ExtractedTextKind::None);
2163 assert!(!text.trim().is_empty());
2164 }
2165
2166 #[test]
2167 fn test_windows_metadata_or_empty_result_preserves_metadata() {
2168 let (text, kind, scan_error) =
2169 windows_metadata_or_empty_result(Some("LegalCopyright: Example Corp".to_string()));
2170
2171 assert_eq!(kind, ExtractedTextKind::WindowsExecutableMetadata);
2172 assert_eq!(text, "LegalCopyright: Example Corp");
2173 assert!(scan_error.is_none());
2174 }
2175
2176 #[test]
2177 fn test_extract_text_for_detection_skips_large_binary_with_single_isolated_string_run() {
2178 let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2179 let text = b"Copyright 2026 Example Project!!!";
2180 bytes[..text.len()].copy_from_slice(text);
2181
2182 let (text, kind) = extract_text_for_detection(Path::new("opaque.bin"), &bytes);
2183
2184 assert!(text.is_empty());
2185 assert_eq!(kind, ExtractedTextKind::None);
2186 }
2187
2188 #[test]
2189 fn test_extract_text_for_detection_keeps_large_binary_with_single_contact_rich_window() {
2190 let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2191 let text = b"Andreas Schneider <asn@redhat.com> Rob Crittenden (rcritten@redhat.com) Mr. Sam <sam@email-scan.com> https://publicsuffix.org/ http://tukaani.org/xz/";
2192 bytes[..text.len()].copy_from_slice(text);
2193
2194 let (text, kind) = extract_text_for_detection(Path::new("rootfs.bin"), &bytes);
2195
2196 assert_ne!(kind, ExtractedTextKind::None);
2197 assert!(text.contains("asn@redhat.com"));
2198 assert!(text.contains("https://publicsuffix.org/"));
2199 }
2200
/// Encrypted-PDF, broken cross-reference and missing security-handler
/// failures are classified as non-actionable; an unrelated parser failure
/// is not.
#[test]
fn test_non_actionable_pdf_failures_are_suppressed() {
    let suppressed = [
        [
            "from-bytes first-page: PDF is encrypted and requires a password",
            "open full-document: PDF is encrypted and requires a password",
        ],
        [
            "from-bytes first-page: Invalid cross-reference table",
            "open full-document: Invalid cross-reference table",
        ],
        [
            "from-bytes first-page: Invalid PDF: Encrypt dictionary missing /O",
            "open full-document: Invalid PDF: security handler cannot be found",
        ],
    ];

    for failure_pair in suppressed {
        assert!(is_non_actionable_pdf_failure(
            &failure_pair.map(String::from)
        ));
    }

    let actionable = ["from-bytes first-page: some other parser failure".to_string()];
    assert!(!is_non_actionable_pdf_failure(&actionable));
}
2219
/// Zip-signature payloads named `.whl` or `.crate` must produce no text.
#[test]
fn test_extract_text_for_detection_skips_zip_like_archives() {
    let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00\x08\x00artifact";

    for archive_name in ["demo.whl", "demo.crate"] {
        let (extracted, kind) = extract_text_for_detection(Path::new(archive_name), zip_bytes);

        assert!(extracted.is_empty());
        assert_eq!(kind, ExtractedTextKind::None);
    }
}
2233
/// The `.lib` fixture must still yield text containing its embedded
/// copyright statement.
#[test]
fn test_extract_text_for_detection_keeps_binary_strings_for_lib_fixtures() {
    let fixture =
        Path::new("testdata/copyright-golden/copyrights/copyright_php_lib-php_embed_lib.lib");
    let contents = std::fs::read(fixture).expect("failed to read lib fixture");

    let (extracted, kind) = extract_text_for_detection(fixture, &contents);

    assert_ne!(kind, ExtractedTextKind::None);
    assert!(extracted.contains("Copyright nexB and others (c) 2012"));
}
2245
/// The Lato TTF fixture is reported as font metadata and its text mentions
/// the license description, the Open Font License and the family name.
#[test]
fn test_extract_text_for_detection_reads_font_metadata() {
    let fixture = Path::new("testdata/font-fixtures/Lato-Bold.ttf");
    let contents = std::fs::read(fixture).expect("failed to read font fixture");

    let (text, kind) = extract_text_for_detection(fixture, &contents);

    assert_eq!(kind, ExtractedTextKind::FontMetadata);
    assert!(text.contains("License Description:"), "{text}");

    // Either spelling of the license name is acceptable.
    let mentions_ofl = ["Open Font License", "OFL"]
        .iter()
        .any(|needle| text.contains(*needle));
    assert!(mentions_ofl, "{text}");

    assert!(text.contains("Lato"), "{text}");
}
2261
/// 525,000 repetitions of `abcd\0` must produce more than 2,000,000 bytes of
/// output, and the output must end with the final `abcd` run.
#[test]
fn test_extract_printable_strings_scales_cap_for_medium_binary_files() {
    let input = b"abcd\0".repeat(525_000);

    let extracted = extract_printable_strings(&input);
    let extracted_len = extracted.len();

    assert!(
        extracted_len > 2_000_000,
        "unexpected truncation at {}",
        extracted_len
    );
    assert!(extracted.ends_with("abcd"));
}
2275
/// The SVG fixture is decoded as text and keeps its public-domain URL.
#[test]
fn test_extract_text_for_detection_decodes_svg_fixture_text() {
    let fixture = Path::new(
        "testdata/license-golden/datadriven/external/fossology-tests/Public-domain/biohazard.svg",
    );
    let contents = std::fs::read(fixture).expect("failed to read svg fixture");

    let (decoded, kind) = extract_text_for_detection(fixture, &contents);

    assert_eq!(kind, ExtractedTextKind::Decoded);
    assert!(decoded.contains("creativecommons.org/licenses/publicdomain"));
}
2288
/// The RTF fixture is decoded as text and keeps the key LGPL phrases.
#[test]
fn test_extract_text_for_detection_decodes_rtf_fixture_text() {
    let fixture = Path::new(
        "testdata/license-golden/datadriven/external/fossology-tests/LGPL/License.rtf",
    );
    let contents = std::fs::read(fixture).expect("failed to read rtf fixture");

    let (decoded, kind) = extract_text_for_detection(fixture, &contents);

    assert_eq!(kind, ExtractedTextKind::Decoded);
    for phrase in ["GNU Lesser General Public", "version", "2.1 of the License"] {
        assert!(decoded.contains(phrase));
    }
}
2303
/// A `.ts` file with textual content and a `video/mp2t` guess is normalized
/// to `text/plain`.
#[test]
fn test_normalize_mime_type_prefers_text_for_textual_video_guess() {
    let normalized = normalize_mime_type(
        Path::new("main.ts"),
        b"export const answer = 42;\n",
        Some("TypeScript"),
        "video/mp2t",
    );

    assert_eq!(normalized, "text/plain");
}
2316
/// A `.js` file with textual content and an `application/octet-stream` guess
/// is normalized to `text/plain`.
#[test]
fn test_normalize_mime_type_prefers_text_for_octet_stream_source_guess() {
    let normalized = normalize_mime_type(
        Path::new("main.js"),
        b"console.log('hello');\n",
        Some("JavaScript"),
        "application/octet-stream",
    );

    assert_eq!(normalized, "text/plain");
}
2329
/// A `.ts` file with binary content keeps its `video/mp2t` guess.
#[test]
fn test_normalize_mime_type_preserves_binary_video_guess() {
    let binary_payload: &[u8] = &[0, 159, 146, 150, 0, 1, 2, 3];

    let normalized = normalize_mime_type(
        Path::new("main.ts"),
        binary_payload,
        Some("TypeScript"),
        "video/mp2t",
    );

    assert_eq!(normalized, "video/mp2t");
}
2342
/// A four-byte binary payload keeps its `application/octet-stream` guess
/// even though the path has a source-like extension.
#[test]
fn test_normalize_mime_type_preserves_short_binary_octet_stream_guess() {
    let binary_payload: &[u8] = &[0, 159, 146, 150];

    let normalized = normalize_mime_type(
        Path::new("main.ts"),
        binary_payload,
        Some("TypeScript"),
        "application/octet-stream",
    );

    assert_eq!(normalized, "application/octet-stream");
}
2355
/// A zero-length `test.txt` is reported as `inode/x-empty` / `empty`: text,
/// not binary, not source, and with no programming language.
#[test]
fn test_classify_file_info_marks_empty_files_as_text_not_source() {
    let info = classify_file_info(Path::new("test.txt"), b"");

    assert_eq!(info.mime_type, "inode/x-empty");
    assert_eq!(info.file_type, "empty");
    assert!(!info.is_binary);
    assert!(info.is_text);
    assert!(!info.is_source);
    assert!(info.programming_language.is_none());
}
2367
/// Valid JSON is labelled as JSON text data but is neither source code nor
/// assigned a programming language.
#[test]
fn test_classify_file_info_keeps_json_out_of_programming_language() {
    let manifest = br#"{"name":"demo"}"#;

    let info = classify_file_info(Path::new("package.json"), manifest);

    assert_eq!(info.mime_type, "application/json");
    assert_eq!(info.file_type, "JSON text data");
    assert!(info.is_text);
    assert!(!info.is_source);
    assert!(info.programming_language.is_none());
}
2378
/// Textual content that is not valid JSON stays plain UTF-8 text despite
/// the `.json` extension.
#[test]
fn test_classify_file_info_does_not_label_invalid_json_text_as_json() {
    let malformed = b"{ definitely not json\n";

    let info = classify_file_info(Path::new("broken.json"), malformed);

    assert_eq!(info.mime_type, "text/plain");
    assert_eq!(info.file_type, "UTF-8 Unicode text");
    assert!(info.is_text);
    assert!(!info.is_binary);
}
2389
/// Binary garbage behind a `.json` extension is classified as opaque data.
#[test]
fn test_classify_file_info_does_not_label_binary_json_garbage_as_json() {
    let garbage: &[u8] = &[0xff, 0x00, 0x01, 0x02];

    let info = classify_file_info(Path::new("broken.json"), garbage);

    assert_eq!(info.mime_type, "application/octet-stream");
    assert_eq!(info.file_type, "data");
    assert!(info.is_binary);
    assert!(!info.is_text);
}
2400
/// A JSON document starting with an `FF FE` byte-order mark and encoded two
/// bytes per character is classified as JSON text, not as binary.
#[test]
fn test_classify_file_info_treats_valid_utf16_json_with_bom_as_text() {
    let utf16_json: &[u8] = &[
        0xFF, 0xFE, 0x5B, 0x00, 0x22, 0x00, 0xE9, 0x00, 0x22, 0x00, 0x5D, 0x00,
    ];

    let info = classify_file_info(Path::new("utf16.json"), utf16_json);

    assert!(!info.is_binary);
    assert!(info.is_text);
    assert_eq!(info.mime_type, "application/json");
    assert_eq!(info.file_type, "JSON text data");
}
2415
/// A bare `true` literal is valid JSON and is classified as JSON text.
#[test]
fn test_classify_file_info_treats_small_valid_json_literals_as_text() {
    let info = classify_file_info(Path::new("true.json"), b"true");

    assert!(!info.is_binary);
    assert!(info.is_text);
    assert_eq!(info.mime_type, "application/json");
    assert_eq!(info.file_type, "JSON text data");
}
2425
/// A JSON-shaped payload whose quoted string contains a byte that is not
/// valid UTF-8 is classified as plain text rather than JSON or binary.
#[test]
fn test_classify_file_info_treats_json_wrapped_invalid_utf8_sequences_as_text() {
    let wrapped: &[u8] = &[0x5B, 0x22, 0xE6, 0x97, 0xA5, 0xD1, 0x88, 0xFA, 0x22, 0x5D];

    let info = classify_file_info(Path::new("wrapped.json"), wrapped);

    assert!(!info.is_binary);
    assert!(info.is_text);
    assert_eq!(info.mime_type, "text/plain");
    assert_eq!(info.file_type, "text, with no line terminators");
}
2438
/// A JSON-shaped payload whose quoted string is a single `0xFF` byte stays
/// classified as binary data.
#[test]
fn test_classify_file_info_keeps_lone_ff_json_byte_binary() {
    let lone_ff: &[u8] = &[0x5B, 0x22, 0xFF, 0x22, 0x5D];

    let info = classify_file_info(Path::new("lone-ff.json"), lone_ff);

    assert!(info.is_binary);
    assert!(!info.is_text);
    assert_eq!(info.mime_type, "application/octet-stream");
    assert_eq!(info.file_type, "data");
}
2449
/// A `.json` payload that begins with high bytes and a run of NULs stays
/// classified as binary.
#[test]
fn test_classify_file_info_keeps_nul_heavy_crash_json_binary() {
    let crash_input: &[u8] = &[
        0xFE, 0x90, 0x00, 0x00, 0x00, 0x93, 0x5B, 0x5B, 0x32, 0x38, 0x36,
    ];

    let info = classify_file_info(Path::new("crash.json"), crash_input);

    assert!(info.is_binary);
    assert!(!info.is_text);
    assert_eq!(info.mime_type, "application/octet-stream");
}
2463
/// A `Dockerfile` is source code in the "Dockerfile" language, not a script.
#[test]
fn test_classify_file_info_treats_dockerfile_as_source() {
    let info = classify_file_info(Path::new("Dockerfile"), b"FROM scratch\n");
    let language = info.programming_language.as_deref();

    assert_eq!(language, Some("Dockerfile"));
    assert!(info.is_source);
    assert!(!info.is_script);
    assert_eq!(info.file_type, "Dockerfile source, UTF-8 Unicode text");
}
2479
/// A `Makefile` is plain UTF-8 text: no language, not source, not a script.
#[test]
fn test_classify_file_info_treats_makefile_as_text_not_source() {
    let info = classify_file_info(Path::new("Makefile"), b"all:\n\techo hi\n");

    assert!(info.programming_language.is_none());
    assert!(info.is_text);
    assert!(!info.is_source);
    assert!(!info.is_script);
    assert_eq!(info.file_type, "UTF-8 Unicode text");
}
2490
/// `.egg` and `.nupkg` files carrying a zip signature are flagged as
/// archives with zip MIME type and file type.
#[test]
fn test_classify_file_info_marks_supported_package_archives() {
    let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00";

    for package_name in ["demo.egg", "demo.nupkg"] {
        let info = classify_file_info(Path::new(package_name), zip_bytes);

        assert!(info.is_archive);
        assert_eq!(info.mime_type, "application/zip");
        assert_eq!(info.file_type, "Zip archive data");
    }
}
2505
/// A PNG signature is classified as binary media, and as nothing else.
#[test]
fn test_classify_file_info_marks_png_as_binary_media() {
    let png_header = b"\x89PNG\r\n\x1a\n\x00\x00\x00\x0dIHDR";

    let info = classify_file_info(Path::new("logo.png"), png_header);

    assert_eq!(info.mime_type, "image/png");
    assert_eq!(info.file_type, "PNG image data");
    assert!(info.is_binary);
    assert!(!info.is_text);
    assert!(info.is_media);
    assert!(!info.is_archive);
    assert!(!info.is_source);
}
2520
/// A `%PDF-` header is classified as a binary PDF document that is neither
/// an archive nor media.
#[test]
fn test_classify_file_info_marks_pdf_as_binary_document() {
    let pdf_header = b"%PDF-1.7\n1 0 obj\n<< /Type /Catalog >>\n";

    let info = classify_file_info(Path::new("report.pdf"), pdf_header);

    assert_eq!(info.mime_type, "application/pdf");
    assert_eq!(info.file_type, "PDF document");
    assert!(info.is_binary);
    assert!(!info.is_text);
    assert!(!info.is_archive);
    assert!(!info.is_media);
}
2534
/// An opaque byte blob is binary, not text or source, and has no language.
#[test]
fn test_classify_file_info_marks_binary_blobs_as_binary() {
    let blob: &[u8] = &[0, 159, 146, 150, 0, 1, 2, 3, 4, 5];

    let info = classify_file_info(Path::new("blob.bin"), blob);

    assert!(info.is_binary);
    assert!(!info.is_text);
    assert!(!info.is_source);
    assert!(info.programming_language.is_none());
}
2545
/// YAML is labelled as YAML text data with no language and is not source.
#[test]
fn test_classify_file_info_treats_yaml_as_text_not_source() {
    let info = classify_file_info(Path::new("config.yaml"), b"key: value\n");

    assert!(info.programming_language.is_none());
    assert!(info.is_text);
    assert!(!info.is_source);
    assert_eq!(info.file_type, "YAML text data");
}
2555
/// Gradle, Nix and CMake manifests are recognised as source in their
/// respective languages, while `.gitmodules` is Git configuration text
/// with no language.
#[test]
fn test_classify_file_info_classifies_common_build_manifests() {
    // Shorthand so each fixture is a single name/contents pair.
    let classify = |name: &str, contents: &[u8]| classify_file_info(Path::new(name), contents);

    let gradle = classify("build.gradle", b"plugins { id 'java' }\n");
    let flake = classify("flake.nix", b"{ inputs, ... }: {}\n");
    let cmake = classify("toolchain.cmake", b"set(CMAKE_CXX_STANDARD 20)\n");
    let gitmodules = classify(
        ".gitmodules",
        b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
    );

    // Gradle build scripts are Groovy source.
    assert_eq!(gradle.programming_language.as_deref(), Some("Groovy"));
    assert!(gradle.is_source);
    assert_eq!(gradle.mime_type, "text/plain");
    assert_eq!(gradle.file_type, "Groovy source, UTF-8 Unicode text");

    // Nix flakes are Nix source.
    assert_eq!(flake.programming_language.as_deref(), Some("Nix"));
    assert!(flake.is_source);
    assert_eq!(flake.mime_type, "text/plain");
    assert_eq!(flake.file_type, "Nix source, UTF-8 Unicode text");

    // CMake toolchain files are CMake source.
    assert_eq!(cmake.programming_language.as_deref(), Some("CMake"));
    assert!(cmake.is_source);
    assert_eq!(cmake.file_type, "CMake source, UTF-8 Unicode text");

    // `.gitmodules` is configuration text, not source.
    assert!(gitmodules.programming_language.is_none());
    assert!(gitmodules.is_text);
    assert!(!gitmodules.is_source);
    assert_eq!(gitmodules.file_type, "Git configuration text");
}
2588
/// `.hpp` headers are C++ source, whereas `.ipp` files are plain text with
/// no language.
#[test]
fn test_classify_file_info_labels_cpp_headers_and_ipp_separately() {
    let hpp_info = classify_file_info(
        Path::new("include/demo.hpp"),
        b"#pragma once\nclass Demo {};\n",
    );
    let ipp_info = classify_file_info(
        Path::new("include/detail/demo.ipp"),
        b"template <class T> void parse() {}\n",
    );

    assert_eq!(hpp_info.programming_language.as_deref(), Some("C++"));
    assert!(hpp_info.is_source);
    assert!(!hpp_info.is_script);
    assert_eq!(hpp_info.file_type, "C++ source, UTF-8 Unicode text");

    assert!(ipp_info.programming_language.is_none());
    assert!(!ipp_info.is_source);
    assert!(!ipp_info.is_script);
    assert_eq!(ipp_info.file_type, "UTF-8 Unicode text");
}
2610
/// An extensionless file with a bash shebang is labelled as a Bash script.
#[test]
fn test_classify_file_info_preserves_specific_shell_family_labels() {
    let script = b"#!/usr/bin/env bash\necho hi\n";

    let info = classify_file_info(Path::new("bin/run"), script);

    assert_eq!(info.programming_language.as_deref(), Some("Bash"));
    assert!(info.is_script);
    assert_eq!(info.file_type, "bash script, UTF-8 Unicode text executable");
}
2619
/// A `Jamfile` is source in the "Jamfile" language and not a script.
#[test]
fn test_classify_file_info_marks_jamfile_as_source() {
    let info = classify_file_info(Path::new("Jamfile"), b"lib boost_json ;\n");

    assert_eq!(info.programming_language.as_deref(), Some("Jamfile"));
    assert!(info.is_source);
    assert!(!info.is_script);
    assert_eq!(info.file_type, "Jamfile source, UTF-8 Unicode text");
}
2629
/// An extensionless file with a `node` shebang is labelled as a JavaScript
/// script.
#[test]
fn test_classify_file_info_labels_javascript_shebang_scripts() {
    let script = b"#!/usr/bin/env node\nconsole.log('hello');\n";

    let info = classify_file_info(Path::new("bin/run"), script);
    let language = info.programming_language.as_deref();

    assert_eq!(language, Some("JavaScript"));
    assert!(info.is_script);
    assert_eq!(
        info.file_type,
        "javascript script, UTF-8 Unicode text executable"
    );
}
2647
/// A Python script containing a raw `0xE9` byte is still a Python script,
/// but its file type omits the UTF-8 wording.
#[test]
fn test_classify_file_info_uses_non_utf8_text_labels_for_latin1_scripts() {
    let latin1_script = b"# coding: latin-1\nprint(\"caf\xe9\")\n";

    let info = classify_file_info(Path::new("script.py"), latin1_script);
    let language = info.programming_language.as_deref();

    assert_eq!(language, Some("Python"));
    assert!(info.is_script);
    assert_eq!(info.file_type, "python script, text executable");
}
2662
/// A `.tga` file with textual content is flagged as media while still
/// being text rather than binary.
#[test]
fn test_classify_file_info_treats_textual_tga_as_media() {
    let info = classify_file_info(Path::new("texture.tga"), b"not really a tga\n");

    assert!(info.is_media);
    assert!(info.is_text);
    assert!(!info.is_binary);
}
2671
/// High-bit bytes behind a `.ts` extension are classified as binary, with
/// no source flag and no language.
#[test]
fn test_classify_file_info_keeps_binaryish_source_extension_out_of_text_path() {
    let high_bytes: &[u8] = &[0x80, 0x81, 0x82, 0x83, 0x84, 0x85];

    let info = classify_file_info(Path::new("main.ts"), high_bytes);

    assert!(info.is_binary);
    assert!(!info.is_text);
    assert!(!info.is_source);
    assert!(info.programming_language.is_none());
}
2682
/// A tiny GIF produces no extracted text.
#[test]
fn test_extract_text_for_detection_skips_unsupported_image_formats() {
    let tiny_gif = b"GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff,\x00\x00\x00\x00\x01\x00\x01\x00\x00\x02\x02D\x01\x00;";

    let (extracted, kind) = extract_text_for_detection(Path::new("tiny.gif"), tiny_gif);

    assert!(extracted.is_empty());
    assert_eq!(kind, ExtractedTextKind::None);
}
2692
/// Table-driven check of the language, `is_source` and `is_script` results
/// for a shebang script, a Dockerfile, JSON, YAML and a Makefile.
#[test]
fn test_classify_file_info_preserves_language_detection_precedence_matrix() {
    /// One row of the matrix: an input file and the expected classification.
    struct Expectation {
        path: &'static str,
        contents: &'static [u8],
        language: Option<&'static str>,
        is_source: bool,
        is_script: bool,
    }

    let matrix = [
        Expectation {
            path: "bin/run",
            contents: b"#!/usr/bin/env node\nconsole.log('hello');\n",
            language: Some("JavaScript"),
            is_source: true,
            is_script: true,
        },
        Expectation {
            path: "Dockerfile",
            contents: b"FROM scratch\n",
            language: Some("Dockerfile"),
            is_source: true,
            is_script: false,
        },
        Expectation {
            path: "package.json",
            contents: br#"{"name":"demo"}"#,
            language: None,
            is_source: false,
            is_script: false,
        },
        Expectation {
            path: "config.yaml",
            contents: b"key: value\n",
            language: None,
            is_source: false,
            is_script: false,
        },
        Expectation {
            path: "Makefile",
            contents: b"all:\n\techo hi\n",
            language: None,
            is_source: false,
            is_script: false,
        },
    ];

    for expected in &matrix {
        let path = Path::new(expected.path);
        let actual = classify_file_info(path, expected.contents);

        assert_eq!(
            actual.programming_language.as_deref(),
            expected.language,
            "unexpected language for {}",
            path.display()
        );
        assert_eq!(
            actual.is_source,
            expected.is_source,
            "unexpected is_source for {}",
            path.display()
        );
        assert_eq!(
            actual.is_script,
            expected.is_script,
            "unexpected is_script for {}",
            path.display()
        );
    }
}
2756}