1use std::borrow::Cow;
5use std::collections::BTreeSet;
6use std::fs;
7use std::io::{BufReader, Cursor, Read};
8use std::panic::{AssertUnwindSafe, catch_unwind};
9use std::path::Path;
10
11use chrono::{TimeZone, Utc};
12use file_format::{FileFormat, Kind as FileFormatKind};
13use flate2::read::ZlibDecoder;
14use glob::Pattern;
15use image::{ImageDecoder, ImageFormat, ImageReader};
16use mime_guess::from_path;
17use object::FileKind;
18use quick_xml::events::Event;
19use quick_xml::reader::Reader as XmlReader;
20
21use crate::parsers::windows_executable::extract_windows_executable_metadata_text;
22use crate::utils::font::extract_font_metadata_text;
23use crate::utils::language::detect_language;
24
/// Identifies which extraction strategy produced the text returned by
/// [`extract_text_for_detection`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtractedTextKind {
    /// No text was extracted; the accompanying string is empty.
    None,
    /// The bytes were decoded as text (UTF-8, UTF-16, or byte-to-char
    /// fallback), or text was pulled out of an RTF document.
    Decoded,
    /// Text came from font metadata, optionally followed by printable strings.
    FontMetadata,
    /// Text was extracted from a PDF document.
    Pdf,
    /// Text consists of printable strings found in binary content.
    BinaryStrings,
    /// Text came from metadata of a supported image format.
    ImageMetadata,
    /// Only Windows executable metadata text was available.
    WindowsExecutableMetadata,
}
35
/// Summary of everything [`classify_file_info`] determines about a file.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileInfoClassification {
    /// MIME type as computed by `detect_mime_type`.
    pub mime_type: String,
    /// Human-readable file type description as computed by `detect_file_type`.
    pub file_type: String,
    /// Detected language; only populated when `is_source` is true.
    pub programming_language: Option<String>,
    /// Whether the content was classified as binary.
    pub is_binary: bool,
    /// Always the negation of `is_binary` when built by `classify_file_info`.
    pub is_text: bool,
    /// Whether the file was classified as an archive, package, or compressed file.
    pub is_archive: bool,
    /// Whether the file was classified as image, audio, or video content.
    pub is_media: bool,
    /// Whether the file was classified as source code.
    pub is_source: bool,
    /// Whether the file was classified as a script.
    pub is_script: bool,
}
48
// NOTE(review): the image, Mach-O, XMP and PDF limits below are not referenced
// in this part of the file; their descriptions are inferred from the names and
// should be confirmed against the code that uses them.

// Presumably caps how many image metadata values are collected.
const MAX_IMAGE_METADATA_VALUES: usize = 64;
// Presumably caps the total size of extracted image metadata text (32 KiB).
const MAX_IMAGE_METADATA_TEXT_BYTES: usize = 32 * 1024;
// `has_binary_control_chars` treats input as binary when control bytes exceed
// `len / BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR` (i.e. more than one tenth).
const BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR: usize = 10;
// Presumably the size (512 KiB) above which opaque binaries are skipped or sampled.
const LARGE_OPAQUE_BINARY_SKIP_BYTES: usize = 512 * 1024;
// Presumably the size (64 KiB) of each window scanned around a legal marker.
const LARGE_MACHO_LEGAL_WINDOW_BYTES: usize = 64 * 1024;
// Presumably the total number of windows scanned per binary.
const LARGE_MACHO_LEGAL_MAX_WINDOWS: usize = 24;
// Presumably the number of windows scanned per individual marker.
const LARGE_MACHO_LEGAL_MAX_WINDOWS_PER_MARKER: usize = 4;
// Presumably the upper bound (2 MiB) on text extracted from the windows.
const LARGE_MACHO_LEGAL_MAX_EXTRACT_BYTES: usize = 2 * 1024 * 1024;
// Inputs larger than this (4 MiB) are never parsed by `has_valid_json_text`.
const JSON_VALIDATION_MAX_BYTES: usize = 4 * 1024 * 1024;
// Presumably the largest XMP packet (256 KiB) that is inspected.
const MAX_XMP_PACKET_BYTES: usize = 256 * 1024;
// Presumably the largest PDF (32 MiB) for which text extraction is attempted.
const MAX_PDF_TEXT_EXTRACTION_BYTES: usize = 32 * 1024 * 1024;
// Lower-case extensions treated as plain text: `is_plain_text` uses this list
// to keep such files from being classified as source code.
const PLAIN_TEXT_EXTENSIONS: &[&str] = &[
    "rst", "rest", "md", "txt", "log", "json", "xml", "yaml", "yml", "toml", "ini",
];
// Lower-case extensions that `detect_is_binary` treats as binary unless the
// detected format is textual.
const BINARY_EXTENSIONS: &[&str] = &[
    "pyc", "pyo", "pgm", "pbm", "ppm", "mp3", "mp4", "mpeg", "mpg", "emf",
];
// Lower-case extensions that mark a non-text file as an archive in
// `detect_is_archive`.
const ARCHIVE_EXTENSIONS: &[&str] = &[
    "zip", "jar", "war", "ear", "tar", "gz", "tgz", "bz2", "xz", "7z", "rar", "apk", "deb", "rpm",
    "whl", "crate", "egg", "gem", "nupkg", "sqs", "squashfs",
];
// Byte sequences that indicate license or copyright text; presumably used to
// locate the windows scanned inside large Mach-O binaries.
const LARGE_MACHO_LEGAL_MARKERS: &[&[u8]] = &[
    b"Unicode, Inc.",
    b"http://www.unicode.org/copyright.html",
    b"https://www.unicode.org/copyright.html",
    b"SPDX-License-Identifier:",
    b"Licensed under",
    b"licensed under",
    b"Apache License",
    b"http://www.apache.org/licenses/",
    b"https://www.apache.org/licenses/",
    b"Permission is hereby granted",
    b"permission is hereby granted",
    b"Redistribution and use in source and binary forms",
    b"redistribution and use in source and binary forms",
    b"Permission to use, copy, modify, and/or distribute this software",
    b"The MIT License",
    b"GNU GENERAL PUBLIC LICENSE",
    b"GNU LESSER GENERAL PUBLIC LICENSE",
    b"Mozilla Public License",
];
90
91pub fn get_creation_date(metadata: &fs::Metadata) -> Option<String> {
93 metadata.modified().ok().map(|time: std::time::SystemTime| {
94 let seconds_since_epoch = time
95 .duration_since(std::time::UNIX_EPOCH)
96 .unwrap()
97 .as_secs() as i64;
98
99 Utc.timestamp_opt(seconds_since_epoch, 0)
100 .single()
101 .unwrap_or_else(Utc::now)
102 .format("%Y-%m-%d")
103 .to_string()
104 })
105}
106
107pub fn is_path_excluded(path: &Path, exclude_patterns: &[Pattern]) -> bool {
109 let path_str = path.to_string_lossy();
110 let file_name = path
111 .file_name()
112 .map(|name| name.to_string_lossy())
113 .unwrap_or_default();
114
115 for pattern in exclude_patterns {
116 if pattern.matches(&path_str) {
118 return true;
119 }
120
121 if pattern.matches(&file_name) {
123 return true;
124 }
125 }
126
127 false
128}
129
130pub fn decode_bytes_to_string(bytes: &[u8]) -> String {
137 if let Some(decoded) = decode_utf16_text(bytes) {
138 return decoded;
139 }
140
141 match String::from_utf8(bytes.to_vec()) {
142 Ok(s) => s,
143 Err(e) => {
144 let bytes = e.into_bytes();
145 if has_binary_control_chars(&bytes) {
146 return String::new();
147 }
148 bytes.iter().map(|&b| b as char).collect()
149 }
150 }
151}
152
153pub fn extract_text_for_detection(path: &Path, bytes: &[u8]) -> (String, ExtractedTextKind) {
154 let (text, kind, _) = extract_text_for_detection_with_diagnostics(path, bytes);
155 (text, kind)
156}
157
158pub(crate) fn augment_license_detection_text<'a>(path: &Path, text: &'a str) -> Cow<'a, str> {
159 let Some(extension) = path.extension().and_then(|ext| ext.to_str()) else {
160 return Cow::Borrowed(text);
161 };
162 if !matches!(
163 extension.to_ascii_lowercase().as_str(),
164 "md" | "markdown" | "html" | "htm"
165 ) {
166 return Cow::Borrowed(text);
167 }
168
169 let mut hints = Vec::new();
170 let has_dual_license_notice = has_dual_license_notice_text(text);
171 if text.contains("CC BY 4.0") || text.contains("creativecommons.org/licenses/by/4.0") {
172 hints.push("Creative Commons Attribution 4.0 International License".to_string());
173 }
174 if !has_dual_license_notice
175 && (text.contains("Apache License (Version 2.0)")
176 || text.contains("Apache License, Version 2.0"))
177 {
178 hints.push(
179 "Licensed under the Apache License, Version 2.0. http://www.apache.org/licenses/LICENSE-2.0"
180 .to_string(),
181 );
182 }
183
184 if !has_dual_license_notice {
185 hints.extend(extract_shields_license_badge_hints(text));
186 }
187
188 if hints.is_empty() {
189 Cow::Borrowed(text)
190 } else {
191 let mut augmented =
192 String::with_capacity(text.len() + hints.iter().map(String::len).sum::<usize>() + 8);
193 augmented.push_str(text);
194 augmented.push_str("\n\n");
195 for (index, hint) in hints.into_iter().enumerate() {
196 if index > 0 {
197 augmented.push('\n');
198 }
199 augmented.push_str(&hint);
200 }
201 Cow::Owned(augmented)
202 }
203}
204
205fn extract_shields_license_badge_hints(text: &str) -> Vec<String> {
206 let mut hints = Vec::new();
207 let mut rest = text;
208 let needle = "img.shields.io/badge/license-";
209
210 while let Some(index) = rest.find(needle) {
211 let start = index + needle.len();
212 let suffix = &rest[start..];
213 let end = suffix
214 .find([')', ']', '"', '\'', ' ', '\n'])
215 .unwrap_or(suffix.len());
216 let badge = &suffix[..end];
217 let Some(badge) = badge.strip_suffix(".svg") else {
218 rest = &suffix[end..];
219 continue;
220 };
221
222 let mut segments: Vec<_> = badge
223 .split('-')
224 .filter(|segment| !segment.is_empty())
225 .collect();
226 if segments.len() < 2 {
227 rest = &suffix[end..];
228 continue;
229 }
230 segments.pop();
231 let candidate = segments.join("-").replace("%20", " ").replace('_', "-");
232 if !candidate.is_empty() {
233 hints.push(canonical_shields_license_hint(&candidate));
234 }
235
236 rest = &suffix[end..];
237 }
238
239 hints.sort();
240 hints.dedup();
241 hints
242}
243
/// Reports whether `text` contains a dual-license notice (ASCII
/// case-insensitive).
///
/// Recognized forms are "dual-licensed under", "dual licensed under", or the
/// combination of "licensed under either of" and "at your option".
fn has_dual_license_notice_text(text: &str) -> bool {
    let normalized = text.to_ascii_lowercase();
    let mentions = |phrase: &str| normalized.contains(phrase);

    if mentions("licensed under either of") && mentions("at your option") {
        return true;
    }
    mentions("dual-licensed under") || mentions("dual licensed under")
}
250
/// Maps a badge label to a license phrase.
///
/// `MIT` and the two Apache 2.0 spellings get fixed phrases; any other
/// trimmed label is suffixed with " License".
fn canonical_shields_license_hint(candidate: &str) -> String {
    let label = candidate.trim();
    if label == "MIT" {
        return String::from("The MIT License");
    }
    if matches!(label, "Apache-2.0" | "Apache 2.0") {
        return String::from("Apache License 2.0");
    }
    format!("{label} License")
}
258
/// Extracts the text used for detection from a file's bytes, trying a series
/// of format-specific strategies in a fixed order.
///
/// Returns the extracted text, the [`ExtractedTextKind`] describing which
/// strategy produced it, and an optional diagnostic. In the code shown here
/// the diagnostic is only ever populated by the PDF branch, and only when no
/// PDF text was produced.
///
/// The order is: RTF, PDF, supported image metadata, font metadata, Windows
/// executable metadata / large opaque binaries, decoded text, and finally
/// printable strings from binary content.
pub(crate) fn extract_text_for_detection_with_diagnostics(
    path: &Path,
    bytes: &[u8],
) -> (String, ExtractedTextKind, Option<String>) {
    let ext = path
        .extension()
        .and_then(|e| e.to_str())
        .map(|s| s.to_ascii_lowercase());
    let detected_format = detect_file_format(bytes);

    // RTF: report as decoded text unless the result is blank.
    if looks_like_rtf(bytes, ext.as_deref()) {
        let text = extract_rtf_text(bytes);
        return if text.trim().is_empty() {
            (String::new(), ExtractedTextKind::None, None)
        } else {
            (text, ExtractedTextKind::Decoded, None)
        };
    }

    // PDF: the scan error is surfaced only when no text came back.
    if looks_like_pdf(bytes) || detected_format.short_name() == Some("PDF") {
        let (text, scan_error) = extract_pdf_text(path, bytes);
        return if text.is_empty() {
            (String::new(), ExtractedTextKind::None, scan_error)
        } else {
            (text, ExtractedTextKind::Pdf, None)
        };
    }

    // Images: prefer metadata; if there is none and the bytes are not a real
    // container of that image format, fall back to plain decoding.
    if let Some(format) = supported_image_metadata_format(ext.as_deref(), detected_format) {
        let text = extract_image_metadata_text(bytes, format);
        return if text.is_empty() {
            if is_supported_image_container(bytes, format) {
                (String::new(), ExtractedTextKind::None, None)
            } else {
                let decoded = decode_bytes_to_string(bytes);
                if decoded.is_empty() {
                    (String::new(), ExtractedTextKind::None, None)
                } else {
                    (decoded, ExtractedTextKind::Decoded, None)
                }
            }
        } else {
            (text, ExtractedTextKind::ImageMetadata, None)
        };
    }

    // Fonts: metadata text first, followed by any printable strings.
    if let Some(text) = extract_font_metadata_text(path, bytes) {
        let strings = extract_printable_strings(bytes);
        let combined = if strings.is_empty() {
            text
        } else {
            combine_extracted_text_fragments(Some(text), strings)
        };
        return (combined, ExtractedTextKind::FontMetadata, None);
    }

    let windows_executable_metadata_text = extract_windows_executable_metadata_text(bytes);
    // A file is only treated as a large opaque binary when no Windows
    // executable metadata was found for it.
    let large_opaque_binary = windows_executable_metadata_text.is_none()
        && is_large_opaque_binary_candidate(bytes, detected_format);
    let bounded_macho_legal_text = if large_opaque_binary {
        extract_bounded_macho_legal_strings(bytes)
    } else {
        String::new()
    };
    let skip_large_opaque_binary_text =
        should_skip_large_opaque_binary_text_extraction(path, bytes, detected_format);

    // When full extraction is skipped, still return the bounded legal strings
    // if any were found, otherwise whatever Windows metadata exists.
    if skip_large_opaque_binary_text {
        if !bounded_macho_legal_text.is_empty() {
            return (
                combine_extracted_text_fragments(
                    windows_executable_metadata_text,
                    bounded_macho_legal_text,
                ),
                ExtractedTextKind::BinaryStrings,
                None,
            );
        }
        return windows_metadata_or_empty_result(windows_executable_metadata_text);
    }

    if should_skip_binary_string_extraction(path, bytes, detected_format) {
        return (String::new(), ExtractedTextKind::None, None);
    }

    let is_svg_text = lower_extension(path).as_deref() == Some("svg")
        || detected_format.media_type() == "image/svg+xml";
    let should_try_decoded_text = looks_like_textual_bytes(bytes) || is_svg_text;
    let decoded_is_utf8 = std::str::from_utf8(bytes).is_ok();
    let path_suggests_text = ext.as_deref().is_some_and(|extension| {
        PLAIN_TEXT_EXTENSIONS.contains(&extension) || detect_language(path, bytes).is_some()
    });

    // Decoded text is accepted only when at least one signal (SVG, valid
    // UTF-8, a text-like path, or a text-like shape) supports it.
    if !large_opaque_binary && should_try_decoded_text {
        let decoded = decode_bytes_to_string(bytes);
        if !decoded.is_empty()
            && (is_svg_text
                || decoded_is_utf8
                || path_suggests_text
                || looks_like_decoded_text(&decoded))
        {
            let combined =
                combine_extracted_text_fragments(windows_executable_metadata_text, decoded);
            return (combined, ExtractedTextKind::Decoded, None);
        }
    }

    // Last resort: printable strings (sampled for large opaque binaries).
    let text = if large_opaque_binary {
        let sampled_text = extract_sampled_printable_strings(bytes);
        if bounded_macho_legal_text.is_empty() {
            sampled_text
        } else {
            combine_extracted_text_fragments(Some(sampled_text), bounded_macho_legal_text)
        }
    } else {
        extract_printable_strings(bytes)
    };
    if text.is_empty() {
        windows_metadata_or_empty_result(windows_executable_metadata_text)
    } else {
        (
            combine_extracted_text_fragments(windows_executable_metadata_text, text),
            ExtractedTextKind::BinaryStrings,
            None,
        )
    }
}
386
/// Joins an optional leading fragment and a trailing fragment with a newline.
///
/// A missing or empty `prefix` yields `suffix` unchanged; an empty `suffix`
/// yields the prefix alone.
fn combine_extracted_text_fragments(prefix: Option<String>, suffix: String) -> String {
    let Some(head) = prefix.filter(|text| !text.is_empty()) else {
        return suffix;
    };

    if suffix.is_empty() {
        head
    } else {
        format!("{head}\n{suffix}")
    }
}
394
395fn windows_metadata_or_empty_result(
396 windows_executable_metadata_text: Option<String>,
397) -> (String, ExtractedTextKind, Option<String>) {
398 if let Some(metadata_text) = windows_executable_metadata_text {
399 (
400 metadata_text,
401 ExtractedTextKind::WindowsExecutableMetadata,
402 None,
403 )
404 } else {
405 (String::new(), ExtractedTextKind::None, None)
406 }
407}
408
409pub fn classify_file_info(path: &Path, bytes: &[u8]) -> FileInfoClassification {
410 let detected_format = detect_file_format(bytes);
411 let detected_language = detect_language(path, bytes);
412 let is_binary = detect_is_binary(path, bytes, detected_format, detected_language.as_deref());
413 let is_text = !is_binary;
414 let mime_type = detect_mime_type(path, bytes, detected_format, detected_language.as_deref());
415 let is_archive = detect_is_archive(path, bytes, &mime_type, is_text, detected_format);
416 let is_media = detect_is_media(path, bytes, &mime_type, detected_format);
417 let is_script = detect_is_script(path, bytes, detected_language.as_deref(), is_text);
418 let is_source = detect_is_source(path, detected_language.as_deref(), is_text, is_script);
419 let programming_language = is_source.then(|| detected_language.clone()).flatten();
420 let file_type = detect_file_type(
421 path,
422 bytes,
423 detected_format,
424 &mime_type,
425 programming_language.as_deref(),
426 is_binary,
427 is_text,
428 is_archive,
429 is_media,
430 is_script,
431 );
432
433 FileInfoClassification {
434 mime_type,
435 file_type,
436 programming_language,
437 is_binary,
438 is_text,
439 is_archive,
440 is_media,
441 is_source,
442 is_script,
443 }
444}
445
446fn detect_file_format(bytes: &[u8]) -> FileFormat {
447 FileFormat::from_reader(Cursor::new(bytes)).unwrap_or(FileFormat::ArbitraryBinaryData)
448}
449
// Two consecutive UTF-8 encodings of U+FFFD (the replacement character).
// Presumably what remains of a UTF-16 BOM after each of its two bytes was
// lossily converted — TODO confirm against the producers of such files.
const CORRUPTED_UTF16_BOM_PREFIX: &[u8] = &[0xEF, 0xBF, 0xBD, 0xEF, 0xBF, 0xBD];
451
/// Reports whether `bytes` form valid UTF-8 (an empty slice is valid).
fn is_utf8_text(bytes: &[u8]) -> bool {
    matches!(std::str::from_utf8(bytes), Ok(_))
}
455
456fn strip_corrupted_utf16_bom_prefix(bytes: &[u8]) -> &[u8] {
457 bytes
458 .strip_prefix(CORRUPTED_UTF16_BOM_PREFIX)
459 .unwrap_or(bytes)
460}
461
462fn decode_utf16_units(bytes: &[u8], is_le: bool, require_text_shape: bool) -> Option<String> {
463 if bytes.is_empty() || !bytes.len().is_multiple_of(2) {
464 return None;
465 }
466
467 let code_units: Vec<u16> = bytes
468 .chunks_exact(2)
469 .map(|chunk| {
470 if is_le {
471 u16::from_le_bytes([chunk[0], chunk[1]])
472 } else {
473 u16::from_be_bytes([chunk[0], chunk[1]])
474 }
475 })
476 .collect();
477
478 let decoded = std::char::decode_utf16(code_units)
479 .collect::<Result<String, _>>()
480 .ok()?;
481
482 if !require_text_shape {
483 return (!decoded.contains('\0')).then_some(decoded);
484 }
485
486 if !looks_like_decoded_text(&decoded) {
487 return None;
488 }
489
490 Some(decoded)
491}
492
/// Heuristically decides whether a decoded string reads like text.
///
/// The string is rejected if it contains any control character other than
/// newline, carriage return, or tab, or if it has fewer than three
/// characters. Otherwise letters, a fixed set of punctuation, and whitespace
/// must make up all but at most a fifth of the characters, and there must be
/// at least one letter or two punctuation characters.
fn looks_like_decoded_text(decoded: &str) -> bool {
    let mut visible = 0usize;
    let mut alpha = 0usize;
    let mut punctuation = 0usize;
    let mut whitespace = 0usize;

    for ch in decoded.chars() {
        // Any control character besides the layout ones (this includes NUL)
        // disqualifies the whole string.
        if ch.is_control() && !matches!(ch, '\n' | '\r' | '\t') {
            return false;
        }
        visible += 1;

        if ch.is_alphabetic() {
            alpha += 1;
        }
        if matches!(
            ch,
            '{' | '}'
                | '['
                | ']'
                | '<'
                | '>'
                | '('
                | ')'
                | ':'
                | ';'
                | ','
                | '"'
                | '\''
                | '/'
                | '='
                | '-'
                | '_'
                | '#'
                | '!'
        ) {
            punctuation += 1;
        }
        if ch.is_whitespace() {
            whitespace += 1;
        }
    }

    if visible < 3 {
        return false;
    }

    let textish = alpha + punctuation + whitespace;
    let mostly_textish = textish + (visible / 5) >= visible;
    mostly_textish && (alpha > 0 || punctuation >= 2)
}
541
542fn detect_utf16_endianness(bytes: &[u8]) -> Option<bool> {
543 let stripped = strip_corrupted_utf16_bom_prefix(bytes);
544 if stripped.len() < 4 || !stripped.len().is_multiple_of(2) {
545 return None;
546 }
547
548 let pair_count = stripped.len() / 2;
549 let even_zero = stripped.iter().step_by(2).filter(|&&b| b == 0).count();
550 let odd_zero = stripped
551 .iter()
552 .skip(1)
553 .step_by(2)
554 .filter(|&&b| b == 0)
555 .count();
556
557 let looks_like_be = even_zero * 3 >= pair_count && odd_zero * 6 <= pair_count;
558 let looks_like_le = odd_zero * 3 >= pair_count && even_zero * 6 <= pair_count;
559
560 match (looks_like_le, looks_like_be) {
561 (true, false) => Some(true),
562 (false, true) => Some(false),
563 (true, true) => Some(true),
564 (false, false) => None,
565 }
566}
567
568fn decode_utf16_text(bytes: &[u8]) -> Option<String> {
569 if let Some(decoded) = decode_utf16_bom_text(bytes) {
570 return Some(decoded);
571 }
572
573 let stripped = strip_corrupted_utf16_bom_prefix(bytes);
574 match detect_utf16_endianness(bytes) {
575 Some(true) => decode_utf16_units(stripped, true, true),
576 Some(false) => decode_utf16_units(stripped, false, true),
577 None => None,
578 }
579}
580
581fn decode_utf16_json_text(bytes: &[u8]) -> Option<String> {
582 if bytes.len() >= 2 {
583 let (is_le, body) = match bytes {
584 [0xFF, 0xFE, rest @ ..] => (true, rest),
585 [0xFE, 0xFF, rest @ ..] => (false, rest),
586 _ => {
587 let stripped = strip_corrupted_utf16_bom_prefix(bytes);
588 return match detect_utf16_endianness(bytes) {
589 Some(true) => decode_utf16_units(stripped, true, false),
590 Some(false) => decode_utf16_units(stripped, false, false),
591 None => None,
592 };
593 }
594 };
595
596 if body.is_empty() || !body.len().is_multiple_of(2) {
597 return None;
598 }
599
600 return decode_utf16_units(body, is_le, false);
601 }
602
603 None
604}
605
606fn decode_utf16_bom_text(bytes: &[u8]) -> Option<String> {
607 if bytes.len() < 2 || !bytes.len().is_multiple_of(2) {
608 return None;
609 }
610
611 let (is_le, body) = match bytes {
612 [0xFF, 0xFE, rest @ ..] => (true, rest),
613 [0xFE, 0xFF, rest @ ..] => (false, rest),
614 _ => return None,
615 };
616
617 if body.is_empty() || body.len() % 2 != 0 {
618 return None;
619 }
620
621 decode_utf16_units(body, is_le, true)
622}
623
624fn has_binary_control_chars(bytes: &[u8]) -> bool {
625 let control_count = bytes
626 .iter()
627 .filter(|&&b| b < 0x09 || (b > 0x0D && b < 0x20))
628 .count();
629 control_count > bytes.len() / BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR
630}
631
632fn has_decodable_text(bytes: &[u8]) -> bool {
633 bytes.is_empty()
634 || is_utf8_text(bytes)
635 || decode_utf16_text(bytes).is_some()
636 || !has_binary_control_chars(bytes)
637}
638
639fn looks_like_textual_bytes(bytes: &[u8]) -> bool {
640 if bytes.is_empty() || is_utf8_text(bytes) {
641 return true;
642 }
643 if let Some(decoded) = decode_utf16_text(bytes) {
644 return decoded
645 .chars()
646 .any(|ch| !ch.is_control() || matches!(ch, '\n' | '\r' | '\t'));
647 }
648
649 let printable_count = bytes
650 .iter()
651 .filter(|&&b| matches!(b, b'\n' | b'\r' | b'\t') || (0x20..=0x7e).contains(&b))
652 .count();
653 printable_count * 2 >= bytes.len()
654}
655
/// Reports whether a media type denotes textual content: any `text/*` type,
/// `application/json`, `application/xml`, or a type ending in `+json` / `+xml`.
fn is_textual_media_type(media_type: &str) -> bool {
    const EXACT_MATCHES: [&str; 3] = ["application/json", "application/xml", "text/xml"];
    const STRUCTURED_SUFFIXES: [&str; 2] = ["+json", "+xml"];

    if media_type.starts_with("text/") || EXACT_MATCHES.contains(&media_type) {
        return true;
    }
    STRUCTURED_SUFFIXES
        .iter()
        .any(|suffix| media_type.ends_with(*suffix))
}
665
666fn is_textual_format(detected_format: FileFormat) -> bool {
667 matches!(detected_format, FileFormat::Empty | FileFormat::PlainText)
668 || is_textual_media_type(detected_format.media_type())
669}
670
671fn is_known_binary_format(detected_format: FileFormat) -> bool {
672 !matches!(detected_format, FileFormat::ArbitraryBinaryData)
673 && !is_textual_format(detected_format)
674}
675
676pub fn detect_mime_type(
677 path: &Path,
678 bytes: &[u8],
679 detected_format: FileFormat,
680 programming_language: Option<&str>,
681) -> String {
682 if bytes.is_empty() {
683 return "inode/x-empty".to_string();
684 }
685
686 if lower_extension(path).as_deref() == Some("json") {
687 if let Some(is_binary) = json_binary_override(bytes) {
688 if is_binary {
689 return "application/octet-stream".to_string();
690 }
691 if has_valid_json_text(bytes) {
692 return "application/json".to_string();
693 }
694 return "text/plain".to_string();
695 }
696 if has_valid_json_text(bytes) {
697 return "application/json".to_string();
698 }
699 if has_decodable_text(bytes) && looks_like_textual_bytes(bytes) {
700 return "text/plain".to_string();
701 }
702 return "application/octet-stream".to_string();
703 }
704
705 if is_zip_archive(bytes) {
706 return detect_zip_like_mime(path);
707 }
708
709 if looks_like_deb(bytes, path) {
710 return "application/vnd.debian.binary-package".to_string();
711 }
712
713 if looks_like_rpm(bytes, path) {
714 return "application/x-rpm".to_string();
715 }
716
717 let guessed_mime = from_path(path)
718 .first_or_octet_stream()
719 .essence_str()
720 .to_string();
721
722 let mime_type = match detected_format {
723 FileFormat::Empty => "inode/x-empty".to_string(),
724 FileFormat::PlainText => {
725 if guessed_mime == "application/octet-stream" || guessed_mime.starts_with("video/") {
726 "text/plain".to_string()
727 } else {
728 guessed_mime.clone()
729 }
730 }
731 _ => {
732 let detected_mime = detected_format.media_type();
733 if detected_mime == "application/octet-stream"
734 && guessed_mime != "application/octet-stream"
735 {
736 guessed_mime.clone()
737 } else {
738 detected_mime.to_string()
739 }
740 }
741 };
742
743 normalize_mime_type(path, bytes, programming_language, &mime_type)
744}
745
746fn normalize_mime_type(
747 path: &Path,
748 bytes: &[u8],
749 programming_language: Option<&str>,
750 mime_type: &str,
751) -> String {
752 if should_prefer_text_mime(path, bytes, programming_language, mime_type) {
753 return "text/plain".to_string();
754 }
755
756 mime_type.to_string()
757}
758
759fn should_prefer_text_mime(
760 path: &Path,
761 bytes: &[u8],
762 programming_language: Option<&str>,
763 mime_type: &str,
764) -> bool {
765 has_decodable_text(bytes)
766 && looks_like_textual_bytes(bytes)
767 && is_textual_source_candidate(path, programming_language)
768 && (mime_type.starts_with("video/") || mime_type == "application/octet-stream")
769}
770
771fn has_valid_json_text(bytes: &[u8]) -> bool {
772 if bytes.len() > JSON_VALIDATION_MAX_BYTES {
773 return false;
774 }
775
776 serde_json::from_slice::<serde_json::Value>(bytes).is_ok()
777 || decode_utf16_json_text(bytes)
778 .and_then(|text| serde_json::from_str::<serde_json::Value>(&text).ok())
779 .is_some()
780}
781
/// Reports whether the bytes are shaped like `["…"]`: at least eight bytes,
/// starting with `["`, ending with `"]`, and free of `0x00` and `0xFF` bytes.
fn is_wrapped_invalid_json_string_text(bytes: &[u8]) -> bool {
    if bytes.len() < 8 {
        return false;
    }

    let wrapped = bytes.starts_with(b"[\"") && bytes.ends_with(b"\"]");
    let has_forbidden_byte = bytes.iter().any(|&byte| byte == 0 || byte == 0xFF);
    wrapped && !has_forbidden_byte
}
789
790fn json_binary_override(bytes: &[u8]) -> Option<bool> {
791 if has_valid_json_text(bytes) {
792 return Some(false);
793 }
794
795 if bytes.contains(&0) {
796 return Some(true);
797 }
798
799 if bytes.contains(&0xFF) && (bytes.len() <= 5 || bytes.len() > 1024) {
800 return Some(true);
801 }
802
803 if is_wrapped_invalid_json_string_text(bytes) {
804 return Some(false);
805 }
806
807 None
808}
809
810fn detect_is_binary(
811 path: &Path,
812 bytes: &[u8],
813 detected_format: FileFormat,
814 programming_language: Option<&str>,
815) -> bool {
816 if lower_extension(path).as_deref() == Some("json")
817 && let Some(is_binary) = json_binary_override(bytes)
818 {
819 return is_binary;
820 }
821
822 if is_textual_format(detected_format) {
823 return false;
824 }
825
826 if lower_extension(path)
827 .as_deref()
828 .is_some_and(|ext| BINARY_EXTENSIONS.contains(&ext))
829 {
830 return true;
831 }
832
833 if should_treat_binary_bytes_as_text(path, bytes, programming_language) {
834 return false;
835 }
836
837 has_binary_control_chars(bytes)
838 || is_known_binary_format(detected_format)
839 || (matches!(detected_format, FileFormat::ArbitraryBinaryData)
840 && !looks_like_textual_bytes(bytes))
841}
842
843fn should_treat_binary_bytes_as_text(
844 path: &Path,
845 bytes: &[u8],
846 programming_language: Option<&str>,
847) -> bool {
848 has_decodable_text(bytes)
849 && looks_like_textual_bytes(bytes)
850 && (bytes.starts_with(b"#!") || is_textual_source_candidate(path, programming_language))
851}
852
853fn detect_is_archive(
854 path: &Path,
855 bytes: &[u8],
856 mime_type: &str,
857 is_text: bool,
858 detected_format: FileFormat,
859) -> bool {
860 if is_text {
861 return false;
862 }
863
864 lower_extension(path)
865 .as_deref()
866 .is_some_and(|ext| ARCHIVE_EXTENSIONS.contains(&ext))
867 || matches!(
868 detected_format.kind(),
869 FileFormatKind::Archive | FileFormatKind::Compressed | FileFormatKind::Package
870 )
871 || is_zip_archive(bytes)
872 || looks_like_gzip(bytes)
873 || looks_like_bzip2(bytes)
874 || looks_like_xz(bytes)
875 || looks_like_deb(bytes, path)
876 || looks_like_rpm(bytes, path)
877 || looks_like_squashfs(bytes, path)
878 || mime_type.contains("zip")
879 || mime_type.contains("compressed")
880 || mime_type.contains("tar")
881 || mime_type.contains("x-rpm")
882 || mime_type.contains("debian")
883}
884
885fn detect_is_media(
886 path: &Path,
887 bytes: &[u8],
888 mime_type: &str,
889 detected_format: FileFormat,
890) -> bool {
891 media_mime_from_content(bytes).is_some()
892 || matches!(
893 detected_format.kind(),
894 FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
895 )
896 || mime_type.starts_with("image/")
897 || mime_type.starts_with("audio/")
898 || mime_type.starts_with("video/")
899 || (mime_type == "application/octet-stream"
900 && lower_extension(path).as_deref() == Some("tga")
901 && !has_binary_control_chars(bytes))
902}
903
904fn detect_is_script(
905 path: &Path,
906 bytes: &[u8],
907 programming_language: Option<&str>,
908 is_text: bool,
909) -> bool {
910 if !is_text || is_makefile(path) {
911 return false;
912 }
913
914 bytes.starts_with(b"#!")
915 || lower_extension(path).as_deref().is_some_and(|ext| {
916 matches!(
917 ext,
918 "sh" | "bash" | "zsh" | "fish" | "ksh" | "ps1" | "psm1" | "psd1" | "awk"
919 )
920 })
921 || matches!(
922 programming_language,
923 Some(
924 "Shell"
925 | "Bash"
926 | "Zsh"
927 | "Fish"
928 | "Ksh"
929 | "Python"
930 | "Ruby"
931 | "Perl"
932 | "PHP"
933 | "PowerShell"
934 | "Awk"
935 )
936 )
937}
938
939fn detect_is_source(
940 path: &Path,
941 programming_language: Option<&str>,
942 is_text: bool,
943 is_script: bool,
944) -> bool {
945 if !is_text || is_plain_text(path) || is_makefile(path) || is_source_map(path) {
946 return false;
947 }
948
949 if is_c_like_source(path) || is_java_like_source(path) {
950 return true;
951 }
952
953 programming_language.is_some() || is_script
954}
955
956#[allow(clippy::too_many_arguments)]
957fn detect_file_type(
958 path: &Path,
959 bytes: &[u8],
960 detected_format: FileFormat,
961 mime_type: &str,
962 programming_language: Option<&str>,
963 is_binary: bool,
964 is_text: bool,
965 is_archive: bool,
966 is_media: bool,
967 is_script: bool,
968) -> String {
969 if bytes.is_empty() {
970 return "empty".to_string();
971 }
972
973 if looks_like_pdf(bytes) {
974 return "PDF document".to_string();
975 }
976
977 if let Some(file_type) = media_file_type_from_content(bytes) {
978 return file_type.to_string();
979 }
980
981 if is_archive {
982 return archive_file_type(path, bytes, detected_format);
983 }
984
985 if is_script {
986 return script_file_type(programming_language, bytes);
987 }
988
989 if is_text {
990 if lower_extension(path).as_deref() == Some("json") {
991 if has_valid_json_text(bytes) {
992 return "JSON text data".to_string();
993 }
994 return text_file_type(bytes);
995 }
996 if lower_extension(path).as_deref() == Some("xml") {
997 return "XML text data".to_string();
998 }
999 if matches!(lower_extension(path).as_deref(), Some("yaml" | "yml")) {
1000 return "YAML text data".to_string();
1001 }
1002 if lower_extension(path).as_deref() == Some("toml") {
1003 return "TOML text data".to_string();
1004 }
1005 if matches!(
1006 lower_extension(path).as_deref(),
1007 Some("ini" | "cfg" | "conf")
1008 ) {
1009 return "INI text data".to_string();
1010 }
1011 if matches!(lower_file_name(path).as_str(), ".gitmodules" | ".gitconfig") {
1012 return "Git configuration text".to_string();
1013 }
1014 if matches!(lower_extension(path).as_deref(), Some("md" | "markdown")) {
1015 return text_file_type(bytes);
1016 }
1017 if programming_language.is_some() && !is_media {
1018 return source_file_type(programming_language, bytes);
1019 }
1020 return text_file_type(bytes);
1021 }
1022
1023 if let Some(file_type) = format_based_file_type(detected_format) {
1024 return file_type;
1025 }
1026
1027 if is_binary && mime_type == "application/octet-stream" {
1028 return "data".to_string();
1029 }
1030
1031 mime_type.to_string()
1032}
1033
1034fn is_textual_source_candidate(path: &Path, programming_language: Option<&str>) -> bool {
1035 if matches!(programming_language, Some(language) if is_source_like_language(language)) {
1036 return true;
1037 }
1038
1039 if matches!(
1040 lower_file_name(path).as_str(),
1041 "dockerfile"
1042 | "containerfile"
1043 | "containerfile.core"
1044 | "apkbuild"
1045 | "podfile"
1046 | "jamfile"
1047 | "jamroot"
1048 | "meson.build"
1049 | "build"
1050 | "workspace"
1051 | "buck"
1052 | "default.nix"
1053 | "flake.nix"
1054 | "shell.nix"
1055 ) {
1056 return true;
1057 }
1058
1059 path.extension()
1060 .and_then(|ext| ext.to_str())
1061 .is_some_and(|ext| {
1062 matches!(
1063 ext.to_ascii_lowercase().as_str(),
1064 "rs" | "py"
1065 | "js"
1066 | "mjs"
1067 | "cjs"
1068 | "jsx"
1069 | "ts"
1070 | "mts"
1071 | "cts"
1072 | "tsx"
1073 | "c"
1074 | "cpp"
1075 | "cc"
1076 | "cxx"
1077 | "h"
1078 | "hpp"
1079 | "m"
1080 | "mm"
1081 | "s"
1082 | "asm"
1083 | "java"
1084 | "go"
1085 | "rb"
1086 | "php"
1087 | "pl"
1088 | "swift"
1089 | "sh"
1090 | "bash"
1091 | "zsh"
1092 | "fish"
1093 | "ksh"
1094 | "ps1"
1095 | "psm1"
1096 | "psd1"
1097 | "awk"
1098 | "kt"
1099 | "kts"
1100 | "dart"
1101 | "scala"
1102 | "groovy"
1103 | "gradle"
1104 | "gvy"
1105 | "gy"
1106 | "gsh"
1107 | "cs"
1108 | "fs"
1109 | "fsx"
1110 | "r"
1111 | "lua"
1112 | "jl"
1113 | "ex"
1114 | "exs"
1115 | "clj"
1116 | "cljs"
1117 | "cljc"
1118 | "hs"
1119 | "erl"
1120 | "nix"
1121 | "zig"
1122 | "bzl"
1123 | "bazel"
1124 | "star"
1125 | "sky"
1126 | "ml"
1127 | "mli"
1128 | "tex"
1129 )
1130 })
1131}
1132
/// Reports whether `language` is one of the language names treated as source
/// code. The comparison is exact and case-sensitive.
fn is_source_like_language(language: &str) -> bool {
    const SOURCE_LIKE_LANGUAGES: &[&str] = &[
        "Rust",
        "Python",
        "JavaScript",
        "TypeScript",
        "JavaScript/TypeScript",
        "C",
        "C++",
        "Objective-C",
        "Objective-C++",
        "GAS",
        "Java",
        "Go",
        "Ruby",
        "PHP",
        "Perl",
        "Swift",
        "Shell",
        "PowerShell",
        "Awk",
        "Kotlin",
        "Dart",
        "Scala",
        "C#",
        "F#",
        "R",
        "Lua",
        "Julia",
        "Elixir",
        "Clojure",
        "Haskell",
        "Erlang",
        "Groovy",
        "Nix",
        "Zig",
        "Starlark",
        "OCaml",
        "Meson",
        "TeX",
        "Dockerfile",
        "Makefile",
        "Jamfile",
    ];

    SOURCE_LIKE_LANGUAGES.contains(&language)
}
1179
/// Returns the path's extension as `&str`, or `None` when there is no
/// extension or it is not valid UTF-8.
fn extension(path: &Path) -> Option<&str> {
    path.extension()?.to_str()
}
1183
1184fn lower_extension(path: &Path) -> Option<String> {
1185 extension(path).map(|ext| ext.to_ascii_lowercase())
1186}
1187
/// Returns the final path component in ASCII lower case, or an empty string
/// when the path has no file name or it is not valid UTF-8.
fn lower_file_name(path: &Path) -> String {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name.to_ascii_lowercase(),
        None => String::new(),
    }
}
1194
1195fn is_plain_text(path: &Path) -> bool {
1196 lower_extension(path)
1197 .as_deref()
1198 .is_some_and(|ext| PLAIN_TEXT_EXTENSIONS.contains(&ext))
1199}
1200
1201fn is_makefile(path: &Path) -> bool {
1202 matches!(lower_file_name(path).as_str(), "makefile" | "makefile.inc")
1203}
1204
/// Reports whether the path ends in `.js.map` or `.css.map`
/// (ASCII case-insensitive).
fn is_source_map(path: &Path) -> bool {
    let lowered = path.to_string_lossy().to_ascii_lowercase();
    [".js.map", ".css.map"]
        .iter()
        .any(|suffix| lowered.ends_with(*suffix))
}
1209
1210fn is_c_like_source(path: &Path) -> bool {
1211 lower_extension(path).as_deref().is_some_and(|ext| {
1212 matches!(
1213 ext,
1214 "c" | "cc"
1215 | "cp"
1216 | "cpp"
1217 | "cxx"
1218 | "c++"
1219 | "h"
1220 | "hh"
1221 | "hpp"
1222 | "hxx"
1223 | "h++"
1224 | "i"
1225 | "ii"
1226 | "m"
1227 | "s"
1228 | "asm"
1229 )
1230 })
1231}
1232
1233fn is_java_like_source(path: &Path) -> bool {
1234 lower_extension(path)
1235 .as_deref()
1236 .is_some_and(|ext| matches!(ext, "java" | "aj" | "jad" | "ajt"))
1237}
1238
1239fn format_based_file_type(detected_format: FileFormat) -> Option<String> {
1240 match detected_format {
1241 FileFormat::ArbitraryBinaryData | FileFormat::Empty | FileFormat::PlainText => None,
1242 format if format.short_name() == Some("PDF") => Some("PDF document".to_string()),
1243 format => Some(match format.kind() {
1244 FileFormatKind::Image => short_name_or_name(&format, "image data"),
1245 FileFormatKind::Audio => short_name_or_name(&format, "audio data"),
1246 FileFormatKind::Video => short_name_or_name(&format, "video data"),
1247 _ => format.name().to_string(),
1248 }),
1249 }
1250}
1251
1252fn short_name_or_name(format: &FileFormat, suffix: &str) -> String {
1253 format
1254 .short_name()
1255 .map(|short_name| format!("{short_name} {suffix}"))
1256 .unwrap_or_else(|| format!("{} {suffix}", format.name()))
1257}
1258
1259fn detect_zip_like_mime(path: &Path) -> String {
1260 match extension(path).map(|ext| ext.to_ascii_lowercase()) {
1261 Some(ext) if ext == "apk" => "application/vnd.android.package-archive".to_string(),
1262 Some(ext) if matches!(ext.as_str(), "jar" | "war" | "ear") => {
1263 "application/java-archive".to_string()
1264 }
1265 _ => "application/zip".to_string(),
1266 }
1267}
1268
/// Sniffs the leading bytes for a PNG, JPEG, TIFF or WebP signature and returns the matching
/// MIME type, or `None` when no signature matches.
fn media_mime_from_content(bytes: &[u8]) -> Option<&'static str> {
    const PNG_MAGIC: &[u8] = b"\x89PNG\r\n\x1a\n";
    const JPEG_MAGIC: &[u8] = &[0xff, 0xd8, 0xff];
    const TIFF_LITTLE_ENDIAN_MAGIC: &[u8] = b"II\x2a\x00";
    const TIFF_BIG_ENDIAN_MAGIC: &[u8] = b"MM\x00\x2a";

    if bytes.starts_with(PNG_MAGIC) {
        return Some("image/png");
    }
    if bytes.starts_with(JPEG_MAGIC) {
        return Some("image/jpeg");
    }
    if bytes.starts_with(TIFF_LITTLE_ENDIAN_MAGIC) || bytes.starts_with(TIFF_BIG_ENDIAN_MAGIC) {
        return Some("image/tiff");
    }

    // WebP is a RIFF container whose form type sits at bytes 8..12.
    let is_webp = bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]);
    if is_webp { Some("image/webp") } else { None }
}
1282
/// Sniffs the leading bytes for a PNG, JPEG, TIFF or WebP signature and returns the matching
/// file-type label, or `None` when no signature matches.
fn media_file_type_from_content(bytes: &[u8]) -> Option<&'static str> {
    const PNG_MAGIC: &[u8] = b"\x89PNG\r\n\x1a\n";
    const JPEG_MAGIC: &[u8] = &[0xff, 0xd8, 0xff];
    const TIFF_LITTLE_ENDIAN_MAGIC: &[u8] = b"II\x2a\x00";
    const TIFF_BIG_ENDIAN_MAGIC: &[u8] = b"MM\x00\x2a";

    if bytes.starts_with(PNG_MAGIC) {
        return Some("PNG image data");
    }
    if bytes.starts_with(JPEG_MAGIC) {
        return Some("JPEG image data");
    }
    if bytes.starts_with(TIFF_LITTLE_ENDIAN_MAGIC) || bytes.starts_with(TIFF_BIG_ENDIAN_MAGIC) {
        return Some("TIFF image data");
    }

    // WebP is a RIFF container whose form type sits at bytes 8..12.
    let is_webp = bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]);
    if is_webp { Some("WebP image data") } else { None }
}
1296
/// Reports whether `bytes` begins with the `%PDF-` header.
fn looks_like_pdf(bytes: &[u8]) -> bool {
    matches!(bytes, [b'%', b'P', b'D', b'F', b'-', ..])
}
1300
/// Reports whether the input is RTF: either `ext` is exactly `rtf`, or `bytes` begins with
/// the `{\rtf` header.
fn looks_like_rtf(bytes: &[u8], ext: Option<&str>) -> bool {
    if let Some("rtf") = ext {
        return true;
    }
    bytes.starts_with(b"{\\rtf")
}
1304
/// Extracts readable text from RTF markup on a best-effort basis.
///
/// The input is decoded lossily as UTF-8 and scanned once:
///
/// * group braces are dropped and `\\`, `\{`, `\}` produce the literal character;
/// * `\'hh` produces the byte `hh` interpreted through [`rtf_ansi_char`];
/// * `\par`/`\line`, `\tab`, `\emdash`, `\endash`, `\bullet` and the quote control words produce
///   their text equivalent; every other control word is discarded together with its numeric
///   parameter and one delimiting space;
/// * `\uN` produces the UTF-16 code unit `N` (negative values are offset by 65 536). A high
///   surrogate immediately followed by a low surrogate is combined into one character. The
///   single fallback character that follows `\uN` — either a plain character or a complete
///   `\'hh` escape — is skipped so the character is not emitted twice.
///
/// Carriage returns and form feeds in the result become newlines and trailing whitespace is
/// trimmed from every line.
fn extract_rtf_text(bytes: &[u8]) -> String {
    let text = String::from_utf8_lossy(bytes);
    let chars: Vec<char> = text.chars().collect();
    let mut output = String::new();
    let mut index = 0usize;
    // High surrogate produced by the immediately preceding `\uN`, waiting for its low half.
    let mut pending_high_surrogate: Option<u32> = None;

    while index < chars.len() {
        // Taking the value here means a pair is only formed by two adjacent `\uN` tokens;
        // any other token in between discards the dangling high surrogate.
        let high_surrogate = pending_high_surrogate.take();

        match chars[index] {
            '{' | '}' => {
                index += 1;
            }
            '\\' => {
                index += 1;
                let Some(&escaped) = chars.get(index) else {
                    break;
                };

                match escaped {
                    '\\' | '{' | '}' => {
                        output.push(escaped);
                        index += 1;
                    }
                    '\'' => match rtf_hex_escape_byte(&chars, index) {
                        Some(byte) => {
                            output.push(rtf_ansi_char(byte));
                            // Skip the quote and both hex digits.
                            index += 3;
                        }
                        // Malformed escape: drop only the quote and keep scanning.
                        None => index += 1,
                    },
                    control if control.is_ascii_alphabetic() => {
                        let start = index;
                        while index < chars.len() && chars[index].is_ascii_alphabetic() {
                            index += 1;
                        }
                        let control_word: String = chars[start..index].iter().collect();

                        let number_start = index;
                        if index < chars.len()
                            && (chars[index] == '-' || chars[index].is_ascii_digit())
                        {
                            index += 1;
                            while index < chars.len() && chars[index].is_ascii_digit() {
                                index += 1;
                            }
                        }
                        let parameter: String = chars[number_start..index].iter().collect();

                        // A single space after a control word is a delimiter, not content.
                        if index < chars.len() && chars[index] == ' ' {
                            index += 1;
                        }

                        match control_word.as_str() {
                            "par" | "line" => output.push('\n'),
                            "tab" => output.push('\t'),
                            "emdash" => output.push('—'),
                            "endash" => output.push('–'),
                            "bullet" => output.push('•'),
                            "lquote" | "rquote" => output.push('\''),
                            "ldblquote" | "rdblquote" => output.push('"'),
                            "u" => {
                                if let Ok(codepoint) = parameter.parse::<i32>() {
                                    // RTF stores code units above 32767 as negative 16-bit values.
                                    let unit = if codepoint < 0 {
                                        codepoint + 65_536
                                    } else {
                                        codepoint
                                    };
                                    match u32::try_from(unit) {
                                        Ok(unit @ 0xD800..=0xDBFF) => {
                                            pending_high_surrogate = Some(unit);
                                        }
                                        Ok(unit @ 0xDC00..=0xDFFF) => {
                                            if let Some(high) = high_surrogate {
                                                let combined = 0x1_0000
                                                    + ((high - 0xD800) << 10)
                                                    + (unit - 0xDC00);
                                                if let Some(ch) = char::from_u32(combined) {
                                                    output.push(ch);
                                                }
                                            }
                                        }
                                        Ok(unit) => {
                                            if let Some(ch) = char::from_u32(unit) {
                                                output.push(ch);
                                            }
                                        }
                                        Err(_) => {}
                                    }
                                }

                                index += rtf_unicode_fallback_len(&chars, index);
                            }
                            _ => {}
                        }
                    }
                    _ => {
                        index += 1;
                    }
                }
            }
            ch => {
                output.push(ch);
                index += 1;
            }
        }
    }

    output
        .replace(['\r', '\u{0c}'], "\n")
        .lines()
        .map(str::trim_end)
        .collect::<Vec<_>>()
        .join("\n")
}

/// Parses the two hex digits that follow the quote of a `\'hh` escape located at
/// `quote_index`. Returns `None` when either digit is missing or is not a hex digit.
fn rtf_hex_escape_byte(chars: &[char], quote_index: usize) -> Option<u8> {
    let high = chars.get(quote_index + 1)?.to_digit(16)?;
    let low = chars.get(quote_index + 2)?.to_digit(16)?;
    u8::try_from(high * 16 + low).ok()
}

/// Returns how many characters starting at `index` form the fallback representation that
/// follows a `\uN` control word: 4 for a complete `\'hh` escape, 1 for a plain character and
/// 0 when the next token is another control word, a group delimiter, a line break or the end
/// of input.
fn rtf_unicode_fallback_len(chars: &[char], index: usize) -> usize {
    match chars.get(index).copied() {
        None | Some('{' | '}' | '\n' | '\r') => 0,
        Some('\\') => {
            let is_hex_escape = chars.get(index + 1) == Some(&'\'')
                && rtf_hex_escape_byte(chars, index + 1).is_some();
            if is_hex_escape { 4 } else { 0 }
        }
        Some(_) => 1,
    }
}

/// Maps a `\'hh` byte to a character, assuming the Windows-1252 code page.
///
/// NOTE(review): the document's `\ansicpg` declaration is not consulted; Windows-1252 is
/// assumed for every document. Bytes outside 0x80..=0x9F map to the code point of the same
/// value. The curly quotes are rendered as ASCII quotes so that `\'92` and `\rquote` produce
/// the same output. Bytes that Windows-1252 leaves undefined keep their numeric value.
fn rtf_ansi_char(byte: u8) -> char {
    match byte {
        0x80 => '\u{20AC}',
        0x82 => '\u{201A}',
        0x83 => '\u{0192}',
        0x84 => '\u{201E}',
        0x85 => '\u{2026}',
        0x86 => '\u{2020}',
        0x87 => '\u{2021}',
        0x88 => '\u{02C6}',
        0x89 => '\u{2030}',
        0x8A => '\u{0160}',
        0x8B => '\u{2039}',
        0x8C => '\u{0152}',
        0x8E => '\u{017D}',
        0x91 | 0x92 => '\'',
        0x93 | 0x94 => '"',
        0x95 => '\u{2022}',
        0x96 => '\u{2013}',
        0x97 => '\u{2014}',
        0x98 => '\u{02DC}',
        0x99 => '\u{2122}',
        0x9A => '\u{0161}',
        0x9B => '\u{203A}',
        0x9C => '\u{0153}',
        0x9E => '\u{017E}',
        0x9F => '\u{0178}',
        other => char::from(other),
    }
}
1411
/// Reports whether `bytes` begins with the two-byte gzip magic `1f 8b`.
fn looks_like_gzip(bytes: &[u8]) -> bool {
    matches!(bytes, [0x1f, 0x8b, ..])
}
1415
/// Reports whether `bytes` begins with the bzip2 magic `BZh`.
fn looks_like_bzip2(bytes: &[u8]) -> bool {
    matches!(bytes, [b'B', b'Z', b'h', ..])
}
1419
/// Reports whether `bytes` begins with the six-byte XZ magic `fd 37 7a 58 5a 00`.
fn looks_like_xz(bytes: &[u8]) -> bool {
    const XZ_MAGIC: &[u8] = b"\xfd7zXZ\x00";
    bytes.get(..XZ_MAGIC.len()) == Some(XZ_MAGIC)
}
1423
1424fn looks_like_deb(bytes: &[u8], path: &Path) -> bool {
1425 lower_extension(path).as_deref() == Some("deb") && bytes.starts_with(b"!<arch>\n")
1426}
1427
1428fn looks_like_rpm(bytes: &[u8], path: &Path) -> bool {
1429 lower_extension(path).as_deref() == Some("rpm") && bytes.starts_with(&[0xed, 0xab, 0xee, 0xdb])
1430}
1431
1432fn looks_like_squashfs(bytes: &[u8], path: &Path) -> bool {
1433 lower_extension(path)
1434 .as_deref()
1435 .is_some_and(|ext| matches!(ext, "sqs" | "squashfs"))
1436 && (bytes.starts_with(&[0x68, 0x73, 0x71, 0x73])
1437 || bytes.starts_with(&[0x73, 0x71, 0x73, 0x68]))
1438}
1439
1440fn archive_file_type(path: &Path, bytes: &[u8], detected_format: FileFormat) -> String {
1441 if looks_like_deb(bytes, path) {
1442 "debian binary package (format 2.0)".to_string()
1443 } else if looks_like_rpm(bytes, path) {
1444 "RPM package".to_string()
1445 } else if looks_like_squashfs(bytes, path) {
1446 "Squashfs filesystem".to_string()
1447 } else if looks_like_gzip(bytes) {
1448 "gzip compressed data".to_string()
1449 } else if looks_like_bzip2(bytes) {
1450 "bzip2 compressed data".to_string()
1451 } else if looks_like_xz(bytes) {
1452 "XZ compressed data".to_string()
1453 } else if is_zip_archive(bytes) {
1454 "Zip archive data".to_string()
1455 } else if lower_extension(path).as_deref() == Some("gem") {
1456 "POSIX tar archive".to_string()
1457 } else if let Some(file_type) = format_based_file_type(detected_format) {
1458 file_type
1459 } else {
1460 "archive data".to_string()
1461 }
1462}
1463
1464fn script_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1465 let suffix = text_executable_label(bytes);
1466
1467 match programming_language {
1468 Some("Python") => format!("python script, {suffix}"),
1469 Some("Ruby") => format!("ruby script, {suffix}"),
1470 Some("Perl") => format!("perl script, {suffix}"),
1471 Some("PHP") => format!("php script, {suffix}"),
1472 Some("Shell") => format!("shell script, {suffix}"),
1473 Some("Bash") => format!("bash script, {suffix}"),
1474 Some("Zsh") => format!("zsh script, {suffix}"),
1475 Some("Fish") => format!("fish script, {suffix}"),
1476 Some("Ksh") => format!("ksh script, {suffix}"),
1477 Some("JavaScript") => format!("javascript script, {suffix}"),
1478 Some("TypeScript") => format!("typescript script, {suffix}"),
1479 Some("PowerShell") => format!("powershell script, {suffix}"),
1480 Some("Awk") => format!("awk script, {suffix}"),
1481 _ => format!("script, {suffix}"),
1482 }
1483}
1484
1485fn source_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1486 let suffix = text_label(bytes);
1487 match programming_language {
1488 Some("C") => format!("C source, {suffix}"),
1489 Some("C++") => format!("C++ source, {suffix}"),
1490 Some("Java") => format!("Java source, {suffix}"),
1491 Some("C#") => format!("C# source, {suffix}"),
1492 Some("F#") => format!("F# source, {suffix}"),
1493 Some("Go") => format!("Go source, {suffix}"),
1494 Some("Rust") => format!("Rust source, {suffix}"),
1495 Some("Starlark") => format!("Starlark source, {suffix}"),
1496 Some("CMake") => format!("CMake source, {suffix}"),
1497 Some("Meson") => format!("Meson source, {suffix}"),
1498 Some("Nix") => format!("Nix source, {suffix}"),
1499 Some("Groovy") => format!("Groovy source, {suffix}"),
1500 Some("Makefile") => format!("Makefile source, {suffix}"),
1501 Some("Dockerfile") => format!("Dockerfile source, {suffix}"),
1502 Some("Jamfile") => format!("Jamfile source, {suffix}"),
1503 Some("Batchfile") => format!("Batchfile source, {suffix}"),
1504 Some(language) => format!("{language} source, {suffix}"),
1505 None => text_file_type(bytes),
1506 }
1507}
1508
1509fn text_file_type(bytes: &[u8]) -> String {
1510 text_label(bytes).to_string()
1511}
1512
/// Describes text content by two properties: whether it is valid UTF-8 and whether it
/// contains at least one `\n` byte.
fn text_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.iter().any(|&byte| byte == b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text",
        (true, false) => "UTF-8 Unicode text, with no line terminators",
        (false, true) => "text",
        (false, false) => "text, with no line terminators",
    }
}
1526
/// Describes executable text content by two properties: whether it is valid UTF-8 and
/// whether it contains at least one `\n` byte.
fn text_executable_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.iter().any(|&byte| byte == b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text executable",
        (true, false) => "UTF-8 Unicode text executable, with no line terminators",
        (false, true) => "text executable",
        (false, false) => "text executable, with no line terminators",
    }
}
1540
1541fn supported_image_metadata_format(
1542 ext: Option<&str>,
1543 detected_format: FileFormat,
1544) -> Option<ImageFormat> {
1545 match ext {
1546 Some("jpg" | "jpeg") => Some(ImageFormat::Jpeg),
1547 Some("png") => Some(ImageFormat::Png),
1548 Some("tif" | "tiff") => Some(ImageFormat::Tiff),
1549 Some("webp") => Some(ImageFormat::WebP),
1550 _ => match detected_format.media_type() {
1551 "image/jpeg" => Some(ImageFormat::Jpeg),
1552 "image/png" => Some(ImageFormat::Png),
1553 "image/tiff" => Some(ImageFormat::Tiff),
1554 "image/webp" => Some(ImageFormat::WebP),
1555 _ => None,
1556 },
1557 }
1558}
1559
1560fn should_skip_binary_string_extraction(
1561 path: &Path,
1562 bytes: &[u8],
1563 detected_format: FileFormat,
1564) -> bool {
1565 matches!(lower_extension(path).as_deref(), Some("pdf"))
1566 || supported_image_metadata_format(lower_extension(path).as_deref(), detected_format)
1567 .is_some()
1568 || (matches!(
1569 detected_format.kind(),
1570 FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
1571 ) && !is_textual_format(detected_format))
1572 || media_mime_from_content(bytes).is_some()
1573 || is_zip_archive(bytes)
1574 || looks_like_gzip(bytes)
1575 || looks_like_bzip2(bytes)
1576 || looks_like_xz(bytes)
1577 || looks_like_deb(bytes, path)
1578 || looks_like_rpm(bytes, path)
1579 || looks_like_squashfs(bytes, path)
1580}
1581
1582fn should_skip_large_opaque_binary_text_extraction(
1583 _path: &Path,
1584 bytes: &[u8],
1585 detected_format: FileFormat,
1586) -> bool {
1587 is_large_opaque_binary_candidate(bytes, detected_format)
1588 && !sample_has_promising_printable_strings(bytes)
1589}
1590
1591fn is_large_opaque_binary_candidate(bytes: &[u8], detected_format: FileFormat) -> bool {
1592 bytes.len() >= LARGE_OPAQUE_BINARY_SKIP_BYTES
1593 && !is_textual_format(detected_format)
1594 && !matches!(
1595 detected_format.kind(),
1596 FileFormatKind::Archive
1597 | FileFormatKind::Compressed
1598 | FileFormatKind::Package
1599 | FileFormatKind::Audio
1600 | FileFormatKind::Image
1601 | FileFormatKind::Video
1602 )
1603}
1604
/// Picks up to three 64 KiB sample windows (head, middle, tail) over an input of `len` bytes.
///
/// The head window is always considered, the tail window only when the input is longer than
/// one window, and the middle window only when it is longer than two. Empty and duplicate
/// ranges are omitted; ranges are returned as half-open `(start, end)` pairs in that order.
fn sampled_printable_window_ranges(len: usize) -> Vec<(usize, usize)> {
    const SAMPLE_WINDOW_BYTES: usize = 64 * 1024;

    let head = (0, len.min(SAMPLE_WINDOW_BYTES));
    let middle = (len > SAMPLE_WINDOW_BYTES * 2).then(|| {
        let start = len / 2 - SAMPLE_WINDOW_BYTES / 2;
        (start, (start + SAMPLE_WINDOW_BYTES).min(len))
    });
    let tail = (len > SAMPLE_WINDOW_BYTES).then(|| (len - SAMPLE_WINDOW_BYTES, len));

    let mut ranges: Vec<(usize, usize)> = Vec::with_capacity(3);
    for candidate in std::iter::once(head).chain(middle).chain(tail) {
        let (start, end) = candidate;
        if start < end && !ranges.contains(&candidate) {
            ranges.push(candidate);
        }
    }
    ranges
}
1627
1628fn extract_bounded_macho_legal_strings(bytes: &[u8]) -> String {
1629 if !matches!(
1630 FileKind::parse(bytes),
1631 Ok(FileKind::MachO32 | FileKind::MachO64 | FileKind::MachOFat32 | FileKind::MachOFat64)
1632 ) {
1633 return String::new();
1634 }
1635
1636 let mut ranges = Vec::new();
1637 for marker in LARGE_MACHO_LEGAL_MARKERS {
1638 collect_marker_window_ranges(bytes, marker, &mut ranges);
1639 if ranges.len() >= LARGE_MACHO_LEGAL_MAX_WINDOWS {
1640 break;
1641 }
1642 }
1643
1644 if ranges.is_empty() {
1645 return String::new();
1646 }
1647
1648 let mut merged_ranges = merge_overlapping_ranges(ranges);
1649 let mut combined_lines = BTreeSet::new();
1650 let mut extracted_bytes = 0usize;
1651
1652 for (start, end) in merged_ranges.drain(..) {
1653 if extracted_bytes >= LARGE_MACHO_LEGAL_MAX_EXTRACT_BYTES {
1654 break;
1655 }
1656 let remaining = LARGE_MACHO_LEGAL_MAX_EXTRACT_BYTES - extracted_bytes;
1657 let end = start.saturating_add((end - start).min(remaining));
1658 let window_text = extract_printable_strings(&bytes[start..end]);
1659 for line in window_text
1660 .lines()
1661 .map(str::trim)
1662 .filter(|line| !line.is_empty())
1663 {
1664 combined_lines.insert(line.to_string());
1665 }
1666 extracted_bytes += end - start;
1667 }
1668
1669 combined_lines.into_iter().collect::<Vec<_>>().join("\n")
1670}
1671
1672fn collect_marker_window_ranges(bytes: &[u8], marker: &[u8], ranges: &mut Vec<(usize, usize)>) {
1673 if marker.is_empty() || ranges.len() >= LARGE_MACHO_LEGAL_MAX_WINDOWS {
1674 return;
1675 }
1676
1677 let mut search_start = 0usize;
1678 let mut hits_for_marker = 0usize;
1679
1680 while search_start + marker.len() <= bytes.len()
1681 && ranges.len() < LARGE_MACHO_LEGAL_MAX_WINDOWS
1682 && hits_for_marker < LARGE_MACHO_LEGAL_MAX_WINDOWS_PER_MARKER
1683 {
1684 let Some(relative_match) = bytes[search_start..].iter().position(|&b| b == marker[0])
1685 else {
1686 break;
1687 };
1688 let match_start = search_start + relative_match;
1689 let match_end = match_start + marker.len();
1690 if match_end <= bytes.len() && &bytes[match_start..match_end] == marker {
1691 let half_window = LARGE_MACHO_LEGAL_WINDOW_BYTES / 2;
1692 let window_start = match_start.saturating_sub(half_window);
1693 let window_end = (match_end + half_window).min(bytes.len());
1694 ranges.push((window_start, window_end));
1695 hits_for_marker += 1;
1696 search_start = match_end;
1697 } else {
1698 search_start = match_start + 1;
1699 }
1700 }
1701}
1702
/// Sorts half-open `(start, end)` ranges and merges every pair that overlaps or touches.
///
/// The result is ordered by start and contains no two ranges where one begins at or before
/// the end of the previous one.
fn merge_overlapping_ranges(mut ranges: Vec<(usize, usize)>) -> Vec<(usize, usize)> {
    ranges.sort_unstable();

    let mut merged: Vec<(usize, usize)> = Vec::with_capacity(ranges.len());
    for (start, end) in ranges {
        match merged.last_mut() {
            // Starts inside (or right at the end of) the previous range: extend it.
            Some(previous) if start <= previous.1 => previous.1 = previous.1.max(end),
            _ => merged.push((start, end)),
        }
    }
    merged
}
1724
1725fn sample_has_promising_printable_strings(bytes: &[u8]) -> bool {
1726 let mut structured_signal_seen = false;
1727 let promising_license_windows = sampled_printable_window_ranges(bytes.len())
1728 .into_iter()
1729 .filter(|&(start, end)| {
1730 let window = &bytes[start..end];
1731 if has_strong_structured_text_signal(window) {
1732 structured_signal_seen = true;
1733 }
1734 has_license_or_notice_signal(window)
1735 })
1736 .count();
1737
1738 structured_signal_seen || promising_license_windows >= 2
1739}
1740
1741fn extract_sampled_printable_strings(bytes: &[u8]) -> String {
1742 let mut combined_lines = BTreeSet::new();
1743
1744 for (start, end) in sampled_printable_window_ranges(bytes.len()) {
1745 let window_text = extract_printable_strings(&bytes[start..end]);
1746 for line in window_text
1747 .lines()
1748 .map(str::trim)
1749 .filter(|line| !line.is_empty())
1750 {
1751 combined_lines.insert(line.to_string());
1752 }
1753 }
1754
1755 combined_lines.into_iter().collect::<Vec<_>>().join("\n")
1756}
1757
1758fn has_license_or_notice_signal(bytes: &[u8]) -> bool {
1759 let strings = extract_printable_strings(bytes);
1760 if strings.is_empty() {
1761 return false;
1762 }
1763
1764 let lower = strings.to_ascii_lowercase();
1765 [
1766 "copyright",
1767 "license",
1768 "licensed under",
1769 "all rights reserved",
1770 "permission is hereby granted",
1771 "redistribution and use",
1772 "spdx-license-identifier",
1773 ]
1774 .iter()
1775 .any(|marker| lower.contains(marker))
1776}
1777
1778fn has_strong_structured_text_signal(bytes: &[u8]) -> bool {
1779 let strings = extract_printable_strings(bytes);
1780 if strings.is_empty() {
1781 return false;
1782 }
1783
1784 let email_markers = strings.matches('@').count();
1785 let url_markers = strings.matches("http://").count() + strings.matches("https://").count();
1786
1787 email_markers + url_markers >= 3
1788}
1789
1790fn is_supported_image_container(bytes: &[u8], format: ImageFormat) -> bool {
1791 match format {
1792 ImageFormat::Png => bytes.starts_with(b"\x89PNG\r\n\x1a\n"),
1793 ImageFormat::Jpeg => bytes.starts_with(&[0xff, 0xd8, 0xff]),
1794 ImageFormat::Tiff => bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a"),
1795 ImageFormat::WebP => {
1796 bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP"
1797 }
1798 _ => false,
1799 }
1800}
1801
1802fn extract_image_metadata_text(bytes: &[u8], format: ImageFormat) -> String {
1803 let mut values = Vec::new();
1804 values.extend(extract_exif_metadata_values(bytes));
1805 values.extend(extract_xmp_metadata_values(bytes, format));
1806 values_to_text(values)
1807}
1808
1809fn extract_exif_metadata_values(bytes: &[u8]) -> Vec<String> {
1810 let mut cursor = BufReader::new(Cursor::new(bytes));
1811 let exif = match exif::Reader::new().read_from_container(&mut cursor) {
1812 Ok(exif) => exif,
1813 Err(_) => return Vec::new(),
1814 };
1815
1816 let mut values = Vec::new();
1817 for field in exif.fields() {
1818 let rendered = match field.tag {
1819 exif::Tag::ImageDescription => Some(format_metadata_field(
1820 "Description",
1821 &field.display_value().with_unit(&exif).to_string(),
1822 )),
1823 exif::Tag::Copyright => Some(format_metadata_field(
1824 "Copyright",
1825 &field.display_value().with_unit(&exif).to_string(),
1826 )),
1827 exif::Tag::UserComment => Some(format_metadata_field(
1828 "Comment",
1829 &field.display_value().with_unit(&exif).to_string(),
1830 )),
1831 exif::Tag::Artist => Some(format_metadata_field(
1832 "Author",
1833 &field.display_value().with_unit(&exif).to_string(),
1834 )),
1835 _ => None,
1836 };
1837
1838 if let Some(rendered) = rendered {
1839 values.push(rendered);
1840 }
1841 }
1842
1843 values
1844}
1845
1846fn extract_xmp_metadata_values(bytes: &[u8], format: ImageFormat) -> Vec<String> {
1847 let xmp = match extract_raw_xmp_packet(bytes, format) {
1848 Some(xmp) => xmp,
1849 None => return Vec::new(),
1850 };
1851
1852 parse_xmp_values(&xmp)
1853}
1854
1855fn extract_raw_xmp_packet(bytes: &[u8], format: ImageFormat) -> Option<Vec<u8>> {
1856 let reader = ImageReader::with_format(BufReader::new(Cursor::new(bytes)), format);
1857 if let Ok(mut decoder) = reader.into_decoder()
1858 && let Ok(Some(xmp)) = decoder.xmp_metadata()
1859 {
1860 return (xmp.len() <= MAX_XMP_PACKET_BYTES).then_some(xmp);
1861 }
1862
1863 match format {
1864 ImageFormat::Png => extract_png_xmp_packet(bytes),
1865 _ => None,
1866 }
1867}
1868
1869fn extract_png_xmp_packet(bytes: &[u8]) -> Option<Vec<u8>> {
1870 const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";
1871
1872 if bytes.len() < PNG_SIGNATURE.len() || &bytes[..PNG_SIGNATURE.len()] != PNG_SIGNATURE {
1873 return None;
1874 }
1875
1876 let mut offset = PNG_SIGNATURE.len();
1877 while offset + 12 <= bytes.len() {
1878 let length = u32::from_be_bytes([
1879 bytes[offset],
1880 bytes[offset + 1],
1881 bytes[offset + 2],
1882 bytes[offset + 3],
1883 ]) as usize;
1884 let chunk_start = offset + 8;
1885 let chunk_end = chunk_start + length;
1886 if chunk_end + 4 > bytes.len() {
1887 return None;
1888 }
1889
1890 let chunk_type = &bytes[offset + 4..offset + 8];
1891 if chunk_type == b"iTXt" {
1892 let data = &bytes[chunk_start..chunk_end];
1893 if let Some(xmp) = parse_png_itxt_xmp(data) {
1894 return Some(xmp);
1895 }
1896 }
1897
1898 offset = chunk_end + 4;
1899 }
1900
1901 None
1902}
1903
1904fn parse_png_itxt_xmp(data: &[u8]) -> Option<Vec<u8>> {
1905 const XMP_KEYWORD: &[u8] = b"XML:com.adobe.xmp";
1906
1907 let keyword_end = data.iter().position(|&b| b == 0)?;
1908 if &data[..keyword_end] != XMP_KEYWORD {
1909 return None;
1910 }
1911
1912 let mut cursor = keyword_end + 1;
1913 let compression_flag = *data.get(cursor)?;
1914 cursor += 1;
1915 let compression_method = *data.get(cursor)?;
1916 cursor += 1;
1917 if compression_flag > 1 || (compression_flag == 1 && compression_method != 0) {
1918 return None;
1919 }
1920
1921 let language_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1922 cursor = language_end + 1;
1923
1924 let translated_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1925 cursor = translated_end + 1;
1926
1927 let text_bytes = &data[cursor..];
1928 if compression_flag == 1 {
1929 let decoder = ZlibDecoder::new(text_bytes);
1930 let mut decoded = Vec::new();
1931 decoder
1932 .take((MAX_XMP_PACKET_BYTES + 1) as u64)
1933 .read_to_end(&mut decoded)
1934 .ok()?;
1935 (decoded.len() <= MAX_XMP_PACKET_BYTES).then_some(decoded)
1936 } else {
1937 (text_bytes.len() <= MAX_XMP_PACKET_BYTES).then(|| text_bytes.to_vec())
1938 }
1939}
1940
1941fn parse_xmp_values(xmp: &[u8]) -> Vec<String> {
1942 let mut reader = XmlReader::from_reader(xmp);
1943 reader.config_mut().trim_text(true);
1944
1945 let mut buf = Vec::new();
1946 let mut stack: Vec<String> = Vec::new();
1947 let mut values = Vec::new();
1948
1949 loop {
1950 match reader.read_event_into(&mut buf) {
1951 Ok(Event::Start(e)) => {
1952 stack.push(local_xml_name(e.name().as_ref()));
1953 }
1954 Ok(Event::End(_)) => {
1955 stack.pop();
1956 }
1957 Ok(Event::Empty(_)) => {}
1958 Ok(Event::Text(text)) => {
1959 if let Some(field) = stack
1960 .iter()
1961 .rev()
1962 .find_map(|name| allowed_xmp_field(name.as_str()))
1963 && let Ok(decoded) = text.decode()
1964 {
1965 let decoded = decoded.into_owned();
1966 if !decoded.trim().is_empty() {
1967 values.push(format_xmp_value(field, &decoded));
1968 }
1969 }
1970 }
1971 Ok(Event::CData(text)) => {
1972 if let Some(field) = stack
1973 .iter()
1974 .rev()
1975 .find_map(|name| allowed_xmp_field(name.as_str()))
1976 && let Ok(decoded) = text.decode()
1977 {
1978 let decoded = decoded.into_owned();
1979 if !decoded.trim().is_empty() {
1980 values.push(format_xmp_value(field, &decoded));
1981 }
1982 }
1983 }
1984 Ok(Event::Eof) | Err(_) => break,
1985 _ => {}
1986 }
1987 buf.clear();
1988 }
1989
1990 values
1991}
1992
/// Returns the part of a qualified XML name after its last `:`, or the whole name when it has
/// no prefix. Names that are not valid UTF-8 yield an empty string.
fn local_xml_name(name: &[u8]) -> String {
    let qualified = std::str::from_utf8(name).unwrap_or_default();

    let local = match qualified.rfind(':') {
        Some(colon) => &qualified[colon + 1..],
        None => qualified,
    };
    local.to_string()
}
1997
/// Maps an XMP element local name to its internal field key, or `None` when the element is
/// not on the allow-list. Matching is exact and case-sensitive.
fn allowed_xmp_field(name: &str) -> Option<&'static str> {
    const ALLOWED_FIELDS: [(&str, &str); 7] = [
        ("creator", "creator"),
        ("rights", "rights"),
        ("description", "description"),
        ("title", "title"),
        ("subject", "subject"),
        ("UsageTerms", "usage_terms"),
        ("WebStatement", "web_statement"),
    ];

    ALLOWED_FIELDS
        .iter()
        .find(|(element, _)| *element == name)
        .map(|&(_, field)| field)
}
2010
2011fn format_xmp_value(field: &str, value: &str) -> String {
2012 match field {
2013 "creator" => format_metadata_field("Author", value),
2014 "rights" => format_metadata_field("Copyright", value),
2015 "description" => format_metadata_field("Description", value),
2016 "title" => format_metadata_field("Title", value),
2017 "subject" => format_metadata_field("Subject", value),
2018 "usage_terms" => format_metadata_field("UsageTerms", value),
2019 "web_statement" => format_metadata_field("WebStatement", value),
2020 _ => value.to_string(),
2021 }
2022}
2023
/// Joins a label and a value as `"<label>: <value>"`.
fn format_metadata_field(label: &str, value: &str) -> String {
    [label, ": ", value].concat()
}
2027
2028fn values_to_text(values: Vec<String>) -> String {
2029 let mut seen = BTreeSet::new();
2030 let mut normalized_lines = Vec::new();
2031
2032 for value in values {
2033 let normalized = normalize_metadata_value(&value);
2034 if normalized.is_empty() || !seen.insert(normalized.clone()) {
2035 continue;
2036 }
2037
2038 normalized_lines.push(normalized);
2039 }
2040
2041 let author_values: BTreeSet<String> = normalized_lines
2042 .iter()
2043 .filter_map(|line| split_metadata_field(line))
2044 .filter(|(label, _)| label.eq_ignore_ascii_case("Author"))
2045 .map(|(_, value)| value.to_string())
2046 .collect();
2047
2048 let mut lines = Vec::new();
2049 let mut total_bytes = 0usize;
2050
2051 for normalized in normalized_lines {
2052 if lines.len() >= MAX_IMAGE_METADATA_VALUES {
2053 break;
2054 }
2055
2056 if should_suppress_bare_copyright_metadata_line(&normalized, &author_values) {
2057 continue;
2058 }
2059
2060 let added_bytes = normalized.len() + usize::from(!lines.is_empty());
2061 if total_bytes + added_bytes > MAX_IMAGE_METADATA_TEXT_BYTES {
2062 break;
2063 }
2064
2065 total_bytes += added_bytes;
2066 lines.push(normalized);
2067 }
2068
2069 lines.join("\n")
2070}
2071
/// Splits a `"label: value"` line at its first colon and trims both halves. Returns `None`
/// when the line contains no colon.
fn split_metadata_field(line: &str) -> Option<(&str, &str)> {
    let colon = line.find(':')?;
    let label = line[..colon].trim();
    let value = line[colon + 1..].trim();
    Some((label, value))
}
2076
2077fn should_suppress_bare_copyright_metadata_line(
2078 line: &str,
2079 author_values: &BTreeSet<String>,
2080) -> bool {
2081 let Some((label, value)) = split_metadata_field(line) else {
2082 return false;
2083 };
2084 if !label.eq_ignore_ascii_case("Copyright")
2085 || value.is_empty()
2086 || !author_values.contains(value)
2087 {
2088 return false;
2089 }
2090
2091 let lower = value.to_ascii_lowercase();
2092 !lower.contains("copyright")
2093 && !lower.contains("(c)")
2094 && !lower.contains('©')
2095 && !lower.contains("all rights")
2096 && !value.chars().any(|ch| ch.is_ascii_digit())
2097}
2098
/// Removes NUL characters from `value`, then collapses every run of whitespace into a single
/// space and drops leading and trailing whitespace.
fn normalize_metadata_value(value: &str) -> String {
    let without_nuls = value.replace('\0', "");

    let mut normalized = String::with_capacity(without_nuls.len());
    for word in without_nuls.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
2110
2111fn extract_pdf_text(path: &Path, bytes: &[u8]) -> (String, Option<String>) {
2112 if bytes.len() < 5 || &bytes[..5] != b"%PDF-" {
2113 return (String::new(), None);
2114 }
2115
2116 if bytes.len() > MAX_PDF_TEXT_EXTRACTION_BYTES {
2117 return (
2118 String::new(),
2119 Some(format!(
2120 "PDF text extraction skipped because file exceeds {} bytes",
2121 MAX_PDF_TEXT_EXTRACTION_BYTES
2122 )),
2123 );
2124 }
2125
2126 let mut failures = Vec::new();
2127 let mut saw_success = false;
2128
2129 let extracted = catch_unwind(AssertUnwindSafe(
2130 || -> Result<String, Box<dyn std::error::Error>> {
2131 let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
2132 extract_first_pdf_page_text(&mut document)
2133 },
2134 ));
2135 match extracted {
2136 Ok(Ok(text)) => {
2137 saw_success = true;
2138 if let Some(normalized) = normalize_pdf_text(text) {
2139 return (normalized, None);
2140 }
2141 }
2142 Ok(Err(err)) => failures.push(format!("from-bytes first-page: {err}")),
2143 Err(payload) => failures.push(format!(
2144 "from-bytes first-page panic: {}",
2145 panic_payload_to_string(payload.as_ref())
2146 )),
2147 }
2148
2149 let extracted = catch_unwind(AssertUnwindSafe(
2150 || -> Result<String, Box<dyn std::error::Error>> {
2151 let mut document = pdf_oxide::document::PdfDocument::open(path)?;
2152 extract_pdf_text_from_document(&mut document)
2153 },
2154 ));
2155 match extracted {
2156 Ok(Ok(text)) => {
2157 saw_success = true;
2158 if let Some(normalized) = normalize_pdf_text(text) {
2159 return (normalized, None);
2160 }
2161 }
2162 Ok(Err(err)) => failures.push(format!("open full-document: {err}")),
2163 Err(payload) => failures.push(format!(
2164 "open full-document panic: {}",
2165 panic_payload_to_string(payload.as_ref())
2166 )),
2167 }
2168
2169 let extracted = catch_unwind(AssertUnwindSafe(
2170 || -> Result<String, Box<dyn std::error::Error>> {
2171 let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
2172 extract_pdf_text_from_document(&mut document)
2173 },
2174 ));
2175 match extracted {
2176 Ok(Ok(text)) => {
2177 saw_success = true;
2178 if let Some(normalized) = normalize_pdf_text(text) {
2179 return (normalized, None);
2180 }
2181 }
2182 Ok(Err(err)) => failures.push(format!("from-bytes full-document: {err}")),
2183 Err(payload) => failures.push(format!(
2184 "from-bytes full-document panic: {}",
2185 panic_payload_to_string(payload.as_ref())
2186 )),
2187 }
2188
2189 if saw_success || is_non_actionable_pdf_failure(&failures) {
2190 (String::new(), None)
2191 } else {
2192 (
2193 String::new(),
2194 Some(format!(
2195 "PDF text extraction failed after {} attempts: {}",
2196 failures.len(),
2197 failures.join("; ")
2198 )),
2199 )
2200 }
2201}
2202
/// Reports whether `failures` is non-empty and every entry contains one of the failure
/// messages treated as non-actionable (listed below).
fn is_non_actionable_pdf_failure(failures: &[String]) -> bool {
    const NON_ACTIONABLE_MARKERS: [&str; 5] = [
        "requires a password",
        "Encrypt dictionary missing /O",
        "Encrypt dictionary missing /U",
        "security handler cannot be found",
        "Invalid cross-reference table",
    ];

    if failures.is_empty() {
        return false;
    }

    failures.iter().all(|failure| {
        NON_ACTIONABLE_MARKERS
            .iter()
            .any(|marker| failure.contains(*marker))
    })
}
2213
/// Renders a panic payload as text: `&str` and `String` payloads are returned as-is, anything
/// else becomes `unknown panic payload`.
fn panic_payload_to_string(payload: &(dyn std::any::Any + Send)) -> String {
    payload
        .downcast_ref::<&str>()
        .map(|message| (*message).to_string())
        .or_else(|| payload.downcast_ref::<String>().cloned())
        .unwrap_or_else(|| "unknown panic payload".to_string())
}
2223
2224fn extract_first_pdf_page_text(
2225 document: &mut pdf_oxide::document::PdfDocument,
2226) -> Result<String, Box<dyn std::error::Error>> {
2227 if document.page_count()? == 0 {
2228 return Ok(String::new());
2229 }
2230
2231 let extracted_text = document.extract_text(0)?;
2232 let markdown_text =
2233 document.to_markdown(0, &pdf_oxide::converters::ConversionOptions::default())?;
2234 if pdf_markdown_heading_lines(&markdown_text).is_empty() {
2235 return Ok(extracted_text);
2236 }
2237
2238 let pipeline_text =
2239 document.to_plain_text(0, &pdf_oxide::converters::ConversionOptions::default())?;
2240
2241 Ok(merge_pdf_first_page_text(
2242 &extracted_text,
2243 &markdown_text,
2244 &pipeline_text,
2245 ))
2246}
2247
2248fn extract_pdf_text_from_document(
2249 document: &mut pdf_oxide::document::PdfDocument,
2250) -> Result<String, Box<dyn std::error::Error>> {
2251 Ok(document.to_plain_text_all(&pdf_oxide::converters::ConversionOptions::default())?)
2252}
2253
/// Replaces every carriage return and form feed in `text` with a newline.
///
/// Returns `None` when the result contains only whitespace; otherwise returns
/// the converted text (not trimmed).
fn normalize_pdf_text(text: String) -> Option<String> {
    let normalized: String = text
        .chars()
        .map(|ch| match ch {
            '\r' | '\u{0c}' => '\n',
            other => other,
        })
        .collect();

    if normalized.trim().is_empty() {
        None
    } else {
        Some(normalized)
    }
}
2258
2259fn merge_pdf_first_page_text(
2260 _extracted_text: &str,
2261 markdown_text: &str,
2262 pipeline_text: &str,
2263) -> String {
2264 let pipeline = pipeline_text.trim();
2265 if pipeline.is_empty() {
2266 return String::new();
2267 }
2268
2269 let prefix = pdf_first_page_heading_prefix(markdown_text);
2270 let Some(prefix) = prefix else {
2271 return pipeline_text.to_string();
2272 };
2273
2274 if pdf_text_contains_heading_prefix(pipeline, &prefix) {
2275 pipeline_text.to_string()
2276 } else {
2277 format!("{prefix}\n\n{pipeline}")
2278 }
2279}
2280
2281fn pdf_text_contains_heading_prefix(text: &str, prefix: &str) -> bool {
2282 normalize_pdf_heading_comparison_text(text)
2283 .contains(&normalize_pdf_heading_comparison_text(prefix))
2284}
2285
/// Produces a comparison key for heading text: whitespace-separated words are
/// joined with single spaces and ASCII letters are lowercased. Leading and
/// trailing whitespace is dropped; non-ASCII characters are left untouched.
fn normalize_pdf_heading_comparison_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());

    for word in text.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }

    normalized.make_ascii_lowercase();
    normalized
}
2292
2293fn pdf_first_page_heading_prefix(markdown_text: &str) -> Option<String> {
2294 let mut lines = Vec::new();
2295
2296 for line in pdf_markdown_heading_lines(markdown_text) {
2297 push_unique_line(&mut lines, line);
2298 }
2299
2300 (!lines.is_empty()).then(|| lines.join("\n"))
2301}
2302
2303fn pdf_markdown_heading_lines(text: &str) -> Vec<String> {
2304 text.lines()
2305 .map(str::trim)
2306 .filter_map(|line| line.strip_prefix('#').map(str::trim_start))
2307 .map(|line| line.trim_matches('#').trim())
2308 .filter(|line| !line.is_empty())
2309 .filter(|line| !looks_like_numbered_section_heading(line))
2310 .take(4)
2311 .map(ToOwned::to_owned)
2312 .collect()
2313}
2314
/// Appends `line` to `lines` unless an equal entry is already present.
fn push_unique_line(lines: &mut Vec<String>, line: String) {
    if lines.contains(&line) {
        return;
    }
    lines.push(line);
}
2320
/// Returns `true` when `line` begins with a section number: one or more ASCII
/// digits immediately followed by `.` (for example `1. Scope`, `10. Notices`,
/// or `3.1 Grant`).
///
/// The whole leading digit run is consumed before the `.` is required, so
/// section numbers with two or more digits are recognised as well as
/// single-digit ones. Lines that do not start with a digit, or whose digits
/// are not followed by `.`, return `false`.
fn looks_like_numbered_section_heading(line: &str) -> bool {
    let after_digits = line.trim_start_matches(|ch: char| ch.is_ascii_digit());

    // At least one digit must have been consumed.
    let has_leading_digits = after_digits.len() < line.len();

    has_leading_digits && after_digits.starts_with('.')
}
2333
/// Returns `true` when `bytes` begins with one of three ZIP record signatures:
/// `PK\x03\x04`, `PK\x05\x06`, or `PK\x07\x08`.
fn is_zip_archive(bytes: &[u8]) -> bool {
    // In the ZIP format these mark a local file header, an end-of-central-
    // directory record, and a data descriptor / spanning marker respectively.
    const ZIP_SIGNATURES: [&[u8]; 3] = [b"PK\x03\x04", b"PK\x05\x06", b"PK\x07\x08"];

    ZIP_SIGNATURES
        .iter()
        .any(|signature| bytes.starts_with(signature))
}
2339
/// Extracts runs of printable ASCII (bytes `0x20..=0x7E`) from `bytes`, one
/// run per output line.
///
/// Three passes are made, each appending to the same output:
///
/// 1. Plain byte runs of at least four printable characters.
/// 2. Two-byte pairs starting at offset 0, keeping pairs of the form
///    `(printable, 0x00)`.
/// 3. Two-byte pairs starting at offset 1, keeping pairs of the form
///    `(0x00, printable)`.
///
/// Runs shorter than four characters are dropped. Because pass 3 pairs each
/// zero byte with the byte that follows it, a wide string found by pass 2 can
/// show up again from pass 3 without its first character.
///
/// Output size is limited to the input length clamped to the range
/// 2,000,000..=16,000,000 bytes. The limit is checked only after a run has
/// been flushed, so the final run may push the output past it.
pub fn extract_printable_strings(bytes: &[u8]) -> String {
    const MIN_LEN: usize = 4;
    const MIN_OUTPUT_BYTES: usize = 2_000_000;
    const MAX_OUTPUT_BYTES_CAP: usize = 16_000_000;

    fn is_printable_ascii(b: u8) -> bool {
        (0x20..=0x7E).contains(&b)
    }

    /// Appends `run` as its own line when it meets the minimum length.
    fn append_run(out: &mut String, run: &[u8]) {
        if run.len() < MIN_LEN {
            return;
        }
        if !out.is_empty() {
            out.push('\n');
        }
        out.push_str(&String::from_utf8_lossy(run));
    }

    let limit = bytes.len().clamp(MIN_OUTPUT_BYTES, MAX_OUTPUT_BYTES_CAP);
    let mut out = String::new();

    // Pass 1: every segment between non-printable bytes is a candidate run.
    // The limit is checked after each segment, including the last one.
    for segment in bytes.split(|&b| !is_printable_ascii(b)) {
        append_run(&mut out, segment);
        if out.len() >= limit {
            return out;
        }
    }

    // Passes 2 and 3: two-byte pairs at even, then odd, alignment.
    let mut wide_run: Vec<u8> = Vec::new();
    for start in 0..=1_usize {
        wide_run.clear();

        // `get` avoids a panic when the input is shorter than `start`.
        let tail = bytes.get(start..).unwrap_or_default();
        for pair in tail.chunks_exact(2) {
            let (ch, zero) = if start == 0 {
                (pair[0], pair[1])
            } else {
                (pair[1], pair[0])
            };

            if zero == 0 && is_printable_ascii(ch) {
                wide_run.push(ch);
                continue;
            }

            append_run(&mut out, &wide_run);
            wide_run.clear();
            if out.len() >= limit {
                return out;
            }
        }

        append_run(&mut out, &wide_run);
        wide_run.clear();
        if out.len() >= limit {
            return out;
        }
    }

    out
}
2404
2405#[cfg(test)]
2406mod tests {
2407 use image::ImageFormat;
2408 use std::path::Path;
2409
2410 use crate::copyright::detect_copyrights;
2411
2412 use super::{
2413 ExtractedTextKind, LARGE_OPAQUE_BINARY_SKIP_BYTES, MAX_PDF_TEXT_EXTRACTION_BYTES,
2414 MAX_XMP_PACKET_BYTES, classify_file_info, extract_printable_strings,
2415 extract_raw_xmp_packet, extract_text_for_detection,
2416 extract_text_for_detection_with_diagnostics, format_metadata_field, format_xmp_value,
2417 is_non_actionable_pdf_failure, normalize_mime_type, normalize_pdf_heading_comparison_text,
2418 values_to_text, windows_metadata_or_empty_result,
2419 };
2420
2421 fn png_chunk(chunk_type: &[u8; 4], data: &[u8]) -> Vec<u8> {
2422 let mut out = Vec::new();
2423 out.extend_from_slice(&(data.len() as u32).to_be_bytes());
2424 out.extend_from_slice(chunk_type);
2425 out.extend_from_slice(data);
2426 out.extend_from_slice(&0u32.to_be_bytes());
2427 out
2428 }
2429
2430 fn build_png_with_xmp(xmp: &str) -> Vec<u8> {
2431 let mut bytes = Vec::new();
2432 bytes.extend_from_slice(b"\x89PNG\r\n\x1a\n");
2433
2434 let ihdr = [
2435 0, 0, 0, 1, 0, 0, 0, 1, 8, 2, 0, 0, 0, ];
2443 bytes.extend_from_slice(&png_chunk(b"IHDR", &ihdr));
2444
2445 let mut itxt = Vec::new();
2446 itxt.extend_from_slice(b"XML:com.adobe.xmp");
2447 itxt.push(0); itxt.push(0); itxt.push(0); itxt.push(0); itxt.push(0); itxt.extend_from_slice(xmp.as_bytes());
2453 bytes.extend_from_slice(&png_chunk(b"iTXt", &itxt));
2454
2455 bytes.extend_from_slice(&png_chunk(b"IEND", &[]));
2456 bytes
2457 }
2458
    /// A `.jar` fixture must yield no text and `ExtractedTextKind::None`.
    #[test]
    fn test_extract_text_for_detection_skips_jar_archives() {
        let path = Path::new(
            "testdata/license-golden/datadriven/lic1/do-not_detect-licenses-in-archive.jar",
        );
        let bytes = std::fs::read(path).expect("failed to read jar fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert!(text.is_empty());
        assert_eq!(kind, ExtractedTextKind::None);
    }
2471
    /// A PDF fixture must be reported as `Pdf` and contain the expected BSD
    /// license phrase.
    #[test]
    fn test_extract_text_for_detection_reads_pdf_fixture_text() {
        let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
        let bytes = std::fs::read(path).expect("failed to read pdf fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_eq!(kind, ExtractedTextKind::Pdf);
        assert!(text.contains("Redistribution and use in source and binary forms"));
    }
2482
    /// The extracted text must include the license title but not the
    /// "DISCLAIMER OF WARRANTY" text (presumably located on a later page of
    /// the fixture).
    #[test]
    fn test_extract_text_for_detection_prefers_first_pdf_page_before_full_document() {
        let path =
            Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
        let bytes = std::fs::read(path).expect("failed to read pdf fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_eq!(kind, ExtractedTextKind::Pdf);
        assert!(text.contains("SUN INDUSTRY STANDARDS SOURCE LICENSE"));
        assert!(!text.contains("DISCLAIMER OF WARRANTY"));
    }
2495
    /// After normalization, the license title must occur exactly once in the
    /// extracted text.
    #[test]
    fn test_extract_text_for_detection_does_not_duplicate_pdf_heading_prefix() {
        let path =
            Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
        let bytes = std::fs::read(path).expect("failed to read pdf fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_eq!(kind, ExtractedTextKind::Pdf);

        let normalized = normalize_pdf_heading_comparison_text(&text);
        let heading =
            normalize_pdf_heading_comparison_text("SUN INDUSTRY STANDARDS SOURCE LICENSE");
        assert_eq!(normalized.matches(&heading).count(), 1);
    }
2511
    /// PDF bytes passed under a non-PDF file name (`renamed.bin`) must still
    /// be extracted as `Pdf`.
    #[test]
    fn test_extract_text_for_detection_reads_pdf_fixture_without_pdf_extension() {
        let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
        let bytes = std::fs::read(path).expect("failed to read pdf fixture");

        let (text, kind) = extract_text_for_detection(Path::new("renamed.bin"), &bytes);

        assert_eq!(kind, ExtractedTextKind::Pdf);
        assert!(text.contains("Redistribution and use in source and binary forms"));
    }
2522
    /// A payload one byte over `MAX_PDF_TEXT_EXTRACTION_BYTES` that starts
    /// with a PDF header must yield no text and a "skipped" scan error.
    #[test]
    fn test_extract_text_for_detection_skips_oversized_pdf_payload() {
        let mut bytes = b"%PDF-1.7\n".to_vec();
        bytes.resize(MAX_PDF_TEXT_EXTRACTION_BYTES + 1, b'0');

        let (text, kind, scan_error) =
            extract_text_for_detection_with_diagnostics(Path::new("oversized.pdf"), &bytes);

        assert!(text.is_empty());
        assert_eq!(kind, ExtractedTextKind::None);
        assert!(
            scan_error
                .as_deref()
                .is_some_and(|message| message.contains("PDF text extraction skipped"))
        );
    }
2539
    /// A malformed PDF must yield no text and surface a scan error that names
    /// the failed extraction attempts.
    #[test]
    fn test_extract_text_for_detection_reports_terminal_pdf_failure() {
        let malformed = b"%PDF-1.7\nthis is not a valid pdf object graph\n";

        let (text, kind, scan_error) =
            extract_text_for_detection_with_diagnostics(Path::new("broken.pdf"), malformed);

        assert!(text.is_empty());
        assert_eq!(kind, ExtractedTextKind::None);
        let scan_error = scan_error.expect("terminal pdf failure should be surfaced");
        assert!(scan_error.contains("PDF text extraction failed after"));
    }
2552
    /// An all-zero blob larger than `LARGE_OPAQUE_BINARY_SKIP_BYTES` must
    /// yield no text.
    #[test]
    fn test_extract_text_for_detection_skips_large_opaque_binary_blobs() {
        let bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];

        let (text, kind) = extract_text_for_detection(Path::new("model.bin"), &bytes);

        assert!(text.is_empty());
        assert_eq!(kind, ExtractedTextKind::None);
    }
2562
    /// A large zero-filled blob with the same copyright notice at offset 0
    /// and at the midpoint of the skip threshold must still produce text.
    #[test]
    fn test_extract_text_for_detection_keeps_large_binaries_with_promising_strings() {
        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
        let text = b"Copyright 2026 Example Project!!!";
        bytes[..text.len()].copy_from_slice(text);
        let second_offset = LARGE_OPAQUE_BINARY_SKIP_BYTES / 2;
        bytes[second_offset..second_offset + text.len()].copy_from_slice(text);

        let (text, kind) = extract_text_for_detection(Path::new("weights.bin"), &bytes);

        assert_ne!(kind, ExtractedTextKind::None);
        assert!(text.contains("Copyright 2026 Example Project"));
    }
2576
    /// A large blob containing two copies of a symbol-heavy run starting with
    /// `(c)` must still be skipped.
    #[test]
    fn test_extract_text_for_detection_skips_large_binary_with_unstructured_runs() {
        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
        let noise = b"(c) $1234567890ABCDEF[]{}--==++";
        bytes[..noise.len()].copy_from_slice(noise);
        let second_offset = LARGE_OPAQUE_BINARY_SKIP_BYTES / 2;
        bytes[second_offset..second_offset + noise.len()].copy_from_slice(noise);

        let (text, kind) = extract_text_for_detection(Path::new("tensor.bin"), &bytes);

        assert!(text.is_empty());
        assert_eq!(kind, ExtractedTextKind::None);
    }
2590
    /// A PE fixture must be reported as `BinaryStrings`, with text that
    /// includes both license wording and a `LegalCopyright:` field.
    #[test]
    fn test_extract_text_for_detection_uses_windows_executable_metadata() {
        let path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
        let bytes = std::fs::read(path).expect("read PE fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_eq!(kind, ExtractedTextKind::BinaryStrings);
        assert!(text.contains("License: This program is free software"));
        assert!(text.contains("LegalCopyright:"));
    }
2602
    /// A PE fixture resized with zero bytes to just past the large-binary skip
    /// threshold must still produce non-empty text.
    #[test]
    fn test_extract_text_for_detection_keeps_windows_metadata_for_large_pe_without_sampled_signal()
    {
        let path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
        let mut bytes = std::fs::read(path).expect("read PE fixture");
        bytes.resize(LARGE_OPAQUE_BINARY_SKIP_BYTES + 8, 0);

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_ne!(kind, ExtractedTextKind::None);
        assert!(!text.trim().is_empty());
    }
2615
    /// Supplied metadata text must be returned unchanged, tagged as
    /// `WindowsExecutableMetadata`, with no scan error.
    #[test]
    fn test_windows_metadata_or_empty_result_preserves_metadata() {
        let (text, kind, scan_error) =
            windows_metadata_or_empty_result(Some("LegalCopyright: Example Corp".to_string()));

        assert_eq!(kind, ExtractedTextKind::WindowsExecutableMetadata);
        assert_eq!(text, "LegalCopyright: Example Corp");
        assert!(scan_error.is_none());
    }
2625
    /// XMP `creator`, `title`, and `description` values must be labelled
    /// `Author:`, `Title:`, and `Description:` respectively.
    #[test]
    fn test_format_xmp_value_labels_creator_and_title_fields() {
        assert_eq!(
            format_xmp_value("creator", "Chinmay Garde"),
            "Author: Chinmay Garde"
        );
        assert_eq!(
            format_xmp_value("title", "Bay Bridge At Night"),
            "Title: Bay Bridge At Night"
        );
        assert_eq!(
            format_xmp_value("description", "Embarcadero in the evening on Delta 3200"),
            "Description: Embarcadero in the evening on Delta 3200"
        );
    }
2641
    /// `format_metadata_field` must render `label: value`.
    #[test]
    fn test_format_metadata_field_prefixes_exif_text() {
        assert_eq!(
            format_metadata_field("Author", "Chinmay Garde"),
            "Author: Chinmay Garde"
        );
        assert_eq!(
            format_metadata_field("Description", "Bay Bridge At Night"),
            "Description: Bay Bridge At Night"
        );
    }
2653
    /// XMP creator, title, and description embedded in a PNG must come out as
    /// separate labelled fields, and author detection on that text must
    /// report only the creator.
    #[test]
    fn test_extract_text_for_detection_keeps_image_author_separate_from_title_and_description() {
        let xmp = r#"<x:xmpmeta xmlns:x="adobe:ns:meta/"><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:creator>Chinmay Garde</dc:creator><dc:title>Bay Bridge At Night</dc:title><dc:description>Embarcadero in the evening on Delta 3200</dc:description></rdf:Description></rdf:RDF></x:xmpmeta>"#;
        let bytes = build_png_with_xmp(xmp);

        let (text, kind) = extract_text_for_detection(Path::new("fixture.png"), &bytes);

        assert_eq!(kind, ExtractedTextKind::ImageMetadata);
        assert!(text.contains("Author: Chinmay Garde"), "text: {text:?}");
        assert!(
            text.contains("Title: Bay Bridge At Night"),
            "text: {text:?}"
        );
        assert!(
            text.contains("Description: Embarcadero in the evening on Delta 3200"),
            "text: {text:?}"
        );

        // Only the creator may be detected as an author.
        let (_copyrights, _holders, authors) = detect_copyrights(&text, None);
        assert_eq!(
            authors
                .iter()
                .map(|a| a.author.as_str())
                .collect::<Vec<_>>(),
            vec!["Chinmay Garde"],
            "authors: {authors:?}; text: {text:?}"
        );
    }
2682
    /// A `Copyright:` value that merely repeats the `Author:` value must be
    /// dropped; the other fields are kept.
    #[test]
    fn test_values_to_text_suppresses_bare_copyright_duplicate_of_author() {
        let text = values_to_text(vec![
            "Author: Chinmay Garde".to_string(),
            "Copyright: Chinmay Garde".to_string(),
            "Title: Bay Bridge At Night".to_string(),
        ]);

        assert!(text.contains("Author: Chinmay Garde"), "text: {text:?}");
        assert!(
            text.contains("Title: Bay Bridge At Night"),
            "text: {text:?}"
        );
        assert!(!text.contains("Copyright: Chinmay Garde"), "text: {text:?}");
    }
2698
    /// A large zero-filled blob with just one copyright-like run at offset 0
    /// must still be skipped.
    #[test]
    fn test_extract_text_for_detection_skips_large_binary_with_single_isolated_string_run() {
        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
        let text = b"Copyright 2026 Example Project!!!";
        bytes[..text.len()].copy_from_slice(text);

        let (text, kind) = extract_text_for_detection(Path::new("opaque.bin"), &bytes);

        assert!(text.is_empty());
        assert_eq!(kind, ExtractedTextKind::None);
    }
2710
    /// A large blob with a single run dense in names, e-mail addresses, and
    /// URLs must be kept, and the contacts must appear in the text.
    #[test]
    fn test_extract_text_for_detection_keeps_large_binary_with_single_contact_rich_window() {
        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
        let text = b"Andreas Schneider <asn@redhat.com> Rob Crittenden (rcritten@redhat.com) Mr. Sam <sam@email-scan.com> https://publicsuffix.org/ http://tukaani.org/xz/";
        bytes[..text.len()].copy_from_slice(text);

        let (text, kind) = extract_text_for_detection(Path::new("rootfs.bin"), &bytes);

        assert_ne!(kind, ExtractedTextKind::None);
        assert!(text.contains("asn@redhat.com"));
        assert!(text.contains("https://publicsuffix.org/"));
    }
2723
    /// A large blob starting with Mach-O magic bytes and holding an Apache
    /// notice at the 200 KiB offset must yield `BinaryStrings` text that
    /// contains the notice.
    #[test]
    fn test_extract_text_for_detection_keeps_large_macho_with_off_window_legal_markers() {
        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES * 2];
        bytes[..4].copy_from_slice(&[0xCF, 0xFA, 0xED, 0xFE]);
        let apache_notice = b"// Licensed under the Apache License, Version 2.0 (the \"License\");\n// http://www.apache.org/licenses/LICENSE-2.0\n// SPDX-License-Identifier: Apache-2.0\n";
        let insert_offset = 200 * 1024;
        bytes[insert_offset..insert_offset + apache_notice.len()].copy_from_slice(apache_notice);

        let (text, kind) = extract_text_for_detection(Path::new("node"), &bytes);

        assert_eq!(kind, ExtractedTextKind::BinaryStrings);
        assert!(text.contains("Apache License, Version 2.0"), "{text}");
        assert!(
            text.contains("SPDX-License-Identifier: Apache-2.0"),
            "{text}"
        );
    }
2741
    /// A large blob starting with Mach-O magic bytes and holding a Unicode,
    /// Inc. notice at the 700 KiB offset must yield `BinaryStrings` text that
    /// contains the notice.
    #[test]
    fn test_extract_text_for_detection_keeps_large_macho_with_unicode_notice_markers() {
        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES * 2];
        bytes[..4].copy_from_slice(&[0xCF, 0xFA, 0xED, 0xFE]);
        let unicode_notice = b"Copyright (c) 1991-2024 Unicode, Inc.\nFor terms of use, see http://www.unicode.org/copyright.html\n";
        let insert_offset = 700 * 1024;
        bytes[insert_offset..insert_offset + unicode_notice.len()].copy_from_slice(unicode_notice);

        let (text, kind) = extract_text_for_detection(Path::new("node"), &bytes);

        assert_eq!(kind, ExtractedTextKind::BinaryStrings);
        assert!(text.contains("Unicode, Inc."), "{text}");
        assert!(text.contains("unicode.org/copyright.html"), "{text}");
    }
2756
    /// The same Apache notice at the 200 KiB offset, but without the Mach-O
    /// magic bytes, must leave the blob skipped.
    #[test]
    fn test_extract_text_for_detection_does_not_reopen_single_window_legal_noise_for_non_macho() {
        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES * 2];
        let apache_notice = b"// Licensed under the Apache License, Version 2.0 (the \"License\");\n// http://www.apache.org/licenses/LICENSE-2.0\n// SPDX-License-Identifier: Apache-2.0\n";
        let insert_offset = 200 * 1024;
        bytes[insert_offset..insert_offset + apache_notice.len()].copy_from_slice(apache_notice);

        let (text, kind) = extract_text_for_detection(Path::new("model.bin"), &bytes);

        assert!(text.is_empty());
        assert_eq!(kind, ExtractedTextKind::None);
    }
2769
    /// A short blob mixing printable ASCII with control and high bytes must be
    /// reported as `BinaryStrings`, containing only the printable runs of at
    /// least four characters.
    #[test]
    fn test_extract_text_for_detection_avoids_latin1_decode_for_binary_blob_noise() {
        let bytes = vec![
            0x28, 0x63, 0x29, 0x20, 0x4b, 0x30, 0x0e, 0x71, 0x86, 0x20, 0x62, 0x24, 0x4c,
        ];

        let (text, kind) = extract_text_for_detection(Path::new("fixture.blb"), &bytes);

        assert_eq!(kind, ExtractedTextKind::BinaryStrings);
        assert_eq!(text, "(c) K0\n b$L");
    }
2781
    /// An XMP payload one byte over `MAX_XMP_PACKET_BYTES` must be rejected.
    #[test]
    fn test_extract_raw_xmp_packet_rejects_oversized_png_itxt_payload() {
        let xmp = "A".repeat(MAX_XMP_PACKET_BYTES + 1);
        let bytes = build_png_with_xmp(&xmp);

        assert!(extract_raw_xmp_packet(&bytes, ImageFormat::Png).is_none());
    }
2789
    /// Failure lists made up only of password, cross-reference, or encryption
    /// dictionary messages are non-actionable; any other message is not.
    #[test]
    fn test_non_actionable_pdf_failures_are_suppressed() {
        assert!(is_non_actionable_pdf_failure(&[
            "from-bytes first-page: PDF is encrypted and requires a password".to_string(),
            "open full-document: PDF is encrypted and requires a password".to_string(),
        ]));
        assert!(is_non_actionable_pdf_failure(&[
            "from-bytes first-page: Invalid cross-reference table".to_string(),
            "open full-document: Invalid cross-reference table".to_string(),
        ]));
        assert!(is_non_actionable_pdf_failure(&[
            "from-bytes first-page: Invalid PDF: Encrypt dictionary missing /O".to_string(),
            "open full-document: Invalid PDF: security handler cannot be found".to_string(),
        ]));
        assert!(!is_non_actionable_pdf_failure(&[
            "from-bytes first-page: some other parser failure".to_string(),
        ]));
    }
2808
    /// Bytes with a ZIP signature must yield no text under both `.whl` and
    /// `.crate` file names.
    #[test]
    fn test_extract_text_for_detection_skips_zip_like_archives() {
        let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00\x08\x00artifact";

        let (whl_text, whl_kind) = extract_text_for_detection(Path::new("demo.whl"), zip_bytes);
        let (crate_text, crate_kind) =
            extract_text_for_detection(Path::new("demo.crate"), zip_bytes);

        assert!(whl_text.is_empty());
        assert_eq!(whl_kind, ExtractedTextKind::None);
        assert!(crate_text.is_empty());
        assert_eq!(crate_kind, ExtractedTextKind::None);
    }
2822
    /// A `.lib` fixture must produce text containing its embedded copyright
    /// notice.
    #[test]
    fn test_extract_text_for_detection_keeps_binary_strings_for_lib_fixtures() {
        let path =
            Path::new("testdata/copyright-golden/copyrights/copyright_php_lib-php_embed_lib.lib");
        let bytes = std::fs::read(path).expect("failed to read lib fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_ne!(kind, ExtractedTextKind::None);
        assert!(text.contains("Copyright nexB and others (c) 2012"));
    }
2834
    /// A TTF fixture must be reported as `FontMetadata`, with text that
    /// includes a license description, an OFL reference, and the font name.
    #[test]
    fn test_extract_text_for_detection_reads_font_metadata() {
        let path = Path::new("testdata/font-fixtures/Lato-Bold.ttf");
        let bytes = std::fs::read(path).expect("failed to read font fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_eq!(kind, ExtractedTextKind::FontMetadata);
        assert!(text.contains("License Description:"), "{text}");
        assert!(
            text.contains("Open Font License") || text.contains("OFL"),
            "{text}"
        );
        assert!(text.contains("Lato"), "{text}");
    }
2850
    /// For a 2,625,000-byte input of repeated `abcd\0`, the output must exceed
    /// 2,000,000 bytes and still end with the final `abcd` run.
    #[test]
    fn test_extract_printable_strings_scales_cap_for_medium_binary_files() {
        let bytes = b"abcd\0".repeat(525_000);

        let text = extract_printable_strings(&bytes);

        assert!(
            text.len() > 2_000_000,
            "unexpected truncation at {}",
            text.len()
        );
        assert!(text.ends_with("abcd"));
    }
2864
    /// An SVG fixture must be decoded as text and contain its Creative
    /// Commons public-domain URL.
    #[test]
    fn test_extract_text_for_detection_decodes_svg_fixture_text() {
        let path = Path::new(
            "testdata/license-golden/datadriven/external/fossology-tests/Public-domain/biohazard.svg",
        );
        let bytes = std::fs::read(path).expect("failed to read svg fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_eq!(kind, ExtractedTextKind::Decoded);
        assert!(text.contains("creativecommons.org/licenses/publicdomain"));
    }
2877
    /// Decoding a Java fixture must keep line positions intact: the lines at
    /// indices 2, 3, and 5 must match their expected content exactly.
    #[test]
    fn test_extract_text_for_detection_preserves_blank_comment_lines_in_utf8_source() {
        let path = Path::new("testdata/plugin_email_url/files/IMarkerActionFilter.java");
        let bytes = std::fs::read(path).expect("failed to read java fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_eq!(kind, ExtractedTextKind::Decoded);
        let lines: Vec<_> = text.lines().collect();
        assert_eq!(lines.get(2).copied(), Some(" *"));
        assert_eq!(
            lines.get(3).copied(),
            Some(" *https://github.com/rpm-software-management")
        );
        assert_eq!(lines.get(5).copied(), Some("https://gitlab.com/Conan_Kudo"));
    }
2894
    /// An RTF fixture must be decoded as text and contain the LGPL 2.1
    /// phrases.
    #[test]
    fn test_extract_text_for_detection_decodes_rtf_fixture_text() {
        let path = Path::new(
            "testdata/license-golden/datadriven/external/fossology-tests/LGPL/License.rtf",
        );
        let bytes = std::fs::read(path).expect("failed to read rtf fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_eq!(kind, ExtractedTextKind::Decoded);
        assert!(text.contains("GNU Lesser General Public"));
        assert!(text.contains("version"));
        assert!(text.contains("2.1 of the License"));
    }
2909
    /// A `.ts` file with textual content and a `video/mp2t` guess must be
    /// normalized to `text/plain`.
    #[test]
    fn test_normalize_mime_type_prefers_text_for_textual_video_guess() {
        assert_eq!(
            normalize_mime_type(
                Path::new("main.ts"),
                b"export const answer = 42;\n",
                Some("TypeScript"),
                "video/mp2t",
            ),
            "text/plain"
        );
    }
2922
    /// A textual `.js` file guessed as `application/octet-stream` must be
    /// normalized to `text/plain`.
    #[test]
    fn test_normalize_mime_type_prefers_text_for_octet_stream_source_guess() {
        assert_eq!(
            normalize_mime_type(
                Path::new("main.js"),
                b"console.log('hello');\n",
                Some("JavaScript"),
                "application/octet-stream",
            ),
            "text/plain"
        );
    }
2935
    /// A `.ts` file with binary content must keep its `video/mp2t` guess.
    #[test]
    fn test_normalize_mime_type_preserves_binary_video_guess() {
        assert_eq!(
            normalize_mime_type(
                Path::new("main.ts"),
                &[0, 159, 146, 150, 0, 1, 2, 3],
                Some("TypeScript"),
                "video/mp2t",
            ),
            "video/mp2t"
        );
    }
2948
    /// A four-byte binary payload must keep its `application/octet-stream`
    /// guess.
    #[test]
    fn test_normalize_mime_type_preserves_short_binary_octet_stream_guess() {
        assert_eq!(
            normalize_mime_type(
                Path::new("main.ts"),
                &[0, 159, 146, 150],
                Some("TypeScript"),
                "application/octet-stream",
            ),
            "application/octet-stream"
        );
    }
2961
    /// An empty file must be classified as `inode/x-empty` / `empty`: text,
    /// not binary, not source, with no language.
    #[test]
    fn test_classify_file_info_marks_empty_files_as_text_not_source() {
        let classification = classify_file_info(Path::new("test.txt"), b"");

        assert_eq!(classification.mime_type, "inode/x-empty");
        assert_eq!(classification.file_type, "empty");
        assert!(!classification.is_binary);
        assert!(classification.is_text);
        assert!(!classification.is_source);
        assert_eq!(classification.programming_language, None);
    }
2973
    /// Valid JSON must be classified as JSON text with no programming
    /// language, and not as source.
    #[test]
    fn test_classify_file_info_keeps_json_out_of_programming_language() {
        let classification = classify_file_info(Path::new("package.json"), br#"{"name":"demo"}"#);

        assert_eq!(classification.mime_type, "application/json");
        assert_eq!(classification.file_type, "JSON text data");
        assert!(classification.is_text);
        assert!(!classification.is_source);
        assert_eq!(classification.programming_language, None);
    }
2984
    /// Invalid JSON that is still text must fall back to `text/plain`, not
    /// JSON.
    #[test]
    fn test_classify_file_info_does_not_label_invalid_json_text_as_json() {
        let classification =
            classify_file_info(Path::new("broken.json"), b"{ definitely not json\n");

        assert_eq!(classification.mime_type, "text/plain");
        assert_eq!(classification.file_type, "UTF-8 Unicode text");
        assert!(classification.is_text);
        assert!(!classification.is_binary);
    }
2995
    /// Binary bytes under a `.json` name must be classified as generic
    /// binary data.
    #[test]
    fn test_classify_file_info_does_not_label_binary_json_garbage_as_json() {
        let classification =
            classify_file_info(Path::new("broken.json"), &[0xff, 0x00, 0x01, 0x02]);

        assert_eq!(classification.mime_type, "application/octet-stream");
        assert_eq!(classification.file_type, "data");
        assert!(classification.is_binary);
        assert!(!classification.is_text);
    }
3006
    /// JSON encoded as UTF-16 with a leading `FF FE` byte-order mark must be
    /// classified as JSON text.
    #[test]
    fn test_classify_file_info_treats_valid_utf16_json_with_bom_as_text() {
        let classification = classify_file_info(
            Path::new("utf16.json"),
            &[
                0xFF, 0xFE, 0x5B, 0x00, 0x22, 0x00, 0xE9, 0x00, 0x22, 0x00, 0x5D, 0x00,
            ],
        );

        assert!(!classification.is_binary);
        assert!(classification.is_text);
        assert_eq!(classification.mime_type, "application/json");
        assert_eq!(classification.file_type, "JSON text data");
    }
3021
    /// Big-endian UTF-16 JSON with no byte-order mark must be classified as
    /// JSON text.
    #[test]
    fn test_classify_file_info_treats_valid_utf16be_json_without_bom_as_text() {
        let classification = classify_file_info(
            Path::new("utf16be.json"),
            &[0x00, 0x5B, 0x00, 0x22, 0x00, 0xE9, 0x00, 0x22, 0x00, 0x5D],
        );

        assert!(!classification.is_binary);
        assert!(classification.is_text);
        assert_eq!(classification.mime_type, "application/json");
        assert_eq!(classification.file_type, "JSON text data");
    }
3034
    /// A four-byte big-endian UTF-16 `[]` must be classified as JSON text.
    #[test]
    fn test_classify_file_info_treats_small_valid_utf16be_json_literal_as_text() {
        let classification =
            classify_file_info(Path::new("utf16be-literal.json"), &[0x00, 0x5B, 0x00, 0x5D]);

        assert!(!classification.is_binary);
        assert!(classification.is_text);
        assert_eq!(classification.mime_type, "application/json");
        assert_eq!(classification.file_type, "JSON text data");
    }
3045
    /// Big-endian UTF-16 text preceded by `CORRUPTED_UTF16_BOM_PREFIX` must
    /// be decoded, with the Apache notice readable in the result.
    #[test]
    fn test_extract_text_for_detection_decodes_utf16be_text_with_corrupted_bom_prefix() {
        let mut bytes = super::CORRUPTED_UTF16_BOM_PREFIX.to_vec();
        for code_unit in
            "Licensed to the Apache Software Foundation\nApache License, Version 2.0".encode_utf16()
        {
            bytes.extend_from_slice(&code_unit.to_be_bytes());
        }

        let (text, kind) = extract_text_for_detection(Path::new("notice.ftl"), &bytes);

        assert_eq!(kind, ExtractedTextKind::Decoded);
        assert!(text.contains("Apache Software Foundation"), "{text}");
        assert!(text.contains("Apache License, Version 2.0"), "{text}");
    }
3061
    /// The bare JSON literal `true` must be classified as JSON text.
    #[test]
    fn test_classify_file_info_treats_small_valid_json_literals_as_text() {
        let classification = classify_file_info(Path::new("true.json"), b"true");

        assert!(!classification.is_binary);
        assert!(classification.is_text);
        assert_eq!(classification.mime_type, "application/json");
        assert_eq!(classification.file_type, "JSON text data");
    }
3071
    /// A JSON-shaped payload whose string holds multi-byte sequences followed
    /// by a stray `0xFA` byte must be classified as plain text, not JSON or
    /// binary.
    #[test]
    fn test_classify_file_info_treats_json_wrapped_invalid_utf8_sequences_as_text() {
        let classification = classify_file_info(
            Path::new("wrapped.json"),
            &[0x5B, 0x22, 0xE6, 0x97, 0xA5, 0xD1, 0x88, 0xFA, 0x22, 0x5D],
        );

        assert!(!classification.is_binary);
        assert!(classification.is_text);
        assert_eq!(classification.mime_type, "text/plain");
        assert_eq!(classification.file_type, "text, with no line terminators");
    }
3084
    /// A JSON-shaped payload whose string is a single `0xFF` byte must stay
    /// classified as binary data.
    #[test]
    fn test_classify_file_info_keeps_lone_ff_json_byte_binary() {
        let classification =
            classify_file_info(Path::new("lone-ff.json"), &[0x5B, 0x22, 0xFF, 0x22, 0x5D]);

        assert!(classification.is_binary);
        assert!(!classification.is_text);
        assert_eq!(classification.mime_type, "application/octet-stream");
        assert_eq!(classification.file_type, "data");
    }
3095
    /// A `.json` payload that starts with high and NUL bytes must stay
    /// classified as binary.
    #[test]
    fn test_classify_file_info_keeps_nul_heavy_crash_json_binary() {
        let classification = classify_file_info(
            Path::new("crash.json"),
            &[
                0xFE, 0x90, 0x00, 0x00, 0x00, 0x93, 0x5B, 0x5B, 0x32, 0x38, 0x36,
            ],
        );

        assert!(classification.is_binary);
        assert!(!classification.is_text);
        assert_eq!(classification.mime_type, "application/octet-stream");
    }
3109
    /// A `Dockerfile` must be classified as source in the `Dockerfile`
    /// language, and not as a script.
    #[test]
    fn test_classify_file_info_treats_dockerfile_as_source() {
        let classification = classify_file_info(Path::new("Dockerfile"), b"FROM scratch\n");

        assert_eq!(
            classification.programming_language.as_deref(),
            Some("Dockerfile")
        );
        assert!(classification.is_source);
        assert!(!classification.is_script);
        assert_eq!(
            classification.file_type,
            "Dockerfile source, UTF-8 Unicode text"
        );
    }
3125
    /// A `Makefile` must be classified as plain text with no language: not
    /// source and not a script.
    #[test]
    fn test_classify_file_info_treats_makefile_as_text_not_source() {
        let classification = classify_file_info(Path::new("Makefile"), b"all:\n\techo hi\n");

        assert_eq!(classification.programming_language, None);
        assert!(classification.is_text);
        assert!(!classification.is_source);
        assert!(!classification.is_script);
        assert_eq!(classification.file_type, "UTF-8 Unicode text");
    }
3136
    /// ZIP-signature bytes under `.egg` and `.nupkg` names must be classified
    /// as Zip archives.
    #[test]
    fn test_classify_file_info_marks_supported_package_archives() {
        let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00";

        let egg = classify_file_info(Path::new("demo.egg"), zip_bytes);
        let nupkg = classify_file_info(Path::new("demo.nupkg"), zip_bytes);

        assert!(egg.is_archive);
        assert_eq!(egg.mime_type, "application/zip");
        assert_eq!(egg.file_type, "Zip archive data");
        assert!(nupkg.is_archive);
        assert_eq!(nupkg.mime_type, "application/zip");
        assert_eq!(nupkg.file_type, "Zip archive data");
    }
3151
    /// PNG-signature bytes must be classified as binary media: not text,
    /// archive, or source.
    #[test]
    fn test_classify_file_info_marks_png_as_binary_media() {
        let png_bytes = b"\x89PNG\r\n\x1a\n\x00\x00\x00\x0dIHDR";

        let classification = classify_file_info(Path::new("logo.png"), png_bytes);

        assert_eq!(classification.mime_type, "image/png");
        assert_eq!(classification.file_type, "PNG image data");
        assert!(classification.is_binary);
        assert!(!classification.is_text);
        assert!(classification.is_media);
        assert!(!classification.is_archive);
        assert!(!classification.is_source);
    }
3166
3167 #[test]
3168 fn test_classify_file_info_marks_pdf_as_binary_document() {
3169 let pdf_bytes = b"%PDF-1.7\n1 0 obj\n<< /Type /Catalog >>\n";
3170
3171 let classification = classify_file_info(Path::new("report.pdf"), pdf_bytes);
3172
3173 assert_eq!(classification.mime_type, "application/pdf");
3174 assert_eq!(classification.file_type, "PDF document");
3175 assert!(classification.is_binary);
3176 assert!(!classification.is_text);
3177 assert!(!classification.is_archive);
3178 assert!(!classification.is_media);
3179 }
3180
3181 #[test]
3182 fn test_classify_file_info_marks_binary_blobs_as_binary() {
3183 let classification =
3184 classify_file_info(Path::new("blob.bin"), &[0, 159, 146, 150, 0, 1, 2, 3, 4, 5]);
3185
3186 assert!(classification.is_binary);
3187 assert!(!classification.is_text);
3188 assert!(!classification.is_source);
3189 assert_eq!(classification.programming_language, None);
3190 }
3191
3192 #[test]
3193 fn test_classify_file_info_treats_yaml_as_text_not_source() {
3194 let classification = classify_file_info(Path::new("config.yaml"), b"key: value\n");
3195
3196 assert_eq!(classification.programming_language, None);
3197 assert!(classification.is_text);
3198 assert!(!classification.is_source);
3199 assert_eq!(classification.file_type, "YAML text data");
3200 }
3201
3202 #[test]
3203 fn test_classify_file_info_classifies_common_build_manifests() {
3204 let gradle = classify_file_info(Path::new("build.gradle"), b"plugins { id 'java' }\n");
3205 let flake = classify_file_info(Path::new("flake.nix"), b"{ inputs, ... }: {}\n");
3206 let cmake = classify_file_info(
3207 Path::new("toolchain.cmake"),
3208 b"set(CMAKE_CXX_STANDARD 20)\n",
3209 );
3210 let gitmodules = classify_file_info(
3211 Path::new(".gitmodules"),
3212 b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
3213 );
3214
3215 assert_eq!(gradle.programming_language.as_deref(), Some("Groovy"));
3216 assert!(gradle.is_source);
3217 assert_eq!(gradle.mime_type, "text/plain");
3218 assert_eq!(gradle.file_type, "Groovy source, UTF-8 Unicode text");
3219
3220 assert_eq!(flake.programming_language.as_deref(), Some("Nix"));
3221 assert!(flake.is_source);
3222 assert_eq!(flake.mime_type, "text/plain");
3223 assert_eq!(flake.file_type, "Nix source, UTF-8 Unicode text");
3224
3225 assert_eq!(cmake.programming_language.as_deref(), Some("CMake"));
3226 assert!(cmake.is_source);
3227 assert_eq!(cmake.file_type, "CMake source, UTF-8 Unicode text");
3228
3229 assert_eq!(gitmodules.programming_language, None);
3230 assert!(gitmodules.is_text);
3231 assert!(!gitmodules.is_source);
3232 assert_eq!(gitmodules.file_type, "Git configuration text");
3233 }
3234
3235 #[test]
3236 fn test_classify_file_info_labels_cpp_headers_and_ipp_separately() {
3237 let header = classify_file_info(
3238 Path::new("include/demo.hpp"),
3239 b"#pragma once\nclass Demo {};\n",
3240 );
3241 let ipp = classify_file_info(
3242 Path::new("include/detail/demo.ipp"),
3243 b"template <class T> void parse() {}\n",
3244 );
3245
3246 assert_eq!(header.programming_language.as_deref(), Some("C++"));
3247 assert!(header.is_source);
3248 assert!(!header.is_script);
3249 assert_eq!(header.file_type, "C++ source, UTF-8 Unicode text");
3250
3251 assert_eq!(ipp.programming_language, None);
3252 assert!(!ipp.is_source);
3253 assert!(!ipp.is_script);
3254 assert_eq!(ipp.file_type, "UTF-8 Unicode text");
3255 }
3256
3257 #[test]
3258 fn test_classify_file_info_preserves_specific_shell_family_labels() {
3259 let bash = classify_file_info(Path::new("bin/run"), b"#!/usr/bin/env bash\necho hi\n");
3260
3261 assert_eq!(bash.programming_language.as_deref(), Some("Bash"));
3262 assert!(bash.is_script);
3263 assert_eq!(bash.file_type, "bash script, UTF-8 Unicode text executable");
3264 }
3265
3266 #[test]
3267 fn test_classify_file_info_marks_jamfile_as_source() {
3268 let jamfile = classify_file_info(Path::new("Jamfile"), b"lib boost_json ;\n");
3269
3270 assert_eq!(jamfile.programming_language.as_deref(), Some("Jamfile"));
3271 assert!(jamfile.is_source);
3272 assert!(!jamfile.is_script);
3273 assert_eq!(jamfile.file_type, "Jamfile source, UTF-8 Unicode text");
3274 }
3275
3276 #[test]
3277 fn test_classify_file_info_labels_javascript_shebang_scripts() {
3278 let classification = classify_file_info(
3279 Path::new("bin/run"),
3280 b"#!/usr/bin/env node\nconsole.log('hello');\n",
3281 );
3282
3283 assert_eq!(
3284 classification.programming_language.as_deref(),
3285 Some("JavaScript")
3286 );
3287 assert!(classification.is_script);
3288 assert_eq!(
3289 classification.file_type,
3290 "javascript script, UTF-8 Unicode text executable"
3291 );
3292 }
3293
3294 #[test]
3295 fn test_classify_file_info_uses_non_utf8_text_labels_for_latin1_scripts() {
3296 let classification = classify_file_info(
3297 Path::new("script.py"),
3298 b"# coding: latin-1\nprint(\"caf\xe9\")\n",
3299 );
3300
3301 assert_eq!(
3302 classification.programming_language.as_deref(),
3303 Some("Python")
3304 );
3305 assert!(classification.is_script);
3306 assert_eq!(classification.file_type, "python script, text executable");
3307 }
3308
3309 #[test]
3310 fn test_classify_file_info_treats_textual_tga_as_media() {
3311 let classification = classify_file_info(Path::new("texture.tga"), b"not really a tga\n");
3312
3313 assert!(classification.is_media);
3314 assert!(classification.is_text);
3315 assert!(!classification.is_binary);
3316 }
3317
3318 #[test]
3319 fn test_classify_file_info_keeps_binaryish_source_extension_out_of_text_path() {
3320 let classification =
3321 classify_file_info(Path::new("main.ts"), &[0x80, 0x81, 0x82, 0x83, 0x84, 0x85]);
3322
3323 assert!(classification.is_binary);
3324 assert!(!classification.is_text);
3325 assert!(!classification.is_source);
3326 assert_eq!(classification.programming_language, None);
3327 }
3328
3329 #[test]
3330 fn test_extract_text_for_detection_skips_unsupported_image_formats() {
3331 let gif_bytes = b"GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff,\x00\x00\x00\x00\x01\x00\x01\x00\x00\x02\x02D\x01\x00;";
3332
3333 let (text, kind) = extract_text_for_detection(Path::new("tiny.gif"), gif_bytes);
3334
3335 assert!(text.is_empty());
3336 assert_eq!(kind, ExtractedTextKind::None);
3337 }
3338
3339 #[test]
3340 fn test_classify_file_info_preserves_language_detection_precedence_matrix() {
3341 let cases = [
3342 (
3343 Path::new("bin/run"),
3344 b"#!/usr/bin/env node\nconsole.log('hello');\n".as_slice(),
3345 Some("JavaScript"),
3346 true,
3347 true,
3348 ),
3349 (
3350 Path::new("Dockerfile"),
3351 b"FROM scratch\n".as_slice(),
3352 Some("Dockerfile"),
3353 true,
3354 false,
3355 ),
3356 (
3357 Path::new("package.json"),
3358 br#"{"name":"demo"}"#.as_slice(),
3359 None,
3360 false,
3361 false,
3362 ),
3363 (
3364 Path::new("config.yaml"),
3365 b"key: value\n".as_slice(),
3366 None,
3367 false,
3368 false,
3369 ),
3370 (
3371 Path::new("Makefile"),
3372 b"all:\n\techo hi\n".as_slice(),
3373 None,
3374 false,
3375 false,
3376 ),
3377 ];
3378
3379 for (path, bytes, expected_language, expected_is_source, expected_is_script) in cases {
3380 let classification = classify_file_info(path, bytes);
3381
3382 assert_eq!(
3383 classification.programming_language.as_deref(),
3384 expected_language,
3385 "unexpected language for {}",
3386 path.display()
3387 );
3388 assert_eq!(
3389 classification.is_source,
3390 expected_is_source,
3391 "unexpected is_source for {}",
3392 path.display()
3393 );
3394 assert_eq!(
3395 classification.is_script,
3396 expected_is_script,
3397 "unexpected is_script for {}",
3398 path.display()
3399 );
3400 }
3401 }
3402}