1use std::borrow::Cow;
5use std::collections::BTreeSet;
6use std::fs;
7use std::io::{BufReader, Cursor, Read};
8use std::panic::{AssertUnwindSafe, catch_unwind};
9use std::path::Path;
10
11use chrono::{TimeZone, Utc};
12use file_format::{FileFormat, Kind as FileFormatKind};
13use flate2::read::ZlibDecoder;
14use glob::Pattern;
15use image::{ImageDecoder, ImageFormat, ImageReader};
16use mime_guess::from_path;
17use quick_xml::events::Event;
18use quick_xml::reader::Reader as XmlReader;
19
20use crate::parsers::windows_executable::extract_windows_executable_metadata_text;
21use crate::utils::font::extract_font_metadata_text;
22use crate::utils::language::detect_language;
23
/// Identifies which extraction strategy produced the text returned by the
/// text-extraction entry points in this module.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtractedTextKind {
    /// Nothing usable was extracted; the accompanying text is empty.
    None,
    /// Text decoded directly from the bytes (also used for extracted RTF text).
    Decoded,
    /// Font metadata text, optionally followed by printable strings from the file.
    FontMetadata,
    /// Text extracted from a PDF document.
    Pdf,
    /// Printable strings scraped from otherwise binary content.
    BinaryStrings,
    /// Text taken from image metadata.
    ImageMetadata,
    /// Only Windows executable metadata text was available.
    WindowsExecutableMetadata,
}
34
/// Classification summary produced by `classify_file_info` for a single file.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileInfoClassification {
    /// Detected MIME type of the content.
    pub mime_type: String,
    /// Human-readable description of the file type.
    pub file_type: String,
    /// Detected language; only populated when `is_source` is true.
    pub programming_language: Option<String>,
    /// Whether the content is treated as binary.
    pub is_binary: bool,
    /// Always the negation of `is_binary`.
    pub is_text: bool,
    /// Whether the file looks like an archive, compressed file, or package.
    pub is_archive: bool,
    /// Whether the file looks like image, audio, or video content.
    pub is_media: bool,
    /// Whether the file is considered source code.
    pub is_source: bool,
    /// Whether the file is considered a script.
    pub is_script: bool,
}
47
// NOTE(review): the two image-metadata limits and the large-binary threshold are
// not referenced in the visible part of this file; the descriptions below are
// inferred from their names and should be confirmed against their users.
/// Presumably the maximum number of image metadata values collected per file.
const MAX_IMAGE_METADATA_VALUES: usize = 64;
/// Presumably the maximum size, in bytes, of collected image metadata text.
const MAX_IMAGE_METADATA_TEXT_BYTES: usize = 32 * 1024;
/// Content counts as binary when more than `len / DIVISOR` bytes are control bytes.
const BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR: usize = 10;
/// Presumably the size threshold used when deciding to skip large opaque binaries.
const LARGE_OPAQUE_BINARY_SKIP_BYTES: usize = 512 * 1024;
/// Inputs larger than this are never parsed as JSON and are reported as invalid JSON.
const JSON_VALIDATION_MAX_BYTES: usize = 4 * 1024 * 1024;
/// Lowercase extensions that are never classified as source code.
const PLAIN_TEXT_EXTENSIONS: &[&str] = &[
    "rst", "rest", "md", "txt", "log", "json", "xml", "yaml", "yml", "toml", "ini",
];
/// Lowercase extensions treated as binary unless the detected format is textual.
const BINARY_EXTENSIONS: &[&str] = &[
    "pyc", "pyo", "pgm", "pbm", "ppm", "mp3", "mp4", "mpeg", "mpg", "emf",
];
/// Lowercase extensions that mark a non-text file as an archive.
const ARCHIVE_EXTENSIONS: &[&str] = &[
    "zip", "jar", "war", "ear", "tar", "gz", "tgz", "bz2", "xz", "7z", "rar", "apk", "deb", "rpm",
    "whl", "crate", "egg", "gem", "nupkg", "sqs", "squashfs",
];
63
64pub fn get_creation_date(metadata: &fs::Metadata) -> Option<String> {
66 metadata.modified().ok().map(|time: std::time::SystemTime| {
67 let seconds_since_epoch = time
68 .duration_since(std::time::UNIX_EPOCH)
69 .unwrap()
70 .as_secs() as i64;
71
72 Utc.timestamp_opt(seconds_since_epoch, 0)
73 .single()
74 .unwrap_or_else(Utc::now)
75 .format("%Y-%m-%d")
76 .to_string()
77 })
78}
79
80pub fn is_path_excluded(path: &Path, exclude_patterns: &[Pattern]) -> bool {
82 let path_str = path.to_string_lossy();
83 let file_name = path
84 .file_name()
85 .map(|name| name.to_string_lossy())
86 .unwrap_or_default();
87
88 for pattern in exclude_patterns {
89 if pattern.matches(&path_str) {
91 return true;
92 }
93
94 if pattern.matches(&file_name) {
96 return true;
97 }
98 }
99
100 false
101}
102
103pub fn decode_bytes_to_string(bytes: &[u8]) -> String {
110 if let Some(decoded) = decode_utf16_text(bytes) {
111 return decoded;
112 }
113
114 match String::from_utf8(bytes.to_vec()) {
115 Ok(s) => s,
116 Err(e) => {
117 let bytes = e.into_bytes();
118 if has_binary_control_chars(&bytes) {
119 return String::new();
120 }
121 bytes.iter().map(|&b| b as char).collect()
122 }
123 }
124}
125
126pub fn extract_text_for_detection(path: &Path, bytes: &[u8]) -> (String, ExtractedTextKind) {
127 let (text, kind, _) = extract_text_for_detection_with_diagnostics(path, bytes);
128 (text, kind)
129}
130
131pub(crate) fn augment_license_detection_text<'a>(path: &Path, text: &'a str) -> Cow<'a, str> {
132 let Some(extension) = path.extension().and_then(|ext| ext.to_str()) else {
133 return Cow::Borrowed(text);
134 };
135 if !matches!(
136 extension.to_ascii_lowercase().as_str(),
137 "md" | "markdown" | "html" | "htm"
138 ) {
139 return Cow::Borrowed(text);
140 }
141
142 let mut hints = Vec::new();
143 if text.contains("CC BY 4.0") || text.contains("creativecommons.org/licenses/by/4.0") {
144 hints.push("Creative Commons Attribution 4.0 International License".to_string());
145 }
146 if text.contains("Apache License (Version 2.0)") || text.contains("Apache License, Version 2.0")
147 {
148 hints.push(
149 "Licensed under the Apache License, Version 2.0. http://www.apache.org/licenses/LICENSE-2.0"
150 .to_string(),
151 );
152 }
153
154 hints.extend(extract_shields_license_badge_hints(text));
155
156 if hints.is_empty() {
157 Cow::Borrowed(text)
158 } else {
159 let mut augmented =
160 String::with_capacity(text.len() + hints.iter().map(String::len).sum::<usize>() + 8);
161 augmented.push_str(text);
162 augmented.push_str("\n\n");
163 for (index, hint) in hints.into_iter().enumerate() {
164 if index > 0 {
165 augmented.push('\n');
166 }
167 augmented.push_str(&hint);
168 }
169 Cow::Owned(augmented)
170 }
171}
172
/// Collects license hints from shields.io static license badges found in `text`.
///
/// A badge is recognised by the `img.shields.io/badge/license-` marker. The
/// badge path runs up to the next `)`, `]`, quote, space, or newline; any query
/// string or fragment (`?style=flat`, `#...`) is ignored. The path must end in
/// `.svg`, and its last `-`-separated segment (the colour) is discarded.
///
/// Returns the hints sorted and de-duplicated.
fn extract_shields_license_badge_hints(text: &str) -> Vec<String> {
    const NEEDLE: &str = "img.shields.io/badge/license-";

    let mut hints = Vec::new();
    let mut rest = text;

    while let Some(index) = rest.find(NEEDLE) {
        let suffix = &rest[index + NEEDLE.len()..];
        let end = suffix
            .find([')', ']', '"', '\'', ' ', '\n'])
            .unwrap_or(suffix.len());
        // Always resume after this badge, whether or not it yields a hint.
        rest = &suffix[end..];

        let badge = &suffix[..end];
        // Badge URLs commonly carry options such as `?style=flat`; drop them so
        // the `.svg` suffix check sees only the path.
        let badge = badge.split(['?', '#']).next().unwrap_or(badge);
        let Some(badge) = badge.strip_suffix(".svg") else {
            continue;
        };

        let mut segments: Vec<_> = badge
            .split('-')
            .filter(|segment| !segment.is_empty())
            .collect();
        // At least a label and a colour are needed.
        if segments.len() < 2 {
            continue;
        }
        // The trailing segment is the badge colour, not part of the license name.
        segments.pop();

        let candidate = segments.join("-").replace("%20", " ").replace('_', "-");
        if !candidate.is_empty() {
            hints.push(canonical_shields_license_hint(&candidate));
        }
    }

    hints.sort();
    hints.dedup();
    hints
}

/// Maps a badge label to the hint text, special-casing MIT and Apache 2.0.
fn canonical_shields_license_hint(candidate: &str) -> String {
    match candidate.trim() {
        "MIT" => "The MIT License".to_string(),
        "Apache-2.0" | "Apache 2.0" => "Apache License 2.0".to_string(),
        other => format!("{other} License"),
    }
}
219
/// Extracts text suitable for detection from `bytes`, trying format-specific
/// extractors before generic decoding.
///
/// Returns the extracted text, the kind of extraction that produced it, and an
/// optional diagnostic message (only set when PDF extraction yields no text).
///
/// Extractors are tried in this order: RTF, PDF, image metadata, font metadata,
/// then generic decoding / printable-string extraction, with Windows executable
/// metadata prepended where available.
pub(crate) fn extract_text_for_detection_with_diagnostics(
    path: &Path,
    bytes: &[u8],
) -> (String, ExtractedTextKind, Option<String>) {
    let ext = path
        .extension()
        .and_then(|e| e.to_str())
        .map(|s| s.to_ascii_lowercase());
    let detected_format = detect_file_format(bytes);

    // RTF: text that is only whitespace counts as "nothing extracted".
    if looks_like_rtf(bytes, ext.as_deref()) {
        let text = extract_rtf_text(bytes);
        return if text.trim().is_empty() {
            (String::new(), ExtractedTextKind::None, None)
        } else {
            (text, ExtractedTextKind::Decoded, None)
        };
    }

    // PDF: the scan error is surfaced only when no text came back.
    if looks_like_pdf(bytes) || detected_format.short_name() == Some("PDF") {
        let (text, scan_error) = extract_pdf_text(path, bytes);
        return if text.is_empty() {
            (String::new(), ExtractedTextKind::None, scan_error)
        } else {
            (text, ExtractedTextKind::Pdf, None)
        };
    }

    // Images: prefer metadata text. If there is none and the bytes are not a
    // supported image container, fall back to plain decoding.
    if let Some(format) = supported_image_metadata_format(ext.as_deref(), detected_format) {
        let text = extract_image_metadata_text(bytes, format);
        return if text.is_empty() {
            if is_supported_image_container(bytes, format) {
                (String::new(), ExtractedTextKind::None, None)
            } else {
                let decoded = decode_bytes_to_string(bytes);
                if decoded.is_empty() {
                    (String::new(), ExtractedTextKind::None, None)
                } else {
                    (decoded, ExtractedTextKind::Decoded, None)
                }
            }
        } else {
            (text, ExtractedTextKind::ImageMetadata, None)
        };
    }

    // Fonts: metadata text first, followed by printable strings when present.
    if let Some(text) = extract_font_metadata_text(path, bytes) {
        let strings = extract_printable_strings(bytes);
        let combined = if strings.is_empty() {
            text
        } else {
            combine_extracted_text_fragments(Some(text), strings)
        };
        return (combined, ExtractedTextKind::FontMetadata, None);
    }

    let windows_executable_metadata_text = extract_windows_executable_metadata_text(bytes);
    // A file with Windows executable metadata is never treated as a large opaque binary.
    let large_opaque_binary = windows_executable_metadata_text.is_none()
        && is_large_opaque_binary_candidate(bytes, detected_format);

    if should_skip_large_opaque_binary_text_extraction(path, bytes, detected_format) {
        return windows_metadata_or_empty_result(windows_executable_metadata_text);
    }

    if should_skip_binary_string_extraction(path, bytes, detected_format) {
        return (String::new(), ExtractedTextKind::None, None);
    }

    // Full decoding is attempted only for content that is not a large opaque binary.
    if !large_opaque_binary {
        let decoded = decode_bytes_to_string(bytes);
        if !decoded.is_empty() {
            let combined =
                combine_extracted_text_fragments(windows_executable_metadata_text, decoded);
            return (combined, ExtractedTextKind::Decoded, None);
        }
    }

    // Last resort: printable strings, sampled for large opaque binaries.
    let text = if large_opaque_binary {
        extract_sampled_printable_strings(bytes)
    } else {
        extract_printable_strings(bytes)
    };
    if text.is_empty() {
        windows_metadata_or_empty_result(windows_executable_metadata_text)
    } else {
        (
            combine_extracted_text_fragments(windows_executable_metadata_text, text),
            ExtractedTextKind::BinaryStrings,
            None,
        )
    }
}
312
/// Joins an optional prefix and a suffix with a newline.
///
/// A missing or empty prefix yields the suffix unchanged; an empty suffix
/// yields the prefix unchanged.
fn combine_extracted_text_fragments(prefix: Option<String>, suffix: String) -> String {
    let Some(prefix) = prefix.filter(|candidate| !candidate.is_empty()) else {
        return suffix;
    };

    if suffix.is_empty() {
        prefix
    } else {
        format!("{prefix}\n{suffix}")
    }
}
320
321fn windows_metadata_or_empty_result(
322 windows_executable_metadata_text: Option<String>,
323) -> (String, ExtractedTextKind, Option<String>) {
324 if let Some(metadata_text) = windows_executable_metadata_text {
325 (
326 metadata_text,
327 ExtractedTextKind::WindowsExecutableMetadata,
328 None,
329 )
330 } else {
331 (String::new(), ExtractedTextKind::None, None)
332 }
333}
334
335pub fn classify_file_info(path: &Path, bytes: &[u8]) -> FileInfoClassification {
336 let detected_format = detect_file_format(bytes);
337 let detected_language = detect_language(path, bytes);
338 let is_binary = detect_is_binary(path, bytes, detected_format, detected_language.as_deref());
339 let is_text = !is_binary;
340 let mime_type = detect_mime_type(path, bytes, detected_format, detected_language.as_deref());
341 let is_archive = detect_is_archive(path, bytes, &mime_type, is_text, detected_format);
342 let is_media = detect_is_media(path, bytes, &mime_type, detected_format);
343 let is_script = detect_is_script(path, bytes, detected_language.as_deref(), is_text);
344 let is_source = detect_is_source(path, detected_language.as_deref(), is_text, is_script);
345 let programming_language = is_source.then(|| detected_language.clone()).flatten();
346 let file_type = detect_file_type(
347 path,
348 bytes,
349 detected_format,
350 &mime_type,
351 programming_language.as_deref(),
352 is_binary,
353 is_text,
354 is_archive,
355 is_media,
356 is_script,
357 );
358
359 FileInfoClassification {
360 mime_type,
361 file_type,
362 programming_language,
363 is_binary,
364 is_text,
365 is_archive,
366 is_media,
367 is_source,
368 is_script,
369 }
370}
371
372fn detect_file_format(bytes: &[u8]) -> FileFormat {
373 FileFormat::from_reader(Cursor::new(bytes)).unwrap_or(FileFormat::ArbitraryBinaryData)
374}
375
/// UTF-8 encoding of two U+FFFD replacement characters; stripped from the front
/// of the input before UTF-16 heuristics run.
// NOTE(review): presumably what a UTF-16 BOM becomes after a lossy UTF-8
// re-encode, as the name suggests — confirm.
const CORRUPTED_UTF16_BOM_PREFIX: &[u8] = &[0xEF, 0xBF, 0xBD, 0xEF, 0xBF, 0xBD];
377
/// Returns `true` when `bytes` is valid UTF-8 (the empty slice included).
fn is_utf8_text(bytes: &[u8]) -> bool {
    matches!(std::str::from_utf8(bytes), Ok(_))
}
381
382fn strip_corrupted_utf16_bom_prefix(bytes: &[u8]) -> &[u8] {
383 bytes
384 .strip_prefix(CORRUPTED_UTF16_BOM_PREFIX)
385 .unwrap_or(bytes)
386}
387
/// Decodes `bytes` as UTF-16 code units of the given endianness.
///
/// Returns `None` for empty or odd-length input, for invalid UTF-16, and for
/// text containing a NUL character. When `require_text_shape` is set, the
/// decoded text must also look like text: at least three visible characters,
/// mostly letters, listed punctuation, or whitespace, and either some letters
/// or at least two punctuation marks.
fn decode_utf16_units(bytes: &[u8], is_le: bool, require_text_shape: bool) -> Option<String> {
    if bytes.is_empty() || !bytes.len().is_multiple_of(2) {
        return None;
    }

    let to_unit: fn([u8; 2]) -> u16 = if is_le {
        u16::from_le_bytes
    } else {
        u16::from_be_bytes
    };
    let units = bytes
        .chunks_exact(2)
        .map(|pair| to_unit([pair[0], pair[1]]));

    let mut decoded = String::with_capacity(bytes.len() / 2);
    for unit in std::char::decode_utf16(units) {
        decoded.push(unit.ok()?);
    }

    // A NUL character rejects the text in both modes.
    if decoded.contains('\0') {
        return None;
    }
    if !require_text_shape {
        return Some(decoded);
    }

    let mut visible = 0usize;
    let mut alpha = 0usize;
    let mut punctuation = 0usize;
    let mut whitespace = 0usize;
    for ch in decoded.chars() {
        if !ch.is_control() || matches!(ch, '\n' | '\r' | '\t') {
            visible += 1;
        }
        if ch.is_alphabetic() {
            alpha += 1;
        }
        if matches!(
            ch,
            '{' | '}'
                | '['
                | ']'
                | '<'
                | '>'
                | '('
                | ')'
                | ':'
                | ';'
                | ','
                | '"'
                | '\''
                | '/'
                | '='
                | '-'
                | '_'
                | '#'
                | '!'
        ) {
            punctuation += 1;
        }
        if ch.is_whitespace() {
            whitespace += 1;
        }
    }

    if visible < 3 {
        return None;
    }

    // Allow up to a fifth of the visible characters to be something else.
    let textish = alpha + punctuation + whitespace;
    let mostly_text = textish + (visible / 5) >= visible;
    let has_structure = alpha != 0 || punctuation >= 2;
    if mostly_text && has_structure {
        Some(decoded)
    } else {
        None
    }
}
456
457fn detect_utf16_endianness(bytes: &[u8]) -> Option<bool> {
458 let stripped = strip_corrupted_utf16_bom_prefix(bytes);
459 if stripped.len() < 4 || !stripped.len().is_multiple_of(2) {
460 return None;
461 }
462
463 let pair_count = stripped.len() / 2;
464 let even_zero = stripped.iter().step_by(2).filter(|&&b| b == 0).count();
465 let odd_zero = stripped
466 .iter()
467 .skip(1)
468 .step_by(2)
469 .filter(|&&b| b == 0)
470 .count();
471
472 let looks_like_be = even_zero * 3 >= pair_count && odd_zero * 6 <= pair_count;
473 let looks_like_le = odd_zero * 3 >= pair_count && even_zero * 6 <= pair_count;
474
475 match (looks_like_le, looks_like_be) {
476 (true, false) => Some(true),
477 (false, true) => Some(false),
478 (true, true) => Some(true),
479 (false, false) => None,
480 }
481}
482
483fn decode_utf16_text(bytes: &[u8]) -> Option<String> {
484 if let Some(decoded) = decode_utf16_bom_text(bytes) {
485 return Some(decoded);
486 }
487
488 let stripped = strip_corrupted_utf16_bom_prefix(bytes);
489 match detect_utf16_endianness(bytes) {
490 Some(true) => decode_utf16_units(stripped, true, true),
491 Some(false) => decode_utf16_units(stripped, false, true),
492 None => None,
493 }
494}
495
496fn decode_utf16_json_text(bytes: &[u8]) -> Option<String> {
497 if bytes.len() >= 2 {
498 let (is_le, body) = match bytes {
499 [0xFF, 0xFE, rest @ ..] => (true, rest),
500 [0xFE, 0xFF, rest @ ..] => (false, rest),
501 _ => {
502 let stripped = strip_corrupted_utf16_bom_prefix(bytes);
503 return match detect_utf16_endianness(bytes) {
504 Some(true) => decode_utf16_units(stripped, true, false),
505 Some(false) => decode_utf16_units(stripped, false, false),
506 None => None,
507 };
508 }
509 };
510
511 if body.is_empty() || !body.len().is_multiple_of(2) {
512 return None;
513 }
514
515 return decode_utf16_units(body, is_le, false);
516 }
517
518 None
519}
520
521fn decode_utf16_bom_text(bytes: &[u8]) -> Option<String> {
522 if bytes.len() < 2 || !bytes.len().is_multiple_of(2) {
523 return None;
524 }
525
526 let (is_le, body) = match bytes {
527 [0xFF, 0xFE, rest @ ..] => (true, rest),
528 [0xFE, 0xFF, rest @ ..] => (false, rest),
529 _ => return None,
530 };
531
532 if body.is_empty() || body.len() % 2 != 0 {
533 return None;
534 }
535
536 decode_utf16_units(body, is_le, true)
537}
538
539fn has_binary_control_chars(bytes: &[u8]) -> bool {
540 let control_count = bytes
541 .iter()
542 .filter(|&&b| b < 0x09 || (b > 0x0D && b < 0x20))
543 .count();
544 control_count > bytes.len() / BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR
545}
546
547fn has_decodable_text(bytes: &[u8]) -> bool {
548 bytes.is_empty()
549 || is_utf8_text(bytes)
550 || decode_utf16_text(bytes).is_some()
551 || !has_binary_control_chars(bytes)
552}
553
554fn looks_like_textual_bytes(bytes: &[u8]) -> bool {
555 if bytes.is_empty() || is_utf8_text(bytes) {
556 return true;
557 }
558 if let Some(decoded) = decode_utf16_text(bytes) {
559 return decoded
560 .chars()
561 .any(|ch| !ch.is_control() || matches!(ch, '\n' | '\r' | '\t'));
562 }
563
564 let printable_count = bytes
565 .iter()
566 .filter(|&&b| matches!(b, b'\n' | b'\r' | b'\t') || (0x20..=0x7e).contains(&b))
567 .count();
568 printable_count * 2 >= bytes.len()
569}
570
/// Returns `true` for media types that denote textual content: anything under
/// `text/`, plain JSON or XML, and types with a `+json` or `+xml` suffix.
fn is_textual_media_type(media_type: &str) -> bool {
    if media_type.starts_with("text/") {
        return true;
    }
    if matches!(
        media_type,
        "application/json" | "application/xml" | "text/xml"
    ) {
        return true;
    }
    ["+json", "+xml"]
        .iter()
        .any(|suffix| media_type.ends_with(*suffix))
}
580
581fn is_textual_format(detected_format: FileFormat) -> bool {
582 matches!(detected_format, FileFormat::Empty | FileFormat::PlainText)
583 || is_textual_media_type(detected_format.media_type())
584}
585
586fn is_known_binary_format(detected_format: FileFormat) -> bool {
587 !matches!(detected_format, FileFormat::ArbitraryBinaryData)
588 && !is_textual_format(detected_format)
589}
590
591pub fn detect_mime_type(
592 path: &Path,
593 bytes: &[u8],
594 detected_format: FileFormat,
595 programming_language: Option<&str>,
596) -> String {
597 if bytes.is_empty() {
598 return "inode/x-empty".to_string();
599 }
600
601 if lower_extension(path).as_deref() == Some("json") {
602 if let Some(is_binary) = json_binary_override(bytes) {
603 if is_binary {
604 return "application/octet-stream".to_string();
605 }
606 if has_valid_json_text(bytes) {
607 return "application/json".to_string();
608 }
609 return "text/plain".to_string();
610 }
611 if has_valid_json_text(bytes) {
612 return "application/json".to_string();
613 }
614 if has_decodable_text(bytes) && looks_like_textual_bytes(bytes) {
615 return "text/plain".to_string();
616 }
617 return "application/octet-stream".to_string();
618 }
619
620 if is_zip_archive(bytes) {
621 return detect_zip_like_mime(path);
622 }
623
624 if looks_like_deb(bytes, path) {
625 return "application/vnd.debian.binary-package".to_string();
626 }
627
628 if looks_like_rpm(bytes, path) {
629 return "application/x-rpm".to_string();
630 }
631
632 let guessed_mime = from_path(path)
633 .first_or_octet_stream()
634 .essence_str()
635 .to_string();
636
637 let mime_type = match detected_format {
638 FileFormat::Empty => "inode/x-empty".to_string(),
639 FileFormat::PlainText => {
640 if guessed_mime == "application/octet-stream" || guessed_mime.starts_with("video/") {
641 "text/plain".to_string()
642 } else {
643 guessed_mime.clone()
644 }
645 }
646 _ => {
647 let detected_mime = detected_format.media_type();
648 if detected_mime == "application/octet-stream"
649 && guessed_mime != "application/octet-stream"
650 {
651 guessed_mime.clone()
652 } else {
653 detected_mime.to_string()
654 }
655 }
656 };
657
658 normalize_mime_type(path, bytes, programming_language, &mime_type)
659}
660
661fn normalize_mime_type(
662 path: &Path,
663 bytes: &[u8],
664 programming_language: Option<&str>,
665 mime_type: &str,
666) -> String {
667 if should_prefer_text_mime(path, bytes, programming_language, mime_type) {
668 return "text/plain".to_string();
669 }
670
671 mime_type.to_string()
672}
673
674fn should_prefer_text_mime(
675 path: &Path,
676 bytes: &[u8],
677 programming_language: Option<&str>,
678 mime_type: &str,
679) -> bool {
680 has_decodable_text(bytes)
681 && looks_like_textual_bytes(bytes)
682 && is_textual_source_candidate(path, programming_language)
683 && (mime_type.starts_with("video/") || mime_type == "application/octet-stream")
684}
685
686fn has_valid_json_text(bytes: &[u8]) -> bool {
687 if bytes.len() > JSON_VALIDATION_MAX_BYTES {
688 return false;
689 }
690
691 serde_json::from_slice::<serde_json::Value>(bytes).is_ok()
692 || decode_utf16_json_text(bytes)
693 .and_then(|text| serde_json::from_str::<serde_json::Value>(&text).ok())
694 .is_some()
695}
696
/// Returns `true` for content of at least eight bytes that starts with `["`,
/// ends with `"]`, and contains no `0x00` or `0xFF` bytes.
fn is_wrapped_invalid_json_string_text(bytes: &[u8]) -> bool {
    if bytes.len() < 8 {
        return false;
    }

    let is_wrapped = bytes.starts_with(b"[\"") && bytes.ends_with(b"\"]");
    is_wrapped && !bytes.iter().any(|&byte| byte == 0 || byte == 0xFF)
}
704
705fn json_binary_override(bytes: &[u8]) -> Option<bool> {
706 if has_valid_json_text(bytes) {
707 return Some(false);
708 }
709
710 if bytes.contains(&0) {
711 return Some(true);
712 }
713
714 if bytes.contains(&0xFF) && (bytes.len() <= 5 || bytes.len() > 1024) {
715 return Some(true);
716 }
717
718 if is_wrapped_invalid_json_string_text(bytes) {
719 return Some(false);
720 }
721
722 None
723}
724
725fn detect_is_binary(
726 path: &Path,
727 bytes: &[u8],
728 detected_format: FileFormat,
729 programming_language: Option<&str>,
730) -> bool {
731 if lower_extension(path).as_deref() == Some("json")
732 && let Some(is_binary) = json_binary_override(bytes)
733 {
734 return is_binary;
735 }
736
737 if is_textual_format(detected_format) {
738 return false;
739 }
740
741 if lower_extension(path)
742 .as_deref()
743 .is_some_and(|ext| BINARY_EXTENSIONS.contains(&ext))
744 {
745 return true;
746 }
747
748 if should_treat_binary_bytes_as_text(path, bytes, programming_language) {
749 return false;
750 }
751
752 has_binary_control_chars(bytes)
753 || is_known_binary_format(detected_format)
754 || (matches!(detected_format, FileFormat::ArbitraryBinaryData)
755 && !looks_like_textual_bytes(bytes))
756}
757
758fn should_treat_binary_bytes_as_text(
759 path: &Path,
760 bytes: &[u8],
761 programming_language: Option<&str>,
762) -> bool {
763 has_decodable_text(bytes)
764 && looks_like_textual_bytes(bytes)
765 && (bytes.starts_with(b"#!") || is_textual_source_candidate(path, programming_language))
766}
767
768fn detect_is_archive(
769 path: &Path,
770 bytes: &[u8],
771 mime_type: &str,
772 is_text: bool,
773 detected_format: FileFormat,
774) -> bool {
775 if is_text {
776 return false;
777 }
778
779 lower_extension(path)
780 .as_deref()
781 .is_some_and(|ext| ARCHIVE_EXTENSIONS.contains(&ext))
782 || matches!(
783 detected_format.kind(),
784 FileFormatKind::Archive | FileFormatKind::Compressed | FileFormatKind::Package
785 )
786 || is_zip_archive(bytes)
787 || looks_like_gzip(bytes)
788 || looks_like_bzip2(bytes)
789 || looks_like_xz(bytes)
790 || looks_like_deb(bytes, path)
791 || looks_like_rpm(bytes, path)
792 || looks_like_squashfs(bytes, path)
793 || mime_type.contains("zip")
794 || mime_type.contains("compressed")
795 || mime_type.contains("tar")
796 || mime_type.contains("x-rpm")
797 || mime_type.contains("debian")
798}
799
800fn detect_is_media(
801 path: &Path,
802 bytes: &[u8],
803 mime_type: &str,
804 detected_format: FileFormat,
805) -> bool {
806 media_mime_from_content(bytes).is_some()
807 || matches!(
808 detected_format.kind(),
809 FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
810 )
811 || mime_type.starts_with("image/")
812 || mime_type.starts_with("audio/")
813 || mime_type.starts_with("video/")
814 || (mime_type == "application/octet-stream"
815 && lower_extension(path).as_deref() == Some("tga")
816 && !has_binary_control_chars(bytes))
817}
818
819fn detect_is_script(
820 path: &Path,
821 bytes: &[u8],
822 programming_language: Option<&str>,
823 is_text: bool,
824) -> bool {
825 if !is_text || is_makefile(path) {
826 return false;
827 }
828
829 bytes.starts_with(b"#!")
830 || lower_extension(path).as_deref().is_some_and(|ext| {
831 matches!(
832 ext,
833 "sh" | "bash" | "zsh" | "fish" | "ksh" | "ps1" | "psm1" | "psd1" | "awk"
834 )
835 })
836 || matches!(
837 programming_language,
838 Some(
839 "Shell"
840 | "Bash"
841 | "Zsh"
842 | "Fish"
843 | "Ksh"
844 | "Python"
845 | "Ruby"
846 | "Perl"
847 | "PHP"
848 | "PowerShell"
849 | "Awk"
850 )
851 )
852}
853
854fn detect_is_source(
855 path: &Path,
856 programming_language: Option<&str>,
857 is_text: bool,
858 is_script: bool,
859) -> bool {
860 if !is_text || is_plain_text(path) || is_makefile(path) || is_source_map(path) {
861 return false;
862 }
863
864 if is_c_like_source(path) || is_java_like_source(path) {
865 return true;
866 }
867
868 programming_language.is_some() || is_script
869}
870
871#[allow(clippy::too_many_arguments)]
872fn detect_file_type(
873 path: &Path,
874 bytes: &[u8],
875 detected_format: FileFormat,
876 mime_type: &str,
877 programming_language: Option<&str>,
878 is_binary: bool,
879 is_text: bool,
880 is_archive: bool,
881 is_media: bool,
882 is_script: bool,
883) -> String {
884 if bytes.is_empty() {
885 return "empty".to_string();
886 }
887
888 if looks_like_pdf(bytes) {
889 return "PDF document".to_string();
890 }
891
892 if let Some(file_type) = media_file_type_from_content(bytes) {
893 return file_type.to_string();
894 }
895
896 if is_archive {
897 return archive_file_type(path, bytes, detected_format);
898 }
899
900 if is_script {
901 return script_file_type(programming_language, bytes);
902 }
903
904 if is_text {
905 if lower_extension(path).as_deref() == Some("json") {
906 if has_valid_json_text(bytes) {
907 return "JSON text data".to_string();
908 }
909 return text_file_type(bytes);
910 }
911 if lower_extension(path).as_deref() == Some("xml") {
912 return "XML text data".to_string();
913 }
914 if matches!(lower_extension(path).as_deref(), Some("yaml" | "yml")) {
915 return "YAML text data".to_string();
916 }
917 if lower_extension(path).as_deref() == Some("toml") {
918 return "TOML text data".to_string();
919 }
920 if matches!(
921 lower_extension(path).as_deref(),
922 Some("ini" | "cfg" | "conf")
923 ) {
924 return "INI text data".to_string();
925 }
926 if matches!(lower_file_name(path).as_str(), ".gitmodules" | ".gitconfig") {
927 return "Git configuration text".to_string();
928 }
929 if matches!(lower_extension(path).as_deref(), Some("md" | "markdown")) {
930 return text_file_type(bytes);
931 }
932 if programming_language.is_some() && !is_media {
933 return source_file_type(programming_language, bytes);
934 }
935 return text_file_type(bytes);
936 }
937
938 if let Some(file_type) = format_based_file_type(detected_format) {
939 return file_type;
940 }
941
942 if is_binary && mime_type == "application/octet-stream" {
943 return "data".to_string();
944 }
945
946 mime_type.to_string()
947}
948
949fn is_textual_source_candidate(path: &Path, programming_language: Option<&str>) -> bool {
950 if matches!(programming_language, Some(language) if is_source_like_language(language)) {
951 return true;
952 }
953
954 if matches!(
955 lower_file_name(path).as_str(),
956 "dockerfile"
957 | "containerfile"
958 | "containerfile.core"
959 | "apkbuild"
960 | "podfile"
961 | "jamfile"
962 | "jamroot"
963 | "meson.build"
964 | "build"
965 | "workspace"
966 | "buck"
967 | "default.nix"
968 | "flake.nix"
969 | "shell.nix"
970 ) {
971 return true;
972 }
973
974 path.extension()
975 .and_then(|ext| ext.to_str())
976 .is_some_and(|ext| {
977 matches!(
978 ext.to_ascii_lowercase().as_str(),
979 "rs" | "py"
980 | "js"
981 | "mjs"
982 | "cjs"
983 | "jsx"
984 | "ts"
985 | "mts"
986 | "cts"
987 | "tsx"
988 | "c"
989 | "cpp"
990 | "cc"
991 | "cxx"
992 | "h"
993 | "hpp"
994 | "m"
995 | "mm"
996 | "s"
997 | "asm"
998 | "java"
999 | "go"
1000 | "rb"
1001 | "php"
1002 | "pl"
1003 | "swift"
1004 | "sh"
1005 | "bash"
1006 | "zsh"
1007 | "fish"
1008 | "ksh"
1009 | "ps1"
1010 | "psm1"
1011 | "psd1"
1012 | "awk"
1013 | "kt"
1014 | "kts"
1015 | "dart"
1016 | "scala"
1017 | "groovy"
1018 | "gradle"
1019 | "gvy"
1020 | "gy"
1021 | "gsh"
1022 | "cs"
1023 | "fs"
1024 | "fsx"
1025 | "r"
1026 | "lua"
1027 | "jl"
1028 | "ex"
1029 | "exs"
1030 | "clj"
1031 | "cljs"
1032 | "cljc"
1033 | "hs"
1034 | "erl"
1035 | "nix"
1036 | "zig"
1037 | "bzl"
1038 | "bazel"
1039 | "star"
1040 | "sky"
1041 | "ml"
1042 | "mli"
1043 | "tex"
1044 )
1045 })
1046}
1047
/// Returns `true` when `language` (matched case-sensitively) is one of the
/// languages treated as source code.
fn is_source_like_language(language: &str) -> bool {
    const SOURCE_LIKE_LANGUAGES: &[&str] = &[
        "Rust",
        "Python",
        "JavaScript",
        "TypeScript",
        "JavaScript/TypeScript",
        "C",
        "C++",
        "Objective-C",
        "Objective-C++",
        "GAS",
        "Java",
        "Go",
        "Ruby",
        "PHP",
        "Perl",
        "Swift",
        "Shell",
        "PowerShell",
        "Awk",
        "Kotlin",
        "Dart",
        "Scala",
        "C#",
        "F#",
        "R",
        "Lua",
        "Julia",
        "Elixir",
        "Clojure",
        "Haskell",
        "Erlang",
        "Groovy",
        "Nix",
        "Zig",
        "Starlark",
        "OCaml",
        "Meson",
        "TeX",
        "Dockerfile",
        "Makefile",
        "Jamfile",
    ];

    SOURCE_LIKE_LANGUAGES.contains(&language)
}
1094
/// Returns the path's extension as `&str`, or `None` when there is no
/// extension or it is not valid UTF-8.
fn extension(path: &Path) -> Option<&str> {
    path.extension()?.to_str()
}
1098
1099fn lower_extension(path: &Path) -> Option<String> {
1100 extension(path).map(|ext| ext.to_ascii_lowercase())
1101}
1102
/// Returns the final path component lowercased (ASCII only), or an empty
/// string when there is none or it is not valid UTF-8.
fn lower_file_name(path: &Path) -> String {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name.to_ascii_lowercase(),
        None => String::new(),
    }
}
1109
1110fn is_plain_text(path: &Path) -> bool {
1111 lower_extension(path)
1112 .as_deref()
1113 .is_some_and(|ext| PLAIN_TEXT_EXTENSIONS.contains(&ext))
1114}
1115
1116fn is_makefile(path: &Path) -> bool {
1117 matches!(lower_file_name(path).as_str(), "makefile" | "makefile.inc")
1118}
1119
/// Returns `true` when the path ends in `.js.map` or `.css.map`, ignoring
/// ASCII case.
fn is_source_map(path: &Path) -> bool {
    let lowered = path.to_string_lossy().to_ascii_lowercase();
    [".js.map", ".css.map"]
        .iter()
        .any(|suffix| lowered.ends_with(*suffix))
}
1124
1125fn is_c_like_source(path: &Path) -> bool {
1126 lower_extension(path).as_deref().is_some_and(|ext| {
1127 matches!(
1128 ext,
1129 "c" | "cc"
1130 | "cp"
1131 | "cpp"
1132 | "cxx"
1133 | "c++"
1134 | "h"
1135 | "hh"
1136 | "hpp"
1137 | "hxx"
1138 | "h++"
1139 | "i"
1140 | "ii"
1141 | "m"
1142 | "s"
1143 | "asm"
1144 )
1145 })
1146}
1147
1148fn is_java_like_source(path: &Path) -> bool {
1149 lower_extension(path)
1150 .as_deref()
1151 .is_some_and(|ext| matches!(ext, "java" | "aj" | "jad" | "ajt"))
1152}
1153
1154fn format_based_file_type(detected_format: FileFormat) -> Option<String> {
1155 match detected_format {
1156 FileFormat::ArbitraryBinaryData | FileFormat::Empty | FileFormat::PlainText => None,
1157 format if format.short_name() == Some("PDF") => Some("PDF document".to_string()),
1158 format => Some(match format.kind() {
1159 FileFormatKind::Image => short_name_or_name(&format, "image data"),
1160 FileFormatKind::Audio => short_name_or_name(&format, "audio data"),
1161 FileFormatKind::Video => short_name_or_name(&format, "video data"),
1162 _ => format.name().to_string(),
1163 }),
1164 }
1165}
1166
1167fn short_name_or_name(format: &FileFormat, suffix: &str) -> String {
1168 format
1169 .short_name()
1170 .map(|short_name| format!("{short_name} {suffix}"))
1171 .unwrap_or_else(|| format!("{} {suffix}", format.name()))
1172}
1173
1174fn detect_zip_like_mime(path: &Path) -> String {
1175 match extension(path).map(|ext| ext.to_ascii_lowercase()) {
1176 Some(ext) if ext == "apk" => "application/vnd.android.package-archive".to_string(),
1177 Some(ext) if matches!(ext.as_str(), "jar" | "war" | "ear") => {
1178 "application/java-archive".to_string()
1179 }
1180 _ => "application/zip".to_string(),
1181 }
1182}
1183
/// Identifies PNG, JPEG, TIFF, and WebP content by magic number and returns
/// the matching MIME type.
fn media_mime_from_content(bytes: &[u8]) -> Option<&'static str> {
    if bytes.starts_with(b"\x89PNG\r\n\x1a\n") {
        return Some("image/png");
    }
    if bytes.starts_with(&[0xff, 0xd8, 0xff]) {
        return Some("image/jpeg");
    }

    let is_tiff = bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a");
    if is_tiff {
        return Some("image/tiff");
    }

    // WebP is a RIFF container with the form type at offset 8.
    let is_webp = bytes.len() >= 12 && bytes.starts_with(b"RIFF") && bytes[8..12] == *b"WEBP";
    is_webp.then_some("image/webp")
}
1197
/// Identifies PNG, JPEG, TIFF, and WebP content by magic number and returns a
/// human-readable description.
fn media_file_type_from_content(bytes: &[u8]) -> Option<&'static str> {
    if bytes.starts_with(b"\x89PNG\r\n\x1a\n") {
        return Some("PNG image data");
    }
    if bytes.starts_with(&[0xff, 0xd8, 0xff]) {
        return Some("JPEG image data");
    }

    let is_tiff = bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a");
    if is_tiff {
        return Some("TIFF image data");
    }

    // WebP is a RIFF container with the form type at offset 8.
    let is_webp = bytes.len() >= 12 && bytes.starts_with(b"RIFF") && bytes[8..12] == *b"WEBP";
    is_webp.then_some("WebP image data")
}
1211
/// Returns `true` when the content begins with the `%PDF-` header.
fn looks_like_pdf(bytes: &[u8]) -> bool {
    matches!(bytes, [b'%', b'P', b'D', b'F', b'-', ..])
}
1215
/// Returns `true` when the (already lowercased) extension is `rtf` or the
/// content begins with the `{\rtf` header.
fn looks_like_rtf(bytes: &[u8], ext: Option<&str>) -> bool {
    if matches!(ext, Some("rtf")) {
        return true;
    }
    bytes.starts_with(b"{\\rtf")
}
1219
/// Extracts a plain-text approximation of an RTF document.
///
/// The input is decoded lossily as UTF-8 and scanned token by token:
///
/// * group braces `{` / `}` are dropped;
/// * `\\`, `\{` and `\}` emit the escaped character;
/// * `\'hh` emits the byte `hh` mapped directly to the code point of the same
///   value (no code-page translation is attempted);
/// * a handful of control words (`\par`, `\line`, `\tab`, dashes, bullet,
///   quotes) are mapped to text, and `\uN` emits the Unicode character `N`
///   and skips one following fallback character;
/// * every other control word is discarded (its numeric parameter and one
///   delimiting space are consumed);
/// * any other character is copied through.
///
/// `\uN` values are 16-bit and may be negative; a high surrogate immediately
/// followed by a low surrogate `\uN` is combined into a single character so
/// non-BMP text is preserved. A surrogate half that is not part of such a
/// directly adjacent pair is dropped.
///
/// Finally `\r` and form feeds become newlines and trailing whitespace is
/// trimmed from every line.
fn extract_rtf_text(bytes: &[u8]) -> String {
    let text = String::from_utf8_lossy(bytes);
    let chars: Vec<char> = text.chars().collect();
    let mut output = String::new();
    let mut index = 0usize;
    // High surrogate produced by the previous `\uN` token, if any.
    let mut pending_high_surrogate: Option<u32> = None;

    while index < chars.len() {
        // Each loop iteration consumes exactly one token. Taking the pending
        // value here means it only survives when the *next* token is the
        // matching `\uN`; any other token discards it.
        let carried_high_surrogate = pending_high_surrogate.take();

        match chars[index] {
            '{' | '}' => {
                index += 1;
            }
            '\\' => {
                index += 1;
                if index >= chars.len() {
                    break;
                }

                match chars[index] {
                    '\\' | '{' | '}' => {
                        output.push(chars[index]);
                        index += 1;
                    }
                    '\'' => {
                        // `\'hh`: two hex digits must follow the apostrophe.
                        if index + 2 < chars.len() {
                            let hex: String = chars[index + 1..=index + 2].iter().collect();
                            if let Ok(value) = u8::from_str_radix(&hex, 16) {
                                output.push(value as char);
                                index += 3;
                                continue;
                            }
                        }
                        // Malformed escape: skip just the apostrophe.
                        index += 1;
                    }
                    control if control.is_ascii_alphabetic() => {
                        let start = index;
                        while index < chars.len() && chars[index].is_ascii_alphabetic() {
                            index += 1;
                        }
                        let control_word: String = chars[start..index].iter().collect();

                        // Optional signed decimal parameter.
                        let number_start = index;
                        if index < chars.len()
                            && (chars[index] == '-' || chars[index].is_ascii_digit())
                        {
                            index += 1;
                            while index < chars.len() && chars[index].is_ascii_digit() {
                                index += 1;
                            }
                        }
                        let parameter: String = chars[number_start..index].iter().collect();

                        // A single space after a control word is a delimiter, not text.
                        if index < chars.len() && chars[index] == ' ' {
                            index += 1;
                        }

                        match control_word.as_str() {
                            "par" | "line" => output.push('\n'),
                            "tab" => output.push('\t'),
                            "emdash" => output.push('—'),
                            "endash" => output.push('–'),
                            "bullet" => output.push('•'),
                            "lquote" | "rquote" => output.push('\''),
                            "ldblquote" | "rdblquote" => output.push('"'),
                            "u" => {
                                if let Ok(codepoint) = parameter.parse::<i32>() {
                                    // Negative values are the signed 16-bit form of the code unit.
                                    let normalized = if codepoint < 0 {
                                        codepoint + 65_536
                                    } else {
                                        codepoint
                                    };
                                    if let Ok(unit) = u32::try_from(normalized) {
                                        match (carried_high_surrogate, unit) {
                                            (Some(high), 0xDC00..=0xDFFF) => {
                                                let scalar = 0x1_0000
                                                    + ((high - 0xD800) << 10)
                                                    + (unit - 0xDC00);
                                                if let Some(ch) = char::from_u32(scalar) {
                                                    output.push(ch);
                                                }
                                            }
                                            (_, 0xD800..=0xDBFF) => {
                                                pending_high_surrogate = Some(unit);
                                            }
                                            _ => {
                                                // Lone low surrogates and out-of-range
                                                // values yield `None` and are dropped.
                                                if let Some(ch) = char::from_u32(unit) {
                                                    output.push(ch);
                                                }
                                            }
                                        }
                                    }
                                }

                                // Skip the single fallback character that follows `\uN`,
                                // unless the next thing is structural.
                                if index < chars.len()
                                    && !matches!(chars[index], '\\' | '{' | '}' | '\n' | '\r')
                                {
                                    index += 1;
                                }
                            }
                            _ => {}
                        }
                    }
                    _ => {
                        // Unknown control symbol: drop it.
                        index += 1;
                    }
                }
            }
            ch => {
                output.push(ch);
                index += 1;
            }
        }
    }

    output
        .replace(['\r', '\u{0c}'], "\n")
        .lines()
        .map(str::trim_end)
        .collect::<Vec<_>>()
        .join("\n")
}
1326
/// Returns `true` when the content starts with the two-byte gzip magic `1f 8b`.
fn looks_like_gzip(bytes: &[u8]) -> bool {
    matches!(bytes, [0x1f, 0x8b, ..])
}
1330
/// Returns `true` when the content starts with the bzip2 magic `BZh`.
fn looks_like_bzip2(bytes: &[u8]) -> bool {
    bytes.get(..3) == Some(&b"BZh"[..])
}
1334
/// Returns `true` when the content starts with the six-byte XZ stream magic.
fn looks_like_xz(bytes: &[u8]) -> bool {
    const XZ_MAGIC: [u8; 6] = [0xfd, b'7', b'z', b'X', b'Z', 0x00];
    bytes.starts_with(&XZ_MAGIC)
}
1338
1339fn looks_like_deb(bytes: &[u8], path: &Path) -> bool {
1340 lower_extension(path).as_deref() == Some("deb") && bytes.starts_with(b"!<arch>\n")
1341}
1342
1343fn looks_like_rpm(bytes: &[u8], path: &Path) -> bool {
1344 lower_extension(path).as_deref() == Some("rpm") && bytes.starts_with(&[0xed, 0xab, 0xee, 0xdb])
1345}
1346
1347fn looks_like_squashfs(bytes: &[u8], path: &Path) -> bool {
1348 lower_extension(path)
1349 .as_deref()
1350 .is_some_and(|ext| matches!(ext, "sqs" | "squashfs"))
1351 && (bytes.starts_with(&[0x68, 0x73, 0x71, 0x73])
1352 || bytes.starts_with(&[0x73, 0x71, 0x73, 0x68]))
1353}
1354
1355fn archive_file_type(path: &Path, bytes: &[u8], detected_format: FileFormat) -> String {
1356 if looks_like_deb(bytes, path) {
1357 "debian binary package (format 2.0)".to_string()
1358 } else if looks_like_rpm(bytes, path) {
1359 "RPM package".to_string()
1360 } else if looks_like_squashfs(bytes, path) {
1361 "Squashfs filesystem".to_string()
1362 } else if looks_like_gzip(bytes) {
1363 "gzip compressed data".to_string()
1364 } else if looks_like_bzip2(bytes) {
1365 "bzip2 compressed data".to_string()
1366 } else if looks_like_xz(bytes) {
1367 "XZ compressed data".to_string()
1368 } else if is_zip_archive(bytes) {
1369 "Zip archive data".to_string()
1370 } else if lower_extension(path).as_deref() == Some("gem") {
1371 "POSIX tar archive".to_string()
1372 } else if let Some(file_type) = format_based_file_type(detected_format) {
1373 file_type
1374 } else {
1375 "archive data".to_string()
1376 }
1377}
1378
1379fn script_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1380 let suffix = text_executable_label(bytes);
1381
1382 match programming_language {
1383 Some("Python") => format!("python script, {suffix}"),
1384 Some("Ruby") => format!("ruby script, {suffix}"),
1385 Some("Perl") => format!("perl script, {suffix}"),
1386 Some("PHP") => format!("php script, {suffix}"),
1387 Some("Shell") => format!("shell script, {suffix}"),
1388 Some("Bash") => format!("bash script, {suffix}"),
1389 Some("Zsh") => format!("zsh script, {suffix}"),
1390 Some("Fish") => format!("fish script, {suffix}"),
1391 Some("Ksh") => format!("ksh script, {suffix}"),
1392 Some("JavaScript") => format!("javascript script, {suffix}"),
1393 Some("TypeScript") => format!("typescript script, {suffix}"),
1394 Some("PowerShell") => format!("powershell script, {suffix}"),
1395 Some("Awk") => format!("awk script, {suffix}"),
1396 _ => format!("script, {suffix}"),
1397 }
1398}
1399
1400fn source_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1401 let suffix = text_label(bytes);
1402 match programming_language {
1403 Some("C") => format!("C source, {suffix}"),
1404 Some("C++") => format!("C++ source, {suffix}"),
1405 Some("Java") => format!("Java source, {suffix}"),
1406 Some("C#") => format!("C# source, {suffix}"),
1407 Some("F#") => format!("F# source, {suffix}"),
1408 Some("Go") => format!("Go source, {suffix}"),
1409 Some("Rust") => format!("Rust source, {suffix}"),
1410 Some("Starlark") => format!("Starlark source, {suffix}"),
1411 Some("CMake") => format!("CMake source, {suffix}"),
1412 Some("Meson") => format!("Meson source, {suffix}"),
1413 Some("Nix") => format!("Nix source, {suffix}"),
1414 Some("Groovy") => format!("Groovy source, {suffix}"),
1415 Some("Makefile") => format!("Makefile source, {suffix}"),
1416 Some("Dockerfile") => format!("Dockerfile source, {suffix}"),
1417 Some("Jamfile") => format!("Jamfile source, {suffix}"),
1418 Some("Batchfile") => format!("Batchfile source, {suffix}"),
1419 Some(language) => format!("{language} source, {suffix}"),
1420 None => text_file_type(bytes),
1421 }
1422}
1423
1424fn text_file_type(bytes: &[u8]) -> String {
1425 text_label(bytes).to_string()
1426}
1427
/// Describes text content by encoding and line structure.
///
/// Valid UTF-8 is labelled `"UTF-8 Unicode text"`, anything else plain
/// `"text"`. Content without any `\n` byte gets the
/// `", with no line terminators"` suffix.
fn text_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.iter().any(|&byte| byte == b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text",
        (true, false) => "UTF-8 Unicode text, with no line terminators",
        (false, true) => "text",
        (false, false) => "text, with no line terminators",
    }
}
1441
/// Describes executable text (script) content by encoding and line structure.
///
/// Valid UTF-8 is labelled `"UTF-8 Unicode text executable"`, anything else
/// `"text executable"`. Content without any `\n` byte gets the
/// `", with no line terminators"` suffix.
fn text_executable_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.iter().any(|&byte| byte == b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text executable",
        (true, false) => "UTF-8 Unicode text executable, with no line terminators",
        (false, true) => "text executable",
        (false, false) => "text executable, with no line terminators",
    }
}
1455
1456fn supported_image_metadata_format(
1457 ext: Option<&str>,
1458 detected_format: FileFormat,
1459) -> Option<ImageFormat> {
1460 match ext {
1461 Some("jpg" | "jpeg") => Some(ImageFormat::Jpeg),
1462 Some("png") => Some(ImageFormat::Png),
1463 Some("tif" | "tiff") => Some(ImageFormat::Tiff),
1464 Some("webp") => Some(ImageFormat::WebP),
1465 _ => match detected_format.media_type() {
1466 "image/jpeg" => Some(ImageFormat::Jpeg),
1467 "image/png" => Some(ImageFormat::Png),
1468 "image/tiff" => Some(ImageFormat::Tiff),
1469 "image/webp" => Some(ImageFormat::WebP),
1470 _ => None,
1471 },
1472 }
1473}
1474
1475fn should_skip_binary_string_extraction(
1476 path: &Path,
1477 bytes: &[u8],
1478 detected_format: FileFormat,
1479) -> bool {
1480 matches!(lower_extension(path).as_deref(), Some("pdf"))
1481 || supported_image_metadata_format(lower_extension(path).as_deref(), detected_format)
1482 .is_some()
1483 || (matches!(
1484 detected_format.kind(),
1485 FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
1486 ) && !is_textual_format(detected_format))
1487 || media_mime_from_content(bytes).is_some()
1488 || is_zip_archive(bytes)
1489 || looks_like_gzip(bytes)
1490 || looks_like_bzip2(bytes)
1491 || looks_like_xz(bytes)
1492 || looks_like_deb(bytes, path)
1493 || looks_like_rpm(bytes, path)
1494 || looks_like_squashfs(bytes, path)
1495}
1496
1497fn should_skip_large_opaque_binary_text_extraction(
1498 _path: &Path,
1499 bytes: &[u8],
1500 detected_format: FileFormat,
1501) -> bool {
1502 is_large_opaque_binary_candidate(bytes, detected_format)
1503 && !sample_has_promising_printable_strings(bytes)
1504}
1505
1506fn is_large_opaque_binary_candidate(bytes: &[u8], detected_format: FileFormat) -> bool {
1507 bytes.len() >= LARGE_OPAQUE_BINARY_SKIP_BYTES
1508 && !is_textual_format(detected_format)
1509 && !matches!(
1510 detected_format.kind(),
1511 FileFormatKind::Archive
1512 | FileFormatKind::Compressed
1513 | FileFormatKind::Package
1514 | FileFormatKind::Audio
1515 | FileFormatKind::Image
1516 | FileFormatKind::Video
1517 )
1518}
1519
/// Computes up to three `(start, end)` byte ranges to sample from a buffer
/// of length `len`.
///
/// Each window is at most 64 KiB: one at the head, one centred on the middle
/// (only when `len` exceeds two windows) and one at the tail (only when `len`
/// exceeds one window). Empty and duplicate ranges are omitted, and windows
/// may overlap for inputs between one and two window sizes.
fn sampled_printable_window_ranges(len: usize) -> Vec<(usize, usize)> {
    const SAMPLE_WINDOW_BYTES: usize = 64 * 1024;

    let head = Some((0, len.min(SAMPLE_WINDOW_BYTES)));
    let middle = (len > SAMPLE_WINDOW_BYTES * 2).then(|| {
        let start = len / 2 - SAMPLE_WINDOW_BYTES / 2;
        (start, (start + SAMPLE_WINDOW_BYTES).min(len))
    });
    let tail = (len > SAMPLE_WINDOW_BYTES).then(|| (len - SAMPLE_WINDOW_BYTES, len));

    let mut ranges: Vec<(usize, usize)> = Vec::with_capacity(3);
    for candidate in [head, middle, tail] {
        if let Some((start, end)) = candidate {
            if start < end && !ranges.contains(&(start, end)) {
                ranges.push((start, end));
            }
        }
    }

    ranges
}
1542
1543fn sample_has_promising_printable_strings(bytes: &[u8]) -> bool {
1544 let mut structured_signal_seen = false;
1545 let promising_license_windows = sampled_printable_window_ranges(bytes.len())
1546 .into_iter()
1547 .filter(|&(start, end)| {
1548 let window = &bytes[start..end];
1549 if has_strong_structured_text_signal(window) {
1550 structured_signal_seen = true;
1551 }
1552 has_license_or_notice_signal(window)
1553 })
1554 .count();
1555
1556 structured_signal_seen || promising_license_windows >= 2
1557}
1558
1559fn extract_sampled_printable_strings(bytes: &[u8]) -> String {
1560 let mut combined_lines = BTreeSet::new();
1561
1562 for (start, end) in sampled_printable_window_ranges(bytes.len()) {
1563 let window_text = extract_printable_strings(&bytes[start..end]);
1564 for line in window_text
1565 .lines()
1566 .map(str::trim)
1567 .filter(|line| !line.is_empty())
1568 {
1569 combined_lines.insert(line.to_string());
1570 }
1571 }
1572
1573 combined_lines.into_iter().collect::<Vec<_>>().join("\n")
1574}
1575
1576fn has_license_or_notice_signal(bytes: &[u8]) -> bool {
1577 let strings = extract_printable_strings(bytes);
1578 if strings.is_empty() {
1579 return false;
1580 }
1581
1582 let lower = strings.to_ascii_lowercase();
1583 [
1584 "copyright",
1585 "license",
1586 "licensed under",
1587 "all rights reserved",
1588 "permission is hereby granted",
1589 "redistribution and use",
1590 "spdx-license-identifier",
1591 ]
1592 .iter()
1593 .any(|marker| lower.contains(marker))
1594}
1595
1596fn has_strong_structured_text_signal(bytes: &[u8]) -> bool {
1597 let strings = extract_printable_strings(bytes);
1598 if strings.is_empty() {
1599 return false;
1600 }
1601
1602 let email_markers = strings.matches('@').count();
1603 let url_markers = strings.matches("http://").count() + strings.matches("https://").count();
1604
1605 email_markers + url_markers >= 3
1606}
1607
1608fn is_supported_image_container(bytes: &[u8], format: ImageFormat) -> bool {
1609 match format {
1610 ImageFormat::Png => bytes.starts_with(b"\x89PNG\r\n\x1a\n"),
1611 ImageFormat::Jpeg => bytes.starts_with(&[0xff, 0xd8, 0xff]),
1612 ImageFormat::Tiff => bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a"),
1613 ImageFormat::WebP => {
1614 bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP"
1615 }
1616 _ => false,
1617 }
1618}
1619
1620fn extract_image_metadata_text(bytes: &[u8], format: ImageFormat) -> String {
1621 let mut values = Vec::new();
1622 values.extend(extract_exif_metadata_values(bytes));
1623 values.extend(extract_xmp_metadata_values(bytes, format));
1624 values_to_text(values)
1625}
1626
1627fn extract_exif_metadata_values(bytes: &[u8]) -> Vec<String> {
1628 let mut cursor = BufReader::new(Cursor::new(bytes));
1629 let exif = match exif::Reader::new().read_from_container(&mut cursor) {
1630 Ok(exif) => exif,
1631 Err(_) => return Vec::new(),
1632 };
1633
1634 let mut values = Vec::new();
1635 for field in exif.fields() {
1636 let rendered = match field.tag {
1637 exif::Tag::ImageDescription => Some(format_metadata_field(
1638 "Description",
1639 &field.display_value().with_unit(&exif).to_string(),
1640 )),
1641 exif::Tag::Copyright => Some(format_metadata_field(
1642 "Copyright",
1643 &field.display_value().with_unit(&exif).to_string(),
1644 )),
1645 exif::Tag::UserComment => Some(format_metadata_field(
1646 "Comment",
1647 &field.display_value().with_unit(&exif).to_string(),
1648 )),
1649 exif::Tag::Artist => Some(format_metadata_field(
1650 "Author",
1651 &field.display_value().with_unit(&exif).to_string(),
1652 )),
1653 _ => None,
1654 };
1655
1656 if let Some(rendered) = rendered {
1657 values.push(rendered);
1658 }
1659 }
1660
1661 values
1662}
1663
1664fn extract_xmp_metadata_values(bytes: &[u8], format: ImageFormat) -> Vec<String> {
1665 let xmp = match extract_raw_xmp_packet(bytes, format) {
1666 Some(xmp) => xmp,
1667 None => return Vec::new(),
1668 };
1669
1670 parse_xmp_values(&xmp)
1671}
1672
1673fn extract_raw_xmp_packet(bytes: &[u8], format: ImageFormat) -> Option<Vec<u8>> {
1674 let reader = ImageReader::with_format(BufReader::new(Cursor::new(bytes)), format);
1675 if let Ok(mut decoder) = reader.into_decoder()
1676 && let Ok(Some(xmp)) = decoder.xmp_metadata()
1677 {
1678 return Some(xmp);
1679 }
1680
1681 match format {
1682 ImageFormat::Png => extract_png_xmp_packet(bytes),
1683 _ => None,
1684 }
1685}
1686
1687fn extract_png_xmp_packet(bytes: &[u8]) -> Option<Vec<u8>> {
1688 const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";
1689
1690 if bytes.len() < PNG_SIGNATURE.len() || &bytes[..PNG_SIGNATURE.len()] != PNG_SIGNATURE {
1691 return None;
1692 }
1693
1694 let mut offset = PNG_SIGNATURE.len();
1695 while offset + 12 <= bytes.len() {
1696 let length = u32::from_be_bytes([
1697 bytes[offset],
1698 bytes[offset + 1],
1699 bytes[offset + 2],
1700 bytes[offset + 3],
1701 ]) as usize;
1702 let chunk_start = offset + 8;
1703 let chunk_end = chunk_start + length;
1704 if chunk_end + 4 > bytes.len() {
1705 return None;
1706 }
1707
1708 let chunk_type = &bytes[offset + 4..offset + 8];
1709 if chunk_type == b"iTXt" {
1710 let data = &bytes[chunk_start..chunk_end];
1711 if let Some(xmp) = parse_png_itxt_xmp(data) {
1712 return Some(xmp);
1713 }
1714 }
1715
1716 offset = chunk_end + 4;
1717 }
1718
1719 None
1720}
1721
1722fn parse_png_itxt_xmp(data: &[u8]) -> Option<Vec<u8>> {
1723 const XMP_KEYWORD: &[u8] = b"XML:com.adobe.xmp";
1724
1725 let keyword_end = data.iter().position(|&b| b == 0)?;
1726 if &data[..keyword_end] != XMP_KEYWORD {
1727 return None;
1728 }
1729
1730 let mut cursor = keyword_end + 1;
1731 let compression_flag = *data.get(cursor)?;
1732 cursor += 1;
1733 let compression_method = *data.get(cursor)?;
1734 cursor += 1;
1735 if compression_flag > 1 || (compression_flag == 1 && compression_method != 0) {
1736 return None;
1737 }
1738
1739 let language_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1740 cursor = language_end + 1;
1741
1742 let translated_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1743 cursor = translated_end + 1;
1744
1745 let text_bytes = &data[cursor..];
1746 if compression_flag == 1 {
1747 let mut decoder = ZlibDecoder::new(text_bytes);
1748 let mut decoded = Vec::new();
1749 decoder.read_to_end(&mut decoded).ok()?;
1750 Some(decoded)
1751 } else {
1752 Some(text_bytes.to_vec())
1753 }
1754}
1755
1756fn parse_xmp_values(xmp: &[u8]) -> Vec<String> {
1757 let mut reader = XmlReader::from_reader(xmp);
1758 reader.config_mut().trim_text(true);
1759
1760 let mut buf = Vec::new();
1761 let mut stack: Vec<String> = Vec::new();
1762 let mut values = Vec::new();
1763
1764 loop {
1765 match reader.read_event_into(&mut buf) {
1766 Ok(Event::Start(e)) => {
1767 stack.push(local_xml_name(e.name().as_ref()));
1768 }
1769 Ok(Event::End(_)) => {
1770 stack.pop();
1771 }
1772 Ok(Event::Empty(_)) => {}
1773 Ok(Event::Text(text)) => {
1774 if let Some(field) = stack
1775 .iter()
1776 .rev()
1777 .find_map(|name| allowed_xmp_field(name.as_str()))
1778 && let Ok(decoded) = text.decode()
1779 {
1780 let decoded = decoded.into_owned();
1781 if !decoded.trim().is_empty() {
1782 values.push(format_xmp_value(field, &decoded));
1783 }
1784 }
1785 }
1786 Ok(Event::CData(text)) => {
1787 if let Some(field) = stack
1788 .iter()
1789 .rev()
1790 .find_map(|name| allowed_xmp_field(name.as_str()))
1791 && let Ok(decoded) = text.decode()
1792 {
1793 let decoded = decoded.into_owned();
1794 if !decoded.trim().is_empty() {
1795 values.push(format_xmp_value(field, &decoded));
1796 }
1797 }
1798 }
1799 Ok(Event::Eof) | Err(_) => break,
1800 _ => {}
1801 }
1802 buf.clear();
1803 }
1804
1805 values
1806}
1807
/// Returns the local part of a possibly prefixed XML name, i.e. everything
/// after the last `:`.
///
/// Names that are not valid UTF-8 yield an empty string.
fn local_xml_name(name: &[u8]) -> String {
    let qualified = std::str::from_utf8(name).unwrap_or_default();
    match qualified.rfind(':') {
        Some(colon) => qualified[colon + 1..].to_string(),
        None => qualified.to_string(),
    }
}
1812
/// Maps an XMP element's local name to the internal field identifier, or
/// `None` when the element is not one of the extracted fields.
///
/// Matching is exact and case-sensitive.
fn allowed_xmp_field(name: &str) -> Option<&'static str> {
    // (XML local name, internal field identifier)
    const FIELDS: [(&str, &str); 7] = [
        ("creator", "creator"),
        ("rights", "rights"),
        ("description", "description"),
        ("title", "title"),
        ("subject", "subject"),
        ("UsageTerms", "usage_terms"),
        ("WebStatement", "web_statement"),
    ];

    FIELDS
        .iter()
        .find(|entry| entry.0 == name)
        .map(|entry| entry.1)
}
1825
1826fn format_xmp_value(field: &str, value: &str) -> String {
1827 match field {
1828 "creator" => format_metadata_field("Author", value),
1829 "rights" => format_metadata_field("Copyright", value),
1830 "description" => format_metadata_field("Description", value),
1831 "title" => format_metadata_field("Title", value),
1832 "subject" => format_metadata_field("Subject", value),
1833 "usage_terms" => format_metadata_field("UsageTerms", value),
1834 "web_statement" => format_metadata_field("WebStatement", value),
1835 _ => value.to_string(),
1836 }
1837}
1838
/// Joins a label and a value into a single `"label: value"` line.
fn format_metadata_field(label: &str, value: &str) -> String {
    [label, value].join(": ")
}
1842
1843fn values_to_text(values: Vec<String>) -> String {
1844 let mut seen = BTreeSet::new();
1845 let mut normalized_lines = Vec::new();
1846
1847 for value in values {
1848 let normalized = normalize_metadata_value(&value);
1849 if normalized.is_empty() || !seen.insert(normalized.clone()) {
1850 continue;
1851 }
1852
1853 normalized_lines.push(normalized);
1854 }
1855
1856 let author_values: BTreeSet<String> = normalized_lines
1857 .iter()
1858 .filter_map(|line| split_metadata_field(line))
1859 .filter(|(label, _)| label.eq_ignore_ascii_case("Author"))
1860 .map(|(_, value)| value.to_string())
1861 .collect();
1862
1863 let mut lines = Vec::new();
1864 let mut total_bytes = 0usize;
1865
1866 for normalized in normalized_lines {
1867 if lines.len() >= MAX_IMAGE_METADATA_VALUES {
1868 break;
1869 }
1870
1871 if should_suppress_bare_copyright_metadata_line(&normalized, &author_values) {
1872 continue;
1873 }
1874
1875 let added_bytes = normalized.len() + usize::from(!lines.is_empty());
1876 if total_bytes + added_bytes > MAX_IMAGE_METADATA_TEXT_BYTES {
1877 break;
1878 }
1879
1880 total_bytes += added_bytes;
1881 lines.push(normalized);
1882 }
1883
1884 lines.join("\n")
1885}
1886
/// Splits a `"label: value"` line at its first colon and trims both halves.
///
/// Returns `None` when the line contains no colon.
fn split_metadata_field(line: &str) -> Option<(&str, &str)> {
    let colon = line.find(':')?;
    let label = line[..colon].trim();
    let value = line[colon + 1..].trim();
    Some((label, value))
}
1891
1892fn should_suppress_bare_copyright_metadata_line(
1893 line: &str,
1894 author_values: &BTreeSet<String>,
1895) -> bool {
1896 let Some((label, value)) = split_metadata_field(line) else {
1897 return false;
1898 };
1899 if !label.eq_ignore_ascii_case("Copyright")
1900 || value.is_empty()
1901 || !author_values.contains(value)
1902 {
1903 return false;
1904 }
1905
1906 let lower = value.to_ascii_lowercase();
1907 !lower.contains("copyright")
1908 && !lower.contains("(c)")
1909 && !lower.contains('©')
1910 && !lower.contains("all rights")
1911 && !value.chars().any(|ch| ch.is_ascii_digit())
1912}
1913
/// Normalizes a metadata value: NUL characters are removed first, then every
/// run of whitespace is collapsed to a single space and leading/trailing
/// whitespace is dropped.
fn normalize_metadata_value(value: &str) -> String {
    let without_nuls = value.replace('\0', "");

    let mut normalized = String::with_capacity(without_nuls.len());
    for word in without_nuls.split_whitespace() {
        // Words are never empty, so this is false only before the first one.
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
1925
1926fn extract_pdf_text(path: &Path, bytes: &[u8]) -> (String, Option<String>) {
1927 if bytes.len() < 5 || &bytes[..5] != b"%PDF-" {
1928 return (String::new(), None);
1929 }
1930
1931 let mut failures = Vec::new();
1932 let mut saw_success = false;
1933
1934 let extracted = catch_unwind(AssertUnwindSafe(
1935 || -> Result<String, Box<dyn std::error::Error>> {
1936 let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1937 extract_first_pdf_page_text(&mut document)
1938 },
1939 ));
1940 match extracted {
1941 Ok(Ok(text)) => {
1942 saw_success = true;
1943 if let Some(normalized) = normalize_pdf_text(text) {
1944 return (normalized, None);
1945 }
1946 }
1947 Ok(Err(err)) => failures.push(format!("from-bytes first-page: {err}")),
1948 Err(payload) => failures.push(format!(
1949 "from-bytes first-page panic: {}",
1950 panic_payload_to_string(payload.as_ref())
1951 )),
1952 }
1953
1954 let extracted = catch_unwind(AssertUnwindSafe(
1955 || -> Result<String, Box<dyn std::error::Error>> {
1956 let mut document = pdf_oxide::document::PdfDocument::open(path)?;
1957 extract_pdf_text_from_document(&mut document)
1958 },
1959 ));
1960 match extracted {
1961 Ok(Ok(text)) => {
1962 saw_success = true;
1963 if let Some(normalized) = normalize_pdf_text(text) {
1964 return (normalized, None);
1965 }
1966 }
1967 Ok(Err(err)) => failures.push(format!("open full-document: {err}")),
1968 Err(payload) => failures.push(format!(
1969 "open full-document panic: {}",
1970 panic_payload_to_string(payload.as_ref())
1971 )),
1972 }
1973
1974 let extracted = catch_unwind(AssertUnwindSafe(
1975 || -> Result<String, Box<dyn std::error::Error>> {
1976 let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1977 extract_pdf_text_from_document(&mut document)
1978 },
1979 ));
1980 match extracted {
1981 Ok(Ok(text)) => {
1982 saw_success = true;
1983 if let Some(normalized) = normalize_pdf_text(text) {
1984 return (normalized, None);
1985 }
1986 }
1987 Ok(Err(err)) => failures.push(format!("from-bytes full-document: {err}")),
1988 Err(payload) => failures.push(format!(
1989 "from-bytes full-document panic: {}",
1990 panic_payload_to_string(payload.as_ref())
1991 )),
1992 }
1993
1994 if saw_success || is_non_actionable_pdf_failure(&failures) {
1995 (String::new(), None)
1996 } else {
1997 (
1998 String::new(),
1999 Some(format!(
2000 "PDF text extraction failed after {} attempts: {}",
2001 failures.len(),
2002 failures.join("; ")
2003 )),
2004 )
2005 }
2006}
2007
/// Returns `true` when there is at least one failure and every failure
/// message contains one of the known markers for conditions that are not
/// worth reporting (password-protected or otherwise unreadable PDFs).
fn is_non_actionable_pdf_failure(failures: &[String]) -> bool {
    const NON_ACTIONABLE_MARKERS: [&str; 5] = [
        "requires a password",
        "Encrypt dictionary missing /O",
        "Encrypt dictionary missing /U",
        "security handler cannot be found",
        "Invalid cross-reference table",
    ];

    if failures.is_empty() {
        return false;
    }

    failures.iter().all(|failure| {
        NON_ACTIONABLE_MARKERS
            .iter()
            .any(|marker| failure.contains(*marker))
    })
}
2018
/// Renders a caught panic payload as text.
///
/// `&str` and `String` payloads are returned as-is; any other payload type
/// yields the fixed text `"unknown panic payload"`.
fn panic_payload_to_string(payload: &(dyn std::any::Any + Send)) -> String {
    payload
        .downcast_ref::<&str>()
        .map(|message| (*message).to_string())
        .or_else(|| payload.downcast_ref::<String>().cloned())
        .unwrap_or_else(|| "unknown panic payload".to_string())
}
2028
2029fn extract_first_pdf_page_text(
2030 document: &mut pdf_oxide::document::PdfDocument,
2031) -> Result<String, Box<dyn std::error::Error>> {
2032 if document.page_count()? == 0 {
2033 return Ok(String::new());
2034 }
2035
2036 let extracted_text = document.extract_text(0)?;
2037 let markdown_text =
2038 document.to_markdown(0, &pdf_oxide::converters::ConversionOptions::default())?;
2039 if pdf_markdown_heading_lines(&markdown_text).is_empty() {
2040 return Ok(extracted_text);
2041 }
2042
2043 let pipeline_text =
2044 document.to_plain_text(0, &pdf_oxide::converters::ConversionOptions::default())?;
2045
2046 Ok(merge_pdf_first_page_text(
2047 &extracted_text,
2048 &markdown_text,
2049 &pipeline_text,
2050 ))
2051}
2052
2053fn extract_pdf_text_from_document(
2054 document: &mut pdf_oxide::document::PdfDocument,
2055) -> Result<String, Box<dyn std::error::Error>> {
2056 Ok(document.to_plain_text_all(&pdf_oxide::converters::ConversionOptions::default())?)
2057}
2058
/// Converts carriage returns and form feeds to newlines and returns the
/// result, or `None` when nothing but whitespace remains.
fn normalize_pdf_text(text: String) -> Option<String> {
    let normalized = text.replace(|ch: char| ch == '\r' || ch == '\u{0c}', "\n");
    if normalized.trim().is_empty() {
        None
    } else {
        Some(normalized)
    }
}
2063
2064fn merge_pdf_first_page_text(
2065 _extracted_text: &str,
2066 markdown_text: &str,
2067 pipeline_text: &str,
2068) -> String {
2069 let pipeline = pipeline_text.trim();
2070 if pipeline.is_empty() {
2071 return String::new();
2072 }
2073
2074 let prefix = pdf_first_page_heading_prefix(markdown_text);
2075 let Some(prefix) = prefix else {
2076 return pipeline_text.to_string();
2077 };
2078
2079 if pdf_text_contains_heading_prefix(pipeline, &prefix) {
2080 pipeline_text.to_string()
2081 } else {
2082 format!("{prefix}\n\n{pipeline}")
2083 }
2084}
2085
2086fn pdf_text_contains_heading_prefix(text: &str, prefix: &str) -> bool {
2087 normalize_pdf_heading_comparison_text(text)
2088 .contains(&normalize_pdf_heading_comparison_text(prefix))
2089}
2090
/// Normalizes text for heading comparison: whitespace-separated words are
/// ASCII-lowercased and re-joined with single spaces.
fn normalize_pdf_heading_comparison_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so this is false only before the first one.
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.extend(word.chars().map(|ch| ch.to_ascii_lowercase()));
    }
    normalized
}
2097
2098fn pdf_first_page_heading_prefix(markdown_text: &str) -> Option<String> {
2099 let mut lines = Vec::new();
2100
2101 for line in pdf_markdown_heading_lines(markdown_text) {
2102 push_unique_line(&mut lines, line);
2103 }
2104
2105 (!lines.is_empty()).then(|| lines.join("\n"))
2106}
2107
2108fn pdf_markdown_heading_lines(text: &str) -> Vec<String> {
2109 text.lines()
2110 .map(str::trim)
2111 .filter_map(|line| line.strip_prefix('#').map(str::trim_start))
2112 .map(|line| line.trim_matches('#').trim())
2113 .filter(|line| !line.is_empty())
2114 .filter(|line| !looks_like_numbered_section_heading(line))
2115 .take(4)
2116 .map(ToOwned::to_owned)
2117 .collect()
2118}
2119
/// Appends `line` to `lines` unless an equal line is already present.
fn push_unique_line(lines: &mut Vec<String>, line: String) {
    if lines.contains(&line) {
        return;
    }
    lines.push(line);
}
2125
/// Returns `true` when `line` starts with a section number: one or more
/// ASCII digits immediately followed by a `.` (e.g. `1. Intro`, `1.2 Scope`,
/// `10. Terms`).
///
/// Any number of leading digits is accepted, so sections numbered 10 and
/// above are recognized as well as single-digit ones.
fn looks_like_numbered_section_heading(line: &str) -> bool {
    let leading_digits = line.bytes().take_while(u8::is_ascii_digit).count();
    leading_digits > 0 && line.as_bytes().get(leading_digits) == Some(&b'.')
}
2138
/// Returns `true` when the content starts with one of the ZIP signatures:
/// `PK\x03\x04`, `PK\x05\x06` or `PK\x07\x08`.
fn is_zip_archive(bytes: &[u8]) -> bool {
    matches!(
        bytes,
        [b'P', b'K', 0x03, 0x04, ..] | [b'P', b'K', 0x05, 0x06, ..] | [b'P', b'K', 0x07, 0x08, ..]
    )
}
2144
/// Extracts runs of printable ASCII from arbitrary binary content, similar
/// to the `strings` utility.
///
/// Three passes are made, and each qualifying run (at least four characters)
/// is appended on its own line:
///
/// 1. single-byte runs of printable ASCII (`0x20..=0x7E`);
/// 2. UTF-16LE runs (printable byte followed by a zero byte) starting at
///    even offsets;
/// 3. the same UTF-16LE scan starting at odd offsets, so wide strings are
///    found regardless of their alignment in the file.
///
/// Big-endian wide strings are still picked up by whichever UTF-16LE pass
/// lines up with their `00 xx` byte pairs, though their final character can
/// be lost when the following byte is not zero.
///
/// Output is capped: once the accumulated text reaches the input length
/// clamped to the range 2,000,000..=16,000,000 bytes, extraction stops and
/// what has been collected so far is returned. The cap is checked after a run
/// is flushed, so the result may slightly exceed it.
pub fn extract_printable_strings(bytes: &[u8]) -> String {
    const MIN_LEN: usize = 4;
    const MIN_OUTPUT_BYTES: usize = 2_000_000;
    const MAX_OUTPUT_BYTES_CAP: usize = 16_000_000;

    fn is_printable_ascii(b: u8) -> bool {
        matches!(b, 0x20..=0x7E)
    }

    /// Appends `run` to `out` as a new line when it is long enough, then
    /// resets `run` for the next candidate.
    fn flush_run(out: &mut String, run: &mut Vec<u8>) {
        if run.len() >= MIN_LEN {
            if !out.is_empty() {
                out.push('\n');
            }
            // `run` only ever holds printable ASCII, so bytes map 1:1 to chars.
            out.extend(run.iter().map(|&b| char::from(b)));
        }
        run.clear();
    }

    let max_output_bytes = bytes.len().clamp(MIN_OUTPUT_BYTES, MAX_OUTPUT_BYTES_CAP);

    let mut out = String::new();
    let mut run: Vec<u8> = Vec::new();

    // Pass 1: plain single-byte strings.
    for &b in bytes {
        if is_printable_ascii(b) {
            run.push(b);
        } else {
            flush_run(&mut out, &mut run);
            if out.len() >= max_output_bytes {
                return out;
            }
        }
    }
    flush_run(&mut out, &mut run);
    if out.len() >= max_output_bytes {
        return out;
    }

    // Passes 2 and 3: UTF-16LE strings at even, then odd, byte alignment.
    for start in 0..=1 {
        run.clear();
        let aligned = bytes.get(start..).unwrap_or(&[]);
        for pair in aligned.chunks_exact(2) {
            let (low, high) = (pair[0], pair[1]);
            if is_printable_ascii(low) && high == 0 {
                run.push(low);
            } else {
                flush_run(&mut out, &mut run);
                if out.len() >= max_output_bytes {
                    return out;
                }
            }
        }
        flush_run(&mut out, &mut run);
        if out.len() >= max_output_bytes {
            return out;
        }
    }

    out
}
2209
2210#[cfg(test)]
2211mod tests {
2212 use std::path::Path;
2213
2214 use crate::copyright::detect_copyrights;
2215
2216 use super::{
2217 ExtractedTextKind, LARGE_OPAQUE_BINARY_SKIP_BYTES, classify_file_info,
2218 extract_printable_strings, extract_text_for_detection,
2219 extract_text_for_detection_with_diagnostics, format_metadata_field, format_xmp_value,
2220 is_non_actionable_pdf_failure, normalize_mime_type, normalize_pdf_heading_comparison_text,
2221 values_to_text, windows_metadata_or_empty_result,
2222 };
2223
2224 fn png_chunk(chunk_type: &[u8; 4], data: &[u8]) -> Vec<u8> {
2225 let mut out = Vec::new();
2226 out.extend_from_slice(&(data.len() as u32).to_be_bytes());
2227 out.extend_from_slice(chunk_type);
2228 out.extend_from_slice(data);
2229 out.extend_from_slice(&0u32.to_be_bytes());
2230 out
2231 }
2232
2233 fn build_png_with_xmp(xmp: &str) -> Vec<u8> {
2234 let mut bytes = Vec::new();
2235 bytes.extend_from_slice(b"\x89PNG\r\n\x1a\n");
2236
2237 let ihdr = [
2238 0, 0, 0, 1, 0, 0, 0, 1, 8, 2, 0, 0, 0, ];
2246 bytes.extend_from_slice(&png_chunk(b"IHDR", &ihdr));
2247
2248 let mut itxt = Vec::new();
2249 itxt.extend_from_slice(b"XML:com.adobe.xmp");
2250 itxt.push(0); itxt.push(0); itxt.push(0); itxt.push(0); itxt.push(0); itxt.extend_from_slice(xmp.as_bytes());
2256 bytes.extend_from_slice(&png_chunk(b"iTXt", &itxt));
2257
2258 bytes.extend_from_slice(&png_chunk(b"IEND", &[]));
2259 bytes
2260 }
2261
2262 #[test]
2263 fn test_extract_text_for_detection_skips_jar_archives() {
2264 let path = Path::new(
2265 "testdata/license-golden/datadriven/lic1/do-not_detect-licenses-in-archive.jar",
2266 );
2267 let bytes = std::fs::read(path).expect("failed to read jar fixture");
2268
2269 let (text, kind) = extract_text_for_detection(path, &bytes);
2270
2271 assert!(text.is_empty());
2272 assert_eq!(kind, ExtractedTextKind::None);
2273 }
2274
2275 #[test]
2276 fn test_extract_text_for_detection_reads_pdf_fixture_text() {
2277 let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
2278 let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2279
2280 let (text, kind) = extract_text_for_detection(path, &bytes);
2281
2282 assert_eq!(kind, ExtractedTextKind::Pdf);
2283 assert!(text.contains("Redistribution and use in source and binary forms"));
2284 }
2285
/// The extracted PDF text must contain the license heading but not the
/// "DISCLAIMER OF WARRANTY" wording.
#[test]
fn test_extract_text_for_detection_prefers_first_pdf_page_before_full_document() {
    let fixture =
        Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
    let contents = std::fs::read(fixture).expect("failed to read pdf fixture");

    let (extracted, extracted_kind) = extract_text_for_detection(fixture, &contents);

    assert_eq!(extracted_kind, ExtractedTextKind::Pdf);

    let has_heading = extracted.contains("SUN INDUSTRY STANDARDS SOURCE LICENSE");
    assert!(has_heading);
    let has_later_section = extracted.contains("DISCLAIMER OF WARRANTY");
    assert!(!has_later_section);
}
2298
/// After heading normalization, the license heading must occur exactly once
/// in the text extracted from the PDF fixture.
#[test]
fn test_extract_text_for_detection_does_not_duplicate_pdf_heading_prefix() {
    let fixture =
        Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
    let contents = std::fs::read(fixture).expect("failed to read pdf fixture");

    let (extracted, extracted_kind) = extract_text_for_detection(fixture, &contents);
    assert_eq!(extracted_kind, ExtractedTextKind::Pdf);

    // Both sides go through the same normalization so they are comparable.
    let heading = normalize_pdf_heading_comparison_text("SUN INDUSTRY STANDARDS SOURCE LICENSE");
    let normalized = normalize_pdf_heading_comparison_text(&extracted);

    let occurrences = normalized.matches(&heading).count();
    assert_eq!(occurrences, 1);
}
2314
/// PDF bytes presented under a `.bin` name are still extracted as `Pdf`.
#[test]
fn test_extract_text_for_detection_reads_pdf_fixture_without_pdf_extension() {
    let fixture = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
    let contents = std::fs::read(fixture).expect("failed to read pdf fixture");

    // Only the path handed to the extractor is renamed; the bytes are the PDF.
    let disguised = Path::new("renamed.bin");
    let (extracted, extracted_kind) = extract_text_for_detection(disguised, &contents);

    assert_eq!(extracted_kind, ExtractedTextKind::Pdf);
    assert!(extracted.contains("Redistribution and use in source and binary forms"));
}
2325
/// A PDF header followed by garbage yields no text and a scan error that
/// mentions the failed PDF extraction.
#[test]
fn test_extract_text_for_detection_reports_terminal_pdf_failure() {
    let broken_pdf = b"%PDF-1.7\nthis is not a valid pdf object graph\n";

    let (extracted, extracted_kind, diagnostic) =
        extract_text_for_detection_with_diagnostics(Path::new("broken.pdf"), broken_pdf);

    assert!(extracted.is_empty());
    assert_eq!(extracted_kind, ExtractedTextKind::None);

    let message = diagnostic.expect("terminal pdf failure should be surfaced");
    assert!(message.contains("PDF text extraction failed after"));
}
2338
/// An all-zero buffer just over `LARGE_OPAQUE_BINARY_SKIP_BYTES` yields nothing.
#[test]
fn test_extract_text_for_detection_skips_large_opaque_binary_blobs() {
    let blob_len = LARGE_OPAQUE_BINARY_SKIP_BYTES + 8;
    let blob = vec![0_u8; blob_len];

    let (extracted, extracted_kind) = extract_text_for_detection(Path::new("model.bin"), &blob);

    assert!(extracted.is_empty());
    assert_eq!(extracted_kind, ExtractedTextKind::None);
}
2348
/// A large zero-filled buffer carrying the same copyright line at two
/// separate offsets must still produce text.
#[test]
fn test_extract_text_for_detection_keeps_large_binaries_with_promising_strings() {
    let notice = b"Copyright 2026 Example Project!!!";
    let mut blob = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];

    // Plant the notice at the start and again halfway through the threshold.
    for start in [0, LARGE_OPAQUE_BINARY_SKIP_BYTES / 2] {
        blob[start..start + notice.len()].copy_from_slice(notice);
    }

    let (extracted, extracted_kind) = extract_text_for_detection(Path::new("weights.bin"), &blob);

    assert_ne!(extracted_kind, ExtractedTextKind::None);
    assert!(extracted.contains("Copyright 2026 Example Project"));
}
2362
/// A large zero-filled buffer whose only printable runs are symbol noise
/// (planted at two offsets) must be skipped entirely.
#[test]
fn test_extract_text_for_detection_skips_large_binary_with_unstructured_runs() {
    let junk = b"(c) $1234567890ABCDEF[]{}--==++";
    let mut blob = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];

    for start in [0, LARGE_OPAQUE_BINARY_SKIP_BYTES / 2] {
        blob[start..start + junk.len()].copy_from_slice(junk);
    }

    let (extracted, extracted_kind) = extract_text_for_detection(Path::new("tensor.bin"), &blob);

    assert!(extracted.is_empty());
    assert_eq!(extracted_kind, ExtractedTextKind::None);
}
2376
/// Text extracted from a PE fixture must include its license sentence and a
/// `LegalCopyright:` entry.
#[test]
fn test_extract_text_for_detection_uses_windows_executable_metadata() {
    let fixture = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
    let contents = std::fs::read(fixture).expect("read PE fixture");

    let (extracted, extracted_kind) = extract_text_for_detection(fixture, &contents);

    // NOTE(review): the asserted kind is `BinaryStrings`, not
    // `WindowsExecutableMetadata` — presumably the metadata is merged into the
    // printable-strings output for this fixture; confirm against the extractor.
    assert_eq!(extracted_kind, ExtractedTextKind::BinaryStrings);

    for expected in ["License: This program is free software", "LegalCopyright:"] {
        assert!(extracted.contains(expected));
    }
}
2388
/// A PE fixture zero-padded past `LARGE_OPAQUE_BINARY_SKIP_BYTES` must still
/// yield some non-blank text.
#[test]
fn test_extract_text_for_detection_keeps_windows_metadata_for_large_pe_without_sampled_signal()
{
    let fixture = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
    let mut contents = std::fs::read(fixture).expect("read PE fixture");

    // Resize the buffer to just past the large-binary threshold, zero-filling.
    let padded_len = LARGE_OPAQUE_BINARY_SKIP_BYTES + 8;
    contents.resize(padded_len, 0);

    let (extracted, extracted_kind) = extract_text_for_detection(fixture, &contents);

    assert_ne!(extracted_kind, ExtractedTextKind::None);
    assert!(!extracted.trim().is_empty());
}
2401
/// `Some(metadata)` is passed through verbatim, tagged as
/// `WindowsExecutableMetadata`, with no scan error.
#[test]
fn test_windows_metadata_or_empty_result_preserves_metadata() {
    let metadata = "LegalCopyright: Example Corp".to_string();

    let (result_text, result_kind, diagnostic) = windows_metadata_or_empty_result(Some(metadata));

    assert_eq!(result_kind, ExtractedTextKind::WindowsExecutableMetadata);
    assert_eq!(result_text, "LegalCopyright: Example Corp");
    assert!(diagnostic.is_none());
}
2411
/// `creator`, `title` and `description` values are rendered with the
/// `Author:`, `Title:` and `Description:` labels respectively.
#[test]
fn test_format_xmp_value_labels_creator_and_title_fields() {
    // (XMP field name, raw value, expected labelled line)
    let expectations = [
        ("creator", "Chinmay Garde", "Author: Chinmay Garde"),
        ("title", "Bay Bridge At Night", "Title: Bay Bridge At Night"),
        (
            "description",
            "Embarcadero in the evening on Delta 3200",
            "Description: Embarcadero in the evening on Delta 3200",
        ),
    ];

    for (field, value, labelled) in expectations {
        assert_eq!(format_xmp_value(field, value), labelled);
    }
}
2427
/// A metadata field is rendered as `<label>: <value>`.
#[test]
fn test_format_metadata_field_prefixes_exif_text() {
    // (label, raw value, expected rendered line)
    let expectations = [
        ("Author", "Chinmay Garde", "Author: Chinmay Garde"),
        (
            "Description",
            "Bay Bridge At Night",
            "Description: Bay Bridge At Night",
        ),
    ];

    for (label, value, rendered) in expectations {
        assert_eq!(format_metadata_field(label, value), rendered);
    }
}
2439
/// XMP creator, title and description embedded in a PNG come out as three
/// separately labelled lines, and only the creator is detected as an author.
#[test]
fn test_extract_text_for_detection_keeps_image_author_separate_from_title_and_description() {
    let packet = r#"<x:xmpmeta xmlns:x="adobe:ns:meta/"><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:creator>Chinmay Garde</dc:creator><dc:title>Bay Bridge At Night</dc:title><dc:description>Embarcadero in the evening on Delta 3200</dc:description></rdf:Description></rdf:RDF></x:xmpmeta>"#;
    let png = build_png_with_xmp(packet);

    let (text, kind) = extract_text_for_detection(Path::new("fixture.png"), &png);

    assert_eq!(kind, ExtractedTextKind::ImageMetadata);
    for labelled_line in [
        "Author: Chinmay Garde",
        "Title: Bay Bridge At Night",
        "Description: Embarcadero in the evening on Delta 3200",
    ] {
        assert!(text.contains(labelled_line), "text: {text:?}");
    }

    // Only the author list of the detection result matters here.
    let (_copyrights, _holders, authors) = detect_copyrights(&text, None);
    let detected_names: Vec<_> = authors.iter().map(|entry| entry.author.as_str()).collect();
    assert_eq!(
        detected_names,
        vec!["Chinmay Garde"],
        "authors: {authors:?}; text: {text:?}"
    );
}
2468
/// A `Copyright:` value that merely repeats the `Author:` value is dropped,
/// while the author and title lines are kept.
#[test]
fn test_values_to_text_suppresses_bare_copyright_duplicate_of_author() {
    let values: Vec<String> = [
        "Author: Chinmay Garde",
        "Copyright: Chinmay Garde",
        "Title: Bay Bridge At Night",
    ]
    .iter()
    .copied()
    .map(String::from)
    .collect();

    let text = values_to_text(values);

    for kept in ["Author: Chinmay Garde", "Title: Bay Bridge At Night"] {
        assert!(text.contains(kept), "text: {text:?}");
    }
    assert!(!text.contains("Copyright: Chinmay Garde"), "text: {text:?}");
}
2484
/// One copyright line at the start of an otherwise zero-filled large buffer
/// is not enough: the buffer must be skipped.
#[test]
fn test_extract_text_for_detection_skips_large_binary_with_single_isolated_string_run() {
    let notice = b"Copyright 2026 Example Project!!!";
    let mut blob = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
    let (prefix, _zero_tail) = blob.split_at_mut(notice.len());
    prefix.copy_from_slice(notice);

    let (extracted, extracted_kind) = extract_text_for_detection(Path::new("opaque.bin"), &blob);

    assert!(extracted.is_empty());
    assert_eq!(extracted_kind, ExtractedTextKind::None);
}
2496
/// A single run packed with e-mail addresses and URLs at the start of a large
/// zero-filled buffer must be kept.
#[test]
fn test_extract_text_for_detection_keeps_large_binary_with_single_contact_rich_window() {
    let contacts = b"Andreas Schneider <asn@redhat.com> Rob Crittenden (rcritten@redhat.com) Mr. Sam <sam@email-scan.com> https://publicsuffix.org/ http://tukaani.org/xz/";
    let mut blob = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
    let (prefix, _zero_tail) = blob.split_at_mut(contacts.len());
    prefix.copy_from_slice(contacts);

    let (extracted, extracted_kind) = extract_text_for_detection(Path::new("rootfs.bin"), &blob);

    assert_ne!(extracted_kind, ExtractedTextKind::None);
    for expected in ["asn@redhat.com", "https://publicsuffix.org/"] {
        assert!(extracted.contains(expected));
    }
}
2509
/// Encryption, broken cross-reference and security-handler failures count as
/// non-actionable; an unrelated parser failure does not.
#[test]
fn test_non_actionable_pdf_failures_are_suppressed() {
    // Converts literal messages into the owned strings the predicate takes.
    let failures = |messages: &[&str]| -> Vec<String> {
        messages.iter().map(|message| String::from(*message)).collect()
    };

    let suppressed_groups: [&[&str]; 3] = [
        &[
            "from-bytes first-page: PDF is encrypted and requires a password",
            "open full-document: PDF is encrypted and requires a password",
        ],
        &[
            "from-bytes first-page: Invalid cross-reference table",
            "open full-document: Invalid cross-reference table",
        ],
        &[
            "from-bytes first-page: Invalid PDF: Encrypt dictionary missing /O",
            "open full-document: Invalid PDF: security handler cannot be found",
        ],
    ];
    for group in suppressed_groups {
        assert!(is_non_actionable_pdf_failure(&failures(group)));
    }

    let actionable = failures(&["from-bytes first-page: some other parser failure"]);
    assert!(!is_non_actionable_pdf_failure(&actionable));
}
2528
/// ZIP-signature bytes named `.whl` or `.crate` produce no text.
#[test]
fn test_extract_text_for_detection_skips_zip_like_archives() {
    let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00\x08\x00artifact";

    for archive_name in ["demo.whl", "demo.crate"] {
        let (extracted, extracted_kind) =
            extract_text_for_detection(Path::new(archive_name), zip_bytes);

        assert!(extracted.is_empty());
        assert_eq!(extracted_kind, ExtractedTextKind::None);
    }
}
2542
/// A `.lib` fixture must still surface its embedded copyright string.
#[test]
fn test_extract_text_for_detection_keeps_binary_strings_for_lib_fixtures() {
    let fixture =
        Path::new("testdata/copyright-golden/copyrights/copyright_php_lib-php_embed_lib.lib");
    let contents = std::fs::read(fixture).expect("failed to read lib fixture");

    let (extracted, extracted_kind) = extract_text_for_detection(fixture, &contents);

    assert_ne!(extracted_kind, ExtractedTextKind::None);
    assert!(extracted.contains("Copyright nexB and others (c) 2012"));
}
2554
/// A TrueType fixture is reported as `FontMetadata` and its text names the
/// font, a license description, and the Open Font License.
#[test]
fn test_extract_text_for_detection_reads_font_metadata() {
    let fixture = Path::new("testdata/font-fixtures/Lato-Bold.ttf");
    let contents = std::fs::read(fixture).expect("failed to read font fixture");

    let (text, kind) = extract_text_for_detection(fixture, &contents);

    assert_eq!(kind, ExtractedTextKind::FontMetadata);
    assert!(text.contains("License Description:"), "{text}");

    // Either the spelled-out license name or its abbreviation is accepted.
    let mentions_ofl = text.contains("Open Font License") || text.contains("OFL");
    assert!(mentions_ofl, "{text}");

    assert!(text.contains("Lato"), "{text}");
}
2570
/// 525,000 NUL-separated `abcd` runs (2,625,000 input bytes) must yield more
/// than 2,000,000 bytes of output that still ends with the final run.
#[test]
fn test_extract_printable_strings_scales_cap_for_medium_binary_files() {
    let input = b"abcd\0".repeat(525_000);

    let strings = extract_printable_strings(&input);

    assert!(
        strings.len() > 2_000_000,
        "unexpected truncation at {}",
        strings.len()
    );
    assert!(strings.ends_with("abcd"));
}
2584
/// An SVG fixture is decoded as text and keeps its public-domain URL.
#[test]
fn test_extract_text_for_detection_decodes_svg_fixture_text() {
    let fixture = Path::new(
        "testdata/license-golden/datadriven/external/fossology-tests/Public-domain/biohazard.svg",
    );
    let contents = std::fs::read(fixture).expect("failed to read svg fixture");

    let (extracted, extracted_kind) = extract_text_for_detection(fixture, &contents);

    assert_eq!(extracted_kind, ExtractedTextKind::Decoded);
    assert!(extracted.contains("creativecommons.org/licenses/publicdomain"));
}
2597
/// An RTF fixture is decoded as text containing the LGPL name and version.
#[test]
fn test_extract_text_for_detection_decodes_rtf_fixture_text() {
    let fixture = Path::new(
        "testdata/license-golden/datadriven/external/fossology-tests/LGPL/License.rtf",
    );
    let contents = std::fs::read(fixture).expect("failed to read rtf fixture");

    let (extracted, extracted_kind) = extract_text_for_detection(fixture, &contents);

    assert_eq!(extracted_kind, ExtractedTextKind::Decoded);
    for fragment in ["GNU Lesser General Public", "version", "2.1 of the License"] {
        assert!(extracted.contains(fragment));
    }
}
2612
/// Textual `.ts` content guessed as `video/mp2t` is normalized to `text/plain`.
#[test]
fn test_normalize_mime_type_prefers_text_for_textual_video_guess() {
    let source = b"export const answer = 42;\n";

    let normalized = normalize_mime_type(
        Path::new("main.ts"),
        source,
        Some("TypeScript"),
        "video/mp2t",
    );

    assert_eq!(normalized, "text/plain");
}
2625
/// Textual `.js` content guessed as `application/octet-stream` is normalized
/// to `text/plain`.
#[test]
fn test_normalize_mime_type_prefers_text_for_octet_stream_source_guess() {
    let source = b"console.log('hello');\n";

    let normalized = normalize_mime_type(
        Path::new("main.js"),
        source,
        Some("JavaScript"),
        "application/octet-stream",
    );

    assert_eq!(normalized, "text/plain");
}
2638
/// Binary `.ts` content keeps the `video/mp2t` guess.
#[test]
fn test_normalize_mime_type_preserves_binary_video_guess() {
    let payload: [u8; 8] = [0, 159, 146, 150, 0, 1, 2, 3];

    let normalized = normalize_mime_type(
        Path::new("main.ts"),
        &payload,
        Some("TypeScript"),
        "video/mp2t",
    );

    assert_eq!(normalized, "video/mp2t");
}
2651
/// A four-byte binary payload keeps the `application/octet-stream` guess.
#[test]
fn test_normalize_mime_type_preserves_short_binary_octet_stream_guess() {
    let payload: [u8; 4] = [0, 159, 146, 150];

    let normalized = normalize_mime_type(
        Path::new("main.ts"),
        &payload,
        Some("TypeScript"),
        "application/octet-stream",
    );

    assert_eq!(normalized, "application/octet-stream");
}
2664
/// An empty `.txt` file is `inode/x-empty` / `empty`: text, not binary, not
/// source, and without a programming language.
#[test]
fn test_classify_file_info_marks_empty_files_as_text_not_source() {
    let info = classify_file_info(Path::new("test.txt"), b"");

    // Labels.
    assert_eq!(info.mime_type, "inode/x-empty");
    assert_eq!(info.file_type, "empty");

    // Flags.
    assert!(!info.is_binary);
    assert!(info.is_text);
    assert!(!info.is_source);
    assert_eq!(info.programming_language, None);
}
2676
/// Valid JSON is labelled as JSON text data but is neither source nor given a
/// programming language.
#[test]
fn test_classify_file_info_keeps_json_out_of_programming_language() {
    let manifest = br#"{"name":"demo"}"#;

    let info = classify_file_info(Path::new("package.json"), manifest);

    assert_eq!(info.mime_type, "application/json");
    assert_eq!(info.file_type, "JSON text data");
    assert!(info.is_text);
    assert!(!info.is_source);
    assert_eq!(info.programming_language, None);
}
2687
/// Text that is not valid JSON falls back to plain UTF-8 text despite the
/// `.json` extension.
#[test]
fn test_classify_file_info_does_not_label_invalid_json_text_as_json() {
    let not_json = b"{ definitely not json\n";

    let info = classify_file_info(Path::new("broken.json"), not_json);

    assert_eq!(info.mime_type, "text/plain");
    assert_eq!(info.file_type, "UTF-8 Unicode text");
    assert!(info.is_text);
    assert!(!info.is_binary);
}
2698
/// Binary garbage with a `.json` extension is classified as opaque data.
#[test]
fn test_classify_file_info_does_not_label_binary_json_garbage_as_json() {
    let garbage: [u8; 4] = [0xff, 0x00, 0x01, 0x02];

    let info = classify_file_info(Path::new("broken.json"), &garbage);

    assert_eq!(info.mime_type, "application/octet-stream");
    assert_eq!(info.file_type, "data");
    assert!(info.is_binary);
    assert!(!info.is_text);
}
2709
/// UTF-16LE JSON with a byte-order mark is classified as JSON text.
#[test]
fn test_classify_file_info_treats_valid_utf16_json_with_bom_as_text() {
    // FF FE byte-order mark, then `["é"]` as little-endian UTF-16 code units.
    let utf16le_json: [u8; 12] = [
        0xFF, 0xFE, 0x5B, 0x00, 0x22, 0x00, 0xE9, 0x00, 0x22, 0x00, 0x5D, 0x00,
    ];

    let info = classify_file_info(Path::new("utf16.json"), &utf16le_json);

    assert!(!info.is_binary);
    assert!(info.is_text);
    assert_eq!(info.mime_type, "application/json");
    assert_eq!(info.file_type, "JSON text data");
}
2724
/// UTF-16BE JSON without a byte-order mark is classified as JSON text.
#[test]
fn test_classify_file_info_treats_valid_utf16be_json_without_bom_as_text() {
    // `["é"]` as big-endian UTF-16 code units, no byte-order mark.
    let utf16be_json: [u8; 10] = [0x00, 0x5B, 0x00, 0x22, 0x00, 0xE9, 0x00, 0x22, 0x00, 0x5D];

    let info = classify_file_info(Path::new("utf16be.json"), &utf16be_json);

    assert!(!info.is_binary);
    assert!(info.is_text);
    assert_eq!(info.mime_type, "application/json");
    assert_eq!(info.file_type, "JSON text data");
}
2737
/// A four-byte UTF-16BE `[]` document is classified as JSON text.
#[test]
fn test_classify_file_info_treats_small_valid_utf16be_json_literal_as_text() {
    // `[]` as two big-endian UTF-16 code units.
    let empty_array: [u8; 4] = [0x00, 0x5B, 0x00, 0x5D];

    let info = classify_file_info(Path::new("utf16be-literal.json"), &empty_array);

    assert!(!info.is_binary);
    assert!(info.is_text);
    assert_eq!(info.mime_type, "application/json");
    assert_eq!(info.file_type, "JSON text data");
}
2748
/// Big-endian UTF-16 text preceded by `CORRUPTED_UTF16_BOM_PREFIX` is still
/// decoded into readable text.
#[test]
fn test_extract_text_for_detection_decodes_utf16be_text_with_corrupted_bom_prefix() {
    let notice = "Licensed to the Apache Software Foundation\nApache License, Version 2.0";

    // Prefix first, then every UTF-16 code unit of the notice in big-endian order.
    let mut encoded = super::CORRUPTED_UTF16_BOM_PREFIX.to_vec();
    encoded.extend(notice.encode_utf16().flat_map(|unit| unit.to_be_bytes()));

    let (text, kind) = extract_text_for_detection(Path::new("notice.ftl"), &encoded);

    assert_eq!(kind, ExtractedTextKind::Decoded);
    for phrase in ["Apache Software Foundation", "Apache License, Version 2.0"] {
        assert!(text.contains(phrase), "{text}");
    }
}
2764
/// The bare literal `true` in a `.json` file is classified as JSON text.
#[test]
fn test_classify_file_info_treats_small_valid_json_literals_as_text() {
    let literal = b"true";

    let info = classify_file_info(Path::new("true.json"), literal);

    assert!(!info.is_binary);
    assert!(info.is_text);
    assert_eq!(info.mime_type, "application/json");
    assert_eq!(info.file_type, "JSON text data");
}
2774
/// JSON-shaped bytes whose string body ends in the stray byte 0xFA stay text
/// but are labelled plain text rather than JSON.
#[test]
fn test_classify_file_info_treats_json_wrapped_invalid_utf8_sequences_as_text() {
    // `["` + multi-byte sequences + stray 0xFA + `"]`
    let wrapped: [u8; 10] = [0x5B, 0x22, 0xE6, 0x97, 0xA5, 0xD1, 0x88, 0xFA, 0x22, 0x5D];

    let info = classify_file_info(Path::new("wrapped.json"), &wrapped);

    assert!(!info.is_binary);
    assert!(info.is_text);
    assert_eq!(info.mime_type, "text/plain");
    assert_eq!(info.file_type, "text, with no line terminators");
}
2787
/// A JSON-shaped buffer whose string body is a lone 0xFF byte stays binary.
#[test]
fn test_classify_file_info_keeps_lone_ff_json_byte_binary() {
    // `["` + 0xFF + `"]`
    let lone_ff: [u8; 5] = [0x5B, 0x22, 0xFF, 0x22, 0x5D];

    let info = classify_file_info(Path::new("lone-ff.json"), &lone_ff);

    assert!(info.is_binary);
    assert!(!info.is_text);
    assert_eq!(info.mime_type, "application/octet-stream");
    assert_eq!(info.file_type, "data");
}
2798
/// A `.json` buffer containing a run of NUL bytes and high bytes stays binary.
#[test]
fn test_classify_file_info_keeps_nul_heavy_crash_json_binary() {
    let crash_input: [u8; 11] = [
        0xFE, 0x90, 0x00, 0x00, 0x00, 0x93, 0x5B, 0x5B, 0x32, 0x38, 0x36,
    ];

    let info = classify_file_info(Path::new("crash.json"), &crash_input);

    assert!(info.is_binary);
    assert!(!info.is_text);
    assert_eq!(info.mime_type, "application/octet-stream");
}
2812
/// A `Dockerfile` is source (not a script) with the `Dockerfile` language.
#[test]
fn test_classify_file_info_treats_dockerfile_as_source() {
    let info = classify_file_info(Path::new("Dockerfile"), b"FROM scratch\n");

    let language = info.programming_language.as_deref();
    assert_eq!(language, Some("Dockerfile"));

    assert!(info.is_source);
    assert!(!info.is_script);
    assert_eq!(info.file_type, "Dockerfile source, UTF-8 Unicode text");
}
2828
/// A `Makefile` is plain text: no language, not source, not a script.
#[test]
fn test_classify_file_info_treats_makefile_as_text_not_source() {
    let recipe = b"all:\n\techo hi\n";

    let info = classify_file_info(Path::new("Makefile"), recipe);

    assert_eq!(info.programming_language, None);
    assert!(info.is_text);
    assert!(!info.is_source);
    assert!(!info.is_script);
    assert_eq!(info.file_type, "UTF-8 Unicode text");
}
2839
/// `.egg` and `.nupkg` files with a ZIP signature are flagged as ZIP archives.
#[test]
fn test_classify_file_info_marks_supported_package_archives() {
    let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00";

    for package_name in ["demo.egg", "demo.nupkg"] {
        let info = classify_file_info(Path::new(package_name), zip_bytes);

        assert!(info.is_archive);
        assert_eq!(info.mime_type, "application/zip");
        assert_eq!(info.file_type, "Zip archive data");
    }
}
2854
/// A PNG signature is classified as binary media, not text/archive/source.
#[test]
fn test_classify_file_info_marks_png_as_binary_media() {
    let png_header = b"\x89PNG\r\n\x1a\n\x00\x00\x00\x0dIHDR";

    let info = classify_file_info(Path::new("logo.png"), png_header);

    // Labels.
    assert_eq!(info.mime_type, "image/png");
    assert_eq!(info.file_type, "PNG image data");

    // Flags.
    assert!(info.is_binary);
    assert!(!info.is_text);
    assert!(info.is_media);
    assert!(!info.is_archive);
    assert!(!info.is_source);
}
2869
/// A PDF header is classified as a binary document, not text/archive/media.
#[test]
fn test_classify_file_info_marks_pdf_as_binary_document() {
    let pdf_header = b"%PDF-1.7\n1 0 obj\n<< /Type /Catalog >>\n";

    let info = classify_file_info(Path::new("report.pdf"), pdf_header);

    // Labels.
    assert_eq!(info.mime_type, "application/pdf");
    assert_eq!(info.file_type, "PDF document");

    // Flags.
    assert!(info.is_binary);
    assert!(!info.is_text);
    assert!(!info.is_archive);
    assert!(!info.is_media);
}
2883
/// An arbitrary binary blob is binary, not text or source, with no language.
#[test]
fn test_classify_file_info_marks_binary_blobs_as_binary() {
    let blob: [u8; 10] = [0, 159, 146, 150, 0, 1, 2, 3, 4, 5];

    let info = classify_file_info(Path::new("blob.bin"), &blob);

    assert!(info.is_binary);
    assert!(!info.is_text);
    assert!(!info.is_source);
    assert_eq!(info.programming_language, None);
}
2894
/// YAML is labelled as YAML text data without a language or the source flag.
#[test]
fn test_classify_file_info_treats_yaml_as_text_not_source() {
    let document = b"key: value\n";

    let info = classify_file_info(Path::new("config.yaml"), document);

    assert_eq!(info.programming_language, None);
    assert!(info.is_text);
    assert!(!info.is_source);
    assert_eq!(info.file_type, "YAML text data");
}
2904
/// Gradle, Nix and CMake manifests are recognised as source in their own
/// language, while `.gitmodules` is plain Git configuration text.
#[test]
fn test_classify_file_info_classifies_common_build_manifests() {
    // Gradle build script -> Groovy source.
    let gradle_info = classify_file_info(Path::new("build.gradle"), b"plugins { id 'java' }\n");
    assert_eq!(gradle_info.programming_language.as_deref(), Some("Groovy"));
    assert!(gradle_info.is_source);
    assert_eq!(gradle_info.mime_type, "text/plain");
    assert_eq!(gradle_info.file_type, "Groovy source, UTF-8 Unicode text");

    // Nix flake -> Nix source.
    let flake_info = classify_file_info(Path::new("flake.nix"), b"{ inputs, ... }: {}\n");
    assert_eq!(flake_info.programming_language.as_deref(), Some("Nix"));
    assert!(flake_info.is_source);
    assert_eq!(flake_info.mime_type, "text/plain");
    assert_eq!(flake_info.file_type, "Nix source, UTF-8 Unicode text");

    // CMake toolchain file -> CMake source.
    let cmake_info = classify_file_info(
        Path::new("toolchain.cmake"),
        b"set(CMAKE_CXX_STANDARD 20)\n",
    );
    assert_eq!(cmake_info.programming_language.as_deref(), Some("CMake"));
    assert!(cmake_info.is_source);
    assert_eq!(cmake_info.file_type, "CMake source, UTF-8 Unicode text");

    // .gitmodules -> configuration text, not source.
    let gitmodules_info = classify_file_info(
        Path::new(".gitmodules"),
        b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
    );
    assert_eq!(gitmodules_info.programming_language, None);
    assert!(gitmodules_info.is_text);
    assert!(!gitmodules_info.is_source);
    assert_eq!(gitmodules_info.file_type, "Git configuration text");
}
2937
/// `.hpp` files are C++ source, whereas `.ipp` files are left as plain text
/// without a language.
#[test]
fn test_classify_file_info_labels_cpp_headers_and_ipp_separately() {
    let hpp_info = classify_file_info(
        Path::new("include/demo.hpp"),
        b"#pragma once\nclass Demo {};\n",
    );
    assert_eq!(hpp_info.programming_language.as_deref(), Some("C++"));
    assert!(hpp_info.is_source);
    assert!(!hpp_info.is_script);
    assert_eq!(hpp_info.file_type, "C++ source, UTF-8 Unicode text");

    let ipp_info = classify_file_info(
        Path::new("include/detail/demo.ipp"),
        b"template <class T> void parse() {}\n",
    );
    assert_eq!(ipp_info.programming_language, None);
    assert!(!ipp_info.is_source);
    assert!(!ipp_info.is_script);
    assert_eq!(ipp_info.file_type, "UTF-8 Unicode text");
}
2959
/// An extensionless file with a bash shebang is labelled as a Bash script.
#[test]
fn test_classify_file_info_preserves_specific_shell_family_labels() {
    let script = b"#!/usr/bin/env bash\necho hi\n";

    let info = classify_file_info(Path::new("bin/run"), script);

    assert_eq!(info.programming_language.as_deref(), Some("Bash"));
    assert!(info.is_script);
    assert_eq!(info.file_type, "bash script, UTF-8 Unicode text executable");
}
2968
/// A `Jamfile` is source (not a script) with the `Jamfile` language.
#[test]
fn test_classify_file_info_marks_jamfile_as_source() {
    let rules = b"lib boost_json ;\n";

    let info = classify_file_info(Path::new("Jamfile"), rules);

    assert_eq!(info.programming_language.as_deref(), Some("Jamfile"));
    assert!(info.is_source);
    assert!(!info.is_script);
    assert_eq!(info.file_type, "Jamfile source, UTF-8 Unicode text");
}
2978
/// An extensionless file with a `node` shebang is labelled as a JavaScript
/// script.
#[test]
fn test_classify_file_info_labels_javascript_shebang_scripts() {
    let script = b"#!/usr/bin/env node\nconsole.log('hello');\n";

    let info = classify_file_info(Path::new("bin/run"), script);

    let language = info.programming_language.as_deref();
    assert_eq!(language, Some("JavaScript"));

    assert!(info.is_script);
    assert_eq!(
        info.file_type,
        "javascript script, UTF-8 Unicode text executable"
    );
}
2996
/// A Python script containing a raw 0xE9 byte is labelled without the
/// "UTF-8 Unicode" wording in its file type.
#[test]
fn test_classify_file_info_uses_non_utf8_text_labels_for_latin1_scripts() {
    let script = b"# coding: latin-1\nprint(\"caf\xe9\")\n";

    let info = classify_file_info(Path::new("script.py"), script);

    let language = info.programming_language.as_deref();
    assert_eq!(language, Some("Python"));

    assert!(info.is_script);
    assert_eq!(info.file_type, "python script, text executable");
}
3011
/// A `.tga` file with textual content is media and text, not binary.
#[test]
fn test_classify_file_info_treats_textual_tga_as_media() {
    let fake_image = b"not really a tga\n";

    let info = classify_file_info(Path::new("texture.tga"), fake_image);

    assert!(info.is_media);
    assert!(info.is_text);
    assert!(!info.is_binary);
}
3020
/// High-bit bytes under a `.ts` name are binary, with no language and no
/// source flag.
#[test]
fn test_classify_file_info_keeps_binaryish_source_extension_out_of_text_path() {
    let high_bytes: [u8; 6] = [0x80, 0x81, 0x82, 0x83, 0x84, 0x85];

    let info = classify_file_info(Path::new("main.ts"), &high_bytes);

    assert!(info.is_binary);
    assert!(!info.is_text);
    assert!(!info.is_source);
    assert_eq!(info.programming_language, None);
}
3031
/// A tiny GIF yields no text and the `None` kind.
#[test]
fn test_extract_text_for_detection_skips_unsupported_image_formats() {
    let tiny_gif = b"GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff,\x00\x00\x00\x00\x01\x00\x01\x00\x00\x02\x02D\x01\x00;";

    let (extracted, extracted_kind) = extract_text_for_detection(Path::new("tiny.gif"), tiny_gif);

    assert!(extracted.is_empty());
    assert_eq!(extracted_kind, ExtractedTextKind::None);
}
3041
/// Table-driven check of the detected language and the `is_source` /
/// `is_script` flags for a shebang script, a Dockerfile, JSON, YAML and a
/// Makefile.
#[test]
fn test_classify_file_info_preserves_language_detection_precedence_matrix() {
    /// One row of the matrix: an input file and the classification expected.
    struct Expectation {
        path: &'static str,
        contents: &'static [u8],
        language: Option<&'static str>,
        is_source: bool,
        is_script: bool,
    }

    let matrix = [
        Expectation {
            path: "bin/run",
            contents: b"#!/usr/bin/env node\nconsole.log('hello');\n",
            language: Some("JavaScript"),
            is_source: true,
            is_script: true,
        },
        Expectation {
            path: "Dockerfile",
            contents: b"FROM scratch\n",
            language: Some("Dockerfile"),
            is_source: true,
            is_script: false,
        },
        Expectation {
            path: "package.json",
            contents: br#"{"name":"demo"}"#,
            language: None,
            is_source: false,
            is_script: false,
        },
        Expectation {
            path: "config.yaml",
            contents: b"key: value\n",
            language: None,
            is_source: false,
            is_script: false,
        },
        Expectation {
            path: "Makefile",
            contents: b"all:\n\techo hi\n",
            language: None,
            is_source: false,
            is_script: false,
        },
    ];

    for row in &matrix {
        let path = Path::new(row.path);
        let classification = classify_file_info(path, row.contents);

        assert_eq!(
            classification.programming_language.as_deref(),
            row.language,
            "unexpected language for {}",
            path.display()
        );
        assert_eq!(
            classification.is_source,
            row.is_source,
            "unexpected is_source for {}",
            path.display()
        );
        assert_eq!(
            classification.is_script,
            row.is_script,
            "unexpected is_script for {}",
            path.display()
        );
    }
}
3105}