1use std::borrow::Cow;
5use std::collections::BTreeSet;
6use std::fs;
7use std::io::{BufReader, Cursor, Read};
8use std::panic::{AssertUnwindSafe, catch_unwind};
9use std::path::Path;
10
11use chrono::{TimeZone, Utc};
12use file_format::{FileFormat, Kind as FileFormatKind};
13use flate2::read::ZlibDecoder;
14use glob::Pattern;
15use image::{ImageDecoder, ImageFormat, ImageReader};
16use mime_guess::from_path;
17use quick_xml::events::Event;
18use quick_xml::reader::Reader as XmlReader;
19
20use crate::parsers::windows_executable::extract_windows_executable_metadata_text;
21use crate::utils::font::extract_font_metadata_text;
22use crate::utils::language::detect_language;
23
/// Identifies which extraction strategy produced the text handed to detection.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtractedTextKind {
    /// No usable text was extracted.
    None,
    /// Text obtained by decoding the file's bytes (also used for RTF content).
    Decoded,
    /// Text derived from font metadata.
    FontMetadata,
    /// Text extracted from a PDF document.
    Pdf,
    /// Printable strings pulled out of otherwise binary content.
    BinaryStrings,
    /// Text derived from image metadata.
    ImageMetadata,
    /// Only Windows executable metadata text was available.
    WindowsExecutableMetadata,
}
34
/// Aggregated classification of a file, as produced by `classify_file_info`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileInfoClassification {
    /// MIME type chosen for the file.
    pub mime_type: String,
    /// Human-readable description of the file type.
    pub file_type: String,
    /// Detected programming language; only populated when `is_source` is true.
    pub programming_language: Option<String>,
    /// Whether the content is considered binary.
    pub is_binary: bool,
    /// Whether the content is considered text (the negation of `is_binary`).
    pub is_text: bool,
    /// Whether the file is an archive, compressed file, or package.
    pub is_archive: bool,
    /// Whether the file is image, audio, or video content.
    pub is_media: bool,
    /// Whether the file is treated as source code.
    pub is_source: bool,
    /// Whether the file is treated as a script.
    pub is_script: bool,
}
47
// NOTE(review): the image-metadata, opaque-binary, XMP and PDF limits below are
// consumed outside this part of the file; the names suggest they are caps on
// count/bytes — confirm at the use sites.
const MAX_IMAGE_METADATA_VALUES: usize = 64;
const MAX_IMAGE_METADATA_TEXT_BYTES: usize = 32 * 1024;
// A buffer counts as binary when more than `len / DIVISOR` of its bytes are
// control characters (see `has_binary_control_chars`).
const BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR: usize = 10;
const LARGE_OPAQUE_BINARY_SKIP_BYTES: usize = 512 * 1024;
// Inputs larger than this are never JSON-validated (see `has_valid_json_text`).
const JSON_VALIDATION_MAX_BYTES: usize = 4 * 1024 * 1024;
const MAX_XMP_PACKET_BYTES: usize = 256 * 1024;
const MAX_PDF_TEXT_EXTRACTION_BYTES: usize = 32 * 1024 * 1024;
// Lower-case extensions treated as plain text; such files are never reported as source.
const PLAIN_TEXT_EXTENSIONS: &[&str] = &[
    "rst", "rest", "md", "txt", "log", "json", "xml", "yaml", "yml", "toml", "ini",
];
// Lower-case extensions that force a binary classification unless the detected
// format is already textual.
const BINARY_EXTENSIONS: &[&str] = &[
    "pyc", "pyo", "pgm", "pbm", "ppm", "mp3", "mp4", "mpeg", "mpg", "emf",
];
// Lower-case extensions that mark a non-text file as an archive.
const ARCHIVE_EXTENSIONS: &[&str] = &[
    "zip", "jar", "war", "ear", "tar", "gz", "tgz", "bz2", "xz", "7z", "rar", "apk", "deb", "rpm",
    "whl", "crate", "egg", "gem", "nupkg", "sqs", "squashfs",
];
65
66pub fn get_creation_date(metadata: &fs::Metadata) -> Option<String> {
68 metadata.modified().ok().map(|time: std::time::SystemTime| {
69 let seconds_since_epoch = time
70 .duration_since(std::time::UNIX_EPOCH)
71 .unwrap()
72 .as_secs() as i64;
73
74 Utc.timestamp_opt(seconds_since_epoch, 0)
75 .single()
76 .unwrap_or_else(Utc::now)
77 .format("%Y-%m-%d")
78 .to_string()
79 })
80}
81
82pub fn is_path_excluded(path: &Path, exclude_patterns: &[Pattern]) -> bool {
84 let path_str = path.to_string_lossy();
85 let file_name = path
86 .file_name()
87 .map(|name| name.to_string_lossy())
88 .unwrap_or_default();
89
90 for pattern in exclude_patterns {
91 if pattern.matches(&path_str) {
93 return true;
94 }
95
96 if pattern.matches(&file_name) {
98 return true;
99 }
100 }
101
102 false
103}
104
105pub fn decode_bytes_to_string(bytes: &[u8]) -> String {
112 if let Some(decoded) = decode_utf16_text(bytes) {
113 return decoded;
114 }
115
116 match String::from_utf8(bytes.to_vec()) {
117 Ok(s) => s,
118 Err(e) => {
119 let bytes = e.into_bytes();
120 if has_binary_control_chars(&bytes) {
121 return String::new();
122 }
123 bytes.iter().map(|&b| b as char).collect()
124 }
125 }
126}
127
128pub fn extract_text_for_detection(path: &Path, bytes: &[u8]) -> (String, ExtractedTextKind) {
129 let (text, kind, _) = extract_text_for_detection_with_diagnostics(path, bytes);
130 (text, kind)
131}
132
133pub(crate) fn augment_license_detection_text<'a>(path: &Path, text: &'a str) -> Cow<'a, str> {
134 let Some(extension) = path.extension().and_then(|ext| ext.to_str()) else {
135 return Cow::Borrowed(text);
136 };
137 if !matches!(
138 extension.to_ascii_lowercase().as_str(),
139 "md" | "markdown" | "html" | "htm"
140 ) {
141 return Cow::Borrowed(text);
142 }
143
144 let mut hints = Vec::new();
145 let has_dual_license_notice = has_dual_license_notice_text(text);
146 if text.contains("CC BY 4.0") || text.contains("creativecommons.org/licenses/by/4.0") {
147 hints.push("Creative Commons Attribution 4.0 International License".to_string());
148 }
149 if !has_dual_license_notice
150 && (text.contains("Apache License (Version 2.0)")
151 || text.contains("Apache License, Version 2.0"))
152 {
153 hints.push(
154 "Licensed under the Apache License, Version 2.0. http://www.apache.org/licenses/LICENSE-2.0"
155 .to_string(),
156 );
157 }
158
159 if !has_dual_license_notice {
160 hints.extend(extract_shields_license_badge_hints(text));
161 }
162
163 if hints.is_empty() {
164 Cow::Borrowed(text)
165 } else {
166 let mut augmented =
167 String::with_capacity(text.len() + hints.iter().map(String::len).sum::<usize>() + 8);
168 augmented.push_str(text);
169 augmented.push_str("\n\n");
170 for (index, hint) in hints.into_iter().enumerate() {
171 if index > 0 {
172 augmented.push('\n');
173 }
174 augmented.push_str(&hint);
175 }
176 Cow::Owned(augmented)
177 }
178}
179
180fn extract_shields_license_badge_hints(text: &str) -> Vec<String> {
181 let mut hints = Vec::new();
182 let mut rest = text;
183 let needle = "img.shields.io/badge/license-";
184
185 while let Some(index) = rest.find(needle) {
186 let start = index + needle.len();
187 let suffix = &rest[start..];
188 let end = suffix
189 .find([')', ']', '"', '\'', ' ', '\n'])
190 .unwrap_or(suffix.len());
191 let badge = &suffix[..end];
192 let Some(badge) = badge.strip_suffix(".svg") else {
193 rest = &suffix[end..];
194 continue;
195 };
196
197 let mut segments: Vec<_> = badge
198 .split('-')
199 .filter(|segment| !segment.is_empty())
200 .collect();
201 if segments.len() < 2 {
202 rest = &suffix[end..];
203 continue;
204 }
205 segments.pop();
206 let candidate = segments.join("-").replace("%20", " ").replace('_', "-");
207 if !candidate.is_empty() {
208 hints.push(canonical_shields_license_hint(&candidate));
209 }
210
211 rest = &suffix[end..];
212 }
213
214 hints.sort();
215 hints.dedup();
216 hints
217}
218
/// Reports whether `text` contains a dual-license notice, compared
/// ASCII-case-insensitively.
///
/// Matches "dual-licensed under", "dual licensed under", or the combination of
/// "licensed under either of" and "at your option".
fn has_dual_license_notice_text(text: &str) -> bool {
    let normalized = text.to_ascii_lowercase();

    if normalized.contains("dual-licensed under") || normalized.contains("dual licensed under") {
        return true;
    }

    normalized.contains("licensed under either of") && normalized.contains("at your option")
}
225
/// Turns a badge license label into a license phrase.
///
/// `MIT` and the two Apache 2.0 spellings get fixed phrases; any other
/// (trimmed) label is suffixed with " License".
fn canonical_shields_license_hint(candidate: &str) -> String {
    let name = candidate.trim();

    if name == "MIT" {
        return String::from("The MIT License");
    }
    if matches!(name, "Apache-2.0" | "Apache 2.0") {
        return String::from("Apache License 2.0");
    }

    format!("{name} License")
}
233
/// Extracts the text used for detection from a file's bytes.
///
/// Returns the extracted text, the strategy that produced it, and an optional
/// diagnostic message (only ever populated by the PDF branch here).
///
/// Strategies are tried in a fixed order and the first applicable one wins:
/// RTF, PDF, image metadata, font metadata, then generic handling (decoded
/// text or printable strings), optionally prefixed with Windows executable
/// metadata text.
pub(crate) fn extract_text_for_detection_with_diagnostics(
    path: &Path,
    bytes: &[u8],
) -> (String, ExtractedTextKind, Option<String>) {
    let ext = path
        .extension()
        .and_then(|e| e.to_str())
        .map(|s| s.to_ascii_lowercase());
    let detected_format = detect_file_format(bytes);

    // RTF: recognized by extension or by the `{\rtf` signature.
    if looks_like_rtf(bytes, ext.as_deref()) {
        let text = extract_rtf_text(bytes);
        return if text.trim().is_empty() {
            (String::new(), ExtractedTextKind::None, None)
        } else {
            (text, ExtractedTextKind::Decoded, None)
        };
    }

    // PDF: recognized by the `%PDF-` signature or by format detection.
    if looks_like_pdf(bytes) || detected_format.short_name() == Some("PDF") {
        let (text, scan_error) = extract_pdf_text(path, bytes);
        return if text.is_empty() {
            // The scan error is surfaced only when no text was produced.
            (String::new(), ExtractedTextKind::None, scan_error)
        } else {
            (text, ExtractedTextKind::Pdf, None)
        };
    }

    // Images with a supported metadata format.
    if let Some(format) = supported_image_metadata_format(ext.as_deref(), detected_format) {
        let text = extract_image_metadata_text(bytes, format);
        return if text.is_empty() {
            if is_supported_image_container(bytes, format) {
                (String::new(), ExtractedTextKind::None, None)
            } else {
                // Not actually a supported image container: fall back to
                // decoding the bytes as text.
                let decoded = decode_bytes_to_string(bytes);
                if decoded.is_empty() {
                    (String::new(), ExtractedTextKind::None, None)
                } else {
                    (decoded, ExtractedTextKind::Decoded, None)
                }
            }
        } else {
            (text, ExtractedTextKind::ImageMetadata, None)
        };
    }

    // Fonts: metadata text, followed by any printable strings found in the file.
    if let Some(text) = extract_font_metadata_text(path, bytes) {
        let strings = extract_printable_strings(bytes);
        let combined = if strings.is_empty() {
            text
        } else {
            combine_extracted_text_fragments(Some(text), strings)
        };
        return (combined, ExtractedTextKind::FontMetadata, None);
    }

    let windows_executable_metadata_text = extract_windows_executable_metadata_text(bytes);
    // Files that yielded Windows executable metadata are never treated as
    // large opaque binaries.
    let large_opaque_binary = windows_executable_metadata_text.is_none()
        && is_large_opaque_binary_candidate(bytes, detected_format);

    if should_skip_large_opaque_binary_text_extraction(path, bytes, detected_format) {
        return windows_metadata_or_empty_result(windows_executable_metadata_text);
    }

    if should_skip_binary_string_extraction(path, bytes, detected_format) {
        return (String::new(), ExtractedTextKind::None, None);
    }

    let is_svg_text = lower_extension(path).as_deref() == Some("svg")
        || detected_format.media_type() == "image/svg+xml";
    let should_try_decoded_text = looks_like_textual_bytes(bytes) || is_svg_text;
    let decoded_is_utf8 = std::str::from_utf8(bytes).is_ok();
    let path_suggests_text = ext.as_deref().is_some_and(|extension| {
        PLAIN_TEXT_EXTENSIONS.contains(&extension) || detect_language(path, bytes).is_some()
    });

    // Prefer a full decode when the bytes look textual; accept it only if some
    // additional signal (SVG, valid UTF-8, text-like path, or text-shaped
    // output) backs it up.
    if !large_opaque_binary && should_try_decoded_text {
        let decoded = decode_bytes_to_string(bytes);
        if !decoded.is_empty()
            && (is_svg_text
                || decoded_is_utf8
                || path_suggests_text
                || looks_like_decoded_text(&decoded))
        {
            let combined =
                combine_extracted_text_fragments(windows_executable_metadata_text, decoded);
            return (combined, ExtractedTextKind::Decoded, None);
        }
    }

    // Last resort: printable strings (sampled for large opaque binaries).
    let text = if large_opaque_binary {
        extract_sampled_printable_strings(bytes)
    } else {
        extract_printable_strings(bytes)
    };
    if text.is_empty() {
        windows_metadata_or_empty_result(windows_executable_metadata_text)
    } else {
        (
            combine_extracted_text_fragments(windows_executable_metadata_text, text),
            ExtractedTextKind::BinaryStrings,
            None,
        )
    }
}
339
/// Joins an optional leading fragment and a trailing fragment with a newline.
///
/// A missing or empty `prefix` yields `suffix` unchanged; an empty `suffix`
/// yields the prefix alone.
fn combine_extracted_text_fragments(prefix: Option<String>, suffix: String) -> String {
    let Some(lead) = prefix.filter(|text| !text.is_empty()) else {
        return suffix;
    };

    if suffix.is_empty() {
        lead
    } else {
        format!("{lead}\n{suffix}")
    }
}
347
348fn windows_metadata_or_empty_result(
349 windows_executable_metadata_text: Option<String>,
350) -> (String, ExtractedTextKind, Option<String>) {
351 if let Some(metadata_text) = windows_executable_metadata_text {
352 (
353 metadata_text,
354 ExtractedTextKind::WindowsExecutableMetadata,
355 None,
356 )
357 } else {
358 (String::new(), ExtractedTextKind::None, None)
359 }
360}
361
362pub fn classify_file_info(path: &Path, bytes: &[u8]) -> FileInfoClassification {
363 let detected_format = detect_file_format(bytes);
364 let detected_language = detect_language(path, bytes);
365 let is_binary = detect_is_binary(path, bytes, detected_format, detected_language.as_deref());
366 let is_text = !is_binary;
367 let mime_type = detect_mime_type(path, bytes, detected_format, detected_language.as_deref());
368 let is_archive = detect_is_archive(path, bytes, &mime_type, is_text, detected_format);
369 let is_media = detect_is_media(path, bytes, &mime_type, detected_format);
370 let is_script = detect_is_script(path, bytes, detected_language.as_deref(), is_text);
371 let is_source = detect_is_source(path, detected_language.as_deref(), is_text, is_script);
372 let programming_language = is_source.then(|| detected_language.clone()).flatten();
373 let file_type = detect_file_type(
374 path,
375 bytes,
376 detected_format,
377 &mime_type,
378 programming_language.as_deref(),
379 is_binary,
380 is_text,
381 is_archive,
382 is_media,
383 is_script,
384 );
385
386 FileInfoClassification {
387 mime_type,
388 file_type,
389 programming_language,
390 is_binary,
391 is_text,
392 is_archive,
393 is_media,
394 is_source,
395 is_script,
396 }
397}
398
399fn detect_file_format(bytes: &[u8]) -> FileFormat {
400 FileFormat::from_reader(Cursor::new(bytes)).unwrap_or(FileFormat::ArbitraryBinaryData)
401}
402
// Two consecutive UTF-8 encodings of U+FFFD (the replacement character).
// NOTE(review): presumably what a UTF-16 BOM turns into after a lossy UTF-8
// conversion — confirm against the producers of such files.
const CORRUPTED_UTF16_BOM_PREFIX: &[u8] = &[0xEF, 0xBF, 0xBD, 0xEF, 0xBF, 0xBD];
404
/// Reports whether `bytes` form a valid UTF-8 sequence (empty input is valid).
fn is_utf8_text(bytes: &[u8]) -> bool {
    matches!(std::str::from_utf8(bytes), Ok(_))
}
408
409fn strip_corrupted_utf16_bom_prefix(bytes: &[u8]) -> &[u8] {
410 bytes
411 .strip_prefix(CORRUPTED_UTF16_BOM_PREFIX)
412 .unwrap_or(bytes)
413}
414
415fn decode_utf16_units(bytes: &[u8], is_le: bool, require_text_shape: bool) -> Option<String> {
416 if bytes.is_empty() || !bytes.len().is_multiple_of(2) {
417 return None;
418 }
419
420 let code_units: Vec<u16> = bytes
421 .chunks_exact(2)
422 .map(|chunk| {
423 if is_le {
424 u16::from_le_bytes([chunk[0], chunk[1]])
425 } else {
426 u16::from_be_bytes([chunk[0], chunk[1]])
427 }
428 })
429 .collect();
430
431 let decoded = std::char::decode_utf16(code_units)
432 .collect::<Result<String, _>>()
433 .ok()?;
434
435 if !require_text_shape {
436 return (!decoded.contains('\0')).then_some(decoded);
437 }
438
439 if !looks_like_decoded_text(&decoded) {
440 return None;
441 }
442
443 Some(decoded)
444}
445
/// Heuristically decides whether a decoded string has the shape of text.
///
/// Any control character other than `\n`, `\r` or `\t` disqualifies the
/// string, as does having fewer than three characters. Otherwise the
/// alphabetic, whitespace and structural-punctuation characters together must
/// cover all but (roughly) a fifth of the characters, and the string must
/// contain at least one alphabetic character or two punctuation characters.
fn looks_like_decoded_text(decoded: &str) -> bool {
    // Punctuation that commonly appears in code, markup and prose.
    const STRUCTURAL_PUNCTUATION: &str = "{}[]<>():;,\"'/=-_#!";

    let mut total = 0usize;
    let mut alpha = 0usize;
    let mut punctuation = 0usize;
    let mut whitespace = 0usize;

    for ch in decoded.chars() {
        if ch.is_control() && !matches!(ch, '\n' | '\r' | '\t') {
            return false;
        }

        total += 1;
        if ch.is_alphabetic() {
            alpha += 1;
        }
        if ch.is_whitespace() {
            whitespace += 1;
        }
        if STRUCTURAL_PUNCTUATION.contains(ch) {
            punctuation += 1;
        }
    }

    if total < 3 {
        return false;
    }

    let textish = alpha + punctuation + whitespace;
    textish + (total / 5) >= total && (alpha > 0 || punctuation >= 2)
}
494
495fn detect_utf16_endianness(bytes: &[u8]) -> Option<bool> {
496 let stripped = strip_corrupted_utf16_bom_prefix(bytes);
497 if stripped.len() < 4 || !stripped.len().is_multiple_of(2) {
498 return None;
499 }
500
501 let pair_count = stripped.len() / 2;
502 let even_zero = stripped.iter().step_by(2).filter(|&&b| b == 0).count();
503 let odd_zero = stripped
504 .iter()
505 .skip(1)
506 .step_by(2)
507 .filter(|&&b| b == 0)
508 .count();
509
510 let looks_like_be = even_zero * 3 >= pair_count && odd_zero * 6 <= pair_count;
511 let looks_like_le = odd_zero * 3 >= pair_count && even_zero * 6 <= pair_count;
512
513 match (looks_like_le, looks_like_be) {
514 (true, false) => Some(true),
515 (false, true) => Some(false),
516 (true, true) => Some(true),
517 (false, false) => None,
518 }
519}
520
521fn decode_utf16_text(bytes: &[u8]) -> Option<String> {
522 if let Some(decoded) = decode_utf16_bom_text(bytes) {
523 return Some(decoded);
524 }
525
526 let stripped = strip_corrupted_utf16_bom_prefix(bytes);
527 match detect_utf16_endianness(bytes) {
528 Some(true) => decode_utf16_units(stripped, true, true),
529 Some(false) => decode_utf16_units(stripped, false, true),
530 None => None,
531 }
532}
533
534fn decode_utf16_json_text(bytes: &[u8]) -> Option<String> {
535 if bytes.len() >= 2 {
536 let (is_le, body) = match bytes {
537 [0xFF, 0xFE, rest @ ..] => (true, rest),
538 [0xFE, 0xFF, rest @ ..] => (false, rest),
539 _ => {
540 let stripped = strip_corrupted_utf16_bom_prefix(bytes);
541 return match detect_utf16_endianness(bytes) {
542 Some(true) => decode_utf16_units(stripped, true, false),
543 Some(false) => decode_utf16_units(stripped, false, false),
544 None => None,
545 };
546 }
547 };
548
549 if body.is_empty() || !body.len().is_multiple_of(2) {
550 return None;
551 }
552
553 return decode_utf16_units(body, is_le, false);
554 }
555
556 None
557}
558
559fn decode_utf16_bom_text(bytes: &[u8]) -> Option<String> {
560 if bytes.len() < 2 || !bytes.len().is_multiple_of(2) {
561 return None;
562 }
563
564 let (is_le, body) = match bytes {
565 [0xFF, 0xFE, rest @ ..] => (true, rest),
566 [0xFE, 0xFF, rest @ ..] => (false, rest),
567 _ => return None,
568 };
569
570 if body.is_empty() || body.len() % 2 != 0 {
571 return None;
572 }
573
574 decode_utf16_units(body, is_le, true)
575}
576
577fn has_binary_control_chars(bytes: &[u8]) -> bool {
578 let control_count = bytes
579 .iter()
580 .filter(|&&b| b < 0x09 || (b > 0x0D && b < 0x20))
581 .count();
582 control_count > bytes.len() / BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR
583}
584
585fn has_decodable_text(bytes: &[u8]) -> bool {
586 bytes.is_empty()
587 || is_utf8_text(bytes)
588 || decode_utf16_text(bytes).is_some()
589 || !has_binary_control_chars(bytes)
590}
591
592fn looks_like_textual_bytes(bytes: &[u8]) -> bool {
593 if bytes.is_empty() || is_utf8_text(bytes) {
594 return true;
595 }
596 if let Some(decoded) = decode_utf16_text(bytes) {
597 return decoded
598 .chars()
599 .any(|ch| !ch.is_control() || matches!(ch, '\n' | '\r' | '\t'));
600 }
601
602 let printable_count = bytes
603 .iter()
604 .filter(|&&b| matches!(b, b'\n' | b'\r' | b'\t') || (0x20..=0x7e).contains(&b))
605 .count();
606 printable_count * 2 >= bytes.len()
607}
608
/// Reports whether a media type denotes textual content: anything under
/// `text/`, plain JSON/XML application types, and `+json` / `+xml` suffixes.
fn is_textual_media_type(media_type: &str) -> bool {
    if media_type.starts_with("text/") {
        return true;
    }
    if media_type == "application/json"
        || media_type == "application/xml"
        || media_type == "text/xml"
    {
        return true;
    }

    media_type.ends_with("+json") || media_type.ends_with("+xml")
}
618
619fn is_textual_format(detected_format: FileFormat) -> bool {
620 matches!(detected_format, FileFormat::Empty | FileFormat::PlainText)
621 || is_textual_media_type(detected_format.media_type())
622}
623
624fn is_known_binary_format(detected_format: FileFormat) -> bool {
625 !matches!(detected_format, FileFormat::ArbitraryBinaryData)
626 && !is_textual_format(detected_format)
627}
628
629pub fn detect_mime_type(
630 path: &Path,
631 bytes: &[u8],
632 detected_format: FileFormat,
633 programming_language: Option<&str>,
634) -> String {
635 if bytes.is_empty() {
636 return "inode/x-empty".to_string();
637 }
638
639 if lower_extension(path).as_deref() == Some("json") {
640 if let Some(is_binary) = json_binary_override(bytes) {
641 if is_binary {
642 return "application/octet-stream".to_string();
643 }
644 if has_valid_json_text(bytes) {
645 return "application/json".to_string();
646 }
647 return "text/plain".to_string();
648 }
649 if has_valid_json_text(bytes) {
650 return "application/json".to_string();
651 }
652 if has_decodable_text(bytes) && looks_like_textual_bytes(bytes) {
653 return "text/plain".to_string();
654 }
655 return "application/octet-stream".to_string();
656 }
657
658 if is_zip_archive(bytes) {
659 return detect_zip_like_mime(path);
660 }
661
662 if looks_like_deb(bytes, path) {
663 return "application/vnd.debian.binary-package".to_string();
664 }
665
666 if looks_like_rpm(bytes, path) {
667 return "application/x-rpm".to_string();
668 }
669
670 let guessed_mime = from_path(path)
671 .first_or_octet_stream()
672 .essence_str()
673 .to_string();
674
675 let mime_type = match detected_format {
676 FileFormat::Empty => "inode/x-empty".to_string(),
677 FileFormat::PlainText => {
678 if guessed_mime == "application/octet-stream" || guessed_mime.starts_with("video/") {
679 "text/plain".to_string()
680 } else {
681 guessed_mime.clone()
682 }
683 }
684 _ => {
685 let detected_mime = detected_format.media_type();
686 if detected_mime == "application/octet-stream"
687 && guessed_mime != "application/octet-stream"
688 {
689 guessed_mime.clone()
690 } else {
691 detected_mime.to_string()
692 }
693 }
694 };
695
696 normalize_mime_type(path, bytes, programming_language, &mime_type)
697}
698
699fn normalize_mime_type(
700 path: &Path,
701 bytes: &[u8],
702 programming_language: Option<&str>,
703 mime_type: &str,
704) -> String {
705 if should_prefer_text_mime(path, bytes, programming_language, mime_type) {
706 return "text/plain".to_string();
707 }
708
709 mime_type.to_string()
710}
711
712fn should_prefer_text_mime(
713 path: &Path,
714 bytes: &[u8],
715 programming_language: Option<&str>,
716 mime_type: &str,
717) -> bool {
718 has_decodable_text(bytes)
719 && looks_like_textual_bytes(bytes)
720 && is_textual_source_candidate(path, programming_language)
721 && (mime_type.starts_with("video/") || mime_type == "application/octet-stream")
722}
723
724fn has_valid_json_text(bytes: &[u8]) -> bool {
725 if bytes.len() > JSON_VALIDATION_MAX_BYTES {
726 return false;
727 }
728
729 serde_json::from_slice::<serde_json::Value>(bytes).is_ok()
730 || decode_utf16_json_text(bytes)
731 .and_then(|text| serde_json::from_str::<serde_json::Value>(&text).ok())
732 .is_some()
733}
734
/// Reports whether `bytes` have the shape `["…"]`: at least eight bytes,
/// starting with `["`, ending with `"]`, and containing no 0x00 or 0xFF byte.
fn is_wrapped_invalid_json_string_text(bytes: &[u8]) -> bool {
    if bytes.len() < 8 {
        return false;
    }

    let is_wrapped = bytes.starts_with(b"[\"") && bytes.ends_with(b"\"]");
    is_wrapped && !bytes.iter().any(|&byte| byte == 0 || byte == 0xFF)
}
742
743fn json_binary_override(bytes: &[u8]) -> Option<bool> {
744 if has_valid_json_text(bytes) {
745 return Some(false);
746 }
747
748 if bytes.contains(&0) {
749 return Some(true);
750 }
751
752 if bytes.contains(&0xFF) && (bytes.len() <= 5 || bytes.len() > 1024) {
753 return Some(true);
754 }
755
756 if is_wrapped_invalid_json_string_text(bytes) {
757 return Some(false);
758 }
759
760 None
761}
762
763fn detect_is_binary(
764 path: &Path,
765 bytes: &[u8],
766 detected_format: FileFormat,
767 programming_language: Option<&str>,
768) -> bool {
769 if lower_extension(path).as_deref() == Some("json")
770 && let Some(is_binary) = json_binary_override(bytes)
771 {
772 return is_binary;
773 }
774
775 if is_textual_format(detected_format) {
776 return false;
777 }
778
779 if lower_extension(path)
780 .as_deref()
781 .is_some_and(|ext| BINARY_EXTENSIONS.contains(&ext))
782 {
783 return true;
784 }
785
786 if should_treat_binary_bytes_as_text(path, bytes, programming_language) {
787 return false;
788 }
789
790 has_binary_control_chars(bytes)
791 || is_known_binary_format(detected_format)
792 || (matches!(detected_format, FileFormat::ArbitraryBinaryData)
793 && !looks_like_textual_bytes(bytes))
794}
795
796fn should_treat_binary_bytes_as_text(
797 path: &Path,
798 bytes: &[u8],
799 programming_language: Option<&str>,
800) -> bool {
801 has_decodable_text(bytes)
802 && looks_like_textual_bytes(bytes)
803 && (bytes.starts_with(b"#!") || is_textual_source_candidate(path, programming_language))
804}
805
806fn detect_is_archive(
807 path: &Path,
808 bytes: &[u8],
809 mime_type: &str,
810 is_text: bool,
811 detected_format: FileFormat,
812) -> bool {
813 if is_text {
814 return false;
815 }
816
817 lower_extension(path)
818 .as_deref()
819 .is_some_and(|ext| ARCHIVE_EXTENSIONS.contains(&ext))
820 || matches!(
821 detected_format.kind(),
822 FileFormatKind::Archive | FileFormatKind::Compressed | FileFormatKind::Package
823 )
824 || is_zip_archive(bytes)
825 || looks_like_gzip(bytes)
826 || looks_like_bzip2(bytes)
827 || looks_like_xz(bytes)
828 || looks_like_deb(bytes, path)
829 || looks_like_rpm(bytes, path)
830 || looks_like_squashfs(bytes, path)
831 || mime_type.contains("zip")
832 || mime_type.contains("compressed")
833 || mime_type.contains("tar")
834 || mime_type.contains("x-rpm")
835 || mime_type.contains("debian")
836}
837
838fn detect_is_media(
839 path: &Path,
840 bytes: &[u8],
841 mime_type: &str,
842 detected_format: FileFormat,
843) -> bool {
844 media_mime_from_content(bytes).is_some()
845 || matches!(
846 detected_format.kind(),
847 FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
848 )
849 || mime_type.starts_with("image/")
850 || mime_type.starts_with("audio/")
851 || mime_type.starts_with("video/")
852 || (mime_type == "application/octet-stream"
853 && lower_extension(path).as_deref() == Some("tga")
854 && !has_binary_control_chars(bytes))
855}
856
857fn detect_is_script(
858 path: &Path,
859 bytes: &[u8],
860 programming_language: Option<&str>,
861 is_text: bool,
862) -> bool {
863 if !is_text || is_makefile(path) {
864 return false;
865 }
866
867 bytes.starts_with(b"#!")
868 || lower_extension(path).as_deref().is_some_and(|ext| {
869 matches!(
870 ext,
871 "sh" | "bash" | "zsh" | "fish" | "ksh" | "ps1" | "psm1" | "psd1" | "awk"
872 )
873 })
874 || matches!(
875 programming_language,
876 Some(
877 "Shell"
878 | "Bash"
879 | "Zsh"
880 | "Fish"
881 | "Ksh"
882 | "Python"
883 | "Ruby"
884 | "Perl"
885 | "PHP"
886 | "PowerShell"
887 | "Awk"
888 )
889 )
890}
891
892fn detect_is_source(
893 path: &Path,
894 programming_language: Option<&str>,
895 is_text: bool,
896 is_script: bool,
897) -> bool {
898 if !is_text || is_plain_text(path) || is_makefile(path) || is_source_map(path) {
899 return false;
900 }
901
902 if is_c_like_source(path) || is_java_like_source(path) {
903 return true;
904 }
905
906 programming_language.is_some() || is_script
907}
908
909#[allow(clippy::too_many_arguments)]
910fn detect_file_type(
911 path: &Path,
912 bytes: &[u8],
913 detected_format: FileFormat,
914 mime_type: &str,
915 programming_language: Option<&str>,
916 is_binary: bool,
917 is_text: bool,
918 is_archive: bool,
919 is_media: bool,
920 is_script: bool,
921) -> String {
922 if bytes.is_empty() {
923 return "empty".to_string();
924 }
925
926 if looks_like_pdf(bytes) {
927 return "PDF document".to_string();
928 }
929
930 if let Some(file_type) = media_file_type_from_content(bytes) {
931 return file_type.to_string();
932 }
933
934 if is_archive {
935 return archive_file_type(path, bytes, detected_format);
936 }
937
938 if is_script {
939 return script_file_type(programming_language, bytes);
940 }
941
942 if is_text {
943 if lower_extension(path).as_deref() == Some("json") {
944 if has_valid_json_text(bytes) {
945 return "JSON text data".to_string();
946 }
947 return text_file_type(bytes);
948 }
949 if lower_extension(path).as_deref() == Some("xml") {
950 return "XML text data".to_string();
951 }
952 if matches!(lower_extension(path).as_deref(), Some("yaml" | "yml")) {
953 return "YAML text data".to_string();
954 }
955 if lower_extension(path).as_deref() == Some("toml") {
956 return "TOML text data".to_string();
957 }
958 if matches!(
959 lower_extension(path).as_deref(),
960 Some("ini" | "cfg" | "conf")
961 ) {
962 return "INI text data".to_string();
963 }
964 if matches!(lower_file_name(path).as_str(), ".gitmodules" | ".gitconfig") {
965 return "Git configuration text".to_string();
966 }
967 if matches!(lower_extension(path).as_deref(), Some("md" | "markdown")) {
968 return text_file_type(bytes);
969 }
970 if programming_language.is_some() && !is_media {
971 return source_file_type(programming_language, bytes);
972 }
973 return text_file_type(bytes);
974 }
975
976 if let Some(file_type) = format_based_file_type(detected_format) {
977 return file_type;
978 }
979
980 if is_binary && mime_type == "application/octet-stream" {
981 return "data".to_string();
982 }
983
984 mime_type.to_string()
985}
986
987fn is_textual_source_candidate(path: &Path, programming_language: Option<&str>) -> bool {
988 if matches!(programming_language, Some(language) if is_source_like_language(language)) {
989 return true;
990 }
991
992 if matches!(
993 lower_file_name(path).as_str(),
994 "dockerfile"
995 | "containerfile"
996 | "containerfile.core"
997 | "apkbuild"
998 | "podfile"
999 | "jamfile"
1000 | "jamroot"
1001 | "meson.build"
1002 | "build"
1003 | "workspace"
1004 | "buck"
1005 | "default.nix"
1006 | "flake.nix"
1007 | "shell.nix"
1008 ) {
1009 return true;
1010 }
1011
1012 path.extension()
1013 .and_then(|ext| ext.to_str())
1014 .is_some_and(|ext| {
1015 matches!(
1016 ext.to_ascii_lowercase().as_str(),
1017 "rs" | "py"
1018 | "js"
1019 | "mjs"
1020 | "cjs"
1021 | "jsx"
1022 | "ts"
1023 | "mts"
1024 | "cts"
1025 | "tsx"
1026 | "c"
1027 | "cpp"
1028 | "cc"
1029 | "cxx"
1030 | "h"
1031 | "hpp"
1032 | "m"
1033 | "mm"
1034 | "s"
1035 | "asm"
1036 | "java"
1037 | "go"
1038 | "rb"
1039 | "php"
1040 | "pl"
1041 | "swift"
1042 | "sh"
1043 | "bash"
1044 | "zsh"
1045 | "fish"
1046 | "ksh"
1047 | "ps1"
1048 | "psm1"
1049 | "psd1"
1050 | "awk"
1051 | "kt"
1052 | "kts"
1053 | "dart"
1054 | "scala"
1055 | "groovy"
1056 | "gradle"
1057 | "gvy"
1058 | "gy"
1059 | "gsh"
1060 | "cs"
1061 | "fs"
1062 | "fsx"
1063 | "r"
1064 | "lua"
1065 | "jl"
1066 | "ex"
1067 | "exs"
1068 | "clj"
1069 | "cljs"
1070 | "cljc"
1071 | "hs"
1072 | "erl"
1073 | "nix"
1074 | "zig"
1075 | "bzl"
1076 | "bazel"
1077 | "star"
1078 | "sky"
1079 | "ml"
1080 | "mli"
1081 | "tex"
1082 )
1083 })
1084}
1085
/// Reports whether `language` is one of the language names treated as source
/// code. The comparison is exact and case-sensitive.
fn is_source_like_language(language: &str) -> bool {
    const SOURCE_LANGUAGES: &[&str] = &[
        "Rust",
        "Python",
        "JavaScript",
        "TypeScript",
        "JavaScript/TypeScript",
        "C",
        "C++",
        "Objective-C",
        "Objective-C++",
        "GAS",
        "Java",
        "Go",
        "Ruby",
        "PHP",
        "Perl",
        "Swift",
        "Shell",
        "PowerShell",
        "Awk",
        "Kotlin",
        "Dart",
        "Scala",
        "C#",
        "F#",
        "R",
        "Lua",
        "Julia",
        "Elixir",
        "Clojure",
        "Haskell",
        "Erlang",
        "Groovy",
        "Nix",
        "Zig",
        "Starlark",
        "OCaml",
        "Meson",
        "TeX",
        "Dockerfile",
        "Makefile",
        "Jamfile",
    ];

    SOURCE_LANGUAGES.contains(&language)
}
1132
/// Returns the path's extension as `&str`, or `None` when there is no
/// extension or it is not valid Unicode.
fn extension(path: &Path) -> Option<&str> {
    let raw = path.extension()?;
    raw.to_str()
}
1136
/// Returns the path's extension converted to ASCII lower case, or `None` when
/// there is no extension or it is not valid Unicode.
fn lower_extension(path: &Path) -> Option<String> {
    let raw = path.extension()?.to_str()?;
    Some(raw.to_ascii_lowercase())
}
1140
/// Returns the final path component in ASCII lower case, or an empty string
/// when the path has no file name or it is not valid Unicode.
fn lower_file_name(path: &Path) -> String {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name.to_ascii_lowercase(),
        None => String::new(),
    }
}
1147
1148fn is_plain_text(path: &Path) -> bool {
1149 lower_extension(path)
1150 .as_deref()
1151 .is_some_and(|ext| PLAIN_TEXT_EXTENSIONS.contains(&ext))
1152}
1153
1154fn is_makefile(path: &Path) -> bool {
1155 matches!(lower_file_name(path).as_str(), "makefile" | "makefile.inc")
1156}
1157
/// Reports whether the path ends in `.js.map` or `.css.map`
/// (ASCII-case-insensitively).
fn is_source_map(path: &Path) -> bool {
    let lowered = path.to_string_lossy().to_ascii_lowercase();

    [".js.map", ".css.map"]
        .iter()
        .any(|suffix| lowered.ends_with(*suffix))
}
1162
1163fn is_c_like_source(path: &Path) -> bool {
1164 lower_extension(path).as_deref().is_some_and(|ext| {
1165 matches!(
1166 ext,
1167 "c" | "cc"
1168 | "cp"
1169 | "cpp"
1170 | "cxx"
1171 | "c++"
1172 | "h"
1173 | "hh"
1174 | "hpp"
1175 | "hxx"
1176 | "h++"
1177 | "i"
1178 | "ii"
1179 | "m"
1180 | "s"
1181 | "asm"
1182 )
1183 })
1184}
1185
1186fn is_java_like_source(path: &Path) -> bool {
1187 lower_extension(path)
1188 .as_deref()
1189 .is_some_and(|ext| matches!(ext, "java" | "aj" | "jad" | "ajt"))
1190}
1191
1192fn format_based_file_type(detected_format: FileFormat) -> Option<String> {
1193 match detected_format {
1194 FileFormat::ArbitraryBinaryData | FileFormat::Empty | FileFormat::PlainText => None,
1195 format if format.short_name() == Some("PDF") => Some("PDF document".to_string()),
1196 format => Some(match format.kind() {
1197 FileFormatKind::Image => short_name_or_name(&format, "image data"),
1198 FileFormatKind::Audio => short_name_or_name(&format, "audio data"),
1199 FileFormatKind::Video => short_name_or_name(&format, "video data"),
1200 _ => format.name().to_string(),
1201 }),
1202 }
1203}
1204
1205fn short_name_or_name(format: &FileFormat, suffix: &str) -> String {
1206 format
1207 .short_name()
1208 .map(|short_name| format!("{short_name} {suffix}"))
1209 .unwrap_or_else(|| format!("{} {suffix}", format.name()))
1210}
1211
1212fn detect_zip_like_mime(path: &Path) -> String {
1213 match extension(path).map(|ext| ext.to_ascii_lowercase()) {
1214 Some(ext) if ext == "apk" => "application/vnd.android.package-archive".to_string(),
1215 Some(ext) if matches!(ext.as_str(), "jar" | "war" | "ear") => {
1216 "application/java-archive".to_string()
1217 }
1218 _ => "application/zip".to_string(),
1219 }
1220}
1221
/// Sniffs the leading magic bytes of `bytes` and returns the MIME type for
/// PNG, JPEG, TIFF (either byte order) or WebP content, or `None` when no
/// signature matches.
fn media_mime_from_content(bytes: &[u8]) -> Option<&'static str> {
    match bytes {
        [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n', ..] => Some("image/png"),
        [0xff, 0xd8, 0xff, ..] => Some("image/jpeg"),
        [b'I', b'I', 0x2a, 0x00, ..] | [b'M', b'M', 0x00, 0x2a, ..] => Some("image/tiff"),
        // RIFF container: four size bytes sit between the "RIFF" and "WEBP" tags.
        [b'R', b'I', b'F', b'F', _, _, _, _, b'W', b'E', b'B', b'P', ..] => Some("image/webp"),
        _ => None,
    }
}
1235
/// Sniffs the leading magic bytes of `bytes` and returns a descriptive
/// file-type label for PNG, JPEG, TIFF (either byte order) or WebP content,
/// or `None` when no signature matches.
fn media_file_type_from_content(bytes: &[u8]) -> Option<&'static str> {
    match bytes {
        [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n', ..] => Some("PNG image data"),
        [0xff, 0xd8, 0xff, ..] => Some("JPEG image data"),
        [b'I', b'I', 0x2a, 0x00, ..] | [b'M', b'M', 0x00, 0x2a, ..] => Some("TIFF image data"),
        // RIFF container: four size bytes sit between the "RIFF" and "WEBP" tags.
        [b'R', b'I', b'F', b'F', _, _, _, _, b'W', b'E', b'B', b'P', ..] => {
            Some("WebP image data")
        }
        _ => None,
    }
}
1249
/// Reports whether `bytes` begins with the `%PDF-` header.
fn looks_like_pdf(bytes: &[u8]) -> bool {
    matches!(bytes, [b'%', b'P', b'D', b'F', b'-', ..])
}
1253
/// Reports whether the content should be treated as RTF: either the supplied
/// extension is exactly `rtf`, or the bytes start with `{\rtf`.
fn looks_like_rtf(bytes: &[u8], ext: Option<&str>) -> bool {
    if matches!(ext, Some("rtf")) {
        return true;
    }
    bytes.starts_with(b"{\\rtf")
}
1257
/// Produces a best-effort plain-text rendering of an RTF document.
///
/// The input is decoded lossily as UTF-8 and scanned character by character:
///
/// * group braces `{` / `}` are dropped;
/// * `\\`, `\{` and `\}` emit the escaped character;
/// * `\'hh` emits the byte `hh` as the Unicode scalar with the same value;
/// * a small set of control words (`\par`, `\line`, `\tab`, dashes, bullet,
///   quotes) emit their textual equivalent;
/// * `\uN` emits the Unicode scalar `N` (negative values are offset by
///   65 536) and then skips its fallback representation;
/// * every other control word is discarded, and every other character is
///   copied through.
///
/// Finally `\r` and form-feed are turned into `\n` and trailing whitespace is
/// removed from each line.
///
/// The `\uN` fallback may be a `\'hh` escape (as in `\u1087\'3f`). That whole
/// escape is skipped so the substitute character is not emitted next to the
/// real one.
fn extract_rtf_text(bytes: &[u8]) -> String {
    let text = String::from_utf8_lossy(bytes);
    let chars: Vec<char> = text.chars().collect();
    let mut output = String::new();
    let mut index = 0usize;

    while index < chars.len() {
        match chars[index] {
            '{' | '}' => {
                index += 1;
            }
            '\\' => {
                index += 1;
                if index >= chars.len() {
                    break;
                }

                match chars[index] {
                    '\\' | '{' | '}' => {
                        output.push(chars[index]);
                        index += 1;
                    }
                    '\'' => {
                        // `\'hh`: two hex digits must follow the apostrophe.
                        if index + 2 < chars.len() {
                            let hex: String = chars[index + 1..=index + 2].iter().collect();
                            if let Ok(value) = u8::from_str_radix(&hex, 16) {
                                output.push(char::from(value));
                                index += 3;
                                continue;
                            }
                        }
                        index += 1;
                    }
                    control if control.is_ascii_alphabetic() => {
                        let start = index;
                        while index < chars.len() && chars[index].is_ascii_alphabetic() {
                            index += 1;
                        }
                        let control_word: String = chars[start..index].iter().collect();

                        // Optional signed decimal parameter directly after the word.
                        let number_start = index;
                        if index < chars.len()
                            && (chars[index] == '-' || chars[index].is_ascii_digit())
                        {
                            index += 1;
                            while index < chars.len() && chars[index].is_ascii_digit() {
                                index += 1;
                            }
                        }
                        let parameter: String = chars[number_start..index].iter().collect();

                        // A single space terminates the control word and is not text.
                        if index < chars.len() && chars[index] == ' ' {
                            index += 1;
                        }

                        match control_word.as_str() {
                            "par" | "line" => output.push('\n'),
                            "tab" => output.push('\t'),
                            "emdash" => output.push('—'),
                            "endash" => output.push('–'),
                            "bullet" => output.push('•'),
                            "lquote" | "rquote" => output.push('\''),
                            "ldblquote" | "rdblquote" => output.push('"'),
                            "u" => {
                                if let Ok(codepoint) = parameter.parse::<i32>() {
                                    // RTF stores values above 32767 as negative 16-bit numbers.
                                    let normalized = if codepoint < 0 {
                                        codepoint + 65_536
                                    } else {
                                        codepoint
                                    };
                                    if let Some(ch) =
                                        u32::try_from(normalized).ok().and_then(char::from_u32)
                                    {
                                        output.push(ch);
                                    }
                                }

                                // Skip the single fallback character that follows `\uN`
                                // (this assumes the default `\uc1`; `\ucN` is not tracked).
                                let fallback_is_hex_escape = index + 3 < chars.len()
                                    && chars[index] == '\\'
                                    && chars[index + 1] == '\''
                                    && chars[index + 2].is_ascii_hexdigit()
                                    && chars[index + 3].is_ascii_hexdigit();
                                if fallback_is_hex_escape {
                                    index += 4;
                                } else if index < chars.len()
                                    && !matches!(chars[index], '\\' | '{' | '}' | '\n' | '\r')
                                {
                                    index += 1;
                                }
                            }
                            _ => {}
                        }
                    }
                    _ => {
                        index += 1;
                    }
                }
            }
            ch => {
                output.push(ch);
                index += 1;
            }
        }
    }

    output
        .replace(['\r', '\u{0c}'], "\n")
        .lines()
        .map(str::trim_end)
        .collect::<Vec<_>>()
        .join("\n")
}
1364
/// Reports whether `bytes` begins with the two-byte gzip magic `1f 8b`.
fn looks_like_gzip(bytes: &[u8]) -> bool {
    matches!(bytes, [0x1f, 0x8b, ..])
}
1368
/// Reports whether `bytes` begins with the bzip2 magic `BZh`.
fn looks_like_bzip2(bytes: &[u8]) -> bool {
    matches!(bytes, [b'B', b'Z', b'h', ..])
}
1372
/// Reports whether `bytes` begins with the six-byte XZ magic
/// `fd 37 7a 58 5a 00`.
fn looks_like_xz(bytes: &[u8]) -> bool {
    matches!(bytes, [0xfd, b'7', b'z', b'X', b'Z', 0x00, ..])
}
1376
1377fn looks_like_deb(bytes: &[u8], path: &Path) -> bool {
1378 lower_extension(path).as_deref() == Some("deb") && bytes.starts_with(b"!<arch>\n")
1379}
1380
1381fn looks_like_rpm(bytes: &[u8], path: &Path) -> bool {
1382 lower_extension(path).as_deref() == Some("rpm") && bytes.starts_with(&[0xed, 0xab, 0xee, 0xdb])
1383}
1384
1385fn looks_like_squashfs(bytes: &[u8], path: &Path) -> bool {
1386 lower_extension(path)
1387 .as_deref()
1388 .is_some_and(|ext| matches!(ext, "sqs" | "squashfs"))
1389 && (bytes.starts_with(&[0x68, 0x73, 0x71, 0x73])
1390 || bytes.starts_with(&[0x73, 0x71, 0x73, 0x68]))
1391}
1392
1393fn archive_file_type(path: &Path, bytes: &[u8], detected_format: FileFormat) -> String {
1394 if looks_like_deb(bytes, path) {
1395 "debian binary package (format 2.0)".to_string()
1396 } else if looks_like_rpm(bytes, path) {
1397 "RPM package".to_string()
1398 } else if looks_like_squashfs(bytes, path) {
1399 "Squashfs filesystem".to_string()
1400 } else if looks_like_gzip(bytes) {
1401 "gzip compressed data".to_string()
1402 } else if looks_like_bzip2(bytes) {
1403 "bzip2 compressed data".to_string()
1404 } else if looks_like_xz(bytes) {
1405 "XZ compressed data".to_string()
1406 } else if is_zip_archive(bytes) {
1407 "Zip archive data".to_string()
1408 } else if lower_extension(path).as_deref() == Some("gem") {
1409 "POSIX tar archive".to_string()
1410 } else if let Some(file_type) = format_based_file_type(detected_format) {
1411 file_type
1412 } else {
1413 "archive data".to_string()
1414 }
1415}
1416
1417fn script_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1418 let suffix = text_executable_label(bytes);
1419
1420 match programming_language {
1421 Some("Python") => format!("python script, {suffix}"),
1422 Some("Ruby") => format!("ruby script, {suffix}"),
1423 Some("Perl") => format!("perl script, {suffix}"),
1424 Some("PHP") => format!("php script, {suffix}"),
1425 Some("Shell") => format!("shell script, {suffix}"),
1426 Some("Bash") => format!("bash script, {suffix}"),
1427 Some("Zsh") => format!("zsh script, {suffix}"),
1428 Some("Fish") => format!("fish script, {suffix}"),
1429 Some("Ksh") => format!("ksh script, {suffix}"),
1430 Some("JavaScript") => format!("javascript script, {suffix}"),
1431 Some("TypeScript") => format!("typescript script, {suffix}"),
1432 Some("PowerShell") => format!("powershell script, {suffix}"),
1433 Some("Awk") => format!("awk script, {suffix}"),
1434 _ => format!("script, {suffix}"),
1435 }
1436}
1437
1438fn source_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1439 let suffix = text_label(bytes);
1440 match programming_language {
1441 Some("C") => format!("C source, {suffix}"),
1442 Some("C++") => format!("C++ source, {suffix}"),
1443 Some("Java") => format!("Java source, {suffix}"),
1444 Some("C#") => format!("C# source, {suffix}"),
1445 Some("F#") => format!("F# source, {suffix}"),
1446 Some("Go") => format!("Go source, {suffix}"),
1447 Some("Rust") => format!("Rust source, {suffix}"),
1448 Some("Starlark") => format!("Starlark source, {suffix}"),
1449 Some("CMake") => format!("CMake source, {suffix}"),
1450 Some("Meson") => format!("Meson source, {suffix}"),
1451 Some("Nix") => format!("Nix source, {suffix}"),
1452 Some("Groovy") => format!("Groovy source, {suffix}"),
1453 Some("Makefile") => format!("Makefile source, {suffix}"),
1454 Some("Dockerfile") => format!("Dockerfile source, {suffix}"),
1455 Some("Jamfile") => format!("Jamfile source, {suffix}"),
1456 Some("Batchfile") => format!("Batchfile source, {suffix}"),
1457 Some(language) => format!("{language} source, {suffix}"),
1458 None => text_file_type(bytes),
1459 }
1460}
1461
1462fn text_file_type(bytes: &[u8]) -> String {
1463 text_label(bytes).to_string()
1464}
1465
/// Describes textual content from two observations: whether `bytes` is valid
/// UTF-8 and whether it contains at least one `\n` byte.
fn text_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.contains(&b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text",
        (true, false) => "UTF-8 Unicode text, with no line terminators",
        (false, true) => "text",
        (false, false) => "text, with no line terminators",
    }
}
1479
/// Describes executable textual content from two observations: whether
/// `bytes` is valid UTF-8 and whether it contains at least one `\n` byte.
fn text_executable_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.contains(&b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text executable",
        (true, false) => "UTF-8 Unicode text executable, with no line terminators",
        (false, true) => "text executable",
        (false, false) => "text executable, with no line terminators",
    }
}
1493
1494fn supported_image_metadata_format(
1495 ext: Option<&str>,
1496 detected_format: FileFormat,
1497) -> Option<ImageFormat> {
1498 match ext {
1499 Some("jpg" | "jpeg") => Some(ImageFormat::Jpeg),
1500 Some("png") => Some(ImageFormat::Png),
1501 Some("tif" | "tiff") => Some(ImageFormat::Tiff),
1502 Some("webp") => Some(ImageFormat::WebP),
1503 _ => match detected_format.media_type() {
1504 "image/jpeg" => Some(ImageFormat::Jpeg),
1505 "image/png" => Some(ImageFormat::Png),
1506 "image/tiff" => Some(ImageFormat::Tiff),
1507 "image/webp" => Some(ImageFormat::WebP),
1508 _ => None,
1509 },
1510 }
1511}
1512
1513fn should_skip_binary_string_extraction(
1514 path: &Path,
1515 bytes: &[u8],
1516 detected_format: FileFormat,
1517) -> bool {
1518 matches!(lower_extension(path).as_deref(), Some("pdf"))
1519 || supported_image_metadata_format(lower_extension(path).as_deref(), detected_format)
1520 .is_some()
1521 || (matches!(
1522 detected_format.kind(),
1523 FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
1524 ) && !is_textual_format(detected_format))
1525 || media_mime_from_content(bytes).is_some()
1526 || is_zip_archive(bytes)
1527 || looks_like_gzip(bytes)
1528 || looks_like_bzip2(bytes)
1529 || looks_like_xz(bytes)
1530 || looks_like_deb(bytes, path)
1531 || looks_like_rpm(bytes, path)
1532 || looks_like_squashfs(bytes, path)
1533}
1534
1535fn should_skip_large_opaque_binary_text_extraction(
1536 _path: &Path,
1537 bytes: &[u8],
1538 detected_format: FileFormat,
1539) -> bool {
1540 is_large_opaque_binary_candidate(bytes, detected_format)
1541 && !sample_has_promising_printable_strings(bytes)
1542}
1543
1544fn is_large_opaque_binary_candidate(bytes: &[u8], detected_format: FileFormat) -> bool {
1545 bytes.len() >= LARGE_OPAQUE_BINARY_SKIP_BYTES
1546 && !is_textual_format(detected_format)
1547 && !matches!(
1548 detected_format.kind(),
1549 FileFormatKind::Archive
1550 | FileFormatKind::Compressed
1551 | FileFormatKind::Package
1552 | FileFormatKind::Audio
1553 | FileFormatKind::Image
1554 | FileFormatKind::Video
1555 )
1556}
1557
/// Computes up to three `(start, end)` byte ranges to sample from a buffer of
/// `len` bytes: the head, the middle and the tail, each at most 64 KiB.
///
/// * The head window is always considered.
/// * The middle window is added only when `len` exceeds two windows.
/// * The tail window is added only when `len` exceeds one window.
///
/// Empty ranges and exact duplicates are left out; windows may overlap.
fn sampled_printable_window_ranges(len: usize) -> Vec<(usize, usize)> {
    const SAMPLE_WINDOW_BYTES: usize = 64 * 1024;

    let head = Some((0, len.min(SAMPLE_WINDOW_BYTES)));
    let middle = if len > SAMPLE_WINDOW_BYTES * 2 {
        let start = len / 2 - SAMPLE_WINDOW_BYTES / 2;
        Some((start, (start + SAMPLE_WINDOW_BYTES).min(len)))
    } else {
        None
    };
    let tail = if len > SAMPLE_WINDOW_BYTES {
        Some((len - SAMPLE_WINDOW_BYTES, len))
    } else {
        None
    };

    let mut ranges: Vec<(usize, usize)> = Vec::new();
    for candidate in [head, middle, tail].iter().flatten() {
        let (start, end) = *candidate;
        if start < end && !ranges.contains(&(start, end)) {
            ranges.push((start, end));
        }
    }
    ranges
}
1580
1581fn sample_has_promising_printable_strings(bytes: &[u8]) -> bool {
1582 let mut structured_signal_seen = false;
1583 let promising_license_windows = sampled_printable_window_ranges(bytes.len())
1584 .into_iter()
1585 .filter(|&(start, end)| {
1586 let window = &bytes[start..end];
1587 if has_strong_structured_text_signal(window) {
1588 structured_signal_seen = true;
1589 }
1590 has_license_or_notice_signal(window)
1591 })
1592 .count();
1593
1594 structured_signal_seen || promising_license_windows >= 2
1595}
1596
1597fn extract_sampled_printable_strings(bytes: &[u8]) -> String {
1598 let mut combined_lines = BTreeSet::new();
1599
1600 for (start, end) in sampled_printable_window_ranges(bytes.len()) {
1601 let window_text = extract_printable_strings(&bytes[start..end]);
1602 for line in window_text
1603 .lines()
1604 .map(str::trim)
1605 .filter(|line| !line.is_empty())
1606 {
1607 combined_lines.insert(line.to_string());
1608 }
1609 }
1610
1611 combined_lines.into_iter().collect::<Vec<_>>().join("\n")
1612}
1613
1614fn has_license_or_notice_signal(bytes: &[u8]) -> bool {
1615 let strings = extract_printable_strings(bytes);
1616 if strings.is_empty() {
1617 return false;
1618 }
1619
1620 let lower = strings.to_ascii_lowercase();
1621 [
1622 "copyright",
1623 "license",
1624 "licensed under",
1625 "all rights reserved",
1626 "permission is hereby granted",
1627 "redistribution and use",
1628 "spdx-license-identifier",
1629 ]
1630 .iter()
1631 .any(|marker| lower.contains(marker))
1632}
1633
1634fn has_strong_structured_text_signal(bytes: &[u8]) -> bool {
1635 let strings = extract_printable_strings(bytes);
1636 if strings.is_empty() {
1637 return false;
1638 }
1639
1640 let email_markers = strings.matches('@').count();
1641 let url_markers = strings.matches("http://").count() + strings.matches("https://").count();
1642
1643 email_markers + url_markers >= 3
1644}
1645
1646fn is_supported_image_container(bytes: &[u8], format: ImageFormat) -> bool {
1647 match format {
1648 ImageFormat::Png => bytes.starts_with(b"\x89PNG\r\n\x1a\n"),
1649 ImageFormat::Jpeg => bytes.starts_with(&[0xff, 0xd8, 0xff]),
1650 ImageFormat::Tiff => bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a"),
1651 ImageFormat::WebP => {
1652 bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP"
1653 }
1654 _ => false,
1655 }
1656}
1657
1658fn extract_image_metadata_text(bytes: &[u8], format: ImageFormat) -> String {
1659 let mut values = Vec::new();
1660 values.extend(extract_exif_metadata_values(bytes));
1661 values.extend(extract_xmp_metadata_values(bytes, format));
1662 values_to_text(values)
1663}
1664
1665fn extract_exif_metadata_values(bytes: &[u8]) -> Vec<String> {
1666 let mut cursor = BufReader::new(Cursor::new(bytes));
1667 let exif = match exif::Reader::new().read_from_container(&mut cursor) {
1668 Ok(exif) => exif,
1669 Err(_) => return Vec::new(),
1670 };
1671
1672 let mut values = Vec::new();
1673 for field in exif.fields() {
1674 let rendered = match field.tag {
1675 exif::Tag::ImageDescription => Some(format_metadata_field(
1676 "Description",
1677 &field.display_value().with_unit(&exif).to_string(),
1678 )),
1679 exif::Tag::Copyright => Some(format_metadata_field(
1680 "Copyright",
1681 &field.display_value().with_unit(&exif).to_string(),
1682 )),
1683 exif::Tag::UserComment => Some(format_metadata_field(
1684 "Comment",
1685 &field.display_value().with_unit(&exif).to_string(),
1686 )),
1687 exif::Tag::Artist => Some(format_metadata_field(
1688 "Author",
1689 &field.display_value().with_unit(&exif).to_string(),
1690 )),
1691 _ => None,
1692 };
1693
1694 if let Some(rendered) = rendered {
1695 values.push(rendered);
1696 }
1697 }
1698
1699 values
1700}
1701
1702fn extract_xmp_metadata_values(bytes: &[u8], format: ImageFormat) -> Vec<String> {
1703 let xmp = match extract_raw_xmp_packet(bytes, format) {
1704 Some(xmp) => xmp,
1705 None => return Vec::new(),
1706 };
1707
1708 parse_xmp_values(&xmp)
1709}
1710
1711fn extract_raw_xmp_packet(bytes: &[u8], format: ImageFormat) -> Option<Vec<u8>> {
1712 let reader = ImageReader::with_format(BufReader::new(Cursor::new(bytes)), format);
1713 if let Ok(mut decoder) = reader.into_decoder()
1714 && let Ok(Some(xmp)) = decoder.xmp_metadata()
1715 {
1716 return (xmp.len() <= MAX_XMP_PACKET_BYTES).then_some(xmp);
1717 }
1718
1719 match format {
1720 ImageFormat::Png => extract_png_xmp_packet(bytes),
1721 _ => None,
1722 }
1723}
1724
1725fn extract_png_xmp_packet(bytes: &[u8]) -> Option<Vec<u8>> {
1726 const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";
1727
1728 if bytes.len() < PNG_SIGNATURE.len() || &bytes[..PNG_SIGNATURE.len()] != PNG_SIGNATURE {
1729 return None;
1730 }
1731
1732 let mut offset = PNG_SIGNATURE.len();
1733 while offset + 12 <= bytes.len() {
1734 let length = u32::from_be_bytes([
1735 bytes[offset],
1736 bytes[offset + 1],
1737 bytes[offset + 2],
1738 bytes[offset + 3],
1739 ]) as usize;
1740 let chunk_start = offset + 8;
1741 let chunk_end = chunk_start + length;
1742 if chunk_end + 4 > bytes.len() {
1743 return None;
1744 }
1745
1746 let chunk_type = &bytes[offset + 4..offset + 8];
1747 if chunk_type == b"iTXt" {
1748 let data = &bytes[chunk_start..chunk_end];
1749 if let Some(xmp) = parse_png_itxt_xmp(data) {
1750 return Some(xmp);
1751 }
1752 }
1753
1754 offset = chunk_end + 4;
1755 }
1756
1757 None
1758}
1759
1760fn parse_png_itxt_xmp(data: &[u8]) -> Option<Vec<u8>> {
1761 const XMP_KEYWORD: &[u8] = b"XML:com.adobe.xmp";
1762
1763 let keyword_end = data.iter().position(|&b| b == 0)?;
1764 if &data[..keyword_end] != XMP_KEYWORD {
1765 return None;
1766 }
1767
1768 let mut cursor = keyword_end + 1;
1769 let compression_flag = *data.get(cursor)?;
1770 cursor += 1;
1771 let compression_method = *data.get(cursor)?;
1772 cursor += 1;
1773 if compression_flag > 1 || (compression_flag == 1 && compression_method != 0) {
1774 return None;
1775 }
1776
1777 let language_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1778 cursor = language_end + 1;
1779
1780 let translated_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1781 cursor = translated_end + 1;
1782
1783 let text_bytes = &data[cursor..];
1784 if compression_flag == 1 {
1785 let decoder = ZlibDecoder::new(text_bytes);
1786 let mut decoded = Vec::new();
1787 decoder
1788 .take((MAX_XMP_PACKET_BYTES + 1) as u64)
1789 .read_to_end(&mut decoded)
1790 .ok()?;
1791 (decoded.len() <= MAX_XMP_PACKET_BYTES).then_some(decoded)
1792 } else {
1793 (text_bytes.len() <= MAX_XMP_PACKET_BYTES).then(|| text_bytes.to_vec())
1794 }
1795}
1796
1797fn parse_xmp_values(xmp: &[u8]) -> Vec<String> {
1798 let mut reader = XmlReader::from_reader(xmp);
1799 reader.config_mut().trim_text(true);
1800
1801 let mut buf = Vec::new();
1802 let mut stack: Vec<String> = Vec::new();
1803 let mut values = Vec::new();
1804
1805 loop {
1806 match reader.read_event_into(&mut buf) {
1807 Ok(Event::Start(e)) => {
1808 stack.push(local_xml_name(e.name().as_ref()));
1809 }
1810 Ok(Event::End(_)) => {
1811 stack.pop();
1812 }
1813 Ok(Event::Empty(_)) => {}
1814 Ok(Event::Text(text)) => {
1815 if let Some(field) = stack
1816 .iter()
1817 .rev()
1818 .find_map(|name| allowed_xmp_field(name.as_str()))
1819 && let Ok(decoded) = text.decode()
1820 {
1821 let decoded = decoded.into_owned();
1822 if !decoded.trim().is_empty() {
1823 values.push(format_xmp_value(field, &decoded));
1824 }
1825 }
1826 }
1827 Ok(Event::CData(text)) => {
1828 if let Some(field) = stack
1829 .iter()
1830 .rev()
1831 .find_map(|name| allowed_xmp_field(name.as_str()))
1832 && let Ok(decoded) = text.decode()
1833 {
1834 let decoded = decoded.into_owned();
1835 if !decoded.trim().is_empty() {
1836 values.push(format_xmp_value(field, &decoded));
1837 }
1838 }
1839 }
1840 Ok(Event::Eof) | Err(_) => break,
1841 _ => {}
1842 }
1843 buf.clear();
1844 }
1845
1846 values
1847}
1848
/// Returns the part of a qualified XML name after its last `:`; names
/// without a prefix are returned whole, and names that are not valid UTF-8
/// become the empty string.
fn local_xml_name(name: &[u8]) -> String {
    let Ok(qualified) = std::str::from_utf8(name) else {
        return String::new();
    };

    match qualified.rsplit_once(':') {
        Some((_prefix, local)) => local.to_owned(),
        None => qualified.to_owned(),
    }
}
1853
/// Maps a local XMP element name to the canonical field key used by this
/// module, or `None` when the element is not one that gets extracted.
/// Matching is case-sensitive.
fn allowed_xmp_field(name: &str) -> Option<&'static str> {
    const FIELDS: [(&str, &str); 7] = [
        ("creator", "creator"),
        ("rights", "rights"),
        ("description", "description"),
        ("title", "title"),
        ("subject", "subject"),
        ("UsageTerms", "usage_terms"),
        ("WebStatement", "web_statement"),
    ];

    FIELDS
        .iter()
        .find(|(element, _)| *element == name)
        .map(|&(_, field)| field)
}
1866
1867fn format_xmp_value(field: &str, value: &str) -> String {
1868 match field {
1869 "creator" => format_metadata_field("Author", value),
1870 "rights" => format_metadata_field("Copyright", value),
1871 "description" => format_metadata_field("Description", value),
1872 "title" => format_metadata_field("Title", value),
1873 "subject" => format_metadata_field("Subject", value),
1874 "usage_terms" => format_metadata_field("UsageTerms", value),
1875 "web_statement" => format_metadata_field("WebStatement", value),
1876 _ => value.to_string(),
1877 }
1878}
1879
/// Renders a metadata entry as `"<label>: <value>"`.
fn format_metadata_field(label: &str, value: &str) -> String {
    let mut rendered = String::with_capacity(label.len() + 2 + value.len());
    rendered.push_str(label);
    rendered.push_str(": ");
    rendered.push_str(value);
    rendered
}
1883
1884fn values_to_text(values: Vec<String>) -> String {
1885 let mut seen = BTreeSet::new();
1886 let mut normalized_lines = Vec::new();
1887
1888 for value in values {
1889 let normalized = normalize_metadata_value(&value);
1890 if normalized.is_empty() || !seen.insert(normalized.clone()) {
1891 continue;
1892 }
1893
1894 normalized_lines.push(normalized);
1895 }
1896
1897 let author_values: BTreeSet<String> = normalized_lines
1898 .iter()
1899 .filter_map(|line| split_metadata_field(line))
1900 .filter(|(label, _)| label.eq_ignore_ascii_case("Author"))
1901 .map(|(_, value)| value.to_string())
1902 .collect();
1903
1904 let mut lines = Vec::new();
1905 let mut total_bytes = 0usize;
1906
1907 for normalized in normalized_lines {
1908 if lines.len() >= MAX_IMAGE_METADATA_VALUES {
1909 break;
1910 }
1911
1912 if should_suppress_bare_copyright_metadata_line(&normalized, &author_values) {
1913 continue;
1914 }
1915
1916 let added_bytes = normalized.len() + usize::from(!lines.is_empty());
1917 if total_bytes + added_bytes > MAX_IMAGE_METADATA_TEXT_BYTES {
1918 break;
1919 }
1920
1921 total_bytes += added_bytes;
1922 lines.push(normalized);
1923 }
1924
1925 lines.join("\n")
1926}
1927
/// Splits a `"label: value"` line at its first colon and trims both halves;
/// returns `None` when the line contains no colon.
fn split_metadata_field(line: &str) -> Option<(&str, &str)> {
    line.split_once(':')
        .map(|(label, value)| (label.trim(), value.trim()))
}
1932
1933fn should_suppress_bare_copyright_metadata_line(
1934 line: &str,
1935 author_values: &BTreeSet<String>,
1936) -> bool {
1937 let Some((label, value)) = split_metadata_field(line) else {
1938 return false;
1939 };
1940 if !label.eq_ignore_ascii_case("Copyright")
1941 || value.is_empty()
1942 || !author_values.contains(value)
1943 {
1944 return false;
1945 }
1946
1947 let lower = value.to_ascii_lowercase();
1948 !lower.contains("copyright")
1949 && !lower.contains("(c)")
1950 && !lower.contains('©')
1951 && !lower.contains("all rights")
1952 && !value.chars().any(|ch| ch.is_ascii_digit())
1953}
1954
/// Normalises a metadata value: NUL characters are removed, every run of
/// whitespace collapses to a single space, and leading/trailing whitespace
/// disappears.
fn normalize_metadata_value(value: &str) -> String {
    let without_nuls = value.replace('\0', "");
    let words: Vec<&str> = without_nuls.split_whitespace().collect();
    words.join(" ")
}
1966
1967fn extract_pdf_text(path: &Path, bytes: &[u8]) -> (String, Option<String>) {
1968 if bytes.len() < 5 || &bytes[..5] != b"%PDF-" {
1969 return (String::new(), None);
1970 }
1971
1972 if bytes.len() > MAX_PDF_TEXT_EXTRACTION_BYTES {
1973 return (
1974 String::new(),
1975 Some(format!(
1976 "PDF text extraction skipped because file exceeds {} bytes",
1977 MAX_PDF_TEXT_EXTRACTION_BYTES
1978 )),
1979 );
1980 }
1981
1982 let mut failures = Vec::new();
1983 let mut saw_success = false;
1984
1985 let extracted = catch_unwind(AssertUnwindSafe(
1986 || -> Result<String, Box<dyn std::error::Error>> {
1987 let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1988 extract_first_pdf_page_text(&mut document)
1989 },
1990 ));
1991 match extracted {
1992 Ok(Ok(text)) => {
1993 saw_success = true;
1994 if let Some(normalized) = normalize_pdf_text(text) {
1995 return (normalized, None);
1996 }
1997 }
1998 Ok(Err(err)) => failures.push(format!("from-bytes first-page: {err}")),
1999 Err(payload) => failures.push(format!(
2000 "from-bytes first-page panic: {}",
2001 panic_payload_to_string(payload.as_ref())
2002 )),
2003 }
2004
2005 let extracted = catch_unwind(AssertUnwindSafe(
2006 || -> Result<String, Box<dyn std::error::Error>> {
2007 let mut document = pdf_oxide::document::PdfDocument::open(path)?;
2008 extract_pdf_text_from_document(&mut document)
2009 },
2010 ));
2011 match extracted {
2012 Ok(Ok(text)) => {
2013 saw_success = true;
2014 if let Some(normalized) = normalize_pdf_text(text) {
2015 return (normalized, None);
2016 }
2017 }
2018 Ok(Err(err)) => failures.push(format!("open full-document: {err}")),
2019 Err(payload) => failures.push(format!(
2020 "open full-document panic: {}",
2021 panic_payload_to_string(payload.as_ref())
2022 )),
2023 }
2024
2025 let extracted = catch_unwind(AssertUnwindSafe(
2026 || -> Result<String, Box<dyn std::error::Error>> {
2027 let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
2028 extract_pdf_text_from_document(&mut document)
2029 },
2030 ));
2031 match extracted {
2032 Ok(Ok(text)) => {
2033 saw_success = true;
2034 if let Some(normalized) = normalize_pdf_text(text) {
2035 return (normalized, None);
2036 }
2037 }
2038 Ok(Err(err)) => failures.push(format!("from-bytes full-document: {err}")),
2039 Err(payload) => failures.push(format!(
2040 "from-bytes full-document panic: {}",
2041 panic_payload_to_string(payload.as_ref())
2042 )),
2043 }
2044
2045 if saw_success || is_non_actionable_pdf_failure(&failures) {
2046 (String::new(), None)
2047 } else {
2048 (
2049 String::new(),
2050 Some(format!(
2051 "PDF text extraction failed after {} attempts: {}",
2052 failures.len(),
2053 failures.join("; ")
2054 )),
2055 )
2056 }
2057}
2058
/// Reports whether every recorded failure message contains one of the known
/// non-actionable phrases (password / encryption / cross-reference problems).
/// An empty failure list is never considered non-actionable.
fn is_non_actionable_pdf_failure(failures: &[String]) -> bool {
    const NON_ACTIONABLE_MARKERS: [&str; 5] = [
        "requires a password",
        "Encrypt dictionary missing /O",
        "Encrypt dictionary missing /U",
        "security handler cannot be found",
        "Invalid cross-reference table",
    ];

    if failures.is_empty() {
        return false;
    }

    failures.iter().all(|failure| {
        NON_ACTIONABLE_MARKERS
            .iter()
            .any(|marker| failure.contains(*marker))
    })
}
2069
/// Renders a panic payload as text: `&str` and `String` payloads are
/// returned verbatim, anything else becomes `"unknown panic payload"`.
fn panic_payload_to_string(payload: &(dyn std::any::Any + Send)) -> String {
    payload
        .downcast_ref::<&str>()
        .map(|message| (*message).to_string())
        .or_else(|| payload.downcast_ref::<String>().cloned())
        .unwrap_or_else(|| "unknown panic payload".to_string())
}
2079
2080fn extract_first_pdf_page_text(
2081 document: &mut pdf_oxide::document::PdfDocument,
2082) -> Result<String, Box<dyn std::error::Error>> {
2083 if document.page_count()? == 0 {
2084 return Ok(String::new());
2085 }
2086
2087 let extracted_text = document.extract_text(0)?;
2088 let markdown_text =
2089 document.to_markdown(0, &pdf_oxide::converters::ConversionOptions::default())?;
2090 if pdf_markdown_heading_lines(&markdown_text).is_empty() {
2091 return Ok(extracted_text);
2092 }
2093
2094 let pipeline_text =
2095 document.to_plain_text(0, &pdf_oxide::converters::ConversionOptions::default())?;
2096
2097 Ok(merge_pdf_first_page_text(
2098 &extracted_text,
2099 &markdown_text,
2100 &pipeline_text,
2101 ))
2102}
2103
2104fn extract_pdf_text_from_document(
2105 document: &mut pdf_oxide::document::PdfDocument,
2106) -> Result<String, Box<dyn std::error::Error>> {
2107 Ok(document.to_plain_text_all(&pdf_oxide::converters::ConversionOptions::default())?)
2108}
2109
/// Replaces carriage returns and form feeds with `\n`. Returns `None` when
/// the result contains only whitespace; otherwise returns the converted text
/// without trimming it.
fn normalize_pdf_text(text: String) -> Option<String> {
    let normalized: String = text
        .chars()
        .map(|ch| if matches!(ch, '\r' | '\u{0c}') { '\n' } else { ch })
        .collect();

    if normalized.trim().is_empty() {
        None
    } else {
        Some(normalized)
    }
}
2114
2115fn merge_pdf_first_page_text(
2116 _extracted_text: &str,
2117 markdown_text: &str,
2118 pipeline_text: &str,
2119) -> String {
2120 let pipeline = pipeline_text.trim();
2121 if pipeline.is_empty() {
2122 return String::new();
2123 }
2124
2125 let prefix = pdf_first_page_heading_prefix(markdown_text);
2126 let Some(prefix) = prefix else {
2127 return pipeline_text.to_string();
2128 };
2129
2130 if pdf_text_contains_heading_prefix(pipeline, &prefix) {
2131 pipeline_text.to_string()
2132 } else {
2133 format!("{prefix}\n\n{pipeline}")
2134 }
2135}
2136
2137fn pdf_text_contains_heading_prefix(text: &str, prefix: &str) -> bool {
2138 normalize_pdf_heading_comparison_text(text)
2139 .contains(&normalize_pdf_heading_comparison_text(prefix))
2140}
2141
/// Prepares text for heading comparison: ASCII letters are lower-cased and
/// every run of whitespace becomes a single space, with none at either end.
fn normalize_pdf_heading_comparison_text(text: &str) -> String {
    let lowered = text.to_ascii_lowercase();
    let words: Vec<&str> = lowered.split_whitespace().collect();
    words.join(" ")
}
2148
2149fn pdf_first_page_heading_prefix(markdown_text: &str) -> Option<String> {
2150 let mut lines = Vec::new();
2151
2152 for line in pdf_markdown_heading_lines(markdown_text) {
2153 push_unique_line(&mut lines, line);
2154 }
2155
2156 (!lines.is_empty()).then(|| lines.join("\n"))
2157}
2158
2159fn pdf_markdown_heading_lines(text: &str) -> Vec<String> {
2160 text.lines()
2161 .map(str::trim)
2162 .filter_map(|line| line.strip_prefix('#').map(str::trim_start))
2163 .map(|line| line.trim_matches('#').trim())
2164 .filter(|line| !line.is_empty())
2165 .filter(|line| !looks_like_numbered_section_heading(line))
2166 .take(4)
2167 .map(ToOwned::to_owned)
2168 .collect()
2169}
2170
/// Appends `line` to `lines` unless an equal entry is already present.
fn push_unique_line(lines: &mut Vec<String>, line: String) {
    if lines.contains(&line) {
        return;
    }
    lines.push(line);
}
2176
/// Reports whether `line` starts like a numbered section heading: one or
/// more ASCII digits immediately followed by a `.` (for example
/// `1. Introduction`, `10. Limitation of Liability` or `2.3 Scope`).
///
/// Section numbers with more than one digit are recognised as well, so
/// headings from section 10 onwards are treated the same as sections 1-9.
fn looks_like_numbered_section_heading(line: &str) -> bool {
    let after_digits = line.trim_start_matches(|ch: char| ch.is_ascii_digit());
    let has_leading_digits = after_digits.len() < line.len();
    has_leading_digits && after_digits.starts_with('.')
}
2189
/// Reports whether `bytes` begins with one of the three ZIP signatures:
/// `PK\x03\x04`, `PK\x05\x06` or `PK\x07\x08`.
fn is_zip_archive(bytes: &[u8]) -> bool {
    matches!(
        bytes,
        [b'P', b'K', 0x03, 0x04, ..] | [b'P', b'K', 0x05, 0x06, ..] | [b'P', b'K', 0x07, 0x08, ..]
    )
}
2195
/// Extracts runs of printable ASCII from binary data, one run per line.
///
/// Three passes are made over `bytes`, and their results are appended in
/// this order:
///
/// 1. single-byte runs of printable ASCII (`0x20..=0x7E`);
/// 2. 16-bit runs where each printable byte at an even offset is followed by
///    a zero byte (UTF-16LE text aligned to the start of the buffer);
/// 3. the same 16-bit scan shifted by one byte, which finds printable bytes
///    at odd offsets followed by a zero byte.
///
/// Only runs of at least four characters are kept. Scanning stops early once
/// the output reaches a budget equal to the input length clamped to the
/// range 2 000 000 – 16 000 000 bytes.
///
/// Pass 3 is a plain shifted scan so that text at the other byte parity is
/// found, and so that strings already found by pass 2 are not emitted a
/// second time with their first character missing.
pub fn extract_printable_strings(bytes: &[u8]) -> String {
    const MIN_LEN: usize = 4;
    const MIN_OUTPUT_BYTES: usize = 2_000_000;
    const MAX_OUTPUT_BYTES_CAP: usize = 16_000_000;

    fn is_printable_ascii(b: u8) -> bool {
        matches!(b, 0x20..=0x7E)
    }

    /// Appends the pending run to `out` when it is long enough, then resets it.
    fn flush_run(out: &mut String, run: &mut Vec<u8>) {
        if run.len() >= MIN_LEN {
            if !out.is_empty() {
                out.push('\n');
            }
            // `run` holds printable ASCII only, so a byte-to-char widening is exact.
            out.extend(run.iter().copied().map(char::from));
        }
        run.clear();
    }

    let max_output_bytes = bytes.len().clamp(MIN_OUTPUT_BYTES, MAX_OUTPUT_BYTES_CAP);

    let mut out = String::new();
    let mut run: Vec<u8> = Vec::new();

    // Pass 1: single-byte strings.
    for &b in bytes {
        if is_printable_ascii(b) {
            run.push(b);
        } else {
            flush_run(&mut out, &mut run);
            if out.len() >= max_output_bytes {
                return out;
            }
        }
    }
    flush_run(&mut out, &mut run);
    if out.len() >= max_output_bytes {
        return out;
    }

    // Passes 2 and 3: (printable, 0x00) byte pairs at both alignments.
    for start in 0..2usize {
        run.clear();
        let aligned = bytes.get(start..).unwrap_or(&[]);
        for unit in aligned.chunks_exact(2) {
            if is_printable_ascii(unit[0]) && unit[1] == 0 {
                run.push(unit[0]);
            } else {
                flush_run(&mut out, &mut run);
                if out.len() >= max_output_bytes {
                    return out;
                }
            }
        }
        flush_run(&mut out, &mut run);
        if out.len() >= max_output_bytes {
            return out;
        }
    }

    out
}
2260
2261#[cfg(test)]
2262mod tests {
2263 use image::ImageFormat;
2264 use std::path::Path;
2265
2266 use crate::copyright::detect_copyrights;
2267
2268 use super::{
2269 ExtractedTextKind, LARGE_OPAQUE_BINARY_SKIP_BYTES, MAX_PDF_TEXT_EXTRACTION_BYTES,
2270 MAX_XMP_PACKET_BYTES, classify_file_info, extract_printable_strings,
2271 extract_raw_xmp_packet, extract_text_for_detection,
2272 extract_text_for_detection_with_diagnostics, format_metadata_field, format_xmp_value,
2273 is_non_actionable_pdf_failure, normalize_mime_type, normalize_pdf_heading_comparison_text,
2274 values_to_text, windows_metadata_or_empty_result,
2275 };
2276
/// Serializes one PNG chunk: big-endian payload length, 4-byte chunk type,
/// payload, and a 4-byte CRC field.
///
/// The CRC field is written as four zero bytes instead of a real checksum, so
/// the result is only suitable for consumers that do not validate chunk CRCs.
///
/// # Panics
///
/// Panics if `data` is longer than `u32::MAX` bytes. The PNG length field is
/// 32 bits wide, and a silently truncated length would produce a corrupt chunk.
fn png_chunk(chunk_type: &[u8; 4], data: &[u8]) -> Vec<u8> {
    let length =
        u32::try_from(data.len()).expect("PNG chunk payload must fit in a u32 length field");

    // 4 bytes of length + 4 bytes of type + payload + 4 bytes of CRC.
    let mut out = Vec::with_capacity(data.len() + 12);
    out.extend_from_slice(&length.to_be_bytes());
    out.extend_from_slice(chunk_type);
    out.extend_from_slice(data);
    // Placeholder CRC: fixtures built with this helper carry no real checksum.
    out.extend_from_slice(&0u32.to_be_bytes());
    out
}
2285
2286 fn build_png_with_xmp(xmp: &str) -> Vec<u8> {
2287 let mut bytes = Vec::new();
2288 bytes.extend_from_slice(b"\x89PNG\r\n\x1a\n");
2289
2290 let ihdr = [
2291 0, 0, 0, 1, 0, 0, 0, 1, 8, 2, 0, 0, 0, ];
2299 bytes.extend_from_slice(&png_chunk(b"IHDR", &ihdr));
2300
2301 let mut itxt = Vec::new();
2302 itxt.extend_from_slice(b"XML:com.adobe.xmp");
2303 itxt.push(0); itxt.push(0); itxt.push(0); itxt.push(0); itxt.push(0); itxt.extend_from_slice(xmp.as_bytes());
2309 bytes.extend_from_slice(&png_chunk(b"iTXt", &itxt));
2310
2311 bytes.extend_from_slice(&png_chunk(b"IEND", &[]));
2312 bytes
2313 }
2314
/// The `.jar` fixture must produce no text and `ExtractedTextKind::None`.
#[test]
fn test_extract_text_for_detection_skips_jar_archives() {
    let jar_path =
        Path::new("testdata/license-golden/datadriven/lic1/do-not_detect-licenses-in-archive.jar");
    let jar_bytes = std::fs::read(jar_path).expect("failed to read jar fixture");

    let (extracted, kind) = extract_text_for_detection(jar_path, &jar_bytes);

    assert!(extracted.is_empty());
    assert_eq!(kind, ExtractedTextKind::None);
}
2327
/// The BSD PDF fixture is reported as `Pdf` and its license wording is extracted.
#[test]
fn test_extract_text_for_detection_reads_pdf_fixture_text() {
    let pdf_path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
    let pdf_bytes = std::fs::read(pdf_path).expect("failed to read pdf fixture");

    let (extracted, kind) = extract_text_for_detection(pdf_path, &pdf_bytes);

    assert_eq!(kind, ExtractedTextKind::Pdf);
    assert!(extracted.contains("Redistribution and use in source and binary forms"));
}
2338
/// For this PDF fixture the extracted text must contain the license heading
/// but must not contain the "DISCLAIMER OF WARRANTY" wording.
#[test]
fn test_extract_text_for_detection_prefers_first_pdf_page_before_full_document() {
    let pdf_path =
        Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
    let pdf_bytes = std::fs::read(pdf_path).expect("failed to read pdf fixture");

    let (extracted, kind) = extract_text_for_detection(pdf_path, &pdf_bytes);

    assert_eq!(kind, ExtractedTextKind::Pdf);
    assert!(extracted.contains("SUN INDUSTRY STANDARDS SOURCE LICENSE"));
    assert!(!extracted.contains("DISCLAIMER OF WARRANTY"));
}
2351
/// After heading normalization, the license heading must occur exactly once in
/// the text extracted from the PDF fixture.
#[test]
fn test_extract_text_for_detection_does_not_duplicate_pdf_heading_prefix() {
    let pdf_path =
        Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
    let pdf_bytes = std::fs::read(pdf_path).expect("failed to read pdf fixture");

    let (extracted, kind) = extract_text_for_detection(pdf_path, &pdf_bytes);
    assert_eq!(kind, ExtractedTextKind::Pdf);

    // Both sides go through the same normalization before counting matches.
    let haystack = normalize_pdf_heading_comparison_text(&extracted);
    let needle = normalize_pdf_heading_comparison_text("SUN INDUSTRY STANDARDS SOURCE LICENSE");
    let occurrences = haystack.matches(&needle).count();
    assert_eq!(occurrences, 1);
}
2367
/// PDF bytes passed under a non-PDF file name (`renamed.bin`) are still
/// reported as `Pdf` with the license wording extracted.
#[test]
fn test_extract_text_for_detection_reads_pdf_fixture_without_pdf_extension() {
    let fixture = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
    let pdf_bytes = std::fs::read(fixture).expect("failed to read pdf fixture");
    let disguised_name = Path::new("renamed.bin");

    let (extracted, kind) = extract_text_for_detection(disguised_name, &pdf_bytes);

    assert_eq!(kind, ExtractedTextKind::Pdf);
    assert!(extracted.contains("Redistribution and use in source and binary forms"));
}
2378
/// A PDF-looking payload one byte over `MAX_PDF_TEXT_EXTRACTION_BYTES` yields
/// no text, kind `None`, and a scan error mentioning that extraction was skipped.
#[test]
fn test_extract_text_for_detection_skips_oversized_pdf_payload() {
    let mut payload = b"%PDF-1.7\n".to_vec();
    payload.resize(MAX_PDF_TEXT_EXTRACTION_BYTES + 1, b'0');

    let (extracted, kind, scan_error) =
        extract_text_for_detection_with_diagnostics(Path::new("oversized.pdf"), &payload);

    assert!(extracted.is_empty());
    assert_eq!(kind, ExtractedTextKind::None);
    assert!(matches!(
        scan_error.as_deref(),
        Some(message) if message.contains("PDF text extraction skipped")
    ));
}
2395
/// A payload with a PDF header but no valid object graph yields no text and a
/// scan error describing the extraction failure.
#[test]
fn test_extract_text_for_detection_reports_terminal_pdf_failure() {
    let broken_pdf: &[u8] = b"%PDF-1.7\nthis is not a valid pdf object graph\n";

    let (extracted, kind, diagnostics) =
        extract_text_for_detection_with_diagnostics(Path::new("broken.pdf"), broken_pdf);

    assert!(extracted.is_empty());
    assert_eq!(kind, ExtractedTextKind::None);
    let message = diagnostics.expect("terminal pdf failure should be surfaced");
    assert!(message.contains("PDF text extraction failed after"));
}
2408
/// An all-zero blob just above `LARGE_OPAQUE_BINARY_SKIP_BYTES` yields nothing.
#[test]
fn test_extract_text_for_detection_skips_large_opaque_binary_blobs() {
    let zero_blob = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];

    let (extracted, kind) = extract_text_for_detection(Path::new("model.bin"), &zero_blob);

    assert!(extracted.is_empty());
    assert_eq!(kind, ExtractedTextKind::None);
}
2418
/// A large zero-filled blob with a copyright string at two offsets is still
/// extracted, and the string appears in the output.
#[test]
fn test_extract_text_for_detection_keeps_large_binaries_with_promising_strings() {
    let notice = b"Copyright 2026 Example Project!!!";
    let mut blob = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
    // Plant the same notice at the start and at the halfway point.
    for offset in [0, LARGE_OPAQUE_BINARY_SKIP_BYTES / 2] {
        blob[offset..offset + notice.len()].copy_from_slice(notice);
    }

    let (extracted, kind) = extract_text_for_detection(Path::new("weights.bin"), &blob);

    assert_ne!(kind, ExtractedTextKind::None);
    assert!(extracted.contains("Copyright 2026 Example Project"));
}
2432
/// A large zero-filled blob whose only printable runs are symbol-heavy noise
/// (planted at two offsets) yields no text.
#[test]
fn test_extract_text_for_detection_skips_large_binary_with_unstructured_runs() {
    let junk = b"(c) $1234567890ABCDEF[]{}--==++";
    let mut blob = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
    // Plant the same noise at the start and at the halfway point.
    for offset in [0, LARGE_OPAQUE_BINARY_SKIP_BYTES / 2] {
        blob[offset..offset + junk.len()].copy_from_slice(junk);
    }

    let (extracted, kind) = extract_text_for_detection(Path::new("tensor.bin"), &blob);

    assert!(extracted.is_empty());
    assert_eq!(kind, ExtractedTextKind::None);
}
2446
/// Text extracted from the PE fixture must include its license and
/// `LegalCopyright:` metadata lines.
///
/// NOTE(review): the test name refers to Windows executable metadata, yet the
/// asserted kind is `BinaryStrings` rather than `WindowsExecutableMetadata`.
/// Confirm that this is the intended kind for PE inputs.
#[test]
fn test_extract_text_for_detection_uses_windows_executable_metadata() {
    let pe_path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
    let pe_bytes = std::fs::read(pe_path).expect("read PE fixture");

    let (extracted, kind) = extract_text_for_detection(pe_path, &pe_bytes);

    assert_eq!(kind, ExtractedTextKind::BinaryStrings);
    assert!(extracted.contains("License: This program is free software"));
    assert!(extracted.contains("LegalCopyright:"));
}
2458
/// The PE fixture resized to `LARGE_OPAQUE_BINARY_SKIP_BYTES + 8` bytes
/// (zero-filled when the fixture is shorter) must still yield non-blank text.
#[test]
fn test_extract_text_for_detection_keeps_windows_metadata_for_large_pe_without_sampled_signal() {
    let pe_path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
    let mut resized = std::fs::read(pe_path).expect("read PE fixture");
    resized.resize(LARGE_OPAQUE_BINARY_SKIP_BYTES + 8, 0);

    let (extracted, kind) = extract_text_for_detection(pe_path, &resized);

    assert_ne!(kind, ExtractedTextKind::None);
    assert!(!extracted.trim().is_empty());
}
2471
/// Metadata passed as `Some(..)` comes back unchanged, tagged as
/// `WindowsExecutableMetadata`, and without a scan error.
#[test]
fn test_windows_metadata_or_empty_result_preserves_metadata() {
    let metadata = Some("LegalCopyright: Example Corp".to_string());

    let (extracted, kind, diagnostics) = windows_metadata_or_empty_result(metadata);

    assert_eq!(kind, ExtractedTextKind::WindowsExecutableMetadata);
    assert_eq!(extracted, "LegalCopyright: Example Corp");
    assert!(diagnostics.is_none());
}
2481
/// `format_xmp_value` prefixes creator, title and description values with the
/// labels `Author`, `Title` and `Description`.
#[test]
fn test_format_xmp_value_labels_creator_and_title_fields() {
    let expectations = [
        ("creator", "Chinmay Garde", "Author: Chinmay Garde"),
        ("title", "Bay Bridge At Night", "Title: Bay Bridge At Night"),
        (
            "description",
            "Embarcadero in the evening on Delta 3200",
            "Description: Embarcadero in the evening on Delta 3200",
        ),
    ];

    for (field, value, expected) in expectations {
        assert_eq!(format_xmp_value(field, value), expected);
    }
}
2497
/// `format_metadata_field` renders a label and value as `"<label>: <value>"`.
#[test]
fn test_format_metadata_field_prefixes_exif_text() {
    let expectations = [
        ("Author", "Chinmay Garde", "Author: Chinmay Garde"),
        (
            "Description",
            "Bay Bridge At Night",
            "Description: Bay Bridge At Night",
        ),
    ];

    for (label, value, expected) in expectations {
        assert_eq!(format_metadata_field(label, value), expected);
    }
}
2509
/// XMP creator, title and description embedded in a PNG come out as separate
/// labelled lines, and copyright detection reports only the creator as author.
#[test]
fn test_extract_text_for_detection_keeps_image_author_separate_from_title_and_description() {
    let packet = r#"<x:xmpmeta xmlns:x="adobe:ns:meta/"><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:creator>Chinmay Garde</dc:creator><dc:title>Bay Bridge At Night</dc:title><dc:description>Embarcadero in the evening on Delta 3200</dc:description></rdf:Description></rdf:RDF></x:xmpmeta>"#;
    let png = build_png_with_xmp(packet);

    let (text, kind) = extract_text_for_detection(Path::new("fixture.png"), &png);
    assert_eq!(kind, ExtractedTextKind::ImageMetadata);

    assert!(text.contains("Author: Chinmay Garde"), "text: {text:?}");
    assert!(
        text.contains("Title: Bay Bridge At Night"),
        "text: {text:?}"
    );
    assert!(
        text.contains("Description: Embarcadero in the evening on Delta 3200"),
        "text: {text:?}"
    );

    // Only the author list is checked; copyrights and holders are ignored.
    let (_copyrights, _holders, authors) = detect_copyrights(&text, None);
    let author_names: Vec<_> = authors.iter().map(|a| a.author.as_str()).collect();
    assert_eq!(
        author_names,
        vec!["Chinmay Garde"],
        "authors: {authors:?}; text: {text:?}"
    );
}
2538
/// A `Copyright:` value that repeats the `Author:` value is left out of the
/// joined text, while the author and title lines remain.
#[test]
fn test_values_to_text_suppresses_bare_copyright_duplicate_of_author() {
    let values: Vec<String> = [
        "Author: Chinmay Garde",
        "Copyright: Chinmay Garde",
        "Title: Bay Bridge At Night",
    ]
    .into_iter()
    .map(|value| value.to_string())
    .collect();

    let text = values_to_text(values);

    assert!(text.contains("Author: Chinmay Garde"), "text: {text:?}");
    assert!(
        text.contains("Title: Bay Bridge At Night"),
        "text: {text:?}"
    );
    assert!(!text.contains("Copyright: Chinmay Garde"), "text: {text:?}");
}
2554
/// A large zero-filled blob with a single copyright string at its start is
/// still skipped.
#[test]
fn test_extract_text_for_detection_skips_large_binary_with_single_isolated_string_run() {
    let notice = b"Copyright 2026 Example Project!!!";
    let mut blob = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
    blob[..notice.len()].copy_from_slice(notice);

    let (extracted, kind) = extract_text_for_detection(Path::new("opaque.bin"), &blob);

    assert!(extracted.is_empty());
    assert_eq!(kind, ExtractedTextKind::None);
}
2566
/// A large zero-filled blob with one run full of e-mail addresses and URLs is
/// kept, and those contacts appear in the output.
#[test]
fn test_extract_text_for_detection_keeps_large_binary_with_single_contact_rich_window() {
    let contacts = b"Andreas Schneider <asn@redhat.com> Rob Crittenden (rcritten@redhat.com) Mr. Sam <sam@email-scan.com> https://publicsuffix.org/ http://tukaani.org/xz/";
    let mut blob = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
    blob[..contacts.len()].copy_from_slice(contacts);

    let (extracted, kind) = extract_text_for_detection(Path::new("rootfs.bin"), &blob);

    assert_ne!(kind, ExtractedTextKind::None);
    assert!(extracted.contains("asn@redhat.com"));
    assert!(extracted.contains("https://publicsuffix.org/"));
}
2579
/// A short blob mixing printable ASCII with control and high bytes is reported
/// as `BinaryStrings`, keeping only the two printable runs joined by a newline.
#[test]
fn test_extract_text_for_detection_avoids_latin1_decode_for_binary_blob_noise() {
    let blob: [u8; 13] = [
        0x28, 0x63, 0x29, 0x20, 0x4b, 0x30, 0x0e, 0x71, 0x86, 0x20, 0x62, 0x24, 0x4c,
    ];

    let (extracted, kind) = extract_text_for_detection(Path::new("fixture.blb"), &blob);

    assert_eq!(kind, ExtractedTextKind::BinaryStrings);
    assert_eq!(extracted, "(c) K0\n b$L");
}
2591
/// An XMP payload one byte over `MAX_XMP_PACKET_BYTES` is not returned.
#[test]
fn test_extract_raw_xmp_packet_rejects_oversized_png_itxt_payload() {
    let oversized_packet = "A".repeat(MAX_XMP_PACKET_BYTES + 1);
    let png = build_png_with_xmp(&oversized_packet);

    let packet = extract_raw_xmp_packet(&png, ImageFormat::Png);

    assert!(packet.is_none());
}
2599
/// Encrypted-PDF, broken cross-reference and missing security-handler failures
/// count as non-actionable; an unrelated parser failure does not.
#[test]
fn test_non_actionable_pdf_failures_are_suppressed() {
    let password_protected = [
        "from-bytes first-page: PDF is encrypted and requires a password".to_string(),
        "open full-document: PDF is encrypted and requires a password".to_string(),
    ];
    assert!(is_non_actionable_pdf_failure(&password_protected));

    let broken_xref = [
        "from-bytes first-page: Invalid cross-reference table".to_string(),
        "open full-document: Invalid cross-reference table".to_string(),
    ];
    assert!(is_non_actionable_pdf_failure(&broken_xref));

    let broken_encryption = [
        "from-bytes first-page: Invalid PDF: Encrypt dictionary missing /O".to_string(),
        "open full-document: Invalid PDF: security handler cannot be found".to_string(),
    ];
    assert!(is_non_actionable_pdf_failure(&broken_encryption));

    let unrelated = ["from-bytes first-page: some other parser failure".to_string()];
    assert!(!is_non_actionable_pdf_failure(&unrelated));
}
2618
/// Zip-signature bytes named `.whl` or `.crate` yield no text.
#[test]
fn test_extract_text_for_detection_skips_zip_like_archives() {
    let zip_header = b"PK\x03\x04\x14\x00\x00\x00\x08\x00artifact";

    let wheel = extract_text_for_detection(Path::new("demo.whl"), zip_header);
    let krate = extract_text_for_detection(Path::new("demo.crate"), zip_header);

    for (extracted, kind) in [wheel, krate] {
        assert!(extracted.is_empty());
        assert_eq!(kind, ExtractedTextKind::None);
    }
}
2632
/// The `.lib` fixture yields text containing its embedded copyright notice.
#[test]
fn test_extract_text_for_detection_keeps_binary_strings_for_lib_fixtures() {
    let lib_path =
        Path::new("testdata/copyright-golden/copyrights/copyright_php_lib-php_embed_lib.lib");
    let lib_bytes = std::fs::read(lib_path).expect("failed to read lib fixture");

    let (extracted, kind) = extract_text_for_detection(lib_path, &lib_bytes);

    assert_ne!(kind, ExtractedTextKind::None);
    assert!(extracted.contains("Copyright nexB and others (c) 2012"));
}
2644
/// The Lato font fixture is reported as `FontMetadata`, with a license
/// description, an OFL mention and the family name in the text.
#[test]
fn test_extract_text_for_detection_reads_font_metadata() {
    let font_path = Path::new("testdata/font-fixtures/Lato-Bold.ttf");
    let font_bytes = std::fs::read(font_path).expect("failed to read font fixture");

    let (text, kind) = extract_text_for_detection(font_path, &font_bytes);

    assert_eq!(kind, ExtractedTextKind::FontMetadata);
    assert!(text.contains("License Description:"), "{text}");
    let mentions_ofl = text.contains("Open Font License") || text.contains("OFL");
    assert!(mentions_ofl, "{text}");
    assert!(text.contains("Lato"), "{text}");
}
2660
/// 525,000 repetitions of `abcd` + NUL yield more than 2,000,000 bytes of
/// output that still ends with the final `abcd` run.
#[test]
fn test_extract_printable_strings_scales_cap_for_medium_binary_files() {
    let input = b"abcd\0".repeat(525_000);

    let strings = extract_printable_strings(&input);
    let produced = strings.len();

    assert!(produced > 2_000_000, "unexpected truncation at {}", produced);
    assert!(strings.ends_with("abcd"));
}
2674
/// The SVG fixture is reported as `Decoded` and its public-domain URL is kept.
#[test]
fn test_extract_text_for_detection_decodes_svg_fixture_text() {
    let svg_path = Path::new(
        "testdata/license-golden/datadriven/external/fossology-tests/Public-domain/biohazard.svg",
    );
    let svg_bytes = std::fs::read(svg_path).expect("failed to read svg fixture");

    let (decoded, kind) = extract_text_for_detection(svg_path, &svg_bytes);

    assert_eq!(kind, ExtractedTextKind::Decoded);
    assert!(decoded.contains("creativecommons.org/licenses/publicdomain"));
}
2687
/// Decoding the Java fixture keeps its line structure: specific lines are
/// checked at fixed zero-based indices 2, 3 and 5.
#[test]
fn test_extract_text_for_detection_preserves_blank_comment_lines_in_utf8_source() {
    let java_path = Path::new("testdata/plugin_email_url/files/IMarkerActionFilter.java");
    let java_bytes = std::fs::read(java_path).expect("failed to read java fixture");

    let (decoded, kind) = extract_text_for_detection(java_path, &java_bytes);
    assert_eq!(kind, ExtractedTextKind::Decoded);

    let decoded_lines: Vec<&str> = decoded.lines().collect();
    let line_at = |index: usize| decoded_lines.get(index).copied();

    assert_eq!(line_at(2), Some(" *"));
    assert_eq!(
        line_at(3),
        Some(" *https://github.com/rpm-software-management")
    );
    assert_eq!(line_at(5), Some("https://gitlab.com/Conan_Kudo"));
}
2704
/// The RTF fixture is reported as `Decoded` and its LGPL wording is readable.
#[test]
fn test_extract_text_for_detection_decodes_rtf_fixture_text() {
    let rtf_path =
        Path::new("testdata/license-golden/datadriven/external/fossology-tests/LGPL/License.rtf");
    let rtf_bytes = std::fs::read(rtf_path).expect("failed to read rtf fixture");

    let (decoded, kind) = extract_text_for_detection(rtf_path, &rtf_bytes);

    assert_eq!(kind, ExtractedTextKind::Decoded);
    for expected in ["GNU Lesser General Public", "version", "2.1 of the License"] {
        assert!(decoded.contains(expected));
    }
}
2719
/// A `video/mp2t` guess for textual `main.ts` content becomes `text/plain`.
#[test]
fn test_normalize_mime_type_prefers_text_for_textual_video_guess() {
    let normalized = normalize_mime_type(
        Path::new("main.ts"),
        b"export const answer = 42;\n",
        Some("TypeScript"),
        "video/mp2t",
    );

    assert_eq!(normalized, "text/plain");
}
2732
/// An `application/octet-stream` guess for textual `main.js` content becomes
/// `text/plain`.
#[test]
fn test_normalize_mime_type_prefers_text_for_octet_stream_source_guess() {
    let normalized = normalize_mime_type(
        Path::new("main.js"),
        b"console.log('hello');\n",
        Some("JavaScript"),
        "application/octet-stream",
    );

    assert_eq!(normalized, "text/plain");
}
2745
/// A `video/mp2t` guess is kept when the `main.ts` content is binary.
#[test]
fn test_normalize_mime_type_preserves_binary_video_guess() {
    let binary_payload = [0, 159, 146, 150, 0, 1, 2, 3];

    let normalized = normalize_mime_type(
        Path::new("main.ts"),
        &binary_payload,
        Some("TypeScript"),
        "video/mp2t",
    );

    assert_eq!(normalized, "video/mp2t");
}
2758
/// An `application/octet-stream` guess is kept for a four-byte binary payload.
#[test]
fn test_normalize_mime_type_preserves_short_binary_octet_stream_guess() {
    let binary_payload = [0, 159, 146, 150];

    let normalized = normalize_mime_type(
        Path::new("main.ts"),
        &binary_payload,
        Some("TypeScript"),
        "application/octet-stream",
    );

    assert_eq!(normalized, "application/octet-stream");
}
2771
/// A zero-byte `test.txt` is classified as empty text with no language.
#[test]
fn test_classify_file_info_marks_empty_files_as_text_not_source() {
    let info = classify_file_info(Path::new("test.txt"), b"");

    assert_eq!(info.mime_type, "inode/x-empty");
    assert_eq!(info.file_type, "empty");
    assert!(!info.is_binary);
    assert!(info.is_text);
    assert!(!info.is_source);
    assert_eq!(info.programming_language, None);
}
2783
/// Valid JSON is text data with a JSON MIME type, not source code.
#[test]
fn test_classify_file_info_keeps_json_out_of_programming_language() {
    let info = classify_file_info(Path::new("package.json"), br#"{"name":"demo"}"#);

    assert_eq!(info.mime_type, "application/json");
    assert_eq!(info.file_type, "JSON text data");
    assert!(info.is_text);
    assert!(!info.is_source);
    assert_eq!(info.programming_language, None);
}
2794
/// Text that is not valid JSON is plain UTF-8 text despite the `.json` name.
#[test]
fn test_classify_file_info_does_not_label_invalid_json_text_as_json() {
    let info = classify_file_info(Path::new("broken.json"), b"{ definitely not json\n");

    assert_eq!(info.mime_type, "text/plain");
    assert_eq!(info.file_type, "UTF-8 Unicode text");
    assert!(info.is_text);
    assert!(!info.is_binary);
}
2805
/// Binary bytes in a `.json` file are classified as opaque binary data.
#[test]
fn test_classify_file_info_does_not_label_binary_json_garbage_as_json() {
    let garbage = [0xff, 0x00, 0x01, 0x02];

    let info = classify_file_info(Path::new("broken.json"), &garbage);

    assert_eq!(info.mime_type, "application/octet-stream");
    assert_eq!(info.file_type, "data");
    assert!(info.is_binary);
    assert!(!info.is_text);
}
2816
/// JSON stored as two-byte code units after an `FF FE` byte-order mark is
/// classified as JSON text.
#[test]
fn test_classify_file_info_treats_valid_utf16_json_with_bom_as_text() {
    let payload = [
        0xFF, 0xFE, 0x5B, 0x00, 0x22, 0x00, 0xE9, 0x00, 0x22, 0x00, 0x5D, 0x00,
    ];

    let info = classify_file_info(Path::new("utf16.json"), &payload);

    assert!(!info.is_binary);
    assert!(info.is_text);
    assert_eq!(info.mime_type, "application/json");
    assert_eq!(info.file_type, "JSON text data");
}
2831
/// JSON stored as big-endian two-byte code units without a byte-order mark is
/// classified as JSON text.
#[test]
fn test_classify_file_info_treats_valid_utf16be_json_without_bom_as_text() {
    let payload = [0x00, 0x5B, 0x00, 0x22, 0x00, 0xE9, 0x00, 0x22, 0x00, 0x5D];

    let info = classify_file_info(Path::new("utf16be.json"), &payload);

    assert!(!info.is_binary);
    assert!(info.is_text);
    assert_eq!(info.mime_type, "application/json");
    assert_eq!(info.file_type, "JSON text data");
}
2844
/// The four-byte payload `00 5B 00 5D` (big-endian `[]`) is classified as JSON
/// text.
#[test]
fn test_classify_file_info_treats_small_valid_utf16be_json_literal_as_text() {
    let payload = [0x00, 0x5B, 0x00, 0x5D];

    let info = classify_file_info(Path::new("utf16be-literal.json"), &payload);

    assert!(!info.is_binary);
    assert!(info.is_text);
    assert_eq!(info.mime_type, "application/json");
    assert_eq!(info.file_type, "JSON text data");
}
2855
/// Big-endian UTF-16 text that follows `CORRUPTED_UTF16_BOM_PREFIX` is decoded
/// into readable text.
#[test]
fn test_extract_text_for_detection_decodes_utf16be_text_with_corrupted_bom_prefix() {
    let notice = "Licensed to the Apache Software Foundation\nApache License, Version 2.0";

    let mut payload = super::CORRUPTED_UTF16_BOM_PREFIX.to_vec();
    // Append each UTF-16 code unit in big-endian byte order.
    payload.extend(notice.encode_utf16().flat_map(u16::to_be_bytes));

    let (text, kind) = extract_text_for_detection(Path::new("notice.ftl"), &payload);

    assert_eq!(kind, ExtractedTextKind::Decoded);
    assert!(text.contains("Apache Software Foundation"), "{text}");
    assert!(text.contains("Apache License, Version 2.0"), "{text}");
}
2871
/// The bare JSON literal `true` is classified as JSON text.
#[test]
fn test_classify_file_info_treats_small_valid_json_literals_as_text() {
    let info = classify_file_info(Path::new("true.json"), b"true");

    assert!(!info.is_binary);
    assert!(info.is_text);
    assert_eq!(info.mime_type, "application/json");
    assert_eq!(info.file_type, "JSON text data");
}
2881
/// A JSON-shaped payload whose string holds an invalid UTF-8 byte stays text,
/// but is labelled plain text rather than JSON.
#[test]
fn test_classify_file_info_treats_json_wrapped_invalid_utf8_sequences_as_text() {
    let payload = [0x5B, 0x22, 0xE6, 0x97, 0xA5, 0xD1, 0x88, 0xFA, 0x22, 0x5D];

    let info = classify_file_info(Path::new("wrapped.json"), &payload);

    assert!(!info.is_binary);
    assert!(info.is_text);
    assert_eq!(info.mime_type, "text/plain");
    assert_eq!(info.file_type, "text, with no line terminators");
}
2894
/// A JSON-shaped payload whose string is a lone `0xFF` byte is binary data.
#[test]
fn test_classify_file_info_keeps_lone_ff_json_byte_binary() {
    let payload = [0x5B, 0x22, 0xFF, 0x22, 0x5D];

    let info = classify_file_info(Path::new("lone-ff.json"), &payload);

    assert!(info.is_binary);
    assert!(!info.is_text);
    assert_eq!(info.mime_type, "application/octet-stream");
    assert_eq!(info.file_type, "data");
}
2905
/// A `.json` payload starting with high bytes and NULs is classified as binary.
#[test]
fn test_classify_file_info_keeps_nul_heavy_crash_json_binary() {
    let payload = [
        0xFE, 0x90, 0x00, 0x00, 0x00, 0x93, 0x5B, 0x5B, 0x32, 0x38, 0x36,
    ];

    let info = classify_file_info(Path::new("crash.json"), &payload);

    assert!(info.is_binary);
    assert!(!info.is_text);
    assert_eq!(info.mime_type, "application/octet-stream");
}
2919
/// A `Dockerfile` is source code in the `Dockerfile` language, not a script.
#[test]
fn test_classify_file_info_treats_dockerfile_as_source() {
    let info = classify_file_info(Path::new("Dockerfile"), b"FROM scratch\n");

    let language = info.programming_language.as_deref();
    assert_eq!(language, Some("Dockerfile"));
    assert!(info.is_source);
    assert!(!info.is_script);
    assert_eq!(info.file_type, "Dockerfile source, UTF-8 Unicode text");
}
2935
/// A `Makefile` is plain text with no language, neither source nor script.
#[test]
fn test_classify_file_info_treats_makefile_as_text_not_source() {
    let info = classify_file_info(Path::new("Makefile"), b"all:\n\techo hi\n");

    assert_eq!(info.programming_language, None);
    assert!(info.is_text);
    assert!(!info.is_source);
    assert!(!info.is_script);
    assert_eq!(info.file_type, "UTF-8 Unicode text");
}
2946
/// Zip-signature bytes named `.egg` or `.nupkg` are classified as zip archives.
#[test]
fn test_classify_file_info_marks_supported_package_archives() {
    let zip_header = b"PK\x03\x04\x14\x00\x00\x00";

    let egg = classify_file_info(Path::new("demo.egg"), zip_header);
    let nupkg = classify_file_info(Path::new("demo.nupkg"), zip_header);

    for package in [egg, nupkg] {
        assert!(package.is_archive);
        assert_eq!(package.mime_type, "application/zip");
        assert_eq!(package.file_type, "Zip archive data");
    }
}
2961
/// A PNG header is binary media: not text, not an archive, not source.
#[test]
fn test_classify_file_info_marks_png_as_binary_media() {
    let png_header = b"\x89PNG\r\n\x1a\n\x00\x00\x00\x0dIHDR";

    let info = classify_file_info(Path::new("logo.png"), png_header);

    assert_eq!(info.mime_type, "image/png");
    assert_eq!(info.file_type, "PNG image data");
    assert!(info.is_binary);
    assert!(!info.is_text);
    assert!(info.is_media);
    assert!(!info.is_archive);
    assert!(!info.is_source);
}
2976
/// A PDF header is a binary document: not text, not an archive, not media.
#[test]
fn test_classify_file_info_marks_pdf_as_binary_document() {
    let pdf_header = b"%PDF-1.7\n1 0 obj\n<< /Type /Catalog >>\n";

    let info = classify_file_info(Path::new("report.pdf"), pdf_header);

    assert_eq!(info.mime_type, "application/pdf");
    assert_eq!(info.file_type, "PDF document");
    assert!(info.is_binary);
    assert!(!info.is_text);
    assert!(!info.is_archive);
    assert!(!info.is_media);
}
2990
/// An arbitrary binary payload is binary with no language and no source flag.
#[test]
fn test_classify_file_info_marks_binary_blobs_as_binary() {
    let blob = [0, 159, 146, 150, 0, 1, 2, 3, 4, 5];

    let info = classify_file_info(Path::new("blob.bin"), &blob);

    assert!(info.is_binary);
    assert!(!info.is_text);
    assert!(!info.is_source);
    assert_eq!(info.programming_language, None);
}
3001
/// YAML is text data with its own file type label and no language.
#[test]
fn test_classify_file_info_treats_yaml_as_text_not_source() {
    let info = classify_file_info(Path::new("config.yaml"), b"key: value\n");

    assert_eq!(info.programming_language, None);
    assert!(info.is_text);
    assert!(!info.is_source);
    assert_eq!(info.file_type, "YAML text data");
}
3011
/// Gradle, Nix and CMake files are source in their respective languages, while
/// `.gitmodules` is Git configuration text rather than source.
#[test]
fn test_classify_file_info_classifies_common_build_manifests() {
    let classify = |name: &str, content: &[u8]| classify_file_info(Path::new(name), content);

    let gradle = classify("build.gradle", b"plugins { id 'java' }\n");
    let flake = classify("flake.nix", b"{ inputs, ... }: {}\n");
    let cmake = classify("toolchain.cmake", b"set(CMAKE_CXX_STANDARD 20)\n");
    let gitmodules = classify(
        ".gitmodules",
        b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
    );

    // Gradle build script.
    assert_eq!(gradle.programming_language.as_deref(), Some("Groovy"));
    assert!(gradle.is_source);
    assert_eq!(gradle.mime_type, "text/plain");
    assert_eq!(gradle.file_type, "Groovy source, UTF-8 Unicode text");

    // Nix flake.
    assert_eq!(flake.programming_language.as_deref(), Some("Nix"));
    assert!(flake.is_source);
    assert_eq!(flake.mime_type, "text/plain");
    assert_eq!(flake.file_type, "Nix source, UTF-8 Unicode text");

    // CMake toolchain file.
    assert_eq!(cmake.programming_language.as_deref(), Some("CMake"));
    assert!(cmake.is_source);
    assert_eq!(cmake.file_type, "CMake source, UTF-8 Unicode text");

    // Git submodule configuration.
    assert_eq!(gitmodules.programming_language, None);
    assert!(gitmodules.is_text);
    assert!(!gitmodules.is_source);
    assert_eq!(gitmodules.file_type, "Git configuration text");
}
3044
/// A `.hpp` file is C++ source, while a `.ipp` file stays plain text with no
/// language.
#[test]
fn test_classify_file_info_labels_cpp_headers_and_ipp_separately() {
    let classify = |name: &str, content: &[u8]| classify_file_info(Path::new(name), content);

    let header = classify("include/demo.hpp", b"#pragma once\nclass Demo {};\n");
    let ipp = classify(
        "include/detail/demo.ipp",
        b"template <class T> void parse() {}\n",
    );

    assert_eq!(header.programming_language.as_deref(), Some("C++"));
    assert!(header.is_source);
    assert!(!header.is_script);
    assert_eq!(header.file_type, "C++ source, UTF-8 Unicode text");

    assert_eq!(ipp.programming_language, None);
    assert!(!ipp.is_source);
    assert!(!ipp.is_script);
    assert_eq!(ipp.file_type, "UTF-8 Unicode text");
}
3066
/// An extension-less file with a `bash` shebang is labelled as a Bash script.
#[test]
fn test_classify_file_info_preserves_specific_shell_family_labels() {
    let script = classify_file_info(Path::new("bin/run"), b"#!/usr/bin/env bash\necho hi\n");

    let language = script.programming_language.as_deref();
    assert_eq!(language, Some("Bash"));
    assert!(script.is_script);
    assert_eq!(
        script.file_type,
        "bash script, UTF-8 Unicode text executable"
    );
}
3075
/// A `Jamfile` is source code in the `Jamfile` language, not a script.
#[test]
fn test_classify_file_info_marks_jamfile_as_source() {
    let info = classify_file_info(Path::new("Jamfile"), b"lib boost_json ;\n");

    let language = info.programming_language.as_deref();
    assert_eq!(language, Some("Jamfile"));
    assert!(info.is_source);
    assert!(!info.is_script);
    assert_eq!(info.file_type, "Jamfile source, UTF-8 Unicode text");
}
3085
/// An extension-less file with a `node` shebang is labelled as a JavaScript
/// script.
#[test]
fn test_classify_file_info_labels_javascript_shebang_scripts() {
    let content = b"#!/usr/bin/env node\nconsole.log('hello');\n";

    let info = classify_file_info(Path::new("bin/run"), content);

    let language = info.programming_language.as_deref();
    assert_eq!(language, Some("JavaScript"));
    assert!(info.is_script);
    assert_eq!(
        info.file_type,
        "javascript script, UTF-8 Unicode text executable"
    );
}
3103
/// A Python script containing a non-UTF-8 byte (`0xE9`) is still a Python
/// script, and its file type label omits the UTF-8 wording.
#[test]
fn test_classify_file_info_uses_non_utf8_text_labels_for_latin1_scripts() {
    let content = b"# coding: latin-1\nprint(\"caf\xe9\")\n";

    let info = classify_file_info(Path::new("script.py"), content);

    let language = info.programming_language.as_deref();
    assert_eq!(language, Some("Python"));
    assert!(info.is_script);
    assert_eq!(info.file_type, "python script, text executable");
}
3118
/// A `.tga` file with textual content is flagged as media and as text.
#[test]
fn test_classify_file_info_treats_textual_tga_as_media() {
    let info = classify_file_info(Path::new("texture.tga"), b"not really a tga\n");

    assert!(info.is_media);
    assert!(info.is_text);
    assert!(!info.is_binary);
}
3127
/// High-byte content in a `.ts` file is binary, with no language or source flag.
#[test]
fn test_classify_file_info_keeps_binaryish_source_extension_out_of_text_path() {
    let payload = [0x80, 0x81, 0x82, 0x83, 0x84, 0x85];

    let info = classify_file_info(Path::new("main.ts"), &payload);

    assert!(info.is_binary);
    assert!(!info.is_text);
    assert!(!info.is_source);
    assert_eq!(info.programming_language, None);
}
3138
/// A tiny GIF yields no text and `ExtractedTextKind::None`.
#[test]
fn test_extract_text_for_detection_skips_unsupported_image_formats() {
    let gif = b"GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff,\x00\x00\x00\x00\x01\x00\x01\x00\x00\x02\x02D\x01\x00;";

    let (extracted, kind) = extract_text_for_detection(Path::new("tiny.gif"), gif);

    assert!(extracted.is_empty());
    assert_eq!(kind, ExtractedTextKind::None);
}
3148
/// Table-driven check of the detected language and the `is_source` /
/// `is_script` flags for a shebang script, a Dockerfile, JSON, YAML and a
/// Makefile.
#[test]
fn test_classify_file_info_preserves_language_detection_precedence_matrix() {
    /// One row of the expectation table.
    struct Case {
        path: &'static str,
        content: &'static [u8],
        language: Option<&'static str>,
        is_source: bool,
        is_script: bool,
    }

    let matrix = [
        Case {
            path: "bin/run",
            content: b"#!/usr/bin/env node\nconsole.log('hello');\n",
            language: Some("JavaScript"),
            is_source: true,
            is_script: true,
        },
        Case {
            path: "Dockerfile",
            content: b"FROM scratch\n",
            language: Some("Dockerfile"),
            is_source: true,
            is_script: false,
        },
        Case {
            path: "package.json",
            content: br#"{"name":"demo"}"#,
            language: None,
            is_source: false,
            is_script: false,
        },
        Case {
            path: "config.yaml",
            content: b"key: value\n",
            language: None,
            is_source: false,
            is_script: false,
        },
        Case {
            path: "Makefile",
            content: b"all:\n\techo hi\n",
            language: None,
            is_source: false,
            is_script: false,
        },
    ];

    for case in matrix {
        let path = Path::new(case.path);
        let info = classify_file_info(path, case.content);

        assert_eq!(
            info.programming_language.as_deref(),
            case.language,
            "unexpected language for {}",
            path.display()
        );
        assert_eq!(
            info.is_source,
            case.is_source,
            "unexpected is_source for {}",
            path.display()
        );
        assert_eq!(
            info.is_script,
            case.is_script,
            "unexpected is_script for {}",
            path.display()
        );
    }
}
3212}