1use std::collections::BTreeSet;
2use std::fs;
3use std::io::{BufReader, Cursor, Read};
4use std::panic::{AssertUnwindSafe, catch_unwind};
5use std::path::Path;
6
7use chrono::{TimeZone, Utc};
8use file_format::{FileFormat, Kind as FileFormatKind};
9use flate2::read::ZlibDecoder;
10use glob::Pattern;
11use image::{ImageDecoder, ImageFormat, ImageReader};
12use mime_guess::from_path;
13use quick_xml::events::Event;
14use quick_xml::reader::Reader as XmlReader;
15
16use crate::utils::language::detect_language;
17
/// Describes how the text returned by `extract_text_for_detection` was obtained.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtractedTextKind {
    /// No text was produced (the accompanying string is empty).
    None,
    /// Text came from decoding the bytes directly, or from RTF markup stripping.
    Decoded,
    /// Text came from the PDF extraction path.
    Pdf,
    /// Text came from scanning binary content for printable strings.
    BinaryStrings,
    /// Text came from image metadata (EXIF / XMP values).
    ImageMetadata,
}
26
/// Aggregated classification of a single file, as produced by `classify_file_info`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileInfoClassification {
    /// MIME type chosen by `detect_mime_type`.
    pub mime_type: String,
    /// Human-readable type label chosen by `detect_file_type`.
    pub file_type: String,
    /// Detected language; only populated when `is_source` is true.
    pub programming_language: Option<String>,
    /// Whether the content was judged to be binary.
    pub is_binary: bool,
    /// Always the negation of `is_binary`.
    pub is_text: bool,
    /// Whether the file looks like an archive, compressed stream, or package.
    pub is_archive: bool,
    /// Whether the file looks like image, audio, or video content.
    pub is_media: bool,
    /// Whether the file is considered source code.
    pub is_source: bool,
    /// Whether the file is considered a script.
    pub is_script: bool,
}
39
// NOTE(review): the two image-metadata limits below are not referenced in the
// visible part of this file; presumably they cap the output of `values_to_text`.
const MAX_IMAGE_METADATA_VALUES: usize = 64;
const MAX_IMAGE_METADATA_TEXT_BYTES: usize = 32 * 1024;
// Content counts as binary when more than `len / DIVISOR` bytes are control characters.
const BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR: usize = 10;
// Lowercase extensions that are never reported as source code.
const PLAIN_TEXT_EXTENSIONS: &[&str] = &[
    "rst", "rest", "md", "txt", "log", "json", "xml", "yaml", "yml", "toml", "ini",
];
// Lowercase extensions treated as binary unless the sniffed format is textual.
const BINARY_EXTENSIONS: &[&str] = &[
    "pyc", "pyo", "pgm", "pbm", "ppm", "mp3", "mp4", "mpeg", "mpg", "emf",
];
// Lowercase extensions that mark a non-text file as an archive.
const ARCHIVE_EXTENSIONS: &[&str] = &[
    "zip", "jar", "war", "ear", "tar", "gz", "tgz", "bz2", "xz", "7z", "rar", "apk", "deb", "rpm",
    "whl", "crate", "egg", "gem", "nupkg", "sqs", "squashfs",
];
53
54pub fn get_creation_date(metadata: &fs::Metadata) -> Option<String> {
56 metadata.modified().ok().map(|time: std::time::SystemTime| {
57 let seconds_since_epoch = time
58 .duration_since(std::time::UNIX_EPOCH)
59 .unwrap()
60 .as_secs() as i64;
61
62 Utc.timestamp_opt(seconds_since_epoch, 0)
63 .single()
64 .unwrap_or_else(Utc::now)
65 .format("%Y-%m-%d")
66 .to_string()
67 })
68}
69
70pub fn is_path_excluded(path: &Path, exclude_patterns: &[Pattern]) -> bool {
72 let path_str = path.to_string_lossy();
73 let file_name = path
74 .file_name()
75 .map(|name| name.to_string_lossy())
76 .unwrap_or_default();
77
78 for pattern in exclude_patterns {
79 if pattern.matches(&path_str) {
81 return true;
82 }
83
84 if pattern.matches(&file_name) {
86 return true;
87 }
88 }
89
90 false
91}
92
93pub fn decode_bytes_to_string(bytes: &[u8]) -> String {
99 match String::from_utf8(bytes.to_vec()) {
100 Ok(s) => s,
101 Err(e) => {
102 let bytes = e.into_bytes();
103 if has_binary_control_chars(&bytes) {
104 return String::new();
105 }
106 bytes.iter().map(|&b| b as char).collect()
107 }
108 }
109}
110
111pub fn extract_text_for_detection(path: &Path, bytes: &[u8]) -> (String, ExtractedTextKind) {
112 let ext = path
113 .extension()
114 .and_then(|e| e.to_str())
115 .map(|s| s.to_ascii_lowercase());
116 let detected_format = detect_file_format(bytes);
117
118 if looks_like_rtf(bytes, ext.as_deref()) {
119 let text = extract_rtf_text(bytes);
120 return if text.trim().is_empty() {
121 (String::new(), ExtractedTextKind::None)
122 } else {
123 (text, ExtractedTextKind::Decoded)
124 };
125 }
126
127 if looks_like_pdf(bytes) || detected_format.short_name() == Some("PDF") {
128 let text = extract_pdf_text(path, bytes);
129 return if text.is_empty() {
130 (String::new(), ExtractedTextKind::None)
131 } else {
132 (text, ExtractedTextKind::Pdf)
133 };
134 }
135
136 if let Some(format) = supported_image_metadata_format(ext.as_deref(), detected_format) {
137 let text = extract_image_metadata_text(bytes, format);
138 return if text.is_empty() {
139 if is_supported_image_container(bytes, format) {
140 (String::new(), ExtractedTextKind::None)
141 } else {
142 let decoded = decode_bytes_to_string(bytes);
143 if decoded.is_empty() {
144 (String::new(), ExtractedTextKind::None)
145 } else {
146 (decoded, ExtractedTextKind::Decoded)
147 }
148 }
149 } else {
150 (text, ExtractedTextKind::ImageMetadata)
151 };
152 }
153
154 if should_skip_binary_string_extraction(path, bytes, detected_format) {
155 return (String::new(), ExtractedTextKind::None);
156 }
157
158 let decoded = decode_bytes_to_string(bytes);
159 if !decoded.is_empty() {
160 return (decoded, ExtractedTextKind::Decoded);
161 }
162
163 let text = extract_printable_strings(bytes);
164 if text.is_empty() {
165 (String::new(), ExtractedTextKind::None)
166 } else {
167 (text, ExtractedTextKind::BinaryStrings)
168 }
169}
170
171pub fn classify_file_info(path: &Path, bytes: &[u8]) -> FileInfoClassification {
172 let detected_format = detect_file_format(bytes);
173 let detected_language = detect_language(path, bytes);
174 let is_binary = detect_is_binary(path, bytes, detected_format, detected_language.as_deref());
175 let is_text = !is_binary;
176 let mime_type = detect_mime_type(path, bytes, detected_format, detected_language.as_deref());
177 let is_archive = detect_is_archive(path, bytes, &mime_type, is_text, detected_format);
178 let is_media = detect_is_media(path, bytes, &mime_type, detected_format);
179 let is_script = detect_is_script(path, bytes, detected_language.as_deref(), is_text);
180 let is_source = detect_is_source(path, detected_language.as_deref(), is_text, is_script);
181 let programming_language = is_source.then(|| detected_language.clone()).flatten();
182 let file_type = detect_file_type(
183 path,
184 bytes,
185 detected_format,
186 &mime_type,
187 programming_language.as_deref(),
188 is_binary,
189 is_text,
190 is_archive,
191 is_media,
192 is_script,
193 );
194
195 FileInfoClassification {
196 mime_type,
197 file_type,
198 programming_language,
199 is_binary,
200 is_text,
201 is_archive,
202 is_media,
203 is_source,
204 is_script,
205 }
206}
207
208fn detect_file_format(bytes: &[u8]) -> FileFormat {
209 FileFormat::from_reader(Cursor::new(bytes)).unwrap_or(FileFormat::ArbitraryBinaryData)
210}
211
/// Returns `true` when `bytes` form valid UTF-8 (the empty slice included).
fn is_utf8_text(bytes: &[u8]) -> bool {
    matches!(std::str::from_utf8(bytes), Ok(_))
}
215
216fn has_binary_control_chars(bytes: &[u8]) -> bool {
217 let control_count = bytes
218 .iter()
219 .filter(|&&b| b < 0x09 || (b > 0x0D && b < 0x20))
220 .count();
221 control_count > bytes.len() / BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR
222}
223
224fn has_decodable_text(bytes: &[u8]) -> bool {
225 bytes.is_empty() || is_utf8_text(bytes) || !has_binary_control_chars(bytes)
226}
227
228fn looks_like_textual_bytes(bytes: &[u8]) -> bool {
229 if bytes.is_empty() || is_utf8_text(bytes) {
230 return true;
231 }
232
233 let printable_count = bytes
234 .iter()
235 .filter(|&&b| matches!(b, b'\n' | b'\r' | b'\t') || (0x20..=0x7e).contains(&b))
236 .count();
237 printable_count * 2 >= bytes.len()
238}
239
/// Returns `true` for media types that denote text: anything under `text/`,
/// JSON or XML application types, and structured `+json` / `+xml` suffixes.
fn is_textual_media_type(media_type: &str) -> bool {
    const TEXTUAL_APPLICATION_TYPES: [&str; 2] = ["application/json", "application/xml"];
    const STRUCTURED_SUFFIXES: [&str; 2] = ["+json", "+xml"];

    // "text/xml" is already covered by the `text/` prefix.
    if media_type.starts_with("text/") {
        return true;
    }

    TEXTUAL_APPLICATION_TYPES.contains(&media_type)
        || STRUCTURED_SUFFIXES
            .iter()
            .any(|suffix| media_type.ends_with(*suffix))
}
249
250fn is_textual_format(detected_format: FileFormat) -> bool {
251 matches!(detected_format, FileFormat::Empty | FileFormat::PlainText)
252 || is_textual_media_type(detected_format.media_type())
253}
254
255fn is_known_binary_format(detected_format: FileFormat) -> bool {
256 !matches!(detected_format, FileFormat::ArbitraryBinaryData)
257 && !is_textual_format(detected_format)
258}
259
260pub fn detect_mime_type(
261 path: &Path,
262 bytes: &[u8],
263 detected_format: FileFormat,
264 programming_language: Option<&str>,
265) -> String {
266 if bytes.is_empty() {
267 return "inode/x-empty".to_string();
268 }
269
270 if is_zip_archive(bytes) {
271 return detect_zip_like_mime(path);
272 }
273
274 if looks_like_deb(bytes, path) {
275 return "application/vnd.debian.binary-package".to_string();
276 }
277
278 if looks_like_rpm(bytes, path) {
279 return "application/x-rpm".to_string();
280 }
281
282 let guessed_mime = from_path(path)
283 .first_or_octet_stream()
284 .essence_str()
285 .to_string();
286
287 let mime_type = match detected_format {
288 FileFormat::Empty => "inode/x-empty".to_string(),
289 FileFormat::PlainText => {
290 if guessed_mime == "application/octet-stream" || guessed_mime.starts_with("video/") {
291 "text/plain".to_string()
292 } else {
293 guessed_mime.clone()
294 }
295 }
296 _ => {
297 let detected_mime = detected_format.media_type();
298 if detected_mime == "application/octet-stream"
299 && guessed_mime != "application/octet-stream"
300 {
301 guessed_mime.clone()
302 } else {
303 detected_mime.to_string()
304 }
305 }
306 };
307
308 normalize_mime_type(path, bytes, programming_language, &mime_type)
309}
310
311fn normalize_mime_type(
312 path: &Path,
313 bytes: &[u8],
314 programming_language: Option<&str>,
315 mime_type: &str,
316) -> String {
317 if should_prefer_text_mime(path, bytes, programming_language, mime_type) {
318 return "text/plain".to_string();
319 }
320
321 mime_type.to_string()
322}
323
324fn should_prefer_text_mime(
325 path: &Path,
326 bytes: &[u8],
327 programming_language: Option<&str>,
328 mime_type: &str,
329) -> bool {
330 has_decodable_text(bytes)
331 && looks_like_textual_bytes(bytes)
332 && is_textual_source_candidate(path, programming_language)
333 && (mime_type.starts_with("video/") || mime_type == "application/octet-stream")
334}
335
336fn detect_is_binary(
337 path: &Path,
338 bytes: &[u8],
339 detected_format: FileFormat,
340 programming_language: Option<&str>,
341) -> bool {
342 if is_textual_format(detected_format) {
343 return false;
344 }
345
346 if lower_extension(path)
347 .as_deref()
348 .is_some_and(|ext| BINARY_EXTENSIONS.contains(&ext))
349 {
350 return true;
351 }
352
353 if should_treat_binary_bytes_as_text(path, bytes, programming_language) {
354 return false;
355 }
356
357 has_binary_control_chars(bytes)
358 || is_known_binary_format(detected_format)
359 || (matches!(detected_format, FileFormat::ArbitraryBinaryData)
360 && !looks_like_textual_bytes(bytes))
361}
362
363fn should_treat_binary_bytes_as_text(
364 path: &Path,
365 bytes: &[u8],
366 programming_language: Option<&str>,
367) -> bool {
368 has_decodable_text(bytes)
369 && looks_like_textual_bytes(bytes)
370 && (bytes.starts_with(b"#!") || is_textual_source_candidate(path, programming_language))
371}
372
373fn detect_is_archive(
374 path: &Path,
375 bytes: &[u8],
376 mime_type: &str,
377 is_text: bool,
378 detected_format: FileFormat,
379) -> bool {
380 if is_text {
381 return false;
382 }
383
384 lower_extension(path)
385 .as_deref()
386 .is_some_and(|ext| ARCHIVE_EXTENSIONS.contains(&ext))
387 || matches!(
388 detected_format.kind(),
389 FileFormatKind::Archive | FileFormatKind::Compressed | FileFormatKind::Package
390 )
391 || is_zip_archive(bytes)
392 || looks_like_gzip(bytes)
393 || looks_like_bzip2(bytes)
394 || looks_like_xz(bytes)
395 || looks_like_deb(bytes, path)
396 || looks_like_rpm(bytes, path)
397 || looks_like_squashfs(bytes, path)
398 || mime_type.contains("zip")
399 || mime_type.contains("compressed")
400 || mime_type.contains("tar")
401 || mime_type.contains("x-rpm")
402 || mime_type.contains("debian")
403}
404
405fn detect_is_media(
406 path: &Path,
407 bytes: &[u8],
408 mime_type: &str,
409 detected_format: FileFormat,
410) -> bool {
411 media_mime_from_content(bytes).is_some()
412 || matches!(
413 detected_format.kind(),
414 FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
415 )
416 || mime_type.starts_with("image/")
417 || mime_type.starts_with("audio/")
418 || mime_type.starts_with("video/")
419 || (mime_type == "application/octet-stream"
420 && lower_extension(path).as_deref() == Some("tga")
421 && !has_binary_control_chars(bytes))
422}
423
424fn detect_is_script(
425 path: &Path,
426 bytes: &[u8],
427 programming_language: Option<&str>,
428 is_text: bool,
429) -> bool {
430 if !is_text || is_makefile(path) {
431 return false;
432 }
433
434 bytes.starts_with(b"#!")
435 || lower_extension(path).as_deref().is_some_and(|ext| {
436 matches!(
437 ext,
438 "sh" | "bash" | "zsh" | "fish" | "ksh" | "ps1" | "psm1" | "psd1" | "awk"
439 )
440 })
441 || matches!(
442 programming_language,
443 Some("Shell" | "Python" | "Ruby" | "Perl" | "PHP" | "PowerShell" | "Awk")
444 )
445}
446
447fn detect_is_source(
448 path: &Path,
449 programming_language: Option<&str>,
450 is_text: bool,
451 is_script: bool,
452) -> bool {
453 if !is_text || is_plain_text(path) || is_makefile(path) || is_source_map(path) {
454 return false;
455 }
456
457 if is_c_like_source(path) || is_java_like_source(path) {
458 return true;
459 }
460
461 programming_language.is_some() || is_script
462}
463
464#[allow(clippy::too_many_arguments)]
465fn detect_file_type(
466 path: &Path,
467 bytes: &[u8],
468 detected_format: FileFormat,
469 mime_type: &str,
470 programming_language: Option<&str>,
471 is_binary: bool,
472 is_text: bool,
473 is_archive: bool,
474 is_media: bool,
475 is_script: bool,
476) -> String {
477 if bytes.is_empty() {
478 return "empty".to_string();
479 }
480
481 if looks_like_pdf(bytes) {
482 return "PDF document".to_string();
483 }
484
485 if let Some(file_type) = media_file_type_from_content(bytes) {
486 return file_type.to_string();
487 }
488
489 if is_archive {
490 return archive_file_type(path, bytes, detected_format);
491 }
492
493 if is_script {
494 return script_file_type(programming_language, bytes);
495 }
496
497 if is_text {
498 if lower_extension(path).as_deref() == Some("json") {
499 return "JSON text data".to_string();
500 }
501 if lower_extension(path).as_deref() == Some("xml") {
502 return "XML text data".to_string();
503 }
504 if matches!(lower_extension(path).as_deref(), Some("yaml" | "yml")) {
505 return "YAML text data".to_string();
506 }
507 if lower_extension(path).as_deref() == Some("toml") {
508 return "TOML text data".to_string();
509 }
510 if matches!(
511 lower_extension(path).as_deref(),
512 Some("ini" | "cfg" | "conf")
513 ) {
514 return "INI text data".to_string();
515 }
516 if matches!(lower_file_name(path).as_str(), ".gitmodules" | ".gitconfig") {
517 return "Git configuration text".to_string();
518 }
519 if matches!(lower_extension(path).as_deref(), Some("md" | "markdown")) {
520 return text_file_type(bytes);
521 }
522 if programming_language.is_some() && !is_media {
523 return text_file_type(bytes);
524 }
525 return text_file_type(bytes);
526 }
527
528 if let Some(file_type) = format_based_file_type(detected_format) {
529 return file_type;
530 }
531
532 if is_binary && mime_type == "application/octet-stream" {
533 return "data".to_string();
534 }
535
536 mime_type.to_string()
537}
538
539fn is_textual_source_candidate(path: &Path, programming_language: Option<&str>) -> bool {
540 if matches!(programming_language, Some(language) if is_source_like_language(language)) {
541 return true;
542 }
543
544 if matches!(
545 lower_file_name(path).as_str(),
546 "dockerfile"
547 | "containerfile"
548 | "containerfile.core"
549 | "apkbuild"
550 | "podfile"
551 | "meson.build"
552 | "build"
553 | "workspace"
554 | "buck"
555 | "default.nix"
556 | "flake.nix"
557 | "shell.nix"
558 ) {
559 return true;
560 }
561
562 path.extension()
563 .and_then(|ext| ext.to_str())
564 .is_some_and(|ext| {
565 matches!(
566 ext.to_ascii_lowercase().as_str(),
567 "rs" | "py"
568 | "js"
569 | "mjs"
570 | "cjs"
571 | "jsx"
572 | "ts"
573 | "mts"
574 | "cts"
575 | "tsx"
576 | "c"
577 | "cpp"
578 | "cc"
579 | "cxx"
580 | "h"
581 | "hpp"
582 | "m"
583 | "mm"
584 | "s"
585 | "asm"
586 | "java"
587 | "go"
588 | "rb"
589 | "php"
590 | "pl"
591 | "swift"
592 | "sh"
593 | "bash"
594 | "zsh"
595 | "fish"
596 | "ksh"
597 | "ps1"
598 | "psm1"
599 | "psd1"
600 | "awk"
601 | "kt"
602 | "kts"
603 | "dart"
604 | "scala"
605 | "groovy"
606 | "gradle"
607 | "gvy"
608 | "gy"
609 | "gsh"
610 | "cs"
611 | "fs"
612 | "fsx"
613 | "r"
614 | "lua"
615 | "jl"
616 | "ex"
617 | "exs"
618 | "clj"
619 | "cljs"
620 | "cljc"
621 | "hs"
622 | "erl"
623 | "nix"
624 | "zig"
625 | "bzl"
626 | "bazel"
627 | "star"
628 | "sky"
629 | "ml"
630 | "mli"
631 | "tex"
632 )
633 })
634}
635
/// Returns `true` when `language` (an exact, case-sensitive language name) is
/// one of the languages treated as textual source.
fn is_source_like_language(language: &str) -> bool {
    const SOURCE_LIKE_LANGUAGES: &[&str] = &[
        "Rust",
        "Python",
        "JavaScript",
        "TypeScript",
        "JavaScript/TypeScript",
        "C",
        "C++",
        "Objective-C",
        "Objective-C++",
        "GAS",
        "Java",
        "Go",
        "Ruby",
        "PHP",
        "Perl",
        "Swift",
        "Shell",
        "PowerShell",
        "Awk",
        "Kotlin",
        "Dart",
        "Scala",
        "C#",
        "F#",
        "R",
        "Lua",
        "Julia",
        "Elixir",
        "Clojure",
        "Haskell",
        "Erlang",
        "Groovy",
        "Nix",
        "Zig",
        "Starlark",
        "OCaml",
        "Meson",
        "TeX",
        "Dockerfile",
        "Makefile",
    ];

    SOURCE_LIKE_LANGUAGES.iter().any(|known| *known == language)
}
681
/// Returns the path's extension as `&str`, or `None` when there is no
/// extension or it is not valid UTF-8.
fn extension(path: &Path) -> Option<&str> {
    let raw = path.extension()?;
    raw.to_str()
}
685
686fn lower_extension(path: &Path) -> Option<String> {
687 extension(path).map(|ext| ext.to_ascii_lowercase())
688}
689
/// Returns the final path component lowercased (ASCII only), or an empty
/// string when the path has no final component or it is not valid UTF-8.
fn lower_file_name(path: &Path) -> String {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name.to_ascii_lowercase(),
        None => String::new(),
    }
}
696
697fn is_plain_text(path: &Path) -> bool {
698 lower_extension(path)
699 .as_deref()
700 .is_some_and(|ext| PLAIN_TEXT_EXTENSIONS.contains(&ext))
701}
702
703fn is_makefile(path: &Path) -> bool {
704 matches!(lower_file_name(path).as_str(), "makefile" | "makefile.inc")
705}
706
/// Returns `true` when the path ends in `.js.map` or `.css.map`
/// (case-insensitive).
fn is_source_map(path: &Path) -> bool {
    let lowered = path.to_string_lossy().to_ascii_lowercase();
    [".js.map", ".css.map"]
        .iter()
        .any(|suffix| lowered.ends_with(*suffix))
}
711
712fn is_c_like_source(path: &Path) -> bool {
713 lower_extension(path).as_deref().is_some_and(|ext| {
714 matches!(
715 ext,
716 "c" | "cc"
717 | "cp"
718 | "cpp"
719 | "cxx"
720 | "c++"
721 | "h"
722 | "hh"
723 | "hpp"
724 | "hxx"
725 | "h++"
726 | "i"
727 | "ii"
728 | "m"
729 | "s"
730 | "asm"
731 )
732 })
733}
734
735fn is_java_like_source(path: &Path) -> bool {
736 lower_extension(path)
737 .as_deref()
738 .is_some_and(|ext| matches!(ext, "java" | "aj" | "jad" | "ajt"))
739}
740
741fn format_based_file_type(detected_format: FileFormat) -> Option<String> {
742 match detected_format {
743 FileFormat::ArbitraryBinaryData | FileFormat::Empty | FileFormat::PlainText => None,
744 format if format.short_name() == Some("PDF") => Some("PDF document".to_string()),
745 format => Some(match format.kind() {
746 FileFormatKind::Image => short_name_or_name(&format, "image data"),
747 FileFormatKind::Audio => short_name_or_name(&format, "audio data"),
748 FileFormatKind::Video => short_name_or_name(&format, "video data"),
749 _ => format.name().to_string(),
750 }),
751 }
752}
753
754fn short_name_or_name(format: &FileFormat, suffix: &str) -> String {
755 format
756 .short_name()
757 .map(|short_name| format!("{short_name} {suffix}"))
758 .unwrap_or_else(|| format!("{} {suffix}", format.name()))
759}
760
761fn detect_zip_like_mime(path: &Path) -> String {
762 match extension(path).map(|ext| ext.to_ascii_lowercase()) {
763 Some(ext) if ext == "apk" => "application/vnd.android.package-archive".to_string(),
764 Some(ext) if matches!(ext.as_str(), "jar" | "war" | "ear") => {
765 "application/java-archive".to_string()
766 }
767 _ => "application/zip".to_string(),
768 }
769}
770
/// Identifies PNG, JPEG, TIFF, and WebP content from magic numbers and returns
/// the matching MIME type.
fn media_mime_from_content(bytes: &[u8]) -> Option<&'static str> {
    if bytes.starts_with(b"\x89PNG\r\n\x1a\n") {
        return Some("image/png");
    }
    if bytes.starts_with(&[0xff, 0xd8, 0xff]) {
        return Some("image/jpeg");
    }
    if bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a") {
        return Some("image/tiff");
    }
    // WebP is a RIFF container whose form type sits at offset 8.
    if bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]) {
        return Some("image/webp");
    }
    None
}
784
/// Identifies PNG, JPEG, TIFF, and WebP content from magic numbers and returns
/// the matching human-readable label.
fn media_file_type_from_content(bytes: &[u8]) -> Option<&'static str> {
    if bytes.starts_with(b"\x89PNG\r\n\x1a\n") {
        return Some("PNG image data");
    }
    if bytes.starts_with(&[0xff, 0xd8, 0xff]) {
        return Some("JPEG image data");
    }
    if bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a") {
        return Some("TIFF image data");
    }
    // WebP is a RIFF container whose form type sits at offset 8.
    if bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]) {
        return Some("WebP image data");
    }
    None
}
798
/// Returns `true` when the content begins with the `%PDF-` header.
fn looks_like_pdf(bytes: &[u8]) -> bool {
    matches!(bytes, [b'%', b'P', b'D', b'F', b'-', ..])
}
802
/// Returns `true` when the (already lowercased) extension is `rtf` or the
/// content starts with the `{\rtf` header.
fn looks_like_rtf(bytes: &[u8], ext: Option<&str>) -> bool {
    if matches!(ext, Some("rtf")) {
        return true;
    }
    bytes.starts_with(b"{\\rtf")
}
806
/// Extracts approximate plain text from RTF markup.
///
/// This is a lightweight scanner, not a full RTF parser:
/// * group braces are dropped;
/// * `\\`, `\{` and `\}` yield the escaped character;
/// * `\'hh` yields the byte `hh` as the code point of the same value;
/// * `\par` / `\line` become newlines, `\tab` a tab, and a few typographic
///   control words become their characters;
/// * `\uN` yields the Unicode character `N` (negative values are offset by
///   65536) and skips the single fallback that follows it;
/// * every other control word is dropped, together with its numeric parameter
///   and one delimiting space.
///
/// The fallback after `\uN` may be a plain character or a `\'hh` escape. Both
/// forms are skipped, so input such as `\u233\'e9` yields one character, not
/// two.
///
/// NOTE(review): `\ucN` is not tracked; a fallback length of one is assumed.
///
/// Finally, carriage returns and form feeds become newlines and trailing
/// whitespace is trimmed from every line.
fn extract_rtf_text(bytes: &[u8]) -> String {
    let text = String::from_utf8_lossy(bytes);
    let chars: Vec<char> = text.chars().collect();
    let mut output = String::new();
    let mut index = 0usize;

    while index < chars.len() {
        match chars[index] {
            '{' | '}' => {
                index += 1;
            }
            '\\' => {
                index += 1;
                let Some(&escaped) = chars.get(index) else {
                    break;
                };

                match escaped {
                    '\\' | '{' | '}' => {
                        output.push(escaped);
                        index += 1;
                    }
                    '\'' => {
                        // `\'hh`: two hex digits encode one byte.
                        let byte = chars.get(index + 1..index + 3).and_then(|pair| {
                            let hex: String = pair.iter().collect();
                            u8::from_str_radix(&hex, 16).ok()
                        });
                        match byte {
                            Some(value) => {
                                output.push(char::from(value));
                                index += 3;
                            }
                            None => index += 1,
                        }
                    }
                    control if control.is_ascii_alphabetic() => {
                        let word_start = index;
                        while index < chars.len() && chars[index].is_ascii_alphabetic() {
                            index += 1;
                        }
                        let control_word: String = chars[word_start..index].iter().collect();

                        // Optional signed decimal parameter.
                        let number_start = index;
                        if index < chars.len()
                            && (chars[index] == '-' || chars[index].is_ascii_digit())
                        {
                            index += 1;
                            while index < chars.len() && chars[index].is_ascii_digit() {
                                index += 1;
                            }
                        }
                        let parameter: String = chars[number_start..index].iter().collect();

                        // A single space after a control word is a delimiter, not text.
                        if index < chars.len() && chars[index] == ' ' {
                            index += 1;
                        }

                        match control_word.as_str() {
                            "par" | "line" => output.push('\n'),
                            "tab" => output.push('\t'),
                            "emdash" => output.push('—'),
                            "endash" => output.push('–'),
                            "bullet" => output.push('•'),
                            "lquote" | "rquote" => output.push('\''),
                            "ldblquote" | "rdblquote" => output.push('"'),
                            "u" => {
                                // `\uN` stores code points as signed 16-bit values.
                                let decoded = parameter
                                    .parse::<i32>()
                                    .ok()
                                    .map(|cp| if cp < 0 { cp + 65_536 } else { cp })
                                    .filter(|cp| *cp >= 0)
                                    .and_then(|cp| char::from_u32(cp as u32));
                                if let Some(ch) = decoded {
                                    output.push(ch);
                                }

                                // Skip the ANSI fallback so it is not emitted again.
                                let hex_fallback = matches!(
                                    chars.get(index..index + 4),
                                    Some(['\\', '\'', hi, lo])
                                        if hi.is_ascii_hexdigit() && lo.is_ascii_hexdigit()
                                );
                                if hex_fallback {
                                    index += 4;
                                } else if index < chars.len()
                                    && !matches!(chars[index], '\\' | '{' | '}' | '\n' | '\r')
                                {
                                    index += 1;
                                }
                            }
                            _ => {}
                        }
                    }
                    _ => {
                        index += 1;
                    }
                }
            }
            ch => {
                output.push(ch);
                index += 1;
            }
        }
    }

    output
        .replace(['\r', '\u{0c}'], "\n")
        .lines()
        .map(str::trim_end)
        .collect::<Vec<_>>()
        .join("\n")
}
913
/// Returns `true` when the content starts with the gzip magic bytes `1f 8b`.
fn looks_like_gzip(bytes: &[u8]) -> bool {
    matches!(bytes, [0x1f, 0x8b, ..])
}
917
/// Returns `true` when the content starts with the bzip2 header `BZh`.
fn looks_like_bzip2(bytes: &[u8]) -> bool {
    matches!(bytes, [b'B', b'Z', b'h', ..])
}
921
/// Returns `true` when the content starts with the six-byte XZ stream header.
fn looks_like_xz(bytes: &[u8]) -> bool {
    const XZ_MAGIC: [u8; 6] = [0xfd, b'7', b'z', b'X', b'Z', 0x00];
    bytes.get(..XZ_MAGIC.len()) == Some(&XZ_MAGIC[..])
}
925
926fn looks_like_deb(bytes: &[u8], path: &Path) -> bool {
927 lower_extension(path).as_deref() == Some("deb") && bytes.starts_with(b"!<arch>\n")
928}
929
930fn looks_like_rpm(bytes: &[u8], path: &Path) -> bool {
931 lower_extension(path).as_deref() == Some("rpm") && bytes.starts_with(&[0xed, 0xab, 0xee, 0xdb])
932}
933
934fn looks_like_squashfs(bytes: &[u8], path: &Path) -> bool {
935 lower_extension(path)
936 .as_deref()
937 .is_some_and(|ext| matches!(ext, "sqs" | "squashfs"))
938 && (bytes.starts_with(&[0x68, 0x73, 0x71, 0x73])
939 || bytes.starts_with(&[0x73, 0x71, 0x73, 0x68]))
940}
941
942fn archive_file_type(path: &Path, bytes: &[u8], detected_format: FileFormat) -> String {
943 if looks_like_deb(bytes, path) {
944 "debian binary package (format 2.0)".to_string()
945 } else if looks_like_rpm(bytes, path) {
946 "RPM package".to_string()
947 } else if looks_like_squashfs(bytes, path) {
948 "Squashfs filesystem".to_string()
949 } else if looks_like_gzip(bytes) {
950 "gzip compressed data".to_string()
951 } else if looks_like_bzip2(bytes) {
952 "bzip2 compressed data".to_string()
953 } else if looks_like_xz(bytes) {
954 "XZ compressed data".to_string()
955 } else if is_zip_archive(bytes) {
956 "Zip archive data".to_string()
957 } else if lower_extension(path).as_deref() == Some("gem") {
958 "POSIX tar archive".to_string()
959 } else if let Some(file_type) = format_based_file_type(detected_format) {
960 file_type
961 } else {
962 "archive data".to_string()
963 }
964}
965
966fn script_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
967 let suffix = text_executable_label(bytes);
968
969 match programming_language {
970 Some("Python") => format!("python script, {suffix}"),
971 Some("Ruby") => format!("ruby script, {suffix}"),
972 Some("Perl") => format!("perl script, {suffix}"),
973 Some("PHP") => format!("php script, {suffix}"),
974 Some("Shell") => format!("shell script, {suffix}"),
975 Some("JavaScript") => format!("javascript script, {suffix}"),
976 Some("TypeScript") => format!("typescript script, {suffix}"),
977 Some("PowerShell") => format!("powershell script, {suffix}"),
978 Some("Awk") => format!("awk script, {suffix}"),
979 _ => format!("script, {suffix}"),
980 }
981}
982
983fn text_file_type(bytes: &[u8]) -> String {
984 text_label(bytes).to_string()
985}
986
/// Chooses the text label from two properties of the content: whether it is
/// valid UTF-8 and whether it contains a line feed.
fn text_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.contains(&b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text",
        (true, false) => "UTF-8 Unicode text, with no line terminators",
        (false, true) => "text",
        (false, false) => "text, with no line terminators",
    }
}
1000
/// Chooses the "text executable" label from two properties of the content:
/// whether it is valid UTF-8 and whether it contains a line feed.
fn text_executable_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.contains(&b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text executable",
        (true, false) => "UTF-8 Unicode text executable, with no line terminators",
        (false, true) => "text executable",
        (false, false) => "text executable, with no line terminators",
    }
}
1014
1015fn supported_image_metadata_format(
1016 ext: Option<&str>,
1017 detected_format: FileFormat,
1018) -> Option<ImageFormat> {
1019 match ext {
1020 Some("jpg" | "jpeg") => Some(ImageFormat::Jpeg),
1021 Some("png") => Some(ImageFormat::Png),
1022 Some("tif" | "tiff") => Some(ImageFormat::Tiff),
1023 Some("webp") => Some(ImageFormat::WebP),
1024 _ => match detected_format.media_type() {
1025 "image/jpeg" => Some(ImageFormat::Jpeg),
1026 "image/png" => Some(ImageFormat::Png),
1027 "image/tiff" => Some(ImageFormat::Tiff),
1028 "image/webp" => Some(ImageFormat::WebP),
1029 _ => None,
1030 },
1031 }
1032}
1033
1034fn should_skip_binary_string_extraction(
1035 path: &Path,
1036 bytes: &[u8],
1037 detected_format: FileFormat,
1038) -> bool {
1039 matches!(lower_extension(path).as_deref(), Some("pdf"))
1040 || supported_image_metadata_format(lower_extension(path).as_deref(), detected_format)
1041 .is_some()
1042 || (matches!(
1043 detected_format.kind(),
1044 FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
1045 ) && !is_textual_format(detected_format))
1046 || media_mime_from_content(bytes).is_some()
1047 || is_zip_archive(bytes)
1048 || looks_like_gzip(bytes)
1049 || looks_like_bzip2(bytes)
1050 || looks_like_xz(bytes)
1051 || looks_like_deb(bytes, path)
1052 || looks_like_rpm(bytes, path)
1053 || looks_like_squashfs(bytes, path)
1054}
1055
1056fn is_supported_image_container(bytes: &[u8], format: ImageFormat) -> bool {
1057 match format {
1058 ImageFormat::Png => bytes.starts_with(b"\x89PNG\r\n\x1a\n"),
1059 ImageFormat::Jpeg => bytes.starts_with(&[0xff, 0xd8, 0xff]),
1060 ImageFormat::Tiff => bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a"),
1061 ImageFormat::WebP => {
1062 bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP"
1063 }
1064 _ => false,
1065 }
1066}
1067
1068fn extract_image_metadata_text(bytes: &[u8], format: ImageFormat) -> String {
1069 let mut values = Vec::new();
1070 values.extend(extract_exif_metadata_values(bytes));
1071 values.extend(extract_xmp_metadata_values(bytes, format));
1072 values_to_text(values)
1073}
1074
1075fn extract_exif_metadata_values(bytes: &[u8]) -> Vec<String> {
1076 let mut cursor = BufReader::new(Cursor::new(bytes));
1077 let exif = match exif::Reader::new().read_from_container(&mut cursor) {
1078 Ok(exif) => exif,
1079 Err(_) => return Vec::new(),
1080 };
1081
1082 let mut values = Vec::new();
1083 for field in exif.fields() {
1084 let rendered = match field.tag {
1085 exif::Tag::ImageDescription | exif::Tag::Copyright | exif::Tag::UserComment => {
1086 Some(field.display_value().with_unit(&exif).to_string())
1087 }
1088 exif::Tag::Artist => Some(format!(
1089 "Author: {}",
1090 field.display_value().with_unit(&exif)
1091 )),
1092 _ => None,
1093 };
1094
1095 if let Some(rendered) = rendered {
1096 values.push(rendered);
1097 }
1098 }
1099
1100 values
1101}
1102
1103fn extract_xmp_metadata_values(bytes: &[u8], format: ImageFormat) -> Vec<String> {
1104 let xmp = match extract_raw_xmp_packet(bytes, format) {
1105 Some(xmp) => xmp,
1106 None => return Vec::new(),
1107 };
1108
1109 parse_xmp_values(&xmp)
1110}
1111
1112fn extract_raw_xmp_packet(bytes: &[u8], format: ImageFormat) -> Option<Vec<u8>> {
1113 let reader = ImageReader::with_format(BufReader::new(Cursor::new(bytes)), format);
1114 if let Ok(mut decoder) = reader.into_decoder()
1115 && let Ok(Some(xmp)) = decoder.xmp_metadata()
1116 {
1117 return Some(xmp);
1118 }
1119
1120 match format {
1121 ImageFormat::Png => extract_png_xmp_packet(bytes),
1122 _ => None,
1123 }
1124}
1125
1126fn extract_png_xmp_packet(bytes: &[u8]) -> Option<Vec<u8>> {
1127 const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";
1128
1129 if bytes.len() < PNG_SIGNATURE.len() || &bytes[..PNG_SIGNATURE.len()] != PNG_SIGNATURE {
1130 return None;
1131 }
1132
1133 let mut offset = PNG_SIGNATURE.len();
1134 while offset + 12 <= bytes.len() {
1135 let length = u32::from_be_bytes([
1136 bytes[offset],
1137 bytes[offset + 1],
1138 bytes[offset + 2],
1139 bytes[offset + 3],
1140 ]) as usize;
1141 let chunk_start = offset + 8;
1142 let chunk_end = chunk_start + length;
1143 if chunk_end + 4 > bytes.len() {
1144 return None;
1145 }
1146
1147 let chunk_type = &bytes[offset + 4..offset + 8];
1148 if chunk_type == b"iTXt" {
1149 let data = &bytes[chunk_start..chunk_end];
1150 if let Some(xmp) = parse_png_itxt_xmp(data) {
1151 return Some(xmp);
1152 }
1153 }
1154
1155 offset = chunk_end + 4;
1156 }
1157
1158 None
1159}
1160
1161fn parse_png_itxt_xmp(data: &[u8]) -> Option<Vec<u8>> {
1162 const XMP_KEYWORD: &[u8] = b"XML:com.adobe.xmp";
1163
1164 let keyword_end = data.iter().position(|&b| b == 0)?;
1165 if &data[..keyword_end] != XMP_KEYWORD {
1166 return None;
1167 }
1168
1169 let mut cursor = keyword_end + 1;
1170 let compression_flag = *data.get(cursor)?;
1171 cursor += 1;
1172 let compression_method = *data.get(cursor)?;
1173 cursor += 1;
1174 if compression_flag > 1 || (compression_flag == 1 && compression_method != 0) {
1175 return None;
1176 }
1177
1178 let language_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1179 cursor = language_end + 1;
1180
1181 let translated_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1182 cursor = translated_end + 1;
1183
1184 let text_bytes = &data[cursor..];
1185 if compression_flag == 1 {
1186 let mut decoder = ZlibDecoder::new(text_bytes);
1187 let mut decoded = Vec::new();
1188 decoder.read_to_end(&mut decoded).ok()?;
1189 Some(decoded)
1190 } else {
1191 Some(text_bytes.to_vec())
1192 }
1193}
1194
1195fn parse_xmp_values(xmp: &[u8]) -> Vec<String> {
1196 let mut reader = XmlReader::from_reader(xmp);
1197 reader.config_mut().trim_text(true);
1198
1199 let mut buf = Vec::new();
1200 let mut stack: Vec<String> = Vec::new();
1201 let mut values = Vec::new();
1202
1203 loop {
1204 match reader.read_event_into(&mut buf) {
1205 Ok(Event::Start(e)) => {
1206 stack.push(local_xml_name(e.name().as_ref()));
1207 }
1208 Ok(Event::End(_)) => {
1209 stack.pop();
1210 }
1211 Ok(Event::Empty(_)) => {}
1212 Ok(Event::Text(text)) => {
1213 if let Some(field) = stack
1214 .iter()
1215 .rev()
1216 .find_map(|name| allowed_xmp_field(name.as_str()))
1217 && let Ok(decoded) = text.decode()
1218 {
1219 let decoded = decoded.into_owned();
1220 if !decoded.trim().is_empty() {
1221 values.push(format_xmp_value(field, &decoded));
1222 }
1223 }
1224 }
1225 Ok(Event::CData(text)) => {
1226 if let Some(field) = stack
1227 .iter()
1228 .rev()
1229 .find_map(|name| allowed_xmp_field(name.as_str()))
1230 && let Ok(decoded) = text.decode()
1231 {
1232 let decoded = decoded.into_owned();
1233 if !decoded.trim().is_empty() {
1234 values.push(format_xmp_value(field, &decoded));
1235 }
1236 }
1237 }
1238 Ok(Event::Eof) | Err(_) => break,
1239 _ => {}
1240 }
1241 buf.clear();
1242 }
1243
1244 values
1245}
1246
/// Returns the part of a qualified XML name after its last `:`.
///
/// Names without a colon are returned unchanged; bytes that are not valid
/// UTF-8 yield an empty string.
fn local_xml_name(name: &[u8]) -> String {
    let qualified = match std::str::from_utf8(name) {
        Ok(text) => text,
        Err(_) => "",
    };
    let local = match qualified.rfind(':') {
        Some(colon) => &qualified[colon + 1..],
        None => qualified,
    };
    local.to_owned()
}
1251
/// Maps an XMP element's local name to the internal field label it is kept under.
///
/// The comparison is case-sensitive. Returns `None` for every name that is
/// not in the table.
fn allowed_xmp_field(name: &str) -> Option<&'static str> {
    // (local element name, field label)
    const FIELD_LABELS: [(&str, &str); 7] = [
        ("creator", "creator"),
        ("rights", "rights"),
        ("description", "description"),
        ("title", "title"),
        ("subject", "subject"),
        ("UsageTerms", "usage_terms"),
        ("WebStatement", "web_statement"),
    ];

    FIELD_LABELS
        .iter()
        .find(|(element, _)| *element == name)
        .map(|&(_, label)| label)
}
1264
/// Renders one XMP value for output.
///
/// Values of the `creator` field get an `Author: ` prefix; every other field
/// is returned as-is.
fn format_xmp_value(field: &str, value: &str) -> String {
    if field == "creator" {
        format!("Author: {value}")
    } else {
        value.to_string()
    }
}
1271
1272fn values_to_text(values: Vec<String>) -> String {
1273 let mut seen = BTreeSet::new();
1274 let mut lines = Vec::new();
1275 let mut total_bytes = 0usize;
1276
1277 for value in values {
1278 if lines.len() >= MAX_IMAGE_METADATA_VALUES {
1279 break;
1280 }
1281
1282 let normalized = normalize_metadata_value(&value);
1283 if normalized.is_empty() || !seen.insert(normalized.clone()) {
1284 continue;
1285 }
1286
1287 let added_bytes = normalized.len() + usize::from(!lines.is_empty());
1288 if total_bytes + added_bytes > MAX_IMAGE_METADATA_TEXT_BYTES {
1289 break;
1290 }
1291
1292 total_bytes += added_bytes;
1293 lines.push(normalized);
1294 }
1295
1296 lines.join("\n")
1297}
1298
/// Cleans one metadata value: NUL characters are removed first, then every
/// run of whitespace is collapsed to a single space and leading/trailing
/// whitespace is dropped.
fn normalize_metadata_value(value: &str) -> String {
    let without_nul = value.replace('\0', "");

    let mut normalized = String::with_capacity(without_nul.len());
    for word in without_nul.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
1310
1311fn extract_pdf_text(path: &Path, bytes: &[u8]) -> String {
1312 if bytes.len() < 5 || &bytes[..5] != b"%PDF-" {
1313 return String::new();
1314 }
1315
1316 let extracted = catch_unwind(AssertUnwindSafe(
1317 || -> Result<String, Box<dyn std::error::Error>> {
1318 let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1319 extract_first_pdf_page_text(&mut document)
1320 },
1321 ));
1322 if let Ok(Ok(text)) = extracted
1323 && let Some(normalized) = normalize_pdf_text(text)
1324 {
1325 return normalized;
1326 }
1327
1328 let extracted = catch_unwind(AssertUnwindSafe(
1329 || -> Result<String, Box<dyn std::error::Error>> {
1330 let mut document = pdf_oxide::document::PdfDocument::open(path)?;
1331 extract_pdf_text_from_document(&mut document)
1332 },
1333 ));
1334 if let Ok(Ok(text)) = extracted
1335 && let Some(normalized) = normalize_pdf_text(text)
1336 {
1337 return normalized;
1338 }
1339
1340 let extracted = catch_unwind(AssertUnwindSafe(
1341 || -> Result<String, Box<dyn std::error::Error>> {
1342 let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1343 extract_pdf_text_from_document(&mut document)
1344 },
1345 ));
1346 if let Ok(Ok(text)) = extracted
1347 && let Some(normalized) = normalize_pdf_text(text)
1348 {
1349 return normalized;
1350 }
1351
1352 String::new()
1353}
1354
1355fn extract_first_pdf_page_text(
1356 document: &mut pdf_oxide::document::PdfDocument,
1357) -> Result<String, Box<dyn std::error::Error>> {
1358 if document.page_count()? == 0 {
1359 return Ok(String::new());
1360 }
1361
1362 let extracted_text = document.extract_text(0)?;
1363 let markdown_text =
1364 document.to_markdown(0, &pdf_oxide::converters::ConversionOptions::default())?;
1365 if pdf_markdown_heading_lines(&markdown_text).is_empty() {
1366 return Ok(extracted_text);
1367 }
1368
1369 let pipeline_text =
1370 document.to_plain_text(0, &pdf_oxide::converters::ConversionOptions::default())?;
1371
1372 Ok(merge_pdf_first_page_text(
1373 &extracted_text,
1374 &markdown_text,
1375 &pipeline_text,
1376 ))
1377}
1378
1379fn extract_pdf_text_from_document(
1380 document: &mut pdf_oxide::document::PdfDocument,
1381) -> Result<String, Box<dyn std::error::Error>> {
1382 Ok(document.to_plain_text_all(&pdf_oxide::converters::ConversionOptions::default())?)
1383}
1384
/// Turns carriage returns and form feeds into newlines.
///
/// Returns `None` when the resulting text is empty or whitespace-only.
fn normalize_pdf_text(text: String) -> Option<String> {
    let normalized: String = text
        .chars()
        .map(|ch| match ch {
            '\r' | '\u{0c}' => '\n',
            other => other,
        })
        .collect();

    if normalized.trim().is_empty() {
        None
    } else {
        Some(normalized)
    }
}
1389
1390fn merge_pdf_first_page_text(
1391 _extracted_text: &str,
1392 markdown_text: &str,
1393 pipeline_text: &str,
1394) -> String {
1395 let pipeline = pipeline_text.trim();
1396 if pipeline.is_empty() {
1397 return String::new();
1398 }
1399
1400 let prefix = pdf_first_page_heading_prefix(markdown_text);
1401 let Some(prefix) = prefix else {
1402 return pipeline_text.to_string();
1403 };
1404
1405 if pdf_text_contains_heading_prefix(pipeline, &prefix) {
1406 pipeline_text.to_string()
1407 } else {
1408 format!("{prefix}\n\n{pipeline}")
1409 }
1410}
1411
1412fn pdf_text_contains_heading_prefix(text: &str, prefix: &str) -> bool {
1413 normalize_pdf_heading_comparison_text(text)
1414 .contains(&normalize_pdf_heading_comparison_text(prefix))
1415}
1416
/// Canonical form used when comparing headings: whitespace-separated words,
/// ASCII-lowercased, joined by single spaces. Non-ASCII characters are kept
/// as they are.
fn normalize_pdf_heading_comparison_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.extend(word.chars().map(|ch| ch.to_ascii_lowercase()));
    }
    normalized
}
1423
1424fn pdf_first_page_heading_prefix(markdown_text: &str) -> Option<String> {
1425 let mut lines = Vec::new();
1426
1427 for line in pdf_markdown_heading_lines(markdown_text) {
1428 push_unique_line(&mut lines, line);
1429 }
1430
1431 (!lines.is_empty()).then(|| lines.join("\n"))
1432}
1433
1434fn pdf_markdown_heading_lines(text: &str) -> Vec<String> {
1435 text.lines()
1436 .map(str::trim)
1437 .filter_map(|line| line.strip_prefix('#').map(str::trim_start))
1438 .map(|line| line.trim_matches('#').trim())
1439 .filter(|line| !line.is_empty())
1440 .filter(|line| !looks_like_numbered_section_heading(line))
1441 .take(4)
1442 .map(ToOwned::to_owned)
1443 .collect()
1444}
1445
/// Appends `line` to `lines` unless an equal entry is already present.
fn push_unique_line(lines: &mut Vec<String>, line: String) {
    if lines.contains(&line) {
        return;
    }
    lines.push(line);
}
1451
/// Reports whether `line` starts like a numbered section heading, i.e. one or
/// more ASCII digits immediately followed by a `.` (`"1. Definitions"`,
/// `"2.3 Scope"`, `"10. Termination"`).
///
/// Lines that start with digits but have no dot right after them
/// (`"2024 Annual Report"`), and lines that do not start with a digit at all,
/// are not considered numbered headings.
fn looks_like_numbered_section_heading(line: &str) -> bool {
    let after_digits = line.trim_start_matches(|ch: char| ch.is_ascii_digit());

    // At least one digit must have been consumed; multi-digit section numbers
    // count just like single-digit ones.
    let has_leading_digits = after_digits.len() < line.len();
    has_leading_digits && after_digits.starts_with('.')
}
1464
/// Reports whether `bytes` begins with one of the three `PK` signatures this
/// file treats as ZIP data (`PK\x03\x04`, `PK\x05\x06`, `PK\x07\x08`).
fn is_zip_archive(bytes: &[u8]) -> bool {
    const ZIP_SIGNATURES: [&[u8]; 3] = [b"PK\x03\x04", b"PK\x05\x06", b"PK\x07\x08"];

    ZIP_SIGNATURES
        .iter()
        .any(|&signature| bytes.starts_with(signature))
}
1470
/// Extracts human-readable strings from arbitrary binary data.
///
/// Two kinds of strings are collected, each needing at least four characters,
/// and the results are joined with `\n`:
///
/// 1. runs of printable ASCII bytes (`0x20..=0x7E`);
/// 2. runs of "wide" characters — a printable ASCII byte followed by a zero
///    byte, as in UTF-16LE text — scanned once starting at offset 0 and once
///    starting at offset 1, so strings at either byte alignment are found.
///
/// Output is capped: the cap equals the input length, clamped to the range
/// 2,000,000..=16,000,000 bytes. The cap is checked after each completed run,
/// so the returned text may overshoot it by at most one run.
pub fn extract_printable_strings(bytes: &[u8]) -> String {
    const MIN_LEN: usize = 4;
    const MIN_OUTPUT_BYTES: usize = 2_000_000;
    const MAX_OUTPUT_BYTES_CAP: usize = 16_000_000;

    fn is_printable_ascii(b: u8) -> bool {
        matches!(b, 0x20..=0x7E)
    }

    let max_output_bytes = bytes.len().clamp(MIN_OUTPUT_BYTES, MAX_OUTPUT_BYTES_CAP);

    let mut out = String::new();
    let mut run: Vec<u8> = Vec::new();

    // Appends the pending run to `out` when it is long enough, then resets it.
    let flush_run = |out: &mut String, run: &mut Vec<u8>| {
        if run.len() >= MIN_LEN {
            if !out.is_empty() {
                out.push('\n');
            }
            // `run` only ever holds printable ASCII, so this never substitutes.
            out.push_str(&String::from_utf8_lossy(run));
        }
        run.clear();
    };

    // Pass 1: single-byte strings.
    for &b in bytes {
        if is_printable_ascii(b) {
            run.push(b);
        } else {
            flush_run(&mut out, &mut run);
            if out.len() >= max_output_bytes {
                return out;
            }
        }
    }
    flush_run(&mut out, &mut run);
    if out.len() >= max_output_bytes {
        return out;
    }

    // Pass 2: wide strings. The same (character, zero) pair layout is scanned
    // at both alignments. Swapping the byte roles for the odd start would look
    // at exactly the same byte positions as the even scan, re-emitting its
    // strings and never seeing characters stored at odd offsets.
    for start in 0..=1 {
        run.clear();
        let window = bytes.get(start..).unwrap_or_default();
        for pair in window.chunks_exact(2) {
            let (ch, zero) = (pair[0], pair[1]);
            if is_printable_ascii(ch) && zero == 0 {
                run.push(ch);
            } else {
                flush_run(&mut out, &mut run);
                if out.len() >= max_output_bytes {
                    return out;
                }
            }
        }
        flush_run(&mut out, &mut run);
        if out.len() >= max_output_bytes {
            return out;
        }
    }

    out
}
1535
// Unit tests for text extraction and file classification.
//
// Several tests read fixture files through relative `testdata/...` paths and
// fail with an `expect` message if a fixture cannot be read.
#[cfg(test)]
mod tests {
    use std::path::Path;

    use super::{
        ExtractedTextKind, classify_file_info, extract_printable_strings,
        extract_text_for_detection, normalize_mime_type, normalize_pdf_heading_comparison_text,
    };

    // A `.jar` fixture must yield no text and `ExtractedTextKind::None`.
    #[test]
    fn test_extract_text_for_detection_skips_jar_archives() {
        let path = Path::new(
            "testdata/license-golden/datadriven/lic1/do-not_detect-licenses-in-archive.jar",
        );
        let bytes = std::fs::read(path).expect("failed to read jar fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert!(text.is_empty());
        assert_eq!(kind, ExtractedTextKind::None);
    }

    // A PDF fixture is reported as `Pdf` and contains the expected license sentence.
    #[test]
    fn test_extract_text_for_detection_reads_pdf_fixture_text() {
        let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
        let bytes = std::fs::read(path).expect("failed to read pdf fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_eq!(kind, ExtractedTextKind::Pdf);
        assert!(text.contains("Redistribution and use in source and binary forms"));
    }

    // The extracted text holds the license title but not the
    // "DISCLAIMER OF WARRANTY" text.
    #[test]
    fn test_extract_text_for_detection_prefers_first_pdf_page_before_full_document() {
        let path =
            Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
        let bytes = std::fs::read(path).expect("failed to read pdf fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_eq!(kind, ExtractedTextKind::Pdf);
        assert!(text.contains("SUN INDUSTRY STANDARDS SOURCE LICENSE"));
        assert!(!text.contains("DISCLAIMER OF WARRANTY"));
    }

    // After heading normalization the license title must occur exactly once.
    #[test]
    fn test_extract_text_for_detection_does_not_duplicate_pdf_heading_prefix() {
        let path =
            Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
        let bytes = std::fs::read(path).expect("failed to read pdf fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_eq!(kind, ExtractedTextKind::Pdf);

        let normalized = normalize_pdf_heading_comparison_text(&text);
        let heading =
            normalize_pdf_heading_comparison_text("SUN INDUSTRY STANDARDS SOURCE LICENSE");
        assert_eq!(normalized.matches(&heading).count(), 1);
    }

    // PDF bytes passed under a non-PDF path (`renamed.bin`) are still extracted as `Pdf`.
    #[test]
    fn test_extract_text_for_detection_reads_pdf_fixture_without_pdf_extension() {
        let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
        let bytes = std::fs::read(path).expect("failed to read pdf fixture");

        let (text, kind) = extract_text_for_detection(Path::new("renamed.bin"), &bytes);

        assert_eq!(kind, ExtractedTextKind::Pdf);
        assert!(text.contains("Redistribution and use in source and binary forms"));
    }

    // ZIP-signature bytes named `.whl` / `.crate` yield no text.
    #[test]
    fn test_extract_text_for_detection_skips_zip_like_archives() {
        let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00\x08\x00artifact";

        let (whl_text, whl_kind) = extract_text_for_detection(Path::new("demo.whl"), zip_bytes);
        let (crate_text, crate_kind) =
            extract_text_for_detection(Path::new("demo.crate"), zip_bytes);

        assert!(whl_text.is_empty());
        assert_eq!(whl_kind, ExtractedTextKind::None);
        assert!(crate_text.is_empty());
        assert_eq!(crate_kind, ExtractedTextKind::None);
    }

    // A `.lib` fixture must produce some text kind and contain the copyright line.
    #[test]
    fn test_extract_text_for_detection_keeps_binary_strings_for_lib_fixtures() {
        let path =
            Path::new("testdata/copyright-golden/copyrights/copyright_php_lib-php_embed_lib.lib");
        let bytes = std::fs::read(path).expect("failed to read lib fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_ne!(kind, ExtractedTextKind::None);
        assert!(text.contains("Copyright nexB and others (c) 2012"));
    }

    // Input of 525,000 five-byte records (2,625,000 bytes) must not be cut
    // off at 2,000,000 bytes of output.
    #[test]
    fn test_extract_printable_strings_scales_cap_for_medium_binary_files() {
        let bytes = b"abcd\0".repeat(525_000);

        let text = extract_printable_strings(&bytes);

        assert!(
            text.len() > 2_000_000,
            "unexpected truncation at {}",
            text.len()
        );
        assert!(text.ends_with("abcd"));
    }

    // An SVG fixture is returned as `Decoded` text containing the license URL fragment.
    #[test]
    fn test_extract_text_for_detection_decodes_svg_fixture_text() {
        let path = Path::new(
            "testdata/license-golden/datadriven/external/fossology-tests/Public-domain/biohazard.svg",
        );
        let bytes = std::fs::read(path).expect("failed to read svg fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_eq!(kind, ExtractedTextKind::Decoded);
        assert!(text.contains("creativecommons.org/licenses/publicdomain"));
    }

    // An RTF fixture is returned as `Decoded` text containing the license wording.
    #[test]
    fn test_extract_text_for_detection_decodes_rtf_fixture_text() {
        let path = Path::new(
            "testdata/license-golden/datadriven/external/fossology-tests/LGPL/License.rtf",
        );
        let bytes = std::fs::read(path).expect("failed to read rtf fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_eq!(kind, ExtractedTextKind::Decoded);
        assert!(text.contains("GNU Lesser General Public"));
        assert!(text.contains("version"));
        assert!(text.contains("2.1 of the License"));
    }

    // Textual `.ts` content with a `video/mp2t` guess is normalized to `text/plain`.
    #[test]
    fn test_normalize_mime_type_prefers_text_for_textual_video_guess() {
        assert_eq!(
            normalize_mime_type(
                Path::new("main.ts"),
                b"export const answer = 42;\n",
                Some("TypeScript"),
                "video/mp2t",
            ),
            "text/plain"
        );
    }

    // Textual `.js` content with an octet-stream guess is normalized to `text/plain`.
    #[test]
    fn test_normalize_mime_type_prefers_text_for_octet_stream_source_guess() {
        assert_eq!(
            normalize_mime_type(
                Path::new("main.js"),
                b"console.log('hello');\n",
                Some("JavaScript"),
                "application/octet-stream",
            ),
            "text/plain"
        );
    }

    // Binary `.ts` content keeps its `video/mp2t` guess.
    #[test]
    fn test_normalize_mime_type_preserves_binary_video_guess() {
        assert_eq!(
            normalize_mime_type(
                Path::new("main.ts"),
                &[0, 159, 146, 150, 0, 1, 2, 3],
                Some("TypeScript"),
                "video/mp2t",
            ),
            "video/mp2t"
        );
    }

    // A four-byte binary payload keeps its octet-stream guess.
    #[test]
    fn test_normalize_mime_type_preserves_short_binary_octet_stream_guess() {
        assert_eq!(
            normalize_mime_type(
                Path::new("main.ts"),
                &[0, 159, 146, 150],
                Some("TypeScript"),
                "application/octet-stream",
            ),
            "application/octet-stream"
        );
    }

    // Empty input: `inode/x-empty`, type `empty`, text but not source.
    #[test]
    fn test_classify_file_info_marks_empty_files_as_text_not_source() {
        let classification = classify_file_info(Path::new("test.txt"), b"");

        assert_eq!(classification.mime_type, "inode/x-empty");
        assert_eq!(classification.file_type, "empty");
        assert!(!classification.is_binary);
        assert!(classification.is_text);
        assert!(!classification.is_source);
        assert_eq!(classification.programming_language, None);
    }

    // JSON is text with no programming language and is not source.
    #[test]
    fn test_classify_file_info_keeps_json_out_of_programming_language() {
        let classification = classify_file_info(Path::new("package.json"), br#"{"name":"demo"}"#);

        assert_eq!(classification.mime_type, "application/json");
        assert_eq!(classification.file_type, "JSON text data");
        assert!(classification.is_text);
        assert!(!classification.is_source);
        assert_eq!(classification.programming_language, None);
    }

    // A `Dockerfile` is source (language "Dockerfile") but not a script.
    #[test]
    fn test_classify_file_info_treats_dockerfile_as_source() {
        let classification = classify_file_info(Path::new("Dockerfile"), b"FROM scratch\n");

        assert_eq!(
            classification.programming_language.as_deref(),
            Some("Dockerfile")
        );
        assert!(classification.is_source);
        assert!(!classification.is_script);
        assert_eq!(classification.file_type, "UTF-8 Unicode text");
    }

    // A `Makefile` is plain text: no language, not source, not script.
    #[test]
    fn test_classify_file_info_treats_makefile_as_text_not_source() {
        let classification = classify_file_info(Path::new("Makefile"), b"all:\n\techo hi\n");

        assert_eq!(classification.programming_language, None);
        assert!(classification.is_text);
        assert!(!classification.is_source);
        assert!(!classification.is_script);
        assert_eq!(classification.file_type, "UTF-8 Unicode text");
    }

    // `.egg` and `.nupkg` files with ZIP signature bytes are classified as ZIP archives.
    #[test]
    fn test_classify_file_info_marks_supported_package_archives() {
        let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00";

        let egg = classify_file_info(Path::new("demo.egg"), zip_bytes);
        let nupkg = classify_file_info(Path::new("demo.nupkg"), zip_bytes);

        assert!(egg.is_archive);
        assert_eq!(egg.mime_type, "application/zip");
        assert_eq!(egg.file_type, "Zip archive data");
        assert!(nupkg.is_archive);
        assert_eq!(nupkg.mime_type, "application/zip");
        assert_eq!(nupkg.file_type, "Zip archive data");
    }

    // PNG signature bytes: binary media, neither text, archive, nor source.
    #[test]
    fn test_classify_file_info_marks_png_as_binary_media() {
        let png_bytes = b"\x89PNG\r\n\x1a\n\x00\x00\x00\x0dIHDR";

        let classification = classify_file_info(Path::new("logo.png"), png_bytes);

        assert_eq!(classification.mime_type, "image/png");
        assert_eq!(classification.file_type, "PNG image data");
        assert!(classification.is_binary);
        assert!(!classification.is_text);
        assert!(classification.is_media);
        assert!(!classification.is_archive);
        assert!(!classification.is_source);
    }

    // A `%PDF-` header: binary document, neither text, archive, nor media.
    #[test]
    fn test_classify_file_info_marks_pdf_as_binary_document() {
        let pdf_bytes = b"%PDF-1.7\n1 0 obj\n<< /Type /Catalog >>\n";

        let classification = classify_file_info(Path::new("report.pdf"), pdf_bytes);

        assert_eq!(classification.mime_type, "application/pdf");
        assert_eq!(classification.file_type, "PDF document");
        assert!(classification.is_binary);
        assert!(!classification.is_text);
        assert!(!classification.is_archive);
        assert!(!classification.is_media);
    }

    // Opaque bytes are binary with no language.
    #[test]
    fn test_classify_file_info_marks_binary_blobs_as_binary() {
        let classification =
            classify_file_info(Path::new("blob.bin"), &[0, 159, 146, 150, 0, 1, 2, 3, 4, 5]);

        assert!(classification.is_binary);
        assert!(!classification.is_text);
        assert!(!classification.is_source);
        assert_eq!(classification.programming_language, None);
    }

    // YAML is text data, not source.
    #[test]
    fn test_classify_file_info_treats_yaml_as_text_not_source() {
        let classification = classify_file_info(Path::new("config.yaml"), b"key: value\n");

        assert_eq!(classification.programming_language, None);
        assert!(classification.is_text);
        assert!(!classification.is_source);
        assert_eq!(classification.file_type, "YAML text data");
    }

    // `build.gradle` -> Groovy source, `flake.nix` -> Nix source,
    // `.gitmodules` -> Git configuration text (not source).
    #[test]
    fn test_classify_file_info_classifies_common_build_manifests() {
        let gradle = classify_file_info(Path::new("build.gradle"), b"plugins { id 'java' }\n");
        let flake = classify_file_info(Path::new("flake.nix"), b"{ inputs, ... }: {}\n");
        let gitmodules = classify_file_info(
            Path::new(".gitmodules"),
            b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
        );

        assert_eq!(gradle.programming_language.as_deref(), Some("Groovy"));
        assert!(gradle.is_source);
        assert_eq!(gradle.mime_type, "text/plain");

        assert_eq!(flake.programming_language.as_deref(), Some("Nix"));
        assert!(flake.is_source);
        assert_eq!(flake.mime_type, "text/plain");

        assert_eq!(gitmodules.programming_language, None);
        assert!(gitmodules.is_text);
        assert!(!gitmodules.is_source);
        assert_eq!(gitmodules.file_type, "Git configuration text");
    }

    // An extension-less file with a `node` shebang is a JavaScript script.
    #[test]
    fn test_classify_file_info_labels_javascript_shebang_scripts() {
        let classification = classify_file_info(
            Path::new("bin/run"),
            b"#!/usr/bin/env node\nconsole.log('hello');\n",
        );

        assert_eq!(
            classification.programming_language.as_deref(),
            Some("JavaScript")
        );
        assert!(classification.is_script);
        assert_eq!(
            classification.file_type,
            "javascript script, UTF-8 Unicode text executable"
        );
    }

    // A Python script containing a non-UTF-8 byte (`\xe9`) gets the label
    // without "UTF-8 Unicode".
    #[test]
    fn test_classify_file_info_uses_non_utf8_text_labels_for_latin1_scripts() {
        let classification = classify_file_info(
            Path::new("script.py"),
            b"# coding: latin-1\nprint(\"caf\xe9\")\n",
        );

        assert_eq!(
            classification.programming_language.as_deref(),
            Some("Python")
        );
        assert!(classification.is_script);
        assert_eq!(classification.file_type, "python script, text executable");
    }

    // A `.tga` path with textual content is flagged as both media and text.
    #[test]
    fn test_classify_file_info_treats_textual_tga_as_media() {
        let classification = classify_file_info(Path::new("texture.tga"), b"not really a tga\n");

        assert!(classification.is_media);
        assert!(classification.is_text);
        assert!(!classification.is_binary);
    }

    // High-bit bytes under a `.ts` path are binary, with no language.
    #[test]
    fn test_classify_file_info_keeps_binaryish_source_extension_out_of_text_path() {
        let classification =
            classify_file_info(Path::new("main.ts"), &[0x80, 0x81, 0x82, 0x83, 0x84, 0x85]);

        assert!(classification.is_binary);
        assert!(!classification.is_text);
        assert!(!classification.is_source);
        assert_eq!(classification.programming_language, None);
    }

    // A tiny GIF yields no text.
    #[test]
    fn test_extract_text_for_detection_skips_unsupported_image_formats() {
        let gif_bytes = b"GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff,\x00\x00\x00\x00\x01\x00\x01\x00\x00\x02\x02D\x01\x00;";

        let (text, kind) = extract_text_for_detection(Path::new("tiny.gif"), gif_bytes);

        assert!(text.is_empty());
        assert_eq!(kind, ExtractedTextKind::None);
    }

    // Table-driven check of language / is_source / is_script for several paths.
    #[test]
    fn test_classify_file_info_preserves_language_detection_precedence_matrix() {
        // Tuple layout: (path, bytes, expected language, expected is_source, expected is_script).
        let cases = [
            (
                Path::new("bin/run"),
                b"#!/usr/bin/env node\nconsole.log('hello');\n".as_slice(),
                Some("JavaScript"),
                true,
                true,
            ),
            (
                Path::new("Dockerfile"),
                b"FROM scratch\n".as_slice(),
                Some("Dockerfile"),
                true,
                false,
            ),
            (
                Path::new("package.json"),
                br#"{"name":"demo"}"#.as_slice(),
                None,
                false,
                false,
            ),
            (
                Path::new("config.yaml"),
                b"key: value\n".as_slice(),
                None,
                false,
                false,
            ),
            (
                Path::new("Makefile"),
                b"all:\n\techo hi\n".as_slice(),
                None,
                false,
                false,
            ),
        ];

        for (path, bytes, expected_language, expected_is_source, expected_is_script) in cases {
            let classification = classify_file_info(path, bytes);

            assert_eq!(
                classification.programming_language.as_deref(),
                expected_language,
                "unexpected language for {}",
                path.display()
            );
            assert_eq!(
                classification.is_source,
                expected_is_source,
                "unexpected is_source for {}",
                path.display()
            );
            assert_eq!(
                classification.is_script,
                expected_is_script,
                "unexpected is_script for {}",
                path.display()
            );
        }
    }
}