1use std::collections::HashSet;
50use std::fs;
51use std::io::{BufRead, BufReader, Read};
52use std::path::Path;
53
54pub mod extensions;
55pub mod interpreters;
56pub mod tags;
57
58use extensions::{EXTENSIONS, EXTENSIONS_NEED_BINARY_CHECK, NAMES};
59use interpreters::INTERPRETERS;
60use tags::*;
61
/// Result alias used by the functions in this module.
///
/// The error side is a boxed [`std::error::Error`] trait object, so both
/// [`IdentifyError`] values and raw [`std::io::Error`]s can be returned through `?`.
pub type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;
67
/// Errors produced while identifying a path.
#[derive(Debug)]
pub enum IdentifyError {
    /// The path could not be inspected; carries the path as displayed to the user.
    PathNotFound(String),
    /// An underlying I/O operation failed.
    IoError(std::io::Error),
}

impl std::fmt::Display for IdentifyError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            IdentifyError::PathNotFound(path) => write!(f, "{path} does not exist."),
            IdentifyError::IoError(err) => write!(f, "IO error: {err}"),
        }
    }
}

impl std::error::Error for IdentifyError {
    /// Exposes the wrapped [`std::io::Error`] so callers walking the error chain
    /// (or downcasting the source) can still reach the original failure.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            IdentifyError::PathNotFound(_) => None,
            IdentifyError::IoError(err) => Some(err),
        }
    }
}

impl From<std::io::Error> for IdentifyError {
    /// Wraps an I/O error in [`IdentifyError::IoError`].
    fn from(err: std::io::Error) -> Self {
        IdentifyError::IoError(err)
    }
}
93
94pub fn tags_from_path<P: AsRef<Path>>(path: P) -> Result<TagSet> {
134 let path = path.as_ref();
135 let path_str = path.to_string_lossy();
136
137 let metadata = match fs::symlink_metadata(path) {
138 Ok(meta) => meta,
139 Err(_) => return Err(Box::new(IdentifyError::PathNotFound(path_str.to_string()))),
140 };
141
142 let file_type = metadata.file_type();
143
144 if file_type.is_dir() {
145 return Ok([DIRECTORY].iter().cloned().collect());
146 }
147 if file_type.is_symlink() {
148 return Ok([SYMLINK].iter().cloned().collect());
149 }
150
151 #[cfg(unix)]
153 {
154 use std::os::unix::fs::FileTypeExt;
155 if file_type.is_socket() {
156 return Ok([SOCKET].iter().cloned().collect());
157 }
158 }
159
160 let mut tags = TagSet::new();
161 tags.insert(FILE);
162
163 let is_executable = {
165 #[cfg(unix)]
166 {
167 use std::os::unix::fs::PermissionsExt;
168 metadata.permissions().mode() & 0o111 != 0
169 }
170 #[cfg(not(unix))]
171 {
172 path.extension()
174 .and_then(|ext| ext.to_str())
175 .map(|ext| matches!(ext.to_lowercase().as_str(), "exe" | "bat" | "cmd"))
176 .unwrap_or(false)
177 }
178 };
179
180 if is_executable {
181 tags.insert(EXECUTABLE);
182 } else {
183 tags.insert(NON_EXECUTABLE);
184 }
185
186 if let Some(filename) = path.file_name().and_then(|n| n.to_str()) {
188 let filename_tags = tags_from_filename(filename);
189 if !filename_tags.is_empty() {
190 tags.extend(filename_tags);
191 } else if is_executable {
192 if let Ok(shebang_tags) = parse_shebang_from_file(path) {
194 tags.extend(shebang_tags);
195 }
196 }
197 }
198
199 if !tags.iter().any(|tag| ENCODING_TAGS.contains(tag)) {
201 if file_is_text(path)? {
202 tags.insert(TEXT);
203 } else {
204 tags.insert(BINARY);
205 }
206 }
207
208 Ok(tags)
209}
210
211pub fn tags_from_filename(filename: &str) -> TagSet {
242 let mut tags = TagSet::new();
243
244 for part in std::iter::once(filename).chain(filename.split('.')) {
246 if let Some(name_tags) = NAMES.get(part) {
247 tags.extend(name_tags.iter().cloned());
248 break;
249 }
250 }
251
252 if let Some(ext) = Path::new(filename).extension().and_then(|e| e.to_str()) {
254 let ext_lower = ext.to_lowercase();
255
256 if let Some(ext_tags) = EXTENSIONS.get(ext_lower.as_str()) {
257 tags.extend(ext_tags.iter().cloned());
258 } else if let Some(ext_tags) = EXTENSIONS_NEED_BINARY_CHECK.get(ext_lower.as_str()) {
259 tags.extend(ext_tags.iter().cloned());
260 }
261 }
262
263 tags
264}
265
266pub fn tags_from_interpreter(interpreter: &str) -> TagSet {
298 let interpreter_name = interpreter.split('/').next_back().unwrap_or(interpreter);
300
301 let mut current = interpreter_name;
303 while !current.is_empty() {
304 if let Some(tags) = INTERPRETERS.get(current) {
305 return tags.clone();
306 }
307
308 match current.rfind('.') {
310 Some(pos) => current = ¤t[..pos],
311 None => break,
312 }
313 }
314
315 TagSet::new()
316}
317
318pub fn file_is_text<P: AsRef<Path>>(path: P) -> Result<bool> {
352 let file = fs::File::open(path)?;
353 is_text(file)
354}
355
356pub fn is_text<R: Read>(mut reader: R) -> Result<bool> {
382 let mut buffer = [0; 1024];
383 let bytes_read = reader.read(&mut buffer)?;
384
385 let text_chars: HashSet<u8> = [
387 7, 8, 9, 10, 11, 12, 13, 27, ]
389 .iter()
390 .cloned()
391 .chain(0x20..0x7F) .chain(0x80..=0xFF) .collect();
394
395 let is_text = buffer[..bytes_read]
396 .iter()
397 .all(|&byte| text_chars.contains(&byte));
398 Ok(is_text)
399}
400
401pub fn parse_shebang_from_file<P: AsRef<Path>>(path: P) -> Result<TagSet> {
440 let path = path.as_ref();
441
442 let metadata = fs::metadata(path)?;
444 #[cfg(unix)]
445 {
446 use std::os::unix::fs::PermissionsExt;
447 if metadata.permissions().mode() & 0o111 == 0 {
448 return Ok(TagSet::new());
449 }
450 }
451
452 let file = fs::File::open(path)?;
453 parse_shebang(file)
454}
455
456pub fn parse_shebang<R: Read>(reader: R) -> Result<TagSet> {
486 let mut buf_reader = BufReader::new(reader);
487 let mut first_line = String::new();
488 buf_reader.read_line(&mut first_line)?;
489
490 if !first_line.starts_with("#!") {
491 return Ok(TagSet::new());
492 }
493
494 let shebang_line = first_line[2..].trim();
496
497 let parts: Vec<&str> = shebang_line.split_whitespace().collect();
499 if parts.is_empty() {
500 return Ok(TagSet::new());
501 }
502
503 let cmd = if parts.len() >= 2 && parts[0] == "/usr/bin/env" {
504 if parts[1] == "-S" && parts.len() > 2 {
505 &parts[2..]
506 } else {
507 &parts[1..]
508 }
509 } else {
510 &parts
511 };
512
513 if cmd.is_empty() {
514 return Ok(TagSet::new());
515 }
516
517 let interpreter = cmd[0].split('/').next_back().unwrap_or(cmd[0]);
519 Ok(tags_from_interpreter(interpreter))
520}
521
/// Unit tests for tag lookup, text detection, shebang parsing and path identification.
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::io::Cursor;
    // `PermissionsExt` exists only on Unix; gating it keeps the suite compiling elsewhere.
    #[cfg(unix)]
    use std::os::unix::fs::PermissionsExt;
    use tempfile::{NamedTempFile, tempdir};

    // --- Tag definitions ---

    #[test]
    fn test_all_basic_tags_exist() {
        assert!(TYPE_TAGS.contains("file"));
        assert!(TYPE_TAGS.contains("directory"));
        assert!(MODE_TAGS.contains("executable"));
        assert!(ENCODING_TAGS.contains("text"));
    }

    #[test]
    fn test_tag_groups_are_disjoint() {
        assert!(TYPE_TAGS.is_disjoint(&MODE_TAGS));
        assert!(TYPE_TAGS.is_disjoint(&ENCODING_TAGS));
        assert!(MODE_TAGS.is_disjoint(&ENCODING_TAGS));
    }

    // --- tags_from_filename ---

    #[test]
    fn test_tags_from_filename_basic() {
        let tags = tags_from_filename("file.py");
        assert!(tags.contains("text"));
        assert!(tags.contains("python"));
    }

    #[test]
    fn test_tags_from_filename_special_names() {
        let tags = tags_from_filename("Dockerfile");
        assert!(tags.contains("dockerfile"));
        assert!(tags.contains("text"));

        let tags = tags_from_filename("Makefile");
        assert!(tags.contains("makefile"));
        assert!(tags.contains("text"));

        let tags = tags_from_filename("Cargo.toml");
        assert!(tags.contains("toml"));
        assert!(tags.contains("cargo"));
    }

    #[test]
    fn test_tags_from_filename_case_insensitive_extension() {
        let tags = tags_from_filename("image.JPG");
        assert!(tags.contains("binary"));
        assert!(tags.contains("image"));
        assert!(tags.contains("jpeg"));
    }

    #[test]
    fn test_tags_from_filename_precedence() {
        let tags = tags_from_filename("setup.cfg");
        assert!(tags.contains("ini"));
    }

    #[test]
    fn test_tags_from_filename_complex_names() {
        let tags = tags_from_filename("Dockerfile.xenial");
        assert!(tags.contains("dockerfile"));

        let tags = tags_from_filename("README.md");
        assert!(tags.contains("markdown"));
        assert!(tags.contains("plain-text"));
    }

    #[test]
    fn test_tags_from_filename_unrecognized() {
        let tags = tags_from_filename("unknown.xyz");
        assert!(tags.is_empty());

        let tags = tags_from_filename("noextension");
        assert!(tags.is_empty());
    }

    // --- tags_from_interpreter ---

    #[test]
    fn test_tags_from_interpreter_basic() {
        let tags = tags_from_interpreter("python3");
        assert!(tags.contains("python"));
        assert!(tags.contains("python3"));
    }

    #[test]
    fn test_tags_from_interpreter_versioned() {
        let tags = tags_from_interpreter("python3.11.2");
        assert!(tags.contains("python"));
        assert!(tags.contains("python3"));

        let tags = tags_from_interpreter("php8.1");
        assert!(tags.contains("php"));
        assert!(tags.contains("php8"));
    }

    #[test]
    fn test_tags_from_interpreter_with_path() {
        let tags = tags_from_interpreter("/usr/bin/python3");
        assert!(tags.contains("python"));
        assert!(tags.contains("python3"));
    }

    #[test]
    fn test_tags_from_interpreter_unrecognized() {
        let tags = tags_from_interpreter("unknown-interpreter");
        assert!(tags.is_empty());

        let tags = tags_from_interpreter("");
        assert!(tags.is_empty());
    }

    // --- is_text ---

    #[test]
    fn test_is_text_basic() {
        assert!(is_text(Cursor::new(b"hello world")).unwrap());
        assert!(is_text(Cursor::new(b"")).unwrap());
        assert!(!is_text(Cursor::new(b"hello\x00world")).unwrap());
    }

    #[test]
    fn test_is_text_unicode() {
        assert!(is_text(Cursor::new("éóñəå ⊂(◉‿◉)つ(ノ≥∇≤)ノ".as_bytes())).unwrap());
        assert!(is_text(Cursor::new(r"¯\_(ツ)_/¯".as_bytes())).unwrap());
        assert!(is_text(Cursor::new("♪┏(・o・)┛♪┗ ( ・o・) ┓♪".as_bytes())).unwrap());
    }

    #[test]
    fn test_is_text_binary_data() {
        assert!(!is_text(Cursor::new(&[0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01])).unwrap());
        assert!(!is_text(Cursor::new(&[0x43, 0x92, 0xd9, 0x0f, 0xaf, 0x32, 0x2c])).unwrap());
    }

    // --- parse_shebang ---

    #[test]
    fn test_parse_shebang_basic() {
        let tags = parse_shebang(Cursor::new(b"#!/usr/bin/python")).unwrap();
        assert!(tags.contains("python"));

        let tags = parse_shebang(Cursor::new(b"#!/usr/bin/env python")).unwrap();
        assert!(tags.contains("python"));
    }

    #[test]
    fn test_parse_shebang_env_with_flags() {
        let tags = parse_shebang(Cursor::new(b"#!/usr/bin/env -S python -u")).unwrap();
        assert!(tags.contains("python"));
    }

    #[test]
    fn test_parse_shebang_spaces() {
        let tags = parse_shebang(Cursor::new(b"#! /usr/bin/python")).unwrap();
        assert!(tags.contains("python"));

        let tags = parse_shebang(Cursor::new(b"#!/usr/bin/foo python")).unwrap();
        assert!(tags.is_empty());
    }

    #[test]
    fn test_parse_shebang_no_shebang() {
        let tags = parse_shebang(Cursor::new(b"import sys")).unwrap();
        assert!(tags.is_empty());

        let tags = parse_shebang(Cursor::new(b"")).unwrap();
        assert!(tags.is_empty());
    }

    #[test]
    fn test_parse_shebang_invalid_utf8() {
        let result = parse_shebang(Cursor::new(&[0x23, 0x21, 0xf9, 0x93, 0x01, 0x42, 0xcd]));
        // Either outcome is accepted: an error, or success with no tags.
        if let Ok(tags) = result {
            assert!(tags.is_empty());
        }
    }

    // --- tags_from_path ---

    #[test]
    fn test_tags_from_path_file_not_found() {
        let result = tags_from_path("/nonexistent/path");
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("does not exist"));
    }

    #[test]
    fn test_tags_from_path_regular_file() {
        let file = NamedTempFile::new().unwrap();
        fs::write(&file, "print('hello')").unwrap();

        let tags = tags_from_path(file.path()).unwrap();
        assert!(tags.contains("file"));
        assert!(tags.contains("non-executable"));
        assert!(tags.contains("text"));
    }

    // Relies on Unix permission bits (`set_mode`), so it is compiled on Unix only.
    #[cfg(unix)]
    #[test]
    fn test_tags_from_path_executable_file() {
        let dir = tempdir().unwrap();
        let script_path = dir.path().join("script.py");
        fs::write(&script_path, "#!/usr/bin/env python3\nprint('hello')").unwrap();

        let mut perms = fs::metadata(&script_path).unwrap().permissions();
        perms.set_mode(0o755);
        fs::set_permissions(&script_path, perms).unwrap();

        let tags = tags_from_path(&script_path).unwrap();
        assert!(tags.contains("file"));
        assert!(tags.contains("executable"));
        assert!(tags.contains("python"));
        assert!(tags.contains("text"));
    }

    #[test]
    fn test_tags_from_path_directory() {
        let dir = tempdir().unwrap();
        let tags = tags_from_path(dir.path()).unwrap();
        assert_eq!(tags, HashSet::from(["directory"]));
    }

    #[test]
    fn test_tags_from_path_binary_file() {
        let dir = tempdir().unwrap();
        let binary_path = dir.path().join("binary");
        fs::write(&binary_path, &[0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01]).unwrap();

        let tags = tags_from_path(&binary_path).unwrap();
        assert!(tags.contains("file"));
        assert!(tags.contains("binary"));
        assert!(tags.contains("non-executable"));
    }

    // --- file_is_text ---

    #[test]
    fn test_file_is_text_simple() {
        let dir = tempdir().unwrap();
        let text_path = dir.path().join("text.txt");
        fs::write(&text_path, "Hello, world!").unwrap();
        assert!(file_is_text(&text_path).unwrap());
    }

    #[test]
    fn test_file_is_text_does_not_exist() {
        let result = file_is_text("/nonexistent/file");
        assert!(result.is_err());
    }

    // --- Extensions whose encoding depends on the file contents ---

    #[test]
    fn test_plist_binary_detection() {
        let dir = tempdir().unwrap();
        let plist_path = dir.path().join("test.plist");

        let binary_plist = [
            0x62, 0x70, 0x6c, 0x69, 0x73, 0x74, 0x30, 0x30, 0xd1, 0x01, 0x02, 0x5f, 0x10, 0x0f,
        ];
        fs::write(&plist_path, &binary_plist).unwrap();

        let tags = tags_from_path(&plist_path).unwrap();
        assert!(tags.contains("plist"));
        assert!(tags.contains("binary"));
    }

    #[test]
    fn test_plist_text_detection() {
        let dir = tempdir().unwrap();
        let plist_path = dir.path().join("test.plist");

        let text_plist = r#"<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
 <key>TestKey</key>
 <string>TestValue</string>
</dict>
</plist>"#;
        fs::write(&plist_path, text_plist).unwrap();

        let tags = tags_from_path(&plist_path).unwrap();
        assert!(tags.contains("plist"));
        assert!(tags.contains("text"));
    }

    // --- Edge cases ---

    #[test]
    fn test_empty_file() {
        let dir = tempdir().unwrap();
        let empty_path = dir.path().join("empty");
        fs::write(&empty_path, "").unwrap();

        let tags = tags_from_path(&empty_path).unwrap();
        assert!(tags.contains("file"));
        assert!(tags.contains("text"));
        assert!(tags.contains("non-executable"));
    }

    #[test]
    fn test_shebang_incomplete() {
        let shebang_incomplete = parse_shebang(Cursor::new(b"#! \n")).unwrap();
        assert!(shebang_incomplete.is_empty());
    }

    #[test]
    fn test_multiple_extensions() {
        let tags = tags_from_filename("backup.tar.gz");
        assert!(tags.contains("binary"));
        assert!(tags.contains("gzip"));
    }

    // --- Table-driven shebang checks ---

    #[test]
    fn test_comprehensive_shebang_parsing() {
        let test_cases = vec![
            ("", vec![]),
            ("#!/usr/bin/python", vec!["python"]),
            ("#!/usr/bin/env python", vec!["python"]),
            ("#! /usr/bin/python", vec!["python"]),
            ("#!/usr/bin/foo python", vec![]),
            ("#!/usr/bin/env -S python -u", vec!["python"]),
            ("#!/usr/bin/env", vec![]),
            ("#!/usr/bin/env -S", vec![]),
        ];

        for (input, expected) in test_cases {
            let tags = parse_shebang(Cursor::new(input.as_bytes())).unwrap();
            let expected_set: TagSet = expected.iter().cloned().collect();
            assert_eq!(tags, expected_set, "Failed for input: '{}'", input);
        }
    }

    #[test]
    fn test_invalid_utf8_shebang() {
        let invalid_utf8_cases = vec![
            &[0xf9, 0x93, 0x01, 0x42, 0xcd][..],
            &[0x23, 0x21, 0xf9, 0x93, 0x01, 0x42, 0xcd][..],
            &[0x23, 0x21, 0x00, 0x00, 0x00, 0x00][..],
        ];

        for input in invalid_utf8_cases {
            let result = parse_shebang(Cursor::new(input));
            // Either outcome is accepted: an error, or success with no tags.
            if let Ok(tags) = result {
                assert!(tags.is_empty());
            }
        }
    }
}