1use std::collections::HashSet;
59use std::fmt;
60use std::fs;
61use std::io::{BufReader, Read};
62use std::path::Path;
63
64pub mod extensions;
65pub mod interpreters;
66pub mod tags;
67
/// Immutable sequence of the command components parsed from a shebang line.
///
/// The components are kept in a boxed slice, so the collection has a fixed
/// length once it has been built.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ShebangTuple {
    components: Box<[String]>,
}

impl ShebangTuple {
    /// Builds a tuple that holds no components.
    pub fn new() -> Self {
        Self {
            components: Box::default(),
        }
    }

    /// Builds a tuple that takes ownership of every string in `vec`, in order.
    pub fn from_vec(vec: Vec<String>) -> Self {
        let components = vec.into_boxed_slice();
        Self { components }
    }

    /// Number of components stored in the tuple.
    pub const fn len(&self) -> usize {
        self.components.len()
    }

    /// Whether the tuple holds no components at all.
    pub const fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Component at `index` as a string slice, or `None` when `index` is out of range.
    pub fn get(&self, index: usize) -> Option<&str> {
        self.components.get(index).map(String::as_str)
    }

    /// First component, or `None` for an empty tuple.
    pub fn first(&self) -> Option<&str> {
        self.components.first().map(String::as_str)
    }

    /// Consumes the tuple and returns its components as a `Vec`.
    pub fn into_vec(self) -> Vec<String> {
        Vec::from(self.components)
    }

    /// Borrowing iterator over the components, in order.
    pub fn iter(&self) -> std::slice::Iter<'_, String> {
        self.as_slice().iter()
    }

    /// Components viewed as a plain slice.
    pub fn as_slice(&self) -> &[String] {
        &self.components[..]
    }
}
128
129impl std::ops::Index<usize> for ShebangTuple {
131 type Output = str;
132
133 fn index(&self, index: usize) -> &Self::Output {
134 &self.components[index]
135 }
136}
137
138impl<'a> IntoIterator for &'a ShebangTuple {
140 type Item = &'a String;
141 type IntoIter = std::slice::Iter<'a, String>;
142
143 fn into_iter(self) -> Self::IntoIter {
144 self.components.iter()
145 }
146}
147
148impl FromIterator<String> for ShebangTuple {
150 fn from_iter<T: IntoIterator<Item = String>>(iter: T) -> Self {
151 Self::from_vec(iter.into_iter().collect())
152 }
153}
154
155impl fmt::Display for ShebangTuple {
157 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
158 write!(f, "(")?;
159 for (i, component) in self.components.iter().enumerate() {
160 if i > 0 {
161 write!(f, ", ")?;
162 }
163 write!(f, "'{component}'")?;
164 }
165 if self.components.len() == 1 {
166 write!(f, ",")?; }
168 write!(f, ")")
169 }
170}
171
172impl From<Vec<String>> for ShebangTuple {
174 fn from(vec: Vec<String>) -> Self {
175 Self::from_vec(vec)
176 }
177}
178
179impl Default for ShebangTuple {
181 fn default() -> Self {
182 Self::new()
183 }
184}
185
186use extensions::{get_extension_tags, get_extensions_need_binary_check_tags, get_name_tags};
187use interpreters::get_interpreter_tags;
188use tags::*;
189
190#[derive(Debug, Clone)]
195pub struct FileIdentifier {
196 skip_content_analysis: bool,
197 skip_shebang_analysis: bool,
198 custom_extensions: Option<std::collections::HashMap<String, TagSet>>,
199}
200
201impl Default for FileIdentifier {
202 fn default() -> Self {
203 Self::new()
204 }
205}
206
207impl FileIdentifier {
208 pub fn new() -> Self {
216 Self {
217 skip_content_analysis: false,
218 skip_shebang_analysis: false,
219 custom_extensions: None,
220 }
221 }
222
223 pub fn skip_content_analysis(mut self) -> Self {
228 self.skip_content_analysis = true;
229 self
230 }
231
232 pub fn skip_shebang_analysis(mut self) -> Self {
237 self.skip_shebang_analysis = true;
238 self
239 }
240
241 pub fn with_custom_extensions(
246 mut self,
247 extensions: std::collections::HashMap<String, TagSet>,
248 ) -> Self {
249 self.custom_extensions = Some(extensions);
250 self
251 }
252
253 pub fn identify<P: AsRef<Path>>(&self, path: P) -> Result<TagSet> {
257 self.identify_with_config(path)
258 }
259
260 fn identify_with_config<P: AsRef<Path>>(&self, path: P) -> Result<TagSet> {
261 let path = path.as_ref();
262 let path_str = path.to_string_lossy();
263
264 let metadata = match fs::symlink_metadata(path) {
266 Ok(meta) => meta,
267 Err(_) => {
268 return Err(IdentifyError::PathNotFound {
269 path: path_str.to_string(),
270 });
271 }
272 };
273
274 if let Some(file_type_tags) = analyze_file_type(&metadata) {
276 return Ok(file_type_tags);
277 }
278
279 let mut tags = TagSet::new();
281 tags.insert(FILE);
282
283 let is_executable = analyze_permissions(path, &metadata);
285 if is_executable {
286 tags.insert(EXECUTABLE);
287 } else {
288 tags.insert(NON_EXECUTABLE);
289 }
290
291 let filename_and_shebang_tags =
293 self.analyze_filename_and_shebang_configured(path, is_executable);
294 tags.extend(filename_and_shebang_tags);
295
296 if !self.skip_content_analysis {
298 let encoding_tags = analyze_content_encoding(path, &tags)?;
299 tags.extend(encoding_tags);
300 }
301
302 Ok(tags)
303 }
304
305 fn analyze_filename_and_shebang_configured<P: AsRef<Path>>(
306 &self,
307 path: P,
308 is_executable: bool,
309 ) -> TagSet {
310 let path = path.as_ref();
311 let mut tags = TagSet::new();
312
313 if let Some(filename) = path.file_name().and_then(|n| n.to_str()) {
315 if let Some(custom_exts) = &self.custom_extensions {
317 if let Some(ext) = Path::new(filename).extension().and_then(|e| e.to_str()) {
318 let ext_lower = ext.to_lowercase();
319 if let Some(ext_tags) = custom_exts.get(&ext_lower) {
320 tags.extend(ext_tags.iter().cloned());
321 return tags; }
323 }
324 }
325
326 let filename_tags = tags_from_filename(filename);
328 if !filename_tags.is_empty() {
329 tags.extend(filename_tags);
330 } else if is_executable && !self.skip_shebang_analysis {
331 if let Ok(shebang_components) = parse_shebang_from_file(path) {
333 if !shebang_components.is_empty() {
334 let interpreter_tags = tags_from_interpreter(&shebang_components[0]);
335 tags.extend(interpreter_tags);
336 }
337 }
338 }
339 }
340
341 tags
342 }
343}
344
345pub type Result<T> = std::result::Result<T, IdentifyError>;
350
351#[derive(thiserror::Error, Debug)]
353pub enum IdentifyError {
354 #[error("{path} does not exist.")]
356 PathNotFound { path: String },
357
358 #[error("IO error: {source}")]
360 IoError {
361 #[from]
362 source: std::io::Error,
363 },
364
365 #[error("Path contains invalid UTF-8: {path}")]
367 InvalidPath { path: String },
368
369 #[error("File contains invalid UTF-8 content")]
371 InvalidUtf8,
372}
373
374fn analyze_file_type(metadata: &std::fs::Metadata) -> Option<TagSet> {
379 let file_type = metadata.file_type();
380
381 if file_type.is_dir() {
382 return Some([DIRECTORY].iter().cloned().collect());
383 }
384 if file_type.is_symlink() {
385 return Some([SYMLINK].iter().cloned().collect());
386 }
387
388 #[cfg(unix)]
390 {
391 use std::os::unix::fs::FileTypeExt;
392 if file_type.is_socket() {
393 return Some([SOCKET].iter().cloned().collect());
394 }
395 }
396
397 None
399}
400
/// Decides whether a file counts as executable.
///
/// On Unix the decision comes from the permission bits in `metadata` (any of
/// the user/group/other execute bits). Elsewhere it comes from the extension of
/// `path`: `exe`, `bat` or `cmd`, compared case-insensitively.
fn analyze_permissions<P: AsRef<Path>>(path: P, metadata: &std::fs::Metadata) -> bool {
    #[cfg(unix)]
    {
        use std::os::unix::fs::PermissionsExt;

        const ANY_EXECUTE_BIT: u32 = 0o111;

        // The path plays no role on Unix; mark it as intentionally unused.
        let _ = path;
        (metadata.permissions().mode() & ANY_EXECUTE_BIT) != 0
    }
    #[cfg(not(unix))]
    {
        // The metadata plays no role here; mark it as intentionally unused.
        let _ = metadata;
        match path.as_ref().extension().and_then(|ext| ext.to_str()) {
            Some(ext) => matches!(ext.to_lowercase().as_str(), "exe" | "bat" | "cmd"),
            None => false,
        }
    }
}
423
424fn analyze_filename_and_shebang<P: AsRef<Path>>(path: P, is_executable: bool) -> TagSet {
429 let path = path.as_ref();
430 let mut tags = TagSet::new();
431
432 if let Some(filename) = path.file_name().and_then(|n| n.to_str()) {
434 let filename_tags = tags_from_filename(filename);
435 if !filename_tags.is_empty() {
436 tags.extend(filename_tags);
437 } else if is_executable {
438 if let Ok(shebang_components) = parse_shebang_from_file(path) {
440 if !shebang_components.is_empty() {
441 let interpreter_tags = tags_from_interpreter(&shebang_components[0]);
442 tags.extend(interpreter_tags);
443 }
444 }
445 }
446 }
447
448 tags
449}
450
451fn analyze_content_encoding<P: AsRef<Path>>(path: P, existing_tags: &TagSet) -> Result<TagSet> {
455 let mut tags = TagSet::new();
456
457 if !existing_tags.iter().any(|tag| ENCODING_TAGS.contains(tag)) {
459 if file_is_text(path)? {
460 tags.insert(TEXT);
461 } else {
462 tags.insert(BINARY);
463 }
464 }
465
466 Ok(tags)
467}
468
469pub fn tags_from_path<P: AsRef<Path>>(path: P) -> Result<TagSet> {
509 let path = path.as_ref();
510 let path_str = path.to_string_lossy();
511
512 let metadata = match fs::symlink_metadata(path) {
514 Ok(meta) => meta,
515 Err(_) => {
516 return Err(IdentifyError::PathNotFound {
517 path: path_str.to_string(),
518 });
519 }
520 };
521
522 if let Some(file_type_tags) = analyze_file_type(&metadata) {
524 return Ok(file_type_tags);
525 }
526
527 let mut tags = TagSet::new();
529 tags.insert(FILE);
530
531 let is_executable = analyze_permissions(path, &metadata);
533 if is_executable {
534 tags.insert(EXECUTABLE);
535 } else {
536 tags.insert(NON_EXECUTABLE);
537 }
538
539 let filename_and_shebang_tags = analyze_filename_and_shebang(path, is_executable);
541 tags.extend(filename_and_shebang_tags);
542
543 let encoding_tags = analyze_content_encoding(path, &tags)?;
545 tags.extend(encoding_tags);
546
547 Ok(tags)
548}
549
550pub fn tags_from_filename(filename: &str) -> TagSet {
581 let mut tags = TagSet::new();
582
583 for part in std::iter::once(filename).chain(filename.split('.')) {
585 let name_tags = get_name_tags(part);
586 if !name_tags.is_empty() {
587 tags.extend(name_tags);
588 break;
589 }
590 }
591
592 if let Some(ext) = Path::new(filename).extension().and_then(|e| e.to_str()) {
594 let ext_lower = ext.to_lowercase();
595
596 let ext_tags = get_extension_tags(&ext_lower);
597 if !ext_tags.is_empty() {
598 tags.extend(ext_tags);
599 } else {
600 let binary_check_tags = get_extensions_need_binary_check_tags(&ext_lower);
601 if !binary_check_tags.is_empty() {
602 tags.extend(binary_check_tags);
603 }
604 }
605 }
606
607 tags
608}
609
610pub fn tags_from_interpreter(interpreter: &str) -> TagSet {
642 let interpreter_name = interpreter.split('/').next_back().unwrap_or(interpreter);
644
645 let mut current = interpreter_name;
647 while !current.is_empty() {
648 let tags = get_interpreter_tags(current);
649 if !tags.is_empty() {
650 return tags;
651 }
652
653 match current.rfind('.') {
655 Some(pos) => current = ¤t[..pos],
656 None => break,
657 }
658 }
659
660 TagSet::new()
661}
662
663pub fn file_is_text<P: AsRef<Path>>(path: P) -> Result<bool> {
697 let file = fs::File::open(path)?;
698 is_text(file)
699}
700
701pub fn is_text<R: Read>(mut reader: R) -> Result<bool> {
727 let mut buffer = [0; 1024];
728 let bytes_read = reader.read(&mut buffer)?;
729
730 let text_chars: HashSet<u8> = [
732 7, 8, 9, 10, 11, 12, 13, 27, ]
734 .iter()
735 .cloned()
736 .chain(0x20..0x7F) .chain(0x80..=0xFF) .collect();
739
740 let is_text = buffer[..bytes_read]
741 .iter()
742 .all(|&byte| text_chars.contains(&byte));
743 Ok(is_text)
744}
745
746pub fn parse_shebang_from_file<P: AsRef<Path>>(path: P) -> Result<ShebangTuple> {
784 let path = path.as_ref();
785
786 let metadata = fs::metadata(path)?;
788 #[cfg(unix)]
789 {
790 use std::os::unix::fs::PermissionsExt;
791 if metadata.permissions().mode() & 0o111 == 0 {
792 return Ok(ShebangTuple::new());
793 }
794 }
795
796 let file = fs::File::open(path)?;
797 parse_shebang(file)
798}
799
800pub fn parse_shebang<R: Read>(reader: R) -> Result<ShebangTuple> {
829 use std::io::BufRead;
830
831 let mut buf_reader = BufReader::new(reader);
832
833 let mut first_line_bytes = Vec::new();
835 match buf_reader.read_until(b'\n', &mut first_line_bytes) {
836 Ok(0) => return Ok(ShebangTuple::new()), Ok(_) => {
838 if first_line_bytes.ends_with(b"\n") {
840 first_line_bytes.pop();
841 }
842 if first_line_bytes.ends_with(b"\r") {
844 first_line_bytes.pop();
845 }
846 }
847 Err(_) => return Ok(ShebangTuple::new()), }
849
850 if first_line_bytes.len() < 2 || &first_line_bytes[0..2] != b"#!" {
852 return Ok(ShebangTuple::new());
853 }
854
855 if first_line_bytes.len() > 1024 {
857 first_line_bytes.truncate(1024);
858 }
859
860 let first_line = match String::from_utf8(first_line_bytes) {
862 Ok(line) => line,
863 Err(_) => return Ok(ShebangTuple::new()),
864 };
865
866 let shebang_line = first_line[2..].trim();
868
869 for c in shebang_line.chars() {
871 if !c.is_ascii() || (c.is_control() && c != '\t') {
872 return Ok(ShebangTuple::new());
873 }
874 }
875
876 let parts: smallvec::SmallVec<[&str; 4]> = shebang_line.split_whitespace().collect();
878 if parts.is_empty() {
879 return Ok(ShebangTuple::new());
880 }
881
882 let cmd: smallvec::SmallVec<[&str; 2]> = if parts[0] == "/usr/bin/env" {
883 if parts.len() == 1 {
884 smallvec::SmallVec::new()
886 } else if parts.len() >= 2 && parts[1] == "-S" {
887 if parts.len() > 2 {
888 parts[2..].iter().copied().collect()
889 } else {
890 smallvec::SmallVec::new()
892 }
893 } else {
894 parts[1..].iter().copied().collect()
895 }
896 } else {
897 parts.iter().copied().collect()
898 };
899
900 if cmd.is_empty() {
901 return Ok(ShebangTuple::new());
902 }
903
904 Ok(ShebangTuple::from_vec(
906 cmd.iter().map(|s| s.to_string()).collect(),
907 ))
908}
909
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashSet;
    use std::fs;
    use std::io::Cursor;
    // Permission bits can only be set through the Unix extension trait.
    #[cfg(unix)]
    use std::os::unix::fs::PermissionsExt;
    use tempfile::{NamedTempFile, tempdir};

    /// Builds a `ShebangTuple` from string literals; no arguments gives an empty tuple.
    macro_rules! shebang_tuple {
        () => {
            ShebangTuple::new()
        };
        ($($item:expr),+) => {
            ShebangTuple::from_vec(vec![$($item.to_string()),+])
        };
    }

    // --- Tag tables ---

    #[test]
    fn test_all_basic_tags_exist() {
        assert!(TYPE_TAGS.contains("file"));
        assert!(TYPE_TAGS.contains("directory"));
        assert!(MODE_TAGS.contains("executable"));
        assert!(ENCODING_TAGS.contains("text"));
    }

    #[test]
    fn test_tag_groups_are_disjoint() {
        assert!(TYPE_TAGS.is_disjoint(&MODE_TAGS));
        assert!(TYPE_TAGS.is_disjoint(&ENCODING_TAGS));
        assert!(MODE_TAGS.is_disjoint(&ENCODING_TAGS));
    }

    // --- tags_from_filename ---

    #[test]
    fn test_tags_from_filename_basic() {
        let tags = tags_from_filename("file.py");
        assert!(tags.contains("text"));
        assert!(tags.contains("python"));
    }

    #[test]
    fn test_tags_from_filename_special_names() {
        let tags = tags_from_filename("Dockerfile");
        assert!(tags.contains("dockerfile"));
        assert!(tags.contains("text"));

        let tags = tags_from_filename("Makefile");
        assert!(tags.contains("makefile"));
        assert!(tags.contains("text"));

        let tags = tags_from_filename("Cargo.toml");
        assert!(tags.contains("toml"));
        assert!(tags.contains("cargo"));
    }

    #[test]
    fn test_tags_from_filename_case_insensitive_extension() {
        let tags = tags_from_filename("image.JPG");
        assert!(tags.contains("binary"));
        assert!(tags.contains("image"));
        assert!(tags.contains("jpeg"));
    }

    #[test]
    fn test_tags_from_filename_precedence() {
        let tags = tags_from_filename("setup.cfg");
        assert!(tags.contains("ini"));
    }

    #[test]
    fn test_tags_from_filename_complex_names() {
        let tags = tags_from_filename("Dockerfile.xenial");
        assert!(tags.contains("dockerfile"));

        let tags = tags_from_filename("README.md");
        assert!(tags.contains("markdown"));
        assert!(tags.contains("plain-text"));
    }

    #[test]
    fn test_tags_from_filename_unrecognized() {
        let tags = tags_from_filename("unknown.xyz");
        assert!(tags.is_empty());

        let tags = tags_from_filename("noextension");
        assert!(tags.is_empty());
    }

    // --- tags_from_interpreter ---

    #[test]
    fn test_tags_from_interpreter_basic() {
        let tags = tags_from_interpreter("python3");
        assert!(tags.contains("python"));
        assert!(tags.contains("python3"));
    }

    #[test]
    fn test_tags_from_interpreter_versioned() {
        let tags = tags_from_interpreter("python3.11.2");
        assert!(tags.contains("python"));
        assert!(tags.contains("python3"));

        let tags = tags_from_interpreter("php8.1");
        assert!(tags.contains("php"));
        assert!(tags.contains("php8"));
    }

    #[test]
    fn test_tags_from_interpreter_with_path() {
        let tags = tags_from_interpreter("/usr/bin/python3");
        assert!(tags.contains("python"));
        assert!(tags.contains("python3"));
    }

    #[test]
    fn test_tags_from_interpreter_unrecognized() {
        let tags = tags_from_interpreter("unknown-interpreter");
        assert!(tags.is_empty());

        let tags = tags_from_interpreter("");
        assert!(tags.is_empty());
    }

    // --- is_text ---

    #[test]
    fn test_is_text_basic() {
        assert!(is_text(Cursor::new(b"hello world")).unwrap());
        assert!(is_text(Cursor::new(b"")).unwrap());
        assert!(!is_text(Cursor::new(b"hello\x00world")).unwrap());
    }

    #[test]
    fn test_is_text_unicode() {
        assert!(is_text(Cursor::new("éóñəå ⊂(◉‿◉)つ(ノ≥∇≤)ノ".as_bytes())).unwrap());
        assert!(is_text(Cursor::new(r"¯\_(ツ)_/¯".as_bytes())).unwrap());
        assert!(is_text(Cursor::new("♪┏(・o・)┛♪┗ ( ・o・) ┓♪".as_bytes())).unwrap());
    }

    #[test]
    fn test_is_text_binary_data() {
        assert!(!is_text(Cursor::new(&[0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01])).unwrap());
        assert!(!is_text(Cursor::new(&[0x43, 0x92, 0xd9, 0x0f, 0xaf, 0x32, 0x2c])).unwrap());
    }

    // --- parse_shebang ---

    #[test]
    fn test_parse_shebang_basic() {
        let components = parse_shebang(Cursor::new(b"#!/usr/bin/python")).unwrap();
        assert_eq!(components, shebang_tuple!["/usr/bin/python"]);

        let components = parse_shebang(Cursor::new(b"#!/usr/bin/env python")).unwrap();
        assert_eq!(components, shebang_tuple!["python"]);
    }

    #[test]
    fn test_parse_shebang_env_with_flags() {
        let components = parse_shebang(Cursor::new(b"#!/usr/bin/env -S python -u")).unwrap();
        assert_eq!(components, shebang_tuple!["python", "-u"]);
    }

    #[test]
    fn test_parse_shebang_spaces() {
        let components = parse_shebang(Cursor::new(b"#! /usr/bin/python")).unwrap();
        assert_eq!(components, shebang_tuple!["/usr/bin/python"]);

        let components = parse_shebang(Cursor::new(b"#!/usr/bin/foo python")).unwrap();
        assert_eq!(components, shebang_tuple!["/usr/bin/foo", "python"]);
    }

    #[test]
    fn test_parse_shebang_no_shebang() {
        let components = parse_shebang(Cursor::new(b"import sys")).unwrap();
        assert!(components.is_empty());

        let components = parse_shebang(Cursor::new(b"")).unwrap();
        assert!(components.is_empty());
    }

    #[test]
    fn test_parse_shebang_invalid_utf8() {
        // Either outcome is acceptable: an error, or an empty result.
        let result = parse_shebang(Cursor::new(&[0x23, 0x21, 0xf9, 0x93, 0x01, 0x42, 0xcd]));
        if let Ok(components) = result {
            assert!(components.is_empty());
        }
    }

    // --- tags_from_path ---

    #[test]
    fn test_tags_from_path_file_not_found() {
        let result = tags_from_path("/nonexistent/path");
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("does not exist"));
    }

    #[test]
    fn test_tags_from_path_regular_file() {
        let file = NamedTempFile::new().unwrap();
        fs::write(&file, "print('hello')").unwrap();

        let tags = tags_from_path(file.path()).unwrap();
        assert!(tags.contains("file"));
        assert!(tags.contains("non-executable"));
        assert!(tags.contains("text"));
    }

    #[cfg(unix)]
    #[test]
    fn test_tags_from_path_executable_file() {
        let dir = tempdir().unwrap();
        let script_path = dir.path().join("script.py");
        fs::write(&script_path, "#!/usr/bin/env python3\nprint('hello')").unwrap();

        let mut perms = fs::metadata(&script_path).unwrap().permissions();
        perms.set_mode(0o755);
        fs::set_permissions(&script_path, perms).unwrap();

        let tags = tags_from_path(&script_path).unwrap();
        assert!(tags.contains("file"));
        assert!(tags.contains("executable"));
        assert!(tags.contains("python"));
        assert!(tags.contains("text"));
    }

    #[test]
    fn test_tags_from_path_directory() {
        let dir = tempdir().unwrap();
        let tags = tags_from_path(dir.path()).unwrap();
        assert_eq!(tags, HashSet::from(["directory"]));
    }

    #[test]
    fn test_tags_from_path_binary_file() {
        let dir = tempdir().unwrap();
        let binary_path = dir.path().join("binary");
        fs::write(&binary_path, &[0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01]).unwrap();

        let tags = tags_from_path(&binary_path).unwrap();
        assert!(tags.contains("file"));
        assert!(tags.contains("binary"));
        assert!(tags.contains("non-executable"));
    }

    // --- file_is_text ---

    #[test]
    fn test_file_is_text_simple() {
        let dir = tempdir().unwrap();
        let text_path = dir.path().join("text.txt");
        fs::write(&text_path, "Hello, world!").unwrap();
        assert!(file_is_text(&text_path).unwrap());
    }

    #[test]
    fn test_file_is_text_does_not_exist() {
        let result = file_is_text("/nonexistent/file");
        assert!(result.is_err());
    }

    // --- Extensions that need a content check ---

    #[test]
    fn test_plist_binary_detection() {
        let dir = tempdir().unwrap();
        let plist_path = dir.path().join("test.plist");

        let binary_plist = [
            0x62, 0x70, 0x6c, 0x69, 0x73, 0x74, 0x30, 0x30, 0xd1, 0x01, 0x02, 0x5f, 0x10, 0x0f,
        ];
        fs::write(&plist_path, &binary_plist).unwrap();

        let tags = tags_from_path(&plist_path).unwrap();
        assert!(tags.contains("plist"));
        assert!(tags.contains("binary"));
    }

    #[test]
    fn test_plist_text_detection() {
        let dir = tempdir().unwrap();
        let plist_path = dir.path().join("test.plist");

        let text_plist = r#"<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
    <key>TestKey</key>
    <string>TestValue</string>
</dict>
</plist>"#;
        fs::write(&plist_path, text_plist).unwrap();

        let tags = tags_from_path(&plist_path).unwrap();
        assert!(tags.contains("plist"));
        assert!(tags.contains("text"));
    }

    // --- Edge cases ---

    #[test]
    fn test_empty_file() {
        let dir = tempdir().unwrap();
        let empty_path = dir.path().join("empty");
        fs::write(&empty_path, "").unwrap();

        let tags = tags_from_path(&empty_path).unwrap();
        assert!(tags.contains("file"));
        assert!(tags.contains("text"));
        assert!(tags.contains("non-executable"));
    }

    #[test]
    fn test_shebang_incomplete() {
        let shebang_incomplete = parse_shebang(Cursor::new(b"#! \n")).unwrap();
        assert!(shebang_incomplete.is_empty());
    }

    #[test]
    fn test_multiple_extensions() {
        let tags = tags_from_filename("backup.tar.gz");
        assert!(tags.contains("binary"));
        assert!(tags.contains("gzip"));
    }

    // --- FileIdentifier ---

    #[test]
    fn test_file_identifier_default() {
        let dir = tempdir().unwrap();
        let py_file = dir.path().join("test.py");
        fs::write(&py_file, "print('hello')").unwrap();

        let identifier = FileIdentifier::new();
        let tags = identifier.identify(&py_file).unwrap();

        assert!(tags.contains("file"));
        assert!(tags.contains("python"));
        assert!(tags.contains("text"));
        assert!(tags.contains("non-executable"));
    }

    #[test]
    fn test_file_identifier_skip_content_analysis() {
        let dir = tempdir().unwrap();
        let unknown_file = dir.path().join("unknown_file");
        fs::write(&unknown_file, "some content").unwrap();

        let identifier = FileIdentifier::new().skip_content_analysis();
        let tags = identifier.identify(&unknown_file).unwrap();

        assert!(tags.contains("file"));
        assert!(tags.contains("non-executable"));
        // With content analysis skipped, neither encoding tag may appear.
        assert!(!tags.contains("text"));
        assert!(!tags.contains("binary"));
    }

    #[cfg(unix)]
    #[test]
    fn test_file_identifier_skip_shebang_analysis() {
        let dir = tempdir().unwrap();
        let script_file = dir.path().join("script");
        fs::write(&script_file, "#!/usr/bin/env python3\nprint('hello')").unwrap();

        let mut perms = fs::metadata(&script_file).unwrap().permissions();
        perms.set_mode(0o755);
        fs::set_permissions(&script_file, perms).unwrap();

        let identifier = FileIdentifier::new().skip_shebang_analysis();
        let tags = identifier.identify(&script_file).unwrap();

        assert!(tags.contains("file"));
        assert!(tags.contains("executable"));
        // The shebang names python, but shebang analysis is switched off.
        assert!(!tags.contains("python"));
    }

    #[test]
    fn test_file_identifier_custom_extensions() {
        let dir = tempdir().unwrap();
        let custom_file = dir.path().join("test.myext");
        fs::write(&custom_file, "custom content").unwrap();

        let mut custom_extensions = std::collections::HashMap::new();
        custom_extensions.insert("myext".to_string(), HashSet::from(["custom", "text"]));

        let identifier = FileIdentifier::new().with_custom_extensions(custom_extensions);
        let tags = identifier.identify(&custom_file).unwrap();

        assert!(tags.contains("file"));
        assert!(tags.contains("custom"));
        assert!(tags.contains("text"));
        assert!(tags.contains("non-executable"));
    }

    #[test]
    fn test_file_identifier_chaining() {
        let dir = tempdir().unwrap();
        let test_file = dir.path().join("test.unknown");
        fs::write(&test_file, "content").unwrap();

        let identifier = FileIdentifier::new()
            .skip_content_analysis()
            .skip_shebang_analysis();
        let tags = identifier.identify(&test_file).unwrap();

        assert!(tags.contains("file"));
        assert!(tags.contains("non-executable"));
        assert!(!tags.contains("text"));
        assert!(!tags.contains("binary"));
    }

    // --- Table-driven shebang coverage ---

    #[test]
    fn test_comprehensive_shebang_parsing() {
        // Each row pairs an input with the exact components the parser must return.
        let test_cases = vec![
            ("", vec![]),
            ("#!/usr/bin/python", vec!["/usr/bin/python"]),
            ("#!/usr/bin/env python", vec!["python"]),
            ("#! /usr/bin/python", vec!["/usr/bin/python"]),
            ("#!/usr/bin/foo python", vec!["/usr/bin/foo", "python"]),
            ("#!/usr/bin/env -S python -u", vec!["python", "-u"]),
            ("#!/usr/bin/env", vec![]),
            ("#!/usr/bin/env -S", vec![]),
        ];

        for (input, expected) in test_cases {
            let components = parse_shebang(Cursor::new(input.as_bytes())).unwrap();
            let expected = ShebangTuple::from_vec(expected.iter().map(|s| s.to_string()).collect());

            assert_eq!(
                components, expected,
                "Got components: {:?} for input: '{}'",
                components, input
            );
        }
    }

    #[test]
    fn test_invalid_utf8_shebang() {
        let invalid_utf8_cases = vec![
            &[0xf9, 0x93, 0x01, 0x42, 0xcd][..],
            &[0x23, 0x21, 0xf9, 0x93, 0x01, 0x42, 0xcd][..],
            &[0x23, 0x21, 0x00, 0x00, 0x00, 0x00][..],
        ];

        for input in invalid_utf8_cases {
            // Either outcome is acceptable: an error, or an empty result.
            if let Ok(components) = parse_shebang(Cursor::new(input)) {
                assert!(components.is_empty());
            }
        }
    }
}