1use std::collections::HashSet;
59use std::fmt;
60use std::fs;
61use std::io::{BufReader, Read};
62use std::path::Path;
63
64pub mod extensions;
65pub mod interpreters;
66pub mod tags;
67
/// Immutable, ordered list of the components of a parsed shebang line.
///
/// Its [`fmt::Display`] output looks like a Python tuple literal:
/// `()`, `('python',)`, `('python', '-u')`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ShebangTuple {
    components: Box<[String]>,
}

impl ShebangTuple {
    /// Creates a tuple with no components.
    pub fn new() -> Self {
        Self::from_vec(Vec::new())
    }

    /// Builds a tuple from `vec`, preserving element order.
    pub fn from_vec(vec: Vec<String>) -> Self {
        let components = vec.into_boxed_slice();
        Self { components }
    }

    /// Number of components.
    pub const fn len(&self) -> usize {
        self.components.len()
    }

    /// `true` when there are no components.
    pub const fn is_empty(&self) -> bool {
        self.components.is_empty()
    }

    /// Component at `index`, or `None` when `index` is out of bounds.
    pub fn get(&self, index: usize) -> Option<&str> {
        self.components.get(index).map(String::as_str)
    }

    /// First component, or `None` for an empty tuple.
    pub fn first(&self) -> Option<&str> {
        self.components.first().map(String::as_str)
    }

    /// Consumes the tuple and returns its components as a `Vec`.
    pub fn into_vec(self) -> Vec<String> {
        Vec::from(self.components)
    }

    /// Iterates over the components in order.
    pub fn iter(&self) -> std::slice::Iter<'_, String> {
        self.as_slice().iter()
    }

    /// Borrows the components as a slice.
    pub fn as_slice(&self) -> &[String] {
        &*self.components
    }
}

/// Positional access; panics when `index` is out of bounds (use [`ShebangTuple::get`] to avoid
/// the panic).
impl std::ops::Index<usize> for ShebangTuple {
    type Output = str;

    fn index(&self, index: usize) -> &Self::Output {
        self.components[index].as_str()
    }
}

impl<'a> IntoIterator for &'a ShebangTuple {
    type Item = &'a String;
    type IntoIter = std::slice::Iter<'a, String>;

    fn into_iter(self) -> Self::IntoIter {
        self.iter()
    }
}

impl FromIterator<String> for ShebangTuple {
    fn from_iter<T: IntoIterator<Item = String>>(iter: T) -> Self {
        let collected: Vec<String> = iter.into_iter().collect();
        Self::from_vec(collected)
    }
}

impl fmt::Display for ShebangTuple {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match &*self.components {
            [] => f.write_str("()"),
            // A single element keeps a trailing comma.
            [only] => write!(f, "('{only}',)"),
            [head, tail @ ..] => {
                write!(f, "('{head}'")?;
                for component in tail {
                    write!(f, ", '{component}'")?;
                }
                f.write_str(")")
            }
        }
    }
}

impl From<Vec<String>> for ShebangTuple {
    fn from(vec: Vec<String>) -> Self {
        Self {
            components: vec.into_boxed_slice(),
        }
    }
}

impl Default for ShebangTuple {
    fn default() -> Self {
        Self {
            components: Box::default(),
        }
    }
}
185
186use extensions::{get_extension_tags, get_extensions_need_binary_check_tags, get_name_tags};
187use interpreters::get_interpreter_tags;
188pub use tags::FileKind;
189use tags::*;
190
/// Pre-collected description of a file, used to compute tags without touching the filesystem
/// (see `tags_from_info` and `FileIdentifier::identify_from`).
#[derive(Debug, Clone)]
pub struct FileInfo<'a> {
    /// File name; its name parts and extension drive the filename-based tag lookup.
    pub filename: &'a str,
    /// Kind of entry. Every kind other than `FileKind::Regular` yields a single type tag and
    /// skips all further analysis.
    pub file_kind: FileKind,
    /// Selects the executable / non-executable tag and gates shebang parsing.
    pub is_executable: bool,
    /// File bytes, if available. Used for shebang parsing and the text/binary check; with
    /// `None` neither of those steps runs.
    pub content: Option<&'a [u8]>,
}
223
224#[derive(Debug, Clone)]
229pub struct FileIdentifier {
230 skip_content_analysis: bool,
231 skip_shebang_analysis: bool,
232 custom_extensions: Option<std::collections::HashMap<String, TagSet>>,
233}
234
235impl Default for FileIdentifier {
236 fn default() -> Self {
237 Self::new()
238 }
239}
240
241impl FileIdentifier {
242 pub fn new() -> Self {
250 Self {
251 skip_content_analysis: false,
252 skip_shebang_analysis: false,
253 custom_extensions: None,
254 }
255 }
256
257 pub fn skip_content_analysis(mut self) -> Self {
262 self.skip_content_analysis = true;
263 self
264 }
265
266 pub fn skip_shebang_analysis(mut self) -> Self {
271 self.skip_shebang_analysis = true;
272 self
273 }
274
275 pub fn with_custom_extensions(
280 mut self,
281 extensions: std::collections::HashMap<String, TagSet>,
282 ) -> Self {
283 self.custom_extensions = Some(extensions);
284 self
285 }
286
287 pub fn identify<P: AsRef<Path>>(&self, path: P) -> Result<TagSet> {
291 self.identify_with_config(path)
292 }
293
294 fn identify_with_config<P: AsRef<Path>>(&self, path: P) -> Result<TagSet> {
295 let path = path.as_ref();
296 let path_str = path.to_string_lossy();
297
298 let metadata = match fs::symlink_metadata(path) {
300 Ok(meta) => meta,
301 Err(_) => {
302 return Err(IdentifyError::PathNotFound {
303 path: path_str.to_string(),
304 });
305 }
306 };
307
308 if let Some(file_type_tags) = analyze_file_type(&metadata) {
310 return Ok(file_type_tags);
311 }
312
313 let mut tags = TagSet::new();
315 tags.insert(FILE);
316
317 let is_executable = analyze_permissions(path, &metadata);
319 tags.insert(if is_executable {
320 EXECUTABLE
321 } else {
322 NON_EXECUTABLE
323 });
324
325 tags.extend(self.analyze_filename_and_shebang_configured(path, is_executable));
327
328 if !self.skip_content_analysis {
330 tags.extend(analyze_content_encoding(path, &tags)?);
331 }
332
333 Ok(tags)
334 }
335
336 pub fn identify_from(&self, info: &FileInfo<'_>) -> TagSet {
341 match info.file_kind {
342 FileKind::Directory => return HashSet::from([DIRECTORY]),
343 FileKind::Symlink => return HashSet::from([SYMLINK]),
344 FileKind::Socket => return HashSet::from([SOCKET]),
345 FileKind::Regular => {}
346 }
347
348 let mut tags = TagSet::new();
349 tags.insert(FILE);
350 tags.insert(if info.is_executable {
351 EXECUTABLE
352 } else {
353 NON_EXECUTABLE
354 });
355
356 let mut filename_matched = false;
358 if let Some(custom_exts) = &self.custom_extensions
359 && let Some(ext) = Path::new(info.filename)
360 .extension()
361 .and_then(|e| e.to_str())
362 && let Some(ext_tags) = custom_exts.get(&ext.to_lowercase())
363 {
364 tags.extend(ext_tags.iter().copied());
365 filename_matched = true;
366 }
367 if !filename_matched {
368 let filename_tags = tags_from_filename(info.filename);
369 if !filename_tags.is_empty() {
370 tags.extend(filename_tags);
371 filename_matched = true;
372 }
373 }
374
375 if !filename_matched
377 && info.is_executable
378 && !self.skip_shebang_analysis
379 && let Some(content) = info.content
380 && let Ok(shebang) = parse_shebang(content)
381 && let Some(interp) = shebang.first()
382 {
383 tags.extend(tags_from_interpreter(interp));
384 }
385
386 if !self.skip_content_analysis
388 && !tags.iter().any(|t| ENCODING_TAGS.contains(t))
389 && let Some(content) = info.content
390 && let Ok(text) = is_text(content)
391 {
392 tags.insert(if text { TEXT } else { BINARY });
393 }
394
395 tags
396 }
397
398 fn analyze_filename_and_shebang_configured<P: AsRef<Path>>(
399 &self,
400 path: P,
401 is_executable: bool,
402 ) -> TagSet {
403 let path = path.as_ref();
404 let mut tags = TagSet::new();
405
406 if let Some(filename) = path.file_name().and_then(|n| n.to_str()) {
408 if let Some(custom_exts) = &self.custom_extensions
410 && let Some(ext) = Path::new(filename).extension().and_then(|e| e.to_str())
411 && let Some(ext_tags) = custom_exts.get(&ext.to_lowercase())
412 {
413 tags.extend(ext_tags.iter().copied());
414 return tags; }
416
417 let filename_tags = tags_from_filename(filename);
419 if !filename_tags.is_empty() {
420 tags.extend(filename_tags);
421 } else if is_executable && !self.skip_shebang_analysis {
422 if let Ok(shebang_components) = parse_shebang_from_file(path)
424 && let Some(interp) = shebang_components.first()
425 {
426 tags.extend(tags_from_interpreter(interp));
427 }
428 }
429 }
430
431 tags
432 }
433}
434
/// Result alias used throughout this file; the error type is always [`IdentifyError`].
pub type Result<T> = std::result::Result<T, IdentifyError>;
440
/// Errors reported while identifying a file.
#[derive(thiserror::Error, Debug)]
pub enum IdentifyError {
    /// The path's metadata could not be read. `tags_from_path` and `FileIdentifier::identify`
    /// map every `fs::symlink_metadata` failure to this variant, not only a missing file.
    #[error("{path} does not exist.")]
    PathNotFound { path: String },

    /// An underlying I/O error; `#[from]` lets `?` convert a `std::io::Error` into it.
    #[error("IO error: {source}")]
    IoError {
        #[from]
        source: std::io::Error,
    },

    /// NOTE(review): not constructed anywhere in the code visible in this file; presumably
    /// reserved for non-UTF-8 paths — confirm against other modules.
    #[error("Path contains invalid UTF-8: {path}")]
    InvalidPath { path: String },

    /// NOTE(review): not constructed anywhere in the code visible in this file.
    #[error("File contains invalid UTF-8 content")]
    InvalidUtf8,
}
463
464fn analyze_file_type(metadata: &std::fs::Metadata) -> Option<TagSet> {
469 let file_type = metadata.file_type();
470
471 if file_type.is_dir() {
472 return Some(HashSet::from([DIRECTORY]));
473 }
474 if file_type.is_symlink() {
475 return Some(HashSet::from([SYMLINK]));
476 }
477
478 #[cfg(unix)]
480 {
481 use std::os::unix::fs::FileTypeExt;
482 if file_type.is_socket() {
483 return Some(HashSet::from([SOCKET]));
484 }
485 }
486
487 None
489}
490
/// Decides whether the file counts as executable.
///
/// On unix this is true when any execute bit of the mode is set and `path` is ignored.
/// Elsewhere `metadata` is ignored and the answer comes from the extension being `exe`,
/// `bat` or `cmd` (case-insensitive).
fn analyze_permissions<P: AsRef<Path>>(path: P, metadata: &std::fs::Metadata) -> bool {
    #[cfg(unix)]
    {
        use std::os::unix::fs::PermissionsExt;

        // Owner, group and other execute bits.
        const ANY_EXEC_BIT: u32 = 0o111;

        let _ = path;
        (metadata.permissions().mode() & ANY_EXEC_BIT) != 0
    }
    #[cfg(not(unix))]
    {
        let _ = metadata;
        let lowered = path
            .as_ref()
            .extension()
            .and_then(|ext| ext.to_str())
            .map(str::to_lowercase);
        matches!(lowered.as_deref(), Some("exe" | "bat" | "cmd"))
    }
}
513
514fn analyze_filename_and_shebang<P: AsRef<Path>>(path: P, is_executable: bool) -> TagSet {
519 let path = path.as_ref();
520 let mut tags = TagSet::new();
521
522 if let Some(filename) = path.file_name().and_then(|n| n.to_str()) {
524 let filename_tags = tags_from_filename(filename);
525 if !filename_tags.is_empty() {
526 tags.extend(filename_tags);
527 } else if is_executable {
528 if let Ok(shebang_components) = parse_shebang_from_file(path)
530 && let Some(interp) = shebang_components.first()
531 {
532 tags.extend(tags_from_interpreter(interp));
533 }
534 }
535 }
536
537 tags
538}
539
540fn analyze_content_encoding<P: AsRef<Path>>(path: P, existing_tags: &TagSet) -> Result<TagSet> {
544 let mut tags = TagSet::new();
545
546 if !existing_tags.iter().any(|tag| ENCODING_TAGS.contains(tag)) {
548 if file_is_text(path)? {
549 tags.insert(TEXT);
550 } else {
551 tags.insert(BINARY);
552 }
553 }
554
555 Ok(tags)
556}
557
558pub fn tags_from_path<P: AsRef<Path>>(path: P) -> Result<TagSet> {
598 let path = path.as_ref();
599 let path_str = path.to_string_lossy();
600
601 let metadata = match fs::symlink_metadata(path) {
603 Ok(meta) => meta,
604 Err(_) => {
605 return Err(IdentifyError::PathNotFound {
606 path: path_str.to_string(),
607 });
608 }
609 };
610
611 if let Some(file_type_tags) = analyze_file_type(&metadata) {
613 return Ok(file_type_tags);
614 }
615
616 let mut tags = TagSet::new();
618 tags.insert(FILE);
619
620 let is_executable = analyze_permissions(path, &metadata);
622 tags.insert(if is_executable {
623 EXECUTABLE
624 } else {
625 NON_EXECUTABLE
626 });
627
628 tags.extend(analyze_filename_and_shebang(path, is_executable));
630
631 tags.extend(analyze_content_encoding(path, &tags)?);
633
634 Ok(tags)
635}
636
637pub fn tags_from_info(info: &FileInfo<'_>) -> TagSet {
678 match info.file_kind {
679 FileKind::Directory => return HashSet::from([DIRECTORY]),
680 FileKind::Symlink => return HashSet::from([SYMLINK]),
681 FileKind::Socket => return HashSet::from([SOCKET]),
682 FileKind::Regular => {}
683 }
684
685 let mut tags = TagSet::new();
686 tags.insert(FILE);
687 tags.insert(if info.is_executable {
688 EXECUTABLE
689 } else {
690 NON_EXECUTABLE
691 });
692
693 let filename_tags = tags_from_filename(info.filename);
695 if !filename_tags.is_empty() {
696 tags.extend(filename_tags);
697 } else if info.is_executable {
698 if let Some(content) = info.content
700 && let Ok(shebang) = parse_shebang(content)
701 && let Some(interp) = shebang.first()
702 {
703 tags.extend(tags_from_interpreter(interp));
704 }
705 }
706
707 if !tags.iter().any(|tag| ENCODING_TAGS.contains(tag))
709 && let Some(content) = info.content
710 && let Ok(text) = is_text(content)
711 {
712 tags.insert(if text { TEXT } else { BINARY });
713 }
714
715 tags
716}
717
718pub fn tags_from_filename(filename: &str) -> TagSet {
749 let mut tags = TagSet::new();
750
751 for part in std::iter::once(filename).chain(filename.split('.')) {
753 let name_tags = get_name_tags(part);
754 if !name_tags.is_empty() {
755 tags.extend(name_tags);
756 break;
757 }
758 }
759
760 if let Some(ext) = Path::new(filename).extension().and_then(|e| e.to_str()) {
762 let ext_lower = ext.to_lowercase();
763
764 let ext_tags = get_extension_tags(&ext_lower);
765 if !ext_tags.is_empty() {
766 tags.extend(ext_tags);
767 } else {
768 let binary_check_tags = get_extensions_need_binary_check_tags(&ext_lower);
769 if !binary_check_tags.is_empty() {
770 tags.extend(binary_check_tags);
771 }
772 }
773 }
774
775 tags
776}
777
778pub fn tags_from_interpreter(interpreter: &str) -> TagSet {
810 let interpreter_name = interpreter.split('/').next_back().unwrap_or(interpreter);
812
813 let mut current = interpreter_name;
815 while !current.is_empty() {
816 let tags = get_interpreter_tags(current);
817 if !tags.is_empty() {
818 return tags;
819 }
820
821 match current.rfind('.') {
823 Some(pos) => current = ¤t[..pos],
824 None => break,
825 }
826 }
827
828 TagSet::new()
829}
830
831pub fn file_is_text<P: AsRef<Path>>(path: P) -> Result<bool> {
865 let file = fs::File::open(path)?;
866 is_text(file)
867}
868
869pub fn is_text<R: Read>(mut reader: R) -> Result<bool> {
895 const TEXT_BYTES: [bool; 256] = {
899 let mut table = [false; 256];
900 let mut i = 0x20;
901 while i < 0x7F {
902 table[i] = true;
903 i += 1;
904 }
905 let mut i = 0x80;
906 while i < 256 {
907 table[i] = true;
908 i += 1;
909 }
910 table[7] = true;
911 table[8] = true;
912 table[9] = true;
913 table[10] = true;
914 table[11] = true;
915 table[12] = true;
916 table[13] = true;
917 table[27] = true;
918 table
919 };
920
921 let mut buffer = [0; 1024];
922 let bytes_read = reader.read(&mut buffer)?;
923
924 Ok(buffer[..bytes_read].iter().all(|&b| TEXT_BYTES[b as usize]))
925}
926
927pub fn parse_shebang_from_file<P: AsRef<Path>>(path: P) -> Result<ShebangTuple> {
965 let path = path.as_ref();
966
967 let metadata = fs::metadata(path)?;
969 #[cfg(unix)]
970 {
971 use std::os::unix::fs::PermissionsExt;
972 if metadata.permissions().mode() & 0o111 == 0 {
973 return Ok(ShebangTuple::new());
974 }
975 }
976
977 let file = fs::File::open(path)?;
978 parse_shebang(file)
979}
980
981pub fn parse_shebang<R: Read>(reader: R) -> Result<ShebangTuple> {
1010 use std::io::BufRead;
1011
1012 let mut buf_reader = BufReader::new(reader);
1013
1014 let mut first_line_bytes = Vec::new();
1016 match buf_reader.read_until(b'\n', &mut first_line_bytes) {
1017 Ok(0) => return Ok(ShebangTuple::new()), Ok(_) => {
1019 if first_line_bytes.ends_with(b"\n") {
1021 first_line_bytes.pop();
1022 }
1023 if first_line_bytes.ends_with(b"\r") {
1025 first_line_bytes.pop();
1026 }
1027 }
1028 Err(_) => return Ok(ShebangTuple::new()), }
1030
1031 if first_line_bytes.len() < 2 || &first_line_bytes[0..2] != b"#!" {
1033 return Ok(ShebangTuple::new());
1034 }
1035
1036 if first_line_bytes.len() > 1024 {
1038 first_line_bytes.truncate(1024);
1039 }
1040
1041 let first_line = match String::from_utf8(first_line_bytes) {
1043 Ok(line) => line,
1044 Err(_) => return Ok(ShebangTuple::new()),
1045 };
1046
1047 let shebang_line = first_line[2..].trim();
1049
1050 for c in shebang_line.chars() {
1052 if !c.is_ascii() || (c.is_control() && c != '\t') {
1053 return Ok(ShebangTuple::new());
1054 }
1055 }
1056
1057 let parts: smallvec::SmallVec<[&str; 4]> = shebang_line.split_whitespace().collect();
1059 if parts.is_empty() {
1060 return Ok(ShebangTuple::new());
1061 }
1062
1063 let cmd: smallvec::SmallVec<[&str; 2]> = if parts[0] == "/usr/bin/env" {
1064 if parts.len() == 1 {
1065 smallvec::SmallVec::new()
1067 } else if parts.len() >= 2 && parts[1] == "-S" {
1068 if parts.len() > 2 {
1069 parts[2..].iter().copied().collect()
1070 } else {
1071 smallvec::SmallVec::new()
1073 }
1074 } else {
1075 parts[1..].iter().copied().collect()
1076 }
1077 } else {
1078 parts.iter().copied().collect()
1079 };
1080
1081 if cmd.is_empty() {
1082 return Ok(ShebangTuple::new());
1083 }
1084
1085 Ok(ShebangTuple::from_vec(
1087 cmd.iter().map(|s| s.to_string()).collect(),
1088 ))
1089}
1090
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::io::Cursor;
    // Only the tests that set unix mode bits need this; gating it keeps the test module
    // compiling on non-unix targets.
    #[cfg(unix)]
    use std::os::unix::fs::PermissionsExt;
    use tempfile::{NamedTempFile, tempdir};

    /// Builds a `ShebangTuple` from string-like items; with no items, an empty tuple.
    macro_rules! shebang_tuple {
        () => {
            ShebangTuple::new()
        };
        ($($item:expr),+) => {
            ShebangTuple::from_vec(vec![$($item.to_string()),+])
        };
    }

    // ---- tag groups ----

    #[test]
    fn test_all_basic_tags_exist() {
        assert!(TYPE_TAGS.contains("file"));
        assert!(TYPE_TAGS.contains("directory"));
        assert!(MODE_TAGS.contains("executable"));
        assert!(ENCODING_TAGS.contains("text"));
    }

    #[test]
    fn test_tag_groups_are_disjoint() {
        assert!(TYPE_TAGS.is_disjoint(&MODE_TAGS));
        assert!(TYPE_TAGS.is_disjoint(&ENCODING_TAGS));
        assert!(MODE_TAGS.is_disjoint(&ENCODING_TAGS));
    }

    // ---- tags_from_filename ----

    #[test]
    fn test_tags_from_filename_basic() {
        let tags = tags_from_filename("file.py");
        assert!(tags.contains("text"));
        assert!(tags.contains("python"));
    }

    #[test]
    fn test_tags_from_filename_special_names() {
        let tags = tags_from_filename("Dockerfile");
        assert!(tags.contains("dockerfile"));
        assert!(tags.contains("text"));

        let tags = tags_from_filename("Makefile");
        assert!(tags.contains("makefile"));
        assert!(tags.contains("text"));

        let tags = tags_from_filename("Cargo.toml");
        assert!(tags.contains("toml"));
        assert!(tags.contains("cargo"));
    }

    #[test]
    fn test_tags_from_filename_case_insensitive_extension() {
        let tags = tags_from_filename("image.JPG");
        assert!(tags.contains("binary"));
        assert!(tags.contains("image"));
        assert!(tags.contains("jpeg"));
    }

    #[test]
    fn test_tags_from_filename_precedence() {
        let tags = tags_from_filename("setup.cfg");
        assert!(tags.contains("ini"));
    }

    #[test]
    fn test_tags_from_filename_complex_names() {
        let tags = tags_from_filename("Dockerfile.xenial");
        assert!(tags.contains("dockerfile"));

        let tags = tags_from_filename("README.md");
        assert!(tags.contains("markdown"));
        assert!(tags.contains("plain-text"));
    }

    #[test]
    fn test_tags_from_filename_unrecognized() {
        let tags = tags_from_filename("unknown.xyz");
        assert!(tags.is_empty());

        let tags = tags_from_filename("noextension");
        assert!(tags.is_empty());
    }

    // ---- tags_from_interpreter ----

    #[test]
    fn test_tags_from_interpreter_basic() {
        let tags = tags_from_interpreter("python3");
        assert!(tags.contains("python"));
        assert!(tags.contains("python3"));
    }

    #[test]
    fn test_tags_from_interpreter_versioned() {
        let tags = tags_from_interpreter("python3.11.2");
        assert!(tags.contains("python"));
        assert!(tags.contains("python3"));

        let tags = tags_from_interpreter("php8.1");
        assert!(tags.contains("php"));
        assert!(tags.contains("php8"));
    }

    #[test]
    fn test_tags_from_interpreter_with_path() {
        let tags = tags_from_interpreter("/usr/bin/python3");
        assert!(tags.contains("python"));
        assert!(tags.contains("python3"));
    }

    #[test]
    fn test_tags_from_interpreter_unrecognized() {
        let tags = tags_from_interpreter("unknown-interpreter");
        assert!(tags.is_empty());

        let tags = tags_from_interpreter("");
        assert!(tags.is_empty());
    }

    // ---- is_text ----

    #[test]
    fn test_is_text_basic() {
        assert!(is_text(Cursor::new(b"hello world")).unwrap());
        assert!(is_text(Cursor::new(b"")).unwrap());
        assert!(!is_text(Cursor::new(b"hello\x00world")).unwrap());
    }

    #[test]
    fn test_is_text_unicode() {
        assert!(is_text(Cursor::new("éóñəå ⊂(◉‿◉)つ(ノ≥∇≤)ノ".as_bytes())).unwrap());
        assert!(is_text(Cursor::new(r"¯\_(ツ)_/¯".as_bytes())).unwrap());
        assert!(is_text(Cursor::new("♪┏(・o・)┛♪┗ ( ・o・) ┓♪".as_bytes())).unwrap());
    }

    #[test]
    fn test_is_text_binary_data() {
        assert!(!is_text(Cursor::new(&[0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01])).unwrap());
        assert!(!is_text(Cursor::new(&[0x43, 0x92, 0xd9, 0x0f, 0xaf, 0x32, 0x2c])).unwrap());
    }

    // ---- parse_shebang ----

    #[test]
    fn test_parse_shebang_basic() {
        let components = parse_shebang(Cursor::new(b"#!/usr/bin/python")).unwrap();
        assert_eq!(components, shebang_tuple!["/usr/bin/python"]);

        let components = parse_shebang(Cursor::new(b"#!/usr/bin/env python")).unwrap();
        assert_eq!(components, shebang_tuple!["python"]);
    }

    #[test]
    fn test_parse_shebang_env_with_flags() {
        let components = parse_shebang(Cursor::new(b"#!/usr/bin/env -S python -u")).unwrap();
        assert_eq!(components, shebang_tuple!["python", "-u"]);
    }

    #[test]
    fn test_parse_shebang_spaces() {
        let components = parse_shebang(Cursor::new(b"#! /usr/bin/python")).unwrap();
        assert_eq!(components, shebang_tuple!["/usr/bin/python"]);

        let components = parse_shebang(Cursor::new(b"#!/usr/bin/foo python")).unwrap();
        assert_eq!(components, shebang_tuple!["/usr/bin/foo", "python"]);
    }

    #[test]
    fn test_parse_shebang_no_shebang() {
        let components = parse_shebang(Cursor::new(b"import sys")).unwrap();
        assert!(components.is_empty());

        let components = parse_shebang(Cursor::new(b"")).unwrap();
        assert!(components.is_empty());
    }

    #[test]
    fn test_parse_shebang_invalid_utf8() {
        // An error is tolerated; a successful parse must be empty.
        let result = parse_shebang(Cursor::new(&[0x23, 0x21, 0xf9, 0x93, 0x01, 0x42, 0xcd]));
        if let Ok(components) = result {
            assert!(components.is_empty());
        }
    }

    // ---- tags_from_path ----

    #[test]
    fn test_tags_from_path_file_not_found() {
        let result = tags_from_path("/nonexistent/path");
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("does not exist"));
    }

    #[test]
    fn test_tags_from_path_regular_file() {
        let file = NamedTempFile::new().unwrap();
        fs::write(&file, "print('hello')").unwrap();

        let tags = tags_from_path(file.path()).unwrap();
        assert!(tags.contains("file"));
        assert!(tags.contains("non-executable"));
        assert!(tags.contains("text"));
    }

    #[cfg(unix)]
    #[test]
    fn test_tags_from_path_executable_file() {
        let dir = tempdir().unwrap();
        let script_path = dir.path().join("script.py");
        fs::write(&script_path, "#!/usr/bin/env python3\nprint('hello')").unwrap();

        let mut perms = fs::metadata(&script_path).unwrap().permissions();
        perms.set_mode(0o755);
        fs::set_permissions(&script_path, perms).unwrap();

        let tags = tags_from_path(&script_path).unwrap();
        assert!(tags.contains("file"));
        assert!(tags.contains("executable"));
        assert!(tags.contains("python"));
        assert!(tags.contains("text"));
    }

    #[test]
    fn test_tags_from_path_directory() {
        let dir = tempdir().unwrap();
        let tags = tags_from_path(dir.path()).unwrap();
        assert_eq!(tags, HashSet::from(["directory"]));
    }

    #[test]
    fn test_tags_from_path_binary_file() {
        let dir = tempdir().unwrap();
        let binary_path = dir.path().join("binary");
        fs::write(&binary_path, &[0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01]).unwrap();

        let tags = tags_from_path(&binary_path).unwrap();
        assert!(tags.contains("file"));
        assert!(tags.contains("binary"));
        assert!(tags.contains("non-executable"));
    }

    // ---- file_is_text ----

    #[test]
    fn test_file_is_text_simple() {
        let dir = tempdir().unwrap();
        let text_path = dir.path().join("text.txt");
        fs::write(&text_path, "Hello, world!").unwrap();
        assert!(file_is_text(&text_path).unwrap());
    }

    #[test]
    fn test_file_is_text_does_not_exist() {
        let result = file_is_text("/nonexistent/file");
        assert!(result.is_err());
    }

    // ---- extensions that need a content check ----

    #[test]
    fn test_plist_binary_detection() {
        let dir = tempdir().unwrap();
        let plist_path = dir.path().join("test.plist");

        let binary_plist = [
            0x62, 0x70, 0x6c, 0x69, 0x73, 0x74, 0x30, 0x30, 0xd1, 0x01, 0x02, 0x5f, 0x10, 0x0f,
        ];
        fs::write(&plist_path, &binary_plist).unwrap();

        let tags = tags_from_path(&plist_path).unwrap();
        assert!(tags.contains("plist"));
        assert!(tags.contains("binary"));
    }

    #[test]
    fn test_plist_text_detection() {
        let dir = tempdir().unwrap();
        let plist_path = dir.path().join("test.plist");

        let text_plist = r#"<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
 <key>TestKey</key>
 <string>TestValue</string>
</dict>
</plist>"#;
        fs::write(&plist_path, text_plist).unwrap();

        let tags = tags_from_path(&plist_path).unwrap();
        assert!(tags.contains("plist"));
        assert!(tags.contains("text"));
    }

    // ---- edge cases ----

    #[test]
    fn test_empty_file() {
        let dir = tempdir().unwrap();
        let empty_path = dir.path().join("empty");
        fs::write(&empty_path, "").unwrap();

        let tags = tags_from_path(&empty_path).unwrap();
        assert!(tags.contains("file"));
        // Empty content is classified as text.
        assert!(tags.contains("text"));
        assert!(tags.contains("non-executable"));
    }

    #[test]
    fn test_shebang_incomplete() {
        let shebang_incomplete = parse_shebang(Cursor::new(b"#! \n")).unwrap();
        assert!(shebang_incomplete.is_empty());
    }

    #[test]
    fn test_multiple_extensions() {
        let tags = tags_from_filename("backup.tar.gz");
        assert!(tags.contains("binary"));
        assert!(tags.contains("gzip"));
    }

    // ---- FileIdentifier ----

    #[test]
    fn test_file_identifier_default() {
        let dir = tempdir().unwrap();
        let py_file = dir.path().join("test.py");
        fs::write(&py_file, "print('hello')").unwrap();

        let identifier = FileIdentifier::new();
        let tags = identifier.identify(&py_file).unwrap();

        assert!(tags.contains("file"));
        assert!(tags.contains("python"));
        assert!(tags.contains("text"));
        assert!(tags.contains("non-executable"));
    }

    #[test]
    fn test_file_identifier_skip_content_analysis() {
        let dir = tempdir().unwrap();
        let unknown_file = dir.path().join("unknown_file");
        fs::write(&unknown_file, "some content").unwrap();

        let identifier = FileIdentifier::new().skip_content_analysis();
        let tags = identifier.identify(&unknown_file).unwrap();

        assert!(tags.contains("file"));
        assert!(tags.contains("non-executable"));
        assert!(!tags.contains("text"));
        assert!(!tags.contains("binary"));
    }

    #[cfg(unix)]
    #[test]
    fn test_file_identifier_skip_shebang_analysis() {
        let dir = tempdir().unwrap();
        let script_file = dir.path().join("script");
        fs::write(&script_file, "#!/usr/bin/env python3\nprint('hello')").unwrap();

        let mut perms = fs::metadata(&script_file).unwrap().permissions();
        perms.set_mode(0o755);
        fs::set_permissions(&script_file, perms).unwrap();

        let identifier = FileIdentifier::new().skip_shebang_analysis();
        let tags = identifier.identify(&script_file).unwrap();

        assert!(tags.contains("file"));
        assert!(tags.contains("executable"));
        assert!(!tags.contains("python"));
    }

    #[test]
    fn test_file_identifier_custom_extensions() {
        let dir = tempdir().unwrap();
        let custom_file = dir.path().join("test.myext");
        fs::write(&custom_file, "custom content").unwrap();

        let mut custom_extensions = std::collections::HashMap::new();
        custom_extensions.insert("myext".to_string(), HashSet::from(["custom", "text"]));

        let identifier = FileIdentifier::new().with_custom_extensions(custom_extensions);
        let tags = identifier.identify(&custom_file).unwrap();

        assert!(tags.contains("file"));
        assert!(tags.contains("custom"));
        assert!(tags.contains("text"));
        assert!(tags.contains("non-executable"));
    }

    #[test]
    fn test_file_identifier_chaining() {
        let dir = tempdir().unwrap();
        let test_file = dir.path().join("test.unknown");
        fs::write(&test_file, "content").unwrap();

        let identifier = FileIdentifier::new()
            .skip_content_analysis()
            .skip_shebang_analysis();
        let tags = identifier.identify(&test_file).unwrap();

        assert!(tags.contains("file"));
        assert!(tags.contains("non-executable"));
        assert!(!tags.contains("text"));
        assert!(!tags.contains("binary"));
    }

    // ---- shebang parsing, table-driven ----

    #[test]
    fn test_comprehensive_shebang_parsing() {
        // Each case pairs an input with the exact components parse_shebang must return.
        let test_cases: [(&str, &[&str]); 8] = [
            ("", &[]),
            ("#!/usr/bin/python", &["/usr/bin/python"]),
            ("#!/usr/bin/env python", &["python"]),
            ("#! /usr/bin/python", &["/usr/bin/python"]),
            ("#!/usr/bin/foo python", &["/usr/bin/foo", "python"]),
            ("#!/usr/bin/env -S python -u", &["python", "-u"]),
            ("#!/usr/bin/env", &[]),
            ("#!/usr/bin/env -S", &[]),
        ];

        for (input, expected) in test_cases {
            let components = parse_shebang(Cursor::new(input.as_bytes())).unwrap();
            let expected: ShebangTuple = expected.iter().map(|part| part.to_string()).collect();
            assert_eq!(
                components, expected,
                "unexpected components for input: '{input}'"
            );
        }
    }

    #[test]
    fn test_invalid_utf8_shebang() {
        let invalid_utf8_cases = vec![
            &[0xf9, 0x93, 0x01, 0x42, 0xcd][..],
            &[0x23, 0x21, 0xf9, 0x93, 0x01, 0x42, 0xcd][..],
            &[0x23, 0x21, 0x00, 0x00, 0x00, 0x00][..],
        ];

        for input in invalid_utf8_cases {
            // An error is tolerated; a successful parse must be empty.
            if let Ok(components) = parse_shebang(Cursor::new(input)) {
                assert!(components.is_empty());
            }
        }
    }

    // ---- tags_from_info ----

    #[test]
    fn test_tags_from_info_regular_file() {
        let info = FileInfo {
            filename: "script.py",
            file_kind: FileKind::Regular,
            is_executable: false,
            content: Some(b"print('hello')"),
        };
        let tags = tags_from_info(&info);
        assert!(tags.contains("file"));
        assert!(tags.contains("non-executable"));
        assert!(tags.contains("python"));
        assert!(tags.contains("text"));
    }

    #[test]
    fn test_tags_from_info_directory() {
        let info = FileInfo {
            filename: "src",
            file_kind: FileKind::Directory,
            is_executable: false,
            content: None,
        };
        let tags = tags_from_info(&info);
        assert!(tags.contains("directory"));
        assert_eq!(tags.len(), 1);
    }

    #[test]
    fn test_tags_from_info_symlink() {
        let info = FileInfo {
            filename: "link",
            file_kind: FileKind::Symlink,
            is_executable: false,
            content: None,
        };
        let tags = tags_from_info(&info);
        assert!(tags.contains("symlink"));
        assert_eq!(tags.len(), 1);
    }

    #[test]
    fn test_tags_from_info_socket() {
        let info = FileInfo {
            filename: "sock",
            file_kind: FileKind::Socket,
            is_executable: false,
            content: None,
        };
        let tags = tags_from_info(&info);
        assert!(tags.contains("socket"));
        assert_eq!(tags.len(), 1);
    }

    #[test]
    fn test_tags_from_info_executable_with_shebang() {
        let info = FileInfo {
            filename: "my-script",
            file_kind: FileKind::Regular,
            is_executable: true,
            content: Some(b"#!/usr/bin/env python3\nprint('hello')"),
        };
        let tags = tags_from_info(&info);
        assert!(tags.contains("file"));
        assert!(tags.contains("executable"));
        assert!(tags.contains("python"));
        assert!(tags.contains("python3"));
        assert!(tags.contains("text"));
    }

    #[test]
    fn test_tags_from_info_binary_content() {
        let info = FileInfo {
            filename: "data.bin",
            file_kind: FileKind::Regular,
            is_executable: false,
            content: Some(&[0x7f, 0x45, 0x4c, 0x46, 0x00]),
        };
        let tags = tags_from_info(&info);
        assert!(tags.contains("file"));
        assert!(tags.contains("binary"));
    }

    #[test]
    fn test_tags_from_info_no_content() {
        let info = FileInfo {
            filename: "unknown",
            file_kind: FileKind::Regular,
            is_executable: false,
            content: None,
        };
        let tags = tags_from_info(&info);
        assert!(tags.contains("file"));
        assert!(tags.contains("non-executable"));
        assert!(!tags.contains("text"));
        assert!(!tags.contains("binary"));
    }

    #[test]
    fn test_tags_from_info_extension_provides_encoding() {
        let info = FileInfo {
            filename: "app.js",
            file_kind: FileKind::Regular,
            is_executable: false,
            content: None,
        };
        let tags = tags_from_info(&info);
        assert!(tags.contains("javascript"));
        assert!(tags.contains("text"));
    }

    // ---- FileIdentifier::identify_from ----

    #[test]
    fn test_identify_from_with_custom_extensions() {
        let mut custom = std::collections::HashMap::new();
        custom.insert("myext".to_string(), HashSet::from(["text", "custom-lang"]));

        let identifier = FileIdentifier::new().with_custom_extensions(custom);
        let info = FileInfo {
            filename: "code.myext",
            file_kind: FileKind::Regular,
            is_executable: false,
            content: Some(b"some code"),
        };
        let tags = identifier.identify_from(&info);
        assert!(tags.contains("custom-lang"));
        assert!(tags.contains("text"));
    }

    #[test]
    fn test_identify_from_skip_content() {
        let identifier = FileIdentifier::new().skip_content_analysis();
        let info = FileInfo {
            filename: "unknown",
            file_kind: FileKind::Regular,
            is_executable: false,
            content: Some(b"hello world"),
        };
        let tags = identifier.identify_from(&info);
        assert!(!tags.contains("text"));
        assert!(!tags.contains("binary"));
    }

    #[test]
    fn test_identify_from_skip_shebang() {
        let identifier = FileIdentifier::new().skip_shebang_analysis();
        let info = FileInfo {
            filename: "my-script",
            file_kind: FileKind::Regular,
            is_executable: true,
            content: Some(b"#!/usr/bin/env python3\nprint('hello')"),
        };
        let tags = identifier.identify_from(&info);
        assert!(!tags.contains("python"));
        assert!(tags.contains("text"));
    }
}