1use std::borrow::Cow;
22use std::io::{BufRead, Read};
23use std::ops::BitOrAssign;
24use std::path::Path;
25
26#[cfg(feature = "serde")]
27use serde::de::{Error as DeError, SeqAccess, Visitor};
28
29pub mod tags;
30
/// Number of 64-bit words needed so that every entry of `tags::ALL_TAGS` has its own bit.
const TAG_WORDS: usize = tags::ALL_TAGS.len().div_ceil(64);
32
/// A set of tags stored as a fixed-size bitmap.
///
/// Each tag is identified by its index in `tags::ALL_TAGS`; the tag with index `i`
/// is a member of the set when bit `i % 64` of word `i / 64` is set.
#[derive(Clone, Copy, Default)]
pub struct TagSet {
    /// Bitmap words, indexed as described on the type.
    bits: [u64; TAG_WORDS],
}
41
42impl std::fmt::Debug for TagSet {
43 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
44 f.debug_list().entries(self.iter()).finish()
45 }
46}
47
48fn tag_id(tag: &str) -> Option<usize> {
49 tags::ALL_TAGS.binary_search(&tag).ok()
50}
51
/// Iterator over the names of the tags contained in a [`TagSet`].
pub struct TagSetIter<'a> {
    /// Bitmap of the set being iterated.
    bits: &'a [u64; TAG_WORDS],
    /// Index of the next word of `bits` to load into `cur_word`.
    word_idx: usize,
    /// Bits of the most recently loaded word that have not been yielded yet.
    cur_word: u64,
}
57
58impl Iterator for TagSetIter<'_> {
59 type Item = &'static str;
60
61 fn next(&mut self) -> Option<Self::Item> {
62 loop {
63 if self.cur_word != 0 {
64 let tz = self.cur_word.trailing_zeros() as usize;
66 self.cur_word &= self.cur_word - 1;
68
69 let idx = (self.word_idx.saturating_sub(1) * 64) + tz;
72 return tags::ALL_TAGS.get(idx).copied();
73 }
74
75 if self.word_idx >= TAG_WORDS {
76 return None;
77 }
78
79 self.cur_word = self.bits[self.word_idx];
80 self.word_idx += 1;
81 }
82 }
83}
84
85impl TagSet {
86 pub const fn new(tag_ids: &[u16]) -> Self {
91 let mut bits = [0u64; TAG_WORDS];
92 let mut idx = 0;
93 while idx < tag_ids.len() {
94 let tag_id = tag_ids[idx] as usize;
95 assert!(tag_id < tags::ALL_TAGS.len(), "tag id out of range");
96 bits[tag_id / 64] |= 1u64 << (tag_id % 64);
97 idx += 1;
98 }
99
100 Self { bits }
101 }
102
103 pub fn from_tags<I, S>(tags: I) -> Self
107 where
108 I: IntoIterator<Item = S>,
109 S: AsRef<str>,
110 {
111 let mut bits = [0u64; TAG_WORDS];
112 for tag in tags {
113 let tag = tag.as_ref();
114 let Some(tag_id) = tag_id(tag) else {
115 debug_assert!(false, "unknown tag: {tag}");
116 continue;
117 };
118 bits[tag_id / 64] |= 1u64 << (tag_id % 64);
119 }
120
121 Self { bits }
122 }
123
124 pub const fn insert(&mut self, tag_id: u16) {
125 let tag_id = tag_id as usize;
126 assert!(tag_id < tags::ALL_TAGS.len(), "tag id out of range");
127 self.bits[tag_id / 64] |= 1u64 << (tag_id % 64);
128 }
129
130 pub fn is_disjoint(&self, other: &TagSet) -> bool {
132 for idx in 0..TAG_WORDS {
133 if (self.bits[idx] & other.bits[idx]) != 0 {
134 return false;
135 }
136 }
137 true
138 }
139
140 pub fn is_subset(&self, other: &TagSet) -> bool {
142 for idx in 0..TAG_WORDS {
143 if (self.bits[idx] & !other.bits[idx]) != 0 {
144 return false;
145 }
146 }
147 true
148 }
149
150 pub fn iter(&self) -> TagSetIter<'_> {
152 TagSetIter {
153 bits: &self.bits,
154 word_idx: 0,
155 cur_word: 0,
156 }
157 }
158
159 pub fn is_empty(&self) -> bool {
161 self.bits.iter().all(|&w| w == 0)
162 }
163}
164
165impl BitOrAssign<&TagSet> for TagSet {
166 fn bitor_assign(&mut self, rhs: &TagSet) {
167 for idx in 0..TAG_WORDS {
168 self.bits[idx] |= rhs.bits[idx];
169 }
170 }
171}
172
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for TagSet {
    /// Deserializes a sequence of tag names into a [`TagSet`].
    ///
    /// # Errors
    ///
    /// Fails with a custom error when an element is not a name known to `tag_id`.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        /// Visitor that folds every tag name of a sequence into a single set.
        struct TagSetVisitor;

        impl<'de> Visitor<'de> for TagSetVisitor {
            type Value = TagSet;

            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
                formatter.write_str("a sequence of tag strings")
            }

            fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
            where
                A: SeqAccess<'de>,
            {
                let mut set = TagSet::default();
                while let Some(name) = seq.next_element::<Cow<str>>()? {
                    let index = tag_id(&name).ok_or_else(|| {
                        A::Error::custom(format!(
                            "Type tag `{name}` is not recognized. Check for typos or upgrade prek to get new tags."
                        ))
                    })?;
                    // `insert` takes a `u16`, so the table index has to be narrowed first.
                    let index = u16::try_from(index)
                        .map_err(|_| A::Error::custom("tag id out of range"))?;
                    set.insert(index);
                }
                Ok(set)
            }
        }

        deserializer.deserialize_seq(TagSetVisitor)
    }
}
211
/// JSON schema for [`TagSet`]: an array of unique strings, inlined at the use site.
#[cfg(feature = "schemars")]
impl schemars::JsonSchema for TagSet {
    fn schema_name() -> Cow<'static, str> {
        "TagSet".into()
    }

    fn inline_schema() -> bool {
        true
    }

    fn json_schema(_generator: &mut schemars::SchemaGenerator) -> schemars::Schema {
        schemars::json_schema!({
            "type": "array",
            "items": {
                "type": "string",
            },
            "uniqueItems": true,
        })
    }
}
232
/// Error type returned by [`tags_from_path`].
#[derive(Debug, thiserror::Error)]
pub enum Error {
    /// An underlying I/O operation failed; displayed as the wrapped error.
    #[error(transparent)]
    Io(#[from] std::io::Error),

    /// Wraps a [`ShebangError`]; displayed as the wrapped error.
    #[error(transparent)]
    Shebang(#[from] ShebangError),
}
241
242pub fn tags_from_path(path: &Path) -> Result<TagSet, Error> {
244 let metadata = std::fs::symlink_metadata(path)?;
245 if metadata.is_dir() {
246 return Ok(tags::TAG_SET_DIRECTORY);
247 } else if metadata.is_symlink() {
248 return Ok(tags::TAG_SET_SYMLINK);
249 }
250 #[cfg(unix)]
251 {
252 use std::os::unix::fs::FileTypeExt;
253 let file_type = metadata.file_type();
254 if file_type.is_socket() {
255 return Ok(tags::TAG_SET_SOCKET);
256 }
257 };
258
259 let mut tags = tags::TAG_SET_FILE;
260
261 let executable;
262 #[cfg(unix)]
263 {
264 use std::os::unix::fs::PermissionsExt;
265 executable = metadata.permissions().mode() & 0o111 != 0;
266 }
267 #[cfg(not(unix))]
268 {
269 executable = true;
273 }
274
275 if executable {
276 tags.insert(tags::TAG_EXECUTABLE);
277 } else {
278 tags.insert(tags::TAG_NON_EXECUTABLE);
279 }
280
281 let filename_tags = tags_from_filename(path);
282 tags |= &filename_tags;
283 if executable {
284 if let Ok(shebang) = parse_shebang(path) {
285 let interpreter_tags = tags_from_interpreter(shebang[0].as_str());
286 tags |= &interpreter_tags;
287 }
288 }
289
290 if tags.is_disjoint(&tags::TAG_SET_TEXT_OR_BINARY) {
291 if is_text_file(path) {
292 tags.insert(tags::TAG_TEXT);
293 } else {
294 tags.insert(tags::TAG_BINARY);
295 }
296 }
297
298 Ok(tags)
299}
300
301fn tags_from_filename(filename: &Path) -> TagSet {
302 let ext = filename.extension().and_then(|ext| ext.to_str());
303 let filename = filename
304 .file_name()
305 .and_then(|name| name.to_str())
306 .expect("Invalid filename");
307
308 let mut result = tags::NAMES
309 .get(filename)
310 .or_else(|| {
311 filename
313 .split('.')
314 .next()
315 .and_then(|name| tags::NAMES.get(name))
316 })
317 .copied()
318 .unwrap_or_default();
319
320 if let Some(ext) = ext {
321 if ext.chars().all(|c| c.is_ascii_lowercase()) {
323 if let Some(tags) = tags::EXTENSIONS.get(ext) {
324 result |= tags;
325 }
326 } else {
327 let ext_lower = ext.to_ascii_lowercase();
328 if let Some(tags) = tags::EXTENSIONS.get(ext_lower.as_str()) {
329 result |= tags;
330 }
331 }
332 }
333
334 result
335}
336
337fn tags_from_interpreter(interpreter: &str) -> TagSet {
338 let mut name = interpreter
339 .rfind('/')
340 .map(|pos| &interpreter[pos + 1..])
341 .unwrap_or(interpreter);
342
343 while !name.is_empty() {
344 if let Some(tags) = tags::INTERPRETERS.get(name) {
345 return *tags;
346 }
347
348 if let Some(pos) = name.rfind('.') {
350 name = &name[..pos];
351 } else {
352 break;
353 }
354 }
355
356 TagSet::default()
357}
358
/// Reasons why [`parse_shebang`] can fail.
#[derive(thiserror::Error, Debug)]
pub enum ShebangError {
    /// The first line of the file does not start with `#!`.
    #[error("No shebang found")]
    NoShebang,
    /// The shebang line contains a byte outside `0x20..=0x7E` and `0x09..=0x0D`.
    #[error("Shebang contains non-printable characters")]
    NonPrintableChars,
    /// The shebang line could not be split into shell-style tokens.
    #[error("Failed to parse shebang")]
    ParseFailed,
    /// Nothing is left of the shebang once a leading `env` invocation is removed.
    #[error("No command found in shebang")]
    NoCommand,
    /// The file could not be opened or its first line could not be read.
    #[error("IO error: {0}")]
    IoError(#[from] std::io::Error),
}
372
/// Returns `true` when the first `prefix.len()` elements of `slice` equal `prefix`.
///
/// An empty `prefix` matches any slice; a `slice` shorter than `prefix` never matches.
fn starts_with(slice: &[String], prefix: &[&str]) -> bool {
    match slice.get(..prefix.len()) {
        Some(head) => head.iter().zip(prefix).all(|(have, want)| have == want),
        None => false,
    }
}
376
377fn parse_nix_shebang<R: BufRead>(reader: &mut R, mut cmd: Vec<String>) -> Vec<String> {
382 loop {
383 let Ok(buf) = reader.fill_buf() else {
384 break;
385 };
386
387 if buf.len() < 2 || &buf[..2] != b"#!" {
388 break;
389 }
390
391 reader.consume(2);
392
393 let mut next_line = String::new();
394 match reader.read_line(&mut next_line) {
395 Ok(0) => break,
396 Ok(_) => {}
397 Err(err) => {
398 if err.kind() == std::io::ErrorKind::InvalidData {
399 return cmd;
400 }
401 break;
402 }
403 }
404
405 let trimmed = next_line.trim();
406 if trimmed.is_empty() {
407 continue;
408 }
409
410 if let Some(line_tokens) = shlex::split(trimmed) {
411 for idx in 0..line_tokens.len().saturating_sub(1) {
412 if line_tokens[idx] == "-i" {
413 if let Some(interpreter) = line_tokens.get(idx + 1) {
414 cmd = vec![interpreter.clone()];
415 }
416 }
417 }
418 }
419 }
420
421 cmd
422}
423
424pub fn parse_shebang(path: &Path) -> Result<Vec<String>, ShebangError> {
425 let file = std::fs::File::open(path)?;
426 let mut reader = std::io::BufReader::new(file);
427 let mut line = String::new();
428 reader.read_line(&mut line)?;
429 if !line.starts_with("#!") {
430 return Err(ShebangError::NoShebang);
431 }
432
433 if line
435 .bytes()
436 .any(|b| !(0x20..=0x7E).contains(&b) && !(0x09..=0x0D).contains(&b))
437 {
438 return Err(ShebangError::NonPrintableChars);
439 }
440
441 let mut tokens = shlex::split(line[2..].trim()).ok_or(ShebangError::ParseFailed)?;
442 let mut cmd =
443 if starts_with(&tokens, &["/usr/bin/env", "-S"]) || starts_with(&tokens, &["env", "-S"]) {
444 tokens.drain(0..2);
445 tokens
446 } else if starts_with(&tokens, &["/usr/bin/env"]) || starts_with(&tokens, &["env"]) {
447 tokens.drain(0..1);
448 tokens
449 } else {
450 tokens
451 };
452 if cmd.is_empty() {
453 return Err(ShebangError::NoCommand);
454 }
455 if cmd[0] == "nix-shell" {
456 cmd = parse_nix_shebang(&mut reader, cmd);
457 }
458 if cmd.is_empty() {
459 return Err(ShebangError::NoCommand);
460 }
461
462 Ok(cmd)
463}
464
/// Bitmap over all 256 byte values; bit `b % 32` of word `b / 32` is set when
/// byte `b` counts as text.
///
/// Text bytes are `0x20..=0x7E`, every byte `>= 0x80`, and the control bytes
/// 7 through 13 and 27.
static IS_TEXT_CHAR: [u32; 8] = {
    /// Classifies a single byte value.
    const fn counts_as_text(byte: usize) -> bool {
        match byte {
            7..=13 | 27 => true,
            0x20..=0x7E => true,
            0x7F => false,
            _ => byte >= 0x80,
        }
    }

    let mut bitmap = [0u32; 8];
    // `for` loops are not available in constant evaluation, hence the manual counter.
    let mut byte: usize = 0;
    while byte < 256 {
        if counts_as_text(byte) {
            bitmap[byte / 32] |= 1u32 << (byte % 32);
        }
        byte += 1;
    }
    bitmap
};
482
483fn is_text_char(b: u8) -> bool {
484 let idx = b as usize;
485 (IS_TEXT_CHAR[idx / 32] & (1 << (idx % 32))) != 0
486}
487
488fn is_text_file(path: &Path) -> bool {
493 let mut buffer = [0; 1024];
494 let Ok(mut file) = fs_err::File::open(path) else {
495 return false;
496 };
497
498 let Ok(bytes_read) = file.read(&mut buffer) else {
499 return false;
500 };
501 if bytes_read == 0 {
502 return true;
503 }
504
505 buffer[..bytes_read].iter().all(|&b| is_text_char(b))
506}
507
#[cfg(test)]
mod tests {
    use super::{TagSet, tags};
    use std::io::Write;
    use std::path::Path;

    /// Asserts that `actual` contains exactly the tags named in `expected`, in any order.
    fn assert_tagset(actual: &TagSet, expected: &[&'static str]) {
        let mut actual_vec: Vec<_> = actual.iter().collect();
        actual_vec.sort_unstable();
        let mut expected_vec = expected.to_vec();
        expected_vec.sort_unstable();
        assert_eq!(actual_vec, expected_vec);
    }

    /// Directory, regular file and symlink are each classified on Unix.
    #[test]
    #[cfg(unix)]
    fn tags_from_path() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;
        let src = dir.path().join("source.txt");
        let dest = dir.path().join("link.txt");
        fs_err::File::create(&src)?;
        std::os::unix::fs::symlink(&src, &dest)?;

        let tags = super::tags_from_path(dir.path())?;
        assert_tagset(&tags, &["directory"]);
        let tags = super::tags_from_path(&src)?;
        assert_tagset(&tags, &["plain-text", "non-executable", "file", "text"]);
        // The link itself is reported, not the file it points to.
        let tags = super::tags_from_path(&dest)?;
        assert_tagset(&tags, &["symlink"]);

        Ok(())
    }

    /// Directory and regular file are classified on Windows, where files count as executable.
    #[test]
    #[cfg(windows)]
    fn tags_from_path() -> anyhow::Result<()> {
        let dir = tempfile::tempdir()?;
        let src = dir.path().join("source.txt");
        fs_err::File::create(&src)?;

        let tags = super::tags_from_path(dir.path())?;
        assert_tagset(&tags, &["directory"]);
        let tags = super::tags_from_path(&src)?;
        assert_tagset(&tags, &["plain-text", "executable", "file", "text"]);

        Ok(())
    }

    /// File names map to tags through their extension, their full name, or their first segment.
    #[test]
    fn tags_from_filename() {
        let tags = super::tags_from_filename(Path::new("test.py"));
        assert_tagset(&tags, &["python", "text"]);

        let tags = super::tags_from_filename(Path::new("bitbake.bbappend"));
        assert_tagset(&tags, &["bitbake", "text"]);

        let tags = super::tags_from_filename(Path::new("project.fsproj"));
        assert_tagset(&tags, &["fsproj", "msbuild", "text", "xml"]);

        let tags = super::tags_from_filename(Path::new("data.json"));
        assert_tagset(&tags, &["json", "text"]);

        let tags = super::tags_from_filename(Path::new("build.props"));
        assert_tagset(&tags, &["msbuild", "text", "xml"]);

        let tags = super::tags_from_filename(Path::new("profile.psd1"));
        assert_tagset(&tags, &["powershell", "text"]);

        let tags = super::tags_from_filename(Path::new("style.xslt"));
        assert_tagset(&tags, &["text", "xml", "xsl"]);

        let tags = super::tags_from_filename(Path::new("Pipfile"));
        assert_tagset(&tags, &["toml", "text"]);

        let tags = super::tags_from_filename(Path::new("Pipfile.lock"));
        assert_tagset(&tags, &["json", "text"]);

        let tags = super::tags_from_filename(Path::new("file.pdf"));
        assert_tagset(&tags, &["pdf", "binary"]);

        // Extension matching is case-insensitive.
        let tags = super::tags_from_filename(Path::new("FILE.PDF"));
        assert_tagset(&tags, &["pdf", "binary"]);

        let tags = super::tags_from_filename(Path::new(".envrc"));
        assert_tagset(&tags, &["bash", "shell", "text"]);

        let tags = super::tags_from_filename(Path::new("meson.options"));
        assert_tagset(&tags, &["meson", "meson-options", "text"]);

        let tags = super::tags_from_filename(Path::new("Tiltfile"));
        assert_tagset(&tags, &["text", "tiltfile"]);

        // Falls back to the segment before the first dot.
        let tags = super::tags_from_filename(Path::new("Tiltfile.dev"));
        assert_tagset(&tags, &["text", "tiltfile"]);
    }

    /// Interpreter paths and version suffixes are stripped before the lookup.
    #[test]
    fn tags_from_interpreter() {
        let tags = super::tags_from_interpreter("/usr/bin/python3");
        assert_tagset(&tags, &["python", "python3"]);

        let tags = super::tags_from_interpreter("/usr/bin/python3.12");
        assert_tagset(&tags, &["python", "python3"]);

        let tags = super::tags_from_interpreter("/usr/bin/python3.12.3");
        assert_tagset(&tags, &["python", "python3"]);

        let tags = super::tags_from_interpreter("python");
        assert_tagset(&tags, &["python"]);

        let tags = super::tags_from_interpreter("sh");
        assert_tagset(&tags, &["shell", "sh"]);

        let tags = super::tags_from_interpreter("invalid");
        assert!(tags.is_empty());
    }

    /// `TagSet::new` deduplicates ids and iteration follows tag-index order.
    #[test]
    fn tagset_new_iter_and_is_empty() {
        let empty = TagSet::new(&[]);
        assert!(empty.is_empty());
        assert_eq!(empty.iter().count(), 0);

        let binary_id = u16::try_from(super::tag_id("binary").expect("binary id")).unwrap();
        let text_id = u16::try_from(super::tag_id("text").expect("text id")).unwrap();
        let set = TagSet::new(&[text_id, binary_id, text_id]);

        assert!(!set.is_empty());
        assert_eq!(set.iter().collect::<Vec<_>>(), vec!["binary", "text"]);
    }

    /// Subset, disjointness and in-place union behave like set operations.
    #[test]
    fn tagset_from_tags_intersects_subset_and_bitor_assign() {
        let a = TagSet::from_tags(["python", "text"]);
        let b = TagSet::from_tags(["python"]);
        let c = TagSet::from_tags(["binary"]);

        assert!(b.is_subset(&a));
        assert!(!a.is_subset(&b));
        assert!(!a.is_disjoint(&b));
        assert!(a.is_disjoint(&c));

        let mut merged = b;
        merged |= &c;
        assert_tagset(&merged, &["python", "binary"]);
    }

    /// The first id past the end of the tag table is rejected with a panic.
    #[test]
    fn tagset_new_panics_on_out_of_range_id() {
        let out_of_range = u16::try_from(tags::ALL_TAGS.len()).unwrap();
        let result = std::panic::catch_unwind(|| TagSet::new(&[out_of_range]));
        assert!(result.is_err());
    }

    /// A JSON array of known tag names deserializes into the matching set.
    #[cfg(feature = "serde")]
    #[test]
    fn tagset_deserialize_from_string_slice() {
        let parsed: TagSet =
            serde_json::from_str(r#"["python","text"]"#).expect("should parse tags");
        assert_tagset(&parsed, &["python", "text"]);
    }

    /// An unknown tag name is reported with a message naming the offending tag.
    #[cfg(feature = "serde")]
    #[test]
    fn tagset_deserialize_unknown_tag_errors() {
        let err = serde_json::from_str::<TagSet>(r#"["not-a-real-tag"]"#).unwrap_err();
        assert!(
            err.to_string()
                .contains("Type tag `not-a-real-tag` is not recognized"),
            "unexpected error: {err}"
        );
    }

    /// A `-i` option on a following `#! nix-shell` line selects the interpreter.
    #[test]
    fn parse_shebang_nix_shell_interpreter() -> anyhow::Result<()> {
        let mut file = tempfile::NamedTempFile::new()?;
        writeln!(
            file,
            indoc::indoc! {r#"
                #!/usr/bin/env nix-shell
                #! nix-shell --pure -i bash -p "python3.withPackages (p: [ p.numpy p.sympy ])"
                #! nix-shell -I nixpkgs=https://example.com
                echo hi
            "#}
        )?;
        file.flush()?;

        let cmd = super::parse_shebang(file.path())?;
        assert_eq!(cmd, vec!["bash"]);

        Ok(())
    }

    /// Without a `-i` option the original `nix-shell` command line is kept.
    #[test]
    fn parse_shebang_nix_shell_without_interpreter() -> anyhow::Result<()> {
        let mut file = tempfile::NamedTempFile::new()?;
        writeln!(
            file,
            indoc::indoc! {r"
                #!/usr/bin/env nix-shell -p python3
                #! nix-shell --pure -I nixpkgs=https://example.com
                echo hi
            "}
        )?;
        file.flush()?;

        let cmd = super::parse_shebang(file.path())?;
        assert_eq!(cmd, vec!["nix-shell", "-p", "python3"]);

        Ok(())
    }
}