1use glob::MatchOptions;
2use regex::Regex;
3use std::collections::HashMap;
4use std::ffi::OsStr;
5use std::path::Path;
6use std::str::FromStr;
7use std::sync::LazyLock;
8
9const GLOB_MATCH_OPTIONS: MatchOptions = MatchOptions {
11 case_sensitive: true,
12 require_literal_separator: true,
13 require_literal_leading_dot: false,
14};
15
16macro_rules! _include {
17 ($path:literal) => {
18 include!(concat!(env!("OUT_DIR"), "/languages/", $path));
19 };
20}
21
22_include!("language.rs");
23_include!("category_mixin.rs");
24_include!("name_mixin.rs");
25_include!("parse_variant_mixin.rs");
26_include!("color_hex_mixin.rs");
27_include!("color_rgb_mixin.rs");
28_include!("nerd_font_glyph_mixin.rs");
29_include!("priority_mixin.rs");
30_include!("from_extension_mixin.rs");
31_include!("from_filename_mixin.rs");
32_include!("from_interpreter_mixin.rs");
33_include!("glob_mappings_mixin.rs");
34_include!("heuristic_mappings_mixin.rs");
35
36impl Language {
37 fn from_path_extension(path: impl AsRef<Path>) -> Vec<Self> {
39 let extension = path.as_ref().extension().and_then(|ext| ext.to_str());
40 extension.map_or(vec![], Self::from_extension)
41 }
42
43 fn from_path_filename(path: impl AsRef<Path>) -> Vec<Self> {
45 let filename = path
46 .as_ref()
47 .file_name()
48 .and_then(|filename| filename.to_str());
49 filename.map_or(vec![], Self::from_filename)
50 }
51
52 fn from_shebang(contents: &[u8]) -> Vec<Self> {
54 const MAX_SHEBANG_LENGTH: usize = 50;
55
56 let mut lines = contents.split(|&c| c == b'\n');
57 let first_line = lines.next().unwrap_or_default();
58 if first_line.len() < 2 || first_line[0] != b'#' || first_line[1] != b'!' {
59 return vec![];
60 }
61 let first_line = if first_line.len() > MAX_SHEBANG_LENGTH {
62 &first_line[..MAX_SHEBANG_LENGTH]
63 } else {
64 first_line
65 };
66 let first_line = String::from_utf8_lossy(first_line);
67 let first_line = first_line.trim_end();
69
70 static RE: LazyLock<Regex> = LazyLock::new(|| {
71 Regex::new(r"^#!(?:/usr(?:/local)?)?/bin/(?:env\s+)?([\w\d]+)\r?$").unwrap()
72 });
73
74 RE.captures(first_line)
75 .and_then(|c| c.get(1))
76 .map_or(vec![], |m| {
77 let interpreter = m.as_str();
78 Self::from_interpreter(interpreter)
79 })
80 }
81
82 pub fn from_glob(path: impl AsRef<Path>) -> Vec<Self> {
84 let path = path.as_ref();
85
86 struct GlobMapping {
87 patterns: Vec<glob::Pattern>,
88 language: Language,
89 }
90 static GLOB_MAPPINGS: LazyLock<Vec<GlobMapping>> = LazyLock::new(|| {
91 Language::glob_mappings()
92 .into_iter()
93 .map(|(patterns, language)| {
94 let patterns = patterns
95 .into_iter()
96 .map(|pattern| glob::Pattern::new(pattern).unwrap())
97 .collect();
98 GlobMapping { patterns, language }
99 })
100 .collect()
101 });
102
103 GLOB_MAPPINGS
104 .iter()
105 .filter(|gm| {
106 gm.patterns
107 .iter()
108 .any(|p| p.matches_path_with(path.as_ref(), GLOB_MATCH_OPTIONS))
109 })
110 .map(|gm| gm.language)
111 .collect()
112 }
113
114 fn filter_by_heuristics(languages: &[Self], contents: &str) -> Vec<Self> {
116 static HEURISTICS: LazyLock<HashMap<Language, Vec<Regex>>> = LazyLock::new(|| {
117 Language::heuristic_mappings()
118 .into_iter()
119 .map(|(language, patterns)| {
120 let patterns = patterns
121 .into_iter()
122 .map(|pattern| Regex::new(pattern).unwrap())
123 .collect();
124 (language, patterns)
125 })
126 .collect()
127 });
128
129 languages
130 .iter()
131 .filter(|language| {
132 HEURISTICS
133 .get(language)
134 .is_some_and(|heuristics| heuristics.iter().any(|re| re.is_match(contents)))
135 })
136 .cloned()
137 .collect()
138 }
139
140 fn find_simple(path: impl AsRef<Path>, contents: &[u8]) -> Vec<Self> {
143 let languages = Self::from_shebang(contents);
144 if !languages.is_empty() {
145 return languages;
146 }
147 let languages = Self::from_path_filename(&path);
148 if !languages.is_empty() {
149 return languages;
150 }
151 let languages = Self::from_glob(&path);
152 if !languages.is_empty() {
153 return languages;
154 }
155 Self::from_path_extension(&path)
156 }
157
158 pub fn pick(path: impl AsRef<Path>, contents: &[u8], read_limit: usize) -> Option<Self> {
162 let path = path.as_ref();
163 let path = Self::maybe_strip_example_extension(path);
164 let languages = Self::find_simple(path, contents);
165 if languages.len() == 1 {
166 return Some(languages[0]);
167 }
168
169 let contents = if contents.len() > read_limit {
170 &contents[..read_limit]
171 } else {
172 contents
173 };
174 let heuristic_contents = std::str::from_utf8(contents).unwrap_or_default();
175 let by_heuristics = Self::filter_by_heuristics(&languages, heuristic_contents);
176
177 let found_languages = match by_heuristics.len() {
178 0 => languages,
179 1 => return Some(by_heuristics[0]),
180 _ => by_heuristics,
181 };
182
183 found_languages.into_iter().max_by_key(Self::priority)
184 }
185
/// Removes a trailing `.example` from `path`, if present.
///
/// For instance `config/data.json.example` becomes `config/data.json`.
/// The path is returned unchanged when it does not end in `.example`, or
/// when stripping would leave no file name in the same directory (a file
/// literally named `.example`), since the result would otherwise refer to
/// the parent directory instead of the file.
fn maybe_strip_example_extension(path: &Path) -> &Path {
    const EXAMPLE_EXT: &[u8] = b".example";

    let Some(bytes) = path
        .as_os_str()
        .as_encoded_bytes()
        .strip_suffix(EXAMPLE_EXT)
    else {
        return path;
    };

    // SAFETY: `bytes` comes from `as_encoded_bytes` on a valid `OsStr` and is
    // split immediately before `.example`, a non-empty valid UTF-8 substring,
    // which is a split point `from_encoded_bytes_unchecked` permits.
    let stripped = Path::new(unsafe { OsStr::from_encoded_bytes_unchecked(bytes) });

    // Only accept the stripped path if it still names a file in the same
    // directory; `dir/.example` must not collapse to `dir/`.
    if stripped.file_name().is_some() && stripped.parent() == path.parent() {
        stripped
    } else {
        path
    }
}
200
201 const fn serialize(&self) -> Serialize {
205 Serialize {
206 name: self.name(),
207 category: self.category(),
208 hex: self.hex(),
209 nerd_font_glyph: self.nerd_font_glyph(),
210 }
211 }
212}
213
/// Error returned when a string cannot be parsed into a [`Language`].
///
/// Implements [`std::error::Error`] so it can be boxed, propagated with `?`
/// into generic error types, and displayed to users.
#[derive(Debug, PartialEq, Eq)]
pub struct ParseError;

impl std::fmt::Display for ParseError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("unrecognized language")
    }
}

impl std::error::Error for ParseError {}
216
217impl FromStr for Language {
218 type Err = ParseError;
219
220 fn from_str(s: &str) -> Result<Self, Self::Err> {
223 Self::parse_variant(s).ok_or(ParseError)
224 }
225}
226
227impl serde::Serialize for Language {
228 fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
230 where
231 S: serde::Serializer,
232 {
233 Self::serialize(self).serialize(serializer)
235 }
236}
237
238#[non_exhaustive]
240#[derive(Clone, Debug, serde::Deserialize, Eq, Hash, PartialEq, serde::Serialize)]
241#[serde(rename_all = "lowercase")]
242pub enum Category {
243 Data,
245 Markup,
247 Pattern,
249 Programming,
251 Prose,
253 Query,
255}
256
257#[derive(Debug, serde::Serialize)]
259struct Serialize {
260 name: &'static str,
261 category: Category,
262 hex: &'static str,
263 nerd_font_glyph: Option<&'static str>,
264}
265
#[cfg(test)]
mod language_tests {
    use super::*;
    use rstest::rstest;

    // Each shebang form must yield `language` among the detected candidates.
    #[rstest(
        shebang,
        language,
        case::simple(b"#!/bin/sh", Language::Shell),
        case::unix_newline(b"#!/bin/sh\n", Language::Shell),
        case::windows_newline(b"#!/bin/sh\r\n", Language::Shell),
        case::with_env(b"#!/usr/bin/env sh\r\n", Language::Shell)
    )]
    fn test_from_shebang(shebang: &[u8], language: Language) {
        let detected = Language::from_shebang(shebang);
        assert!(detected.contains(&language));
    }

    // A trailing `.example` is removed; other paths are returned unchanged.
    #[rstest(
        input,
        expected,
        case("path/to/data.json", "path/to/data.json"),
        case("path/to/data.json.example", "path/to/data.json")
    )]
    fn test_maybe_strip_example_extension(input: &str, expected: &str) {
        let stripped = Language::maybe_strip_example_extension(Path::new(input));
        assert_eq!(stripped, Path::new(expected));
    }
}