gengo_language/
lib.rs

1use glob::MatchOptions;
2use regex::Regex;
3use std::collections::HashMap;
4use std::ffi::OsStr;
5use std::path::Path;
6use std::str::FromStr;
7use std::sync::LazyLock;
8
9/// Copied glob options from the main crate.
10const GLOB_MATCH_OPTIONS: MatchOptions = MatchOptions {
11    case_sensitive: true,
12    require_literal_separator: true,
13    require_literal_leading_dot: false,
14};
15
/// Textually includes `$OUT_DIR/languages/<file>` at the call site.
///
/// `OUT_DIR` is read at compile time via `env!`, so the files are presumably written there by
/// the crate's build script.
macro_rules! _include {
    ($file:literal) => {
        include!(concat!(env!("OUT_DIR"), "/languages/", $file));
    };
}
21
22_include!("language.rs");
23_include!("category_mixin.rs");
24_include!("name_mixin.rs");
25_include!("parse_variant_mixin.rs");
26_include!("color_hex_mixin.rs");
27_include!("color_rgb_mixin.rs");
28_include!("nerd_font_glyph_mixin.rs");
29_include!("priority_mixin.rs");
30_include!("from_extension_mixin.rs");
31_include!("from_filename_mixin.rs");
32_include!("from_interpreter_mixin.rs");
33_include!("glob_mappings_mixin.rs");
34_include!("heuristic_mappings_mixin.rs");
35
36impl Language {
37    /// Gets languages from a path's extension.
38    fn from_path_extension(path: impl AsRef<Path>) -> Vec<Self> {
39        let extension = path.as_ref().extension().and_then(|ext| ext.to_str());
40        extension.map_or(vec![], Self::from_extension)
41    }
42
43    /// Gets languages from a path's filename.
44    fn from_path_filename(path: impl AsRef<Path>) -> Vec<Self> {
45        let filename = path
46            .as_ref()
47            .file_name()
48            .and_then(|filename| filename.to_str());
49        filename.map_or(vec![], Self::from_filename)
50    }
51
52    /// Gets languages by a shebang.
53    fn from_shebang(contents: &[u8]) -> Vec<Self> {
54        const MAX_SHEBANG_LENGTH: usize = 50;
55
56        let mut lines = contents.split(|&c| c == b'\n');
57        let first_line = lines.next().unwrap_or_default();
58        if first_line.len() < 2 || first_line[0] != b'#' || first_line[1] != b'!' {
59            return vec![];
60        }
61        let first_line = if first_line.len() > MAX_SHEBANG_LENGTH {
62            &first_line[..MAX_SHEBANG_LENGTH]
63        } else {
64            first_line
65        };
66        let first_line = String::from_utf8_lossy(first_line);
67        // NOTE Handle trailing spaces, `\r`, etc.
68        let first_line = first_line.trim_end();
69
70        static RE: LazyLock<Regex> = LazyLock::new(|| {
71            Regex::new(r"^#!(?:/usr(?:/local)?)?/bin/(?:env\s+)?([\w\d]+)\r?$").unwrap()
72        });
73
74        RE.captures(first_line)
75            .and_then(|c| c.get(1))
76            .map_or(vec![], |m| {
77                let interpreter = m.as_str();
78                Self::from_interpreter(interpreter)
79            })
80    }
81
82    /// Gets the languages that match a glob pattern.
83    pub fn from_glob(path: impl AsRef<Path>) -> Vec<Self> {
84        let path = path.as_ref();
85
86        struct GlobMapping {
87            patterns: Vec<glob::Pattern>,
88            language: Language,
89        }
90        static GLOB_MAPPINGS: LazyLock<Vec<GlobMapping>> = LazyLock::new(|| {
91            Language::glob_mappings()
92                .into_iter()
93                .map(|(patterns, language)| {
94                    let patterns = patterns
95                        .into_iter()
96                        .map(|pattern| glob::Pattern::new(pattern).unwrap())
97                        .collect();
98                    GlobMapping { patterns, language }
99                })
100                .collect()
101        });
102
103        GLOB_MAPPINGS
104            .iter()
105            .filter(|gm| {
106                gm.patterns
107                    .iter()
108                    .any(|p| p.matches_path_with(path.as_ref(), GLOB_MATCH_OPTIONS))
109            })
110            .map(|gm| gm.language)
111            .collect()
112    }
113
114    /// Filters an iterable of languages by heuristics.
115    fn filter_by_heuristics(languages: &[Self], contents: &str) -> Vec<Self> {
116        static HEURISTICS: LazyLock<HashMap<Language, Vec<Regex>>> = LazyLock::new(|| {
117            Language::heuristic_mappings()
118                .into_iter()
119                .map(|(language, patterns)| {
120                    let patterns = patterns
121                        .into_iter()
122                        .map(|pattern| Regex::new(pattern).unwrap())
123                        .collect();
124                    (language, patterns)
125                })
126                .collect()
127        });
128
129        languages
130            .iter()
131            .filter(|language| {
132                HEURISTICS
133                    .get(language)
134                    .is_some_and(|heuristics| heuristics.iter().any(|re| re.is_match(contents)))
135            })
136            .cloned()
137            .collect()
138    }
139
140    /// Uses simple checks to find one or more matching languages. Checks by shebang, filename,
141    /// filepath glob, and extension.
142    fn find_simple(path: impl AsRef<Path>, contents: &[u8]) -> Vec<Self> {
143        let languages = Self::from_shebang(contents);
144        if !languages.is_empty() {
145            return languages;
146        }
147        let languages = Self::from_path_filename(&path);
148        if !languages.is_empty() {
149            return languages;
150        }
151        let languages = Self::from_glob(&path);
152        if !languages.is_empty() {
153            return languages;
154        }
155        Self::from_path_extension(&path)
156    }
157
158    /// Picks the best guess from a file's name and contents.
159    ///
160    /// When checking heuristics, only the first `read_limit` bytes will be read.
161    pub fn pick(path: impl AsRef<Path>, contents: &[u8], read_limit: usize) -> Option<Self> {
162        let path = path.as_ref();
163        let path = Self::maybe_strip_example_extension(path);
164        let languages = Self::find_simple(path, contents);
165        if languages.len() == 1 {
166            return Some(languages[0]);
167        }
168
169        let contents = if contents.len() > read_limit {
170            &contents[..read_limit]
171        } else {
172            contents
173        };
174        let heuristic_contents = std::str::from_utf8(contents).unwrap_or_default();
175        let by_heuristics = Self::filter_by_heuristics(&languages, heuristic_contents);
176
177        let found_languages = match by_heuristics.len() {
178            0 => languages,
179            1 => return Some(by_heuristics[0]),
180            _ => by_heuristics,
181        };
182
183        found_languages.into_iter().max_by_key(Self::priority)
184    }
185
186    /// Strips the .example prefix if it exists, or returns the original path reference.
187    fn maybe_strip_example_extension(path: &Path) -> &Path {
188        const EXAMPLE_EXT: &[u8] = b".example";
189        path.as_os_str()
190            .as_encoded_bytes()
191            .strip_suffix(EXAMPLE_EXT)
192            .map(|bytes| {
193                // SAFETY
194                // - Only contains content from the original OsStr backing the Path
195                unsafe { OsStr::from_encoded_bytes_unchecked(bytes) }
196            })
197            .map(Path::new)
198            .unwrap_or(path)
199    }
200
201    /// Returns an object that implements `serde::Serialize` for the language to
202    /// serialize the language's attributes. This effectively turns the language
203    /// from an `enum` into a `struct`.
204    const fn serialize(&self) -> Serialize {
205        Serialize {
206            name: self.name(),
207            category: self.category(),
208            hex: self.hex(),
209            nerd_font_glyph: self.nerd_font_glyph(),
210        }
211    }
212}
213
/// Error returned when a string does not name any `Language` variant.
///
/// Implements [`std::error::Error`], so it can be propagated with `?` into
/// `Box<dyn std::error::Error>` and similar error types.
#[derive(Debug, PartialEq, Eq)]
pub struct ParseError;

impl std::fmt::Display for ParseError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("unknown language variant")
    }
}

impl std::error::Error for ParseError {}
216
217impl FromStr for Language {
218    type Err = ParseError;
219
220    /// Converts a string of the variant's name into that variant.
221    /// This can be useful for setting up language overrides.
222    fn from_str(s: &str) -> Result<Self, Self::Err> {
223        Self::parse_variant(s).ok_or(ParseError)
224    }
225}
226
227impl serde::Serialize for Language {
228    /// Serializes the language into a string.
229    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
230    where
231        S: serde::Serializer,
232    {
233        // NOTE A bit redundant LOL
234        Self::serialize(self).serialize(serializer)
235    }
236}
237
238/// A category for a language.
239#[non_exhaustive]
240#[derive(Clone, Debug, serde::Deserialize, Eq, Hash, PartialEq, serde::Serialize)]
241#[serde(rename_all = "lowercase")]
242pub enum Category {
243    /// Data files. Examples: JSON, YAML, XML, CSV, etc.
244    Data,
245    /// Markup languages. Examples: HTML, Pug, etc.
246    Markup,
247    /// Languages that define text patterns. Examples: Regex, ABNF, etc.
248    Pattern,
249    /// Programming languages. Examples: Rust, C, C++, Java, etc.
250    Programming,
251    /// Prose. Examples: Plain text, Markdown, etc.
252    Prose,
253    /// Query languages. Examples: SQL, GraphQL, etc.
254    Query,
255}
256
257/// Helper struct for serializing the attributes of a `Language`.
258#[derive(Debug, serde::Serialize)]
259struct Serialize {
260    name: &'static str,
261    category: Category,
262    hex: &'static str,
263    nerd_font_glyph: Option<&'static str>,
264}
265
#[cfg(test)]
mod language_tests {
    use super::*;
    use rstest::rstest;

    /// Shebang detection must find the expected language regardless of the line ending and
    /// of whether the interpreter is invoked through `env`.
    #[rstest(
        contents,
        expected,
        case::simple(b"#!/bin/sh", Language::Shell),
        case::unix_newline(b"#!/bin/sh\n", Language::Shell),
        case::windows_newline(b"#!/bin/sh\r\n", Language::Shell),
        case::with_env(b"#!/usr/bin/env sh\r\n", Language::Shell)
    )]
    fn test_from_shebang(contents: &[u8], expected: Language) {
        let detected = Language::from_shebang(contents);
        assert!(detected.contains(&expected));
    }

    /// A trailing `.example` is removed; paths without it are returned untouched.
    #[rstest(
        original,
        stripped,
        case("path/to/data.json", "path/to/data.json"),
        case("path/to/data.json.example", "path/to/data.json")
    )]
    fn test_maybe_strip_example_extension(original: &str, stripped: &str) {
        assert_eq!(
            Language::maybe_strip_example_extension(Path::new(original)),
            Path::new(stripped)
        );
    }
}