Skip to main content

marco_core/render/
code_languages.rs

1//! Code fence language normalization.
2//!
3//! This module exists so both the editor and viewer can share:
4//! - a small, curated list of common language aliases
5//! - consistent display labels (e.g. `rs` -> `Rust`)
6//!
7//! The core renderer also uses this to populate `data-language` on `<pre>`
8//! so CSS themes can show a proper label instead of a generic "Code".
9
10use std::borrow::Cow;
11
/// A language's canonical name together with the fence tags that resolve to it.
#[derive(Debug, Clone, Copy)]
pub struct CodeLanguage {
    /// Name shown to the user; typically also the name used for syntect token lookup.
    pub canonical: &'static str,
    /// Lowercase identifiers commonly written after the opening code fence.
    pub aliases: &'static [&'static str],
}
20
21/// A small, human-curated set of common languages and their aliases.
22///
23/// Notes:
24/// - Matching is ASCII-case-insensitive.
25/// - Aliases should be lowercase.
26pub const KNOWN_CODE_LANGUAGES: &[CodeLanguage] = &[
27    CodeLanguage {
28        canonical: "Rust",
29        aliases: &["rs", "rust"],
30    },
31    CodeLanguage {
32        canonical: "JavaScript",
33        aliases: &["js", "javascript", "jsx", "mjs", "cjs", "node"],
34    },
35    CodeLanguage {
36        canonical: "TypeScript",
37        aliases: &["ts", "tsx", "mts", "cts", "typescript"],
38    },
39    CodeLanguage {
40        canonical: "Python",
41        aliases: &["py", "python", "python3", "pycon"],
42    },
43    CodeLanguage {
44        canonical: "Bash",
45        aliases: &["sh", "bash", "zsh", "shell"],
46    },
47    CodeLanguage {
48        canonical: "HTML",
49        aliases: &["html", "htm"],
50    },
51    CodeLanguage {
52        canonical: "CSS",
53        aliases: &["css"],
54    },
55    CodeLanguage {
56        canonical: "JSON",
57        aliases: &["json", "jsonc", "json5"],
58    },
59    CodeLanguage {
60        canonical: "YAML",
61        aliases: &["yaml", "yml"],
62    },
63    CodeLanguage {
64        canonical: "TOML",
65        aliases: &["toml"],
66    },
67    CodeLanguage {
68        canonical: "XML",
69        aliases: &["xml"],
70    },
71    CodeLanguage {
72        canonical: "Markdown",
73        aliases: &["md", "markdown", "mkd", "mkdown"],
74    },
75    CodeLanguage {
76        canonical: "Mermaid",
77        aliases: &["mermaid", "mmd"],
78    },
79    CodeLanguage {
80        canonical: "SQL",
81        aliases: &["sql"],
82    },
83    CodeLanguage {
84        canonical: "C",
85        aliases: &["c", "h"],
86    },
87    CodeLanguage {
88        canonical: "C++",
89        aliases: &["cpp", "c++", "cxx", "cc", "hpp", "hh", "h++", "hxx"],
90    },
91    CodeLanguage {
92        canonical: "C#",
93        aliases: &["cs", "c#", "csharp"],
94    },
95    CodeLanguage {
96        canonical: "Java",
97        aliases: &["java"],
98    },
99    CodeLanguage {
100        canonical: "Dart",
101        aliases: &["dart"],
102    },
103    CodeLanguage {
104        canonical: "Go",
105        aliases: &["go", "golang"],
106    },
107    CodeLanguage {
108        canonical: "Scala",
109        aliases: &["scala"],
110    },
111    CodeLanguage {
112        canonical: "Groovy",
113        aliases: &["groovy", "gradle"],
114    },
115    CodeLanguage {
116        canonical: "Clojure",
117        aliases: &["clojure", "clj", "edn"],
118    },
119    CodeLanguage {
120        canonical: "Ruby",
121        aliases: &["rb", "ruby"],
122    },
123    CodeLanguage {
124        canonical: "Perl",
125        aliases: &["perl", "pl", "pm"],
126    },
127    CodeLanguage {
128        canonical: "PHP",
129        aliases: &["php"],
130    },
131    CodeLanguage {
132        canonical: "Haskell",
133        aliases: &["haskell", "hs"],
134    },
135    CodeLanguage {
136        canonical: "Elixir",
137        aliases: &["elixir", "ex", "exs"],
138    },
139    CodeLanguage {
140        canonical: "Erlang",
141        aliases: &["erlang", "erl"],
142    },
143    CodeLanguage {
144        canonical: "F#",
145        aliases: &["fsharp", "fs", "fsx", "fsi", "fsscript"],
146    },
147    CodeLanguage {
148        canonical: "Kotlin",
149        aliases: &["kotlin", "kt"],
150    },
151    CodeLanguage {
152        canonical: "Swift",
153        aliases: &["swift"],
154    },
155    CodeLanguage {
156        canonical: "Objective-C",
157        aliases: &["objectivec", "objc", "obj-c", "mm"],
158    },
159    CodeLanguage {
160        canonical: "OCaml",
161        aliases: &["ocaml", "ml"],
162    },
163    CodeLanguage {
164        canonical: "Lua",
165        aliases: &["lua"],
166    },
167    CodeLanguage {
168        canonical: "Nim",
169        aliases: &["nim", "nimrod"],
170    },
171    CodeLanguage {
172        canonical: "Nix",
173        aliases: &["nix"],
174    },
175    CodeLanguage {
176        canonical: "Zig",
177        aliases: &["zig"],
178    },
179    CodeLanguage {
180        canonical: "R",
181        aliases: &["r"],
182    },
183    CodeLanguage {
184        canonical: "Matlab",
185        aliases: &["matlab"],
186    },
187    CodeLanguage {
188        canonical: "Fortran",
189        aliases: &["fortran", "f90", "f95"],
190    },
191    CodeLanguage {
192        canonical: "GraphQL",
193        aliases: &["graphql", "gql"],
194    },
195    CodeLanguage {
196        canonical: "Protocol Buffers",
197        aliases: &["proto", "protobuf"],
198    },
199    CodeLanguage {
200        canonical: "Solidity",
201        aliases: &["solidity", "sol"],
202    },
203    CodeLanguage {
204        canonical: "Terraform (HCL)",
205        aliases: &["terraform", "tf", "hcl"],
206    },
207    CodeLanguage {
208        canonical: "INI",
209        aliases: &["ini"],
210    },
211    CodeLanguage {
212        canonical: "MathML",
213        aliases: &["mathml", "katex"],
214    },
215    CodeLanguage {
216        canonical: "Dockerfile",
217        aliases: &["dockerfile", "docker"],
218    },
219    CodeLanguage {
220        canonical: "Makefile",
221        aliases: &["makefile", "make", "mk", "mak"],
222    },
223    CodeLanguage {
224        canonical: "PowerShell",
225        aliases: &["powershell", "pwsh", "ps", "ps1"],
226    },
227    CodeLanguage {
228        canonical: "Vim Script",
229        aliases: &["vim", "vimscript"],
230    },
231    CodeLanguage {
232        canonical: "Assembly",
233        aliases: &["asm", "nasm", "x86asm"],
234    },
235    CodeLanguage {
236        canonical: "Plaintext",
237        aliases: &["plaintext", "text", "txt"],
238    },
239    CodeLanguage {
240        canonical: "Diff",
241        aliases: &["diff", "patch"],
242    },
243];
244
245/// If `raw` is a known language (by canonical name or alias), return its canonical name.
246pub fn canonical_language_name(raw: &str) -> Option<&'static str> {
247    let raw = raw.trim();
248    if raw.is_empty() {
249        return None;
250    }
251
252    // Fast path: exact canonical match (case-insensitive).
253    for lang in KNOWN_CODE_LANGUAGES {
254        if raw.eq_ignore_ascii_case(lang.canonical) {
255            return Some(lang.canonical);
256        }
257    }
258
259    let lower = raw.to_ascii_lowercase();
260    for lang in KNOWN_CODE_LANGUAGES {
261        if lang.aliases.iter().any(|a| *a == lower) {
262            return Some(lang.canonical);
263        }
264    }
265
266    None
267}
268
269/// Returns a display label for a fenced code language.
270///
271/// - Known languages get a canonical, nicely-cased label (`rs` -> `Rust`).
272/// - Unknown languages fall back to the trimmed original text (preserving user intent).
273pub fn language_display_label<'a>(raw: &'a str) -> Option<Cow<'a, str>> {
274    let raw_trimmed = raw.trim();
275    if raw_trimmed.is_empty() {
276        return None;
277    }
278
279    if let Some(canonical) = canonical_language_name(raw_trimmed) {
280        return Some(Cow::Borrowed(canonical));
281    }
282
283    Some(Cow::Borrowed(raw_trimmed))
284}
285
#[cfg(test)]
mod tests {
    use super::*;

    /// Aliases and canonical names resolve to the expected canonical name.
    #[test]
    fn smoke_test_canonical_language_name_aliases() {
        let cases = [
            ("rs", "Rust"),
            ("Rust", "Rust"),
            ("JS", "JavaScript"),
            ("c++", "C++"),
            ("tsx", "TypeScript"),
            ("gql", "GraphQL"),
            ("proto", "Protocol Buffers"),
            ("tf", "Terraform (HCL)"),
            ("nimrod", "Nim"),
            ("ps", "PowerShell"),
            ("mmd", "Mermaid"),
            ("katex", "MathML"),
        ];
        for (input, expected) in cases.iter() {
            assert_eq!(
                canonical_language_name(input),
                Some(*expected),
                "input: {:?}",
                input
            );
        }
    }

    /// Surrounding whitespace and ASCII case do not affect the lookup.
    #[test]
    fn canonical_language_name_trims_and_ignores_case() {
        assert_eq!(canonical_language_name("  rust  "), Some("Rust"));
        assert_eq!(canonical_language_name("PYTHON"), Some("Python"));
        assert_eq!(canonical_language_name("Yml"), Some("YAML"));
    }

    /// Blank and unrecognized tags have no canonical name.
    #[test]
    fn canonical_language_name_rejects_empty_and_unknown() {
        assert_eq!(canonical_language_name(""), None);
        assert_eq!(canonical_language_name("   "), None);
        assert_eq!(canonical_language_name("mylang"), None);
    }

    /// Unknown tags are echoed back, trimmed.
    #[test]
    fn smoke_test_language_display_label_unknown_falls_back() {
        assert_eq!(language_display_label("  mylang  ").unwrap(), "mylang");
    }

    /// Known tags are shown under their canonical name.
    #[test]
    fn language_display_label_known_uses_canonical() {
        assert_eq!(language_display_label("rs").unwrap(), "Rust");
        assert_eq!(language_display_label(" JS ").unwrap(), "JavaScript");
    }

    /// Blank tags produce no label at all.
    #[test]
    fn language_display_label_blank_is_none() {
        assert!(language_display_label("").is_none());
        assert!(language_display_label("   ").is_none());
    }

    /// The table follows its documented convention that aliases are lowercase.
    #[test]
    fn known_language_aliases_are_lowercase() {
        for lang in KNOWN_CODE_LANGUAGES {
            for alias in lang.aliases {
                assert!(
                    !alias.chars().any(|c| c.is_ascii_uppercase()),
                    "alias {:?} of {:?} is not lowercase",
                    alias,
                    lang.canonical
                );
            }
        }
    }
}