// marco_core/render/code_languages.rs

//! Code fence language normalization.
//!
//! This module exists so both `marco` and `polo` can share:
//! - a small, curated list of common language aliases
//! - consistent display labels (e.g. `rs` -> `Rust`)
//!
//! The core renderer also uses this to populate `data-language` on `<pre>`
//! so CSS themes can show a proper label instead of a generic "Code".

use std::borrow::Cow;

/// One language recognised in fenced code blocks: a canonical name plus the
/// short tags that resolve to it.
///
/// Both fields are `'static` data, so values can be stored in a `const` table
/// such as `KNOWN_CODE_LANGUAGES` and copied freely.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CodeLanguage {
    /// Canonical name used for display and (typically) for syntect token lookup.
    pub canonical: &'static str,
    /// Lowercase aliases commonly used in fenced code blocks.
    pub aliases: &'static [&'static str],
}

20/// A small, human-curated set of common languages and their aliases.
21///
22/// Notes:
23/// - Matching is ASCII-case-insensitive.
24/// - Aliases should be lowercase.
25pub const KNOWN_CODE_LANGUAGES: &[CodeLanguage] = &[
26    CodeLanguage {
27        canonical: "Rust",
28        aliases: &["rs", "rust"],
29    },
30    CodeLanguage {
31        canonical: "JavaScript",
32        aliases: &["js", "javascript", "jsx", "mjs", "cjs", "node"],
33    },
34    CodeLanguage {
35        canonical: "TypeScript",
36        aliases: &["ts", "tsx", "mts", "cts", "typescript"],
37    },
38    CodeLanguage {
39        canonical: "Python",
40        aliases: &["py", "python", "python3", "pycon"],
41    },
42    CodeLanguage {
43        canonical: "Bash",
44        aliases: &["sh", "bash", "zsh", "shell"],
45    },
46    CodeLanguage {
47        canonical: "HTML",
48        aliases: &["html", "htm"],
49    },
50    CodeLanguage {
51        canonical: "CSS",
52        aliases: &["css"],
53    },
54    CodeLanguage {
55        canonical: "JSON",
56        aliases: &["json", "jsonc", "json5"],
57    },
58    CodeLanguage {
59        canonical: "YAML",
60        aliases: &["yaml", "yml"],
61    },
62    CodeLanguage {
63        canonical: "TOML",
64        aliases: &["toml"],
65    },
66    CodeLanguage {
67        canonical: "XML",
68        aliases: &["xml"],
69    },
70    CodeLanguage {
71        canonical: "Markdown",
72        aliases: &["md", "markdown", "mkd", "mkdown"],
73    },
74    CodeLanguage {
75        canonical: "Mermaid",
76        aliases: &["mermaid", "mmd"],
77    },
78    CodeLanguage {
79        canonical: "SQL",
80        aliases: &["sql"],
81    },
82    CodeLanguage {
83        canonical: "C",
84        aliases: &["c", "h"],
85    },
86    CodeLanguage {
87        canonical: "C++",
88        aliases: &["cpp", "c++", "cxx", "cc", "hpp", "hh", "h++", "hxx"],
89    },
90    CodeLanguage {
91        canonical: "C#",
92        aliases: &["cs", "c#", "csharp"],
93    },
94    CodeLanguage {
95        canonical: "Java",
96        aliases: &["java"],
97    },
98    CodeLanguage {
99        canonical: "Dart",
100        aliases: &["dart"],
101    },
102    CodeLanguage {
103        canonical: "Go",
104        aliases: &["go", "golang"],
105    },
106    CodeLanguage {
107        canonical: "Scala",
108        aliases: &["scala"],
109    },
110    CodeLanguage {
111        canonical: "Groovy",
112        aliases: &["groovy", "gradle"],
113    },
114    CodeLanguage {
115        canonical: "Clojure",
116        aliases: &["clojure", "clj", "edn"],
117    },
118    CodeLanguage {
119        canonical: "Ruby",
120        aliases: &["rb", "ruby"],
121    },
122    CodeLanguage {
123        canonical: "Perl",
124        aliases: &["perl", "pl", "pm"],
125    },
126    CodeLanguage {
127        canonical: "PHP",
128        aliases: &["php"],
129    },
130    CodeLanguage {
131        canonical: "Haskell",
132        aliases: &["haskell", "hs"],
133    },
134    CodeLanguage {
135        canonical: "Elixir",
136        aliases: &["elixir", "ex", "exs"],
137    },
138    CodeLanguage {
139        canonical: "Erlang",
140        aliases: &["erlang", "erl"],
141    },
142    CodeLanguage {
143        canonical: "F#",
144        aliases: &["fsharp", "fs", "fsx", "fsi", "fsscript"],
145    },
146    CodeLanguage {
147        canonical: "Kotlin",
148        aliases: &["kotlin", "kt"],
149    },
150    CodeLanguage {
151        canonical: "Swift",
152        aliases: &["swift"],
153    },
154    CodeLanguage {
155        canonical: "Objective-C",
156        aliases: &["objectivec", "objc", "obj-c", "mm"],
157    },
158    CodeLanguage {
159        canonical: "OCaml",
160        aliases: &["ocaml", "ml"],
161    },
162    CodeLanguage {
163        canonical: "Lua",
164        aliases: &["lua"],
165    },
166    CodeLanguage {
167        canonical: "Nim",
168        aliases: &["nim", "nimrod"],
169    },
170    CodeLanguage {
171        canonical: "Nix",
172        aliases: &["nix"],
173    },
174    CodeLanguage {
175        canonical: "Zig",
176        aliases: &["zig"],
177    },
178    CodeLanguage {
179        canonical: "R",
180        aliases: &["r"],
181    },
182    CodeLanguage {
183        canonical: "Matlab",
184        aliases: &["matlab"],
185    },
186    CodeLanguage {
187        canonical: "Fortran",
188        aliases: &["fortran", "f90", "f95"],
189    },
190    CodeLanguage {
191        canonical: "GraphQL",
192        aliases: &["graphql", "gql"],
193    },
194    CodeLanguage {
195        canonical: "Protocol Buffers",
196        aliases: &["proto", "protobuf"],
197    },
198    CodeLanguage {
199        canonical: "Solidity",
200        aliases: &["solidity", "sol"],
201    },
202    CodeLanguage {
203        canonical: "Terraform (HCL)",
204        aliases: &["terraform", "tf", "hcl"],
205    },
206    CodeLanguage {
207        canonical: "INI",
208        aliases: &["ini"],
209    },
210    CodeLanguage {
211        canonical: "MathML",
212        aliases: &["mathml", "katex"],
213    },
214    CodeLanguage {
215        canonical: "Dockerfile",
216        aliases: &["dockerfile", "docker"],
217    },
218    CodeLanguage {
219        canonical: "Makefile",
220        aliases: &["makefile", "make", "mk", "mak"],
221    },
222    CodeLanguage {
223        canonical: "PowerShell",
224        aliases: &["powershell", "pwsh", "ps", "ps1"],
225    },
226    CodeLanguage {
227        canonical: "Vim Script",
228        aliases: &["vim", "vimscript"],
229    },
230    CodeLanguage {
231        canonical: "Assembly",
232        aliases: &["asm", "nasm", "x86asm"],
233    },
234    CodeLanguage {
235        canonical: "Plaintext",
236        aliases: &["plaintext", "text", "txt"],
237    },
238    CodeLanguage {
239        canonical: "Diff",
240        aliases: &["diff", "patch"],
241    },
242];
243
244/// If `raw` is a known language (by canonical name or alias), return its canonical name.
245pub fn canonical_language_name(raw: &str) -> Option<&'static str> {
246    let raw = raw.trim();
247    if raw.is_empty() {
248        return None;
249    }
250
251    // Fast path: exact canonical match (case-insensitive).
252    for lang in KNOWN_CODE_LANGUAGES {
253        if raw.eq_ignore_ascii_case(lang.canonical) {
254            return Some(lang.canonical);
255        }
256    }
257
258    let lower = raw.to_ascii_lowercase();
259    for lang in KNOWN_CODE_LANGUAGES {
260        if lang.aliases.iter().any(|a| *a == lower) {
261            return Some(lang.canonical);
262        }
263    }
264
265    None
266}
267
268/// Returns a display label for a fenced code language.
269///
270/// - Known languages get a canonical, nicely-cased label (`rs` -> `Rust`).
271/// - Unknown languages fall back to the trimmed original text (preserving user intent).
272pub fn language_display_label<'a>(raw: &'a str) -> Option<Cow<'a, str>> {
273    let raw_trimmed = raw.trim();
274    if raw_trimmed.is_empty() {
275        return None;
276    }
277
278    if let Some(canonical) = canonical_language_name(raw_trimmed) {
279        return Some(Cow::Borrowed(canonical));
280    }
281
282    Some(Cow::Borrowed(raw_trimmed))
283}
284
#[cfg(test)]
mod tests {
    use super::*;

    /// Spot-checks that aliases and canonical names from across the table resolve.
    #[test]
    fn smoke_test_canonical_language_name_aliases() {
        // (fence tag, expected canonical name)
        let cases = [
            ("rs", "Rust"),
            ("Rust", "Rust"),
            ("JS", "JavaScript"),
            ("c++", "C++"),
            ("tsx", "TypeScript"),
            ("gql", "GraphQL"),
            ("proto", "Protocol Buffers"),
            ("tf", "Terraform (HCL)"),
            ("nimrod", "Nim"),
            ("ps", "PowerShell"),
            ("mmd", "Mermaid"),
            ("katex", "MathML"),
        ];

        for &(tag, expected) in &cases {
            assert_eq!(canonical_language_name(tag), Some(expected));
        }
    }

    /// Blank and unrecognised tags must not resolve to any canonical name.
    #[test]
    fn smoke_test_canonical_language_name_blank_or_unknown() {
        assert_eq!(canonical_language_name(""), None);
        assert_eq!(canonical_language_name("   "), None);
        assert_eq!(canonical_language_name("mylang"), None);
    }

    /// Known tags are relabelled; blank input produces no label at all.
    #[test]
    fn smoke_test_language_display_label_known_and_blank() {
        assert_eq!(language_display_label(" rs ").unwrap(), "Rust");
        assert!(language_display_label(" \t ").is_none());
    }

    /// Unknown tags keep the user's text, minus surrounding whitespace.
    #[test]
    fn smoke_test_language_display_label_unknown_falls_back() {
        assert_eq!(language_display_label("  mylang  ").unwrap(), "mylang");
    }
}