thread_language/ext_iden.rs
1// SPDX-FileCopyrightText: 2025 Knitli Inc. <knitli@knit.li>
2// SPDX-FileContributor: Adam Poulemanos <adam@knit.li>
3//
4// SPDX-License-Identifier: AGPL-3.0-or-later
5
6//! Optimized extension matching for language detection.
7//!
8//! This module provides high-performance file identification, using an
9//! Aho-Corasick automaton for efficient multi-pattern matching.
10//!
11//! The optimization strategies significantly improve performance over the naive
12//! O(n*m) approach of checking each language's extensions individually.
13
14use crate::{
15 SupportLang,
16 constants::{EXTENSION_TO_LANG, EXTENSIONS},
17};
18use aho_corasick::{AhoCorasick, AhoCorasickBuilder, Anchored, Input, MatchKind, StartKind};
19use std::sync::LazyLock;
20
21/// Aho-Corasick automaton for efficient multi-pattern matching.
22/// Built lazily on first use with all extensions normalized to lowercase.
23static AHO_CORASICK: LazyLock<AhoCorasick> = LazyLock::new(|| {
24 // Use LeftmostLongest to prefer longer matches (e.g., "cpp" over "c")
25 AhoCorasickBuilder::new()
26 .match_kind(MatchKind::LeftmostLongest)
27 .start_kind(StartKind::Anchored)
28 .build(EXTENSIONS)
29 .expect("Failed to build Aho-Corasick automaton")
30});
31
32/// Aho-Corasick based extension matching for comprehensive pattern matching.
33///
34/// This function uses a pre-built automaton to efficiently match against
35/// all possible extensions simultaneously.
36///
37/// # Arguments
38/// * `ext` - The file extension to match (case-insensitive)
39///
40/// # Returns
41/// * `Some(SupportLang)` if a matching language is found
42/// * `None` if no language matches the extension
43#[inline(always)]
44pub fn match_by_aho_corasick(ext: &str) -> Option<SupportLang> {
45 if ext.is_empty() {
46 return None;
47 }
48 let ext_lower = ext.to_ascii_lowercase();
49 // Find matches and ensure they span the entire extension
50 for mat in AHO_CORASICK.find_iter(Input::new(&ext_lower).anchored(Anchored::Yes)) {
51 // Only accept matches that span the entire extension
52 if mat.end() == ext_lower.len() {
53 let pattern_id = mat.pattern().as_usize();
54 return Some(EXTENSION_TO_LANG[pattern_id]);
55 }
56 }
57 None
58}
59
#[cfg(test)]
mod tests {
    use super::*;

    /// Table-driven check of `match_by_aho_corasick`: each row pairs an input
    /// extension with the result the lookup must produce.
    #[test]
    fn test_aho_corasick_matching() {
        let cases = [
            // Basic matching
            ("rs", Some(SupportLang::Rust)),
            ("py", Some(SupportLang::Python)),
            ("js", Some(SupportLang::JavaScript)),
            // Case insensitivity
            ("RS", Some(SupportLang::Rust)),
            ("PY", Some(SupportLang::Python)),
            // Longer extensions
            ("tsx", Some(SupportLang::Tsx)),
            ("cpp", Some(SupportLang::Cpp)),
            ("workflow", Some(SupportLang::Hcl)),
            // Ambiguous prefix (C vs C++): a bare "c" must resolve to C
            ("c", Some(SupportLang::C)),
            // Unknown and empty extensions
            ("xyz", None),
            ("", None),
        ];

        for (ext, expected) in cases {
            assert_eq!(match_by_aho_corasick(ext), expected);
        }
    }
}