Skip to main content

thread_language/
ext_iden.rs

1// SPDX-FileCopyrightText: 2025 Knitli Inc. <knitli@knit.li>
2// SPDX-FileContributor: Adam Poulemanos <adam@knit.li>
3//
4// SPDX-License-Identifier: AGPL-3.0-or-later
5
6//! Optimized extension matching for language detection.
7//!
//! This module provides high-performance file identification, using an
//! Aho-Corasick automaton for efficient multi-pattern matching.
10//!
11//! The optimization strategies significantly improve performance over the naive
12//! O(n*m) approach of checking each language's extensions individually.
13
14use crate::{
15    SupportLang,
16    constants::{EXTENSION_TO_LANG, EXTENSIONS},
17};
18use aho_corasick::{AhoCorasick, AhoCorasickBuilder, Anchored, Input, MatchKind, StartKind};
19use std::sync::LazyLock;
20
21/// Aho-Corasick automaton for efficient multi-pattern matching.
22/// Built lazily on first use with all extensions normalized to lowercase.
23static AHO_CORASICK: LazyLock<AhoCorasick> = LazyLock::new(|| {
24    // Use LeftmostLongest to prefer longer matches (e.g., "cpp" over "c")
25    AhoCorasickBuilder::new()
26        .match_kind(MatchKind::LeftmostLongest)
27        .start_kind(StartKind::Anchored)
28        .build(EXTENSIONS)
29        .expect("Failed to build Aho-Corasick automaton")
30});
31
32/// Aho-Corasick based extension matching for comprehensive pattern matching.
33///
34/// This function uses a pre-built automaton to efficiently match against
35/// all possible extensions simultaneously.
36///
37/// # Arguments
38/// * `ext` - The file extension to match (case-insensitive)
39///
40/// # Returns
41/// * `Some(SupportLang)` if a matching language is found
42/// * `None` if no language matches the extension
43#[inline(always)]
44pub fn match_by_aho_corasick(ext: &str) -> Option<SupportLang> {
45    if ext.is_empty() {
46        return None;
47    }
48    let ext_lower = ext.to_ascii_lowercase();
49    // Find matches and ensure they span the entire extension
50    for mat in AHO_CORASICK.find_iter(Input::new(&ext_lower).anchored(Anchored::Yes)) {
51        // Only accept matches that span the entire extension
52        if mat.end() == ext_lower.len() {
53            let pattern_id = mat.pattern().as_usize();
54            return Some(EXTENSION_TO_LANG[pattern_id]);
55        }
56    }
57    None
58}
59
#[cfg(test)]
mod tests {
    use super::*;

    /// Table-driven check of extension lookups: each entry pairs an input
    /// extension with the language (or `None`) the lookup must return.
    #[test]
    fn test_aho_corasick_matching() {
        let cases = [
            // Basic matching
            ("rs", Some(SupportLang::Rust)),
            ("py", Some(SupportLang::Python)),
            ("js", Some(SupportLang::JavaScript)),
            // Case insensitivity
            ("RS", Some(SupportLang::Rust)),
            ("PY", Some(SupportLang::Python)),
            // Longer extensions
            ("tsx", Some(SupportLang::Tsx)),
            ("cpp", Some(SupportLang::Cpp)),
            ("workflow", Some(SupportLang::Hcl)),
            // Ambiguous extension (C vs C++): "c" is expected to resolve to C
            ("c", Some(SupportLang::C)),
            // Unknown and empty extensions
            ("xyz", None),
            ("", None),
        ];

        for (ext, expected) in cases {
            assert_eq!(match_by_aho_corasick(ext), expected, "extension {ext:?}");
        }
    }
}