Skip to main content

aptu_coder_core/languages/
regex_fallback.rs

1// SPDX-FileCopyrightText: 2026 aptu-coder contributors
2// SPDX-License-Identifier: Apache-2.0
3//! Regex-based semantic extraction for formats without tree-sitter grammars.
4//!
5//! Covers CSS, YAML, JSON, TOML, and Astro. Each function returns a
6//! [`SemanticAnalysis`] with `functions` populated as a best-effort symbol
7//! list (selectors, keys, section headers, or frontmatter exports). All
8//! errors are handled internally; callers always receive a valid (possibly
9//! empty) result.
10
11use crate::parser::SemanticExtractor;
12use crate::types::{FunctionInfo, SemanticAnalysis};
13use regex::Regex;
14use std::sync::LazyLock;
15use tracing;
16
// --- compiled patterns (each compiled lazily on first use, then reused) ---
18
19static CSS_SELECTOR: LazyLock<Regex> =
20    LazyLock::new(|| Regex::new(r"^[.#][\w-]+[\s,:{]").expect("valid CSS selector pattern"));
21
22static YAML_TOP_KEY: LazyLock<Regex> =
23    LazyLock::new(|| Regex::new(r"^(\w[\w-]*): ").expect("valid YAML top-level key pattern"));
24
25static JSON_FIRST_KEY: LazyLock<Regex> = LazyLock::new(|| {
26    Regex::new(r#"^\s{0,2}"(\w+)":"#).expect("valid JSON first-level key pattern")
27});
28
29static TOML_SECTION: LazyLock<Regex> =
30    LazyLock::new(|| Regex::new(r"^\[([^\]]+)\]").expect("valid TOML section header pattern"));
31
32// --- extraction functions ---
33
34/// Extract CSS class/ID selectors as function entries.
35pub fn extract_css(source: &str) -> SemanticAnalysis {
36    let mut functions = Vec::new();
37    for (idx, line) in source.lines().enumerate() {
38        let trimmed = line.trim_start();
39        if CSS_SELECTOR.is_match(trimmed) {
40            let name = trimmed
41                .trim_end_matches(|c: char| c == '{' || c == ',' || c == ':' || c.is_whitespace())
42                .to_string();
43            if !name.is_empty() {
44                let line_no = idx + 1;
45                functions.push(FunctionInfo {
46                    name,
47                    line: line_no,
48                    end_line: line_no,
49                    parameters: Vec::new(),
50                    return_type: None,
51                });
52            }
53        }
54    }
55    SemanticAnalysis {
56        functions,
57        ..Default::default()
58    }
59}
60
61/// Extract YAML top-level keys as function entries.
62pub fn extract_yaml(source: &str) -> SemanticAnalysis {
63    let mut functions = Vec::new();
64    for (idx, line) in source.lines().enumerate() {
65        if let Some(caps) = YAML_TOP_KEY.captures(line) {
66            let name = caps[1].to_string();
67            let line_no = idx + 1;
68            functions.push(FunctionInfo {
69                name,
70                line: line_no,
71                end_line: line_no,
72                parameters: Vec::new(),
73                return_type: None,
74            });
75        }
76    }
77    SemanticAnalysis {
78        functions,
79        ..Default::default()
80    }
81}
82
83/// Extract JSON first-level string keys as function entries.
84pub fn extract_json(source: &str) -> SemanticAnalysis {
85    let mut functions = Vec::new();
86    for (idx, line) in source.lines().enumerate() {
87        if let Some(caps) = JSON_FIRST_KEY.captures(line) {
88            let name = caps[1].to_string();
89            let line_no = idx + 1;
90            functions.push(FunctionInfo {
91                name,
92                line: line_no,
93                end_line: line_no,
94                parameters: Vec::new(),
95                return_type: None,
96            });
97        }
98    }
99    SemanticAnalysis {
100        functions,
101        ..Default::default()
102    }
103}
104
105/// Extract TOML section headers as function entries.
106pub fn extract_toml(source: &str) -> SemanticAnalysis {
107    let mut functions = Vec::new();
108    for (idx, line) in source.lines().enumerate() {
109        if let Some(caps) = TOML_SECTION.captures(line) {
110            let name = caps[1].to_string();
111            let line_no = idx + 1;
112            functions.push(FunctionInfo {
113                name,
114                line: line_no,
115                end_line: line_no,
116                parameters: Vec::new(),
117                return_type: None,
118            });
119        }
120    }
121    SemanticAnalysis {
122        functions,
123        ..Default::default()
124    }
125}
126
127/// Extract Astro frontmatter imports/exports via the TypeScript extractor.
128///
129/// Splits on lines starting with `---`, extracts the block between the first
130/// and second delimiter, then delegates to [`SemanticExtractor::extract`] with
131/// `language = "typescript"`. Returns [`Default::default`] when no frontmatter
132/// is found or extraction fails.
133pub fn extract_astro(source: &str) -> SemanticAnalysis {
134    let block = extract_frontmatter(source);
135    let Some(frontmatter) = block else {
136        return SemanticAnalysis::default();
137    };
138    SemanticExtractor::extract(&frontmatter, "typescript", None, None).unwrap_or_else(|err| {
139        tracing::warn!(error = %err, "astro TypeScript extractor failed; returning empty analysis");
140        SemanticAnalysis::default()
141    })
142}
143
/// Returns the lines between the first two lines that start with `---`,
/// joined with `\n`.
///
/// Returns `None` when fewer than two such delimiter lines exist. Two adjacent
/// delimiters produce `Some` of an empty string.
fn extract_frontmatter(source: &str) -> Option<String> {
    let mut remaining = source.lines();

    // Advance past the opening delimiter; bail out if there is none.
    remaining.find(|text| text.starts_with("---"))?;

    let mut body: Vec<&str> = Vec::new();
    for text in remaining {
        if text.starts_with("---") {
            // Closing delimiter reached: everything gathered so far is the block.
            return Some(body.join("\n"));
        }
        body.push(text);
    }

    // The opening delimiter was never closed.
    None
}
158
#[cfg(test)]
mod tests {
    use super::*;

    /// Collects the `name` of every entry in `analysis.functions`, in order.
    fn symbol_names(analysis: &SemanticAnalysis) -> Vec<&str> {
        analysis.functions.iter().map(|f| f.name.as_str()).collect()
    }

    /// Class and ID selectors that open a rule are both reported.
    #[test]
    fn test_regex_fallback_css_basic() {
        let analysis =
            extract_css(".container {\n  color: red;\n}\n#header {\n  font-size: 16px;\n}\n");
        let names = symbol_names(&analysis);

        assert!(
            names.contains(&".container"),
            "expected .container in {names:?}"
        );
        assert!(names.contains(&"#header"), "expected #header in {names:?}");
    }

    /// Unindented keys are reported; an indented key is not.
    #[test]
    fn test_regex_fallback_yaml_basic() {
        let analysis = extract_yaml("name: my-project\nversion: 1.0\n  nested: value\n");
        let names = symbol_names(&analysis);

        assert!(names.contains(&"name"), "expected name in {names:?}");
        assert!(names.contains(&"version"), "expected version in {names:?}");
        // nested key has leading spaces so must NOT appear
        assert!(
            !names.contains(&"nested"),
            "nested must not appear in {names:?}"
        );
    }

    /// Keys indented by two spaces inside the root object are reported.
    #[test]
    fn test_regex_fallback_json_basic() {
        let analysis = extract_json("{\n  \"name\": \"project\",\n  \"version\": \"1.0\"\n}\n");
        let names = symbol_names(&analysis);

        assert!(names.contains(&"name"), "expected name in {names:?}");
        assert!(names.contains(&"version"), "expected version in {names:?}");
    }

    /// Table headers are reported without their brackets.
    #[test]
    fn test_regex_fallback_toml_basic() {
        let analysis =
            extract_toml("[package]\nname = \"my-crate\"\n\n[dependencies]\nregex = \"1\"\n");
        let names = symbol_names(&analysis);

        assert!(names.contains(&"package"), "expected package in {names:?}");
        assert!(
            names.contains(&"dependencies"),
            "expected dependencies in {names:?}"
        );
    }

    /// Frontmatter is handed to the TypeScript extractor, which should find
    /// at least the import.
    #[cfg(feature = "lang-typescript")]
    #[test]
    fn test_regex_fallback_astro_basic() {
        let source =
            "---\nimport Foo from './Foo.astro';\nconst title = 'Hello';\n---\n<h1>{title}</h1>\n";

        let analysis = extract_astro(source);

        let found_something = !analysis.imports.is_empty() || !analysis.functions.is_empty();
        assert!(
            found_something,
            "expected imports or functions from frontmatter; got empty result"
        );
    }

    /// A file without `---` delimiters yields an empty analysis and no panic.
    #[test]
    fn test_regex_fallback_astro_no_frontmatter() {
        let analysis = extract_astro("<h1>Hello World</h1>\n<p>No frontmatter here.</p>\n");

        assert!(analysis.functions.is_empty());
        assert!(analysis.imports.is_empty());
    }

    /// Empty input produces no entries for any supported format.
    #[test]
    fn test_regex_fallback_empty_file() {
        let empty = "";

        assert!(extract_css(empty).functions.is_empty());
        assert!(extract_yaml(empty).functions.is_empty());
        assert!(extract_json(empty).functions.is_empty());
        assert!(extract_toml(empty).functions.is_empty());
        assert!(extract_astro(empty).functions.is_empty());
    }
}