Skip to main content

dlin_core/parser/
sql.rs

1use regex::Regex;
2use std::sync::LazyLock;
3
/// A reference to another dbt model via `ref()`.
///
/// Produced once per `ref(...)` call found in a model's SQL.
#[derive(Debug, Clone, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
pub struct RefCall {
    /// Optional package name (for cross-project refs); the first argument of
    /// the two-argument form `ref('pkg', 'name')`, `None` otherwise.
    pub package: Option<String>,
    /// Model name.
    pub name: String,
}
12
/// A reference to a dbt source via `source()`.
///
/// Produced once per `source('source_name', 'table_name')` call found in a model's SQL.
#[derive(Debug, Clone, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
pub struct SourceCall {
    /// Source name (first argument of `source()`).
    pub source_name: String,
    /// Table name within the source (second argument of `source()`).
    pub table_name: String,
}
21
22static JINJA_COMMENT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\{#[\s\S]*?#\}").unwrap());
23
24// Matches ref('name'), ref("name"), ref('pkg', 'name'), ref("pkg", "name")
25// Handles {{ ref(...) }} and {{- ref(...) -}} whitespace control
26static REF_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
27    Regex::new(
28        r#"(?x)
29        \{\{-?\s*
30        ref\s*\(\s*
31        (?:
32            # Two-argument form: ref('pkg', 'name') or ref("pkg", "name")
33            (?:['"]([^'"]+)['"]\s*,\s*['"]([^'"]+)['"])
34            |
35            # Single-argument form: ref('name') or ref("name")
36            ['"]([^'"]+)['"]
37        )
38        \s*\)\s*
39        -?\}\}
40    "#,
41    )
42    .unwrap()
43});
44
45// Matches source('src_name', 'table_name')
46static SOURCE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
47    Regex::new(
48        r#"(?x)
49        \{\{-?\s*
50        source\s*\(\s*
51        ['"]([^'"]+)['"]\s*,\s*['"]([^'"]+)['"]
52        \s*\)\s*
53        -?\}\}
54    "#,
55    )
56    .unwrap()
57});
58
59/// Strip Jinja comments from SQL content
60fn strip_jinja_comments(sql: &str) -> String {
61    JINJA_COMMENT.replace_all(sql, "").to_string()
62}
63
64/// Extract all refs, sources, and config from SQL content in a single pass.
65/// Tries minijinja rendering first; falls back to regex on failure.
66///
67/// `macro_prefix` is the pre-built concatenation of valid macro SQL files
68/// so that custom macros containing ref()/source() are expanded and tracked.
69pub fn extract_all(sql: &str, macro_prefix: &str) -> super::jinja::JinjaExtraction {
70    extract_all_with_vars(sql, macro_prefix, &std::collections::HashMap::new())
71}
72
73/// Like [`extract_all`] but resolves `var()` calls using project-level variables.
74pub fn extract_all_with_vars(
75    sql: &str,
76    macro_prefix: &str,
77    vars: &std::collections::HashMap<String, serde_json::Value>,
78) -> super::jinja::JinjaExtraction {
79    if let Some(ext) = super::jinja::extract_via_jinja_with_vars(sql, macro_prefix, vars) {
80        return ext;
81    }
82    super::jinja::JinjaExtraction {
83        refs: extract_refs_regex(sql),
84        sources: extract_sources_regex(sql),
85        config: extract_config_regex(sql),
86    }
87}
88
89/// Extract all ref() and source() calls from SQL content in a single pass.
90/// Tries minijinja rendering first; falls back to regex on failure.
91///
92/// `macro_prefix` is the pre-built concatenation of valid macro SQL files
93/// so that custom macros containing ref()/source() are expanded and tracked.
94pub fn extract_refs_and_sources(sql: &str, macro_prefix: &str) -> (Vec<RefCall>, Vec<SourceCall>) {
95    extract_refs_and_sources_with_vars(sql, macro_prefix, &std::collections::HashMap::new())
96}
97
98/// Like [`extract_refs_and_sources`] but resolves `var()` calls using project-level variables.
99pub fn extract_refs_and_sources_with_vars(
100    sql: &str,
101    macro_prefix: &str,
102    vars: &std::collections::HashMap<String, serde_json::Value>,
103) -> (Vec<RefCall>, Vec<SourceCall>) {
104    if let Some(ext) = super::jinja::extract_via_jinja_with_vars(sql, macro_prefix, vars) {
105        return (ext.refs, ext.sources);
106    }
107    (extract_refs_regex(sql), extract_sources_regex(sql))
108}
109
110/// Extract all ref() calls from SQL content.
111pub fn extract_refs(sql: &str) -> Vec<RefCall> {
112    extract_refs_and_sources(sql, "").0
113}
114
115/// Extract all source() calls from SQL content.
116pub fn extract_sources(sql: &str) -> Vec<SourceCall> {
117    extract_refs_and_sources(sql, "").1
118}
119
120/// Regex fallback for extracting ref() calls
121fn extract_refs_regex(sql: &str) -> Vec<RefCall> {
122    let cleaned = strip_jinja_comments(sql);
123    let mut refs = Vec::new();
124
125    for cap in REF_PATTERN.captures_iter(&cleaned) {
126        if let (Some(pkg), Some(name)) = (cap.get(1), cap.get(2)) {
127            refs.push(RefCall {
128                package: Some(pkg.as_str().to_string()),
129                name: name.as_str().to_string(),
130            });
131        } else if let Some(name) = cap.get(3) {
132            refs.push(RefCall {
133                package: None,
134                name: name.as_str().to_string(),
135            });
136        }
137    }
138
139    refs
140}
141
142/// Regex fallback for extracting source() calls
143fn extract_sources_regex(sql: &str) -> Vec<SourceCall> {
144    let cleaned = strip_jinja_comments(sql);
145    let mut sources = Vec::new();
146
147    for cap in SOURCE_PATTERN.captures_iter(&cleaned) {
148        sources.push(SourceCall {
149            source_name: cap[1].to_string(),
150            table_name: cap[2].to_string(),
151        });
152    }
153
154    sources
155}
156
/// Parsed `config(...)` block from SQL.
///
/// The `Default` value has no materialization and no tags.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
pub struct SqlConfig {
    /// Value of the `materialized=` argument, if one was found.
    pub materialized: Option<String>,
    /// Values of the `tags=` argument; empty when none were found.
    pub tags: Vec<String>,
}
163
164// Matches {{ config(...) }} blocks — captures the inner arguments
165static CONFIG_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
166    Regex::new(
167        r#"(?x)
168        \{\{-?\s*
169        config\s*\(
170        ([\s\S]*?)
171        \)\s*
172        -?\}\}
173    "#,
174    )
175    .unwrap()
176});
177
178// Matches materialized='value' or materialized="value"
179static MATERIALIZED_PATTERN: LazyLock<Regex> =
180    LazyLock::new(|| Regex::new(r#"materialized\s*=\s*['"]([^'"]+)['"]"#).unwrap());
181
182// Matches tags=['a', 'b'] or tags=["a", "b"]
183static TAGS_PATTERN: LazyLock<Regex> =
184    LazyLock::new(|| Regex::new(r#"tags\s*=\s*\[([^\]]*)\]"#).unwrap());
185
186// Matches individual tag values inside the tags list
187static TAG_VALUE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"['"]([^'"]+)['"]"#).unwrap());
188
189/// Extract config() block settings from SQL content.
190/// Tries minijinja rendering first; falls back to regex on failure.
191pub fn extract_config(sql: &str, macro_prefix: &str) -> SqlConfig {
192    if let Some(ext) = super::jinja::extract_via_jinja(sql, macro_prefix) {
193        return ext.config;
194    }
195    extract_config_regex(sql)
196}
197
198/// Regex fallback for extracting config() settings
199fn extract_config_regex(sql: &str) -> SqlConfig {
200    let cleaned = strip_jinja_comments(sql);
201    let mut config = SqlConfig::default();
202
203    if let Some(cap) = CONFIG_PATTERN.captures(&cleaned) {
204        let inner = &cap[1];
205
206        if let Some(mat) = MATERIALIZED_PATTERN.captures(inner) {
207            config.materialized = Some(mat[1].to_string());
208        }
209
210        if let Some(tags_cap) = TAGS_PATTERN.captures(inner) {
211            let tags_inner = &tags_cap[1];
212            config.tags = TAG_VALUE
213                .captures_iter(tags_inner)
214                .map(|c| c[1].to_string())
215                .collect();
216        }
217    }
218
219    config
220}
221
/// Unit tests for ref/source/config extraction.
///
/// The public entry points try the Jinja extractor first, so the regex
/// fallbacks are additionally exercised directly at the end of this module.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_single_ref() {
        let sql = "SELECT * FROM {{ ref('stg_orders') }}";
        let refs = extract_refs(sql);
        assert_eq!(refs.len(), 1);
        assert_eq!(refs[0].name, "stg_orders");
        assert!(refs[0].package.is_none());
    }

    #[test]
    fn test_double_quoted_ref() {
        let sql = r#"SELECT * FROM {{ ref("stg_orders") }}"#;
        let refs = extract_refs(sql);
        assert_eq!(refs.len(), 1);
        assert_eq!(refs[0].name, "stg_orders");
    }

    #[test]
    fn test_two_arg_ref() {
        let sql = "SELECT * FROM {{ ref('other_project', 'stg_orders') }}";
        let refs = extract_refs(sql);
        assert_eq!(refs.len(), 1);
        assert_eq!(refs[0].package.as_deref(), Some("other_project"));
        assert_eq!(refs[0].name, "stg_orders");
    }

    #[test]
    fn test_whitespace_control() {
        let sql = "SELECT * FROM {{- ref('stg_orders') -}}";
        let refs = extract_refs(sql);
        assert_eq!(refs.len(), 1);
        assert_eq!(refs[0].name, "stg_orders");
    }

    #[test]
    fn test_multiple_refs() {
        let sql = r#"
            SELECT
                o.*,
                c.name
            FROM {{ ref('stg_orders') }} o
            JOIN {{ ref('stg_customers') }} c ON o.customer_id = c.id
        "#;
        let refs = extract_refs(sql);
        assert_eq!(refs.len(), 2);
        assert_eq!(refs[0].name, "stg_orders");
        assert_eq!(refs[1].name, "stg_customers");
    }

    #[test]
    fn test_source() {
        let sql = "SELECT * FROM {{ source('raw', 'orders') }}";
        let sources = extract_sources(sql);
        assert_eq!(sources.len(), 1);
        assert_eq!(sources[0].source_name, "raw");
        assert_eq!(sources[0].table_name, "orders");
    }

    #[test]
    fn test_source_whitespace_control() {
        let sql = "SELECT * FROM {{- source('raw', 'orders') -}}";
        let sources = extract_sources(sql);
        assert_eq!(sources.len(), 1);
        assert_eq!(sources[0].source_name, "raw");
    }

    #[test]
    fn test_strip_jinja_comments() {
        let sql = r#"
            {# This is a comment with {{ ref('should_be_ignored') }} #}
            SELECT * FROM {{ ref('actual_model') }}
        "#;
        let refs = extract_refs(sql);
        assert_eq!(refs.len(), 1);
        assert_eq!(refs[0].name, "actual_model");
    }

    #[test]
    fn test_mixed_refs_and_sources() {
        let sql = r#"
            SELECT *
            FROM {{ source('raw', 'orders') }}
            JOIN {{ ref('stg_customers') }} ON 1=1
        "#;
        let refs = extract_refs(sql);
        let sources = extract_sources(sql);
        assert_eq!(refs.len(), 1);
        assert_eq!(sources.len(), 1);
    }

    #[test]
    fn test_no_refs() {
        let sql = "SELECT 1 as id";
        let refs = extract_refs(sql);
        assert!(refs.is_empty());
    }

    #[test]
    fn test_extra_spaces() {
        let sql = "SELECT * FROM {{  ref(  'stg_orders'  )  }}";
        let refs = extract_refs(sql);
        assert_eq!(refs.len(), 1);
        assert_eq!(refs[0].name, "stg_orders");
    }

    // ─── Config extraction tests ───

    #[test]
    fn test_config_materialized() {
        let sql = "{{ config(materialized='incremental') }}\nSELECT 1";
        let config = extract_config(sql, "");
        assert_eq!(config.materialized.as_deref(), Some("incremental"));
        assert!(config.tags.is_empty());
    }

    #[test]
    fn test_config_materialized_double_quotes() {
        let sql = r#"{{ config(materialized="table") }}"#;
        let config = extract_config(sql, "");
        assert_eq!(config.materialized.as_deref(), Some("table"));
    }

    #[test]
    fn test_config_tags() {
        let sql = "{{ config(tags=['nightly', 'finance']) }}\nSELECT 1";
        let config = extract_config(sql, "");
        assert_eq!(config.tags, vec!["nightly", "finance"]);
    }

    #[test]
    fn test_config_both() {
        let sql = "{{ config(materialized='view', tags=['daily']) }}\nSELECT 1";
        let config = extract_config(sql, "");
        assert_eq!(config.materialized.as_deref(), Some("view"));
        assert_eq!(config.tags, vec!["daily"]);
    }

    #[test]
    fn test_config_whitespace_control() {
        let sql = "{{- config(materialized='ephemeral') -}}\nSELECT 1";
        let config = extract_config(sql, "");
        assert_eq!(config.materialized.as_deref(), Some("ephemeral"));
    }

    #[test]
    fn test_config_multiline() {
        let sql = r#"{{
            config(
                materialized='incremental',
                tags=['nightly', 'warehouse']
            )
        }}
        SELECT 1"#;
        let config = extract_config(sql, "");
        assert_eq!(config.materialized.as_deref(), Some("incremental"));
        assert_eq!(config.tags, vec!["nightly", "warehouse"]);
    }

    #[test]
    fn test_no_config() {
        let sql = "SELECT * FROM {{ ref('orders') }}";
        let config = extract_config(sql, "");
        assert!(config.materialized.is_none());
        assert!(config.tags.is_empty());
    }

    #[test]
    fn test_config_in_comment_ignored() {
        let sql = r#"
            {# {{ config(materialized='table') }} #}
            SELECT 1
        "#;
        let config = extract_config(sql, "");
        assert!(config.materialized.is_none());
    }

    // ─── Direct regex-fallback tests ───
    // These call the private fallbacks directly so they stay covered even
    // when the Jinja extractor handles every input above.

    #[test]
    fn test_strip_jinja_comments_direct() {
        assert_eq!(strip_jinja_comments("a {# c #} b"), "a  b");
        assert_eq!(strip_jinja_comments("a {# line1\nline2 #} b"), "a  b");
        assert_eq!(strip_jinja_comments("no comments"), "no comments");
    }

    #[test]
    fn test_regex_fallback_refs() {
        let sql = r#"SELECT * FROM {{ ref('a') }} JOIN {{- ref("pkg", "b") -}}"#;
        let refs = extract_refs_regex(sql);
        assert_eq!(
            refs,
            vec![
                RefCall {
                    package: None,
                    name: "a".to_string(),
                },
                RefCall {
                    package: Some("pkg".to_string()),
                    name: "b".to_string(),
                },
            ]
        );
    }

    #[test]
    fn test_regex_fallback_refs_ignore_comments() {
        let sql = "{# {{ ref('x') }} #}{{ ref('y') }}";
        let refs = extract_refs_regex(sql);
        assert_eq!(refs.len(), 1);
        assert_eq!(refs[0].name, "y");
    }

    #[test]
    fn test_regex_fallback_sources() {
        let sql = "SELECT * FROM {{ source('raw', 'orders') }}";
        let sources = extract_sources_regex(sql);
        assert_eq!(
            sources,
            vec![SourceCall {
                source_name: "raw".to_string(),
                table_name: "orders".to_string(),
            }]
        );
    }

    #[test]
    fn test_regex_fallback_config() {
        let sql = r#"{{ config(materialized='view', tags=['a', "b"]) }}"#;
        let config = extract_config_regex(sql);
        assert_eq!(config.materialized.as_deref(), Some("view"));
        assert_eq!(config.tags, vec!["a", "b"]);
    }

    #[test]
    fn test_regex_fallback_no_config() {
        let config = extract_config_regex("SELECT 1");
        assert!(config.materialized.is_none());
        assert!(config.tags.is_empty());
    }
}
397        "#;
398        let config = extract_config(sql, "");
399        assert!(config.materialized.is_none());
400    }
401}