Skip to main content

dlin_core/parser/
yaml_schema.rs

1use serde::Deserialize;
2
3/// Top-level schema YAML file (can contain sources, models, exposures)
4#[derive(Debug, Deserialize, Default)]
5pub struct SchemaFile {
6    #[serde(default)]
7    pub sources: Vec<SourceDefinition>,
8
9    #[serde(default)]
10    pub models: Vec<ModelDefinition>,
11
12    #[serde(default)]
13    pub exposures: Vec<ExposureDefinition>,
14}
15
16#[derive(Debug, Deserialize, Clone)]
17pub struct SourceDefinition {
18    pub name: String,
19    #[serde(default)]
20    pub description: Option<String>,
21    #[serde(default)]
22    pub tables: Vec<SourceTable>,
23}
24
25#[derive(Debug, Deserialize, Clone)]
26pub struct SourceTable {
27    pub name: String,
28    #[serde(default)]
29    pub description: Option<String>,
30    #[serde(default)]
31    pub columns: Vec<ColumnDefinition>,
32}
33
34#[derive(Debug, Deserialize, Clone)]
35pub struct ColumnDefinition {
36    pub name: String,
37    #[serde(default)]
38    pub description: Option<String>,
39    #[serde(default, alias = "data_tests")]
40    pub tests: Vec<TestDefinition>,
41}
42
43/// Tests can be either a string or a map.
44/// Complex variants are deserialized into `serde_json::Value` because serde-saphyr
45/// has no intermediate Value type. This is safe for dbt schema files which use
46/// JSON-compatible YAML.
47#[derive(Debug, Deserialize, Clone)]
48#[serde(untagged)]
49pub enum TestDefinition {
50    Simple(String),
51    Complex(serde_json::Value),
52}
53
54impl TestDefinition {
55    /// Extract the test name from either variant.
56    ///
57    /// - `Simple("not_null")` → `"not_null"`
58    /// - `Complex({"unique": {...}})` → `"unique"`
59    /// - `Complex({"name": "custom", "test_name": "accepted_values", ...})` → `"accepted_values"`
60    pub fn test_name(&self) -> Option<&str> {
61        match self {
62            TestDefinition::Simple(s) => Some(s.as_str()),
63            TestDefinition::Complex(v) => {
64                let obj = v.as_object()?;
65                // Alternative format: {"name": "...", "test_name": "accepted_values", ...}
66                if let Some(tn) = obj.get("test_name").and_then(|v| v.as_str()) {
67                    return Some(tn);
68                }
69                // Standard format: single-key map like {"unique": {...}}
70                // Note: serde_json::Map uses BTreeMap, so keys() is alphabetically ordered.
71                // Skip objects that only have meta-keys (name/config/arguments).
72                for key in obj.keys() {
73                    if !matches!(key.as_str(), "config" | "arguments" | "name") {
74                        return Some(key.as_str());
75                    }
76                }
77                None
78            }
79        }
80    }
81}
82
83#[derive(Debug, Deserialize, Clone)]
84pub struct ModelDefinition {
85    pub name: String,
86    #[serde(default)]
87    pub description: Option<String>,
88    #[serde(default)]
89    pub columns: Vec<ColumnDefinition>,
90    #[serde(default)]
91    pub config: Option<ModelConfig>,
92    #[serde(default)]
93    pub tags: Vec<String>,
94    /// Model-level tests (not attached to a specific column)
95    #[serde(default, alias = "data_tests")]
96    pub tests: Vec<TestDefinition>,
97}
98
99#[derive(Debug, Deserialize, Clone, Default)]
100pub struct ModelConfig {
101    #[serde(default)]
102    pub materialized: Option<String>,
103    #[serde(default)]
104    pub tags: Vec<String>,
105}
106
107#[derive(Debug, Deserialize, Clone)]
108pub struct ExposureDefinition {
109    pub name: String,
110    #[serde(default)]
111    pub description: Option<String>,
112    #[serde(default)]
113    pub label: Option<String>,
114    #[serde(rename = "type", default)]
115    pub exposure_type: Option<String>,
116    #[serde(default)]
117    pub url: Option<String>,
118    #[serde(default)]
119    pub maturity: Option<String>,
120    #[serde(default)]
121    pub depends_on: Vec<String>,
122    #[serde(default)]
123    pub owner: Option<ExposureOwner>,
124}
125
126#[derive(Debug, Deserialize, Clone)]
127pub struct ExposureOwner {
128    pub name: Option<String>,
129    pub email: Option<String>,
130}
131
132/// Parse a schema YAML file
133pub fn parse_schema_file(
134    content: &str,
135    path: Option<&std::path::Path>,
136) -> anyhow::Result<SchemaFile> {
137    let location = path
138        .map(|p| p.display().to_string())
139        .unwrap_or_else(|| "<input>".to_string());
140    super::yaml_from_str(content, &location)
141}
142
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `yaml` without a source path, panicking if parsing fails.
    fn load(yaml: &str) -> SchemaFile {
        parse_schema_file(yaml, None).unwrap()
    }

    /// A source with nested tables is parsed, including a table without a description.
    #[test]
    fn test_parse_sources() {
        let input = r#"
sources:
  - name: raw
    description: Raw data from the warehouse
    tables:
      - name: orders
        description: Raw orders table
      - name: customers
"#;
        let parsed = load(input);
        assert_eq!(parsed.sources.len(), 1);

        let raw = &parsed.sources[0];
        assert_eq!(raw.name, "raw");
        assert_eq!(raw.tables.len(), 2);
        assert_eq!(raw.tables[0].name, "orders");
    }

    /// Column tests written under the `data_tests` key are collected.
    #[test]
    fn test_parse_models_with_data_tests() {
        let input = r#"
models:
  - name: stg_orders
    description: Staged orders
    columns:
      - name: order_id
        data_tests:
          - not_null
          - unique
"#;
        let parsed = load(input);
        assert_eq!(parsed.models.len(), 1);

        let model = &parsed.models[0];
        assert_eq!(model.name, "stg_orders");
        assert_eq!(model.columns.len(), 1);
        assert_eq!(model.columns[0].tests.len(), 2);
    }

    /// Column tests written under the legacy `tests` key are collected as well.
    #[test]
    fn test_parse_models_with_legacy_tests_key() {
        let input = r#"
models:
  - name: stg_orders
    columns:
      - name: order_id
        tests:
          - not_null
          - unique
"#;
        let parsed = load(input);
        let order_id = &parsed.models[0].columns[0];
        assert_eq!(order_id.tests.len(), 2);
    }

    /// Every supported shape of a test entry deserializes into the expected variant.
    #[test]
    fn test_parse_data_tests_all_formats() {
        let input = r#"
models:
  - name: orders
    columns:
      - name: order_id
        data_tests:
          - not_null
          - unique:
              config:
                where: "order_id > 21"
      - name: status
        data_tests:
          - accepted_values:
              arguments:
                values:
                  - placed
                  - shipped
                  - completed
                  - returned
              config:
                severity: warn
      - name: customer_id
        data_tests:
          - relationships:
              arguments:
                to: ref('customers')
                field: id
          - name: custom_test_name
            test_name: accepted_values
            arguments:
              values:
                - 1
                - 2
                - 3
            config:
              where: "order_date = current_date"
"#;
        let parsed = load(input);
        let columns = &parsed.models[0].columns;
        assert_eq!(columns.len(), 3);
        let (order_id, status, customer_id) = (&columns[0], &columns[1], &columns[2]);

        // Bare string followed by a map carrying only `config`.
        assert_eq!(order_id.tests.len(), 2);
        assert!(matches!(
            &order_id.tests[0],
            TestDefinition::Simple(name) if name == "not_null"
        ));
        assert!(matches!(&order_id.tests[1], TestDefinition::Complex(_)));

        // `accepted_values` with both `arguments` and `config`.
        assert_eq!(status.tests.len(), 1);
        assert!(matches!(&status.tests[0], TestDefinition::Complex(_)));

        // `relationships`, then the alternative `name` / `test_name` layout.
        assert_eq!(customer_id.tests.len(), 2);
        assert!(matches!(&customer_id.tests[0], TestDefinition::Complex(_)));
        assert!(matches!(&customer_id.tests[1], TestDefinition::Complex(_)));
    }

    /// An exposure with dependencies and an owner is parsed.
    #[test]
    fn test_parse_exposures() {
        let input = r#"
exposures:
  - name: weekly_report
    description: Weekly business report
    type: dashboard
    depends_on:
      - ref('orders')
      - ref('customers')
    owner:
      name: Data Team
      email: data@example.com
"#;
        let parsed = load(input);
        assert_eq!(parsed.exposures.len(), 1);

        let report = &parsed.exposures[0];
        assert_eq!(report.name, "weekly_report");
        assert_eq!(report.depends_on.len(), 2);
    }

    /// A key repeated at the same mapping level is tolerated, and the last
    /// occurrence wins, matching PyYAML behavior.
    #[test]
    fn test_parse_duplicate_mapping_keys() {
        let input = r#"
sources:
  - name: raw
    tables:
      - name: orders
sources:
  - name: other
    tables:
      - name: users
"#;
        let parsed = load(input);
        // The second `sources` block replaces the first, so only "other" remains.
        assert_eq!(parsed.sources.len(), 1);
        assert_eq!(parsed.sources[0].name, "other");
    }

    /// Empty input yields a schema whose sections are all empty.
    #[test]
    fn test_empty_file() {
        let parsed = load("");
        assert!(parsed.sources.is_empty());
        assert!(parsed.models.is_empty());
        assert!(parsed.exposures.is_empty());
    }

    /// `test_name` resolves the name for each supported entry shape.
    #[test]
    fn test_test_name_extraction() {
        let cases = [
            // Bare string entry.
            (
                TestDefinition::Simple("not_null".to_string()),
                Some("not_null"),
            ),
            // Single-key map: {"unique": {"config": ...}}.
            (
                TestDefinition::Complex(serde_json::json!({
                    "unique": {"config": {"where": "id > 0"}}
                })),
                Some("unique"),
            ),
            // Explicit `test_name` next to a custom `name`.
            (
                TestDefinition::Complex(serde_json::json!({
                    "name": "custom_test_name",
                    "test_name": "accepted_values",
                    "arguments": {"values": [1, 2]}
                })),
                Some("accepted_values"),
            ),
            // Single-key `relationships` map.
            (
                TestDefinition::Complex(serde_json::json!({
                    "relationships": {"arguments": {"to": "ref('customers')", "field": "id"}}
                })),
                Some("relationships"),
            ),
            // Edge case: only a `name` meta-key and no `test_name`.
            (
                TestDefinition::Complex(serde_json::json!({"name": "something"})),
                None,
            ),
        ];

        for (definition, expected) in &cases {
            assert_eq!(
                definition.test_name(),
                *expected,
                "definition: {:?}",
                definition
            );
        }
    }
}