csv_managed/
schema_cmd.rs

1use std::collections::HashSet;
2use std::str::FromStr;
3
4use anyhow::{Context, Result, anyhow};
5use log::info;
6
7use crate::cli::SchemaArgs;
8use crate::schema::{ColumnMeta, ColumnType, Schema, ValueReplacement};
9
10pub fn execute(args: &SchemaArgs) -> Result<()> {
11    let mut columns = parse_columns(&args.columns)
12        .with_context(|| "Parsing --column definitions for schema creation".to_string())?;
13    apply_replacements(&mut columns, &args.replacements)
14        .with_context(|| "Parsing --replace definitions for schema creation".to_string())?;
15
16    let schema = Schema { columns };
17    schema
18        .save(&args.output)
19        .with_context(|| format!("Writing schema to {:?}", args.output))?;
20
21    info!(
22        "Defined schema with {} column(s) written to {:?}",
23        schema.columns.len(),
24        args.output
25    );
26
27    Ok(())
28}
29
30fn parse_columns(specs: &[String]) -> Result<Vec<ColumnMeta>> {
31    let mut columns = Vec::new();
32    let mut seen = HashSet::new();
33    let mut output_names = HashSet::new();
34
35    for raw in specs {
36        for token in raw.split(',') {
37            let token = token.trim();
38            if token.is_empty() {
39                continue;
40            }
41            let (name_part, type_part) = token.split_once(':').ok_or_else(|| {
42                anyhow!("Column definition '{token}' must use the form name:type")
43            })?;
44
45            let name = name_part.trim();
46            if name.is_empty() {
47                return Err(anyhow!(
48                    "Column name cannot be empty in definition '{token}'"
49                ));
50            }
51            if !seen.insert(name.to_string()) {
52                return Err(anyhow!("Duplicate column name '{name}' provided"));
53            }
54
55            let (type_raw, rename_raw) = if let Some((ty, rename)) = type_part.split_once("->") {
56                (ty, Some(rename))
57            } else {
58                (type_part, None)
59            };
60
61            let column_type = ColumnType::from_str(type_raw.trim())
62                .map_err(|err| anyhow!("Column '{name}' has invalid type '{type_part}': {err}"))?;
63
64            let rename = rename_raw
65                .map(|value| value.trim())
66                .filter(|value| !value.is_empty())
67                .map(|value| value.to_string());
68
69            if let Some(ref alias) = rename {
70                if alias != name && seen.contains(alias) {
71                    return Err(anyhow!(
72                        "Output name '{alias}' conflicts with an existing column name"
73                    ));
74                }
75                if !output_names.insert(alias.clone()) {
76                    return Err(anyhow!("Duplicate output column name '{alias}' provided"));
77                }
78            }
79
80            if rename.is_none() {
81                output_names.insert(name.to_string());
82            }
83
84            columns.push(ColumnMeta {
85                name: name.to_string(),
86                datatype: column_type,
87                rename,
88                value_replacements: Vec::new(),
89            });
90        }
91    }
92
93    if columns.is_empty() {
94        return Err(anyhow!("At least one --column definition is required"));
95    }
96
97    Ok(columns)
98}
99
100fn apply_replacements(columns: &mut [ColumnMeta], specs: &[String]) -> Result<()> {
101    if specs.is_empty() {
102        return Ok(());
103    }
104    let mut lookup = HashSet::new();
105    for column in columns.iter() {
106        lookup.insert(column.name.clone());
107    }
108
109    for raw in specs {
110        let spec = raw.trim();
111        if spec.is_empty() {
112            continue;
113        }
114        let (column_name, mapping) = spec.split_once('=').ok_or_else(|| {
115            anyhow!("Replacement '{spec}' must use the form column=value->new_value")
116        })?;
117        let column_name = column_name.trim();
118        if column_name.is_empty() {
119            return Err(anyhow!("Replacement '{spec}' is missing a column name"));
120        }
121        if !lookup.contains(column_name) {
122            return Err(anyhow!(
123                "Replacement references unknown column '{column_name}'"
124            ));
125        }
126        let (from_raw, to_raw) = mapping.split_once("->").ok_or_else(|| {
127            anyhow!(
128                "Replacement '{spec}' must include '->' to separate original and replacement values"
129            )
130        })?;
131        let from = from_raw.trim().to_string();
132        let to = to_raw.trim().to_string();
133        let column = columns
134            .iter_mut()
135            .find(|c| c.name == column_name)
136            .expect("column should exist");
137        if let Some(existing) = column
138            .value_replacements
139            .iter()
140            .position(|r| r.from == from)
141        {
142            column.value_replacements.remove(existing);
143        }
144        column
145            .value_replacements
146            .push(ValueReplacement { from, to });
147    }
148
149    Ok(())
150}
151
#[cfg(test)]
mod tests {
    use super::*;

    /// Turns string literals into the owned `Vec<String>` the parsers take.
    fn owned(items: &[&str]) -> Vec<String> {
        items.iter().map(|item| item.to_string()).collect()
    }

    /// Columns may be given comma-separated and across repeated arguments.
    #[test]
    fn parse_columns_accepts_comma_and_repeats() {
        let columns =
            parse_columns(&owned(&["id:integer,name:string", "amount:float"])).expect("parsed");
        assert_eq!(columns.len(), 3);

        let names: Vec<&str> = columns.iter().map(|c| c.name.as_str()).collect();
        assert_eq!(names, ["id", "name", "amount"]);

        let expected_types = [ColumnType::Integer, ColumnType::String, ColumnType::Float];
        for (column, expected) in columns.iter().zip(expected_types) {
            assert_eq!(column.datatype, expected);
        }
    }

    /// Repeating a column name is an error.
    #[test]
    fn duplicate_columns_are_rejected() {
        let message = parse_columns(&owned(&["id:integer,id:string"]))
            .unwrap_err()
            .to_string();
        assert!(message.contains("Duplicate column name"));
    }

    /// A definition without `:type` is an error.
    #[test]
    fn missing_type_is_rejected() {
        let message = parse_columns(&owned(&["id"])).unwrap_err().to_string();
        assert!(message.contains("must use the form"));
    }

    /// `name:type->alias` records the alias; plain definitions have none.
    #[test]
    fn parse_columns_supports_output_rename() {
        let columns =
            parse_columns(&owned(&["id:integer->Identifier,name:string"])).expect("parsed");
        assert_eq!(columns.len(), 2);

        let (renamed, plain) = (&columns[0], &columns[1]);
        assert_eq!(renamed.rename.as_deref(), Some("Identifier"));
        assert!(plain.rename.is_none());
    }

    /// Two columns may not share the same output name.
    #[test]
    fn duplicate_output_names_are_rejected() {
        let input = owned(&["id:integer->Identifier", "code:string->Identifier"]);
        let message = parse_columns(&input).unwrap_err().to_string();
        assert!(message.contains("Duplicate output column name"));
    }

    /// A valid replacement is stored on the column it names.
    #[test]
    fn replacements_apply_to_columns() {
        let mut columns = parse_columns(&owned(&["status:string"])).expect("parsed");
        apply_replacements(&mut columns, &owned(&["status=pending->shipped"])).expect("applied");

        let stored = &columns[0].value_replacements;
        assert_eq!(stored.len(), 1);
        assert_eq!(stored[0].from, "pending");
        assert_eq!(stored[0].to, "shipped");
    }

    /// A replacement naming an undefined column is an error.
    #[test]
    fn replacements_validate_column_names() {
        let mut columns = parse_columns(&owned(&["status:string"])).expect("parsed");
        let message = apply_replacements(&mut columns, &owned(&["missing=pending->shipped"]))
            .unwrap_err()
            .to_string();
        assert!(message.contains("unknown column"));
    }
}