1use std::collections::{HashMap, HashSet};
2use std::path::Path;
3
4use zer_core::{
5 error::ZerError,
6 record::{FieldValue, Record},
7 schema::{FieldDef, FieldKind, Schema},
8};
9
10use crate::config::{NameHeuristics, ValuePatterns};
11
12fn text_samples<'a>(field_name: &str, records: &'a [Record], n: usize) -> Vec<&'a str> {
15 records
16 .iter()
17 .filter_map(|r| match r.fields.get(field_name) {
18 Some(FieldValue::Text(s)) if !s.is_empty() => Some(s.as_str()),
19 _ => None,
20 })
21 .take(n)
22 .collect()
23}
24
25fn collect_field_names(records: &[Record]) -> Vec<String> {
26 let mut names: HashSet<String> = HashSet::new();
27 for record in records {
28 for name in record.fields.keys() {
29 names.insert(name.clone());
30 }
31 }
32 let mut sorted: Vec<String> = names.into_iter().collect();
33 sorted.sort();
34 sorted
35}
36
37pub struct SchemaInferrer {
58 overrides: HashMap<String, FieldKind>,
59 name_heuristics: NameHeuristics,
60 value_patterns: ValuePatterns,
61}
62
63impl SchemaInferrer {
64 pub fn new() -> Self {
67 Self {
68 overrides: HashMap::new(),
69 name_heuristics: NameHeuristics::load_default(),
70 value_patterns: ValuePatterns::load_default(),
71 }
72 }
73
74 pub fn with_name_heuristics_file(mut self, path: impl AsRef<Path>) -> Result<Self, ZerError> {
78 self.name_heuristics = NameHeuristics::from_file(path.as_ref())?;
79 Ok(self)
80 }
81
82 pub fn with_value_patterns_file(mut self, path: impl AsRef<Path>) -> Result<Self, ZerError> {
86 self.value_patterns = ValuePatterns::from_file(path.as_ref())?;
87 Ok(self)
88 }
89
90 pub fn override_field(mut self, name: impl Into<String>, kind: FieldKind) -> Self {
95 self.overrides.insert(name.into(), kind);
96 self
97 }
98
99 pub fn infer(&self, records: &[Record]) -> Result<Schema, ZerError> {
106 let field_names = collect_field_names(records);
107 if field_names.is_empty() {
108 return Err(ZerError::EmptySchema);
109 }
110
111 let fields: Vec<FieldDef> = field_names
112 .into_iter()
113 .map(|name| {
114 let kind = self.overrides.get(&name).copied().unwrap_or_else(|| {
115 self.name_heuristics.infer_kind(&name).unwrap_or_else(|| {
116 let samples = text_samples(&name, records, 50);
117 self.value_patterns.infer_kind(&samples)
118 })
119 });
120 FieldDef { name, kind }
121 })
122 .collect();
123
124 Ok(Schema { fields })
125 }
126}
127
128impl Default for SchemaInferrer {
129 fn default() -> Self {
130 Self::new()
131 }
132}
133
#[cfg(test)]
mod tests {
    use super::*;

    // ── Helpers ──────────────────────────────────────────────────────────

    /// Builds a record with the given id whose fields all hold text values.
    fn text_record(id: u64, fields: &[(&str, &str)]) -> Record {
        fields.iter().fold(Record::new(id), |record, (key, value)| {
            record.insert(*key, FieldValue::Text(value.to_string()))
        })
    }

    /// Applies only the default name heuristics to a column name.
    fn infer_name(col: &str) -> Option<FieldKind> {
        let heuristics = NameHeuristics::load_default();
        heuristics.infer_kind(col)
    }

    /// Applies only the default value patterns to the sampled values of `field`.
    fn infer_values(field: &str, records: &[Record]) -> FieldKind {
        let samples = text_samples(field, records, 50);
        let patterns = ValuePatterns::load_default();
        patterns.infer_kind(&samples)
    }

    /// Looks up the inferred kind of `name`, panicking if the field is absent.
    fn field_kind(schema: &Schema, name: &str) -> FieldKind {
        schema
            .fields
            .iter()
            .find(|f| f.name == name)
            .map(|f| f.kind)
            .unwrap()
    }

    // ── Name heuristics ──────────────────────────────────────────────────

    #[test]
    fn infer_common_name_fields() {
        let cases = [
            ("first_name", FieldKind::Name),
            ("last_name", FieldKind::Name),
            ("voornamen", FieldKind::Name),
            ("achternaam", FieldKind::Name),
            ("surname", FieldKind::Name),
        ];
        for (col, expected) in cases {
            let inferred = infer_name(col);
            assert_eq!(
                inferred,
                Some(expected),
                "'{col}' should infer as {expected:?}"
            );
        }
    }

    #[test]
    fn infer_date_fields_by_name() {
        let columns = ["dob", "geboortedatum", "birth_date", "created_at"];
        for col in columns {
            let inferred = infer_name(col);
            assert_eq!(
                inferred,
                Some(FieldKind::Date),
                "'{col}' should infer as Date"
            );
        }
    }

    #[test]
    fn infer_phone_fields_by_name() {
        let columns = ["phone", "tel", "mobile", "msisdn"];
        for col in columns {
            let inferred = infer_name(col);
            assert_eq!(
                inferred,
                Some(FieldKind::Phone),
                "'{col}' should infer as Phone"
            );
        }
    }

    #[test]
    fn infer_address_fields_by_name() {
        let columns = ["straatnaam", "postcode", "woonplaats", "huisnummer"];
        for col in columns {
            let inferred = infer_name(col);
            assert_eq!(
                inferred,
                Some(FieldKind::Address),
                "'{col}' should infer as Address"
            );
        }
    }

    #[test]
    fn infer_id_fields_by_name() {
        let columns = ["bsn", "imsi", "iccid", "document_nummer", "passport_id"];
        for col in columns {
            let result = infer_name(col);
            assert_eq!(
                result,
                Some(FieldKind::Id),
                "'{col}' should infer as Id, got {result:?}"
            );
        }
    }

    // ── Value patterns ───────────────────────────────────────────────────

    #[test]
    fn infer_date_from_iso_values() {
        let mut records: Vec<Record> = Vec::with_capacity(20);
        for id in 0..20 {
            records.push(text_record(id, &[("col_1", "2024-03-15")]));
        }
        assert_eq!(infer_values("col_1", &records), FieldKind::Date);
    }

    #[test]
    fn infer_numeric_from_number_values() {
        let mut records: Vec<Record> = Vec::with_capacity(20);
        for id in 0..20u64 {
            let digits = id.to_string();
            records.push(text_record(id, &[("col_1", digits.as_str())]));
        }
        assert_eq!(infer_values("col_1", &records), FieldKind::Numeric);
    }

    #[test]
    fn infer_categorical_from_low_cardinality_values() {
        // Ten rows alternating between the two category labels.
        let records: Vec<Record> = (0u64..)
            .zip(["M", "V"].into_iter().cycle().take(10))
            .map(|(id, value)| text_record(id, &[("geslacht", value)]))
            .collect();
        assert_eq!(infer_values("geslacht", &records), FieldKind::Categorical);
    }

    #[test]
    fn infer_falls_back_to_freetext_for_empty_field() {
        let records = [Record::new(1)];
        assert_eq!(infer_values("col_1", &records), FieldKind::FreeText);
    }

    // ── Overrides ────────────────────────────────────────────────────────

    #[test]
    fn override_takes_precedence_over_name_heuristic() {
        let records = vec![text_record(1, &[("dob", "1990-01-01")])];
        let inferrer = SchemaInferrer::new().override_field("dob", FieldKind::Id);
        let schema = inferrer.infer(&records).unwrap();

        assert_eq!(
            field_kind(&schema, "dob"),
            FieldKind::Id,
            "override must win over name heuristic"
        );
    }

    #[test]
    fn override_takes_precedence_over_value_pattern() {
        let mut records: Vec<Record> = Vec::with_capacity(20);
        for id in 0..20 {
            records.push(text_record(id, &[("col_x", "2024-01-01")]));
        }
        let inferrer = SchemaInferrer::new().override_field("col_x", FieldKind::FreeText);
        let schema = inferrer.infer(&records).unwrap();

        assert_eq!(field_kind(&schema, "col_x"), FieldKind::FreeText);
    }

    // ── Custom configuration files ───────────────────────────────────────

    #[test]
    fn with_name_heuristics_file_overrides_default() {
        const RULES: &str = r#"
[[rules]]
kind = "Id"
exact = ["custom_col"]
"#;
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("names.toml");
        std::fs::write(&path, RULES).unwrap();

        let records = vec![text_record(1, &[("custom_col", "ABC123")])];
        let inferrer = SchemaInferrer::new()
            .with_name_heuristics_file(&path)
            .unwrap();
        let schema = inferrer.infer(&records).unwrap();

        assert_eq!(field_kind(&schema, "custom_col"), FieldKind::Id);
    }

    #[test]
    fn with_value_patterns_file_overrides_default() {
        const PATTERNS: &str = r#"
[[patterns]]
kind = "Phone"
regex = '^\+31\d{9}$'
threshold = 0.8

[fallback]
default_kind = "FreeText"
"#;
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("values.toml");
        std::fs::write(&path, PATTERNS).unwrap();

        let mut records: Vec<Record> = Vec::with_capacity(20);
        for id in 0..20 {
            records.push(text_record(id, &[("col", "+31612345678")]));
        }
        let inferrer = SchemaInferrer::new()
            .with_value_patterns_file(&path)
            .unwrap();
        let schema = inferrer.infer(&records).unwrap();

        assert_eq!(field_kind(&schema, "col"), FieldKind::Phone);
    }

    #[test]
    fn with_name_heuristics_file_missing_returns_error() {
        let missing = "/nonexistent/path/names.toml";
        let result = SchemaInferrer::new().with_name_heuristics_file(missing);
        assert!(result.is_err());
    }

    // ── End-to-end inference ─────────────────────────────────────────────

    #[test]
    fn infer_brp_like_records() {
        let row = [
            ("voornamen", "Erik"),
            ("achternaam", "Hendriks"),
            ("geboortedatum", "1980-06-15"),
            ("postcode", "1234AB"),
            ("nationaliteit", "Nederland"),
        ];
        let records: Vec<Record> = (0..10).map(|id| text_record(id, &row)).collect();

        let schema = SchemaInferrer::new().infer(&records).unwrap();
        let kind_of = |n: &str| schema.fields.iter().find(|f| f.name == n).map(|f| f.kind);

        let expectations = [
            ("voornamen", FieldKind::Name),
            ("achternaam", FieldKind::Name),
            ("geboortedatum", FieldKind::Date),
        ];
        for (name, expected) in expectations {
            assert_eq!(kind_of(name), Some(expected));
        }
    }

    #[test]
    fn infer_empty_records_returns_error() {
        let no_records: Vec<Record> = Vec::new();
        let result = SchemaInferrer::new().infer(&no_records);
        assert!(
            matches!(result, Err(ZerError::EmptySchema)),
            "empty record slice must return EmptySchema"
        );
    }

    #[test]
    fn infer_record_with_no_fields_returns_error() {
        let records = [Record::new(1), Record::new(2)];
        let result = SchemaInferrer::new().infer(&records);
        assert!(
            matches!(result, Err(ZerError::EmptySchema)),
            "records with no fields must return EmptySchema"
        );
    }

    #[test]
    fn infer_handles_null_values_gracefully() {
        // Even ids carry a date string, odd ids carry an explicit null.
        let records: Vec<Record> = (0..10u64)
            .map(|id| {
                let value = if id % 2 == 0 {
                    FieldValue::Text("2024-01-01".into())
                } else {
                    FieldValue::Null
                };
                Record::new(id).insert("col", value)
            })
            .collect();

        let schema = SchemaInferrer::new().infer(&records).unwrap();
        assert_eq!(schema.len(), 1);
    }

    #[test]
    fn infer_field_names_sorted_deterministically() {
        let records = vec![text_record(1, &[("zzz", "a"), ("aaa", "b"), ("mmm", "c")])];
        let schema = SchemaInferrer::new().infer(&records).unwrap();
        let names: Vec<&str> = schema
            .fields
            .iter()
            .map(|field| field.name.as_str())
            .collect();
        assert_eq!(names, vec!["aaa", "mmm", "zzz"]);
    }
}