1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# Value-pattern sampling heuristics for SchemaInferrer.
#
# When the column name gives no signal, SchemaInferrer samples up to 50 non-null
# text values and evaluates the patterns below in order. The first pattern whose
# conditions are ALL satisfied determines the returned kind.
#
# Fields per pattern:
#   kind            — FieldKind to assign (same names as the Rust enum variants)
#   regex           — regex applied to each sample; set to "" to skip regex matching
#   threshold       — minimum fraction of samples that must match the regex (0.0–1.0)
#   unique_rate_min — optional lower bound on (unique values / total samples)
#   unique_rate_max — optional upper bound on (unique values / total samples)
#   avg_len_min     — optional lower bound on mean string length
#   avg_len_max     — optional upper bound on mean string length
#
# A pattern with regex = "" and threshold = 0.0 matches purely on statistical
# conditions (unique_rate_*, avg_len_*).
#
# The [fallback] section sets the kind returned when no pattern matches.
# NOTE(review): the table name `pattern` and the key names assigned to the
# optional statistical bounds below (unique_rate_* / avg_len_*) were
# reconstructed from the field list in the header comment. Confirm both against
# the struct that SchemaInferrer deserializes this file into.

# Four digits, then two pairs of two digits, each separated by "-" or "/"
# (e.g. 2024-01-31 or 2024/01/31).
[[pattern]]
kind = "Date"
regex = '^\d{4}[-/]\d{2}[-/]\d{2}$'
threshold = 0.8

# 7-15 characters drawn from digits, whitespace, and - + ( ).
# Listed before Numeric, so plain 7-15 digit values also satisfy this regex.
[[pattern]]
kind = "Phone"
regex = '^[\d\s\-\+\(\)]{7,15}$'
threshold = 0.8

# Unsigned integers or decimals; no sign, exponent, or thousands separators.
[[pattern]]
kind = "Numeric"
regex = '^\d+(\.\d+)?$'
threshold = 0.9

# 4-20 characters of uppercase letters, digits, and hyphens, combined with a
# uniqueness bound.
# NOTE(review): 0.9 is presumed to be a lower bound (identifiers rarely repeat).
[[pattern]]
kind = "Id"
regex = '^[A-Z0-9\-]{4,20}$'
threshold = 0.8
unique_rate_min = 0.9

# Purely statistical: regex matching is skipped (regex = "", threshold = 0.0).
# NOTE(review): presumed to mean mostly-unique (>= 0.7) and short (mean length
# <= 30) values.
[[pattern]]
kind = "Name"
regex = ""
threshold = 0.0
unique_rate_min = 0.7
avg_len_max = 30.0

# Purely statistical: regex matching is skipped (regex = "", threshold = 0.0).
# NOTE(review): presumed to mean few distinct values relative to the sample
# size (<= 0.2).
[[pattern]]
kind = "Categorical"
regex = ""
threshold = 0.0
unique_rate_max = 0.2

# Kind returned when no pattern above matches.
[fallback]
kind = "FreeText"