1use crate::parser::{Parser, SqlDialect, StatementType};
7use crate::redactor::config::RedactConfig;
8use crate::redactor::StrategyKind;
9use crate::schema::SchemaBuilder;
10use std::fs::File;
11use std::io::Write;
12
/// Result of running the PII heuristics on a single table column.
///
/// One value is produced per column of every parsed `CREATE TABLE` statement
/// and later rendered into the generated YAML config.
#[derive(Debug)]
pub struct ColumnAnalysis {
    /// Name of the table the column belongs to.
    pub table: String,
    /// Name of the column.
    pub column: String,
    /// `Debug` rendering of the column's parsed type.
    // Not read by the code visible in this file, hence the lint exemption.
    #[allow(dead_code)]
    pub column_type: String,
    /// Redaction strategy suggested for the column, or `None` when no heuristic matched.
    pub suggested_strategy: Option<StrategyKind>,
    /// How confident the heuristic is about `suggested_strategy`.
    pub confidence: Confidence,
}
23
/// How strongly a column-name heuristic indicates that a column holds PII.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum Confidence {
    /// The name matches a pattern that almost always denotes PII.
    High,
    /// The name matches a pattern that usually denotes PII.
    Medium,
    /// The name matches a weak pattern; the suggestion should be reviewed.
    Low,
    /// No heuristic matched.
    None,
}

impl Confidence {
    /// Returns the comment text attached to a generated rule for this level.
    ///
    /// `High` and `None` yield an empty string, meaning "emit no note".
    fn as_comment(&self) -> &'static str {
        match *self {
            Confidence::Medium => " # Medium confidence",
            Confidence::Low => " # Low confidence - review",
            Confidence::High | Confidence::None => "",
        }
    }
}
43
44pub fn generate_config(config: &RedactConfig) -> anyhow::Result<()> {
46 let analyses = analyze_for_config(&config.input, config.dialect)?;
47
48 let output_path = config.output.clone().unwrap_or_else(|| {
50 let mut path = config.input.clone();
51 path.set_extension("redact.yaml");
52 path
53 });
54
55 generate_config_yaml(&analyses, &output_path)?;
56
57 eprintln!("Generated config: {:?}", output_path);
58 eprintln!(
59 "Found {} columns with potential PII",
60 analyses
61 .iter()
62 .filter(|a| a.suggested_strategy.is_some())
63 .count()
64 );
65
66 Ok(())
67}
68
69fn analyze_for_config(
71 input: &std::path::Path,
72 dialect: SqlDialect,
73) -> anyhow::Result<Vec<ColumnAnalysis>> {
74 let file = File::open(input)?;
75 let mut parser = Parser::with_dialect(file, 64 * 1024, dialect);
76 let mut builder = SchemaBuilder::new();
77
78 while let Some(stmt) = parser.read_statement()? {
80 let (stmt_type, _table_name) =
81 Parser::<&[u8]>::parse_statement_with_dialect(&stmt, dialect);
82
83 if stmt_type == StatementType::CreateTable {
84 let stmt_str = String::from_utf8_lossy(&stmt);
85 builder.parse_create_table(&stmt_str);
86 }
87 }
88
89 let schema = builder.build();
90
91 let mut analyses = Vec::new();
93 for table in schema.iter() {
94 for col in &table.columns {
95 let (strategy, confidence) = detect_pii(&col.name, &format!("{:?}", col.col_type));
96 analyses.push(ColumnAnalysis {
97 table: table.name.clone(),
98 column: col.name.clone(),
99 column_type: format!("{:?}", col.col_type),
100 suggested_strategy: strategy,
101 confidence,
102 });
103 }
104 }
105
106 Ok(analyses)
107}
108
109fn detect_pii(column_name: &str, _column_type: &str) -> (Option<StrategyKind>, Confidence) {
111 let name = column_name.to_lowercase();
112
113 if name.contains("email") {
115 return (
116 Some(StrategyKind::Hash {
117 preserve_domain: true,
118 }),
119 Confidence::High,
120 );
121 }
122 if name.contains("password") || name.contains("passwd") {
123 return (
124 Some(StrategyKind::Constant {
125 value: "$2b$10$REDACTED".to_string(),
126 }),
127 Confidence::High,
128 );
129 }
130 if name.contains("ssn") || name.contains("social_security") {
131 return (Some(StrategyKind::Null), Confidence::High);
132 }
133 if name.contains("tax_id") || name == "tin" {
134 return (Some(StrategyKind::Null), Confidence::High);
135 }
136 if name.contains("credit_card") || name.starts_with("cc_") {
137 return (
138 Some(StrategyKind::Mask {
139 pattern: "****-****-****-XXXX".to_string(),
140 }),
141 Confidence::High,
142 );
143 }
144
145 if name.contains("phone") || name.contains("mobile") || name.contains("cell") {
147 return (
148 Some(StrategyKind::Fake {
149 generator: "phone".to_string(),
150 }),
151 Confidence::Medium,
152 );
153 }
154 if name == "first_name" || name == "fname" {
155 return (
156 Some(StrategyKind::Fake {
157 generator: "first_name".to_string(),
158 }),
159 Confidence::High,
160 );
161 }
162 if name == "last_name" || name == "lname" || name.contains("surname") {
163 return (
164 Some(StrategyKind::Fake {
165 generator: "last_name".to_string(),
166 }),
167 Confidence::High,
168 );
169 }
170 if (name.contains("name") && !name.contains("username") && name != "name")
171 || name == "full_name"
172 {
173 return (
174 Some(StrategyKind::Fake {
175 generator: "name".to_string(),
176 }),
177 Confidence::Medium,
178 );
179 }
180 if name.contains("address") || name.contains("street") {
181 return (
182 Some(StrategyKind::Fake {
183 generator: "address".to_string(),
184 }),
185 Confidence::Medium,
186 );
187 }
188 if name == "city" {
189 return (
190 Some(StrategyKind::Fake {
191 generator: "city".to_string(),
192 }),
193 Confidence::Medium,
194 );
195 }
196 if name.contains("zip") || name.contains("postal") {
197 return (
198 Some(StrategyKind::Fake {
199 generator: "zip".to_string(),
200 }),
201 Confidence::Medium,
202 );
203 }
204 if name.contains("ip_address") || name == "ip_addr" || name == "ip" {
205 return (
206 Some(StrategyKind::Fake {
207 generator: "ip".to_string(),
208 }),
209 Confidence::Medium,
210 );
211 }
212 if name.contains("birth") || name == "dob" || name.contains("date_of_birth") {
213 return (
214 Some(StrategyKind::Fake {
215 generator: "date".to_string(),
216 }),
217 Confidence::Medium,
218 );
219 }
220
221 if name.contains("company") || name.contains("organization") {
223 return (
224 Some(StrategyKind::Fake {
225 generator: "company".to_string(),
226 }),
227 Confidence::Low,
228 );
229 }
230
231 (None, Confidence::None)
232}
233
234fn generate_config_yaml(
236 analyses: &[ColumnAnalysis],
237 output: &std::path::Path,
238) -> anyhow::Result<()> {
239 let mut yaml = String::new();
240
241 yaml.push_str("# sql-splitter redact configuration\n");
243 yaml.push_str("# Generated by: sql-splitter redact <input> --generate-config\n");
244 yaml.push_str("#\n");
245 yaml.push_str("# Review and modify this file before running redaction.\n");
246 yaml.push_str("# See: https://github.com/helgesverre/sql-splitter#redact-config\n");
247 yaml.push('\n');
248
249 yaml.push_str("# Random seed for reproducible redaction (optional)\n");
251 yaml.push_str("# seed: 12345\n\n");
252
253 yaml.push_str("# Locale for fake data generation\n");
255 yaml.push_str("# Supported: en, de_de, fr_fr, zh_cn, zh_tw, ja_jp, pt_br, ar_sa\n");
256 yaml.push_str("locale: en\n\n");
257
258 yaml.push_str("# Default strategy for columns not matching any rule\n");
260 yaml.push_str("defaults:\n");
261 yaml.push_str(" strategy: skip\n\n");
262
263 yaml.push_str("# Redaction rules (processed in order, first match wins)\n");
265 yaml.push_str("rules:\n");
266
267 let mut by_table: std::collections::BTreeMap<&str, Vec<&ColumnAnalysis>> =
269 std::collections::BTreeMap::new();
270 for analysis in analyses {
271 by_table.entry(&analysis.table).or_default().push(analysis);
272 }
273
274 for (table, columns) in by_table {
275 yaml.push_str(&format!("\n # --- Table: {} ---\n", table));
276
277 for col in columns {
278 if let Some(ref strategy) = col.suggested_strategy {
279 let confidence_note = col.confidence.as_comment();
280
281 yaml.push_str(&format!(" - column: \"{}.{}\"\n", table, col.column));
282
283 match strategy {
284 StrategyKind::Null => {
285 yaml.push_str(" strategy: null\n");
286 }
287 StrategyKind::Constant { value } => {
288 yaml.push_str(" strategy: constant\n");
289 yaml.push_str(&format!(" value: \"{}\"\n", value));
290 }
291 StrategyKind::Hash { preserve_domain } => {
292 yaml.push_str(" strategy: hash\n");
293 if *preserve_domain {
294 yaml.push_str(" preserve_domain: true\n");
295 }
296 }
297 StrategyKind::Mask { pattern } => {
298 yaml.push_str(" strategy: mask\n");
299 yaml.push_str(&format!(" pattern: \"{}\"\n", pattern));
300 }
301 StrategyKind::Fake { generator } => {
302 yaml.push_str(" strategy: fake\n");
303 yaml.push_str(&format!(" generator: {}\n", generator));
304 }
305 StrategyKind::Shuffle => {
306 yaml.push_str(" strategy: shuffle\n");
307 }
308 StrategyKind::Skip => {
309 yaml.push_str(" strategy: skip\n");
310 }
311 }
312
313 if !confidence_note.is_empty() {
314 yaml.push_str(&format!(" {}\n", confidence_note.trim()));
315 }
316 } else {
317 yaml.push_str(&format!(
319 " # - column: \"{}.{}\" # No PII detected\n",
320 table, col.column
321 ));
322 yaml.push_str(" # strategy: skip\n");
323 }
324 }
325 }
326
327 yaml.push_str("\n# Tables to skip entirely (no redaction applied)\n");
329 yaml.push_str("skip_tables:\n");
330 yaml.push_str(" # - schema_migrations\n");
331 yaml.push_str(" # - ar_internal_metadata\n");
332
333 let mut file = File::create(output)?;
335 file.write_all(yaml.as_bytes())?;
336
337 Ok(())
338}
339
#[cfg(test)]
mod tests {
    use super::*;

    /// Names containing "email" are hashed, with high confidence.
    #[test]
    fn test_detect_email() {
        let (plain, plain_confidence) = detect_pii("email", "Text");
        assert_eq!(plain_confidence, Confidence::High);
        assert!(matches!(plain, Some(StrategyKind::Hash { .. })));

        let prefixed = detect_pii("user_email", "Text").0;
        assert!(matches!(prefixed, Some(StrategyKind::Hash { .. })));
    }

    /// Password columns are replaced with a constant, with high confidence.
    #[test]
    fn test_detect_password() {
        let (suggestion, level) = detect_pii("password", "Text");
        assert_eq!(level, Confidence::High);
        assert!(matches!(suggestion, Some(StrategyKind::Constant { .. })));
    }

    /// SSN columns are nulled out, with high confidence.
    #[test]
    fn test_detect_ssn() {
        let (suggestion, level) = detect_pii("ssn", "Text");
        assert_eq!(level, Confidence::High);
        assert!(matches!(suggestion, Some(StrategyKind::Null)));
    }

    /// Phone columns get fake data, with medium confidence.
    #[test]
    fn test_detect_phone() {
        let (suggestion, level) = detect_pii("phone_number", "Text");
        assert_eq!(level, Confidence::Medium);
        assert!(matches!(suggestion, Some(StrategyKind::Fake { .. })));
    }

    /// Columns with no recognizable PII pattern yield no suggestion.
    #[test]
    fn test_detect_no_pii() {
        let (suggestion, level) = detect_pii("id", "Int");
        assert_eq!(level, Confidence::None);
        assert!(suggestion.is_none());

        let timestamp = detect_pii("created_at", "DateTime").0;
        assert!(timestamp.is_none());
    }
}