1use crate::parser::{Parser, SqlDialect, StatementType};
7use crate::redactor::config::RedactConfig;
8use crate::redactor::StrategyKind;
9use crate::schema::SchemaBuilder;
10use std::fs::File;
11use std::io::Write;
12
/// Outcome of inspecting one table column for likely PII.
///
/// One value is produced per column of every table found in the analysed
/// schema, whether or not a redaction strategy could be suggested for it.
#[derive(Debug)]
pub struct ColumnAnalysis {
    /// Name of the table that owns the column.
    pub table: String,
    /// Name of the column.
    pub column: String,
    /// Column type rendered as text (the `Debug` form of the schema's column type).
    pub column_type: String,
    /// Suggested redaction strategy, or `None` when no PII pattern matched.
    pub suggested_strategy: Option<StrategyKind>,
    /// How much trust to place in `suggested_strategy`.
    pub confidence: Confidence,
}
22
/// Confidence level attached to a PII detection result.
///
/// Equality on this fieldless enum is total, so `Eq` is derived alongside
/// `PartialEq`; this lets the type be used wherever a full equivalence
/// relation is required (e.g. as part of a map key).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Confidence {
    /// Strong match; the generated config carries no review comment.
    High,
    /// Plausible match; the generated config is annotated "Medium confidence".
    Medium,
    /// Weak match; the generated config is annotated "Low confidence - review".
    Low,
    /// No PII pattern matched, so there is no suggestion to rate.
    None,
}
31
32impl Confidence {
33 fn as_comment(&self) -> &'static str {
34 match self {
35 Confidence::High => "",
36 Confidence::Medium => " # Medium confidence",
37 Confidence::Low => " # Low confidence - review",
38 Confidence::None => "",
39 }
40 }
41}
42
43pub fn generate_config(config: &RedactConfig) -> anyhow::Result<()> {
45 let analyses = analyze_for_config(&config.input, config.dialect)?;
46
47 let output_path = config.output.clone().unwrap_or_else(|| {
49 let mut path = config.input.clone();
50 path.set_extension("redact.yaml");
51 path
52 });
53
54 generate_config_yaml(&analyses, &output_path)?;
55
56 eprintln!("Generated config: {:?}", output_path);
57 eprintln!(
58 "Found {} columns with potential PII",
59 analyses
60 .iter()
61 .filter(|a| a.suggested_strategy.is_some())
62 .count()
63 );
64
65 Ok(())
66}
67
68fn analyze_for_config(
70 input: &std::path::Path,
71 dialect: SqlDialect,
72) -> anyhow::Result<Vec<ColumnAnalysis>> {
73 let file = File::open(input)?;
74 let mut parser = Parser::with_dialect(file, 64 * 1024, dialect);
75 let mut builder = SchemaBuilder::new();
76
77 while let Some(stmt) = parser.read_statement()? {
79 let (stmt_type, _table_name) =
80 Parser::<&[u8]>::parse_statement_with_dialect(&stmt, dialect);
81
82 if stmt_type == StatementType::CreateTable {
83 let stmt_str = String::from_utf8_lossy(&stmt);
84 builder.parse_create_table(&stmt_str);
85 }
86 }
87
88 let schema = builder.build();
89
90 let mut analyses = Vec::new();
92 for table in schema.iter() {
93 for col in &table.columns {
94 let (strategy, confidence) = detect_pii(&col.name, &format!("{:?}", col.col_type));
95 analyses.push(ColumnAnalysis {
96 table: table.name.clone(),
97 column: col.name.clone(),
98 column_type: format!("{:?}", col.col_type),
99 suggested_strategy: strategy,
100 confidence,
101 });
102 }
103 }
104
105 Ok(analyses)
106}
107
108fn detect_pii(column_name: &str, _column_type: &str) -> (Option<StrategyKind>, Confidence) {
110 let name = column_name.to_lowercase();
111
112 if name.contains("email") {
114 return (
115 Some(StrategyKind::Hash {
116 preserve_domain: true,
117 }),
118 Confidence::High,
119 );
120 }
121 if name.contains("password") || name.contains("passwd") {
122 return (
123 Some(StrategyKind::Constant {
124 value: "$2b$10$REDACTED".to_string(),
125 }),
126 Confidence::High,
127 );
128 }
129 if name.contains("ssn") || name.contains("social_security") {
130 return (Some(StrategyKind::Null), Confidence::High);
131 }
132 if name.contains("tax_id") || name == "tin" {
133 return (Some(StrategyKind::Null), Confidence::High);
134 }
135 if name.contains("credit_card") || name.starts_with("cc_") {
136 return (
137 Some(StrategyKind::Mask {
138 pattern: "****-****-****-XXXX".to_string(),
139 }),
140 Confidence::High,
141 );
142 }
143
144 if name.contains("phone") || name.contains("mobile") || name.contains("cell") {
146 return (
147 Some(StrategyKind::Fake {
148 generator: "phone".to_string(),
149 }),
150 Confidence::Medium,
151 );
152 }
153 if name == "first_name" || name == "fname" {
154 return (
155 Some(StrategyKind::Fake {
156 generator: "first_name".to_string(),
157 }),
158 Confidence::High,
159 );
160 }
161 if name == "last_name" || name == "lname" || name.contains("surname") {
162 return (
163 Some(StrategyKind::Fake {
164 generator: "last_name".to_string(),
165 }),
166 Confidence::High,
167 );
168 }
169 if (name.contains("name") && !name.contains("username") && name != "name")
170 || name == "full_name"
171 {
172 return (
173 Some(StrategyKind::Fake {
174 generator: "name".to_string(),
175 }),
176 Confidence::Medium,
177 );
178 }
179 if name.contains("address") || name.contains("street") {
180 return (
181 Some(StrategyKind::Fake {
182 generator: "address".to_string(),
183 }),
184 Confidence::Medium,
185 );
186 }
187 if name == "city" {
188 return (
189 Some(StrategyKind::Fake {
190 generator: "city".to_string(),
191 }),
192 Confidence::Medium,
193 );
194 }
195 if name.contains("zip") || name.contains("postal") {
196 return (
197 Some(StrategyKind::Fake {
198 generator: "zip".to_string(),
199 }),
200 Confidence::Medium,
201 );
202 }
203 if name.contains("ip_address") || name == "ip_addr" || name == "ip" {
204 return (
205 Some(StrategyKind::Fake {
206 generator: "ip".to_string(),
207 }),
208 Confidence::Medium,
209 );
210 }
211 if name.contains("birth") || name == "dob" || name.contains("date_of_birth") {
212 return (
213 Some(StrategyKind::Fake {
214 generator: "date".to_string(),
215 }),
216 Confidence::Medium,
217 );
218 }
219
220 if name.contains("company") || name.contains("organization") {
222 return (
223 Some(StrategyKind::Fake {
224 generator: "company".to_string(),
225 }),
226 Confidence::Low,
227 );
228 }
229
230 (None, Confidence::None)
231}
232
233fn generate_config_yaml(
235 analyses: &[ColumnAnalysis],
236 output: &std::path::Path,
237) -> anyhow::Result<()> {
238 let mut yaml = String::new();
239
240 yaml.push_str("# sql-splitter redact configuration\n");
242 yaml.push_str("# Generated by: sql-splitter redact <input> --generate-config\n");
243 yaml.push_str("#\n");
244 yaml.push_str("# Review and modify this file before running redaction.\n");
245 yaml.push_str("# See: https://github.com/helgesverre/sql-splitter#redact-config\n");
246 yaml.push('\n');
247
248 yaml.push_str("# Random seed for reproducible redaction (optional)\n");
250 yaml.push_str("# seed: 12345\n\n");
251
252 yaml.push_str("# Locale for fake data generation\n");
254 yaml.push_str("# Supported: en, de_de, fr_fr, zh_cn, zh_tw, ja_jp, pt_br, ar_sa\n");
255 yaml.push_str("locale: en\n\n");
256
257 yaml.push_str("# Default strategy for columns not matching any rule\n");
259 yaml.push_str("defaults:\n");
260 yaml.push_str(" strategy: skip\n\n");
261
262 yaml.push_str("# Redaction rules (processed in order, first match wins)\n");
264 yaml.push_str("rules:\n");
265
266 let mut by_table: std::collections::BTreeMap<&str, Vec<&ColumnAnalysis>> =
268 std::collections::BTreeMap::new();
269 for analysis in analyses {
270 by_table.entry(&analysis.table).or_default().push(analysis);
271 }
272
273 for (table, columns) in by_table {
274 yaml.push_str(&format!("\n # --- Table: {} ---\n", table));
275
276 for col in columns {
277 if let Some(ref strategy) = col.suggested_strategy {
278 let confidence_note = col.confidence.as_comment();
279
280 yaml.push_str(&format!(" - column: \"{}.{}\"\n", table, col.column));
281
282 match strategy {
283 StrategyKind::Null => {
284 yaml.push_str(" strategy: null\n");
285 }
286 StrategyKind::Constant { value } => {
287 yaml.push_str(" strategy: constant\n");
288 yaml.push_str(&format!(" value: \"{}\"\n", value));
289 }
290 StrategyKind::Hash { preserve_domain } => {
291 yaml.push_str(" strategy: hash\n");
292 if *preserve_domain {
293 yaml.push_str(" preserve_domain: true\n");
294 }
295 }
296 StrategyKind::Mask { pattern } => {
297 yaml.push_str(" strategy: mask\n");
298 yaml.push_str(&format!(" pattern: \"{}\"\n", pattern));
299 }
300 StrategyKind::Fake { generator } => {
301 yaml.push_str(" strategy: fake\n");
302 yaml.push_str(&format!(" generator: {}\n", generator));
303 }
304 StrategyKind::Shuffle => {
305 yaml.push_str(" strategy: shuffle\n");
306 }
307 StrategyKind::Skip => {
308 yaml.push_str(" strategy: skip\n");
309 }
310 }
311
312 if !confidence_note.is_empty() {
313 yaml.push_str(&format!(" {}\n", confidence_note.trim()));
314 }
315 } else {
316 yaml.push_str(&format!(
318 " # - column: \"{}.{}\" # No PII detected\n",
319 table, col.column
320 ));
321 yaml.push_str(" # strategy: skip\n");
322 }
323 }
324 }
325
326 yaml.push_str("\n# Tables to skip entirely (no redaction applied)\n");
328 yaml.push_str("skip_tables:\n");
329 yaml.push_str(" # - schema_migrations\n");
330 yaml.push_str(" # - ar_internal_metadata\n");
331
332 let mut file = File::create(output)?;
334 file.write_all(yaml.as_bytes())?;
335
336 Ok(())
337}
338
#[cfg(test)]
mod tests {
    use super::*;

    // Any column name containing "email" is hashed; the exact name is rated High.
    #[test]
    fn test_detect_email() {
        assert!(matches!(
            detect_pii("email", "Text"),
            (Some(StrategyKind::Hash { .. }), Confidence::High)
        ));

        assert!(matches!(
            detect_pii("user_email", "Text").0,
            Some(StrategyKind::Hash { .. })
        ));
    }

    // Password columns are replaced by a constant with High confidence.
    #[test]
    fn test_detect_password() {
        assert!(matches!(
            detect_pii("password", "Text"),
            (Some(StrategyKind::Constant { .. }), Confidence::High)
        ));
    }

    // Social security numbers are nulled with High confidence.
    #[test]
    fn test_detect_ssn() {
        assert!(matches!(
            detect_pii("ssn", "Text"),
            (Some(StrategyKind::Null), Confidence::High)
        ));
    }

    // Phone-like columns get a fake generator with Medium confidence.
    #[test]
    fn test_detect_phone() {
        assert!(matches!(
            detect_pii("phone_number", "Text"),
            (Some(StrategyKind::Fake { .. }), Confidence::Medium)
        ));
    }

    // Names without any PII keyword yield no suggestion.
    #[test]
    fn test_detect_no_pii() {
        assert!(matches!(
            detect_pii("id", "Int"),
            (None, Confidence::None)
        ));

        assert!(detect_pii("created_at", "DateTime").0.is_none());
    }
}