Skip to main content

sql_splitter/redactor/
config_generator.rs

1//! Config generator for auto-detecting PII columns.
2//!
3//! Analyzes the input file schema and suggests redaction strategies
4//! based on column names and types.
5
6use crate::parser::{Parser, SqlDialect, StatementType};
7use crate::redactor::config::RedactConfig;
8use crate::redactor::StrategyKind;
9use crate::schema::SchemaBuilder;
10use std::fs::File;
11use std::io::Write;
12
/// Column analysis result for config generation.
///
/// One value is produced for every column of every table found in the input
/// schema. It pairs the column's identity with the redaction strategy (if
/// any) that name-based PII detection suggested for it.
#[derive(Debug)]
pub struct ColumnAnalysis {
    /// Name of the table that owns the column.
    pub table: String,
    /// Name of the column within `table`.
    pub column: String,
    /// `Debug` rendering of the column's schema type.
    // Only written in this file, never read back — presumably why the
    // dead-code lint is silenced here.
    #[allow(dead_code)]
    pub column_type: String,
    /// Strategy suggested by PII detection, or `None` when no pattern matched.
    pub suggested_strategy: Option<StrategyKind>,
    /// Confidence attached to the suggestion.
    pub confidence: Confidence,
}
23
/// Confidence level for PII detection.
///
/// Reported alongside every suggested strategy so that the generated config
/// can flag suggestions that deserve a manual review.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Confidence {
    /// The column name matched a pattern that is almost certainly PII.
    High,
    /// The column name matched a pattern that is usually PII.
    Medium,
    /// The column name matched a weak pattern; the suggestion needs review.
    Low,
    /// No pattern matched; there is no suggestion.
    None,
}

impl Confidence {
    /// Returns the trailing YAML comment that annotates a suggestion of this
    /// confidence, or an empty string when no annotation is wanted.
    ///
    /// `High` needs no caveat and `None` has no suggestion to annotate, so
    /// both map to the empty string.
    fn as_comment(&self) -> &'static str {
        match self {
            Confidence::High | Confidence::None => "",
            Confidence::Medium => "  # Medium confidence",
            Confidence::Low => "  # Low confidence - review",
        }
    }
}
43
44/// Generate a YAML config file by analyzing the input
45pub fn generate_config(config: &RedactConfig) -> anyhow::Result<()> {
46    let analyses = analyze_for_config(&config.input, config.dialect)?;
47
48    // Determine output path
49    let output_path = config.output.clone().unwrap_or_else(|| {
50        let mut path = config.input.clone();
51        path.set_extension("redact.yaml");
52        path
53    });
54
55    generate_config_yaml(&analyses, &output_path)?;
56
57    eprintln!("Generated config: {:?}", output_path);
58    eprintln!(
59        "Found {} columns with potential PII",
60        analyses
61            .iter()
62            .filter(|a| a.suggested_strategy.is_some())
63            .count()
64    );
65
66    Ok(())
67}
68
69/// Analyze input file for PII columns
70fn analyze_for_config(
71    input: &std::path::Path,
72    dialect: SqlDialect,
73) -> anyhow::Result<Vec<ColumnAnalysis>> {
74    let file = File::open(input)?;
75    let mut parser = Parser::with_dialect(file, 64 * 1024, dialect);
76    let mut builder = SchemaBuilder::new();
77
78    // Build schema
79    while let Some(stmt) = parser.read_statement()? {
80        let (stmt_type, _table_name) =
81            Parser::<&[u8]>::parse_statement_with_dialect(&stmt, dialect);
82
83        if stmt_type == StatementType::CreateTable {
84            let stmt_str = String::from_utf8_lossy(&stmt);
85            builder.parse_create_table(&stmt_str);
86        }
87    }
88
89    let schema = builder.build();
90
91    // Analyze each column
92    let mut analyses = Vec::new();
93    for table in schema.iter() {
94        for col in &table.columns {
95            let (strategy, confidence) = detect_pii(&col.name, &format!("{:?}", col.col_type));
96            analyses.push(ColumnAnalysis {
97                table: table.name.clone(),
98                column: col.name.clone(),
99                column_type: format!("{:?}", col.col_type),
100                suggested_strategy: strategy,
101                confidence,
102            });
103        }
104    }
105
106    Ok(analyses)
107}
108
109/// Detect PII based on column name patterns
110fn detect_pii(column_name: &str, _column_type: &str) -> (Option<StrategyKind>, Confidence) {
111    let name = column_name.to_lowercase();
112
113    // High confidence patterns
114    if name.contains("email") {
115        return (
116            Some(StrategyKind::Hash {
117                preserve_domain: true,
118            }),
119            Confidence::High,
120        );
121    }
122    if name.contains("password") || name.contains("passwd") {
123        return (
124            Some(StrategyKind::Constant {
125                value: "$2b$10$REDACTED".to_string(),
126            }),
127            Confidence::High,
128        );
129    }
130    if name.contains("ssn") || name.contains("social_security") {
131        return (Some(StrategyKind::Null), Confidence::High);
132    }
133    if name.contains("tax_id") || name == "tin" {
134        return (Some(StrategyKind::Null), Confidence::High);
135    }
136    if name.contains("credit_card") || name.starts_with("cc_") {
137        return (
138            Some(StrategyKind::Mask {
139                pattern: "****-****-****-XXXX".to_string(),
140            }),
141            Confidence::High,
142        );
143    }
144
145    // Medium confidence patterns
146    if name.contains("phone") || name.contains("mobile") || name.contains("cell") {
147        return (
148            Some(StrategyKind::Fake {
149                generator: "phone".to_string(),
150            }),
151            Confidence::Medium,
152        );
153    }
154    if name == "first_name" || name == "fname" {
155        return (
156            Some(StrategyKind::Fake {
157                generator: "first_name".to_string(),
158            }),
159            Confidence::High,
160        );
161    }
162    if name == "last_name" || name == "lname" || name.contains("surname") {
163        return (
164            Some(StrategyKind::Fake {
165                generator: "last_name".to_string(),
166            }),
167            Confidence::High,
168        );
169    }
170    if (name.contains("name") && !name.contains("username") && name != "name")
171        || name == "full_name"
172    {
173        return (
174            Some(StrategyKind::Fake {
175                generator: "name".to_string(),
176            }),
177            Confidence::Medium,
178        );
179    }
180    if name.contains("address") || name.contains("street") {
181        return (
182            Some(StrategyKind::Fake {
183                generator: "address".to_string(),
184            }),
185            Confidence::Medium,
186        );
187    }
188    if name == "city" {
189        return (
190            Some(StrategyKind::Fake {
191                generator: "city".to_string(),
192            }),
193            Confidence::Medium,
194        );
195    }
196    if name.contains("zip") || name.contains("postal") {
197        return (
198            Some(StrategyKind::Fake {
199                generator: "zip".to_string(),
200            }),
201            Confidence::Medium,
202        );
203    }
204    if name.contains("ip_address") || name == "ip_addr" || name == "ip" {
205        return (
206            Some(StrategyKind::Fake {
207                generator: "ip".to_string(),
208            }),
209            Confidence::Medium,
210        );
211    }
212    if name.contains("birth") || name == "dob" || name.contains("date_of_birth") {
213        return (
214            Some(StrategyKind::Fake {
215                generator: "date".to_string(),
216            }),
217            Confidence::Medium,
218        );
219    }
220
221    // Low confidence patterns
222    if name.contains("company") || name.contains("organization") {
223        return (
224            Some(StrategyKind::Fake {
225                generator: "company".to_string(),
226            }),
227            Confidence::Low,
228        );
229    }
230
231    (None, Confidence::None)
232}
233
234/// Generate the YAML config file
235fn generate_config_yaml(
236    analyses: &[ColumnAnalysis],
237    output: &std::path::Path,
238) -> anyhow::Result<()> {
239    let mut yaml = String::new();
240
241    // Header
242    yaml.push_str("# sql-splitter redact configuration\n");
243    yaml.push_str("# Generated by: sql-splitter redact <input> --generate-config\n");
244    yaml.push_str("#\n");
245    yaml.push_str("# Review and modify this file before running redaction.\n");
246    yaml.push_str("# See: https://github.com/helgesverre/sql-splitter#redact-config\n");
247    yaml.push('\n');
248
249    // Seed
250    yaml.push_str("# Random seed for reproducible redaction (optional)\n");
251    yaml.push_str("# seed: 12345\n\n");
252
253    // Locale
254    yaml.push_str("# Locale for fake data generation\n");
255    yaml.push_str("# Supported: en, de_de, fr_fr, zh_cn, zh_tw, ja_jp, pt_br, ar_sa\n");
256    yaml.push_str("locale: en\n\n");
257
258    // Defaults
259    yaml.push_str("# Default strategy for columns not matching any rule\n");
260    yaml.push_str("defaults:\n");
261    yaml.push_str("  strategy: skip\n\n");
262
263    // Rules - grouped by table
264    yaml.push_str("# Redaction rules (processed in order, first match wins)\n");
265    yaml.push_str("rules:\n");
266
267    // Group by table
268    let mut by_table: std::collections::BTreeMap<&str, Vec<&ColumnAnalysis>> =
269        std::collections::BTreeMap::new();
270    for analysis in analyses {
271        by_table.entry(&analysis.table).or_default().push(analysis);
272    }
273
274    for (table, columns) in by_table {
275        yaml.push_str(&format!("\n  # --- Table: {} ---\n", table));
276
277        for col in columns {
278            if let Some(ref strategy) = col.suggested_strategy {
279                let confidence_note = col.confidence.as_comment();
280
281                yaml.push_str(&format!("  - column: \"{}.{}\"\n", table, col.column));
282
283                match strategy {
284                    StrategyKind::Null => {
285                        yaml.push_str("    strategy: null\n");
286                    }
287                    StrategyKind::Constant { value } => {
288                        yaml.push_str("    strategy: constant\n");
289                        yaml.push_str(&format!("    value: \"{}\"\n", value));
290                    }
291                    StrategyKind::Hash { preserve_domain } => {
292                        yaml.push_str("    strategy: hash\n");
293                        if *preserve_domain {
294                            yaml.push_str("    preserve_domain: true\n");
295                        }
296                    }
297                    StrategyKind::Mask { pattern } => {
298                        yaml.push_str("    strategy: mask\n");
299                        yaml.push_str(&format!("    pattern: \"{}\"\n", pattern));
300                    }
301                    StrategyKind::Fake { generator } => {
302                        yaml.push_str("    strategy: fake\n");
303                        yaml.push_str(&format!("    generator: {}\n", generator));
304                    }
305                    StrategyKind::Shuffle => {
306                        yaml.push_str("    strategy: shuffle\n");
307                    }
308                    StrategyKind::Skip => {
309                        yaml.push_str("    strategy: skip\n");
310                    }
311                }
312
313                if !confidence_note.is_empty() {
314                    yaml.push_str(&format!("   {}\n", confidence_note.trim()));
315                }
316            } else {
317                // Columns without suggestion - comment out
318                yaml.push_str(&format!(
319                    "  # - column: \"{}.{}\"  # No PII detected\n",
320                    table, col.column
321                ));
322                yaml.push_str("  #   strategy: skip\n");
323            }
324        }
325    }
326
327    // Skip tables
328    yaml.push_str("\n# Tables to skip entirely (no redaction applied)\n");
329    yaml.push_str("skip_tables:\n");
330    yaml.push_str("  # - schema_migrations\n");
331    yaml.push_str("  # - ar_internal_metadata\n");
332
333    // Write file
334    let mut file = File::create(output)?;
335    file.write_all(yaml.as_bytes())?;
336
337    Ok(())
338}
339
#[cfg(test)]
mod tests {
    use super::*;

    /// Any column whose name contains `email` is hashed; an exact `email`
    /// match is reported with high confidence.
    #[test]
    fn test_detect_email() {
        assert!(matches!(
            detect_pii("email", "Text"),
            (Some(StrategyKind::Hash { .. }), Confidence::High)
        ));

        assert!(matches!(
            detect_pii("user_email", "Text").0,
            Some(StrategyKind::Hash { .. })
        ));
    }

    /// Password columns are replaced by a constant with high confidence.
    #[test]
    fn test_detect_password() {
        assert!(matches!(
            detect_pii("password", "Text"),
            (Some(StrategyKind::Constant { .. }), Confidence::High)
        ));
    }

    /// SSN columns are nulled out with high confidence.
    #[test]
    fn test_detect_ssn() {
        assert!(matches!(
            detect_pii("ssn", "Text"),
            (Some(StrategyKind::Null), Confidence::High)
        ));
    }

    /// Phone columns get a fake value with medium confidence.
    #[test]
    fn test_detect_phone() {
        assert!(matches!(
            detect_pii("phone_number", "Text"),
            (Some(StrategyKind::Fake { .. }), Confidence::Medium)
        ));
    }

    /// Columns that match no pattern yield no suggestion.
    #[test]
    fn test_detect_no_pii() {
        assert!(matches!(
            detect_pii("id", "Int"),
            (None, Confidence::None)
        ));

        assert!(detect_pii("created_at", "DateTime").0.is_none());
    }
}