Skip to main content

sql_splitter/redactor/
config_generator.rs

//! Config generator for auto-detecting PII columns.
//!
//! Analyzes the schema of the input file and suggests a redaction strategy
//! for each column based on its name. Column types are recorded in the
//! analysis but are not yet used for detection.
5
6use crate::parser::{Parser, SqlDialect, StatementType};
7use crate::redactor::config::RedactConfig;
8use crate::redactor::StrategyKind;
9use crate::schema::SchemaBuilder;
10use std::fs::File;
11use std::io::Write;
12
/// Column analysis result for config generation.
///
/// One value is produced for every column of every table found in the
/// input schema, whether or not a redaction strategy was suggested for it.
#[derive(Debug)]
pub struct ColumnAnalysis {
    /// Name of the table the column belongs to.
    pub table: String,
    /// Name of the column as recorded in the parsed schema.
    pub column: String,
    /// `Debug` rendering of the column's schema type.
    pub column_type: String,
    /// Strategy suggested by the name-based detection, or `None` when no
    /// pattern matched the column name.
    pub suggested_strategy: Option<StrategyKind>,
    /// Confidence reported by the detection alongside `suggested_strategy`.
    pub confidence: Confidence,
}
22
/// Confidence level for PII detection.
///
/// All variants are field-less, so the type is `Copy` and implements full
/// equivalence (`Eq`) in addition to `PartialEq`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Confidence {
    /// The column name matched a pattern that is very likely PII; the
    /// generated rule carries no review note.
    High,
    /// The column name matched a pattern that is probably PII; the generated
    /// rule is annotated with a "Medium confidence" note.
    Medium,
    /// The column name matched a weak pattern; the generated rule is
    /// annotated with a "Low confidence - review" note.
    Low,
    /// No PII pattern matched the column name.
    None,
}
31
32impl Confidence {
33    fn as_comment(&self) -> &'static str {
34        match self {
35            Confidence::High => "",
36            Confidence::Medium => "  # Medium confidence",
37            Confidence::Low => "  # Low confidence - review",
38            Confidence::None => "",
39        }
40    }
41}
42
43/// Generate a YAML config file by analyzing the input
44pub fn generate_config(config: &RedactConfig) -> anyhow::Result<()> {
45    let analyses = analyze_for_config(&config.input, config.dialect)?;
46
47    // Determine output path
48    let output_path = config.output.clone().unwrap_or_else(|| {
49        let mut path = config.input.clone();
50        path.set_extension("redact.yaml");
51        path
52    });
53
54    generate_config_yaml(&analyses, &output_path)?;
55
56    eprintln!("Generated config: {:?}", output_path);
57    eprintln!(
58        "Found {} columns with potential PII",
59        analyses
60            .iter()
61            .filter(|a| a.suggested_strategy.is_some())
62            .count()
63    );
64
65    Ok(())
66}
67
68/// Analyze input file for PII columns
69fn analyze_for_config(
70    input: &std::path::Path,
71    dialect: SqlDialect,
72) -> anyhow::Result<Vec<ColumnAnalysis>> {
73    let file = File::open(input)?;
74    let mut parser = Parser::with_dialect(file, 64 * 1024, dialect);
75    let mut builder = SchemaBuilder::new();
76
77    // Build schema
78    while let Some(stmt) = parser.read_statement()? {
79        let (stmt_type, _table_name) =
80            Parser::<&[u8]>::parse_statement_with_dialect(&stmt, dialect);
81
82        if stmt_type == StatementType::CreateTable {
83            let stmt_str = String::from_utf8_lossy(&stmt);
84            builder.parse_create_table(&stmt_str);
85        }
86    }
87
88    let schema = builder.build();
89
90    // Analyze each column
91    let mut analyses = Vec::new();
92    for table in schema.iter() {
93        for col in &table.columns {
94            let (strategy, confidence) = detect_pii(&col.name, &format!("{:?}", col.col_type));
95            analyses.push(ColumnAnalysis {
96                table: table.name.clone(),
97                column: col.name.clone(),
98                column_type: format!("{:?}", col.col_type),
99                suggested_strategy: strategy,
100                confidence,
101            });
102        }
103    }
104
105    Ok(analyses)
106}
107
108/// Detect PII based on column name patterns
109fn detect_pii(column_name: &str, _column_type: &str) -> (Option<StrategyKind>, Confidence) {
110    let name = column_name.to_lowercase();
111
112    // High confidence patterns
113    if name.contains("email") {
114        return (
115            Some(StrategyKind::Hash {
116                preserve_domain: true,
117            }),
118            Confidence::High,
119        );
120    }
121    if name.contains("password") || name.contains("passwd") {
122        return (
123            Some(StrategyKind::Constant {
124                value: "$2b$10$REDACTED".to_string(),
125            }),
126            Confidence::High,
127        );
128    }
129    if name.contains("ssn") || name.contains("social_security") {
130        return (Some(StrategyKind::Null), Confidence::High);
131    }
132    if name.contains("tax_id") || name == "tin" {
133        return (Some(StrategyKind::Null), Confidence::High);
134    }
135    if name.contains("credit_card") || name.starts_with("cc_") {
136        return (
137            Some(StrategyKind::Mask {
138                pattern: "****-****-****-XXXX".to_string(),
139            }),
140            Confidence::High,
141        );
142    }
143
144    // Medium confidence patterns
145    if name.contains("phone") || name.contains("mobile") || name.contains("cell") {
146        return (
147            Some(StrategyKind::Fake {
148                generator: "phone".to_string(),
149            }),
150            Confidence::Medium,
151        );
152    }
153    if name == "first_name" || name == "fname" {
154        return (
155            Some(StrategyKind::Fake {
156                generator: "first_name".to_string(),
157            }),
158            Confidence::High,
159        );
160    }
161    if name == "last_name" || name == "lname" || name.contains("surname") {
162        return (
163            Some(StrategyKind::Fake {
164                generator: "last_name".to_string(),
165            }),
166            Confidence::High,
167        );
168    }
169    if (name.contains("name") && !name.contains("username") && name != "name")
170        || name == "full_name"
171    {
172        return (
173            Some(StrategyKind::Fake {
174                generator: "name".to_string(),
175            }),
176            Confidence::Medium,
177        );
178    }
179    if name.contains("address") || name.contains("street") {
180        return (
181            Some(StrategyKind::Fake {
182                generator: "address".to_string(),
183            }),
184            Confidence::Medium,
185        );
186    }
187    if name == "city" {
188        return (
189            Some(StrategyKind::Fake {
190                generator: "city".to_string(),
191            }),
192            Confidence::Medium,
193        );
194    }
195    if name.contains("zip") || name.contains("postal") {
196        return (
197            Some(StrategyKind::Fake {
198                generator: "zip".to_string(),
199            }),
200            Confidence::Medium,
201        );
202    }
203    if name.contains("ip_address") || name == "ip_addr" || name == "ip" {
204        return (
205            Some(StrategyKind::Fake {
206                generator: "ip".to_string(),
207            }),
208            Confidence::Medium,
209        );
210    }
211    if name.contains("birth") || name == "dob" || name.contains("date_of_birth") {
212        return (
213            Some(StrategyKind::Fake {
214                generator: "date".to_string(),
215            }),
216            Confidence::Medium,
217        );
218    }
219
220    // Low confidence patterns
221    if name.contains("company") || name.contains("organization") {
222        return (
223            Some(StrategyKind::Fake {
224                generator: "company".to_string(),
225            }),
226            Confidence::Low,
227        );
228    }
229
230    (None, Confidence::None)
231}
232
233/// Generate the YAML config file
234fn generate_config_yaml(
235    analyses: &[ColumnAnalysis],
236    output: &std::path::Path,
237) -> anyhow::Result<()> {
238    let mut yaml = String::new();
239
240    // Header
241    yaml.push_str("# sql-splitter redact configuration\n");
242    yaml.push_str("# Generated by: sql-splitter redact <input> --generate-config\n");
243    yaml.push_str("#\n");
244    yaml.push_str("# Review and modify this file before running redaction.\n");
245    yaml.push_str("# See: https://github.com/helgesverre/sql-splitter#redact-config\n");
246    yaml.push('\n');
247
248    // Seed
249    yaml.push_str("# Random seed for reproducible redaction (optional)\n");
250    yaml.push_str("# seed: 12345\n\n");
251
252    // Locale
253    yaml.push_str("# Locale for fake data generation\n");
254    yaml.push_str("# Supported: en, de_de, fr_fr, zh_cn, zh_tw, ja_jp, pt_br, ar_sa\n");
255    yaml.push_str("locale: en\n\n");
256
257    // Defaults
258    yaml.push_str("# Default strategy for columns not matching any rule\n");
259    yaml.push_str("defaults:\n");
260    yaml.push_str("  strategy: skip\n\n");
261
262    // Rules - grouped by table
263    yaml.push_str("# Redaction rules (processed in order, first match wins)\n");
264    yaml.push_str("rules:\n");
265
266    // Group by table
267    let mut by_table: std::collections::BTreeMap<&str, Vec<&ColumnAnalysis>> =
268        std::collections::BTreeMap::new();
269    for analysis in analyses {
270        by_table.entry(&analysis.table).or_default().push(analysis);
271    }
272
273    for (table, columns) in by_table {
274        yaml.push_str(&format!("\n  # --- Table: {} ---\n", table));
275
276        for col in columns {
277            if let Some(ref strategy) = col.suggested_strategy {
278                let confidence_note = col.confidence.as_comment();
279
280                yaml.push_str(&format!("  - column: \"{}.{}\"\n", table, col.column));
281
282                match strategy {
283                    StrategyKind::Null => {
284                        yaml.push_str("    strategy: null\n");
285                    }
286                    StrategyKind::Constant { value } => {
287                        yaml.push_str("    strategy: constant\n");
288                        yaml.push_str(&format!("    value: \"{}\"\n", value));
289                    }
290                    StrategyKind::Hash { preserve_domain } => {
291                        yaml.push_str("    strategy: hash\n");
292                        if *preserve_domain {
293                            yaml.push_str("    preserve_domain: true\n");
294                        }
295                    }
296                    StrategyKind::Mask { pattern } => {
297                        yaml.push_str("    strategy: mask\n");
298                        yaml.push_str(&format!("    pattern: \"{}\"\n", pattern));
299                    }
300                    StrategyKind::Fake { generator } => {
301                        yaml.push_str("    strategy: fake\n");
302                        yaml.push_str(&format!("    generator: {}\n", generator));
303                    }
304                    StrategyKind::Shuffle => {
305                        yaml.push_str("    strategy: shuffle\n");
306                    }
307                    StrategyKind::Skip => {
308                        yaml.push_str("    strategy: skip\n");
309                    }
310                }
311
312                if !confidence_note.is_empty() {
313                    yaml.push_str(&format!("   {}\n", confidence_note.trim()));
314                }
315            } else {
316                // Columns without suggestion - comment out
317                yaml.push_str(&format!(
318                    "  # - column: \"{}.{}\"  # No PII detected\n",
319                    table, col.column
320                ));
321                yaml.push_str("  #   strategy: skip\n");
322            }
323        }
324    }
325
326    // Skip tables
327    yaml.push_str("\n# Tables to skip entirely (no redaction applied)\n");
328    yaml.push_str("skip_tables:\n");
329    yaml.push_str("  # - schema_migrations\n");
330    yaml.push_str("  # - ar_internal_metadata\n");
331
332    // Write file
333    let mut file = File::create(output)?;
334    file.write_all(yaml.as_bytes())?;
335
336    Ok(())
337}
338
#[cfg(test)]
mod tests {
    use super::*;

    /// Any column mentioning "email" is hashed; the bare name is high confidence.
    #[test]
    fn test_detect_email() {
        let (bare, bare_confidence) = detect_pii("email", "Text");
        assert!(matches!(bare, Some(StrategyKind::Hash { .. })));
        assert_eq!(bare_confidence, Confidence::High);

        let prefixed = detect_pii("user_email", "Text").0;
        assert!(matches!(prefixed, Some(StrategyKind::Hash { .. })));
    }

    /// Password columns are replaced by a constant with high confidence.
    #[test]
    fn test_detect_password() {
        let (suggestion, level) = detect_pii("password", "Text");
        assert!(matches!(suggestion, Some(StrategyKind::Constant { .. })));
        assert_eq!(level, Confidence::High);
    }

    /// SSN columns are nulled out with high confidence.
    #[test]
    fn test_detect_ssn() {
        let (suggestion, level) = detect_pii("ssn", "Text");
        assert!(matches!(suggestion, Some(StrategyKind::Null)));
        assert_eq!(level, Confidence::High);
    }

    /// Phone columns get fake data with medium confidence.
    #[test]
    fn test_detect_phone() {
        let (suggestion, level) = detect_pii("phone_number", "Text");
        assert!(matches!(suggestion, Some(StrategyKind::Fake { .. })));
        assert_eq!(level, Confidence::Medium);
    }

    /// Columns matching no pattern produce no suggestion.
    #[test]
    fn test_detect_no_pii() {
        let (suggestion, level) = detect_pii("id", "Int");
        assert!(suggestion.is_none());
        assert_eq!(level, Confidence::None);

        let timestamp = detect_pii("created_at", "DateTime").0;
        assert!(timestamp.is_none());
    }
}