cloudiful-redactor 0.2.9

Structured text redaction with reversible sessions for secrets, domains, URLs, and related sensitive values.
Documentation
use crate::types::{Finding, FindingKind, FindingSource, RedactionRules};
use std::collections::HashSet;

use super::regexes::{assignment_regex, url_regex};
use super::validators::{
    is_likely_code_expression, is_plain_config_value, is_valid_domain, is_valid_email,
    is_valid_phone, looks_like_secret, normalize, trim_wrapped,
};

/// Adds a `Secret` finding for every occurrence in `text` of a secret value
/// that is already present in `findings`.
///
/// Each distinct `match_text` among the existing `FindingKind::Secret` findings
/// is searched for in `text`. An occurrence is appended only when no finding
/// with the exact same `(start, end)` byte range exists yet, which makes
/// repeated calls idempotent. Appended findings reuse the original
/// `normalized_key`, are marked `FindingSource::Rule`, and carry confidence 95.
///
/// Findings whose `match_text` is empty are ignored: an empty needle matches at
/// every character boundary and would only produce zero-length findings.
pub(crate) fn propagate_repeated_secrets(text: &str, findings: &mut Vec<Finding>) {
    // Exact ranges that already have a finding (of any kind).
    let mut known_ranges = findings
        .iter()
        .map(|finding| (finding.start, finding.end))
        .collect::<HashSet<_>>();

    // Snapshot the distinct secret values as owned strings first, because
    // `findings` is mutated while occurrences are being appended below.
    let repeated = {
        let mut seen_values = HashSet::new();
        findings
            .iter()
            .filter(|finding| finding.kind == FindingKind::Secret)
            .filter(|finding| !finding.match_text.is_empty())
            .filter(|finding| seen_values.insert(finding.match_text.as_str()))
            .map(|finding| (finding.match_text.clone(), finding.normalized_key.clone()))
            .collect::<Vec<_>>()
    };

    for (match_text, normalized_key) in repeated {
        for (start, matched) in text.match_indices(match_text.as_str()) {
            let end = start + matched.len();
            // `insert` returns false when this exact range is already covered.
            if !known_ranges.insert((start, end)) {
                continue;
            }

            findings.push(Finding {
                kind: FindingKind::Secret,
                source: FindingSource::Rule,
                match_text: match_text.clone(),
                normalized_key: normalized_key.clone(),
                confidence: 95,
                start,
                end,
            });
        }
    }
}

/// Scans `text` line by line for `key <separator> value` assignments and
/// appends a finding for every value that `contextual_kind` classifies.
///
/// Matches come from `assignment_regex()` and must expose the named groups
/// `key`, `value` and `separator`; a match missing any of them is skipped.
/// The captured value is trimmed and passed through `trim_wrapped`
/// (presumably strips wrapping quotes/brackets — confirm in `validators`);
/// an empty result is skipped. Appended findings are `FindingSource::Rule`
/// with confidence 99, and `start`/`end` are byte offsets into the whole
/// `text`, not into the line.
pub(crate) fn detect_contextual_assignments(
    text: &str,
    findings: &mut Vec<Finding>,
    rules: RedactionRules,
) {
    // Byte offset of the current line inside `text`; regex positions are
    // relative to the line, so this is added to every reported range.
    let mut line_offset = 0;

    for line in text.split_inclusive('\n') {
        findings.extend(assignment_regex().captures_iter(line).filter_map(|caps| {
            let key_capture = caps.name("key")?;
            let value_capture = caps.name("value")?;
            let separator_capture = caps.name("separator")?;

            let captured = value_capture.as_str();
            let raw_value = captured.trim();
            let value = trim_wrapped(raw_value);
            if value.is_empty() {
                return None;
            }

            // Only the first character of the separator is relevant; default to '='.
            let separator = separator_capture.as_str().chars().next().unwrap_or('=');
            let kind = contextual_kind(key_capture.as_str(), value, raw_value, separator, rules)?;

            // Locate the cleaned value inside the raw capture so the range
            // covers only the cleaned text; fall back to the capture start.
            let start = line_offset + value_capture.start() + captured.find(value).unwrap_or(0);

            Some(Finding {
                kind,
                source: FindingSource::Rule,
                match_text: value.to_string(),
                normalized_key: normalize(kind, value),
                confidence: 99,
                start,
                end: start + value.len(),
            })
        }));

        line_offset += line.len();
    }
}

/// Classifies an assignment's value based on hints in its key name.
///
/// The key is compared case-insensitively (ASCII). Checks run in this order
/// and the first one that applies wins; each is gated by its flag in `rules`:
///
/// 1. Secret-like key names delegate entirely to `contextual_secret_kind`;
///    its answer is returned even when it is `None` (no fall-through).
/// 2. `email` keys with a valid email value.
/// 3. `domain` / `host` keys with a plain config value that is a valid domain.
/// 4. `url` keys whose value matches `url_regex()`.
/// 5. `phone` keys with a valid phone value.
/// 6. `name` keys whose value has at least two whitespace-separated words.
/// 7. Otherwise, any value that `looks_like_secret` when secrets are enabled.
///
/// Returns `None` when nothing applies.
fn contextual_kind(
    key: &str,
    value: &str,
    raw_value: &str,
    separator: char,
    rules: RedactionRules,
) -> Option<FindingKind> {
    // Substrings that mark a key as holding a credential.
    const SECRET_KEY_MARKERS: [&str; 7] = [
        "secret",
        "token",
        "password",
        "passwd",
        "api_key",
        "apikey",
        "private_key",
    ];

    let key_lower = key.to_ascii_lowercase();
    let key_has = |needle: &str| key_lower.contains(needle);

    if rules.secret && SECRET_KEY_MARKERS.iter().any(|&marker| key_has(marker)) {
        return contextual_secret_kind(value, raw_value, separator);
    }

    if rules.email && key_has("email") && is_valid_email(value) {
        Some(FindingKind::Email)
    } else if rules.domain
        && (key_has("domain") || key_has("host"))
        && is_plain_config_value(raw_value)
        && is_valid_domain(value)
    {
        Some(FindingKind::Domain)
    } else if rules.url && key_has("url") && url_regex().is_match(value) {
        Some(FindingKind::Url)
    } else if rules.phone && key_has("phone") && is_valid_phone(value) {
        Some(FindingKind::Phone)
    } else if rules.person && key_has("name") && value.split_whitespace().count() >= 2 {
        Some(FindingKind::Person)
    } else if rules.secret && looks_like_secret(value) {
        // Key gave no usable hint: fall back to the shape of the value itself.
        Some(FindingKind::Secret)
    } else {
        None
    }
}

/// Decides whether the value assigned to a secret-named key is reported.
///
/// A value that `looks_like_secret` is always reported. Otherwise it is
/// reported only for `=` assignments whose raw value is a plain config value
/// and not a likely code expression. Every other separator, including `:`,
/// yields `None`.
fn contextual_secret_kind(value: &str, raw_value: &str, separator: char) -> Option<FindingKind> {
    if looks_like_secret(value) {
        return Some(FindingKind::Secret);
    }

    match separator {
        '=' if is_plain_config_value(raw_value) && !is_likely_code_expression(raw_value) => {
            Some(FindingKind::Secret)
        }
        _ => None,
    }
}

#[cfg(test)]
mod tests {
    use super::propagate_repeated_secrets;
    use crate::types::{Finding, FindingKind, FindingSource};

    /// Propagation adds the uncovered second occurrence, and running it a
    /// second time adds nothing further.
    #[test]
    fn repeated_secret_propagation_adds_missing_occurrences_once() {
        const SECRET: &str = "ABCDEF1234567890XYZ";
        let text = "token=ABCDEF1234567890XYZ token=ABCDEF1234567890XYZ";

        // Seed with the first occurrence only (bytes 6..25 of `text`).
        let seed = Finding {
            kind: FindingKind::Secret,
            source: FindingSource::Rule,
            match_text: SECRET.to_string(),
            normalized_key: SECRET.to_string(),
            confidence: 99,
            start: 6,
            end: 25,
        };
        let mut findings = vec![seed];

        for _ in 0..2 {
            propagate_repeated_secrets(text, &mut findings);
        }

        let secret_ranges: Vec<(usize, usize)> = findings
            .iter()
            .filter(|finding| finding.kind == FindingKind::Secret)
            .map(|finding| (finding.start, finding.end))
            .collect();
        assert_eq!(secret_ranges, vec![(6, 25), (32, 51)]);
    }
}