Skip to main content

keyhog_scanner/structured/
parsers.rs

1use super::ExtractedPair;
2
3/// Parse KEY=VALUE lines from an .env file.
4pub fn parse_env(text: &str) -> Vec<ExtractedPair> {
5    let mut pairs = Vec::new();
6    for (line_idx, line) in text.lines().enumerate() {
7        let trimmed = line.trim();
8        if trimmed.is_empty() || trimmed.starts_with('#') {
9            continue;
10        }
11        let after_export = trimmed.strip_prefix("export ").unwrap_or(trimmed);
12        if let Some((key, value)) = after_export.split_once('=') {
13            let key = key.trim();
14            let value = value.trim();
15            if key.is_empty() {
16                continue;
17            }
18            let unquoted = strip_quotes(value);
19            pairs.push(ExtractedPair {
20                context: key.to_string(),
21                value: unquoted.to_string(),
22                line: line_idx + 1,
23            });
24        }
25    }
26    pairs
27}
28
/// Remove one pair of matching surrounding quotes (`"…"` or `'…'`) from `s`.
///
/// The input is returned unchanged when it is not wrapped in the same
/// quote character on both ends, including a lone quote character.
fn strip_quotes(s: &str) -> &str {
    // `strip_suffix` runs on what is left after the prefix is removed, so a
    // single quote character cannot serve as both opener and closer.
    let unwrap_with =
        |quote: char| s.strip_prefix(quote).and_then(|rest| rest.strip_suffix(quote));
    unwrap_with('"')
        .or_else(|| unwrap_with('\''))
        .unwrap_or(s)
}
39
40/// Parse a Kubernetes Secret YAML and decode base64 values under `data:`.
41pub fn parse_k8s_secret(text: &str) -> Vec<ExtractedPair> {
42    let mut pairs = Vec::new();
43    let value: serde_yaml::Value = match serde_yaml::from_str(text) {
44        Ok(v) => v,
45        Err(error) => {
46            tracing::debug!(target: "keyhog::structured", %error, "k8s secret YAML parse failed");
47            return pairs;
48        }
49    };
50
51    if let Some(serde_yaml::Value::Mapping(map)) = value.get("data") {
52        for (k, v) in map {
53            let key = k.as_str().unwrap_or_default();
54            let encoded = v.as_str().unwrap_or_default();
55            if key.is_empty() || encoded.is_empty() {
56                continue;
57            }
58            let decoded = match keyhog_core::encoding::decode_standard_base64(encoded) {
59                Ok(bytes) => String::from_utf8_lossy(&bytes).into_owned(),
60                Err(_) => continue,
61            };
62            let line = find_line_number(text, encoded).unwrap_or(1);
63            pairs.push(ExtractedPair {
64                context: key.to_string(),
65                value: decoded,
66                line,
67            });
68        }
69    }
70
71    if let Some(serde_yaml::Value::Mapping(map)) = value.get("stringData") {
72        for (k, v) in map {
73            let key = k.as_str().unwrap_or_default();
74            let secret_value = v.as_str().unwrap_or_default().to_string();
75            if key.is_empty() {
76                continue;
77            }
78            let line = find_line_number(text, key).unwrap_or(1);
79            pairs.push(ExtractedPair {
80                context: key.to_string(),
81                value: secret_value,
82                line,
83            });
84        }
85    }
86
87    pairs
88}
89
90/// Parse docker-compose.yml environment blocks.
91pub fn parse_docker_compose(text: &str) -> Vec<ExtractedPair> {
92    let mut pairs = Vec::new();
93    let value: serde_yaml::Value = match serde_yaml::from_str(text) {
94        Ok(v) => v,
95        Err(error) => {
96            tracing::debug!(target: "keyhog::structured", %error, "docker-compose YAML parse failed");
97            return pairs;
98        }
99    };
100    find_environment_pairs(&value, text, &mut pairs, 0);
101    pairs
102}
103
104/// Cap recursion depth on adversarial YAML - same threat as
105/// [`MAX_TFSTATE_DEPTH`] for JSON. Real docker-compose schemas nest
106/// ~6 levels deep (`services.<name>.environment.<list>`); 256 leaves
107/// the policy permissive but guards against a malicious YAML that
108/// embeds deep `services:` chains to stack-overflow the scanner.
109const MAX_COMPOSE_DEPTH: usize = 256;
110
111fn find_environment_pairs(
112    value: &serde_yaml::Value,
113    text: &str,
114    pairs: &mut Vec<ExtractedPair>,
115    depth: usize,
116) {
117    if depth >= MAX_COMPOSE_DEPTH {
118        return;
119    }
120    match value {
121        serde_yaml::Value::Mapping(map) => {
122            for (k, v) in map {
123                if k.as_str() == Some("environment") {
124                    extract_environment_block(v, text, pairs);
125                } else {
126                    find_environment_pairs(v, text, pairs, depth + 1);
127                }
128            }
129        }
130        serde_yaml::Value::Sequence(seq) => {
131            for v in seq {
132                find_environment_pairs(v, text, pairs, depth + 1);
133            }
134        }
135        _ => {}
136    }
137}
138
139fn extract_environment_block(
140    value: &serde_yaml::Value,
141    text: &str,
142    pairs: &mut Vec<ExtractedPair>,
143) {
144    match value {
145        serde_yaml::Value::Mapping(map) => {
146            for (k, v) in map {
147                let key = k.as_str().unwrap_or_default();
148                let val = v.as_str().unwrap_or_default().to_string();
149                if key.is_empty() {
150                    continue;
151                }
152                let line = find_line_number(text, key).unwrap_or(1);
153                pairs.push(ExtractedPair {
154                    context: key.to_string(),
155                    value: val,
156                    line,
157                });
158            }
159        }
160        serde_yaml::Value::Sequence(seq) => {
161            for item in seq {
162                if let Some(s) = item.as_str() {
163                    if let Some((key, val)) = s.split_once('=') {
164                        // A leading `=` (e.g. `=secretvalue`) produces an
165                        // empty key - that's malformed compose and the empty
166                        // context would be useless downstream. Skip in line
167                        // with the k8s parser's empty-key policy.
168                        if key.is_empty() {
169                            continue;
170                        }
171                        let line = find_line_number(text, s).unwrap_or(1);
172                        pairs.push(ExtractedPair {
173                            context: key.to_string(),
174                            value: val.to_string(),
175                            line,
176                        });
177                    }
178                }
179            }
180        }
181        _ => {}
182    }
183}
184
185/// Parse Terraform state JSON and recursively extract `value` fields.
186pub fn parse_tfstate(text: &str) -> Vec<ExtractedPair> {
187    let mut pairs = Vec::new();
188    let value: serde_json::Value = match serde_json::from_str(text) {
189        Ok(v) => v,
190        Err(error) => {
191            tracing::debug!(target: "keyhog::structured", %error, "tfstate JSON parse failed");
192            return pairs;
193        }
194    };
195    extract_tfstate_values(&value, text, &mut pairs, 0);
196    pairs
197}
198
199/// Cap recursion depth on adversarial JSON. A 2 MiB document of
200/// nothing but `[[[...]]]` can nest >500k levels deep - beyond the
201/// 8 MiB default stack of most Linux threads. 256 is enough for any
202/// real Terraform statefile (the deepest natural nesting in the
203/// schema is ~12 levels) but bails before stack overflow.
204const MAX_TFSTATE_DEPTH: usize = 256;
205
206fn extract_tfstate_values(
207    value: &serde_json::Value,
208    text: &str,
209    pairs: &mut Vec<ExtractedPair>,
210    depth: usize,
211) {
212    if depth >= MAX_TFSTATE_DEPTH {
213        return;
214    }
215    match value {
216        serde_json::Value::Object(map) => {
217            for (k, v) in map {
218                if k == "value" {
219                    let val_str = match v {
220                        serde_json::Value::String(s) => s.clone(),
221                        serde_json::Value::Number(n) => n.to_string(),
222                        serde_json::Value::Bool(b) => b.to_string(),
223                        _ => String::new(),
224                    };
225                    if !val_str.is_empty() {
226                        let line = find_line_number(text, &val_str).unwrap_or(1);
227                        pairs.push(ExtractedPair {
228                            context: "tfstate-value".to_string(),
229                            value: val_str,
230                            line,
231                        });
232                    }
233                }
234                extract_tfstate_values(v, text, pairs, depth + 1);
235            }
236        }
237        serde_json::Value::Array(arr) => {
238            for v in arr {
239                extract_tfstate_values(v, text, pairs, depth + 1);
240            }
241        }
242        _ => {}
243    }
244}
245
246/// Parse Jupyter notebook JSON and extract code cell sources.
247pub fn parse_jupyter(text: &str) -> Vec<ExtractedPair> {
248    let mut pairs = Vec::new();
249    let value: serde_json::Value = match serde_json::from_str(text) {
250        Ok(v) => v,
251        Err(error) => {
252            tracing::debug!(target: "keyhog::structured", %error, "Jupyter notebook JSON parse failed");
253            return pairs;
254        }
255    };
256    let cells = match value.get("cells") {
257        Some(serde_json::Value::Array(arr)) => arr,
258        _ => return pairs,
259    };
260    for (idx, cell) in cells.iter().enumerate() {
261        let cell_type = cell.get("cell_type").and_then(|c| c.as_str()).unwrap_or("");
262        if cell_type != "code" {
263            continue;
264        }
265        let source = match cell.get("source") {
266            Some(v) => v,
267            None => continue,
268        };
269        let (source_text, line) = match source {
270            serde_json::Value::String(s) => {
271                let line = find_line_number(text, s).unwrap_or(1);
272                (s.clone(), line)
273            }
274            serde_json::Value::Array(arr) => {
275                let parts: Vec<String> = arr
276                    .iter()
277                    .filter_map(|v| v.as_str().map(|s| s.to_string()))
278                    .collect();
279                let joined = parts.join("");
280                // The joined source contains literal `\n` characters, but
281                // the on-disk JSON encodes them as the escape sequence
282                // `\\n`. Searching for the joined whole - or even a
283                // single fragment that still ends in `\n` - therefore
284                // never matches, collapsing line attribution to 1 for
285                // every multi-string cell. Anchor on the first non-empty
286                // fragment with trailing newlines stripped: the leading
287                // bytes ARE present verbatim in the source JSON.
288                let anchor = parts
289                    .iter()
290                    .find_map(|p| {
291                        let trimmed_end = p.trim_end_matches(['\n', '\r']);
292                        if trimmed_end.is_empty() {
293                            None
294                        } else {
295                            Some(trimmed_end.to_string())
296                        }
297                    })
298                    .unwrap_or_else(|| joined.clone());
299                let line = find_line_number(text, &anchor).unwrap_or(1);
300                (joined, line)
301            }
302            _ => continue,
303        };
304        if !source_text.trim().is_empty() {
305            pairs.push(ExtractedPair {
306                context: format!("jupyter-cell-{}", idx),
307                value: source_text,
308                line,
309            });
310        }
311    }
312    pairs
313}
314
/// Return the 1-based line number on which the first occurrence of
/// `needle` starts within `text`.
///
/// Yields `None` when `needle` is empty or does not occur in `text`.
fn find_line_number(text: &str, needle: &str) -> Option<usize> {
    if needle.is_empty() {
        return None;
    }
    text.find(needle)
        // Lines are counted by the newlines that precede the match.
        .map(|offset| text[..offset].matches('\n').count() + 1)
}