Skip to main content

sem_core/parser/plugins/
yaml.rs

1use crate::model::entity::{build_entity_id, SemanticEntity};
2use crate::parser::plugin::SemanticParserPlugin;
3use crate::utils::hash::content_hash;
4
/// Parser plugin that extracts the top-level keys of a YAML document as
/// semantic entities.
///
/// The type carries no state, so it is cheap to copy and can be created with
/// `YamlParserPlugin` or `YamlParserPlugin::default()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct YamlParserPlugin;
6
7impl SemanticParserPlugin for YamlParserPlugin {
8    fn id(&self) -> &str {
9        "yaml"
10    }
11
12    fn extensions(&self) -> &[&str] {
13        &[".yml", ".yaml"]
14    }
15
16    fn extract_entities(&self, content: &str, file_path: &str) -> Vec<SemanticEntity> {
17        // Extract top-level keys with proper line ranges by scanning the source text.
18        // A top-level key starts a line with no indentation (e.g. "key:" or "key: value").
19        // Its range extends until the next top-level key or end of file.
20        let lines: Vec<&str> = content.lines().collect();
21        let top_level_keys = find_top_level_keys(&lines);
22
23        if top_level_keys.is_empty() {
24            return Vec::new();
25        }
26
27        // Parse with serde_yaml for content hashing
28        let parsed: serde_yaml::Value = match serde_yaml::from_str(content) {
29            Ok(v) => v,
30            Err(_) => return Vec::new(),
31        };
32        let mapping = match parsed.as_mapping() {
33            Some(m) => m,
34            None => return Vec::new(),
35        };
36
37        // Build a lookup from key name to serialized value
38        let mut value_map: std::collections::HashMap<String, (String, bool)> =
39            std::collections::HashMap::new();
40        for (key, value) in mapping {
41            let key_str = match key.as_str() {
42                Some(s) => s.to_string(),
43                None => format!("{:?}", key),
44            };
45            let is_section = value.is_mapping() || value.is_sequence();
46            let value_str = if is_section {
47                serde_yaml::to_string(value)
48                    .unwrap_or_default()
49                    .trim()
50                    .to_string()
51            } else {
52                yaml_value_to_string(value)
53            };
54            value_map.insert(key_str, (value_str, is_section));
55        }
56
57        let mut entities = Vec::new();
58        for (i, tk) in top_level_keys.iter().enumerate() {
59            let end_line = if i + 1 < top_level_keys.len() {
60                // End just before the next top-level key (skip trailing blanks)
61                let next_start = top_level_keys[i + 1].line;
62                trim_trailing_blanks_yaml(&lines, tk.line, next_start)
63            } else {
64                // Last key: extend to end of file (skip trailing blanks)
65                trim_trailing_blanks_yaml(&lines, tk.line, lines.len() + 1)
66            };
67
68            let entity_content = lines[tk.line - 1..end_line].join("\n");
69            let (value_str, is_section) = value_map
70                .get(&tk.key)
71                .cloned()
72                .unwrap_or_else(|| (entity_content.clone(), false));
73
74            let entity_type = if is_section { "section" } else { "property" };
75
76            entities.push(SemanticEntity {
77                id: build_entity_id(file_path, entity_type, &tk.key, None),
78                file_path: file_path.to_string(),
79                entity_type: entity_type.to_string(),
80                name: tk.key.clone(),
81                parent_id: None,
82                content_hash: content_hash(&value_str),
83                structural_hash: None,
84                content: entity_content,
85                start_line: tk.line,
86                end_line,
87                metadata: None,
88            });
89        }
90
91        entities
92    }
93}
94
/// A top-level mapping key located in the raw YAML source text.
#[derive(Debug, Clone, PartialEq, Eq)]
struct TopLevelKey {
    /// Key text, with surrounding quotes removed when the key was quoted.
    key: String,
    /// 1-based number of the line on which the key appears.
    line: usize,
}

/// Find all top-level keys in the YAML source.
///
/// A top-level key is a line that starts at column 0 with something other
/// than a comment, a document marker (`---` / `...`) or a block-sequence
/// entry (`- ...`), and that contains a key/value separator colon.
/// Returned keys are in source order and carry their 1-based line number.
fn find_top_level_keys(lines: &[&str]) -> Vec<TopLevelKey> {
    lines
        .iter()
        .enumerate()
        .filter_map(|(idx, line)| {
            parse_top_level_key(line).map(|key| TopLevelKey { key, line: idx + 1 })
        })
        .collect()
}

/// Returns the key named on `line` if the line opens a top-level mapping entry.
fn parse_top_level_key(line: &str) -> Option<String> {
    // Empty lines yield `None` here; indented lines and comments are skipped.
    let first = line.chars().next()?;
    if matches!(first, ' ' | '\t' | '#') {
        return None;
    }
    // Document start / end markers.
    if line.starts_with("---") || line.starts_with("...") {
        return None;
    }
    // A block sequence nested under a key may sit at column 0
    // ("items:\n- name: a"). Such entries belong to the preceding key and are
    // not keys themselves. A plain key like `-foo` is still accepted because
    // only "-" followed by whitespace (or nothing) marks a sequence entry.
    if line == "-" || line.starts_with("- ") || line.starts_with("-\t") {
        return None;
    }

    if let Some(key) = parse_quoted_key(line) {
        return Some(key);
    }

    // Plain key: everything before the first separator colon.
    let colon_pos = find_separator_colon(line)?;
    Some(line[..colon_pos].trim())
        .filter(|key| !key.is_empty())
        .map(str::to_string)
}

/// Parses a `"key":` / `'key':` prefix and returns the text between the quotes.
///
/// Escape sequences inside the quotes are not interpreted. Returns `None`
/// when the line does not start with a quote, the closing quote is not
/// followed by a colon, or the quoted text is empty, so the caller can fall
/// back to plain-key handling.
fn parse_quoted_key(line: &str) -> Option<String> {
    let quote = line.chars().next().filter(|c| matches!(c, '"' | '\''))?;
    // Both quote characters are one byte long, so slicing at 1 is safe.
    let rest = &line[1..];
    let close = rest.find(quote)?;
    let inner = &rest[..close];
    let after = rest[close + 1..].trim_start();
    if inner.is_empty() || !after.starts_with(':') {
        return None;
    }
    Some(inner.to_string())
}

/// Byte offset of the first colon that is followed by whitespace or ends the
/// line. A colon followed by any other character is part of a plain scalar
/// (as in `a:b: c`, whose key is `a:b`).
fn find_separator_colon(line: &str) -> Option<usize> {
    line.match_indices(':').map(|(pos, _)| pos).find(|&pos| {
        line[pos + 1..]
            .chars()
            .next()
            .map_or(true, char::is_whitespace)
    })
}
125
/// Returns the 1-based number of the last non-blank line of an entity that
/// begins on line `start`, where `next_start` is the 1-based line on which
/// the following entity begins (or one past the last line of the file).
///
/// Lines consisting only of whitespace count as blank. The entity's own first
/// line is never inspected, so the result is `start` when everything after it
/// is blank.
fn trim_trailing_blanks_yaml(lines: &[&str], start: usize, next_start: usize) -> usize {
    let upper = next_start - 1;
    // Walk the candidate end lines from the bottom up and stop at the first
    // one that carries content.
    ((start + 1)..=upper)
        .rev()
        .find(|&line_no| !lines[line_no - 1].trim().is_empty())
        .unwrap_or(start.min(upper))
}
138
139fn yaml_value_to_string(value: &serde_yaml::Value) -> String {
140    match value {
141        serde_yaml::Value::String(s) => s.clone(),
142        serde_yaml::Value::Number(n) => n.to_string(),
143        serde_yaml::Value::Bool(b) => b.to_string(),
144        serde_yaml::Value::Null => "null".to_string(),
145        _ => format!("{:?}", value),
146    }
147}
148
#[cfg(test)]
mod tests {
    use super::*;

    /// Every top-level key must be reported in source order with its 1-based
    /// start and end line, and a key holding a nested mapping must be typed
    /// as a section spanning its indented children.
    #[test]
    fn test_yaml_line_positions() {
        let content = "name: my-app\nversion: 1.0.0\nscripts:\n  build: tsc\n  test: jest\ndescription: a test app\n";
        let entities = YamlParserPlugin.extract_entities(content, "config.yaml");

        // (name, start_line, end_line)
        let expected: [(&str, usize, usize); 4] = [
            ("name", 1, 1),
            ("version", 2, 2),
            ("scripts", 3, 5),
            ("description", 6, 6),
        ];

        assert_eq!(entities.len(), expected.len());
        for (entity, (name, start, end)) in entities.iter().zip(expected.iter()) {
            assert_eq!(entity.name, *name);
            assert_eq!(entity.start_line, *start);
            assert_eq!(entity.end_line, *end);
        }

        assert_eq!(entities[2].entity_type, "section");
    }
}