Skip to main content

sem_core/parser/plugins/
yaml.rs

1use crate::model::entity::{build_entity_id, SemanticEntity};
2use crate::parser::plugin::SemanticParserPlugin;
3use crate::utils::hash::content_hash;
4
5pub struct YamlParserPlugin;
6
7impl SemanticParserPlugin for YamlParserPlugin {
8    fn id(&self) -> &str {
9        "yaml"
10    }
11
12    fn extensions(&self) -> &[&str] {
13        &[".yml", ".yaml"]
14    }
15
16    fn extract_entities(&self, content: &str, file_path: &str) -> Vec<SemanticEntity> {
17        // Extract top-level keys with proper line ranges by scanning the source text.
18        // A top-level key starts a line with no indentation (e.g. "key:" or "key: value").
19        // Its range extends until the next top-level key or end of file.
20        let lines: Vec<&str> = content.lines().collect();
21        let top_level_keys = find_top_level_keys(&lines);
22
23        if top_level_keys.is_empty() {
24            // No top-level keys: treat the whole file as a single chunk so
25            // changes to comment-only or marker-only YAML files are detected.
26            if !content.trim().is_empty() {
27                return vec![SemanticEntity {
28                    id: build_entity_id(file_path, "chunk", "(document)", None),
29                    file_path: file_path.to_string(),
30                    entity_type: "chunk".to_string(),
31                    name: "(document)".to_string(),
32                    parent_id: None,
33                    content_hash: content_hash(content),
34                    structural_hash: None,
35                    content: content.to_string(),
36                    start_line: 1,
37                    end_line: lines.len(),
38                    metadata: None,
39                }];
40            }
41            return Vec::new();
42        }
43
44        // Determine entity types using serde_yaml for section vs property.
45        let section_keys: std::collections::HashSet<String> =
46            if let Ok(serde_yaml::Value::Mapping(mapping)) = serde_yaml::from_str(content) {
47                mapping
48                    .iter()
49                    .filter(|(_, v)| v.is_mapping() || v.is_sequence())
50                    .filter_map(|(k, _)| k.as_str().map(String::from))
51                    .collect()
52            } else {
53                std::collections::HashSet::new()
54            };
55
56        let mut entities = Vec::new();
57
58        // Capture preamble (comments, document markers) before the first key
59        if top_level_keys[0].line > 1 {
60            let preamble_end =
61                trim_trailing_blanks_yaml(&lines, 1, top_level_keys[0].line);
62            if preamble_end >= 1 {
63                let preamble_content = lines[..preamble_end].join("\n");
64                if !preamble_content.trim().is_empty() {
65                    entities.push(SemanticEntity {
66                        id: build_entity_id(file_path, "chunk", "(preamble)", None),
67                        file_path: file_path.to_string(),
68                        entity_type: "chunk".to_string(),
69                        name: "(preamble)".to_string(),
70                        parent_id: None,
71                        content_hash: content_hash(&preamble_content),
72                        structural_hash: None,
73                        content: preamble_content,
74                        start_line: 1,
75                        end_line: preamble_end,
76                        metadata: None,
77                    });
78                }
79            }
80        }
81
82        for (i, tk) in top_level_keys.iter().enumerate() {
83            let end_line = if i + 1 < top_level_keys.len() {
84                let next_start = top_level_keys[i + 1].line;
85                trim_trailing_blanks_yaml(&lines, tk.line, next_start)
86            } else {
87                trim_trailing_blanks_yaml(&lines, tk.line, lines.len() + 1)
88            };
89
90            let entity_content = lines[tk.line - 1..end_line].join("\n");
91            let is_section = section_keys.contains(&tk.key);
92            let entity_type = if is_section { "section" } else { "property" };
93
94            // Hash raw text so comment changes within a section are detected.
95            entities.push(SemanticEntity {
96                id: build_entity_id(file_path, entity_type, &tk.key, None),
97                file_path: file_path.to_string(),
98                entity_type: entity_type.to_string(),
99                name: tk.key.clone(),
100                parent_id: None,
101                content_hash: content_hash(&entity_content),
102                structural_hash: None,
103                content: entity_content,
104                start_line: tk.line,
105                end_line,
106                metadata: None,
107            });
108        }
109
110        entities
111    }
112}
113
/// A top-level mapping key located by scanning the raw YAML text.
#[derive(Debug, Clone, PartialEq, Eq)]
struct TopLevelKey {
    /// Key text; surrounding quotes of a quoted key are removed.
    key: String,
    /// 1-based line number on which the key appears.
    line: usize,
}

/// Find all top-level keys in the YAML source.
///
/// A top-level key is a line that starts with a non-space, non-comment,
/// non-document-marker character and contains a colon. Returned keys are in
/// source order with 1-based line numbers.
///
/// Quoted keys (`"on":`, `'it''s':`) are returned without their quotes, and a
/// colon inside a key (`"a:b": 1`, `a:b: 1`) does not end the key, so the
/// text matches the key name a YAML parser reports for the same line.
///
/// Root-level sequence items (`- name: x`) are not special-cased: the dash is
/// reported as part of the key.
fn find_top_level_keys(lines: &[&str]) -> Vec<TopLevelKey> {
    lines
        .iter()
        .enumerate()
        .filter_map(|(idx, line)| {
            parse_top_level_key(line).map(|key| TopLevelKey { key, line: idx + 1 })
        })
        .collect()
}

/// Returns the key introduced by `line`, or `None` when the line is blank,
/// indented, a comment, a document marker, or has no usable key.
fn parse_top_level_key(line: &str) -> Option<String> {
    if line.is_empty() || line.starts_with(' ') || line.starts_with('\t') {
        return None;
    }
    // Skip comments and document markers.
    if line.starts_with('#') || line.starts_with("---") || line.starts_with("...") {
        return None;
    }

    // Quoted key: the key is the quoted text, provided a colon follows the
    // closing quote. Anything else falls through to the plain-key scan.
    if let Some((key, rest)) = split_quoted_key(line) {
        if !key.is_empty() && rest.trim_start().starts_with(':') {
            return Some(key);
        }
    }

    // Plain key: YAML only treats ':' as the key/value separator when it is
    // followed by whitespace or ends the line. If no colon qualifies, fall
    // back to the first colon so such lines are still reported as before.
    let colon = line
        .match_indices(':')
        .map(|(pos, _)| pos)
        .find(|&pos| line[pos + 1..].chars().next().map_or(true, char::is_whitespace))
        .or_else(|| line.find(':'))?;

    let key = line[..colon].trim();
    if key.is_empty() {
        None
    } else {
        Some(key.to_string())
    }
}

/// Splits a line that begins with a quoted scalar into the unquoted text and
/// the remainder after the closing quote.
///
/// Returns `None` when the line does not start with `"` or `'`, or when the
/// closing quote is missing. In single quotes `''` stands for one quote; in
/// double quotes only `\"` and `\\` are decoded and every other escape
/// sequence is kept verbatim.
fn split_quoted_key(line: &str) -> Option<(String, &str)> {
    let mut chars = line.char_indices();
    let (_, quote) = chars.next()?;
    if quote != '"' && quote != '\'' {
        return None;
    }

    let mut key = String::new();
    while let Some((idx, ch)) = chars.next() {
        if ch == quote {
            // The quote characters are ASCII, so `idx + 1` is a char boundary.
            if quote == '\'' && line[idx + 1..].starts_with('\'') {
                chars.next();
                key.push('\'');
                continue;
            }
            return Some((key, &line[idx + 1..]));
        }
        if quote == '"' && ch == '\\' {
            match chars.next() {
                Some((_, esc)) if esc == '"' || esc == '\\' => key.push(esc),
                Some((_, other)) => {
                    key.push('\\');
                    key.push(other);
                }
                None => return None,
            }
            continue;
        }
        key.push(ch);
    }
    None
}
144
/// Returns the 1-based last line of the range that starts at line `start` and
/// is followed by a range starting at line `next_start`, with trailing
/// blank (empty or whitespace-only) lines dropped.
///
/// The result never shrinks below `start` because of trimming; when
/// `next_start - 1` is already below `start`, that value is returned as is.
fn trim_trailing_blanks_yaml(lines: &[&str], start: usize, next_start: usize) -> usize {
    let upper = next_start - 1;
    // `idx` is the 0-based index of 1-based line `idx + 1`, so this walks
    // lines `upper` down to `start + 1` looking for the last non-blank one.
    (start..upper)
        .rev()
        .find(|&idx| !lines[idx].trim().is_empty())
        .map_or(start.min(upper), |idx| idx + 1)
}
157
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the YAML plugin over `content` as if it had been read from `file_path`.
    fn extract(content: &str, file_path: &str) -> Vec<SemanticEntity> {
        let plugin = YamlParserPlugin;
        plugin.extract_entities(content, file_path)
    }

    /// Every top-level key gets its own entity with the right line range.
    #[test]
    fn test_yaml_line_positions() {
        let content = "name: my-app\nversion: 1.0.0\nscripts:\n  build: tsc\n  test: jest\ndescription: a test app\n";
        let entities = extract(content, "config.yaml");

        // (name, start_line, end_line) in source order.
        let expected = [
            ("name", 1, 1),
            ("version", 2, 2),
            ("scripts", 3, 5),
            ("description", 6, 6),
        ];

        assert_eq!(entities.len(), 4);
        for (entity, &(name, start, end)) in entities.iter().zip(expected.iter()) {
            assert_eq!(entity.name, name);
            assert_eq!(entity.start_line, start);
            assert_eq!(entity.end_line, end);
        }

        // A key holding a nested mapping is classified as a section.
        assert_eq!(entities[2].entity_type, "section");
    }

    /// Comments and document markers before the first key form a preamble chunk.
    #[test]
    fn test_yaml_preamble() {
        let entities = extract(
            "# Config file\n---\nname: my-app\nversion: 1.0.0\n",
            "config.yaml",
        );

        let preamble = &entities[0];
        assert_eq!(preamble.name, "(preamble)");
        assert_eq!(preamble.entity_type, "chunk");
        assert_eq!(preamble.start_line, 1);

        assert_eq!(entities[1].name, "name");
        assert_eq!(entities[2].name, "version");
    }

    /// A file without any key is still tracked, as one document chunk.
    #[test]
    fn test_yaml_comment_only_file() {
        let entities = extract("# Just a comment\n# Another line\n", "notes.yaml");

        assert_eq!(entities.len(), 1);
        let document = &entities[0];
        assert_eq!(document.name, "(document)");
        assert_eq!(document.entity_type, "chunk");
    }

    /// Editing a comment changes the hash of the entity whose range contains it.
    #[test]
    fn test_yaml_comment_changes_detected() {
        let before = extract(
            "name: my-app\n# old comment\nversion: 1.0.0\n",
            "config.yaml",
        );
        let after = extract(
            "name: my-app\n# new comment\nversion: 1.0.0\n",
            "config.yaml",
        );

        // The comment sits inside the range of the "name" entity, so that
        // entity's content hash must differ between the two versions.
        assert_ne!(before[0].content_hash, after[0].content_hash);
    }
}
225}