Skip to main content

kimun_notes/cli/
metadata_extractor.rs

1use crate::cli::json_output::JsonHeader;
2use regex::Regex;
3use std::collections::HashSet;
4use std::sync::OnceLock;
5
6fn header_regex() -> &'static Regex {
7    static REGEX: OnceLock<Regex> = OnceLock::new();
8    REGEX.get_or_init(|| Regex::new(r"^(#{1,6})\s+(.+)$").unwrap())
9}
10
11pub fn extract_tags(content: &str) -> Vec<String> {
12    let mut tags: HashSet<String> = HashSet::new();
13
14    // Extract from YAML frontmatter
15    if let Some(frontmatter) = extract_frontmatter(content)
16        && let Some(yaml_tags) = extract_frontmatter_tags(&frontmatter)
17    {
18        for tag in yaml_tags {
19            tags.insert(tag.to_lowercase());
20        }
21    }
22
23    // Extract hashtags from content (consistent with core's label_matches:
24    // applies word-boundary, ASCII-only label name, and skips frontmatter/
25    // code/HTML/markdown-link/wikilink zones — matching what core indexes.)
26    for tag in kimun_core::note::extract_labels(content) {
27        tags.insert(tag);
28    }
29
30    let mut result: Vec<String> = tags.into_iter().collect();
31    result.sort();
32    result
33}
34
35pub fn extract_links(content: &str) -> Vec<String> {
36    kimun_core::note::link_char_spans(content)
37        .into_iter()
38        .map(|span| span.target)
39        .collect()
40}
41
42pub fn extract_headers(content: &str) -> Vec<JsonHeader> {
43    let mut headers: Vec<JsonHeader> = Vec::new();
44
45    for line in content.lines() {
46        if let Some(capture) = header_regex().captures(line)
47            && let (Some(level_match), Some(text_match)) = (capture.get(1), capture.get(2))
48        {
49            let level = level_match.as_str().len() as u32;
50            let text = text_match.as_str().trim().to_string();
51            headers.push(JsonHeader { text, level });
52        }
53    }
54
55    headers
56}
57
/// Returns the raw text of a leading YAML frontmatter block, without its
/// `---` delimiter lines, or `None` when `content` has no such block.
///
/// A block is recognised only when:
/// * the very first line is exactly `---` (trailing whitespace is tolerated).
///   Lines that merely *begin* with three dashes, such as `----` or
///   `---title`, are not treated as an opening delimiter;
/// * a later line, once trimmed, is exactly `---`.
///
/// The lines in between are joined with `\n`. An empty block (`---`
/// immediately followed by `---`) yields `Some(String::new())`.
fn extract_frontmatter(content: &str) -> Option<String> {
    let mut lines = content.lines();

    // The opening delimiter must be the whole first line, not just a prefix.
    if lines.next()?.trim_end() != "---" {
        return None;
    }

    let mut body: Vec<&str> = Vec::new();
    for line in lines {
        if line.trim() == "---" {
            return Some(body.join("\n"));
        }
        body.push(line);
    }

    // Ran out of input without seeing a closing delimiter.
    None
}
83
/// Trims `raw` and removes one pair of matching surrounding quotes.
///
/// Double quotes are tried first, then single quotes. A value whose quotes do
/// not match on both ends is returned trimmed but otherwise untouched.
fn unquote_tag(raw: &str) -> &str {
    let value = raw.trim();
    for quote in ['"', '\''] {
        if let Some(inner) = value
            .strip_prefix(quote)
            .and_then(|rest| rest.strip_suffix(quote))
        {
            return inner;
        }
    }
    value
}

/// Appends the unquoted form of `raw` to `tags`, skipping empty values.
fn push_tag(tags: &mut Vec<String>, raw: &str) {
    let tag = unquote_tag(raw);
    if !tag.is_empty() {
        tags.push(tag.to_string());
    }
}

/// Extracts tag names from the text of a YAML frontmatter block using a
/// small line-oriented scan (this is not a full YAML parser).
///
/// Each line is trimmed before inspection, so indentation is ignored. The
/// recognised forms are:
/// * `tags: [a, "b", 'c']` — inline array, split on commas;
/// * `tags:` followed by `- item` lines — block sequence, which ends at the
///   first blank line or line containing `:`;
/// * `tags: value` — a single scalar on the same line;
/// * `tag: value` — a single scalar under the singular key.
///
/// In every form a matching pair of surrounding double or single quotes is
/// removed from each tag, and empty tags are dropped. Tags are returned in
/// the order they appear; `None` is returned when no tag was found.
fn extract_frontmatter_tags(frontmatter: &str) -> Option<Vec<String>> {
    let mut tags: Vec<String> = Vec::new();
    let mut in_tags_block = false;

    for line in frontmatter.lines() {
        let line = line.trim();

        if let Some(value) = line.strip_prefix("tags:") {
            let value = value.trim();

            if let Some(items) = value
                .strip_prefix('[')
                .and_then(|rest| rest.strip_suffix(']'))
            {
                // Inline array: "tags: [tag1, tag2]".
                for item in items.split(',') {
                    push_tag(&mut tags, item);
                }
            } else if value.is_empty() {
                // Nothing after the colon: a block sequence follows.
                in_tags_block = true;
            } else {
                // Single scalar on the same line as "tags:".
                push_tag(&mut tags, value);
            }
        } else if let (true, Some(item)) = (in_tags_block, line.strip_prefix('-')) {
            // Block sequence entry: "- tag".
            push_tag(&mut tags, item);
        } else if let Some(value) = line.strip_prefix("tag:") {
            // Singular key: "tag: value".
            push_tag(&mut tags, value);
        } else if in_tags_block && (line.contains(':') || line.is_empty()) {
            // A new YAML key or a blank line terminates the block sequence.
            in_tags_block = false;
        }
    }

    if tags.is_empty() { None } else { Some(tags) }
}
174
#[cfg(test)]
mod tests {
    use super::*;

    /// An inline `tags: [...]` array yields its entries in order with the
    /// double quotes removed; the following `title:` line adds nothing.
    #[test]
    fn frontmatter_tags_array_format() {
        let frontmatter = "tags: [\"project\", \"urgent\"]\ntitle: \"Test Note\"";

        let parsed = extract_frontmatter_tags(frontmatter);
        assert_eq!(
            parsed,
            Some(vec!["project".to_string(), "urgent".to_string()])
        );
    }

    /// The singular `tag:` key contributes exactly one tag.
    #[test]
    fn frontmatter_single_tag_format() {
        let frontmatter = "tag: meeting\ntitle: \"Test Note\"";

        let parsed = extract_frontmatter_tags(frontmatter);
        assert_eq!(parsed, Some(vec!["meeting".to_string()]));
    }

    /// `extract_tags` merges frontmatter tags with the labels core extracts
    /// from the body.
    #[test]
    fn extract_tags_matches_core_label_rules() {
        let body = [
            "---",
            "tags: [yaml_tag]",
            "---",
            "plain #body and #tag-with-dash",
            "```",
            "#code_tag",
            "```",
        ]
        .join("\n");

        let tags = extract_tags(&body);
        let has = |name: &str| tags.iter().any(|tag| tag == name);

        // Expected: yaml_tag (frontmatter), body (extracted), tag (dash-terminated).
        assert!(has("yaml_tag"));
        assert!(has("body"));
        assert!(has("tag"));

        // NOT expected: code_tag (in fence), tag-with-dash (dash not in label).
        assert!(!has("code_tag"), "got: {:?}", tags);
        assert!(!has("tag-with-dash"), "got: {:?}", tags);
    }
}
214}