Skip to main content

kimun_notes/cli/
metadata_extractor.rs

1use crate::cli::json_output::JsonHeader;
2use regex::Regex;
3use std::collections::HashSet;
4use std::sync::OnceLock;
5
6fn hashtag_regex() -> &'static Regex {
7    static REGEX: OnceLock<Regex> = OnceLock::new();
8    REGEX.get_or_init(|| Regex::new(r"#([a-zA-Z0-9_-]+)").unwrap())
9}
10
11fn header_regex() -> &'static Regex {
12    static REGEX: OnceLock<Regex> = OnceLock::new();
13    REGEX.get_or_init(|| Regex::new(r"^(#{1,6})\s+(.+)$").unwrap())
14}
15
16pub fn extract_tags(content: &str) -> Vec<String> {
17    let mut tags: HashSet<String> = HashSet::new();
18
19    // Extract from YAML frontmatter
20    if let Some(frontmatter) = extract_frontmatter(content) {
21        if let Some(yaml_tags) = extract_frontmatter_tags(&frontmatter) {
22            for tag in yaml_tags {
23                tags.insert(tag);
24            }
25        }
26    }
27
28    // Extract hashtags from content
29    for capture in hashtag_regex().captures_iter(content) {
30        if let Some(tag) = capture.get(1) {
31            tags.insert(tag.as_str().to_string());
32        }
33    }
34
35    let mut result: Vec<String> = tags.into_iter().collect();
36    result.sort();
37    result
38}
39
40pub fn extract_links(content: &str) -> Vec<String> {
41    kimun_core::note::link_char_spans(content)
42        .into_iter()
43        .map(|span| span.target)
44        .collect()
45}
46
47pub fn extract_headers(content: &str) -> Vec<JsonHeader> {
48    let mut headers: Vec<JsonHeader> = Vec::new();
49
50    for line in content.lines() {
51        if let Some(capture) = header_regex().captures(line) {
52            if let (Some(level_match), Some(text_match)) = (capture.get(1), capture.get(2)) {
53                let level = level_match.as_str().len() as u32;
54                let text = text_match.as_str().trim().to_string();
55                headers.push(JsonHeader { text, level });
56            }
57        }
58    }
59
60    headers
61}
62
/// Returns the YAML frontmatter of `content` without its `---` fences.
///
/// Frontmatter is recognised only when the very first line is exactly `---`
/// (trailing whitespace is tolerated, leading whitespace is not) and a later
/// line, once trimmed, is `---` as well. The lines in between are joined with
/// `\n` and returned; an empty block yields `Some(String::new())`.
///
/// Returns `None` when `content` is empty, does not open with a fence line
/// (for example a `----` horizontal rule or `---title`), or never closes the
/// fence.
fn extract_frontmatter(content: &str) -> Option<String> {
    let mut lines = content.lines();

    // The opening fence must be the whole first line: a bare prefix check
    // would also accept `----` or `---something` as a frontmatter opener.
    if lines.next()?.trim_end() != "---" {
        return None;
    }

    let mut body: Vec<&str> = Vec::new();
    for line in lines {
        if line.trim() == "---" {
            return Some(body.join("\n"));
        }
        body.push(line);
    }

    // Reached the end of the input without a closing fence.
    None
}
88
/// Trims `raw` and removes one pair of matching surrounding quotes, if any.
///
/// Double quotes are tried first, then single quotes. A value with a quote
/// on only one side is returned trimmed but otherwise unchanged.
fn unquote_tag(raw: &str) -> &str {
    let value = raw.trim();
    value
        .strip_prefix('"')
        .and_then(|rest| rest.strip_suffix('"'))
        .or_else(|| {
            value
                .strip_prefix('\'')
                .and_then(|rest| rest.strip_suffix('\''))
        })
        .unwrap_or(value)
}

/// Extracts tag names from the text of a YAML frontmatter block.
///
/// This is a line-oriented scan, not a YAML parser. Each line is trimmed and
/// the following shapes are recognised:
///
/// * `tags: [a, "b", 'c']` — inline array, split on commas;
/// * `tags:` followed by `- a` lines — block sequence, which ends at the
///   next line containing `:` or at an empty line;
/// * `tags: a` — a single value on the same line;
/// * `tag: a` — a single value under the singular key.
///
/// Every value is trimmed and stripped of one pair of matching double or
/// single quotes, consistently across all four shapes; empty values are
/// dropped. Tags are returned in the order encountered, duplicates included.
///
/// Returns `None` when no tag was found.
fn extract_frontmatter_tags(frontmatter: &str) -> Option<Vec<String>> {
    let mut tags: Vec<String> = Vec::new();
    let mut in_tags_block = false;

    // Normalises one raw value and records it unless it is empty.
    let mut push_tag = |raw: &str| {
        let tag = unquote_tag(raw);
        if !tag.is_empty() {
            tags.push(tag.to_string());
        }
    };

    for line in frontmatter.lines() {
        let line = line.trim();

        if let Some(rest) = line.strip_prefix("tags:") {
            let value = rest.trim();

            // Only a bare `tags:` opens a block sequence; a `tags:` line that
            // carries a value closes any block that was still open.
            in_tags_block = value.is_empty();

            let inline_items = value
                .strip_prefix('[')
                .and_then(|inner| inner.strip_suffix(']'));
            match inline_items {
                Some(items) => items.split(',').for_each(&mut push_tag),
                // Single value on the same line (a no-op when `value` is empty).
                None => push_tag(value),
            }
        } else if let Some(item) = line.strip_prefix('-').filter(|_| in_tags_block) {
            // Block sequence entry belonging to the open `tags:` key.
            push_tag(item);
        } else if let Some(rest) = line.strip_prefix("tag:") {
            // `tag:` is a new key, so it also ends an open `tags:` block.
            in_tags_block = false;
            push_tag(rest);
        } else if in_tags_block && (line.contains(':') || line.is_empty()) {
            // Any other key, or a blank line, ends the block sequence.
            in_tags_block = false;
        }
    }

    (!tags.is_empty()).then_some(tags)
}
166
#[cfg(test)]
mod tests {
    use super::*;

    /// An inline `tags: [...]` array yields each quoted entry, in order,
    /// and the unrelated `title` key contributes nothing.
    #[test]
    fn frontmatter_tags_array_format() {
        let input = "tags: [\"project\", \"urgent\"]\ntitle: \"Test Note\"";
        let expected = vec!["project".to_string(), "urgent".to_string()];

        assert_eq!(extract_frontmatter_tags(input), Some(expected));
    }

    /// The singular `tag:` key yields exactly one tag.
    #[test]
    fn frontmatter_single_tag_format() {
        let input = "tag: meeting\ntitle: \"Test Note\"";
        let expected = vec!["meeting".to_string()];

        assert_eq!(extract_frontmatter_tags(input), Some(expected));
    }
}