Skip to main content

kimun_notes/cli/
metadata_extractor.rs

1use crate::cli::json_output::JsonHeader;
2use regex::Regex;
3use std::collections::HashSet;
4use std::sync::OnceLock;
5
6fn hashtag_regex() -> &'static Regex {
7    static REGEX: OnceLock<Regex> = OnceLock::new();
8    REGEX.get_or_init(|| Regex::new(r"#([a-zA-Z0-9_-]+)").unwrap())
9}
10
11fn header_regex() -> &'static Regex {
12    static REGEX: OnceLock<Regex> = OnceLock::new();
13    REGEX.get_or_init(|| Regex::new(r"^(#{1,6})\s+(.+)$").unwrap())
14}
15
16pub fn extract_tags(content: &str) -> Vec<String> {
17    let mut tags: HashSet<String> = HashSet::new();
18
19    // Extract from YAML frontmatter
20    if let Some(frontmatter) = extract_frontmatter(content)
21        && let Some(yaml_tags) = extract_frontmatter_tags(&frontmatter) {
22            for tag in yaml_tags {
23                tags.insert(tag);
24            }
25        }
26
27    // Extract hashtags from content
28    for capture in hashtag_regex().captures_iter(content) {
29        if let Some(tag) = capture.get(1) {
30            tags.insert(tag.as_str().to_string());
31        }
32    }
33
34    let mut result: Vec<String> = tags.into_iter().collect();
35    result.sort();
36    result
37}
38
39pub fn extract_links(content: &str) -> Vec<String> {
40    kimun_core::note::link_char_spans(content)
41        .into_iter()
42        .map(|span| span.target)
43        .collect()
44}
45
46pub fn extract_headers(content: &str) -> Vec<JsonHeader> {
47    let mut headers: Vec<JsonHeader> = Vec::new();
48
49    for line in content.lines() {
50        if let Some(capture) = header_regex().captures(line)
51            && let (Some(level_match), Some(text_match)) = (capture.get(1), capture.get(2)) {
52                let level = level_match.as_str().len() as u32;
53                let text = text_match.as_str().trim().to_string();
54                headers.push(JsonHeader { text, level });
55            }
56    }
57
58    headers
59}
60
/// Returns the frontmatter block at the very start of `content`, without its
/// `---` delimiter lines, or `None` when the note has no frontmatter.
///
/// A block is recognised only when:
/// - the first line is exactly `---` (trailing whitespace is tolerated), and
/// - a later line equals `---` once trimmed.
///
/// The lines between the two delimiters are joined with `\n`.
///
/// The opening delimiter is compared as a whole line rather than as a prefix,
/// so a note that merely begins with dashes (for example a `----` thematic
/// break or `---title`) is not mistaken for frontmatter.
fn extract_frontmatter(content: &str) -> Option<String> {
    // Cheap rejection before any allocation: the first line must be the opening fence.
    let opening = content.lines().next()?;
    if opening.trim_end() != "---" {
        return None;
    }

    let lines: Vec<&str> = content.lines().collect();
    // Inputs shorter than three lines are never treated as frontmatter
    // (this keeps the long-standing behaviour for a bare `---` / `---` pair).
    if lines.len() < 3 {
        return None;
    }

    // Search for the closing fence among the lines after the opening one.
    let body = &lines[1..];
    let closing = body.iter().position(|line| line.trim() == "---")?;
    Some(body[..closing].join("\n"))
}
86
/// Normalises one raw tag value taken from frontmatter.
///
/// Surrounding whitespace is trimmed, then one matching pair of enclosing
/// double quotes or single quotes is removed. Returns `None` when nothing is
/// left, so empty entries are skipped by callers.
fn clean_frontmatter_tag(raw: &str) -> Option<String> {
    let value = raw.trim();
    let unquoted = value
        .strip_prefix('"')
        .and_then(|inner| inner.strip_suffix('"'))
        .or_else(|| {
            value
                .strip_prefix('\'')
                .and_then(|inner| inner.strip_suffix('\''))
        })
        // Unquoted or unbalanced values are kept exactly as written.
        .unwrap_or(value);

    if unquoted.is_empty() {
        None
    } else {
        Some(unquoted.to_string())
    }
}

/// Extracts tag values from the text of a frontmatter block.
///
/// This is a line-oriented scan, not a YAML parser. Recognised layouts:
/// - inline array: `tags: [a, "b", 'c']`
/// - block sequence: a bare `tags:` line followed by `- item` lines
/// - scalar on the same line: `tags: value`
/// - singular key: `tag: value`
///
/// Every value is cleaned the same way in all layouts: trimmed, with one pair
/// of surrounding double or single quotes removed, and dropped when empty.
///
/// Returns the tags in the order encountered, or `None` when none were found.
fn extract_frontmatter_tags(frontmatter: &str) -> Option<Vec<String>> {
    let mut tags: Vec<String> = Vec::new();
    // True while scanning the `- item` lines that follow a bare `tags:` key.
    let mut in_tags_block = false;

    for line in frontmatter.lines() {
        let line = line.trim();
        // A `- item` line only counts as a tag while inside a `tags:` block sequence.
        let block_item = if in_tags_block {
            line.strip_prefix('-')
        } else {
            None
        };

        if let Some(rest) = line.strip_prefix("tags:") {
            let rest = rest.trim();
            let inline_items = rest
                .strip_prefix('[')
                .and_then(|inner| inner.strip_suffix(']'));

            if let Some(items) = inline_items {
                // Inline array: `tags: [a, b]`.
                tags.extend(items.split(',').filter_map(clean_frontmatter_tag));
            } else if rest.is_empty() {
                // Nothing after the colon: the tags follow as a block sequence.
                in_tags_block = true;
            } else {
                // Single scalar on the same line as `tags:`.
                tags.extend(clean_frontmatter_tag(rest));
            }
        } else if let Some(item) = block_item {
            tags.extend(clean_frontmatter_tag(item));
        } else if let Some(value) = line.strip_prefix("tag:") {
            // Singular `tag: value` form.
            tags.extend(clean_frontmatter_tag(value));
        } else if in_tags_block && (line.contains(':') || line.is_empty()) {
            // A new key or a blank line ends the block sequence.
            in_tags_block = false;
        }
    }

    if tags.is_empty() {
        None
    } else {
        Some(tags)
    }
}
164
#[cfg(test)]
mod tests {
    use super::*;

    /// An inline `tags: [...]` array yields each quoted entry, in order.
    #[test]
    fn frontmatter_tags_array_format() {
        let input = r#"tags: ["project", "urgent"]
title: "Test Note""#;

        let parsed = extract_frontmatter_tags(input);

        assert_eq!(
            parsed,
            Some(vec!["project".to_string(), "urgent".to_string()])
        );
    }

    /// The singular `tag: value` key yields exactly that one tag.
    #[test]
    fn frontmatter_single_tag_format() {
        let input = r#"tag: meeting
title: "Test Note""#;

        let parsed = extract_frontmatter_tags(input);

        assert_eq!(parsed, Some(vec!["meeting".to_string()]));
    }
}