use regex::Regex;
use std::collections::HashSet;
use std::sync::OnceLock;
use crate::frontmatter::Frontmatter;
/// Collects the tags of a note from its frontmatter and from its body text.
///
/// Frontmatter tags (`fm.tags`) are stripped of surrounding whitespace and of
/// any leading `#` characters. Inline tags are located in `body` with the
/// regex from `inline_pattern`; lines inside triple-backtick fences and lines
/// that `is_heading` classifies as headings are not scanned.
///
/// Every tag is ASCII-lowercased and de-duplicated. The returned vector is
/// sorted, so the output is deterministic.
pub fn extract(body: &str, fm: &Frontmatter) -> Vec<String> {
    let mut out = HashSet::<String>::new();

    for tag in &fm.tags {
        // Trim whitespace *before* stripping `#`: otherwise a value such as
        // " #project" keeps its marker and ends up stored as "#project".
        // The final `trim` handles values like "# project".
        let normalised = tag
            .trim()
            .trim_start_matches('#')
            .trim()
            .to_ascii_lowercase();
        if !normalised.is_empty() {
            out.insert(normalised);
        }
    }

    let inline = inline_pattern();
    let mut in_fence = false;
    for line in body.lines() {
        let trimmed = line.trim_start();

        // A line starting with three backticks opens or closes a fence; the
        // fence line itself is never scanned for tags.
        if trimmed.starts_with("```") {
            in_fence = !in_fence;
            continue;
        }
        if in_fence || is_heading(trimmed) {
            continue;
        }

        for cap in inline.captures_iter(line) {
            // Group 1 requires at least one letter, so a match is never empty.
            if let Some(m) = cap.get(1) {
                out.insert(m.as_str().to_ascii_lowercase());
            }
        }
    }

    let mut tags: Vec<String> = out.into_iter().collect();
    // Elements are unique, so an unstable sort yields the same order.
    tags.sort_unstable();
    tags
}
/// Returns the lazily compiled regex used to find inline `#tag` occurrences.
///
/// A match needs a `#` at the start of the text or right after whitespace,
/// followed by an ASCII letter and then any run of ASCII letters, digits,
/// `_`, `/` or `-`. Capture group 1 holds the tag without the `#`.
///
/// # Panics
///
/// Panics on first use if the hard-coded pattern fails to compile.
fn inline_pattern() -> &'static Regex {
    const PATTERN: &str = r"(?:^|\s)#([A-Za-z][A-Za-z0-9_/\-]*)";
    static COMPILED: OnceLock<Regex> = OnceLock::new();

    // Compiled once per process; later calls reuse the cached instance.
    COMPILED.get_or_init(|| Regex::new(PATTERN).expect("inline tag regex must compile"))
}
/// Reports whether `trimmed_line` starts with one to six `#` characters
/// immediately followed by a space.
///
/// Lines made only of `#` characters, empty lines, and lines where the
/// character after the hashes is anything other than a space return `false`.
fn is_heading(trimmed_line: &str) -> bool {
    let rest = trimmed_line.trim_start_matches('#');
    // `#` is a single byte, so the byte-length difference is the hash count.
    let level = trimmed_line.len() - rest.len();
    (1..=6).contains(&level) && rest.starts_with(' ')
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `Frontmatter` whose `tags` are set and every other field is
    /// left at its default.
    fn fm_with_tags(tags: Vec<&str>) -> Frontmatter {
        Frontmatter {
            tags: tags.into_iter().map(String::from).collect(),
            ..Frontmatter::default()
        }
    }

    /// Frontmatter and inline tags end up in one sorted list.
    #[test]
    fn merges_frontmatter_and_inline() {
        let fm = fm_with_tags(vec!["project", "research"]);
        let body = "Working on #project today, also #fitness.\n";
        let tags = extract(body, &fm);
        assert_eq!(tags, vec!["fitness", "project", "research"]);
    }

    /// Case variants of the same tag collapse into one lowercase entry.
    #[test]
    fn lowercases_and_dedupes() {
        let fm = fm_with_tags(vec!["Project", "PROJECT"]);
        let body = "#Project #project\n";
        let tags = extract(body, &fm);
        assert_eq!(tags, vec!["project"]);
    }

    /// `#word` inside a triple-backtick fence is not a tag.
    #[test]
    fn skips_code_fences() {
        let fm = Frontmatter::default();
        let body = "before\n```\n#nottag\n```\nafter #realtag\n";
        let tags = extract(body, &fm);
        assert_eq!(tags, vec!["realtag"]);
    }

    /// Heading lines are not scanned for tags.
    #[test]
    fn skips_heading_lines() {
        let fm = Frontmatter::default();
        let body = "## Heading text\nbody #realtag\n";
        let tags = extract(body, &fm);
        assert_eq!(tags, vec!["realtag"]);
    }

    /// `/` is allowed inside a tag, so nested tags are kept whole.
    #[test]
    fn supports_nested_tags() {
        let fm = Frontmatter::default();
        let body = "#project/lexa is a sub-tag.\n";
        let tags = extract(body, &fm);
        assert_eq!(tags, vec!["project/lexa"]);
    }

    /// A `#` that does not follow whitespace (URL fragments, markup
    /// attributes, link targets) must not produce a tag.
    #[test]
    fn ignores_url_fragments_and_html_ids() {
        let fm = Frontmatter::default();
        let body = "see http://example.com#section or <div id=\"foo\">\n";
        assert!(extract(body, &fm).is_empty());

        // The markup above contains no `#` at all, so also cover fragments
        // that really appear inside HTML attributes and Markdown links.
        let anchors = "jump to <a href=\"#foo\">foo</a> or [foo](#foo)\n";
        assert!(extract(anchors, &fm).is_empty());
    }
}