Skip to main content

rumdl_lib/rules/
front_matter_utils.rs

1use regex::Regex;
2use std::collections::HashMap;
3use std::sync::LazyLock;
4
5// Standard front matter delimiter (three dashes)
6static STANDARD_FRONT_MATTER_START: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^---\s*$").unwrap());
7static STANDARD_FRONT_MATTER_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^---\s*$").unwrap());
8
9// TOML front matter delimiter (three plus signs)
10static TOML_FRONT_MATTER_START: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\+\+\+\s*$").unwrap());
11static TOML_FRONT_MATTER_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\+\+\+\s*$").unwrap());
12
13// JSON front matter delimiter (curly braces)
14static JSON_FRONT_MATTER_START: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\{\s*$").unwrap());
15static JSON_FRONT_MATTER_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\}\s*$").unwrap());
16
17// Common malformed front matter (dash space dash dash)
18static MALFORMED_FRONT_MATTER_START1: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^- --\s*$").unwrap());
19static MALFORMED_FRONT_MATTER_END1: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^- --\s*$").unwrap());
20
21// Alternate malformed front matter (dash dash space dash)
22static MALFORMED_FRONT_MATTER_START2: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^-- -\s*$").unwrap());
23static MALFORMED_FRONT_MATTER_END2: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^-- -\s*$").unwrap());
24
25// Front matter field pattern
26static FRONT_MATTER_FIELD: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^([^:]+):\s*(.*)$").unwrap());
27
28// TOML field pattern
29static TOML_FIELD_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"^([^=]+)\s*=\s*"?([^"]*)"?$"#).unwrap());
30
/// The flavour of front matter found at the top of a document.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum FrontMatterType {
    /// YAML block delimited by `---` lines.
    Yaml,
    /// TOML block delimited by `+++` lines.
    Toml,
    /// JSON block opened by `{` and closed by `}`.
    Json,
    /// Block delimited by a mistyped dash sequence (`- --` or `-- -`).
    Malformed,
    /// The document has no front matter.
    None,
}
45
46/// Utility functions for detecting and handling front matter in Markdown documents
47pub struct FrontMatterUtils;
48
49impl FrontMatterUtils {
50    /// Check if a content contains front matter with a specific field
51    pub fn has_front_matter_field(content: &str, field_prefix: &str) -> bool {
52        let field_name = field_prefix.trim_end_matches(':');
53        Self::get_front_matter_field_value(content, field_name).is_some()
54    }
55
56    /// Get the value of a specific front matter field
57    pub fn get_front_matter_field_value<'a>(content: &'a str, field_name: &str) -> Option<&'a str> {
58        let lines: Vec<&'a str> = content.lines().collect();
59        if lines.len() < 3 {
60            return None;
61        }
62
63        let front_matter_type = Self::detect_front_matter_type(content);
64        if front_matter_type == FrontMatterType::None {
65            return None;
66        }
67
68        let front_matter = Self::extract_front_matter(content);
69        for line in front_matter {
70            let line = line.trim();
71            match front_matter_type {
72                FrontMatterType::Toml => {
73                    // Handle TOML-style fields (key = value)
74                    if let Some(captures) = TOML_FIELD_PATTERN.captures(line) {
75                        let key = captures.get(1).unwrap().as_str().trim();
76                        if key == field_name {
77                            let value = captures.get(2).unwrap().as_str();
78                            return Some(value);
79                        }
80                    }
81                }
82                _ => {
83                    // Handle YAML/JSON-style fields (key: value)
84                    if let Some(captures) = FRONT_MATTER_FIELD.captures(line) {
85                        let mut key = captures.get(1).unwrap().as_str().trim();
86
87                        // Strip quotes from the key if present (for JSON-style fields in any format)
88                        if key.starts_with('"') && key.ends_with('"') && key.len() >= 2 {
89                            key = &key[1..key.len() - 1];
90                        }
91
92                        if key == field_name {
93                            let value = captures.get(2).unwrap().as_str().trim();
94                            // Strip quotes if present
95                            if value.starts_with('"') && value.ends_with('"') && value.len() >= 2 {
96                                return Some(&value[1..value.len() - 1]);
97                            }
98                            return Some(value);
99                        }
100                    }
101                }
102            }
103        }
104
105        None
106    }
107
108    /// Extract all front matter fields as a HashMap
109    pub fn extract_front_matter_fields(content: &str) -> HashMap<String, String> {
110        let mut fields = HashMap::new();
111
112        let front_matter_type = Self::detect_front_matter_type(content);
113        if front_matter_type == FrontMatterType::None {
114            return fields;
115        }
116
117        let front_matter = Self::extract_front_matter(content);
118        let mut current_prefix = String::new();
119        let mut indent_level = 0;
120
121        for line in front_matter {
122            let line_indent = line.chars().take_while(|c| c.is_whitespace()).count();
123            let line = line.trim();
124
125            // Handle indentation changes for nested fields
126            match line_indent.cmp(&indent_level) {
127                std::cmp::Ordering::Greater => {
128                    // Going deeper
129                    indent_level = line_indent;
130                }
131                std::cmp::Ordering::Less => {
132                    // Going back up
133                    indent_level = line_indent;
134                    // Remove last nested level from prefix
135                    if let Some(last_dot) = current_prefix.rfind('.') {
136                        current_prefix.truncate(last_dot);
137                    } else {
138                        current_prefix.clear();
139                    }
140                }
141                std::cmp::Ordering::Equal => {}
142            }
143
144            match front_matter_type {
145                FrontMatterType::Toml => {
146                    // Handle TOML-style fields
147                    if let Some(captures) = TOML_FIELD_PATTERN.captures(line) {
148                        let key = captures.get(1).unwrap().as_str().trim();
149                        let value = captures.get(2).unwrap().as_str();
150                        let full_key = if current_prefix.is_empty() {
151                            key.to_string()
152                        } else {
153                            format!("{current_prefix}.{key}")
154                        };
155                        fields.insert(full_key, value.to_string());
156                    }
157                }
158                _ => {
159                    // Handle YAML/JSON-style fields
160                    if let Some(captures) = FRONT_MATTER_FIELD.captures(line) {
161                        let mut key = captures.get(1).unwrap().as_str().trim();
162                        let value = captures.get(2).unwrap().as_str().trim();
163
164                        // Strip quotes from the key if present (for JSON-style fields in any format)
165                        if key.starts_with('"') && key.ends_with('"') && key.len() >= 2 {
166                            key = &key[1..key.len() - 1];
167                        }
168
169                        if let Some(stripped) = key.strip_suffix(':') {
170                            // This is a nested field marker
171                            if current_prefix.is_empty() {
172                                current_prefix = stripped.to_string();
173                            } else {
174                                current_prefix = format!("{current_prefix}.{stripped}");
175                            }
176                        } else {
177                            // This is a field with a value
178                            let full_key = if current_prefix.is_empty() {
179                                key.to_string()
180                            } else {
181                                format!("{current_prefix}.{key}")
182                            };
183                            // Strip quotes if present
184                            let value = value
185                                .strip_prefix('"')
186                                .and_then(|v| v.strip_suffix('"'))
187                                .unwrap_or(value);
188                            fields.insert(full_key, value.to_string());
189                        }
190                    }
191                }
192            }
193        }
194
195        fields
196    }
197
198    /// Extract the front matter content as a vector of lines
199    pub fn extract_front_matter<'a>(content: &'a str) -> Vec<&'a str> {
200        let lines: Vec<&'a str> = content.lines().collect();
201        if lines.len() < 3 {
202            return Vec::new();
203        }
204
205        let front_matter_type = Self::detect_front_matter_type(content);
206        if front_matter_type == FrontMatterType::None {
207            return Vec::new();
208        }
209
210        let mut front_matter = Vec::new();
211        let mut in_front_matter = false;
212
213        for (i, line) in lines.iter().enumerate() {
214            match front_matter_type {
215                FrontMatterType::Yaml => {
216                    if i == 0 && STANDARD_FRONT_MATTER_START.is_match(line) {
217                        in_front_matter = true;
218                        continue;
219                    } else if STANDARD_FRONT_MATTER_END.is_match(line) && in_front_matter && i > 0 {
220                        break;
221                    }
222                }
223                FrontMatterType::Toml => {
224                    if i == 0 && TOML_FRONT_MATTER_START.is_match(line) {
225                        in_front_matter = true;
226                        continue;
227                    } else if TOML_FRONT_MATTER_END.is_match(line) && in_front_matter && i > 0 {
228                        break;
229                    }
230                }
231                FrontMatterType::Json => {
232                    if i == 0 && JSON_FRONT_MATTER_START.is_match(line) {
233                        in_front_matter = true;
234                        continue;
235                    } else if JSON_FRONT_MATTER_END.is_match(line) && in_front_matter && i > 0 {
236                        break;
237                    }
238                }
239                FrontMatterType::Malformed => {
240                    if i == 0
241                        && (MALFORMED_FRONT_MATTER_START1.is_match(line)
242                            || MALFORMED_FRONT_MATTER_START2.is_match(line))
243                    {
244                        in_front_matter = true;
245                        continue;
246                    } else if (MALFORMED_FRONT_MATTER_END1.is_match(line) || MALFORMED_FRONT_MATTER_END2.is_match(line))
247                        && in_front_matter
248                        && i > 0
249                    {
250                        break;
251                    }
252                }
253                FrontMatterType::None => break,
254            }
255
256            if in_front_matter {
257                front_matter.push(*line);
258            }
259        }
260
261        front_matter
262    }
263
264    /// Detect the type of front matter in the content
265    pub fn detect_front_matter_type(content: &str) -> FrontMatterType {
266        let lines: Vec<&str> = content.lines().collect();
267        if lines.is_empty() {
268            return FrontMatterType::None;
269        }
270
271        let first_line = lines[0];
272
273        if STANDARD_FRONT_MATTER_START.is_match(first_line) {
274            // Check if there's a closing marker
275            for line in lines.iter().skip(1) {
276                if STANDARD_FRONT_MATTER_END.is_match(line) {
277                    return FrontMatterType::Yaml;
278                }
279            }
280        } else if TOML_FRONT_MATTER_START.is_match(first_line) {
281            // Check if there's a closing marker
282            for line in lines.iter().skip(1) {
283                if TOML_FRONT_MATTER_END.is_match(line) {
284                    return FrontMatterType::Toml;
285                }
286            }
287        } else if JSON_FRONT_MATTER_START.is_match(first_line) {
288            // Check if there's a closing marker
289            for line in lines.iter().skip(1) {
290                if JSON_FRONT_MATTER_END.is_match(line) {
291                    return FrontMatterType::Json;
292                }
293            }
294        } else if MALFORMED_FRONT_MATTER_START1.is_match(first_line)
295            || MALFORMED_FRONT_MATTER_START2.is_match(first_line)
296        {
297            // Check if there's a closing marker
298            for line in lines.iter().skip(1) {
299                if MALFORMED_FRONT_MATTER_END1.is_match(line) || MALFORMED_FRONT_MATTER_END2.is_match(line) {
300                    return FrontMatterType::Malformed;
301                }
302            }
303        }
304
305        FrontMatterType::None
306    }
307
308    /// Get the line number where front matter ends (or 0 if no front matter)
309    pub fn get_front_matter_end_line(content: &str) -> usize {
310        let lines: Vec<&str> = content.lines().collect();
311        if lines.len() < 3 {
312            return 0;
313        }
314
315        let front_matter_type = Self::detect_front_matter_type(content);
316        if front_matter_type == FrontMatterType::None {
317            return 0;
318        }
319
320        let mut in_front_matter = false;
321
322        for (i, line) in lines.iter().enumerate() {
323            match front_matter_type {
324                FrontMatterType::Yaml => {
325                    if i == 0 && STANDARD_FRONT_MATTER_START.is_match(line) {
326                        in_front_matter = true;
327                    } else if STANDARD_FRONT_MATTER_END.is_match(line) && in_front_matter && i > 0 {
328                        return i + 1;
329                    }
330                }
331                FrontMatterType::Toml => {
332                    if i == 0 && TOML_FRONT_MATTER_START.is_match(line) {
333                        in_front_matter = true;
334                    } else if TOML_FRONT_MATTER_END.is_match(line) && in_front_matter && i > 0 {
335                        return i + 1;
336                    }
337                }
338                FrontMatterType::Json => {
339                    if i == 0 && JSON_FRONT_MATTER_START.is_match(line) {
340                        in_front_matter = true;
341                    } else if JSON_FRONT_MATTER_END.is_match(line) && in_front_matter && i > 0 {
342                        return i + 1;
343                    }
344                }
345                FrontMatterType::Malformed => {
346                    if i == 0
347                        && (MALFORMED_FRONT_MATTER_START1.is_match(line)
348                            || MALFORMED_FRONT_MATTER_START2.is_match(line))
349                    {
350                        in_front_matter = true;
351                    } else if (MALFORMED_FRONT_MATTER_END1.is_match(line) || MALFORMED_FRONT_MATTER_END2.is_match(line))
352                        && in_front_matter
353                        && i > 0
354                    {
355                        return i + 1;
356                    }
357                }
358                FrontMatterType::None => return 0,
359            }
360        }
361
362        0
363    }
364}
365
#[cfg(test)]
mod tests {
    use super::*;

    /// Every variant compares equal to itself and distinct variants differ.
    #[test]
    fn test_front_matter_type_enum() {
        let variants = [
            FrontMatterType::Yaml,
            FrontMatterType::Toml,
            FrontMatterType::Json,
            FrontMatterType::Malformed,
            FrontMatterType::None,
        ];
        for variant in variants {
            assert_eq!(variant, variant);
        }
        assert_ne!(FrontMatterType::Yaml, FrontMatterType::Toml);
    }

    /// Each delimiter style is classified; missing or unterminated blocks are `None`.
    #[test]
    fn test_detect_front_matter_type() {
        let cases = [
            // YAML front matter
            ("---\ntitle: Test\n---\nContent", FrontMatterType::Yaml),
            // TOML front matter
            ("+++\ntitle = \"Test\"\n+++\nContent", FrontMatterType::Toml),
            // JSON front matter
            ("{\n\"title\": \"Test\"\n}\nContent", FrontMatterType::Json),
            // Malformed front matter, both spellings
            ("- --\ntitle: Test\n- --\nContent", FrontMatterType::Malformed),
            ("-- -\ntitle: Test\n-- -\nContent", FrontMatterType::Malformed),
            // No front matter
            ("# Regular content", FrontMatterType::None),
            ("", FrontMatterType::None),
            // Incomplete front matter (no closing marker)
            ("---\ntitle: Test", FrontMatterType::None),
        ];

        for (input, expected) in cases {
            assert_eq!(
                FrontMatterUtils::detect_front_matter_type(input),
                expected,
                "input: {input:?}"
            );
        }
    }

    /// Only the lines between the delimiters are returned.
    #[test]
    fn test_extract_front_matter() {
        let extracted = FrontMatterUtils::extract_front_matter("---\ntitle: Test\nauthor: Me\n---\nContent");
        assert_eq!(extracted.len(), 2);
        assert_eq!(extracted, vec!["title: Test", "author: Me"]);

        // No front matter
        assert!(FrontMatterUtils::extract_front_matter("Regular content").is_empty());

        // Too short content
        assert!(FrontMatterUtils::extract_front_matter("---\n---").is_empty());
    }

    /// Presence checks for existing and missing keys.
    #[test]
    fn test_has_front_matter_field() {
        let doc = "---\ntitle: Test\nauthor: Me\n---\nContent";

        for present in ["title", "author"] {
            assert!(FrontMatterUtils::has_front_matter_field(doc, present), "{present}");
        }
        assert!(!FrontMatterUtils::has_front_matter_field(doc, "date"));

        // No front matter
        assert!(!FrontMatterUtils::has_front_matter_field("Regular content", "title"));

        // Too short content
        assert!(!FrontMatterUtils::has_front_matter_field("--", "title"));
    }

    /// Values are returned without surrounding quotes for every supported syntax.
    #[test]
    fn test_get_front_matter_field_value() {
        let yaml_doc = "---\ntitle: Test Title\nauthor: \"John Doe\"\n---\nContent";
        let toml_doc = "+++\ntitle = \"Test Title\"\nauthor = \"John Doe\"\n+++\nContent";

        let cases = [
            // YAML front matter
            (yaml_doc, "title", Some("Test Title")),
            (yaml_doc, "author", Some("John Doe")),
            (yaml_doc, "nonexistent", None),
            // TOML front matter
            (toml_doc, "title", Some("Test Title")),
            (toml_doc, "author", Some("John Doe")),
            // JSON-style fields in YAML front matter - keys should not include quotes
            ("---\n\"title\": \"Test Title\"\n---\nContent", "title", Some("Test Title")),
            // Actual JSON front matter
            ("{\n\"title\": \"Test Title\"\n}\nContent", "title", Some("Test Title")),
            // No front matter
            ("Regular content", "title", None),
            // Too short content
            ("--", "title", None),
        ];

        for (doc, field, expected) in cases {
            assert_eq!(
                FrontMatterUtils::get_front_matter_field_value(doc, field),
                expected,
                "field {field:?} in {doc:?}"
            );
        }
    }

    /// Flat YAML and TOML documents produce one map entry per field.
    #[test]
    fn test_extract_front_matter_fields() {
        let documents = [
            // Simple YAML front matter
            "---\ntitle: Test\nauthor: Me\n---\nContent",
            // TOML front matter
            "+++\ntitle = \"Test\"\nauthor = \"Me\"\n+++\nContent",
        ];

        for doc in documents {
            let parsed = FrontMatterUtils::extract_front_matter_fields(doc);
            assert_eq!(parsed.get("title").map(String::as_str), Some("Test"), "{doc:?}");
            assert_eq!(parsed.get("author").map(String::as_str), Some("Me"), "{doc:?}");
        }

        // No front matter
        assert!(FrontMatterUtils::extract_front_matter_fields("Regular content").is_empty());
    }

    /// The end line is the 1-based line of the closing delimiter, or 0 without front matter.
    #[test]
    fn test_get_front_matter_end_line() {
        let cases = [
            ("---\ntitle: Test\n---\nContent", 3),
            // TOML
            ("+++\ntitle = \"Test\"\n+++\nContent", 3),
            // No front matter
            ("Regular content", 0),
            // Too short
            ("--", 0),
        ];

        for (doc, expected) in cases {
            assert_eq!(FrontMatterUtils::get_front_matter_end_line(doc), expected, "{doc:?}");
        }
    }

    /// A document with an indented mapping still exposes its top-level keys.
    #[test]
    fn test_nested_yaml_fields() {
        let doc = "---\ntitle: Test\nauthor:\n  name: John Doe\n  email: john@example.com\n---\nContent";

        let parsed = FrontMatterUtils::extract_front_matter_fields(doc);

        // Only the top-level key is asserted; how the nested keys are reported is
        // deliberately not pinned down by this test.
        assert!(parsed.contains_key("title"));
    }

    /// Empty input, delimiter-only input and repeated blocks.
    #[test]
    fn test_edge_cases() {
        // Empty content
        let empty = "";
        assert_eq!(FrontMatterUtils::detect_front_matter_type(empty), FrontMatterType::None);
        assert!(FrontMatterUtils::extract_front_matter(empty).is_empty());
        assert_eq!(FrontMatterUtils::get_front_matter_end_line(empty), 0);

        // Only delimiters
        assert!(FrontMatterUtils::extract_front_matter("---\n---").is_empty());

        // Multiple front matter sections (only first should be detected)
        let repeated = "---\ntitle: First\n---\n---\ntitle: Second\n---";
        assert_eq!(
            FrontMatterUtils::detect_front_matter_type(repeated),
            FrontMatterType::Yaml
        );
        let parsed = FrontMatterUtils::extract_front_matter_fields(repeated);
        assert_eq!(parsed.get("title").map(String::as_str), Some("First"));
    }

    /// Non-ASCII values are returned intact.
    #[test]
    fn test_unicode_content() {
        let doc = "---\ntitle: 你好世界\nauthor: José\n---\nContent";

        assert_eq!(FrontMatterUtils::detect_front_matter_type(doc), FrontMatterType::Yaml);

        for (field, expected) in [("title", "你好世界"), ("author", "José")] {
            assert_eq!(
                FrontMatterUtils::get_front_matter_field_value(doc, field),
                Some(expected)
            );
        }
    }
}