Skip to main content

metadata_gen/
metadata.rs

//! Metadata extraction and processing module.
//!
//! This module provides functionality for extracting metadata from various formats
//! (YAML, TOML, JSON) and processing it into a standardized structure.
5
6use crate::error::MetadataError;
7use dtt::datetime::DateTime;
8use regex::Regex;
9use serde_json::Value as JsonValue;
10use std::collections::HashMap;
11use toml::Value as TomlValue;
12
/// Represents metadata for a page or content item.
///
/// This is a thin wrapper around a `HashMap<String, String>`: every metadata
/// field is stored as a string value under a string key.
///
/// # Example
///
/// ```
/// use metadata_gen::Metadata;
/// use std::collections::HashMap;
///
/// let mut data = HashMap::new();
/// data.insert("title".to_string(), "My Page".to_string());
/// let metadata = Metadata::new(data);
/// assert_eq!(metadata.get("title"), Some(&"My Page".to_string()));
/// ```
#[derive(Debug, Default, Clone)]
pub struct Metadata {
    /// The underlying key-value store for metadata fields.
    inner: HashMap<String, String>,
}

impl Metadata {
    /// Creates a new `Metadata` instance that takes ownership of `data`.
    ///
    /// # Arguments
    ///
    /// * `data` - A `HashMap` containing the metadata key-value pairs.
    ///
    /// # Returns
    ///
    /// A new `Metadata` instance wrapping `data` unchanged.
    pub fn new(data: HashMap<String, String>) -> Self {
        Self { inner: data }
    }

    /// Retrieves the value associated with the given key.
    ///
    /// # Arguments
    ///
    /// * `key` - A string slice representing the key to look up.
    ///
    /// # Returns
    ///
    /// `Some(&String)` with the stored value if the key exists, `None` otherwise.
    pub fn get(&self, key: &str) -> Option<&String> {
        self.inner.get(key)
    }

    /// Inserts a key-value pair into the metadata, replacing any existing value.
    ///
    /// # Arguments
    ///
    /// * `key` - The key to insert.
    /// * `value` - The value to associate with the key.
    ///
    /// # Returns
    ///
    /// The value previously stored under `key`, or `None` if the key was new.
    pub fn insert(
        &mut self,
        key: String,
        value: String,
    ) -> Option<String> {
        self.inner.insert(key, value)
    }

    /// Checks if the metadata contains the given key.
    ///
    /// # Arguments
    ///
    /// * `key` - A string slice representing the key to check for.
    ///
    /// # Returns
    ///
    /// `true` if the key exists, `false` otherwise.
    pub fn contains_key(&self, key: &str) -> bool {
        self.inner.contains_key(key)
    }

    /// Consumes the `Metadata` instance and returns the inner `HashMap`.
    ///
    /// # Returns
    ///
    /// The inner `HashMap<String, String>` containing all metadata key-value pairs.
    pub fn into_inner(self) -> HashMap<String, String> {
        self.inner
    }
}
99
100/// Extracts metadata from the content string.
101///
102/// This function attempts to extract metadata from YAML, TOML, or JSON formats.
103///
104/// # Arguments
105///
106/// * `content` - A string slice containing the content to extract metadata from.
107///
108/// # Returns
109///
110/// A `Result` containing the extracted `Metadata` if successful, or a `MetadataError` if extraction fails.
111///
112/// # Errors
113///
114/// Returns a `MetadataError::ExtractionError` if no valid front matter is found.
115pub fn extract_metadata(
116    content: &str,
117) -> Result<Metadata, MetadataError> {
118    extract_yaml_metadata(content)
119        .or_else(|| extract_toml_metadata(content))
120        .or_else(|| extract_json_metadata(content))
121        .ok_or_else(|| MetadataError::ExtractionError {
122            message: "No valid front matter found.".to_string(),
123        })
124}
125
126/// Extracts YAML metadata from the content.
127///
128/// # Arguments
129///
130/// * `content` - A string slice containing the content to extract YAML metadata from.
131///
132/// # Returns
133///
134/// An `Option<Metadata>` containing the extracted metadata if successful, or `None` if extraction fails.
135fn extract_yaml_metadata(content: &str) -> Option<Metadata> {
136    let re = Regex::new(r"(?s)^\s*---\s*\n(.*?)\n\s*---\s*").ok()?;
137    let captures = re.captures(content)?;
138
139    let yaml_str = captures.get(1)?.as_str().trim();
140
141    let yaml_value: serde_yml::Value =
142        serde_yml::from_str(yaml_str).ok()?;
143
144    let metadata: HashMap<String, String> = flatten_yaml(&yaml_value);
145
146    Some(Metadata::new(metadata))
147}
148
149/// Flattens a nested YAML value into a flat key-value map.
150///
151/// Nested keys are joined with `.` (e.g., `author.name`).
152/// Sequences are rendered as comma-separated lists wrapped in brackets.
153fn flatten_yaml(value: &serde_yml::Value) -> HashMap<String, String> {
154    let mut map = HashMap::new();
155    flatten_yaml_recursive(value, String::new(), &mut map);
156    map
157}
158
159/// Recursively walks a YAML value tree, inserting leaf values into the map
160/// with dot-separated keys for nested mappings.
161fn flatten_yaml_recursive(
162    value: &serde_yml::Value,
163    prefix: String,
164    map: &mut HashMap<String, String>,
165) {
166    match value {
167        serde_yml::Value::Mapping(m) => {
168            for (k, v) in m {
169                let new_prefix = if prefix.is_empty() {
170                    k.as_str().unwrap_or_default().to_string()
171                } else {
172                    format!(
173                        "{}.{}",
174                        prefix,
175                        k.as_str().unwrap_or_default()
176                    )
177                };
178                flatten_yaml_recursive(v, new_prefix, map);
179            }
180        }
181        serde_yml::Value::Sequence(seq) => {
182            let inline_list = seq
183                .iter()
184                .filter_map(|item| item.as_str().map(|s| s.to_string()))
185                .collect::<Vec<String>>()
186                .join(", ");
187            map.insert(prefix, format!("[{}]", inline_list));
188        }
189        _ => {
190            map.insert(
191                prefix,
192                value.as_str().unwrap_or_default().to_string(),
193            );
194        }
195    }
196}
197
198/// Extracts TOML metadata from the content.
199///
200/// # Arguments
201///
202/// * `content` - A string slice containing the content to extract TOML metadata from.
203///
204/// # Returns
205///
206/// An `Option<Metadata>` containing the extracted metadata if successful, or `None` if extraction fails.
207fn extract_toml_metadata(content: &str) -> Option<Metadata> {
208    let re = Regex::new(r"(?s)^\s*\+\+\+\s*(.*?)\s*\+\+\+").ok()?;
209    let captures = re.captures(content)?;
210    let toml_str = captures.get(1)?.as_str().trim();
211
212    let toml_value: TomlValue = toml::from_str(toml_str).ok()?;
213
214    let mut metadata = HashMap::new();
215    flatten_toml(&toml_value, &mut metadata, String::new());
216
217    Some(Metadata::new(metadata))
218}
219
220/// Recursively flattens a TOML value tree into a flat key-value map.
221///
222/// Nested keys are joined with `.` (e.g., `author.name`).
223/// Arrays are rendered as comma-separated lists wrapped in brackets.
224fn flatten_toml(
225    value: &TomlValue,
226    map: &mut HashMap<String, String>,
227    prefix: String,
228) {
229    match value {
230        TomlValue::Table(table) => {
231            for (k, v) in table {
232                let new_prefix = if prefix.is_empty() {
233                    k.to_string()
234                } else {
235                    format!("{}.{}", prefix, k)
236                };
237                flatten_toml(v, map, new_prefix);
238            }
239        }
240        TomlValue::Array(arr) => {
241            let inline_list = arr
242                .iter()
243                .map(|v| {
244                    // Remove double quotes for string elements
245                    match v {
246                        TomlValue::String(s) => s.clone(),
247                        _ => v.to_string(),
248                    }
249                })
250                .collect::<Vec<String>>()
251                .join(", ");
252            map.insert(prefix, format!("[{}]", inline_list));
253        }
254        TomlValue::String(s) => {
255            map.insert(prefix, s.clone());
256        }
257        TomlValue::Datetime(dt) => {
258            map.insert(prefix, dt.to_string());
259        }
260        _ => {
261            map.insert(prefix, value.to_string());
262        }
263    }
264}
265
266/// Extracts JSON metadata from the content.
267///
268/// # Arguments
269///
270/// * `content` - A string slice containing the content to extract JSON metadata from.
271///
272/// # Returns
273///
274/// An `Option<Metadata>` containing the extracted metadata if successful, or `None` if extraction fails.
275fn extract_json_metadata(content: &str) -> Option<Metadata> {
276    let re = Regex::new(r"(?s)^\s*\{\s*(.*?)\s*\}").ok()?;
277    let captures = re.captures(content)?;
278    let json_str = format!("{{{}}}", captures.get(1)?.as_str().trim());
279
280    let json_value: JsonValue = serde_json::from_str(&json_str).ok()?;
281    let json_object = json_value.as_object()?;
282
283    let metadata: HashMap<String, String> = json_object
284        .iter()
285        .filter_map(|(k, v)| {
286            v.as_str().map(|s| (k.clone(), s.to_string()))
287        })
288        .collect();
289
290    Some(Metadata::new(metadata))
291}
292
293/// Processes the extracted metadata.
294///
295/// This function standardizes dates, ensures required fields are present, and generates derived fields.
296///
297/// # Arguments
298///
299/// * `metadata` - A reference to the `Metadata` instance to process.
300///
301/// # Returns
302///
303/// A `Result` containing the processed `Metadata` if successful, or a `MetadataError` if processing fails.
304///
305/// # Errors
306///
307/// Returns a `MetadataError` if date standardization fails or if required fields are missing.
308pub fn process_metadata(
309    metadata: &Metadata,
310) -> Result<Metadata, MetadataError> {
311    let mut processed = metadata.clone();
312
313    // Convert dates to a standard format
314    if let Some(date) = processed.get("date").cloned() {
315        let standardized_date = standardize_date(&date)?;
316        processed.insert("date".to_string(), standardized_date);
317    }
318
319    // Ensure required fields are present
320    ensure_required_fields(&processed)?;
321
322    // Generate derived fields
323    generate_derived_fields(&mut processed);
324
325    Ok(processed)
326}
327
328/// Standardizes the date format.
329///
330/// This function attempts to parse various date formats and convert them to the YYYY-MM-DD format.
331///
332/// # Arguments
333///
334/// * `date` - A string slice containing the date to standardize.
335///
336/// # Returns
337///
338/// A `Result` containing the standardized date string if successful, or a `MetadataError` if parsing fails.
339///
340/// # Errors
341///
342/// Returns a `MetadataError::DateParseError` if the date cannot be parsed or is invalid.
343fn standardize_date(date: &str) -> Result<String, MetadataError> {
344    // Handle edge cases with empty or too-short dates
345    if date.trim().is_empty() {
346        return Err(MetadataError::DateParseError(
347            "Date string is empty.".to_string(),
348        ));
349    }
350
351    if date.len() < 8 {
352        return Err(MetadataError::DateParseError(
353            "Date string is too short.".to_string(),
354        ));
355    }
356
357    // Check if the date is in the DD/MM/YYYY format and reformat to YYYY-MM-DD
358    let date = if date.contains('/') && date.len() == 10 {
359        let parts: Vec<&str> = date.split('/').collect();
360        if parts.len() == 3
361            && parts[0].len() == 2
362            && parts[1].len() == 2
363            && parts[2].len() == 4
364        {
365            format!("{}-{}-{}", parts[2], parts[1], parts[0]) // Reformat to YYYY-MM-DD
366        } else {
367            return Err(MetadataError::DateParseError(
368                "Invalid DD/MM/YYYY date format.".to_string(),
369            ));
370        }
371    } else {
372        date.to_string()
373    };
374
375    // Attempt to parse the date in different formats using DateTime methods
376    let parsed_date = DateTime::parse(&date)
377        .or_else(|_| {
378            DateTime::parse_custom_format(&date, "[year]-[month]-[day]")
379        })
380        .or_else(|_| {
381            DateTime::parse_custom_format(&date, "[month]/[day]/[year]")
382        })
383        .map_err(|e| {
384            MetadataError::DateParseError(format!(
385                "Failed to parse date: {}",
386                e
387            ))
388        })?;
389
390    // Format the date to the standardized YYYY-MM-DD format
391    Ok(format!(
392        "{:04}-{:02}-{:02}",
393        parsed_date.year(),
394        parsed_date.month() as u8,
395        parsed_date.day()
396    ))
397}
398
399/// Ensures that all required fields are present in the metadata.
400///
401/// # Arguments
402///
403/// * `metadata` - A reference to the `Metadata` instance to check.
404///
405/// # Returns
406///
407/// A `Result<()>` if all required fields are present, or a `MetadataError` if any are missing.
408///
409/// # Errors
410///
411/// Returns a `MetadataError::MissingFieldError` if any required field is missing.
412fn ensure_required_fields(
413    metadata: &Metadata,
414) -> Result<(), MetadataError> {
415    let required_fields = ["title", "date"];
416
417    for &field in &required_fields {
418        if !metadata.contains_key(field) {
419            return Err(MetadataError::MissingFieldError(
420                field.to_string(),
421            ));
422        }
423    }
424
425    Ok(())
426}
427
428/// Generates derived fields for the metadata.
429///
430/// Currently, this function generates a URL slug from the title if not already present.
431///
432/// # Arguments
433///
434/// * `metadata` - A mutable reference to the `Metadata` instance to update.
435fn generate_derived_fields(metadata: &mut Metadata) {
436    if !metadata.contains_key("slug") {
437        if let Some(title) = metadata.get("title") {
438            let slug = generate_slug(title);
439            metadata.insert("slug".to_string(), slug);
440        }
441    }
442}
443
/// Generates a URL slug from the given title.
///
/// The title is lowercased and every space character is replaced by `-`.
/// Nothing else is altered: punctuation and non-ASCII letters are kept, and
/// leading, trailing or repeated spaces each become their own `-`.
///
/// # Arguments
///
/// * `title` - A string slice containing the title to convert to a slug.
///
/// # Returns
///
/// A `String` containing the generated slug.
fn generate_slug(title: &str) -> String {
    title.to_lowercase().replace(' ', "-")
}
456
/// Unit tests for metadata extraction, flattening, date standardization,
/// required-field validation and slug generation.
#[cfg(test)]
mod tests {
    use super::*;
    use dtt::dtt_parse;

    /// Supported date inputs are normalized to YYYY-MM-DD.
    #[test]
    fn test_standardize_date() {
        let test_cases = vec![
            ("2023-05-20T15:30:00Z", "2023-05-20"),
            ("2023-05-20", "2023-05-20"),
            ("20/05/2023", "2023-05-20"), // European format DD/MM/YYYY
        ];

        for (input, expected) in test_cases {
            let result = standardize_date(input);
            assert!(result.is_ok(), "Failed for input: {}", input);
            assert_eq!(result.unwrap(), expected);
        }
    }

    /// Empty, non-date and two-digit-year inputs are rejected.
    #[test]
    fn test_standardize_date_errors() {
        assert!(standardize_date("").is_err());
        assert!(standardize_date("invalid").is_err());
        assert!(standardize_date("20/05/23").is_err()); // Invalid DD/MM/YY format
    }

    /// The year/month/day accessors format to the expected YYYY-MM-DD text.
    #[test]
    fn test_date_format() {
        let dt = dtt_parse!("2023-01-01T12:00:00+00:00").unwrap();
        let formatted = format!(
            "{:04}-{:02}-{:02}",
            dt.year(),
            dt.month() as u8,
            dt.day()
        );
        assert_eq!(formatted, "2023-01-01");
    }

    /// Slugs are lowercased with each space replaced by a hyphen.
    #[test]
    fn test_generate_slug() {
        assert_eq!(generate_slug("Hello World"), "hello-world");
        assert_eq!(generate_slug("Test 123"), "test-123");
        assert_eq!(generate_slug("  Spaces  "), "--spaces--");
    }

    /// Processing standardizes the date and derives the slug.
    #[test]
    fn test_process_metadata() {
        let mut metadata = Metadata::new(HashMap::new());
        metadata.insert("title".to_string(), "Test Title".to_string());
        metadata.insert(
            "date".to_string(),
            "2023-05-20T15:30:00Z".to_string(),
        );

        let processed = process_metadata(&metadata).unwrap();
        assert_eq!(processed.get("title").unwrap(), "Test Title");
        assert_eq!(processed.get("date").unwrap(), "2023-05-20");
        assert_eq!(processed.get("slug").unwrap(), "test-title");
    }

    /// Each of the three front-matter formats is recognized.
    #[test]
    fn test_extract_metadata() {
        let yaml_content = r#"---
title: YAML Test
date: 2023-05-20
---
Content here"#;

        let toml_content = r#"+++
title = "TOML Test"
date = "2023-05-20"
+++
Content here"#;

        let json_content = r#"{
"title": "JSON Test",
"date": "2023-05-20"
}
Content here"#;

        let yaml_metadata = extract_metadata(yaml_content).unwrap();
        assert_eq!(yaml_metadata.get("title").unwrap(), "YAML Test");

        let toml_metadata = extract_metadata(toml_content).unwrap();
        assert_eq!(toml_metadata.get("title").unwrap(), "TOML Test");

        let json_metadata = extract_metadata(json_content).unwrap();
        assert_eq!(json_metadata.get("title").unwrap(), "JSON Test");
    }

    /// Content without front matter yields an error.
    #[test]
    fn test_extract_metadata_failure() {
        let invalid_content = "This content has no metadata";
        assert!(extract_metadata(invalid_content).is_err());
    }

    /// Both `title` and `date` must be present.
    #[test]
    fn test_ensure_required_fields() {
        let mut metadata = Metadata::new(HashMap::new());
        metadata.insert("title".to_string(), "Test".to_string());
        metadata.insert("date".to_string(), "2023-05-20".to_string());

        assert!(ensure_required_fields(&metadata).is_ok());

        let mut incomplete_metadata = Metadata::new(HashMap::new());
        incomplete_metadata
            .insert("title".to_string(), "Test".to_string());

        assert!(ensure_required_fields(&incomplete_metadata).is_err());
    }

    /// A slug is derived from the title when none is set.
    #[test]
    fn test_generate_derived_fields() {
        let mut metadata = Metadata::new(HashMap::new());
        metadata.insert("title".to_string(), "Test Title".to_string());

        generate_derived_fields(&mut metadata);

        assert_eq!(metadata.get("slug").unwrap(), "test-title");
    }

    /// Basic accessors of `Metadata` behave like the wrapped map.
    #[test]
    fn test_metadata_methods() {
        let mut metadata = Metadata::new(HashMap::new());
        metadata.insert("key".to_string(), "value".to_string());

        assert_eq!(metadata.get("key"), Some(&"value".to_string()));
        assert!(metadata.contains_key("key"));
        assert!(!metadata.contains_key("nonexistent"));

        let old_value =
            metadata.insert("key".to_string(), "new_value".to_string());
        assert_eq!(old_value, Some("value".to_string()));
        assert_eq!(metadata.get("key"), Some(&"new_value".to_string()));

        let inner = metadata.into_inner();
        assert_eq!(inner.get("key"), Some(&"new_value".to_string()));
    }

    /// An unparseable date makes processing fail.
    #[test]
    fn test_process_metadata_with_invalid_date() {
        let mut metadata = Metadata::new(HashMap::new());
        metadata.insert("title".to_string(), "Test Title".to_string());
        metadata.insert("date".to_string(), "invalid_date".to_string());

        assert!(process_metadata(&metadata).is_err());
    }

    /// Nested YAML mappings are flattened and sequences are bracketed.
    #[test]
    fn test_extract_yaml_metadata_with_complex_structure() {
        let yaml_content = r#"---
title: Complex YAML Test
date: 2023-05-20
author:
  name: John Doe
  email: john@example.com
tags:
  - rust
  - metadata
  - testing
---
Content here"#;

        let metadata = extract_metadata(yaml_content).unwrap();
        assert_eq!(metadata.get("title").unwrap(), "Complex YAML Test");
        assert_eq!(metadata.get("date").unwrap(), "2023-05-20");
        assert_eq!(metadata.get("author.name").unwrap(), "John Doe");
        assert_eq!(
            metadata.get("author.email").unwrap(),
            "john@example.com"
        );
        assert_eq!(
            metadata.get("tags").unwrap(),
            "[rust, metadata, testing]"
        );
    }

    /// Nested TOML tables are flattened and arrays are bracketed.
    ///
    /// `tags` is declared after the `[author]` header, so it belongs to that
    /// table and is exposed as `author.tags`.
    #[test]
    fn test_extract_toml_metadata_with_complex_structure() {
        let toml_content = r#"+++
title = "Complex TOML Test"
date = 2023-05-20

[author]
name = "John Doe"
email = "john@example.com"

tags = ["rust", "metadata", "testing"]
+++
Content here"#;

        let metadata = extract_metadata(toml_content).unwrap();
        assert_eq!(
            metadata.get("title").expect("Missing 'title' key"),
            "Complex TOML Test"
        );
        assert_eq!(
            metadata.get("date").expect("Missing 'date' key"),
            "2023-05-20"
        );
        assert_eq!(
            metadata
                .get("author.name")
                .expect("Missing 'author.name' key"),
            "John Doe"
        );
        assert_eq!(
            metadata
                .get("author.email")
                .expect("Missing 'author.email' key"),
            "john@example.com"
        );
        assert_eq!(
            metadata
                .get("author.tags")
                .expect("Missing 'author.tags' key"),
            "[rust, metadata, testing]"
        );
    }

    /// Punctuation, non-ASCII letters and repeated spaces are kept as-is
    /// apart from lowercasing and the space-to-hyphen replacement.
    #[test]
    fn test_generate_slug_with_special_characters() {
        assert_eq!(
            generate_slug("Hello, World! 123"),
            "hello,-world!-123"
        );
        assert_eq!(generate_slug("Test: Ästhetik"), "test:-ästhetik");
        assert_eq!(
            generate_slug("  Multiple   Spaces  "),
            "--multiple---spaces--"
        );
    }
}