metadata_gen/
metadata.rs

//! Metadata extraction and processing module.
//!
//! This module provides functionality for extracting metadata from various formats
//! (YAML, TOML, JSON) and processing it into a standardized structure.
5
6use crate::error::MetadataError;
7use dtt::datetime::DateTime;
8use regex::Regex;
9use serde_json::Value as JsonValue;
10use std::collections::HashMap;
11use toml::Value as TomlValue;
12
/// A flat collection of string key/value pairs describing a page or content item.
///
/// The type is a thin wrapper around a `HashMap<String, String>`; every method
/// below forwards to the wrapped map.
#[derive(Debug, Default, Clone)]
pub struct Metadata {
    inner: HashMap<String, String>,
}

impl Metadata {
    /// Wraps an existing map of key-value pairs in a `Metadata` value.
    ///
    /// # Arguments
    ///
    /// * `data` - The key-value pairs that become the metadata contents.
    ///
    /// # Returns
    ///
    /// A `Metadata` instance that owns `data`.
    pub fn new(data: HashMap<String, String>) -> Self {
        Self { inner: data }
    }

    /// Looks up the value stored under `key`.
    ///
    /// # Arguments
    ///
    /// * `key` - The key to look up.
    ///
    /// # Returns
    ///
    /// `Some(&String)` when the key is present, `None` otherwise.
    pub fn get(&self, key: &str) -> Option<&String> {
        let Self { inner } = self;
        inner.get(key)
    }

    /// Stores `value` under `key`, replacing any value already stored there.
    ///
    /// # Arguments
    ///
    /// * `key` - The key to insert.
    /// * `value` - The value to associate with the key.
    ///
    /// # Returns
    ///
    /// The value previously stored under `key`, or `None` if there was none.
    pub fn insert(
        &mut self,
        key: String,
        value: String,
    ) -> Option<String> {
        let Self { inner } = self;
        inner.insert(key, value)
    }

    /// Reports whether a value is stored under `key`.
    ///
    /// # Arguments
    ///
    /// * `key` - The key to check for.
    ///
    /// # Returns
    ///
    /// `true` if the key exists, `false` otherwise.
    pub fn contains_key(&self, key: &str) -> bool {
        self.get(key).is_some()
    }

    /// Consumes the `Metadata` and hands back the wrapped map.
    ///
    /// # Returns
    ///
    /// The `HashMap<String, String>` holding all key-value pairs.
    pub fn into_inner(self) -> HashMap<String, String> {
        let Self { inner } = self;
        inner
    }
}
86
87/// Extracts metadata from the content string.
88///
89/// This function attempts to extract metadata from YAML, TOML, or JSON formats.
90///
91/// # Arguments
92///
93/// * `content` - A string slice containing the content to extract metadata from.
94///
95/// # Returns
96///
97/// A `Result` containing the extracted `Metadata` if successful, or a `MetadataError` if extraction fails.
98///
99/// # Errors
100///
101/// Returns a `MetadataError::ExtractionError` if no valid front matter is found.
102pub fn extract_metadata(
103    content: &str,
104) -> Result<Metadata, MetadataError> {
105    extract_yaml_metadata(content)
106        .or_else(|| extract_toml_metadata(content))
107        .or_else(|| extract_json_metadata(content))
108        .ok_or_else(|| MetadataError::ExtractionError {
109            message: "No valid front matter found.".to_string(),
110        })
111}
112
113/// Extracts YAML metadata from the content.
114///
115/// # Arguments
116///
117/// * `content` - A string slice containing the content to extract YAML metadata from.
118///
119/// # Returns
120///
121/// An `Option<Metadata>` containing the extracted metadata if successful, or `None` if extraction fails.
122fn extract_yaml_metadata(content: &str) -> Option<Metadata> {
123    let re = Regex::new(r"(?s)^\s*---\s*\n(.*?)\n\s*---\s*").ok()?;
124    let captures = re.captures(content)?;
125
126    let yaml_str = captures.get(1)?.as_str().trim();
127
128    let yaml_value: serde_yml::Value =
129        serde_yml::from_str(yaml_str).ok()?;
130
131    let metadata: HashMap<String, String> = flatten_yaml(&yaml_value);
132
133    Some(Metadata::new(metadata))
134}
135
136fn flatten_yaml(value: &serde_yml::Value) -> HashMap<String, String> {
137    let mut map = HashMap::new();
138    flatten_yaml_recursive(value, String::new(), &mut map);
139    map
140}
141
142fn flatten_yaml_recursive(
143    value: &serde_yml::Value,
144    prefix: String,
145    map: &mut HashMap<String, String>,
146) {
147    match value {
148        serde_yml::Value::Mapping(m) => {
149            for (k, v) in m {
150                let new_prefix = if prefix.is_empty() {
151                    k.as_str().unwrap_or_default().to_string()
152                } else {
153                    format!(
154                        "{}.{}",
155                        prefix,
156                        k.as_str().unwrap_or_default()
157                    )
158                };
159                flatten_yaml_recursive(v, new_prefix, map);
160            }
161        }
162        serde_yml::Value::Sequence(seq) => {
163            let inline_list = seq
164                .iter()
165                .filter_map(|item| item.as_str().map(|s| s.to_string()))
166                .collect::<Vec<String>>()
167                .join(", ");
168            map.insert(prefix, format!("[{}]", inline_list));
169        }
170        _ => {
171            map.insert(
172                prefix,
173                value.as_str().unwrap_or_default().to_string(),
174            );
175        }
176    }
177}
178
179/// Extracts TOML metadata from the content.
180///
181/// # Arguments
182///
183/// * `content` - A string slice containing the content to extract TOML metadata from.
184///
185/// # Returns
186///
187/// An `Option<Metadata>` containing the extracted metadata if successful, or `None` if extraction fails.
188fn extract_toml_metadata(content: &str) -> Option<Metadata> {
189    let re = Regex::new(r"(?s)^\s*\+\+\+\s*(.*?)\s*\+\+\+").ok()?;
190    let captures = re.captures(content)?;
191    let toml_str = captures.get(1)?.as_str().trim();
192
193    let toml_value: TomlValue = toml::from_str(toml_str).ok()?;
194
195    let mut metadata = HashMap::new();
196    flatten_toml(&toml_value, &mut metadata, String::new());
197
198    Some(Metadata::new(metadata))
199}
200
201fn flatten_toml(
202    value: &TomlValue,
203    map: &mut HashMap<String, String>,
204    prefix: String,
205) {
206    match value {
207        TomlValue::Table(table) => {
208            for (k, v) in table {
209                let new_prefix = if prefix.is_empty() {
210                    k.to_string()
211                } else {
212                    format!("{}.{}", prefix, k)
213                };
214                flatten_toml(v, map, new_prefix);
215            }
216        }
217        TomlValue::Array(arr) => {
218            let inline_list = arr
219                .iter()
220                .map(|v| {
221                    // Remove double quotes for string elements
222                    match v {
223                        TomlValue::String(s) => s.clone(),
224                        _ => v.to_string(),
225                    }
226                })
227                .collect::<Vec<String>>()
228                .join(", ");
229            map.insert(prefix, format!("[{}]", inline_list));
230        }
231        TomlValue::String(s) => {
232            map.insert(prefix, s.clone());
233        }
234        TomlValue::Datetime(dt) => {
235            map.insert(prefix, dt.to_string());
236        }
237        _ => {
238            map.insert(prefix, value.to_string());
239        }
240    }
241}
242
243/// Extracts JSON metadata from the content.
244///
245/// # Arguments
246///
247/// * `content` - A string slice containing the content to extract JSON metadata from.
248///
249/// # Returns
250///
251/// An `Option<Metadata>` containing the extracted metadata if successful, or `None` if extraction fails.
252fn extract_json_metadata(content: &str) -> Option<Metadata> {
253    let re = Regex::new(r"(?s)^\s*\{\s*(.*?)\s*\}").ok()?;
254    let captures = re.captures(content)?;
255    let json_str = format!("{{{}}}", captures.get(1)?.as_str().trim());
256
257    let json_value: JsonValue = serde_json::from_str(&json_str).ok()?;
258    let json_object = json_value.as_object()?;
259
260    let metadata: HashMap<String, String> = json_object
261        .iter()
262        .filter_map(|(k, v)| {
263            v.as_str().map(|s| (k.clone(), s.to_string()))
264        })
265        .collect();
266
267    Some(Metadata::new(metadata))
268}
269
270/// Processes the extracted metadata.
271///
272/// This function standardizes dates, ensures required fields are present, and generates derived fields.
273///
274/// # Arguments
275///
276/// * `metadata` - A reference to the `Metadata` instance to process.
277///
278/// # Returns
279///
280/// A `Result` containing the processed `Metadata` if successful, or a `MetadataError` if processing fails.
281///
282/// # Errors
283///
284/// Returns a `MetadataError` if date standardization fails or if required fields are missing.
285pub fn process_metadata(
286    metadata: &Metadata,
287) -> Result<Metadata, MetadataError> {
288    let mut processed = metadata.clone();
289
290    // Convert dates to a standard format
291    if let Some(date) = processed.get("date").cloned() {
292        let standardized_date = standardize_date(&date)?;
293        processed.insert("date".to_string(), standardized_date);
294    }
295
296    // Ensure required fields are present
297    ensure_required_fields(&processed)?;
298
299    // Generate derived fields
300    generate_derived_fields(&mut processed);
301
302    Ok(processed)
303}
304
305/// Standardizes the date format.
306///
307/// This function attempts to parse various date formats and convert them to the YYYY-MM-DD format.
308///
309/// # Arguments
310///
311/// * `date` - A string slice containing the date to standardize.
312///
313/// # Returns
314///
315/// A `Result` containing the standardized date string if successful, or a `MetadataError` if parsing fails.
316///
317/// # Errors
318///
319/// Returns a `MetadataError::DateParseError` if the date cannot be parsed or is invalid.
320fn standardize_date(date: &str) -> Result<String, MetadataError> {
321    // Handle edge cases with empty or too-short dates
322    if date.trim().is_empty() {
323        return Err(MetadataError::DateParseError(
324            "Date string is empty.".to_string(),
325        ));
326    }
327
328    if date.len() < 8 {
329        return Err(MetadataError::DateParseError(
330            "Date string is too short.".to_string(),
331        ));
332    }
333
334    // Check if the date is in the DD/MM/YYYY format and reformat to YYYY-MM-DD
335    let date = if date.contains('/') && date.len() == 10 {
336        let parts: Vec<&str> = date.split('/').collect();
337        if parts.len() == 3
338            && parts[0].len() == 2
339            && parts[1].len() == 2
340            && parts[2].len() == 4
341        {
342            format!("{}-{}-{}", parts[2], parts[1], parts[0]) // Reformat to YYYY-MM-DD
343        } else {
344            return Err(MetadataError::DateParseError(
345                "Invalid DD/MM/YYYY date format.".to_string(),
346            ));
347        }
348    } else {
349        date.to_string()
350    };
351
352    // Attempt to parse the date in different formats using DateTime methods
353    let parsed_date = DateTime::parse(&date)
354        .or_else(|_| {
355            DateTime::parse_custom_format(&date, "[year]-[month]-[day]")
356        })
357        .or_else(|_| {
358            DateTime::parse_custom_format(&date, "[month]/[day]/[year]")
359        })
360        .map_err(|e| {
361            MetadataError::DateParseError(format!(
362                "Failed to parse date: {}",
363                e
364            ))
365        })?;
366
367    // Format the date to the standardized YYYY-MM-DD format
368    Ok(format!(
369        "{:04}-{:02}-{:02}",
370        parsed_date.year(),
371        parsed_date.month() as u8,
372        parsed_date.day()
373    ))
374}
375
376/// Ensures that all required fields are present in the metadata.
377///
378/// # Arguments
379///
380/// * `metadata` - A reference to the `Metadata` instance to check.
381///
382/// # Returns
383///
384/// A `Result<()>` if all required fields are present, or a `MetadataError` if any are missing.
385///
386/// # Errors
387///
388/// Returns a `MetadataError::MissingFieldError` if any required field is missing.
389fn ensure_required_fields(
390    metadata: &Metadata,
391) -> Result<(), MetadataError> {
392    let required_fields = ["title", "date"];
393
394    for &field in &required_fields {
395        if !metadata.contains_key(field) {
396            return Err(MetadataError::MissingFieldError(
397                field.to_string(),
398            ));
399        }
400    }
401
402    Ok(())
403}
404
405/// Generates derived fields for the metadata.
406///
407/// Currently, this function generates a URL slug from the title if not already present.
408///
409/// # Arguments
410///
411/// * `metadata` - A mutable reference to the `Metadata` instance to update.
412fn generate_derived_fields(metadata: &mut Metadata) {
413    if !metadata.contains_key("slug") {
414        if let Some(title) = metadata.get("title") {
415            let slug = generate_slug(title);
416            metadata.insert("slug".to_string(), slug);
417        }
418    }
419}
420
/// Builds a URL slug from a title.
///
/// The title is lowercased and every space character is replaced by `-`.
/// Nothing else is altered: punctuation is kept, and leading, trailing or
/// repeated spaces each become their own `-`.
///
/// # Arguments
///
/// * `title` - The title to convert to a slug.
///
/// # Returns
///
/// A `String` containing the generated slug.
fn generate_slug(title: &str) -> String {
    // Lowercase the whole string first (not per character) so the result is
    // exactly what `str::to_lowercase` yields, then swap the spaces.
    title
        .to_lowercase()
        .chars()
        .map(|ch| if ch == ' ' { '-' } else { ch })
        .collect()
}
433
#[cfg(test)]
mod tests {
    use super::*;
    use dtt::dtt_parse;

    /// Builds a `Metadata` from literal pairs by inserting them one by one.
    fn metadata_with(pairs: &[(&str, &str)]) -> Metadata {
        let mut built = Metadata::new(HashMap::new());
        for &(key, value) in pairs {
            built.insert(key.to_string(), value.to_string());
        }
        built
    }

    // Accepted inputs all normalize to YYYY-MM-DD.
    #[test]
    fn test_standardize_date() {
        let cases = [
            ("2023-05-20T15:30:00Z", "2023-05-20"),
            ("2023-05-20", "2023-05-20"),
            ("20/05/2023", "2023-05-20"), // European format DD/MM/YYYY
        ];

        for &(input, expected) in &cases {
            let outcome = standardize_date(input);
            assert!(outcome.is_ok(), "Failed for input: {}", input);
            assert_eq!(outcome.unwrap(), expected);
        }
    }

    // Empty, unparsable and two-digit-year inputs are rejected.
    #[test]
    fn test_standardize_date_errors() {
        let rejected = [
            "",
            "invalid",
            "20/05/23", // Invalid DD/MM/YY format
        ];

        for &input in &rejected {
            assert!(standardize_date(input).is_err());
        }
    }

    // The formatting expression used by `standardize_date` yields YYYY-MM-DD.
    #[test]
    fn test_date_format() {
        let parsed = dtt_parse!("2023-01-01T12:00:00+00:00").unwrap();
        let rendered = format!(
            "{:04}-{:02}-{:02}",
            parsed.year(),
            parsed.month() as u8,
            parsed.day()
        );
        assert_eq!(rendered, "2023-01-01");
    }

    #[test]
    fn test_generate_slug() {
        let cases = [
            ("Hello World", "hello-world"),
            ("Test 123", "test-123"),
            ("  Spaces  ", "--spaces--"),
        ];

        for &(title, expected) in &cases {
            assert_eq!(generate_slug(title), expected);
        }
    }

    // Processing keeps the title, normalizes the date and derives the slug.
    #[test]
    fn test_process_metadata() {
        let input = metadata_with(&[
            ("title", "Test Title"),
            ("date", "2023-05-20T15:30:00Z"),
        ]);

        let output = process_metadata(&input).unwrap();
        assert_eq!(output.get("title").unwrap(), "Test Title");
        assert_eq!(output.get("date").unwrap(), "2023-05-20");
        assert_eq!(output.get("slug").unwrap(), "test-title");
    }

    // Each supported front-matter format yields its title.
    #[test]
    fn test_extract_metadata() {
        let yaml_content = r#"---
title: YAML Test
date: 2023-05-20
---
Content here"#;

        let toml_content = r#"+++
title = "TOML Test"
date = "2023-05-20"
+++
Content here"#;

        let json_content = r#"{
"title": "JSON Test",
"date": "2023-05-20"
}
Content here"#;

        let from_yaml = extract_metadata(yaml_content).unwrap();
        assert_eq!(from_yaml.get("title").unwrap(), "YAML Test");

        let from_toml = extract_metadata(toml_content).unwrap();
        assert_eq!(from_toml.get("title").unwrap(), "TOML Test");

        let from_json = extract_metadata(json_content).unwrap();
        assert_eq!(from_json.get("title").unwrap(), "JSON Test");
    }

    #[test]
    fn test_extract_metadata_failure() {
        let plain_text = "This content has no metadata";
        assert!(extract_metadata(plain_text).is_err());
    }

    #[test]
    fn test_ensure_required_fields() {
        let complete =
            metadata_with(&[("title", "Test"), ("date", "2023-05-20")]);
        assert!(ensure_required_fields(&complete).is_ok());

        let without_date = metadata_with(&[("title", "Test")]);
        assert!(ensure_required_fields(&without_date).is_err());
    }

    #[test]
    fn test_generate_derived_fields() {
        let mut metadata = metadata_with(&[("title", "Test Title")]);

        generate_derived_fields(&mut metadata);

        assert_eq!(metadata.get("slug").unwrap(), "test-title");
    }

    // Exercises get / contains_key / insert / into_inner on the wrapper.
    #[test]
    fn test_metadata_methods() {
        let mut metadata = Metadata::new(HashMap::new());
        metadata.insert("key".to_string(), "value".to_string());

        assert_eq!(metadata.get("key"), Some(&"value".to_string()));
        assert!(metadata.contains_key("key"));
        assert!(!metadata.contains_key("nonexistent"));

        let replaced =
            metadata.insert("key".to_string(), "new_value".to_string());
        assert_eq!(replaced, Some("value".to_string()));
        assert_eq!(metadata.get("key"), Some(&"new_value".to_string()));

        let raw_map = metadata.into_inner();
        assert_eq!(raw_map.get("key"), Some(&"new_value".to_string()));
    }

    #[test]
    fn test_process_metadata_with_invalid_date() {
        let input = metadata_with(&[
            ("title", "Test Title"),
            ("date", "invalid_date"),
        ]);

        assert!(process_metadata(&input).is_err());
    }

    // Nested mappings become dotted keys; sequences become inline lists.
    #[test]
    fn test_extract_yaml_metadata_with_complex_structure() {
        let yaml_content = r#"---
title: Complex YAML Test
date: 2023-05-20
author:
  name: John Doe
  email: john@example.com
tags:
  - rust
  - metadata
  - testing
---
Content here"#;

        let extracted = extract_metadata(yaml_content).unwrap();
        assert_eq!(
            extracted.get("title").unwrap(),
            "Complex YAML Test"
        );
        assert_eq!(extracted.get("date").unwrap(), "2023-05-20");
        assert_eq!(extracted.get("author.name").unwrap(), "John Doe");
        assert_eq!(
            extracted.get("author.email").unwrap(),
            "john@example.com"
        );
        assert_eq!(
            extracted.get("tags").unwrap(),
            "[rust, metadata, testing]"
        );
    }

    // `tags` is written after the `[author]` header, so it belongs to that
    // table and is expected under `author.tags`.
    #[test]
    fn test_extract_toml_metadata_with_complex_structure() {
        let toml_content = r#"+++
title = "Complex TOML Test"
date = 2023-05-20

[author]
name = "John Doe"
email = "john@example.com"

tags = ["rust", "metadata", "testing"]
+++
Content here"#;

        let extracted = extract_metadata(toml_content).unwrap();

        let title = extracted.get("title").expect("Missing 'title' key");
        assert_eq!(title, "Complex TOML Test");

        let date = extracted.get("date").expect("Missing 'date' key");
        assert_eq!(date, "2023-05-20");

        let author_name = extracted
            .get("author.name")
            .expect("Missing 'author.name' key");
        assert_eq!(author_name, "John Doe");

        let author_email = extracted
            .get("author.email")
            .expect("Missing 'author.email' key");
        assert_eq!(author_email, "john@example.com");

        let author_tags = extracted
            .get("author.tags")
            .expect("Missing 'author.tags' key");
        assert_eq!(author_tags, "[rust, metadata, testing]");
    }

    // Punctuation and non-ASCII letters pass through; only spaces change.
    #[test]
    fn test_generate_slug_with_special_characters() {
        let cases = [
            ("Hello, World! 123", "hello,-world!-123"),
            ("Test: Ästhetik", "test:-ästhetik"),
            ("  Multiple   Spaces  ", "--multiple---spaces--"),
        ];

        for &(title, expected) in &cases {
            assert_eq!(generate_slug(title), expected);
        }
    }
}