Skip to main content

systemprompt_content/config/
ready.rs

1//! Loaded content index built from a validated configuration.
2//!
3//! [`ContentReady`] scans every enabled source's directory, parses each
4//! markdown file's frontmatter into [`ParsedContent`], and indexes the results
5//! by slug and by [`SourceId`]. [`LoadStats`] records per-source scan and parse
6//! outcomes for diagnostics.
7
8use chrono::{DateTime, Utc};
9use sha2::{Digest, Sha256};
10use std::collections::HashMap;
11use std::path::{Path, PathBuf};
12use systemprompt_identifiers::{CategoryId, SourceId};
13use systemprompt_models::ContentRouting;
14use walkdir::WalkDir;
15
16use crate::ContentError;
17use crate::models::ContentMetadata;
18use crate::services::validate_content_metadata;
19
20use super::validated::{ContentConfigValidated, ContentSourceConfigValidated};
21
22#[derive(Debug, Clone)]
23pub struct ContentReady {
24    config: ContentConfigValidated,
25    content_by_slug: HashMap<String, ParsedContent>,
26    content_by_source: HashMap<SourceId, Vec<ParsedContent>>,
27    stats: LoadStats,
28}
29
30#[derive(Debug, Clone)]
31pub struct ParsedContent {
32    pub slug: String,
33    pub title: String,
34    pub description: String,
35    pub body: String,
36    pub author: String,
37    pub published_at: DateTime<Utc>,
38    pub keywords: String,
39    pub kind: String,
40    pub image: Option<String>,
41    pub category_id: CategoryId,
42    pub source_id: SourceId,
43    pub version_hash: String,
44    pub file_path: PathBuf,
45}
46
47#[derive(Debug, Clone, Default)]
48pub struct LoadStats {
49    pub files_found: usize,
50    pub files_loaded: usize,
51    pub files_with_errors: usize,
52    pub load_time_ms: u64,
53    pub source_stats: HashMap<String, SourceLoadStats>,
54}
55
/// Scan and parse counters for a single content source.
#[derive(Clone, Copy, Debug, Default)]
pub struct SourceLoadStats {
    /// Markdown files discovered in the source directory.
    pub files_found: usize,
    /// Files from this source that loaded successfully.
    pub files_loaded: usize,
    /// Files from this source that failed to parse or validate.
    pub errors: usize,
}
62
63impl ContentReady {
64    pub fn from_validated(config: ContentConfigValidated) -> Self {
65        let start_time = std::time::Instant::now();
66        let mut content_by_slug = HashMap::new();
67        let mut content_by_source: HashMap<SourceId, Vec<ParsedContent>> = HashMap::new();
68        let mut stats = LoadStats::default();
69
70        for (source_name, source_config) in config.content_sources() {
71            if !source_config.enabled {
72                continue;
73            }
74
75            let mut source_stats = SourceLoadStats::default();
76
77            let files = scan_markdown_files(&source_config.path, source_config.indexing.recursive);
78            source_stats.files_found = files.len();
79            stats.files_found += files.len();
80
81            for file_path in files {
82                match parse_content_file(&file_path, source_config) {
83                    Ok(content) => {
84                        let slug = content.slug.clone();
85                        let source_id = content.source_id.clone();
86
87                        content_by_source
88                            .entry(source_id)
89                            .or_default()
90                            .push(content.clone());
91
92                        content_by_slug.insert(slug, content);
93
94                        source_stats.files_loaded += 1;
95                        stats.files_loaded += 1;
96                    },
97                    Err(e) => {
98                        tracing::warn!(
99                            file = %file_path.display(),
100                            error = %e,
101                            "Failed to parse content file"
102                        );
103                        source_stats.errors += 1;
104                        stats.files_with_errors += 1;
105                    },
106                }
107            }
108
109            stats.source_stats.insert(source_name.clone(), source_stats);
110        }
111
112        stats.load_time_ms = start_time.elapsed().as_millis() as u64;
113
114        Self {
115            config,
116            content_by_slug,
117            content_by_source,
118            stats,
119        }
120    }
121
122    pub const fn config(&self) -> &ContentConfigValidated {
123        &self.config
124    }
125
126    pub const fn stats(&self) -> &LoadStats {
127        &self.stats
128    }
129
130    pub fn get_by_slug(&self, slug: &str) -> Option<&ParsedContent> {
131        self.content_by_slug.get(slug)
132    }
133
134    pub fn get_by_source(&self, source_id: &SourceId) -> Option<&Vec<ParsedContent>> {
135        self.content_by_source.get(source_id)
136    }
137
138    pub fn all_content(&self) -> impl Iterator<Item = &ParsedContent> {
139        self.content_by_slug.values()
140    }
141
142    pub fn content_count(&self) -> usize {
143        self.content_by_slug.len()
144    }
145}
146
147impl ContentRouting for ContentReady {
148    fn is_html_page(&self, path: &str) -> bool {
149        self.config.is_html_page(path)
150    }
151
152    fn determine_source(&self, path: &str) -> String {
153        self.config.determine_source(path)
154    }
155}
156
157fn scan_markdown_files(dir: &Path, recursive: bool) -> Vec<PathBuf> {
158    let walker = if recursive {
159        WalkDir::new(dir).min_depth(1)
160    } else {
161        WalkDir::new(dir).min_depth(1).max_depth(1)
162    };
163
164    walker
165        .into_iter()
166        .filter_map(Result::ok)
167        .filter(|e| e.file_type().is_file())
168        .filter(|e| e.path().extension().is_some_and(|ext| ext == "md"))
169        .map(|e| e.path().to_path_buf())
170        .collect()
171}
172
173fn parse_content_file(
174    file_path: &Path,
175    source_config: &ContentSourceConfigValidated,
176) -> Result<ParsedContent, ContentError> {
177    let markdown_text = std::fs::read_to_string(file_path).map_err(ContentError::Io)?;
178
179    let (metadata, body) = parse_frontmatter(&markdown_text)?;
180
181    validate_content_metadata(&metadata)?;
182
183    let published_at = parse_date(&metadata.published_at)?;
184
185    let category_id = metadata.category.as_ref().map_or_else(
186        || source_config.category_id.clone(),
187        |c| CategoryId::new(c.clone()),
188    );
189
190    let version_hash = compute_version_hash(&metadata.title, &body, &metadata.description);
191
192    Ok(ParsedContent {
193        slug: metadata.slug,
194        title: metadata.title,
195        description: metadata.description,
196        body,
197        author: metadata.author,
198        published_at,
199        keywords: metadata.keywords,
200        kind: metadata.kind,
201        image: metadata.image,
202        category_id,
203        source_id: source_config.source_id.clone(),
204        version_hash,
205        file_path: file_path.to_path_buf(),
206    })
207}
208
209fn parse_frontmatter(markdown: &str) -> Result<(ContentMetadata, String), ContentError> {
210    let parts: Vec<&str> = markdown.splitn(3, "---").collect();
211
212    if parts.len() < 3 {
213        return Err(ContentError::Parse(
214            "Invalid frontmatter format - missing '---' delimiters".to_owned(),
215        ));
216    }
217
218    let metadata: ContentMetadata = serde_yaml::from_str(parts[1]).map_err(ContentError::Yaml)?;
219
220    Ok((metadata, parts[2].trim().to_owned()))
221}
222
223fn parse_date(date_str: &str) -> Result<DateTime<Utc>, ContentError> {
224    chrono::NaiveDate::parse_from_str(date_str, "%Y-%m-%d")
225        .map_err(|e| ContentError::Parse(format!("Invalid date '{}': {}", date_str, e)))?
226        .and_hms_opt(0, 0, 0)
227        .ok_or_else(|| ContentError::Parse("Failed to create datetime".to_owned()))?
228        .and_local_timezone(Utc)
229        .single()
230        .ok_or_else(|| ContentError::Parse("Ambiguous timezone conversion".to_owned()))
231}
232
233fn compute_version_hash(title: &str, body: &str, description: &str) -> String {
234    let mut hasher = Sha256::new();
235    hasher.update(title.as_bytes());
236    hasher.update(body.as_bytes());
237    hasher.update(description.as_bytes());
238    hex::encode(hasher.finalize())
239}