systemprompt_content/config/
ready.rs1use chrono::{DateTime, Utc};
9use sha2::{Digest, Sha256};
10use std::collections::HashMap;
11use std::path::{Path, PathBuf};
12use systemprompt_identifiers::{CategoryId, SourceId};
13use systemprompt_models::ContentRouting;
14use walkdir::WalkDir;
15
16use crate::ContentError;
17use crate::models::ContentMetadata;
18use crate::services::validate_content_metadata;
19
20use super::validated::{ContentConfigValidated, ContentSourceConfigValidated};
21
22#[derive(Debug, Clone)]
23pub struct ContentReady {
24 config: ContentConfigValidated,
25 content_by_slug: HashMap<String, ParsedContent>,
26 content_by_source: HashMap<SourceId, Vec<ParsedContent>>,
27 stats: LoadStats,
28}
29
30#[derive(Debug, Clone)]
31pub struct ParsedContent {
32 pub slug: String,
33 pub title: String,
34 pub description: String,
35 pub body: String,
36 pub author: String,
37 pub published_at: DateTime<Utc>,
38 pub keywords: String,
39 pub kind: String,
40 pub image: Option<String>,
41 pub category_id: CategoryId,
42 pub source_id: SourceId,
43 pub version_hash: String,
44 pub file_path: PathBuf,
45}
46
47#[derive(Debug, Clone, Default)]
48pub struct LoadStats {
49 pub files_found: usize,
50 pub files_loaded: usize,
51 pub files_with_errors: usize,
52 pub load_time_ms: u64,
53 pub source_stats: HashMap<String, SourceLoadStats>,
54}
55
/// Load counters for a single content source.
#[derive(Clone, Copy, Debug, Default)]
pub struct SourceLoadStats {
    /// Number of markdown files discovered in this source.
    pub files_found: usize,
    /// Number of those files that parsed and validated successfully.
    pub files_loaded: usize,
    /// Number of those files that failed to parse or validate.
    pub errors: usize,
}
62
63impl ContentReady {
64 pub fn from_validated(config: ContentConfigValidated) -> Self {
65 let start_time = std::time::Instant::now();
66 let mut content_by_slug = HashMap::new();
67 let mut content_by_source: HashMap<SourceId, Vec<ParsedContent>> = HashMap::new();
68 let mut stats = LoadStats::default();
69
70 for (source_name, source_config) in config.content_sources() {
71 if !source_config.enabled {
72 continue;
73 }
74
75 let mut source_stats = SourceLoadStats::default();
76
77 let files = scan_markdown_files(&source_config.path, source_config.indexing.recursive);
78 source_stats.files_found = files.len();
79 stats.files_found += files.len();
80
81 for file_path in files {
82 match parse_content_file(&file_path, source_config) {
83 Ok(content) => {
84 let slug = content.slug.clone();
85 let source_id = content.source_id.clone();
86
87 content_by_source
88 .entry(source_id)
89 .or_default()
90 .push(content.clone());
91
92 content_by_slug.insert(slug, content);
93
94 source_stats.files_loaded += 1;
95 stats.files_loaded += 1;
96 },
97 Err(e) => {
98 tracing::warn!(
99 file = %file_path.display(),
100 error = %e,
101 "Failed to parse content file"
102 );
103 source_stats.errors += 1;
104 stats.files_with_errors += 1;
105 },
106 }
107 }
108
109 stats.source_stats.insert(source_name.clone(), source_stats);
110 }
111
112 stats.load_time_ms = start_time.elapsed().as_millis() as u64;
113
114 Self {
115 config,
116 content_by_slug,
117 content_by_source,
118 stats,
119 }
120 }
121
122 pub const fn config(&self) -> &ContentConfigValidated {
123 &self.config
124 }
125
126 pub const fn stats(&self) -> &LoadStats {
127 &self.stats
128 }
129
130 pub fn get_by_slug(&self, slug: &str) -> Option<&ParsedContent> {
131 self.content_by_slug.get(slug)
132 }
133
134 pub fn get_by_source(&self, source_id: &SourceId) -> Option<&Vec<ParsedContent>> {
135 self.content_by_source.get(source_id)
136 }
137
138 pub fn all_content(&self) -> impl Iterator<Item = &ParsedContent> {
139 self.content_by_slug.values()
140 }
141
142 pub fn content_count(&self) -> usize {
143 self.content_by_slug.len()
144 }
145}
146
147impl ContentRouting for ContentReady {
148 fn is_html_page(&self, path: &str) -> bool {
149 self.config.is_html_page(path)
150 }
151
152 fn determine_source(&self, path: &str) -> String {
153 self.config.determine_source(path)
154 }
155}
156
157fn scan_markdown_files(dir: &Path, recursive: bool) -> Vec<PathBuf> {
158 let walker = if recursive {
159 WalkDir::new(dir).min_depth(1)
160 } else {
161 WalkDir::new(dir).min_depth(1).max_depth(1)
162 };
163
164 walker
165 .into_iter()
166 .filter_map(Result::ok)
167 .filter(|e| e.file_type().is_file())
168 .filter(|e| e.path().extension().is_some_and(|ext| ext == "md"))
169 .map(|e| e.path().to_path_buf())
170 .collect()
171}
172
173fn parse_content_file(
174 file_path: &Path,
175 source_config: &ContentSourceConfigValidated,
176) -> Result<ParsedContent, ContentError> {
177 let markdown_text = std::fs::read_to_string(file_path).map_err(ContentError::Io)?;
178
179 let (metadata, body) = parse_frontmatter(&markdown_text)?;
180
181 validate_content_metadata(&metadata)?;
182
183 let published_at = parse_date(&metadata.published_at)?;
184
185 let category_id = metadata.category.as_ref().map_or_else(
186 || source_config.category_id.clone(),
187 |c| CategoryId::new(c.clone()),
188 );
189
190 let version_hash = compute_version_hash(&metadata.title, &body, &metadata.description);
191
192 Ok(ParsedContent {
193 slug: metadata.slug,
194 title: metadata.title,
195 description: metadata.description,
196 body,
197 author: metadata.author,
198 published_at,
199 keywords: metadata.keywords,
200 kind: metadata.kind,
201 image: metadata.image,
202 category_id,
203 source_id: source_config.source_id.clone(),
204 version_hash,
205 file_path: file_path.to_path_buf(),
206 })
207}
208
209fn parse_frontmatter(markdown: &str) -> Result<(ContentMetadata, String), ContentError> {
210 let parts: Vec<&str> = markdown.splitn(3, "---").collect();
211
212 if parts.len() < 3 {
213 return Err(ContentError::Parse(
214 "Invalid frontmatter format - missing '---' delimiters".to_owned(),
215 ));
216 }
217
218 let metadata: ContentMetadata = serde_yaml::from_str(parts[1]).map_err(ContentError::Yaml)?;
219
220 Ok((metadata, parts[2].trim().to_owned()))
221}
222
223fn parse_date(date_str: &str) -> Result<DateTime<Utc>, ContentError> {
224 chrono::NaiveDate::parse_from_str(date_str, "%Y-%m-%d")
225 .map_err(|e| ContentError::Parse(format!("Invalid date '{}': {}", date_str, e)))?
226 .and_hms_opt(0, 0, 0)
227 .ok_or_else(|| ContentError::Parse("Failed to create datetime".to_owned()))?
228 .and_local_timezone(Utc)
229 .single()
230 .ok_or_else(|| ContentError::Parse("Ambiguous timezone conversion".to_owned()))
231}
232
233fn compute_version_hash(title: &str, body: &str, description: &str) -> String {
234 let mut hasher = Sha256::new();
235 hasher.update(title.as_bytes());
236 hasher.update(body.as_bytes());
237 hasher.update(description.as_bytes());
238 hex::encode(hasher.finalize())
239}