Skip to main content

chub_core/build/
builder.rs

1use std::collections::{HashMap, HashSet};
2use std::fs;
3use std::path::Path;
4
5use rayon::prelude::*;
6use sha2::{Digest, Sha256};
7
8use crate::error::{Error, Result};
9use crate::search::bm25;
10use crate::types::{DocEntry, Entry, Registry, SearchIndex, SkillEntry};
11
12use super::discovery::{discover_author, load_author_registry};
13
/// Options for the build process.
#[derive(Debug)]
pub struct BuildOptions {
    /// Optional base URL recorded verbatim into the generated [`Registry`].
    pub base_url: Option<String>,
    /// NOTE(review): not consulted anywhere in this module — presumably the
    /// caller checks it to skip `write_build_output`; confirm at call sites.
    pub validate_only: bool,
    /// Enable incremental builds using a content hash manifest.
    /// When true, files are only copied if their SHA-256 hash has changed.
    pub incremental: bool,
}
23
24impl Default for BuildOptions {
25    fn default() -> Self {
26        Self {
27            base_url: None,
28            validate_only: false,
29            incremental: true,
30        }
31    }
32}
33
/// Result of a successful build.
#[derive(Debug)]
pub struct BuildResult {
    /// The assembled registry (docs, skills, generation timestamp, base URL).
    pub registry: Registry,
    /// BM25 search index built over all docs and skills.
    pub search_index: SearchIndex,
    /// Number of doc entries in `registry.docs`.
    pub docs_count: usize,
    /// Number of skill entries in `registry.skills`.
    pub skills_count: usize,
    /// Non-fatal issues collected during discovery; build still succeeded.
    pub warnings: Vec<String>,
}
43
44/// Build a registry from a content directory.
45///
46/// Scans top-level directories as author directories.
47/// Each author either has a registry.json or we auto-discover DOC.md/SKILL.md files.
48pub fn build_registry(content_dir: &Path, opts: &BuildOptions) -> Result<BuildResult> {
49    if !content_dir.exists() {
50        return Err(Error::ContentDirNotFound(content_dir.to_path_buf()));
51    }
52
53    let mut all_docs: Vec<DocEntry> = Vec::new();
54    let mut all_skills: Vec<SkillEntry> = Vec::new();
55    let mut all_warnings: Vec<String> = Vec::new();
56    let mut all_errors: Vec<String> = Vec::new();
57
58    // List top-level directories (author directories)
59    let mut author_dirs: Vec<(String, std::path::PathBuf)> = Vec::new();
60    for entry in fs::read_dir(content_dir)? {
61        let entry = entry?;
62        if !entry.file_type()?.is_dir() {
63            continue;
64        }
65        let name = entry.file_name().to_string_lossy().to_string();
66        if name == "dist" || name.starts_with('.') {
67            continue;
68        }
69        author_dirs.push((name, entry.path()));
70    }
71
72    for (author_name, author_dir) in &author_dirs {
73        let author_registry = author_dir.join("registry.json");
74
75        if author_registry.exists() {
76            match load_author_registry(author_dir, author_name) {
77                Ok((docs, skills)) => {
78                    all_docs.extend(docs);
79                    all_skills.extend(skills);
80                }
81                Err(e) => {
82                    all_errors.push(format!("{}/registry.json: {}", author_name, e));
83                }
84            }
85        } else {
86            let result = discover_author(author_dir, author_name, content_dir);
87            all_docs.extend(result.docs);
88            all_skills.extend(result.skills);
89            all_warnings.extend(result.warnings);
90            all_errors.extend(result.errors);
91        }
92    }
93
94    // Check for id collisions using HashSet (faster than HashMap)
95    let mut seen = HashSet::with_capacity(all_docs.len() + all_skills.len());
96    for doc in &all_docs {
97        if !seen.insert(&doc.id) {
98            all_errors.push(format!("Duplicate doc id '{}'", doc.id));
99        }
100    }
101    for skill in &all_skills {
102        if !seen.insert(&skill.id) {
103            all_errors.push(format!("Duplicate skill id '{}'", skill.id));
104        }
105    }
106
107    if !all_errors.is_empty() {
108        return Err(Error::BuildErrors(all_errors.join("\n")));
109    }
110
111    // Build search index
112    let entries: Vec<Entry> = all_docs
113        .iter()
114        .map(Entry::Doc)
115        .chain(all_skills.iter().map(Entry::Skill))
116        .collect();
117    let search_index = bm25::build_index(&entries);
118
119    let docs_count = all_docs.len();
120    let skills_count = all_skills.len();
121    let generated = now_iso8601();
122
123    let registry = Registry {
124        version: "1.0.0".to_string(),
125        generated,
126        docs: all_docs,
127        skills: all_skills,
128        base_url: opts.base_url.clone(),
129    };
130
131    Ok(BuildResult {
132        docs_count,
133        skills_count,
134        warnings: all_warnings,
135        registry,
136        search_index,
137    })
138}
139
/// Name of the build manifest file used for incremental builds.
/// Stored in the output directory; maps relative file paths (forward-slash
/// normalized) to SHA-256 hex digests of their contents.
const BUILD_MANIFEST_NAME: &str = ".build-manifest.json";
142
143/// Compute the SHA-256 hex digest of a file's contents.
144fn sha256_file(path: &Path) -> std::io::Result<String> {
145    let data = fs::read(path)?;
146    let hash = Sha256::digest(&data);
147    Ok(format!("{:x}", hash))
148}
149
150/// Load an existing build manifest from the output directory.
151/// Returns an empty map if the file does not exist or cannot be parsed.
152fn load_build_manifest(output_dir: &Path) -> HashMap<String, String> {
153    let manifest_path = output_dir.join(BUILD_MANIFEST_NAME);
154    if let Ok(data) = fs::read_to_string(&manifest_path) {
155        serde_json::from_str(&data).unwrap_or_default()
156    } else {
157        HashMap::new()
158    }
159}
160
161/// Save the build manifest to the output directory.
162fn save_build_manifest(output_dir: &Path, manifest: &HashMap<String, String>) -> Result<()> {
163    use std::io::BufWriter;
164    let file = fs::File::create(output_dir.join(BUILD_MANIFEST_NAME))?;
165    let writer = BufWriter::new(file);
166    serde_json::to_writer_pretty(writer, manifest)?;
167    Ok(())
168}
169
170/// Write build results to output directory.
171pub fn write_build_output(
172    content_dir: &Path,
173    output_dir: &Path,
174    result: &BuildResult,
175) -> Result<()> {
176    write_build_output_with_opts(content_dir, output_dir, result, &BuildOptions::default())
177}
178
179/// Write build results to output directory with options controlling incremental behavior.
180pub fn write_build_output_with_opts(
181    content_dir: &Path,
182    output_dir: &Path,
183    result: &BuildResult,
184    opts: &BuildOptions,
185) -> Result<()> {
186    use std::io::BufWriter;
187
188    fs::create_dir_all(output_dir)?;
189
190    // Write registry.json using buffered writer
191    let file = fs::File::create(output_dir.join("registry.json"))?;
192    let writer = BufWriter::new(file);
193    serde_json::to_writer_pretty(writer, &result.registry)?;
194
195    // Write search-index.json using buffered writer (compact, no pretty-print)
196    let file = fs::File::create(output_dir.join("search-index.json"))?;
197    let writer = BufWriter::new(file);
198    serde_json::to_writer(writer, &result.search_index)?;
199
200    // Write search-index.bin using bincode serialization
201    let bin_data = bincode::serialize(&result.search_index)
202        .map_err(|e| Error::BuildErrors(format!("bincode serialization failed: {}", e)))?;
203    fs::write(output_dir.join("search-index.bin"), &bin_data)?;
204
205    // Write index.html landing page for the CDN root
206    let index_html = generate_index_html(result);
207    fs::write(output_dir.join("index.html"), index_html)?;
208
209    // Load existing manifest for incremental builds
210    let old_manifest = if opts.incremental {
211        load_build_manifest(output_dir)
212    } else {
213        HashMap::new()
214    };
215    let mut new_manifest: HashMap<String, String> = HashMap::new();
216
217    // Copy content tree using single walkdir pass with filter_entry for early pruning
218    // Phase 1: collect dirs and files
219    // Phase 2: batch create dirs, then copy files in parallel
220    let mut dirs_to_create = Vec::new();
221    let mut files_to_copy: Vec<(std::path::PathBuf, std::path::PathBuf, String)> = Vec::new();
222
223    for entry in walkdir::WalkDir::new(content_dir)
224        .min_depth(1)
225        .into_iter()
226        .filter_entry(|e| {
227            // Early-prune dist/ and dotfile directories at top level
228            if e.depth() == 1 && e.file_type().is_dir() {
229                let name = e.file_name().to_string_lossy();
230                return name != "dist" && !name.starts_with('.');
231            }
232            true
233        })
234        .filter_map(|e| e.ok())
235    {
236        // Skip registry.json in author root (depth 2: author/registry.json)
237        if entry.file_type().is_file() && entry.file_name() == "registry.json" && entry.depth() == 2
238        {
239            continue;
240        }
241
242        let rel = entry.path().strip_prefix(content_dir).unwrap();
243        let rel_str = rel.to_string_lossy().replace('\\', "/");
244        let dest = output_dir.join(rel);
245
246        if entry.file_type().is_dir() {
247            dirs_to_create.push(dest);
248        } else {
249            files_to_copy.push((entry.into_path(), dest, rel_str));
250        }
251    }
252
253    // Batch create all directories first sequentially (parents must exist before children)
254    for dir in &dirs_to_create {
255        fs::create_dir_all(dir)?;
256    }
257
258    if opts.incremental {
259        // Compute hashes and filter unchanged files, then copy in parallel
260        let copy_results: Vec<std::result::Result<(String, String), Error>> = files_to_copy
261            .par_iter()
262            .map(|(src, dest, rel_str)| {
263                let hash = sha256_file(src).map_err(|e| {
264                    Error::BuildErrors(format!("hash failed for {}: {}", rel_str, e))
265                })?;
266
267                // Skip copy if the hash matches the old manifest
268                if old_manifest.get(rel_str).map(|h| h.as_str()) == Some(hash.as_str()) {
269                    return Ok((rel_str.clone(), hash));
270                }
271
272                fs::copy(src, dest).map_err(|e| {
273                    Error::BuildErrors(format!("copy failed for {}: {}", rel_str, e))
274                })?;
275                Ok((rel_str.clone(), hash))
276            })
277            .collect();
278
279        for res in copy_results {
280            let (rel_str, hash) = res?;
281            new_manifest.insert(rel_str, hash);
282        }
283
284        // Save updated manifest
285        save_build_manifest(output_dir, &new_manifest)?;
286    } else {
287        // Non-incremental: copy all files in parallel using rayon, no manifest
288        let copy_results: Vec<std::result::Result<(), Error>> = files_to_copy
289            .par_iter()
290            .map(|(src, dest, rel_str)| {
291                fs::copy(src, dest).map_err(|e| {
292                    Error::BuildErrors(format!("copy failed for {}: {}", rel_str, e))
293                })?;
294                Ok(())
295            })
296            .collect();
297
298        for res in copy_results {
299            res?;
300        }
301    }
302
303    Ok(())
304}
305
/// Static assets for the CDN index page, embedded at compile time.
/// The template contains `{style}`, `{script}`, `{docs}`, `{skills}`,
/// `{lang_count}`, `{generated}` and `{catalog}` placeholders filled in by
/// [`generate_index_html`].
const INDEX_TEMPLATE: &str = include_str!("static/index.html");
const INDEX_STYLE: &str = include_str!("static/style.css");
const INDEX_SCRIPT: &str = include_str!("static/script.js");
310
311/// Generate an index.html landing page with search for the CDN root.
312/// Matches the VitePress website theme (Inter font, sky blue brand colors, dark/light toggle).
313fn generate_index_html(result: &BuildResult) -> String {
314    // Build a compact JSON catalog for client-side search, including paths for doc viewer
315    let mut entries = Vec::new();
316    for doc in &result.registry.docs {
317        let langs: Vec<serde_json::Value> = doc
318            .languages
319            .iter()
320            .map(|l| {
321                let versions: Vec<serde_json::Value> = l
322                    .versions
323                    .iter()
324                    .map(|v| {
325                        serde_json::json!({
326                            "version": v.version,
327                            "path": v.path,
328                        })
329                    })
330                    .collect();
331                serde_json::json!({
332                    "language": l.language,
333                    "recommended": l.recommended_version,
334                    "versions": versions,
335                })
336            })
337            .collect();
338        let lang_names: Vec<&str> = doc.languages.iter().map(|l| l.language.as_str()).collect();
339        entries.push(serde_json::json!({
340            "id": doc.id,
341            "name": doc.name,
342            "description": doc.description,
343            "source": doc.source,
344            "tags": doc.tags,
345            "type": "doc",
346            "langNames": lang_names,
347            "langs": langs,
348        }));
349    }
350    for skill in &result.registry.skills {
351        entries.push(serde_json::json!({
352            "id": skill.id,
353            "name": skill.name,
354            "description": skill.description,
355            "source": skill.source,
356            "tags": skill.tags,
357            "type": "skill",
358            "langNames": [],
359            "langs": [],
360            "path": skill.path,
361        }));
362    }
363    let catalog_json = serde_json::to_string(&entries).unwrap_or_else(|_| "[]".to_string());
364
365    let docs_count = result.docs_count;
366    let skills_count = result.skills_count;
367    let generated = &result.registry.generated;
368
369    let mut languages: Vec<&str> = result
370        .registry
371        .docs
372        .iter()
373        .flat_map(|d| d.languages.iter().map(|l| l.language.as_str()))
374        .collect();
375    languages.sort();
376    languages.dedup();
377
378    INDEX_TEMPLATE
379        .replace("{style}", INDEX_STYLE)
380        .replace("{script}", INDEX_SCRIPT)
381        .replace("{docs}", &docs_count.to_string())
382        .replace("{skills}", &skills_count.to_string())
383        .replace("{lang_count}", &languages.len().to_string())
384        .replace("{generated}", generated)
385        .replace("{catalog}", &catalog_json)
386}
387
/// Get current time as ISO 8601 string.
/// Thin delegate to [`crate::util::now_iso8601`].
fn now_iso8601() -> String {
    crate::util::now_iso8601()
}
392
/// Convert days since Unix epoch to (year, month, day).
/// Delegates to [`crate::util::days_to_date`] — kept here for backward compatibility.
// NOTE(review): a `pub use crate::util::days_to_date;` re-export would avoid the
// wrapper entirely while keeping the same public path — confirm no doc-link churn.
pub fn days_to_date(days: u64) -> (u64, u64, u64) {
    crate::util::days_to_date(days)
}
397}