//! Registry builder — `chub_core/build/builder.rs`.
1use std::collections::{HashMap, HashSet};
2use std::fs;
3use std::path::Path;
4
5use rayon::prelude::*;
6use sha2::{Digest, Sha256};
7
8use crate::error::{Error, Result};
9use crate::search::bm25;
10use crate::types::{DocEntry, Entry, Registry, SearchIndex, SkillEntry};
11
12use super::discovery::{discover_author, load_author_registry};
13
/// Options for the build process.
#[derive(Debug)]
pub struct BuildOptions {
    /// Optional base URL recorded verbatim into the generated `Registry`.
    pub base_url: Option<String>,
    /// When true, the caller intends a validation pass only.
    // NOTE(review): `validate_only` is never read inside this module —
    // presumably honored by the CLI/caller; confirm at the call sites.
    pub validate_only: bool,
    /// Enable incremental builds using a content hash manifest.
    /// When true, files are only copied if their SHA-256 hash has changed.
    pub incremental: bool,
}
23
24impl Default for BuildOptions {
25    fn default() -> Self {
26        Self {
27            base_url: None,
28            validate_only: false,
29            incremental: true,
30        }
31    }
32}
33
/// Result of a successful build.
#[derive(Debug)]
pub struct BuildResult {
    /// Combined registry merged across every author directory.
    pub registry: Registry,
    /// Search index built over all docs and skills (see `bm25::build_index`).
    pub search_index: SearchIndex,
    /// Number of doc entries in `registry.docs`.
    pub docs_count: usize,
    /// Number of skill entries in `registry.skills`.
    pub skills_count: usize,
    /// Non-fatal issues collected during auto-discovery.
    pub warnings: Vec<String>,
}
43
44/// Build a registry from a content directory.
45///
46/// Scans top-level directories as author directories.
47/// Each author either has a registry.json or we auto-discover DOC.md/SKILL.md files.
48pub fn build_registry(content_dir: &Path, opts: &BuildOptions) -> Result<BuildResult> {
49    if !content_dir.exists() {
50        return Err(Error::ContentDirNotFound(content_dir.to_path_buf()));
51    }
52
53    let mut all_docs: Vec<DocEntry> = Vec::new();
54    let mut all_skills: Vec<SkillEntry> = Vec::new();
55    let mut all_warnings: Vec<String> = Vec::new();
56    let mut all_errors: Vec<String> = Vec::new();
57
58    // List top-level directories (author directories)
59    let mut author_dirs: Vec<(String, std::path::PathBuf)> = Vec::new();
60    for entry in fs::read_dir(content_dir)? {
61        let entry = entry?;
62        if !entry.file_type()?.is_dir() {
63            continue;
64        }
65        let name = entry.file_name().to_string_lossy().to_string();
66        if name == "dist" || name.starts_with('.') {
67            continue;
68        }
69        author_dirs.push((name, entry.path()));
70    }
71
72    for (author_name, author_dir) in &author_dirs {
73        let author_registry = author_dir.join("registry.json");
74
75        if author_registry.exists() {
76            match load_author_registry(author_dir, author_name) {
77                Ok((docs, skills)) => {
78                    all_docs.extend(docs);
79                    all_skills.extend(skills);
80                }
81                Err(e) => {
82                    all_errors.push(format!("{}/registry.json: {}", author_name, e));
83                }
84            }
85        } else {
86            let result = discover_author(author_dir, author_name, content_dir);
87            all_docs.extend(result.docs);
88            all_skills.extend(result.skills);
89            all_warnings.extend(result.warnings);
90            all_errors.extend(result.errors);
91        }
92    }
93
94    // Check for id collisions using HashSet (faster than HashMap)
95    let mut seen = HashSet::with_capacity(all_docs.len() + all_skills.len());
96    for doc in &all_docs {
97        if !seen.insert(&doc.id) {
98            all_errors.push(format!("Duplicate doc id '{}'", doc.id));
99        }
100    }
101    for skill in &all_skills {
102        if !seen.insert(&skill.id) {
103            all_errors.push(format!("Duplicate skill id '{}'", skill.id));
104        }
105    }
106
107    if !all_errors.is_empty() {
108        return Err(Error::BuildErrors(all_errors.join("\n")));
109    }
110
111    // Build search index
112    let entries: Vec<Entry> = all_docs
113        .iter()
114        .map(Entry::Doc)
115        .chain(all_skills.iter().map(Entry::Skill))
116        .collect();
117    let search_index = bm25::build_index(&entries);
118
119    let docs_count = all_docs.len();
120    let skills_count = all_skills.len();
121    let generated = now_iso8601();
122
123    let registry = Registry {
124        version: "1.0.0".to_string(),
125        generated,
126        docs: all_docs,
127        skills: all_skills,
128        base_url: opts.base_url.clone(),
129    };
130
131    Ok(BuildResult {
132        docs_count,
133        skills_count,
134        warnings: all_warnings,
135        registry,
136        search_index,
137    })
138}
139
/// Name of the build manifest file used for incremental builds.
/// Written into the output directory; maps relative file paths to SHA-256
/// hex digests (see `load_build_manifest` / `save_build_manifest`).
const BUILD_MANIFEST_NAME: &str = ".build-manifest.json";
142
143/// Compute the SHA-256 hex digest of a file's contents.
144fn sha256_file(path: &Path) -> std::io::Result<String> {
145    let data = fs::read(path)?;
146    let hash = Sha256::digest(&data);
147    Ok(format!("{:x}", hash))
148}
149
150/// Load an existing build manifest from the output directory.
151/// Returns an empty map if the file does not exist or cannot be parsed.
152fn load_build_manifest(output_dir: &Path) -> HashMap<String, String> {
153    let manifest_path = output_dir.join(BUILD_MANIFEST_NAME);
154    if let Ok(data) = fs::read_to_string(&manifest_path) {
155        serde_json::from_str(&data).unwrap_or_default()
156    } else {
157        HashMap::new()
158    }
159}
160
161/// Save the build manifest to the output directory.
162fn save_build_manifest(output_dir: &Path, manifest: &HashMap<String, String>) -> Result<()> {
163    use std::io::BufWriter;
164    let file = fs::File::create(output_dir.join(BUILD_MANIFEST_NAME))?;
165    let writer = BufWriter::new(file);
166    serde_json::to_writer_pretty(writer, manifest)?;
167    Ok(())
168}
169
/// Write build results to output directory.
///
/// Convenience wrapper around [`write_build_output_with_opts`] using
/// [`BuildOptions::default`], which enables incremental copying.
pub fn write_build_output(
    content_dir: &Path,
    output_dir: &Path,
    result: &BuildResult,
) -> Result<()> {
    write_build_output_with_opts(content_dir, output_dir, result, &BuildOptions::default())
}
178
179/// Write build results to output directory with options controlling incremental behavior.
180pub fn write_build_output_with_opts(
181    content_dir: &Path,
182    output_dir: &Path,
183    result: &BuildResult,
184    opts: &BuildOptions,
185) -> Result<()> {
186    use std::io::BufWriter;
187
188    fs::create_dir_all(output_dir)?;
189
190    // Write registry.json using buffered writer
191    let file = fs::File::create(output_dir.join("registry.json"))?;
192    let writer = BufWriter::new(file);
193    serde_json::to_writer_pretty(writer, &result.registry)?;
194
195    // Write search-index.json using buffered writer (compact, no pretty-print)
196    let file = fs::File::create(output_dir.join("search-index.json"))?;
197    let writer = BufWriter::new(file);
198    serde_json::to_writer(writer, &result.search_index)?;
199
200    // Write search-index.bin using bincode serialization
201    let bin_data = bincode::serialize(&result.search_index)
202        .map_err(|e| Error::BuildErrors(format!("bincode serialization failed: {}", e)))?;
203    fs::write(output_dir.join("search-index.bin"), &bin_data)?;
204
205    // Load existing manifest for incremental builds
206    let old_manifest = if opts.incremental {
207        load_build_manifest(output_dir)
208    } else {
209        HashMap::new()
210    };
211    let mut new_manifest: HashMap<String, String> = HashMap::new();
212
213    // Copy content tree using single walkdir pass with filter_entry for early pruning
214    // Phase 1: collect dirs and files
215    // Phase 2: batch create dirs, then copy files in parallel
216    let mut dirs_to_create = Vec::new();
217    let mut files_to_copy: Vec<(std::path::PathBuf, std::path::PathBuf, String)> = Vec::new();
218
219    for entry in walkdir::WalkDir::new(content_dir)
220        .min_depth(1)
221        .into_iter()
222        .filter_entry(|e| {
223            // Early-prune dist/ and dotfile directories at top level
224            if e.depth() == 1 && e.file_type().is_dir() {
225                let name = e.file_name().to_string_lossy();
226                return name != "dist" && !name.starts_with('.');
227            }
228            true
229        })
230        .filter_map(|e| e.ok())
231    {
232        // Skip registry.json in author root (depth 2: author/registry.json)
233        if entry.file_type().is_file() && entry.file_name() == "registry.json" && entry.depth() == 2
234        {
235            continue;
236        }
237
238        let rel = entry.path().strip_prefix(content_dir).unwrap();
239        let rel_str = rel.to_string_lossy().to_string();
240        let dest = output_dir.join(rel);
241
242        if entry.file_type().is_dir() {
243            dirs_to_create.push(dest);
244        } else {
245            files_to_copy.push((entry.into_path(), dest, rel_str));
246        }
247    }
248
249    // Batch create all directories first sequentially (parents must exist before children)
250    for dir in &dirs_to_create {
251        fs::create_dir_all(dir)?;
252    }
253
254    if opts.incremental {
255        // Compute hashes and filter unchanged files, then copy in parallel
256        let copy_results: Vec<std::result::Result<(String, String), Error>> = files_to_copy
257            .par_iter()
258            .map(|(src, dest, rel_str)| {
259                let hash = sha256_file(src).map_err(|e| {
260                    Error::BuildErrors(format!("hash failed for {}: {}", rel_str, e))
261                })?;
262
263                // Skip copy if the hash matches the old manifest
264                if old_manifest.get(rel_str).map(|h| h.as_str()) == Some(hash.as_str()) {
265                    return Ok((rel_str.clone(), hash));
266                }
267
268                fs::copy(src, dest).map_err(|e| {
269                    Error::BuildErrors(format!("copy failed for {}: {}", rel_str, e))
270                })?;
271                Ok((rel_str.clone(), hash))
272            })
273            .collect();
274
275        for res in copy_results {
276            let (rel_str, hash) = res?;
277            new_manifest.insert(rel_str, hash);
278        }
279
280        // Save updated manifest
281        save_build_manifest(output_dir, &new_manifest)?;
282    } else {
283        // Non-incremental: copy all files in parallel using rayon, no manifest
284        let copy_results: Vec<std::result::Result<(), Error>> = files_to_copy
285            .par_iter()
286            .map(|(src, dest, rel_str)| {
287                fs::copy(src, dest).map_err(|e| {
288                    Error::BuildErrors(format!("copy failed for {}: {}", rel_str, e))
289                })?;
290                Ok(())
291            })
292            .collect();
293
294        for res in copy_results {
295            res?;
296        }
297    }
298
299    Ok(())
300}
301
302/// Get current time as ISO 8601 string.
303fn now_iso8601() -> String {
304    let now = std::time::SystemTime::now()
305        .duration_since(std::time::UNIX_EPOCH)
306        .unwrap_or_default()
307        .as_secs();
308
309    // Basic ISO 8601 without external crate
310    // For a production build we'd use chrono, but this avoids the dependency
311    let secs_per_day = 86400u64;
312    let days = now / secs_per_day;
313    let time_of_day = now % secs_per_day;
314
315    let hours = time_of_day / 3600;
316    let minutes = (time_of_day % 3600) / 60;
317    let seconds = time_of_day % 60;
318
319    // Compute year/month/day from days since epoch
320    let (year, month, day) = days_to_date(days);
321
322    format!(
323        "{:04}-{:02}-{:02}T{:02}:{:02}:{:02}.000Z",
324        year, month, day, hours, minutes, seconds
325    )
326}
327
/// Convert days since Unix epoch to (year, month, day).
///
/// Civil-from-days algorithm from
/// http://howardhinnant.github.io/date_algorithms.html
/// (valid for all dates on or after 1970-01-01, since `days` is unsigned).
pub fn days_to_date(days: u64) -> (u64, u64, u64) {
    // Shift origin to 0000-03-01 so leap days fall at year end.
    let z = days + 719468;
    // 400-year eras of exactly 146097 days each.
    let era = z / 146097;
    let day_of_era = z % 146097;
    // Year within the era, adjusting for the era's leap-day distribution.
    let year_of_era =
        (day_of_era - day_of_era / 1460 + day_of_era / 36524 - day_of_era / 146096) / 365;
    let day_of_year = day_of_era - (365 * year_of_era + year_of_era / 4 - year_of_era / 100);
    // Month in the March-based calendar (0 = March), then convert to civil.
    let mp = (5 * day_of_year + 2) / 153;
    let day = day_of_year - (153 * mp + 2) / 5 + 1;
    let month = if mp < 10 { mp + 3 } else { mp - 9 };
    // January and February belong to the next civil year in this encoding.
    let mut year = year_of_era + era * 400;
    if month <= 2 {
        year += 1;
    }
    (year, month, day)
}