mdbook_content_collections/
lib.rs

1use anyhow::Result;
2use chrono::{DateTime, NaiveDate, TimeZone, Utc};
3use pulldown_cmark::{Options, Parser, html};
4use serde::{Deserialize, Deserializer, Serialize};
5use serde_json::json;
6use std::{fs, path::Path, time::SystemTime};
7use walkdir::WalkDir;
8
/// Minimum length, in `char`s, that a trimmed article body must reach before it is
/// preferred over the front-matter description as the source of the preview.
const MIN_BODY_PREVIEW_CHARS: usize = 80;
11
12// Convert file modification time → UTC
13fn systemtime_to_utc(st: SystemTime) -> DateTime<Utc> {
14    DateTime::<Utc>::from(st)
15}
16
17// Parse front-matter date formats
18fn deserialize_date<'de, D>(deserializer: D) -> Result<Option<DateTime<Utc>>, D::Error>
19where
20    D: Deserializer<'de>,
21{
22    let s: Option<String> = Option::deserialize(deserializer)?;
23
24    if let Some(date_str) = s {
25        if let Ok(dt) = DateTime::parse_from_rfc3339(&date_str) {
26            return Ok(Some(dt.with_timezone(&Utc)));
27        }
28
29        if let Ok(nd) = NaiveDate::parse_from_str(&date_str, "%Y-%m-%d") {
30            return Ok(Some(
31                Utc.from_utc_datetime(&nd.and_hms_opt(0, 0, 0).unwrap()),
32            ));
33        }
34    }
35    Ok(None)
36}
37
38#[derive(Debug, Deserialize, Clone)]
39pub struct FrontMatter {
40    pub title: String,
41
42    #[serde(deserialize_with = "deserialize_date")]
43    pub date: Option<DateTime<Utc>>,
44
45    pub author: Option<String>,
46    pub description: Option<String>, // User-supplied summary (optional)
47
48    // New: optional collection name
49    pub collection: Option<String>,
50
51    // New: simple tags array for indexing
52    pub tags: Option<Vec<String>>,
53
54    // New: draft flag
55    pub draft: Option<bool>,
56}
57
/// A Markdown file split into its metadata and body.
#[derive(Debug)]
pub struct Article {
    /// Front matter parsed from the file, or fallback metadata when none could be parsed.
    pub fm: FrontMatter,
    /// Markdown text that follows the front matter, joined with `\n` and newline-terminated.
    pub content: String,
    /// File path relative to the collection root when it lies under that root,
    /// otherwise the path as given (lossily converted to a string).
    pub path: String,
}
64
65/// Parses a markdown file to extract YAML front matter and content.
66///
67/// # Errors
68///
69/// This function returns an error if the file cannot be read.
70///
71/// # Panics
72///
73/// Panics if the provided `path` does not have a valid file name (e.g., it terminates in `..`),
74/// as a title cannot be generated from the file stem.
75pub fn parse_markdown_file(root: &Path, path: &Path) -> Result<Article> {
76    let text = fs::read_to_string(path)?;
77
78    let mut lines = text.lines();
79    let mut yaml = String::new();
80    let mut in_yaml = false;
81
82    // Extract YAML front matter
83    for line in lines.by_ref() {
84        let trimmed = line.trim();
85        if trimmed == "---" {
86            if !in_yaml {
87                in_yaml = true;
88                continue;
89            }
90            break;
91        }
92        if in_yaml {
93            yaml.push_str(line);
94            yaml.push('\n');
95        }
96    }
97
98    // Markdown content after front matter
99    let content = lines.collect::<Vec<_>>().join("\n") + "\n";
100
101    let fallback_date = path
102        .metadata()
103        .ok()
104        .and_then(|m| m.modified().ok())
105        .map(systemtime_to_utc);
106
107    // Parse front matter
108    let fm = if yaml.trim().is_empty() {
109        FrontMatter {
110            title: path.file_stem().unwrap().to_string_lossy().into_owned(),
111            date: fallback_date,
112            author: None,
113            description: Some(content.clone()),
114            collection: None,
115            tags: None,
116            draft: None,
117        }
118    } else {
119        serde_yml::from_str(&yaml).unwrap_or_else(|_| FrontMatter {
120            title: path.file_stem().unwrap().to_string_lossy().into_owned(),
121            date: fallback_date,
122            author: None,
123            description: Some(content.clone()),
124            collection: None,
125            tags: None,
126            draft: None,
127        })
128    };
129
130    let rel_path = path.strip_prefix(root).unwrap_or(path);
131
132    Ok(Article {
133        fm,
134        content,
135        path: rel_path.to_string_lossy().into_owned(),
136    })
137}
138
139/// Traverses the source directory to collect and parse all Markdown files.
140///
141/// It ignores `SUMMARY.md` and non-markdown files. Articles are returned
142/// sorted by date in descending order (newest first).
143///
144/// # Errors
145///
146/// Returns an error if:
147/// * The `src_dir` cannot be accessed or does not exist.
148/// * An I/O error occurs while walking the directory tree (e.g., permission denied).
149/// * A markdown file exists but cannot be read during the parsing phase.
150pub fn collect_articles(src_dir: &Path) -> Result<Vec<Article>> {
151    let mut articles = Vec::new();
152
153    for entry in WalkDir::new(src_dir)
154        .into_iter()
155        .filter_map(std::result::Result::ok)
156    {
157        let path = entry.path();
158
159        // 1. Filter out directories
160        if !path.is_file() {
161            continue;
162        }
163
164        // 2. Check extension safely
165        let is_markdown = path
166            .extension()
167            .and_then(|e| e.to_str())
168            .map(str::to_ascii_lowercase)
169            .is_some_and(|s| s == "md" || s == "markdown");
170
171        if !is_markdown {
172            continue;
173        }
174
175        // 3. Check for SUMMARY.md safely
176        // map_or(false, ...) returns false if file_name() is None
177        let is_summary = path
178            .file_name()
179            .map(|name| name.to_string_lossy())
180            .is_some_and(|name| name.eq_ignore_ascii_case("SUMMARY.md"));
181
182        if is_summary {
183            continue;
184        }
185
186        // 4. Parse the file
187        if let Ok(article) = parse_markdown_file(src_dir, path) {
188            articles.push(article);
189        }
190    }
191
192    // Sort newest → oldest
193    articles.sort_by_key(|a| a.fm.date);
194    articles.reverse();
195
196    Ok(articles)
197}
198
199fn markdown_to_html(md: &str) -> String {
200    let mut html = String::new();
201    let parser = Parser::new_ext(md, Options::all());
202    html::push_html(&mut html, parser);
203    html
204}
205
/// Strip obvious leading boilerplate (TOCs, details, long definition blocks)
/// so previews tend to start at the main intro text instead of metadata.
///
/// Returns the part of `md` that follows the first blank line occurring after
/// the first line that starts with `#` (ignoring leading whitespace). When no
/// such blank line exists, `md` is returned unchanged.
fn strip_leading_boilerplate(md: &str) -> &str {
    let mut seen_heading = false;
    let mut offset = 0;

    // `split_inclusive` keeps each line's real terminator, so `offset` is an exact
    // byte position for both `\n` and `\r\n` input and always a char boundary.
    for raw in md.split_inclusive('\n') {
        offset += raw.len();
        let line = raw.trim();

        if line.starts_with('#') {
            seen_heading = true;
        }

        if seen_heading && line.is_empty() {
            // First blank line after heading: start preview after this
            return &md[offset..];
        }
    }

    md
}
242
/// Take at most `max_chars` worth of UTF‑8 text from `s`.
///
/// Returns the longest prefix of `s` containing no more than `max_chars`
/// characters. The cut always falls on a character boundary, so the function
/// never panics, whatever mix of single- and multi-byte characters `s` holds.
fn utf8_prefix(s: &str, max_chars: usize) -> &str {
    // The byte offset of character number `max_chars` is exactly where the prefix
    // ends; if there is no such character, the whole string fits.
    match s.char_indices().nth(max_chars) {
        Some((end, _)) => &s[..end],
        None => s,
    }
}
265
/// Finds the byte offset of the next `<p>` / `<p attr…>` opening tag at or after `from`.
///
/// Tags that merely start with the letter `p` (`<pre>`, `<picture>`, …) are skipped.
fn find_paragraph_open(html: &str, from: usize) -> Option<usize> {
    html[from..]
        .match_indices("<p")
        .map(|(rel, _)| from + rel)
        .find(|&at| {
            html.as_bytes()
                .get(at + "<p".len())
                .is_some_and(|next| *next == b'>' || next.is_ascii_whitespace())
        })
}

/// Take up to `max_paragraphs` <p> blocks from HTML, and cap at `max_chars` (UTF-8 safe).
///
/// Paragraphs are copied whole, in document order; markup between them is
/// skipped. After the first paragraph, a further paragraph is only added while
/// the total stays within `max_chars`, so the cap normally lands on a paragraph
/// boundary rather than in the middle of a tag. If `html` contains no complete
/// paragraph (or `max_paragraphs` is 0) the whole input is used instead. A first
/// paragraph or fallback longer than `max_chars` is cut at exactly `max_chars`
/// characters.
fn html_first_paragraphs(html: &str, max_paragraphs: usize, max_chars: usize) -> String {
    let mut out = String::new();
    let mut used_chars = 0;
    let mut cursor = 0;
    let mut taken = 0;

    while taken < max_paragraphs {
        let Some(p_start) = find_paragraph_open(html, cursor) else {
            break;
        };
        let Some(rel_close) = html[p_start..].find("</p>") else {
            break;
        };
        let p_end = p_start + rel_close + "</p>".len();

        let para = &html[p_start..p_end];
        let para_chars = para.chars().count();

        // Prefer dropping a later paragraph over emitting a truncated one.
        if taken > 0 && used_chars + para_chars > max_chars {
            break;
        }

        out.push_str(para);
        used_chars += para_chars;
        taken += 1;
        cursor = p_end;
    }

    // If no <p> found, fall back to original HTML
    if out.is_empty() {
        out.push_str(html);
    }

    // UTF‑8 safe trim by character count
    if let Some((cut, _)) = out.char_indices().nth(max_chars) {
        out.truncate(cut);
    }
    out
}
303
/// One article as it is serialized into the JSON content index.
#[derive(Debug, Serialize)]
pub struct ContentEntry {
    /// Relative path in src.
    pub path: String,
    /// Article title.
    pub title: String,
    /// Article date formatted as an RFC 3339 string, when one is known.
    pub date: Option<String>,
    /// Author from the front matter, if any.
    pub author: Option<String>,
    /// Description carried over from the article's front matter, if any.
    pub description: Option<String>,
    /// Collection name from the front matter, if any.
    pub collection: Option<String>,
    /// Tags from the front matter; empty when none were given.
    pub tags: Vec<String>,
    /// Draft flag; `false` when the front matter does not set it.
    pub draft: bool,
    /// HTML preview rendered from the beginning of the article.
    pub preview_html: String,
}
316
317/// Builds a JSON index of all articles and writes it to the specified output path.
318///
319/// This function processes markdown content into HTML previews and bundles
320/// metadata (tags, dates, authors) into a structured format used by the
321/// Kanagawa landing page.
322///
323/// # Errors
324///
325/// Returns an error if:
326/// * [`collect_articles`] fails due to I/O issues in the source directory.
327/// * The parent directory of `output_path` cannot be created.
328/// * The final JSON structure fails to serialize.
329/// * The index file cannot be written to `output_path` (e.g., disk is full or permission denied).
330pub fn build_content_index(src_dir: &Path, output_path: &Path) -> Result<()> {
331    const PREVIEW_MD_SLICE_CHARS: usize = 4000;
332    let articles = collect_articles(src_dir)?;
333
334    let entries: Vec<ContentEntry> = articles
335        .into_iter()
336        .map(|article| {
337            let content_trimmed = article.content.trim();
338            let body_len = content_trimmed.chars().count();
339
340            let mut source_md =
341                if body_len >= MIN_BODY_PREVIEW_CHARS || article.fm.description.is_none() {
342                    content_trimmed
343                } else {
344                    article.fm.description.as_deref().unwrap_or(content_trimmed)
345                };
346
347            source_md = strip_leading_boilerplate(source_md);
348
349            let source_md_slice = utf8_prefix(source_md, PREVIEW_MD_SLICE_CHARS);
350
351            let raw_html = markdown_to_html(source_md_slice);
352            let preview_html = html_first_paragraphs(&raw_html, 3, 800);
353
354            ContentEntry {
355                path: article.path,
356                title: article.fm.title,
357                date: article.fm.date.map(|d| d.to_rfc3339()),
358                author: article.fm.author,
359                description: article.fm.description,
360                collection: article.fm.collection,
361                tags: article.fm.tags.unwrap_or_default(),
362                draft: article.fm.draft.unwrap_or(false),
363                preview_html,
364            }
365        })
366        .collect();
367
368    // Very simple index shape for now
369    let index = json!({
370        "entries": entries,
371    });
372
373    if let Some(parent) = output_path.parent() {
374        fs::create_dir_all(parent)?;
375    }
376
377    fs::write(output_path, serde_json::to_vec_pretty(&index)?)?;
378
379    Ok(())
380}
mdbook_content_collections/lib.rs

mdbook_content_collections/
lib.rs