// mdbook_rss_feed/lib.rs
1//! mdbook-rss-feed core library.
2//!
3//! This module scans an mdBook src directory for chapters, extracts frontmatter
4//! and content, and turns them into one or more RSS 2.0 channels suitable for
5//! static hosting.
6
7use anyhow::Result;
8use chrono::{DateTime, NaiveDate, TimeZone, Utc};
9use pulldown_cmark::{html, Options, Parser};
10use rss::{Channel, ChannelBuilder, Guid, Item, ItemBuilder};
11use serde::{Deserialize, Deserializer};
12use serde_json::Value as JsonValue;
13use std::{fs, path::Path, time::SystemTime};
14use walkdir::WalkDir;
15
/// Minimal JSON Feed 1.1 top-level document for this crate
/// (see <https://jsonfeed.org/version/1.1>).
#[derive(serde::Serialize)]
pub struct JsonFeed {
    // Always set to the JSON Feed version URL by `rss_to_json_feed`.
    pub version: String,
    // Feed title (mirrors the RSS channel title).
    pub title: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub home_page_url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub feed_url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,
    // URL of the next page when the feed is paginated; omitted otherwise.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub next_url: Option<String>,
    pub items: Vec<JsonFeedItem>,
}
31
/// A single item in a JSON Feed 1.1 document.
#[derive(serde::Serialize)]
pub struct JsonFeedItem {
    // Stable id: the RSS guid, falling back to link, then title.
    pub id: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub title: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub content_html: Option<String>,
    // RFC 3339 publication timestamp, when the RSS pubDate parsed.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub date_published: Option<String>,
    // Raw JSON so we can allow simple or richer author objects later.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub author: Option<JsonValue>,
}
46
47// Optional Atom support
48use atom_syndication::{
49    Content as AtomContent, Entry as AtomEntry, Feed as AtomFeed, Link as AtomLink,
50    Text as AtomText,
51};
52
/// Minimum chapter-body length (in chars) before the body is preferred over
/// the frontmatter `description` when choosing the preview source.
const MIN_BODY_PREVIEW_CHARS: usize = 80;
55
56// Convert file modification time → UTC
57fn systemtime_to_utc(st: SystemTime) -> DateTime<Utc> {
58    DateTime::<Utc>::from(st)
59}
60
61// Parse front-matter date formats
62fn deserialize_date<'de, D>(deserializer: D) -> Result<Option<DateTime<Utc>>, D::Error>
63where
64    D: Deserializer<'de>,
65{
66    let s: Option<String> = Option::deserialize(deserializer)?;
67
68    if let Some(date_str) = s {
69        if let Ok(dt) = DateTime::parse_from_rfc3339(&date_str) {
70            return Ok(Some(dt.with_timezone(&Utc)));
71        }
72
73        if let Ok(nd) = NaiveDate::parse_from_str(&date_str, "%Y-%m-%d") {
74            return Ok(Some(
75                Utc.from_utc_datetime(&nd.and_hms_opt(0, 0, 0).unwrap()),
76            ));
77        }
78    }
79    Ok(None)
80}
81
/// Parsed YAML frontmatter for a single chapter.
///
/// Fields are used for feed metadata:
/// - `title`: item title shown in the feed.
/// - `date`: publish date for sorting and `pubDate` (RFC3339 or `YYYY-MM-DD`).
/// - `author`: optional item author.
/// - `description`: optional summary/preview override.
#[derive(Debug, Deserialize, Clone)]
pub struct FrontMatter {
    // Item title shown in the feed.
    pub title: String,

    // Parsed via `deserialize_date`: RFC 3339 or bare `YYYY-MM-DD`.
    #[serde(deserialize_with = "deserialize_date")]
    pub date: Option<DateTime<Utc>>,

    // Optional item author.
    pub author: Option<String>,
    // User-supplied summary (optional); used as preview source when the
    // chapter body is very short.
    pub description: Option<String>,
}
99
/// A chapter plus its parsed metadata.
///
/// `Article` holds the frontmatter, full Markdown body, and the path relative
/// to the mdBook `src` root. It is the internal representation used before
/// converting to RSS items.
#[derive(Debug)]
pub struct Article {
    // Parsed (or synthesized) frontmatter for the chapter.
    pub fm: FrontMatter,
    // Markdown body with the frontmatter block removed.
    pub content: String,
    // Path relative to the mdBook `src` root (lossy UTF-8).
    pub path: String,
}
111
112/// Parses a markdown file and returns an Article.
113///
114/// # Panics
115/// # Errors
116///
117/// This function will panic if the path has no file stem (e.g., if it's a directory or has no filename).
118/// Will return `Err` if path doesn't exist
119pub fn parse_markdown_file(root: &Path, path: &Path) -> Result<Article> {
120    let text = fs::read_to_string(path)?;
121
122    let mut lines = text.lines();
123    let mut yaml = String::new();
124    let mut in_yaml = false;
125
126    // Extract YAML front matter
127    for line in lines.by_ref() {
128        let trimmed = line.trim();
129        if trimmed == "---" {
130            if !in_yaml {
131                in_yaml = true;
132                continue;
133            }
134            break;
135        }
136        if in_yaml {
137            yaml.push_str(line);
138            yaml.push('\n');
139        }
140    }
141
142    // Markdown content after front matter
143    let content = lines.collect::<Vec<_>>().join("\n") + "\n";
144
145    let fallback_date = path
146        .metadata()
147        .ok()
148        .and_then(|m| m.modified().ok())
149        .map(systemtime_to_utc);
150
151    // Parse front matter
152    let fm = if yaml.trim().is_empty() {
153        FrontMatter {
154            title: path.file_stem().unwrap().to_string_lossy().into_owned(),
155            date: fallback_date,
156            author: None,
157            description: Some(content.clone()),
158        }
159    } else {
160        serde_yaml::from_str(&yaml).unwrap_or_else(|_| FrontMatter {
161            title: path.file_stem().unwrap().to_string_lossy().into_owned(),
162            date: fallback_date,
163            author: None,
164            description: Some(content.clone()),
165        })
166    };
167
168    let rel_path = path.strip_prefix(root).unwrap_or(path);
169
170    Ok(Article {
171        fm,
172        content,
173        path: rel_path.to_string_lossy().into_owned(),
174    })
175}
176
177/// Collect all Markdown chapters under `src_dir`.
178///
179/// Walks the directory tree, skipping `SUMMARY.md` and non-Markdown files,
180/// parses each chapter into an `Article`, then sorts the list newest → oldest
181/// based on frontmatter `date` (falling back to file modification time).
182/// Parses a markdown file and returns an Article.
183///
184/// # Panics
185/// # Errors
186/// This function will panic if the path has no file stem
187/// Will return `Err` if `src_dir` doesn't exist
188pub fn collect_articles(src_dir: &Path) -> Result<Vec<Article>> {
189    let mut articles = Vec::new();
190
191    for entry in WalkDir::new(src_dir).into_iter().filter_map(Result::ok) {
192        let path = entry.path();
193        if !path.is_file() {
194            continue;
195        }
196
197        let ext = path
198            .extension()
199            .and_then(|e| e.to_str())
200            .map(str::to_ascii_lowercase);
201
202        if !matches!(ext.as_deref(), Some("md" | "markdown")) {
203            continue;
204        }
205
206        if path
207            .file_name()
208            .unwrap()
209            .to_string_lossy()
210            .eq_ignore_ascii_case("SUMMARY.md")
211        {
212            continue;
213        }
214
215        if let Ok(article) = parse_markdown_file(src_dir, path) {
216            articles.push(article);
217        }
218    }
219
220    // Sort newest → oldest
221    articles.sort_by_key(|a| a.fm.date);
222    articles.reverse();
223
224    Ok(articles)
225}
226
227/// Render Markdown to HTML using `pulldown_cmark`.
228///
229/// This is used both for full-content feeds and for generating HTML previews
230/// from chapter bodies or frontmatter descriptions.
231fn markdown_to_html(md: &str) -> String {
232    let mut html = String::new();
233    let parser = Parser::new_ext(md, Options::all());
234    html::push_html(&mut html, parser);
235    html
236}
237
/// Strip obvious leading boilerplate (TOCs, details, long definition blocks)
/// so previews tend to start at the main intro text instead of metadata or
/// navigation.
///
/// Concretely: everything up to and including the first blank line that
/// follows the first Markdown heading is dropped. If there is no heading, or
/// no blank line after one, the input is returned unchanged.
fn strip_leading_boilerplate(md: &str) -> &str {
    let mut seen_heading = false;

    for (i, line) in md.lines().enumerate() {
        // A blank first line is skipped but never ends the scan.
        if i == 0 && line.trim().is_empty() {
            continue;
        }

        if line.trim_start().starts_with('#') {
            seen_heading = true;
        }

        if seen_heading && line.trim().is_empty() {
            // First blank line after a heading: start the preview after it.
            // Compute this line's real byte offset from the subslice itself
            // (lines() hands back subslices of `md`), so CRLF terminators
            // can't throw manual `len + 1` arithmetic off.
            let line_end = line.as_ptr() as usize - md.as_ptr() as usize + line.len();
            // Skip past the line terminator (if any) and start there.
            let after = md[line_end..]
                .find('\n')
                .map_or(md.len(), |p| line_end + p + 1);
            return &md[after..];
        }
    }

    md
}
275
/// Take at most `max_chars` characters worth of UTF-8 text from `s`.
///
/// Always cuts on a character boundary; returns the whole string when it has
/// `max_chars` characters or fewer. (The previous `byte_idx + 1` bookkeeping
/// could land mid-character on multibyte input and panic when slicing.)
fn utf8_prefix(s: &str, max_chars: usize) -> &str {
    // `nth(max_chars)` yields the byte offset of the first char *past* the
    // limit; if there is no such char, the whole string fits.
    match s.char_indices().nth(max_chars) {
        Some((byte_idx, _)) => &s[..byte_idx],
        None => s,
    }
}
298
/// Return the first few `<p>` blocks from an HTML fragment.
///
/// This is used to build the `<description>` preview for each item. At most
/// `max_paragraphs` paragraphs are included, and the result is truncated to
/// `max_chars` characters (UTF-8 safe). If no `<p>` is found, the original
/// HTML is returned unchanged.
fn html_first_paragraphs(html: &str, max_paragraphs: usize, max_chars: usize) -> String {
    let mut out = String::new();
    let mut start = 0;
    let mut count = 0;

    while count < max_paragraphs {
        // Find the next candidate opening tag.
        let Some(rel) = html[start..].find("<p") else {
            break;
        };
        let p_start = start + rel;

        // `<p` is also a prefix of `<pre>`, `<picture>`, … — only accept the
        // match when the next character actually terminates the tag name.
        let next = html[p_start + 2..].chars().next();
        if !matches!(next, Some('>' | ' ' | '\t' | '\n' | '\r' | '/') | None) {
            start = p_start + 2;
            continue;
        }

        // Find the matching close tag; an unclosed paragraph ends the scan.
        let Some(rel_close) = html[p_start..].find("</p>") else {
            break;
        };
        let close = p_start + rel_close + "</p>".len();

        out.push_str(&html[p_start..close]);
        count += 1;
        start = close;
    }

    // If no <p> was found, fall back to the original HTML unchanged.
    if out.is_empty() {
        out = html.to_string();
    }

    // UTF-8 safe trim by character count.
    if out.chars().count() > max_chars {
        out.chars().take(max_chars).collect()
    } else {
        out
    }
}
341
/// One generated RSS feed file.
///
/// `filename` is the relative file name written into `src/` (for example
/// `rss.xml` or `rss2.xml`). `channel` is the corresponding RSS 2.0 channel.
pub struct FeedPage {
    // Relative output name, e.g. "rss.xml", "rss2.xml"
    pub filename: String,
    // Fully-populated RSS 2.0 channel for this page.
    pub channel: Channel,
}
350
/// Result of building feeds for a book.
///
/// In simple setups this will contain a single `rss.xml` page. When pagination
/// is enabled it contains multiple `FeedPage`s (e.g. `rss.xml`, `rss2.xml`,
/// `rss3.xml`, …) each with a slice of the overall item list.
pub struct BuildResult {
    // Generated feed pages, in output order (`rss.xml` first).
    pub pages: Vec<FeedPage>,
}
359
360/// Convert an RSS 2.0 channel into a JSON Feed 1.1 structure.
361///
362/// Used when `json-feed = true` in the configuration.
363#[must_use]
364pub fn rss_to_json_feed(
365    channel: &Channel,
366    feed_url: Option<&str>,
367    next_url: Option<&str>,
368) -> JsonFeed {
369    let items: Vec<JsonFeedItem> = channel
370        .items()
371        .iter()
372        .map(|item| {
373            let id = item
374                .guid()
375                .map(|g| g.value().to_string())
376                .or_else(|| item.link().map(std::string::ToString::to_string))
377                .unwrap_or_else(|| item.title().unwrap_or("").to_string());
378
379            let url = item.link().map(std::string::ToString::to_string);
380            let title = item.title().map(std::string::ToString::to_string);
381            let content_html = item.description().map(std::string::ToString::to_string);
382            let date_published = item.pub_date().and_then(|d| {
383                DateTime::parse_from_rfc2822(d)
384                    .ok()
385                    .map(|dt| dt.to_rfc3339())
386            });
387
388            let author = item.author().map(|a| serde_json::json!({ "name": a }));
389
390            JsonFeedItem {
391                id,
392                url,
393                title,
394                content_html,
395                date_published,
396                author,
397            }
398        })
399        .collect();
400
401    JsonFeed {
402        version: "https://jsonfeed.org/version/1.1".to_string(),
403        title: channel.title().to_string(),
404        home_page_url: Some(channel.link().to_string()),
405        feed_url: feed_url.map(std::string::ToString::to_string),
406        description: Some(channel.description().to_string()),
407        next_url: next_url.map(std::string::ToString::to_string),
408        items,
409    }
410}
/// Convert an RSS 2.0 channel into a minimal Atom 1.0 feed.
///
/// This is a best-effort mapping used when `atom = true` in the configuration.
/// It copies titles, links, descriptions (as HTML content), and dates where
/// available.
#[must_use]
pub fn rss_to_atom(channel: &Channel) -> AtomFeed {
    let entries: Vec<AtomEntry> = channel
        .items()
        .iter()
        .map(|item| {
            let mut entry = AtomEntry::default();

            // Stable per-entry id: prefer guid, then link, then title
            let entry_id = item
                .guid()
                .map(|g| g.value().to_string())
                .or_else(|| item.link().map(std::string::ToString::to_string))
                .unwrap_or_else(|| item.title().unwrap_or("").to_string());
            entry.set_id(entry_id);

            if let Some(title) = item.title() {
                entry.set_title(title.to_string());
            }

            // Atom uses <link href="…">; other link attributes keep their
            // crate defaults.
            if let Some(link) = item.link() {
                entry.set_links(vec![AtomLink {
                    href: link.to_string(),
                    ..Default::default()
                }]);
            }

            // Carry the RSS description over as inline HTML content.
            if let Some(desc) = item.description() {
                let mut content = AtomContent::default();
                content.set_content_type("html".to_string());
                content.set_value(Some(desc.to_string()));
                entry.set_content(Some(content));
            }

            // RSS pub dates are RFC 2822; unparsable dates are silently
            // dropped, leaving the entry's default `updated` value.
            if let Some(Ok(dt)) = item.pub_date().map(DateTime::parse_from_rfc2822) {
                entry.set_updated(dt);
            }

            entry
        })
        .collect();

    let mut feed = AtomFeed::default();
    feed.set_title(channel.title().to_string());
    feed.set_entries(entries);

    let link = channel.link();
    if link.is_empty() {
        // Fallback id if link is somehow empty
        feed.set_id(channel.title().to_string());
    } else {
        feed.set_links(vec![AtomLink {
            href: link.to_string(),
            ..Default::default()
        }]);
        // Use the public feed URL as a stable Atom feed id
        feed.set_id(link.to_string());
    }

    // Map the channel description to the Atom subtitle when present.
    let desc = channel.description();
    if !desc.is_empty() {
        feed.set_subtitle(Some(AtomText {
            value: desc.to_string(),
            ..Default::default()
        }));
    }

    feed
}
485
486/// Build one or more RSS 2.0 feeds for an mdBook.
487///
488/// This scans `src_dir` for chapters, extracts frontmatter, generates HTML
489/// previews, and returns a `BuildResult` containing one or more `FeedPage`s.
490/// The first page is always `rss.xml`; when `paginated` is `true` and
491/// `max_items > 0`, additional pages `rss2.xml`, `rss3.xml`, … are created.
492///
493/// Arguments:
494/// - `src_dir`: mdBook `src` directory to scan for `.md` files.
495/// - `title`: feed title, usually `config.book.title`.
496/// - `site_url`: public base URL of the rendered site (no trailing slash).
497/// - `description`: top-level feed description.
498/// - `full_preview`: when `true`, include full chapter content instead of a
499///   shortened preview in `<description>`.
500/// - `max_items`: maximum items per feed page when pagination is enabled.
501/// - `paginated`: enable or disable multi-page feeds.
502/// # Errors
503/// On success, the caller is responsible for writing each `FeedPage`'s channel
504/// to disk at `pages[i].filename`.
505/// Will return `Err` if:
506/// - The `src_dir` can't be accessed or doesn't exist
507/// - `collect_articles` fails to read or parse the md files
508/// - There are underlying I/O issues when walking the directory tree
509pub fn build_feed(
510    src_dir: &Path,
511    title: &str,
512    site_url: &str,
513    description: &str,
514    full_preview: bool,
515    max_items: usize,
516    paginated: bool,
517) -> Result<BuildResult> {
518    let articles = collect_articles(src_dir)?;
519
520    let base_url = site_url.trim_end_matches('/');
521
522    let items: Vec<Item> = articles
523        .into_iter()
524        .map(|article| {
525            // Build correct .html path
526            let html_path = article
527                .path
528                .replace('\\', "/")
529                .replace(".md", ".html")
530                .replace("/README.html", "/index.html");
531
532            let link = format!("{base_url}/{html_path}");
533
534            // Hybrid preview source selection
535            let content_trimmed = article.content.trim();
536
537            // Count chars to decide if body is "very short"
538            let _body_len = content_trimmed.chars().count();
539
540            // 1) Choose base markdown (body vs description)
541            let mut source_md: &str;
542
543            if full_preview {
544                // Full-content mode: always use the full body markdown
545                source_md = article.content.as_str();
546            } else {
547                // Only consider the first slice of markdown for preview
548                const PREVIEW_MD_SLICE_CHARS: usize = 4000;
549                // Preview mode: existing hybrid logic (body vs description, boilerplate strip, slice)
550                let content_trimmed = article.content.trim();
551                let body_len = content_trimmed.chars().count();
552
553                source_md =
554                    if body_len >= MIN_BODY_PREVIEW_CHARS || article.fm.description.is_none() {
555                        content_trimmed
556                    } else {
557                        article.fm.description.as_deref().unwrap_or(content_trimmed)
558                    };
559
560                // Strip obvious leading boilerplate so we start near the intro text
561                source_md = strip_leading_boilerplate(source_md);
562
563                source_md = utf8_prefix(source_md, PREVIEW_MD_SLICE_CHARS);
564            }
565
566            // Convert chosen markdown source → HTML
567            let raw_html = markdown_to_html(source_md);
568
569            // Use either full HTML or first few paragraphs as preview
570            let preview = if full_preview {
571                raw_html
572            } else {
573                html_first_paragraphs(&raw_html, 3, 800)
574            };
575
576            let mut item = ItemBuilder::default();
577
578            item.title(Some(article.fm.title.clone()));
579            item.link(Some(link.clone()));
580            item.description(Some(preview)); // Stored directly inside CDATA
581            item.guid(Some(Guid {
582                value: link,
583                permalink: true,
584            }));
585
586            if let Some(date) = article.fm.date {
587                item.pub_date(Some(date.to_rfc2822()));
588            }
589
590            if let Some(author) = article.fm.author {
591                item.author(Some(author));
592            }
593
594            item.build()
595        })
596        .collect();
597
598    // Helper to construct a single Channel with a slice of items
599    let build_channel_for_slice =
600        |slice: &[Item], _page_idx: usize, _total_pages: usize| -> Channel {
601            ChannelBuilder::default()
602                .title(title)
603                .link(format!("{base_url}/"))
604                .description(description)
605                .items(slice.to_vec())
606                .generator(Some("mdbook-rss-feed 1.0.0".to_string()))
607                .build()
608        };
609
610    let mut pages = Vec::new();
611
612    if !paginated || max_items == 0 || items.len() <= max_items {
613        // Single feed (no pagination)
614        let channel = build_channel_for_slice(&items, 1, 1);
615        pages.push(FeedPage {
616            filename: "rss.xml".to_string(),
617            channel,
618        });
619    } else {
620        // Split into pages of size max_items
621        let total_pages = items.len().div_ceil(max_items);
622
623        for page_idx in 0..total_pages {
624            let start = page_idx * max_items;
625            let end = (start + max_items).min(items.len());
626            let slice = &items[start..end];
627
628            let filename = if page_idx == 0 {
629                "rss.xml".to_string()
630            } else {
631                format!("rss{}.xml", page_idx + 1)
632            };
633
634            let channel = build_channel_for_slice(slice, page_idx + 1, total_pages);
635
636            pages.push(FeedPage { filename, channel });
637        }
638    }
639
640    Ok(BuildResult { pages })
641}