typstify_generator/
collector.rs

1//! Content collection and organization.
2//!
3//! Walks the content directory and collects all pages into a structured hierarchy.
4
5use std::{
6    collections::HashMap,
7    fs,
8    path::{Path, PathBuf},
9};
10
11use rayon::prelude::*;
12use thiserror::Error;
13use tracing::{debug, info, warn};
14use typstify_core::{Config, ContentPath, ContentType, Page};
15use typstify_parser::ParserRegistry;
16
17/// Content collection errors.
18#[derive(Debug, Error)]
19pub enum CollectorError {
20    /// IO error.
21    #[error("IO error: {0}")]
22    Io(#[from] std::io::Error),
23
24    /// Parser error.
25    #[error("parse error in {path}: {message}")]
26    Parse { path: PathBuf, message: String },
27
28    /// Invalid content path.
29    #[error("invalid content path: {0}")]
30    InvalidPath(PathBuf),
31}
32
33/// Result type for collector operations.
34pub type Result<T> = std::result::Result<T, CollectorError>;
35
36/// Collected site content.
37#[derive(Debug, Default)]
38pub struct SiteContent {
39    /// All pages indexed by slug.
40    pub pages: HashMap<String, Page>,
41
42    /// Pages organized by section (first path component).
43    pub sections: HashMap<String, Vec<String>>,
44
45    /// Taxonomy term to page slugs mapping.
46    pub taxonomies: TaxonomyIndex,
47
48    /// Translation groups (canonical_id -> [slugs]).
49    pub translations: HashMap<String, Vec<String>>,
50}
51
/// Index of taxonomy terms.
///
/// Each map goes from a term to the URLs of the pages carrying that term. The
/// collector stores page URLs here (the same strings that key the page map), so
/// entries can be resolved to pages without further transformation.
#[derive(Debug, Default)]
pub struct TaxonomyIndex {
    /// Tag -> page URLs.
    pub tags: HashMap<String, Vec<String>>,

    /// Category -> page URLs.
    pub categories: HashMap<String, Vec<String>>,
}
61
62/// Content collector that walks directories and parses files.
63#[derive(Debug)]
64pub struct ContentCollector {
65    config: Config,
66    parser: ParserRegistry,
67    content_dir: PathBuf,
68}
69
70impl ContentCollector {
71    /// Create a new content collector.
72    #[must_use]
73    pub fn new(config: Config, content_dir: impl Into<PathBuf>) -> Self {
74        Self {
75            config,
76            parser: ParserRegistry::new(),
77            content_dir: content_dir.into(),
78        }
79    }
80
81    /// Collect all content from the content directory.
82    pub fn collect(&self) -> Result<SiteContent> {
83        info!(dir = %self.content_dir.display(), "collecting content");
84
85        // Find all content files
86        let files = self.find_content_files()?;
87        info!(count = files.len(), "found content files");
88
89        // Parse files in parallel
90        let pages: Vec<_> = files
91            .par_iter()
92            .filter_map(|path| {
93                match self.parse_file(path) {
94                    Ok(page) => {
95                        // Filter drafts unless configured to include them
96                        if page.draft && !self.config.build.drafts {
97                            debug!(url = %page.url, "skipping draft");
98                            None
99                        } else {
100                            Some(page)
101                        }
102                    }
103                    Err(e) => {
104                        warn!(path = %path.display(), error = %e, "failed to parse file");
105                        None
106                    }
107                }
108            })
109            .collect();
110
111        // Build site content structure
112        let mut content = SiteContent::default();
113
114        for page in pages {
115            let url = page.url.clone();
116            let slug = url.trim_start_matches('/').to_string();
117
118            // Add to sections
119            let section = slug.split('/').next().unwrap_or("").to_string();
120            if !section.is_empty() {
121                content
122                    .sections
123                    .entry(section)
124                    .or_default()
125                    .push(url.clone());
126            }
127
128            // Index taxonomies
129            for tag in &page.tags {
130                content
131                    .taxonomies
132                    .tags
133                    .entry(tag.clone())
134                    .or_default()
135                    .push(url.clone());
136            }
137            for category in &page.categories {
138                content
139                    .taxonomies
140                    .categories
141                    .entry(category.clone())
142                    .or_default()
143                    .push(url.clone());
144            }
145
146            // Index translations
147            if !page.canonical_id.is_empty() {
148                content
149                    .translations
150                    .entry(page.canonical_id.clone())
151                    .or_default()
152                    .push(url.clone());
153            }
154
155            content.pages.insert(url, page);
156        }
157
158        info!(
159            pages = content.pages.len(),
160            sections = content.sections.len(),
161            tags = content.taxonomies.tags.len(),
162            categories = content.taxonomies.categories.len(),
163            "content collection complete"
164        );
165
166        Ok(content)
167    }
168
169    /// Find all content files recursively.
170    fn find_content_files(&self) -> Result<Vec<PathBuf>> {
171        let mut files = Vec::new();
172        self.walk_dir(&self.content_dir, &mut files)?;
173        Ok(files)
174    }
175
176    /// Recursively walk a directory for content files.
177    fn walk_dir(&self, dir: &Path, files: &mut Vec<PathBuf>) -> Result<()> {
178        if !dir.exists() {
179            return Ok(());
180        }
181
182        for entry in fs::read_dir(dir)? {
183            let entry = entry?;
184            let path = entry.path();
185
186            if path.is_dir() {
187                // Skip hidden directories
188                if path
189                    .file_name()
190                    .is_some_and(|n| n.to_string_lossy().starts_with('.'))
191                {
192                    continue;
193                }
194                self.walk_dir(&path, files)?;
195            } else if path.is_file() {
196                // Check if it's a content file
197                if let Some(ext) = path.extension()
198                    && ContentType::from_extension(&ext.to_string_lossy()).is_some()
199                {
200                    files.push(path);
201                }
202            }
203        }
204
205        Ok(())
206    }
207
208    /// Parse a single content file into a Page.
209    fn parse_file(&self, path: &Path) -> Result<Page> {
210        debug!(path = %path.display(), "parsing file");
211
212        // Read file content
213        let content = fs::read_to_string(path)?;
214
215        // Parse content path to extract slug and language
216        let relative_path = path.strip_prefix(&self.content_dir).unwrap_or(path);
217        let content_path =
218            ContentPath::from_path(relative_path, &self.config.site.default_language)
219                .ok_or_else(|| CollectorError::InvalidPath(path.to_path_buf()))?;
220
221        // Parse content using appropriate parser
222        let parsed = self
223            .parser
224            .parse(&content, path)
225            .map_err(|e| CollectorError::Parse {
226                path: path.to_path_buf(),
227                message: e.to_string(),
228            })?;
229
230        Ok(Page::from_parsed(parsed, &content_path))
231    }
232
233    /// Get pages sorted by date (newest first).
234    pub fn pages_by_date(content: &SiteContent) -> Vec<&Page> {
235        let mut pages: Vec<_> = content.pages.values().collect();
236        pages.sort_by(|a, b| match (&b.date, &a.date) {
237            (Some(b_date), Some(a_date)) => b_date.cmp(a_date),
238            (Some(_), None) => std::cmp::Ordering::Less,
239            (None, Some(_)) => std::cmp::Ordering::Greater,
240            (None, None) => a.title.cmp(&b.title),
241        });
242        pages
243    }
244
245    /// Get pages for a specific section, sorted by date.
246    pub fn section_pages<'a>(content: &'a SiteContent, section: &str) -> Vec<&'a Page> {
247        let mut pages: Vec<_> = content
248            .sections
249            .get(section)
250            .map(|urls| urls.iter().filter_map(|u| content.pages.get(u)).collect())
251            .unwrap_or_default();
252
253        pages.sort_by(|a, b| match (&b.date, &a.date) {
254            (Some(b_date), Some(a_date)) => b_date.cmp(a_date),
255            (Some(_), None) => std::cmp::Ordering::Less,
256            (None, Some(_)) => std::cmp::Ordering::Greater,
257            (None, None) => a.title.cmp(&b.title),
258        });
259        pages
260    }
261
262    /// Get pages for a taxonomy term, sorted by date.
263    pub fn taxonomy_pages<'a>(
264        content: &'a SiteContent,
265        taxonomy: &str,
266        term: &str,
267    ) -> Vec<&'a Page> {
268        let urls = match taxonomy {
269            "tags" => content.taxonomies.tags.get(term),
270            "categories" => content.taxonomies.categories.get(term),
271            _ => None,
272        };
273
274        let mut pages: Vec<_> = urls
275            .map(|u| u.iter().filter_map(|url| content.pages.get(url)).collect())
276            .unwrap_or_default();
277
278        pages.sort_by(|a, b| match (&b.date, &a.date) {
279            (Some(b_date), Some(a_date)) => b_date.cmp(a_date),
280            (Some(_), None) => std::cmp::Ordering::Less,
281            (None, Some(_)) => std::cmp::Ordering::Greater,
282            (None, None) => a.title.cmp(&b.title),
283        });
284        pages
285    }
286}
287
/// Paginate a slice of items.
///
/// `page` is 1-indexed and `per_page` is the maximum number of items per page.
/// Returns the items on the requested page together with the total number of pages.
///
/// Out-of-range requests never panic:
/// - `per_page == 0` yields an empty slice and a total of `0` pages;
/// - `page == 0`, or a page past the end, yields an empty slice with the real total.
pub fn paginate<T>(items: &[T], page: usize, per_page: usize) -> (&[T], usize) {
    // A page size of zero cannot hold any item and would divide by zero below.
    if per_page == 0 {
        return (&[], 0);
    }

    let total_pages = items.len().div_ceil(per_page);

    // `page` is 1-indexed: page 0 has no predecessor, and a huge page number must
    // not overflow when converted into an offset.
    let start = match page.checked_sub(1).and_then(|p| p.checked_mul(per_page)) {
        Some(start) if start < items.len() => start,
        _ => return (&[], total_pages),
    };
    let end = start.saturating_add(per_page).min(items.len());

    (&items[start..end], total_pages)
}
300
#[cfg(test)]
mod tests {
    use std::collections::HashMap;

    use super::*;

    /// Build a minimal configuration with drafts disabled.
    // Not called by any test in this module, hence the lint allowance.
    #[allow(dead_code)]
    fn test_config() -> Config {
        use typstify_core::config::{
            BuildConfig, RobotsConfig, RssConfig, SearchConfig, SiteConfig, TaxonomyConfig,
        };

        let site = SiteConfig {
            title: "Test Site".to_string(),
            base_url: "https://example.com".to_string(),
            default_language: "en".to_string(),
            description: None,
            author: None,
        };
        let build = BuildConfig {
            drafts: false,
            ..Default::default()
        };

        Config {
            site,
            languages: HashMap::new(),
            build,
            search: SearchConfig::default(),
            rss: RssConfig::default(),
            robots: RobotsConfig::default(),
            taxonomies: TaxonomyConfig::default(),
        }
    }

    /// Full pages, the short last page, and a page past the end.
    #[test]
    fn test_paginate() {
        let items: Vec<i32> = (1..=10).collect();

        let (first, total) = paginate(&items, 1, 3);
        assert_eq!(first, &[1, 2, 3]);
        assert_eq!(total, 4);

        let (second, _) = paginate(&items, 2, 3);
        assert_eq!(second, &[4, 5, 6]);

        let (last, _) = paginate(&items, 4, 3);
        assert_eq!(last, &[10]);

        let (beyond, _) = paginate(&items, 5, 3);
        assert!(beyond.is_empty());
    }

    /// Terms map to the entries inserted for them; unknown terms are absent.
    #[test]
    fn test_taxonomy_index() {
        let mut index = TaxonomyIndex::default();
        index.tags.extend([
            (
                "rust".to_string(),
                vec!["post1".to_string(), "post2".to_string()],
            ),
            ("web".to_string(), vec!["post2".to_string()]),
        ]);

        assert_eq!(index.tags.get("rust").unwrap().len(), 2);
        assert_eq!(index.tags.get("web").unwrap().len(), 1);
        assert!(!index.tags.contains_key("python"));
    }

    /// A default `SiteContent` starts with empty indexes.
    #[test]
    fn test_site_content_default() {
        let content = SiteContent::default();
        assert!(content.pages.is_empty());
        assert!(content.sections.is_empty());
        assert!(content.taxonomies.tags.is_empty());
    }
}
370}