// typstify_generator/collector.rs

1//! Content collection and organization.
2//!
3//! Walks the content directory and collects all pages into a structured hierarchy.
4
5use std::{
6    collections::HashMap,
7    fs,
8    path::{Path, PathBuf},
9};
10
11use rayon::prelude::*;
12use thiserror::Error;
13use tracing::{debug, info, warn};
14use typstify_core::{Config, ContentPath, ContentType, Page};
15use typstify_parser::ParserRegistry;
16
17/// Content collection errors.
18#[derive(Debug, Error)]
19pub enum CollectorError {
20    /// IO error.
21    #[error("IO error: {0}")]
22    Io(#[from] std::io::Error),
23
24    /// Parser error.
25    #[error("parse error in {path}: {message}")]
26    Parse { path: PathBuf, message: String },
27
28    /// Invalid content path.
29    #[error("invalid content path: {0}")]
30    InvalidPath(PathBuf),
31}
32
33/// Result type for collector operations.
34pub type Result<T> = std::result::Result<T, CollectorError>;
35
/// Collected site content.
#[derive(Debug, Default)]
pub struct SiteContent {
    /// All pages, keyed by page URL (`collect` inserts with `page.url`,
    /// e.g. `/blog/post`), not by bare slug.
    pub pages: HashMap<String, Page>,

    /// Page URLs grouped by section (the first path component of the
    /// slug, i.e. the URL with its leading `/` stripped).
    pub sections: HashMap<String, Vec<String>>,

    /// Taxonomy term to page URL index.
    pub taxonomies: TaxonomyIndex,
}
48
/// Index of taxonomy terms.
#[derive(Debug, Default)]
pub struct TaxonomyIndex {
    /// Tag -> URLs of the pages carrying that tag.
    pub tags: HashMap<String, Vec<String>>,

    /// Category -> URLs of the pages in that category.
    pub categories: HashMap<String, Vec<String>>,
}
58
/// Content collector that walks directories and parses files.
#[derive(Debug)]
pub struct ContentCollector {
    /// Site configuration; `build.drafts` and `site.default_language`
    /// are consulted during collection.
    config: Config,
    /// Parser registry used to parse each content file.
    parser: ParserRegistry,
    /// Root directory that is walked for content files.
    content_dir: PathBuf,
}
66
67impl ContentCollector {
68    /// Create a new content collector.
69    #[must_use]
70    pub fn new(config: Config, content_dir: impl Into<PathBuf>) -> Self {
71        Self {
72            config,
73            parser: ParserRegistry::new(),
74            content_dir: content_dir.into(),
75        }
76    }
77
78    /// Collect all content from the content directory.
79    pub fn collect(&self) -> Result<SiteContent> {
80        info!(dir = %self.content_dir.display(), "collecting content");
81
82        // Find all content files
83        let files = self.find_content_files()?;
84        info!(count = files.len(), "found content files");
85
86        // Parse files in parallel
87        let pages: Vec<_> = files
88            .par_iter()
89            .filter_map(|path| {
90                match self.parse_file(path) {
91                    Ok(page) => {
92                        // Filter drafts unless configured to include them
93                        if page.draft && !self.config.build.drafts {
94                            debug!(url = %page.url, "skipping draft");
95                            None
96                        } else {
97                            Some(page)
98                        }
99                    }
100                    Err(e) => {
101                        warn!(path = %path.display(), error = %e, "failed to parse file");
102                        None
103                    }
104                }
105            })
106            .collect();
107
108        // Build site content structure
109        let mut content = SiteContent::default();
110
111        for page in pages {
112            let url = page.url.clone();
113            let slug = url.trim_start_matches('/').to_string();
114
115            // Add to sections
116            let section = slug.split('/').next().unwrap_or("").to_string();
117            if !section.is_empty() {
118                content
119                    .sections
120                    .entry(section)
121                    .or_default()
122                    .push(url.clone());
123            }
124
125            // Index taxonomies
126            for tag in &page.tags {
127                content
128                    .taxonomies
129                    .tags
130                    .entry(tag.clone())
131                    .or_default()
132                    .push(url.clone());
133            }
134            for category in &page.categories {
135                content
136                    .taxonomies
137                    .categories
138                    .entry(category.clone())
139                    .or_default()
140                    .push(url.clone());
141            }
142
143            content.pages.insert(url, page);
144        }
145
146        info!(
147            pages = content.pages.len(),
148            sections = content.sections.len(),
149            tags = content.taxonomies.tags.len(),
150            categories = content.taxonomies.categories.len(),
151            "content collection complete"
152        );
153
154        Ok(content)
155    }
156
157    /// Find all content files recursively.
158    fn find_content_files(&self) -> Result<Vec<PathBuf>> {
159        let mut files = Vec::new();
160        self.walk_dir(&self.content_dir, &mut files)?;
161        Ok(files)
162    }
163
164    /// Recursively walk a directory for content files.
165    fn walk_dir(&self, dir: &Path, files: &mut Vec<PathBuf>) -> Result<()> {
166        if !dir.exists() {
167            return Ok(());
168        }
169
170        for entry in fs::read_dir(dir)? {
171            let entry = entry?;
172            let path = entry.path();
173
174            if path.is_dir() {
175                // Skip hidden directories
176                if path
177                    .file_name()
178                    .is_some_and(|n| n.to_string_lossy().starts_with('.'))
179                {
180                    continue;
181                }
182                self.walk_dir(&path, files)?;
183            } else if path.is_file() {
184                // Check if it's a content file
185                if let Some(ext) = path.extension()
186                    && ContentType::from_extension(&ext.to_string_lossy()).is_some()
187                {
188                    files.push(path);
189                }
190            }
191        }
192
193        Ok(())
194    }
195
196    /// Parse a single content file into a Page.
197    fn parse_file(&self, path: &Path) -> Result<Page> {
198        debug!(path = %path.display(), "parsing file");
199
200        // Read file content
201        let content = fs::read_to_string(path)?;
202
203        // Parse content path to extract slug and language
204        let relative_path = path.strip_prefix(&self.content_dir).unwrap_or(path);
205        let content_path =
206            ContentPath::from_path(relative_path, &self.config.site.default_language)
207                .ok_or_else(|| CollectorError::InvalidPath(path.to_path_buf()))?;
208
209        // Parse content using appropriate parser
210        let parsed = self
211            .parser
212            .parse(&content, path)
213            .map_err(|e| CollectorError::Parse {
214                path: path.to_path_buf(),
215                message: e.to_string(),
216            })?;
217
218        Ok(Page::from_parsed(parsed, &content_path))
219    }
220
221    /// Get pages sorted by date (newest first).
222    pub fn pages_by_date(content: &SiteContent) -> Vec<&Page> {
223        let mut pages: Vec<_> = content.pages.values().collect();
224        pages.sort_by(|a, b| match (&b.date, &a.date) {
225            (Some(b_date), Some(a_date)) => b_date.cmp(a_date),
226            (Some(_), None) => std::cmp::Ordering::Less,
227            (None, Some(_)) => std::cmp::Ordering::Greater,
228            (None, None) => a.title.cmp(&b.title),
229        });
230        pages
231    }
232
233    /// Get pages for a specific section, sorted by date.
234    pub fn section_pages<'a>(content: &'a SiteContent, section: &str) -> Vec<&'a Page> {
235        let mut pages: Vec<_> = content
236            .sections
237            .get(section)
238            .map(|urls| urls.iter().filter_map(|u| content.pages.get(u)).collect())
239            .unwrap_or_default();
240
241        pages.sort_by(|a, b| match (&b.date, &a.date) {
242            (Some(b_date), Some(a_date)) => b_date.cmp(a_date),
243            (Some(_), None) => std::cmp::Ordering::Less,
244            (None, Some(_)) => std::cmp::Ordering::Greater,
245            (None, None) => a.title.cmp(&b.title),
246        });
247        pages
248    }
249
250    /// Get pages for a taxonomy term, sorted by date.
251    pub fn taxonomy_pages<'a>(
252        content: &'a SiteContent,
253        taxonomy: &str,
254        term: &str,
255    ) -> Vec<&'a Page> {
256        let urls = match taxonomy {
257            "tags" => content.taxonomies.tags.get(term),
258            "categories" => content.taxonomies.categories.get(term),
259            _ => None,
260        };
261
262        let mut pages: Vec<_> = urls
263            .map(|u| u.iter().filter_map(|url| content.pages.get(url)).collect())
264            .unwrap_or_default();
265
266        pages.sort_by(|a, b| match (&b.date, &a.date) {
267            (Some(b_date), Some(a_date)) => b_date.cmp(a_date),
268            (Some(_), None) => std::cmp::Ordering::Less,
269            (None, Some(_)) => std::cmp::Ordering::Greater,
270            (None, None) => a.title.cmp(&b.title),
271        });
272        pages
273    }
274}
275
/// Paginate a slice of items.
///
/// `page` is 1-based. Returns the slice of items on that page together
/// with the total number of pages. A `page` of 0 or past the end yields
/// an empty slice; `per_page == 0` yields an empty slice and 0 pages
/// instead of panicking.
pub fn paginate<T>(items: &[T], page: usize, per_page: usize) -> (&[T], usize) {
    // Guard: div_ceil(0) would panic with a zero divisor.
    if per_page == 0 {
        return (&[], 0);
    }

    let total_pages = items.len().div_ceil(per_page);

    // Guard: `page - 1` would underflow (panic in debug builds) for page 0.
    let Some(page_index) = page.checked_sub(1) else {
        return (&[], total_pages);
    };

    let start = page_index * per_page;
    let end = (start + per_page).min(items.len());

    if start >= items.len() {
        (&[], total_pages)
    } else {
        (&items[start..end], total_pages)
    }
}
288
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a minimal English-only config with drafts disabled.
    #[allow(dead_code)]
    fn test_config() -> Config {
        let site = typstify_core::config::SiteConfig {
            title: "Test Site".to_string(),
            base_url: "https://example.com".to_string(),
            default_language: "en".to_string(),
            languages: vec!["en".to_string()],
            description: None,
            author: None,
        };
        let build = typstify_core::config::BuildConfig {
            drafts: false,
            ..Default::default()
        };
        Config {
            site,
            build,
            search: typstify_core::config::SearchConfig::default(),
            rss: typstify_core::config::RssConfig::default(),
            taxonomies: typstify_core::config::TaxonomyConfig::default(),
        }
    }

    #[test]
    fn test_paginate() {
        let data: Vec<i32> = (1..=10).collect();

        // First page and total page count.
        let (first, pages_total) = paginate(&data, 1, 3);
        assert_eq!(first, &[1, 2, 3]);
        assert_eq!(pages_total, 4);

        // A middle page.
        assert_eq!(paginate(&data, 2, 3).0, &[4, 5, 6]);

        // The final, partially-filled page.
        assert_eq!(paginate(&data, 4, 3).0, &[10]);

        // Requesting past the end yields nothing.
        assert!(paginate(&data, 5, 3).0.is_empty());
    }

    #[test]
    fn test_taxonomy_index() {
        let mut idx = TaxonomyIndex::default();
        idx.tags.insert(
            "rust".to_string(),
            vec!["post1".to_string(), "post2".to_string()],
        );
        idx.tags
            .insert("web".to_string(), vec!["post2".to_string()]);

        // Present terms report their page counts; absent terms are None.
        assert_eq!(idx.tags.get("rust").map(Vec::len), Some(2));
        assert_eq!(idx.tags.get("web").map(Vec::len), Some(1));
        assert!(idx.tags.get("python").is_none());
    }

    #[test]
    fn test_site_content_default() {
        // A default SiteContent carries no pages, sections, or tags.
        let empty = SiteContent::default();
        assert!(empty.pages.is_empty());
        assert!(empty.sections.is_empty());
        assert!(empty.taxonomies.tags.is_empty());
    }
}
355}