//! `llm_wiki/ops/export.rs` — export a wiki's pages to llms.txt / JSON files.
1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3
4use anyhow::{Context, Result};
5use serde::{Deserialize, Serialize};
6use tantivy::collector::TopDocs;
7use tantivy::query::AllQuery;
8use tantivy::schema::Value;
9
10use crate::engine::EngineState;
11use crate::index_schema::IndexSchema;
12
13// ── Types ─────────────────────────────────────────────────────────────────────
14
/// Options controlling the wiki export operation.
#[derive(Debug, Clone)]
pub struct ExportOptions {
    /// Name of the wiki to export.
    pub wiki: String,
    /// Output path — resolved against the wiki root if relative;
    /// when `None`, defaults to `llms.txt` inside the wiki root.
    pub path: Option<String>,
    /// Output format: llms-txt (default), llms-full, or JSON.
    pub format: ExportFormat,
    /// Whether to include archived pages (default: false).
    pub include_archived: bool,
}
27
/// Supported wiki export formats.
///
/// A field-less enum, so it also derives `Copy` and `Eq` (the original
/// only derived `Clone`/`PartialEq`, forcing needless clones on callers).
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum ExportFormat {
    /// Compact llms.txt listing with titles and summaries.
    #[default]
    LlmsTxt,
    /// Full llms.txt with complete page bodies.
    LlmsFull,
    /// JSON array of page entries with metadata and bodies.
    Json,
}

impl ExportFormat {
    /// Return the canonical string representation of the format.
    pub fn as_str(&self) -> &'static str {
        match self {
            ExportFormat::LlmsTxt => "llms-txt",
            ExportFormat::LlmsFull => "llms-full",
            ExportFormat::Json => "json",
        }
    }

    /// Parse a format string; falls back to `LlmsTxt` for unrecognised
    /// input (including the canonical `"llms-txt"` itself).
    pub fn parse(s: &str) -> Self {
        match s {
            "llms-full" => ExportFormat::LlmsFull,
            "json" => ExportFormat::Json,
            _ => ExportFormat::LlmsTxt,
        }
    }
}
59
/// Summary of a completed wiki export.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExportReport {
    /// Absolute path of the written output file (lossy UTF-8 conversion
    /// of the resolved path).
    pub path: String,
    /// Number of pages written to the output.
    pub pages_written: usize,
    /// Total bytes written.
    pub bytes: usize,
    /// Name of the format used (e.g. `"llms-txt"`).
    pub format: String,
}
72
/// One page row of an export. Serialized as-is for the JSON format, so the
/// field order here is the JSON field order — do not reorder.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PageEntry {
    /// Page slug; also the markdown filename stem under the wiki root.
    slug: String,
    /// `wiki://<wiki>/<slug>` URI for the page.
    uri: String,
    title: String,
    r#type: String,
    status: String,
    confidence: f64,
    summary: String,
    /// Markdown body with frontmatter stripped; populated only for formats
    /// that need it, and omitted from JSON output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    body: Option<String>,
}
85
86// ── export ────────────────────────────────────────────────────────────────────
87
88/// Export a wiki to a file in the requested format.
89pub fn export(engine: &EngineState, options: &ExportOptions) -> Result<ExportReport> {
90    let space = engine.space(&options.wiki)?;
91    let wiki_root = &space.wiki_root;
92
93    let resolved_path = resolve_path(options.path.as_deref(), wiki_root);
94
95    let searcher = space.index_manager.searcher()?;
96    let is = &space.index_schema;
97
98    let pages = collect_pages(&searcher, is, &options.wiki, options.include_archived)?;
99
100    let need_bodies = matches!(options.format, ExportFormat::LlmsFull | ExportFormat::Json);
101    let pages = if need_bodies {
102        load_bodies(pages, wiki_root)?
103    } else {
104        pages
105    };
106
107    let content = match options.format {
108        ExportFormat::LlmsTxt => render_llms_txt(&pages, &options.wiki),
109        ExportFormat::LlmsFull => render_llms_full(&pages, &options.wiki),
110        ExportFormat::Json => {
111            serde_json::to_string_pretty(&pages).context("failed to serialize pages to JSON")?
112        }
113    };
114
115    if let Some(parent) = resolved_path.parent()
116        && !parent.as_os_str().is_empty()
117    {
118        std::fs::create_dir_all(parent)
119            .with_context(|| format!("failed to create directory {}", parent.display()))?;
120    }
121    std::fs::write(&resolved_path, &content)
122        .with_context(|| format!("failed to write export to {}", resolved_path.display()))?;
123
124    Ok(ExportReport {
125        path: resolved_path.to_string_lossy().to_string(),
126        pages_written: pages.len(),
127        bytes: content.len(),
128        format: options.format.as_str().to_string(),
129    })
130}
131
132// ── Helpers ───────────────────────────────────────────────────────────────────
133
/// Turn an optional user-supplied path into a concrete output location:
/// absolute paths pass through, relative paths are joined onto the wiki
/// root, and `None` falls back to `llms.txt` in the wiki root.
fn resolve_path(path: Option<&str>, wiki_root: &Path) -> PathBuf {
    match path {
        Some(p) if Path::new(p).is_absolute() => PathBuf::from(p),
        Some(p) => wiki_root.join(p),
        None => wiki_root.join("llms.txt"),
    }
}
143
144fn collect_pages(
145    searcher: &tantivy::Searcher,
146    is: &IndexSchema,
147    wiki_name: &str,
148    include_archived: bool,
149) -> Result<Vec<PageEntry>> {
150    let f_slug = is.field("slug");
151    let f_title = is.field("title");
152    let f_type = is.field("type");
153    let f_status = is.field("status");
154    let f_confidence = is.try_field("confidence");
155    let f_summary = is.try_field("summary");
156
157    let top_docs = searcher.search(&AllQuery, &TopDocs::with_limit(100_000).order_by_score())?;
158
159    let mut pages = Vec::new();
160    for (_score, doc_addr) in &top_docs {
161        let doc: tantivy::TantivyDocument = searcher.doc(*doc_addr)?;
162
163        let slug = doc
164            .get_first(f_slug)
165            .and_then(|v| v.as_str())
166            .unwrap_or("")
167            .to_string();
168        if slug.is_empty() {
169            continue;
170        }
171
172        let status = doc
173            .get_first(f_status)
174            .and_then(|v| v.as_str())
175            .unwrap_or("")
176            .to_string();
177
178        if !include_archived && status == "archived" {
179            continue;
180        }
181
182        let title = doc
183            .get_first(f_title)
184            .and_then(|v| v.as_str())
185            .unwrap_or("")
186            .to_string();
187        let page_type = doc
188            .get_first(f_type)
189            .and_then(|v| v.as_str())
190            .unwrap_or("")
191            .to_string();
192        let confidence = f_confidence
193            .and_then(|f| doc.get_first(f))
194            .and_then(|v| v.as_f64())
195            .unwrap_or(0.5);
196        let summary = f_summary
197            .and_then(|f| doc.get_first(f))
198            .and_then(|v| v.as_str())
199            .filter(|s| !s.is_empty())
200            .unwrap_or("")
201            .to_string();
202
203        let uri = format!("wiki://{wiki_name}/{slug}");
204
205        pages.push(PageEntry {
206            slug,
207            uri,
208            title,
209            r#type: page_type,
210            status,
211            confidence,
212            summary,
213            body: None,
214        });
215    }
216
217    // Sort: group by type (count desc), within group by confidence desc then title asc
218    let mut type_counts: HashMap<String, usize> = HashMap::new();
219    for p in &pages {
220        *type_counts.entry(p.r#type.clone()).or_insert(0) += 1;
221    }
222    pages.sort_by(|a, b| {
223        let ca = type_counts.get(&a.r#type).copied().unwrap_or(0);
224        let cb = type_counts.get(&b.r#type).copied().unwrap_or(0);
225        cb.cmp(&ca)
226            .then(a.r#type.cmp(&b.r#type))
227            .then(
228                b.confidence
229                    .partial_cmp(&a.confidence)
230                    .unwrap_or(std::cmp::Ordering::Equal),
231            )
232            .then(a.title.cmp(&b.title))
233    });
234
235    Ok(pages)
236}
237
238fn load_bodies(mut pages: Vec<PageEntry>, wiki_root: &Path) -> Result<Vec<PageEntry>> {
239    for page in &mut pages {
240        let path = wiki_root.join(format!("{}.md", page.slug));
241        if path.exists() {
242            let raw = std::fs::read_to_string(&path)
243                .with_context(|| format!("failed to read {}", path.display()))?;
244            // Strip frontmatter (between --- delimiters)
245            let body = strip_frontmatter(&raw);
246            page.body = Some(body.to_string());
247        }
248    }
249    Ok(pages)
250}
251
/// Strip a leading `---`-delimited frontmatter block from `content`.
///
/// Returns the text after the closing `---` line (and its trailing newline,
/// if any). If `content` does not open with `---`, or no closing delimiter
/// line is found, the input is returned unchanged.
fn strip_frontmatter(content: &str) -> &str {
    if !content.starts_with("---") {
        return content;
    }
    // The closing fence must be a line that is exactly "---". The previous
    // search for "\n---" accepted any line *starting* with "---" ("----",
    // "---text", …) and cut the body in the middle of that line.
    let mut from = 3; // skip the opening "---"
    while let Some(pos) = content[from..].find("\n---") {
        let after = from + pos + 4; // index just past the candidate "---"
        match content.as_bytes().get(after) {
            // "---" at end of input: everything was frontmatter.
            None => return "",
            Some(b'\n') => return &content[after + 1..],
            // Tolerate CRLF line endings on the closing fence.
            Some(b'\r') if content[after..].starts_with("\r\n") => {
                return &content[after + 2..];
            }
            // Line merely starts with "---"; keep searching.
            _ => from = after,
        }
    }
    content
}
270
271// ── Renderers ─────────────────────────────────────────────────────────────────
272
273fn render_llms_txt(pages: &[PageEntry], wiki_name: &str) -> String {
274    let mut out = format!("# {wiki_name}\n\n");
275    out.push_str(&format!("{} pages\n\n", pages.len()));
276
277    let mut current_type = "";
278    for page in pages {
279        if page.r#type != current_type {
280            current_type = &page.r#type;
281            let count = pages.iter().filter(|p| p.r#type == current_type).count();
282            out.push_str(&format!("## {} ({})\n\n", current_type, count));
283        }
284        if page.summary.is_empty() {
285            out.push_str(&format!("- [{}]({})\n", page.title, page.uri));
286        } else {
287            out.push_str(&format!(
288                "- [{}]({}): {}\n",
289                page.title, page.uri, page.summary
290            ));
291        }
292    }
293    out
294}
295
296fn render_llms_full(pages: &[PageEntry], wiki_name: &str) -> String {
297    let mut out = format!("# {wiki_name}\n\n");
298    out.push_str(&format!("{} pages\n\n", pages.len()));
299
300    for page in pages {
301        out.push_str("---\n\n");
302        out.push_str(&format!("# [{}]({})\n\n", page.title, page.uri));
303        if !page.summary.is_empty() {
304            out.push_str(&format!("_{}_\n\n", page.summary));
305        }
306        if let Some(ref body) = page.body {
307            out.push_str(body.trim());
308            out.push_str("\n\n");
309        }
310    }
311    out
312}