1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3
4use anyhow::{Context, Result};
5use serde::{Deserialize, Serialize};
6use tantivy::collector::TopDocs;
7use tantivy::query::AllQuery;
8use tantivy::schema::Value;
9
10use crate::engine::EngineState;
11use crate::index_schema::IndexSchema;
12
/// Options controlling a single wiki export run.
#[derive(Debug, Clone)]
pub struct ExportOptions {
    /// Name of the wiki (space) to export.
    pub wiki: String,
    /// Destination path. Relative paths are resolved against the wiki
    /// root; `None` defaults to `llms.txt` inside the wiki root.
    pub path: Option<String>,
    /// Output format to render.
    pub format: ExportFormat,
    /// When true, pages whose status is "archived" are included.
    pub include_archived: bool,
}
27
/// Supported export output formats.
#[derive(Debug, Clone, Default, PartialEq)]
pub enum ExportFormat {
    /// Compact `llms.txt` index: one link line per page (default).
    #[default]
    LlmsTxt,
    /// `llms-full`: the index plus each page's full markdown body.
    LlmsFull,
    /// Pretty-printed JSON array of page entries.
    Json,
}
39
40impl ExportFormat {
41 pub fn as_str(&self) -> &'static str {
43 match self {
44 ExportFormat::LlmsTxt => "llms-txt",
45 ExportFormat::LlmsFull => "llms-full",
46 ExportFormat::Json => "json",
47 }
48 }
49
50 pub fn parse(s: &str) -> Self {
52 match s {
53 "llms-full" => ExportFormat::LlmsFull,
54 "json" => ExportFormat::Json,
55 _ => ExportFormat::LlmsTxt,
56 }
57 }
58}
59
/// Summary of a completed export, returned to the caller.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExportReport {
    /// Path the export was written to (lossy UTF-8 rendering).
    pub path: String,
    /// Number of pages included in the output.
    pub pages_written: usize,
    /// Size of the written content in bytes.
    pub bytes: usize,
    /// Format identifier (see `ExportFormat::as_str`).
    pub format: String,
}
72
/// One page row pulled from the search index, optionally carrying its
/// markdown body (populated by `load_bodies` for full/JSON exports).
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PageEntry {
    /// Page slug (also the markdown filename stem under the wiki root).
    slug: String,
    /// `wiki://<wiki>/<slug>` URI used for links in rendered output.
    uri: String,
    title: String,
    /// Page type; used to group and order pages in the output.
    r#type: String,
    /// Page status; "archived" pages may be filtered out.
    status: String,
    /// Confidence score from the index; defaults to 0.5 when absent.
    confidence: f64,
    summary: String,
    /// Frontmatter-stripped markdown body; omitted from JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    body: Option<String>,
}
85
86pub fn export(engine: &EngineState, options: &ExportOptions) -> Result<ExportReport> {
90 let space = engine.space(&options.wiki)?;
91 let wiki_root = &space.wiki_root;
92
93 let resolved_path = resolve_path(options.path.as_deref(), wiki_root);
94
95 let searcher = space.index_manager.searcher()?;
96 let is = &space.index_schema;
97
98 let pages = collect_pages(&searcher, is, &options.wiki, options.include_archived)?;
99
100 let need_bodies = matches!(options.format, ExportFormat::LlmsFull | ExportFormat::Json);
101 let pages = if need_bodies {
102 load_bodies(pages, wiki_root)?
103 } else {
104 pages
105 };
106
107 let content = match options.format {
108 ExportFormat::LlmsTxt => render_llms_txt(&pages, &options.wiki),
109 ExportFormat::LlmsFull => render_llms_full(&pages, &options.wiki),
110 ExportFormat::Json => {
111 serde_json::to_string_pretty(&pages).context("failed to serialize pages to JSON")?
112 }
113 };
114
115 if let Some(parent) = resolved_path.parent()
116 && !parent.as_os_str().is_empty()
117 {
118 std::fs::create_dir_all(parent)
119 .with_context(|| format!("failed to create directory {}", parent.display()))?;
120 }
121 std::fs::write(&resolved_path, &content)
122 .with_context(|| format!("failed to write export to {}", resolved_path.display()))?;
123
124 Ok(ExportReport {
125 path: resolved_path.to_string_lossy().to_string(),
126 pages_written: pages.len(),
127 bytes: content.len(),
128 format: options.format.as_str().to_string(),
129 })
130}
131
/// Resolve the export destination: an absolute `path` is used as-is, a
/// relative one is joined onto `wiki_root`, and `None` defaults to
/// `llms.txt` in the wiki root.
fn resolve_path(path: Option<&str>, wiki_root: &Path) -> PathBuf {
    let requested = PathBuf::from(path.unwrap_or("llms.txt"));
    if requested.is_absolute() {
        return requested;
    }
    wiki_root.join(requested)
}
143
144fn collect_pages(
145 searcher: &tantivy::Searcher,
146 is: &IndexSchema,
147 wiki_name: &str,
148 include_archived: bool,
149) -> Result<Vec<PageEntry>> {
150 let f_slug = is.field("slug");
151 let f_title = is.field("title");
152 let f_type = is.field("type");
153 let f_status = is.field("status");
154 let f_confidence = is.try_field("confidence");
155 let f_summary = is.try_field("summary");
156
157 let top_docs = searcher.search(&AllQuery, &TopDocs::with_limit(100_000).order_by_score())?;
158
159 let mut pages = Vec::new();
160 for (_score, doc_addr) in &top_docs {
161 let doc: tantivy::TantivyDocument = searcher.doc(*doc_addr)?;
162
163 let slug = doc
164 .get_first(f_slug)
165 .and_then(|v| v.as_str())
166 .unwrap_or("")
167 .to_string();
168 if slug.is_empty() {
169 continue;
170 }
171
172 let status = doc
173 .get_first(f_status)
174 .and_then(|v| v.as_str())
175 .unwrap_or("")
176 .to_string();
177
178 if !include_archived && status == "archived" {
179 continue;
180 }
181
182 let title = doc
183 .get_first(f_title)
184 .and_then(|v| v.as_str())
185 .unwrap_or("")
186 .to_string();
187 let page_type = doc
188 .get_first(f_type)
189 .and_then(|v| v.as_str())
190 .unwrap_or("")
191 .to_string();
192 let confidence = f_confidence
193 .and_then(|f| doc.get_first(f))
194 .and_then(|v| v.as_f64())
195 .unwrap_or(0.5);
196 let summary = f_summary
197 .and_then(|f| doc.get_first(f))
198 .and_then(|v| v.as_str())
199 .filter(|s| !s.is_empty())
200 .unwrap_or("")
201 .to_string();
202
203 let uri = format!("wiki://{wiki_name}/{slug}");
204
205 pages.push(PageEntry {
206 slug,
207 uri,
208 title,
209 r#type: page_type,
210 status,
211 confidence,
212 summary,
213 body: None,
214 });
215 }
216
217 let mut type_counts: HashMap<String, usize> = HashMap::new();
219 for p in &pages {
220 *type_counts.entry(p.r#type.clone()).or_insert(0) += 1;
221 }
222 pages.sort_by(|a, b| {
223 let ca = type_counts.get(&a.r#type).copied().unwrap_or(0);
224 let cb = type_counts.get(&b.r#type).copied().unwrap_or(0);
225 cb.cmp(&ca)
226 .then(a.r#type.cmp(&b.r#type))
227 .then(
228 b.confidence
229 .partial_cmp(&a.confidence)
230 .unwrap_or(std::cmp::Ordering::Equal),
231 )
232 .then(a.title.cmp(&b.title))
233 });
234
235 Ok(pages)
236}
237
238fn load_bodies(mut pages: Vec<PageEntry>, wiki_root: &Path) -> Result<Vec<PageEntry>> {
239 for page in &mut pages {
240 let path = wiki_root.join(format!("{}.md", page.slug));
241 if path.exists() {
242 let raw = std::fs::read_to_string(&path)
243 .with_context(|| format!("failed to read {}", path.display()))?;
244 let body = strip_frontmatter(&raw);
246 page.body = Some(body.to_string());
247 }
248 }
249 Ok(pages)
250}
251
/// Strip a leading frontmatter block (`---` ... `---`) from `content`.
///
/// Returns the text after the closing delimiter line (and its trailing
/// newline, if any), or the input unchanged when no well-formed
/// frontmatter is present. The closing fence must be a *whole* line of
/// exactly `---` followed by a newline or end of input; a naive
/// `find("\n---")` would also match body lines that merely start with
/// `---` (e.g. `----` or `---foo`) and truncate them mid-line.
fn strip_frontmatter(content: &str) -> &str {
    if !content.starts_with("---") {
        return content;
    }
    let mut from = 3;
    while let Some(pos) = content[from..].find("\n---") {
        // Index just past the candidate closing `---`.
        let end = from + pos + 4;
        let rest = &content[end..];
        if rest.is_empty() {
            // Closing fence at end of input: everything was frontmatter.
            return rest;
        }
        if let Some(body) = rest.strip_prefix('\n') {
            return body;
        }
        // Not a bare `---` line (e.g. `----`, `---foo`): keep scanning.
        from += pos + 1;
    }
    // No closing delimiter; treat input as having no frontmatter.
    content
}
270
271fn render_llms_txt(pages: &[PageEntry], wiki_name: &str) -> String {
274 let mut out = format!("# {wiki_name}\n\n");
275 out.push_str(&format!("{} pages\n\n", pages.len()));
276
277 let mut current_type = "";
278 for page in pages {
279 if page.r#type != current_type {
280 current_type = &page.r#type;
281 let count = pages.iter().filter(|p| p.r#type == current_type).count();
282 out.push_str(&format!("## {} ({})\n\n", current_type, count));
283 }
284 if page.summary.is_empty() {
285 out.push_str(&format!("- [{}]({})\n", page.title, page.uri));
286 } else {
287 out.push_str(&format!(
288 "- [{}]({}): {}\n",
289 page.title, page.uri, page.summary
290 ));
291 }
292 }
293 out
294}
295
296fn render_llms_full(pages: &[PageEntry], wiki_name: &str) -> String {
297 let mut out = format!("# {wiki_name}\n\n");
298 out.push_str(&format!("{} pages\n\n", pages.len()));
299
300 for page in pages {
301 out.push_str("---\n\n");
302 out.push_str(&format!("# [{}]({})\n\n", page.title, page.uri));
303 if !page.summary.is_empty() {
304 out.push_str(&format!("_{}_\n\n", page.summary));
305 }
306 if let Some(ref body) = page.body {
307 out.push_str(body.trim());
308 out.push_str("\n\n");
309 }
310 }
311 out
312}