Skip to main content

pebble_cms/cli/
import_wordpress.rs

1use crate::models::{ContentStatus, ContentType, CreateContent};
2use crate::services::{content, html_to_markdown};
3use crate::Config;
4use anyhow::Result;
5use quick_xml::events::Event;
6use quick_xml::Reader;
7use std::path::Path;
8
/// One `<item>` element extracted from a WordPress WXR export.
///
/// All values are stored as the raw strings found in the export; mapping them to
/// the CMS's own content type and status happens in [`run`].
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct WxrItem {
    /// Text of the item's `<title>` element.
    title: String,
    /// Text of `<wp:post_name>`; may be empty, in which case a slug is generated
    /// from the title at import time.
    slug: String,
    /// Text of `<content:encoded>`, i.e. the post body as HTML.
    content_html: String,
    /// Text of `<wp:status>` (for example `publish` or `draft`).
    status: String,
    /// Text of `<wp:post_type>` (for example `post` or `page`).
    post_type: String,
    /// Text of `<wp:post_date>`, if the element was present.
    // Parsed from the export but not consumed by the importer yet, so the lint is
    // silenced for this field only instead of for the whole struct.
    #[allow(dead_code)]
    published_at: Option<String>,
    /// Text of every `<category domain="post_tag">` child.
    tags: Vec<String>,
}
19
20pub async fn run(config_path: &Path, file: &Path, overwrite: bool) -> Result<()> {
21    let config = Config::load(config_path)?;
22    let db = crate::Database::open(&config.database.path)?;
23    db.migrate()?;
24
25    if !file.exists() {
26        anyhow::bail!("WordPress export file not found: {}", file.display());
27    }
28
29    let xml_content = std::fs::read_to_string(file)?;
30    let items = parse_wxr(&xml_content)?;
31
32    tracing::info!(
33        "Found {} items in WordPress export",
34        items.len()
35    );
36
37    let mut posts_imported = 0;
38    let mut pages_imported = 0;
39    let mut skipped = 0;
40
41    for item in items {
42        let content_type = match item.post_type.as_str() {
43            "post" => ContentType::Post,
44            "page" => ContentType::Page,
45            _ => {
46                skipped += 1;
47                continue;
48            }
49        };
50
51        let status = match item.status.as_str() {
52            "publish" => ContentStatus::Published,
53            "draft" => ContentStatus::Draft,
54            "private" => ContentStatus::Draft,
55            _ => ContentStatus::Draft,
56        };
57
58        let markdown = html_to_markdown::convert(&item.content_html);
59
60        let slug = if item.slug.is_empty() {
61            crate::services::slug::generate_slug(&item.title)
62        } else {
63            item.slug.clone()
64        };
65
66        // Check for existing content
67        if let Ok(Some(_)) = content::get_content_by_slug(&db, &slug) {
68            if !overwrite {
69                tracing::info!("Skipping existing: {}", slug);
70                skipped += 1;
71                continue;
72            }
73            // Delete existing for overwrite
74            let conn = db.get()?;
75            let _ = conn.execute("DELETE FROM content WHERE slug = ?", [&slug]);
76        }
77
78        let input = CreateContent {
79            title: item.title,
80            slug: Some(slug.clone()),
81            content_type: content_type.clone(),
82            body_markdown: markdown,
83            status,
84            scheduled_at: None,
85            excerpt: None,
86            featured_image: None,
87            tags: item.tags,
88            metadata: None,
89        };
90
91        match content::create_content(&db, input, None, config.content.excerpt_length) {
92            Ok(_) => {
93                match content_type {
94                    ContentType::Post => posts_imported += 1,
95                    ContentType::Page => pages_imported += 1,
96                    _ => {}
97                }
98                tracing::info!("Imported: {} ({})", slug, content_type);
99            }
100            Err(e) => {
101                tracing::warn!("Failed to import {}: {}", slug, e);
102                skipped += 1;
103            }
104        }
105    }
106
107    tracing::info!(
108        "WordPress import complete: {} posts, {} pages imported, {} skipped",
109        posts_imported,
110        pages_imported,
111        skipped
112    );
113    Ok(())
114}
115
116fn parse_wxr(xml: &str) -> Result<Vec<WxrItem>> {
117    let mut reader = Reader::from_str(xml);
118    reader.config_mut().trim_text(true);
119
120    let mut items = Vec::new();
121    let mut buf = Vec::new();
122
123    // State tracking
124    let mut in_item = false;
125    let mut current_tag = String::new();
126    let mut title = String::new();
127    let mut slug = String::new();
128    let mut content_html = String::new();
129    let mut status = String::new();
130    let mut post_type = String::new();
131    let mut published_at = Option::<String>::None;
132    let mut tags: Vec<String> = Vec::new();
133    let mut _in_content_encoded = false;
134
135    loop {
136        match reader.read_event_into(&mut buf) {
137            Ok(Event::Start(ref e)) => {
138                let local_name = e.local_name();
139                let tag_name = std::str::from_utf8(local_name.as_ref()).unwrap_or("");
140
141                if tag_name == "item" {
142                    in_item = true;
143                    title.clear();
144                    slug.clear();
145                    content_html.clear();
146                    status.clear();
147                    post_type.clear();
148                    published_at = None;
149                    tags.clear();
150                } else if in_item {
151                    current_tag = tag_name.to_string();
152
153                    // Check for wp:post_name, wp:status, wp:post_type, wp:post_date
154                    // quick-xml handles namespaced elements; the local name strips prefix
155                    let qname = e.name();
156                    let full_name = std::str::from_utf8(qname.as_ref()).unwrap_or("");
157                    if full_name.contains("post_name") {
158                        current_tag = "wp:post_name".to_string();
159                    } else if full_name.contains("status") && full_name.contains("wp") {
160                        current_tag = "wp:status".to_string();
161                    } else if full_name.contains("post_type") {
162                        current_tag = "wp:post_type".to_string();
163                    } else if full_name.contains("post_date") && !full_name.contains("gmt") {
164                        current_tag = "wp:post_date".to_string();
165                    } else if full_name.contains("encoded") {
166                        _in_content_encoded = true;
167                        current_tag = "content:encoded".to_string();
168                    }
169
170                    // Check for tag categories
171                    if tag_name == "category" {
172                        let domain = e.attributes()
173                            .filter_map(|a| a.ok())
174                            .find(|a| a.key.as_ref() == b"domain")
175                            .and_then(|a| String::from_utf8(a.value.to_vec()).ok());
176                        if domain.as_deref() == Some("post_tag") {
177                            current_tag = "post_tag".to_string();
178                        }
179                    }
180                }
181            }
182            Ok(Event::CData(ref e)) => {
183                if in_item {
184                    let text = std::str::from_utf8(e.as_ref()).unwrap_or("");
185                    match current_tag.as_str() {
186                        "content:encoded" => content_html.push_str(text),
187                        "title" => title.push_str(text),
188                        _ => {}
189                    }
190                }
191            }
192            Ok(Event::Text(ref e)) => {
193                if in_item {
194                    let text = e.unescape().unwrap_or_default();
195                    match current_tag.as_str() {
196                        "title" => title.push_str(&text),
197                        "wp:post_name" => slug.push_str(&text),
198                        "content:encoded" => content_html.push_str(&text),
199                        "wp:status" => status.push_str(&text),
200                        "wp:post_type" => post_type.push_str(&text),
201                        "wp:post_date" => published_at = Some(text.to_string()),
202                        "post_tag" => {
203                            let tag = text.trim().to_string();
204                            if !tag.is_empty() {
205                                tags.push(tag);
206                            }
207                        }
208                        _ => {}
209                    }
210                }
211            }
212            Ok(Event::End(ref e)) => {
213                let local_name = e.local_name();
214                let tag_name = std::str::from_utf8(local_name.as_ref()).unwrap_or("");
215
216                if tag_name == "item" && in_item {
217                    if !title.is_empty() {
218                        items.push(WxrItem {
219                            title: title.clone(),
220                            slug: slug.clone(),
221                            content_html: content_html.clone(),
222                            status: status.clone(),
223                            post_type: post_type.clone(),
224                            published_at: published_at.clone(),
225                            tags: tags.clone(),
226                        });
227                    }
228                    in_item = false;
229                }
230
231                let end_qname = e.name();
232                let full_name = std::str::from_utf8(end_qname.as_ref()).unwrap_or("");
233                if full_name.contains("encoded") {
234                    _in_content_encoded = false;
235                }
236                current_tag.clear();
237            }
238            Ok(Event::Eof) => break,
239            Err(e) => {
240                tracing::warn!("XML parsing error: {}", e);
241                break;
242            }
243            _ => {}
244        }
245        buf.clear();
246    }
247
248    Ok(items)
249}