infigraph_confluence/
sync.rs

1use std::collections::{HashSet, VecDeque};
2use std::path::Path;
3
4use anyhow::{Context, Result};
5use sha2::{Digest, Sha256};
6
7use infigraph_docs::chunk::{chunk_document, Chunk, ChunkStrategy};
8use infigraph_docs::extract::{DocFormat, ExtractedDoc};
9use infigraph_docs::store::DocStore;
10
11use crate::client::{ConfluenceClient, ConfluencePage};
12
/// Sync state persisted between runs.
///
/// `ConfluenceSync::sync_with_options` loads it from
/// `<root>/.infigraph/confluence_sync.json` at the start of a run and writes a fresh
/// one at the end.
#[derive(Debug, serde::Serialize, serde::Deserialize)]
pub struct SyncCursor {
    /// RFC 3339 UTC timestamp recorded by the run that wrote this cursor. It is passed
    /// to the client as the "modified since" bound on a later space-wide sync.
    pub last_synced: String,
    /// Source identifier of the sync that wrote the cursor (`confluence::<space_key>`).
    pub source_id: String,
    /// Confluence space key of the sync that wrote the cursor.
    pub space_key: String,
    /// Base URL of the Confluence client that wrote the cursor.
    pub base_url: String,
    /// The explicitly requested page ids when the run was restricted to specific pages,
    /// otherwise the ids of every page fetched in that run.
    pub page_ids: Vec<String>,
}
21
/// Controls whether and how far a sync follows links found on the seed pages.
///
/// Note that the derived `Default` has every flag `false` and every limit `0`, which
/// differs from [`CrawlOptions::no_follow`] in `same_space_only`.
#[derive(Debug, Clone, Default)]
pub struct CrawlOptions {
    /// When `false`, only the seed pages are synced and no links are followed.
    pub follow_links: bool,
    /// Maximum link distance from a seed page (seeds are depth 0). Links found on a
    /// page at this depth or deeper are recorded but not followed.
    pub follow_depth: usize,
    /// The crawl stops once this many pages (seeds included) have been collected.
    pub max_pages: usize,
    /// When `true`, links classified as `"external"` are never queued for crawling.
    pub same_space_only: bool,
}

impl CrawlOptions {
    /// Preset that follows links one hop from the seeds, capped at 100 pages,
    /// skipping external links.
    pub fn default_follow() -> Self {
        Self {
            follow_links: true,
            follow_depth: 1,
            max_pages: 100,
            ..Self::no_follow()
        }
    }

    /// Preset that disables crawling: no link following, zero depth, zero page budget.
    pub fn no_follow() -> Self {
        // `Default` zeroes everything; only `same_space_only` differs from it.
        Self {
            same_space_only: true,
            ..Self::default()
        }
    }
}
49
/// Counters describing what a single sync run did.
#[derive(Debug)]
pub struct SyncResult {
    /// Number of pages retrieved (seed pages plus any pages reached by crawling).
    pub pages_fetched: usize,
    /// Number of pages that produced a document, i.e. whose parsed content was non-empty.
    pub pages_indexed: usize,
    /// Number of stored documents removed because their page no longer exists remotely.
    /// Always 0 when the sync was restricted to explicit page ids.
    pub pages_deleted: usize,
    /// Number of chunks produced from the indexed pages.
    pub chunks_created: usize,
    /// Number of `LINKS_TO` edges created through `DocStore::create_link`.
    pub links_created: usize,
}
58
/// Syncs the pages of one Confluence space into a `DocStore`.
pub struct ConfluenceSync {
    /// Client used for every remote call (page fetches, id listings, base URL).
    client: ConfluenceClient,
    /// Key of the space being synced; also embedded in document ids
    /// (`confluence://<space_key>/<page_id>`).
    space_key: String,
    /// Identifier under which documents are registered in the store
    /// (`confluence::<space_key>`).
    source_id: String,
}
64
/// Result of converting one page body with `parse_confluence_html`.
#[derive(Debug, Clone)]
struct ParsedPage {
    /// Markdown-style text produced by the parser; empty when the page has no usable body.
    content: String,
    /// Links collected while parsing, in the order they were encountered.
    links: Vec<PageLink>,
}
70
/// A link discovered in a page body.
#[derive(Debug, Clone)]
struct PageLink {
    /// Target page id, when one is known. Only links carrying an id are crawled or
    /// turned into `LINKS_TO` edges.
    page_id: Option<String>,
    /// Link target. For `<a>` tags this comes from `classify_link`; for the `jira`
    /// macro it is the issue key.
    url: String,
    /// Link category as a string. Values compared against in this file are
    /// `"confluence_page"`, `"jira"` and `"external"`.
    /// NOTE(review): the full set is decided by `classify_link`, which is defined elsewhere.
    link_type: String,
}
77
78impl ConfluenceSync {
79    pub fn new(client: ConfluenceClient, space_key: &str) -> Self {
80        let source_id = format!("confluence::{}", space_key);
81        Self {
82            client,
83            space_key: space_key.to_string(),
84            source_id,
85        }
86    }
87
88    pub fn sync(
89        &self,
90        store: &DocStore,
91        root: &Path,
92        page_ids: Option<&[String]>,
93    ) -> Result<SyncResult> {
94        self.sync_with_options(store, root, page_ids, &CrawlOptions::no_follow())
95    }
96
97    pub fn sync_with_options(
98        &self,
99        store: &DocStore,
100        root: &Path,
101        page_ids: Option<&[String]>,
102        crawl: &CrawlOptions,
103    ) -> Result<SyncResult> {
104        let cursor_path = root.join(".infigraph").join("confluence_sync.json");
105        let cursor = load_cursor(&cursor_path);
106
107        store.upsert_source(
108            &self.source_id,
109            "confluence",
110            self.client.base_url(),
111            &self.space_key,
112        )?;
113
114        let seed_pages = self.fetch_pages(page_ids, cursor.as_ref())?;
115
116        let (all_pages, link_map) = if crawl.follow_links {
117            self.crawl_links(&seed_pages, crawl)?
118        } else {
119            let link_map: Vec<(String, Vec<PageLink>)> = Vec::new();
120            (seed_pages, link_map)
121        };
122
123        let fetched = all_pages.len();
124        let (docs, all_chunks, page_links) = self.convert_pages(&all_pages);
125        let indexed = docs.len();
126        let chunks_created = all_chunks.len();
127
128        if !docs.is_empty() {
129            let doc_refs: Vec<&ExtractedDoc> = docs.iter().collect();
130            let chunk_refs: Vec<&Chunk> = all_chunks.iter().collect();
131            store.upsert_all_parquet(&doc_refs, &chunk_refs)?;
132
133            for doc in &docs {
134                store.link_doc_to_source(&doc.file, &self.source_id)?;
135            }
136        }
137
138        if !all_chunks.is_empty() {
139            let chunk_refs: Vec<&Chunk> = all_chunks.iter().collect();
140            let changed_files: Vec<&str> = docs.iter().map(|d| d.file.as_str()).collect();
141            infigraph_docs::embed::update_doc_embeddings(store, root, &chunk_refs, &changed_files)?;
142        }
143
144        // Create LINKS_TO edges
145        let mut links_created = 0;
146        let all_link_data: Vec<(String, Vec<PageLink>)> =
147            page_links.into_iter().chain(link_map).collect();
148
149        let indexed_ids: HashSet<&str> = docs.iter().map(|d| d.file.as_str()).collect();
150
151        for (from_file_id, links) in &all_link_data {
152            if !indexed_ids.contains(from_file_id.as_str()) {
153                continue;
154            }
155            store.delete_links_from(from_file_id)?;
156            for link in links {
157                if let Some(ref pid) = link.page_id {
158                    let to_file_id = format!("confluence://{}/{}", self.space_key, pid);
159                    if indexed_ids.contains(to_file_id.as_str()) {
160                        store.create_link(from_file_id, &to_file_id, &link.url, &link.link_type)?;
161                        links_created += 1;
162                    }
163                }
164            }
165        }
166
167        let deleted = self.remove_deleted_pages(store, page_ids)?;
168
169        let remote_ids: Vec<String> = all_pages.iter().map(|p| p.id.clone()).collect();
170        save_cursor(
171            &cursor_path,
172            &SyncCursor {
173                last_synced: chrono::Utc::now().to_rfc3339(),
174                source_id: self.source_id.clone(),
175                space_key: self.space_key.clone(),
176                base_url: self.client.base_url().to_string(),
177                page_ids: if let Some(ids) = page_ids {
178                    ids.to_vec()
179                } else {
180                    remote_ids
181                },
182            },
183        )?;
184
185        Ok(SyncResult {
186            pages_fetched: fetched,
187            pages_indexed: indexed,
188            pages_deleted: deleted,
189            chunks_created,
190            links_created,
191        })
192    }
193
194    #[allow(clippy::type_complexity)]
195    fn crawl_links(
196        &self,
197        seed_pages: &[ConfluencePage],
198        crawl: &CrawlOptions,
199    ) -> Result<(Vec<ConfluencePage>, Vec<(String, Vec<PageLink>)>)> {
200        let mut visited: HashSet<String> = HashSet::new();
201        let mut queue: VecDeque<(String, usize)> = VecDeque::new();
202        let mut all_pages: Vec<ConfluencePage> = Vec::new();
203        let mut all_links: Vec<(String, Vec<PageLink>)> = Vec::new();
204
205        for page in seed_pages {
206            visited.insert(page.id.clone());
207            queue.push_back((page.id.clone(), 0));
208            all_pages.push(page.clone());
209        }
210
211        while let Some((page_id, depth)) = queue.pop_front() {
212            if all_pages.len() >= crawl.max_pages {
213                eprintln!("Crawl: hit max_pages cap ({}), stopping", crawl.max_pages);
214                break;
215            }
216
217            let page = if depth == 0 {
218                all_pages.iter().find(|p| p.id == page_id).cloned()
219            } else {
220                match self.client.get_page(&page_id) {
221                    Ok(p) => {
222                        all_pages.push(p.clone());
223                        Some(p)
224                    }
225                    Err(e) => {
226                        eprintln!("Crawl: failed to fetch page {}: {}", page_id, e);
227                        continue;
228                    }
229                }
230            };
231
232            let Some(page) = page else { continue };
233            let parsed = parse_confluence_html(&page);
234            let file_id = format!("confluence://{}/{}", self.space_key, page.id);
235            all_links.push((file_id, parsed.links.clone()));
236
237            if depth >= crawl.follow_depth {
238                continue;
239            }
240
241            for link in &parsed.links {
242                if let Some(ref linked_id) = link.page_id {
243                    if visited.contains(linked_id) {
244                        continue;
245                    }
246                    if crawl.same_space_only && link.link_type == "external" {
247                        continue;
248                    }
249                    visited.insert(linked_id.clone());
250                    queue.push_back((linked_id.clone(), depth + 1));
251                    eprintln!("Crawl: queued page {} (depth {})", linked_id, depth + 1);
252                }
253            }
254        }
255
256        Ok((all_pages, all_links))
257    }
258
259    fn fetch_pages(
260        &self,
261        page_ids: Option<&[String]>,
262        cursor: Option<&SyncCursor>,
263    ) -> Result<Vec<ConfluencePage>> {
264        if let Some(ids) = page_ids {
265            let mut pages = Vec::new();
266            for id in ids {
267                match self.client.get_page(id) {
268                    Ok(page) => pages.push(page),
269                    Err(e) => eprintln!("Warning: failed to fetch page {}: {}", id, e),
270                }
271            }
272            return Ok(pages);
273        }
274
275        if let Some(c) = cursor {
276            let pages =
277                self.client
278                    .get_pages_modified_since(&self.space_key, &c.last_synced, 1000)?;
279            if !pages.is_empty() {
280                return Ok(pages);
281            }
282        }
283
284        self.client.get_pages_in_space(&self.space_key, 1000)
285    }
286
287    #[allow(clippy::type_complexity)]
288    fn convert_pages(
289        &self,
290        pages: &[ConfluencePage],
291    ) -> (Vec<ExtractedDoc>, Vec<Chunk>, Vec<(String, Vec<PageLink>)>) {
292        let mut docs = Vec::new();
293        let mut all_chunks = Vec::new();
294        let mut page_links = Vec::new();
295
296        for page in pages {
297            let parsed = parse_confluence_html(page);
298            if parsed.content.is_empty() {
299                continue;
300            }
301
302            let file_id = format!("confluence://{}/{}", self.space_key, page.id);
303            let hash = {
304                let mut h = Sha256::new();
305                h.update(parsed.content.as_bytes());
306                format!("{:x}", h.finalize())
307            };
308
309            let doc = ExtractedDoc {
310                file: file_id.clone(),
311                title: Some(page.title.clone()),
312                content_hash: hash.clone(),
313                format: DocFormat::Markdown,
314                text: parsed.content,
315                page_count: Some(1),
316            };
317
318            let chunks = chunk_document(&doc, &file_id, &hash, ChunkStrategy::HeadingBounded);
319            all_chunks.extend(chunks);
320            page_links.push((file_id, parsed.links));
321            docs.push(doc);
322        }
323
324        (docs, all_chunks, page_links)
325    }
326
327    fn remove_deleted_pages(&self, store: &DocStore, page_ids: Option<&[String]>) -> Result<usize> {
328        if page_ids.is_some() {
329            return Ok(0);
330        }
331
332        let remote_ids = self.client.get_all_page_ids_in_space(&self.space_key)?;
333        let remote_set: HashSet<String> = remote_ids.into_iter().collect();
334
335        let existing_docs = store.get_docs_by_source(&self.source_id)?;
336        let mut to_delete = Vec::new();
337
338        for doc_id in &existing_docs {
339            if let Some(page_id) = doc_id.strip_prefix(&format!("confluence://{}/", self.space_key))
340            {
341                if !remote_set.contains(page_id) {
342                    to_delete.push(doc_id.as_str());
343                }
344            }
345        }
346
347        let count = to_delete.len();
348        if !to_delete.is_empty() {
349            store.delete_docs_by_ids(&to_delete)?;
350        }
351        Ok(count)
352    }
353}
354
355// ---------------------------------------------------------------------------
356// Confluence HTML parser — preserves all content types
357// ---------------------------------------------------------------------------
358
359fn parse_confluence_html(page: &ConfluencePage) -> ParsedPage {
360    let html = if let Some(body) = &page.body {
361        if let Some(view) = &body.view {
362            if !view.value.is_empty() {
363                &view.value
364            } else if let Some(storage) = &body.storage {
365                &storage.value
366            } else {
367                return ParsedPage {
368                    content: String::new(),
369                    links: Vec::new(),
370                };
371            }
372        } else if let Some(storage) = &body.storage {
373            &storage.value
374        } else {
375            return ParsedPage {
376                content: String::new(),
377                links: Vec::new(),
378            };
379        }
380    } else {
381        return ParsedPage {
382            content: String::new(),
383            links: Vec::new(),
384        };
385    };
386
387    let mut parser = HtmlParser::new(html);
388    parser.parse();
389    let links = std::mem::take(&mut parser.links);
390    ParsedPage {
391        content: parser.finish(),
392        links,
393    }
394}
395
/// Single-pass converter from Confluence HTML / storage markup to Markdown-style text.
struct HtmlParser<'a> {
    /// Markup being parsed.
    input: &'a str,
    /// Byte offset into `input` of the next unread character.
    pos: usize,
    /// Main output buffer.
    out: String,
    /// Links collected so far, in document order.
    links: Vec<PageLink>,
    /// Set between `<table>` and `</table>`; while set, plain text goes to `current_cell`.
    in_table: bool,
    /// Completed rows of the table currently being collected.
    table_rows: Vec<Vec<String>>,
    /// Cells of the row currently being collected.
    current_row: Vec<String>,
    /// Text of the cell currently being collected.
    current_cell: String,
    /// Set by `<thead>`/`<th>` and cleared by `<tbody>`/`</thead>`.
    /// NOTE(review): never read in the visible code.
    in_header_row: bool,
    /// Current nesting depth of `<ul>`/`<ol>` lists.
    list_depth: usize,
    /// One running item counter per open `<ol>`.
    ordered_list_counters: Vec<usize>,
    /// Set inside `<pre><code>` or a `code`/`noformat` macro; text goes to `code_content`.
    in_code_block: bool,
    /// Language tag emitted on the opening fence of the current code block.
    code_language: String,
    /// Text buffered for the current code block.
    code_content: String,
    /// Set between `<pre>` and `</pre>`.
    in_pre: bool,
    /// Lower-cased name of the macro currently open (empty when none).
    macro_name: String,
    /// Set while a macro element (`div`/`ac:structured-macro` carrying a macro name) is open.
    in_macro: bool,
}
415
416impl<'a> HtmlParser<'a> {
    /// Creates a parser positioned at the start of `input` with all output buffers
    /// empty and every state flag cleared.
    fn new(input: &'a str) -> Self {
        Self {
            input,
            pos: 0,
            out: String::new(),
            links: Vec::new(),
            in_table: false,
            table_rows: Vec::new(),
            current_row: Vec::new(),
            current_cell: String::new(),
            in_header_row: false,
            list_depth: 0,
            ordered_list_counters: Vec::new(),
            in_code_block: false,
            code_language: String::new(),
            code_content: String::new(),
            in_pre: false,
            macro_name: String::new(),
            in_macro: false,
        }
    }
438
439    fn parse(&mut self) {
440        while self.pos < self.input.len() {
441            if self.input[self.pos..].starts_with('<') {
442                self.parse_tag();
443            } else if self.input[self.pos..].starts_with('&') {
444                self.parse_entity();
445            } else {
446                let ch = self.input[self.pos..].chars().next().unwrap();
447                if self.in_code_block || self.in_pre {
448                    self.code_content.push(ch);
449                } else if self.in_table {
450                    self.current_cell.push(ch);
451                } else {
452                    self.out.push(ch);
453                }
454                self.pos += ch.len_utf8();
455            }
456        }
457    }
458
459    fn parse_entity(&mut self) {
460        let rest = &self.input[self.pos..];
461        let end = rest.find(';').unwrap_or(0);
462        if end == 0 {
463            self.push_char('&');
464            self.pos += 1;
465            return;
466        }
467        let entity = &rest[..end + 1];
468        let decoded = match entity {
469            "&amp;" => "&",
470            "&lt;" => "<",
471            "&gt;" => ">",
472            "&quot;" => "\"",
473            "&#39;" | "&apos;" => "'",
474            "&nbsp;" => " ",
475            "&ndash;" => "–",
476            "&mdash;" => "—",
477            "&hellip;" => "…",
478            "&rarr;" => "→",
479            "&larr;" => "←",
480            "&times;" => "×",
481            "&bull;" => "•",
482            _ => {
483                if entity.starts_with("&#x") {
484                    let hex = &entity[3..entity.len() - 1];
485                    if let Ok(n) = u32::from_str_radix(hex, 16) {
486                        if let Some(ch) = char::from_u32(n) {
487                            self.push_char(ch);
488                            self.pos += entity.len();
489                            return;
490                        }
491                    }
492                } else if entity.starts_with("&#") {
493                    let num = &entity[2..entity.len() - 1];
494                    if let Ok(n) = num.parse::<u32>() {
495                        if let Some(ch) = char::from_u32(n) {
496                            self.push_char(ch);
497                            self.pos += entity.len();
498                            return;
499                        }
500                    }
501                }
502                self.push_str(entity);
503                self.pos += entity.len();
504                return;
505            }
506        };
507        self.push_str(decoded);
508        self.pos += entity.len();
509    }
510
511    fn push_char(&mut self, ch: char) {
512        if self.in_code_block || self.in_pre {
513            self.code_content.push(ch);
514        } else if self.in_table {
515            self.current_cell.push(ch);
516        } else {
517            self.out.push(ch);
518        }
519    }
520
521    fn push_str(&mut self, s: &str) {
522        if self.in_code_block || self.in_pre {
523            self.code_content.push_str(s);
524        } else if self.in_table {
525            self.current_cell.push_str(s);
526        } else {
527            self.out.push_str(s);
528        }
529    }
530
531    fn parse_tag(&mut self) {
532        let rest = &self.input[self.pos..];
533        let end = match rest.find('>') {
534            Some(e) => e,
535            None => {
536                self.pos = self.input.len();
537                return;
538            }
539        };
540        let tag_content = &rest[1..end];
541        self.pos += end + 1;
542
543        let is_closing = tag_content.starts_with('/');
544        let tag_str = if is_closing {
545            &tag_content[1..]
546        } else {
547            tag_content
548        };
549
550        let (tag_name, attrs) = split_tag(tag_str);
551        let tag_lower = tag_name.to_lowercase();
552
553        if is_closing {
554            self.handle_close_tag(&tag_lower);
555        } else {
556            self.handle_open_tag(&tag_lower, attrs);
557        }
558    }
559
    /// Handles an opening (or self-closing) tag.
    ///
    /// `tag` is the lower-cased tag name and `attrs` the raw attribute text. Depending
    /// on the tag this emits Markdown markers, updates parser state (tables, lists,
    /// code blocks, macros) and records links. Unknown tags are ignored, so their text
    /// content flows through unchanged.
    fn handle_open_tag(&mut self, tag: &str, attrs: &str) {
        match tag {
            "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
                self.ensure_newline();
                // The digit after `h` is the heading level, i.e. the number of `#`.
                let level: usize = tag[1..].parse().unwrap_or(1);
                for _ in 0..level {
                    self.out.push('#');
                }
                self.out.push(' ');
            }
            "p" => self.ensure_blank_line(),
            "br" => self.push_char('\n'),
            "a" => {
                // Anchors without `href` record no link and emit no `[`.
                // NOTE(review): `</a>` still closes with the most recently recorded
                // link, so an href-less anchor after a link emits a stray `](url)`.
                if let Some(href) = extract_attr(attrs, "href") {
                    let link = classify_link(&href);
                    self.links.push(link.clone());
                    match link.link_type.as_str() {
                        "confluence_page" => {
                            self.push_char('[');
                        }
                        "jira" => {
                            self.push_str("[JIRA: ");
                        }
                        _ => {
                            self.push_char('[');
                        }
                    }
                }
            }
            "img" => {
                let alt = extract_attr(attrs, "alt").unwrap_or_default();
                let src = extract_attr(attrs, "src").unwrap_or_default();
                if !alt.is_empty() || !src.is_empty() {
                    self.push_str(&format!("![{}]({})", alt, src));
                }
            }
            "ul" => {
                self.list_depth += 1;
                self.ensure_newline();
            }
            "ol" => {
                self.list_depth += 1;
                self.ordered_list_counters.push(0);
                self.ensure_newline();
            }
            "li" => {
                self.ensure_newline();
                // Two spaces of indentation per nesting level below the first.
                let indent = "  ".repeat(self.list_depth.saturating_sub(1));
                // NOTE(review): any open `<ol>` makes the item numbered, including
                // items of a `<ul>` nested inside that `<ol>`.
                if !self.ordered_list_counters.is_empty() {
                    if let Some(counter) = self.ordered_list_counters.last_mut() {
                        *counter += 1;
                        let prefix = format!("{}{}. ", indent, counter);
                        self.push_str(&prefix);
                    }
                } else {
                    let prefix = format!("{}- ", indent);
                    self.push_str(&prefix);
                }
            }
            "table" => {
                self.in_table = true;
                self.table_rows.clear();
                self.ensure_blank_line();
            }
            "thead" => {
                self.in_header_row = true;
            }
            "tbody" => {
                self.in_header_row = false;
            }
            "tr" => {
                self.current_row.clear();
                self.current_cell.clear();
            }
            "th" => {
                self.current_cell.clear();
                self.in_header_row = true;
            }
            "td" => {
                self.current_cell.clear();
            }
            "pre" => {
                self.in_pre = true;
                self.code_content.clear();
            }
            "code" => {
                // `<code>` inside `<pre>` is a fenced block; elsewhere it is inline code.
                if self.in_pre {
                    self.in_code_block = true;
                    self.code_language = extract_attr(attrs, "class")
                        .map(|c| c.replace("language-", "").replace("confluence-", ""))
                        .unwrap_or_default();
                    self.code_content.clear();
                } else {
                    self.push_char('`');
                }
            }
            "div" | "ac:structured-macro" => {
                // Only elements carrying a macro name are treated as macros.
                // NOTE(review): macro state is a single flag, so a nested `div` or
                // macro closing first ends the outer macro early.
                if let Some(name) = extract_attr(attrs, "ac:name")
                    .or_else(|| extract_attr(attrs, "data-macro-name"))
                {
                    self.in_macro = true;
                    self.macro_name = name.to_lowercase();
                    match self.macro_name.as_str() {
                        "info" | "note" | "warning" | "tip" => {
                            self.ensure_blank_line();
                            let label = self.macro_name.to_uppercase();
                            self.out.push_str(&format!("> **{}:** ", label));
                        }
                        "code" | "noformat" => {
                            self.in_code_block = true;
                            self.code_content.clear();
                            self.code_language =
                                extract_attr(attrs, "language").unwrap_or_default();
                        }
                        "expand" => {
                            self.ensure_blank_line();
                        }
                        "status" => {
                            let color = extract_attr(attrs, "colour")
                                .or_else(|| extract_attr(attrs, "color"))
                                .unwrap_or_default();
                            let title = extract_attr(attrs, "title").unwrap_or_default();
                            if !title.is_empty() {
                                self.push_str(&format!("[STATUS: {} ({})]", title, color));
                            }
                        }
                        "jira" => {
                            // Recorded as a link without a page id, so it is never
                            // crawled or turned into a `LINKS_TO` edge.
                            if let Some(key) = extract_attr(attrs, "key") {
                                self.push_str(&format!("[JIRA: {}]", key));
                                self.links.push(PageLink {
                                    page_id: None,
                                    url: key.to_string(),
                                    link_type: "jira".to_string(),
                                });
                            }
                        }
                        _ => {}
                    }
                }
            }
            // The wrapper itself emits nothing; its `ri:*` children below do.
            "ac:link" => {}
            "ri:user" => {
                if let Some(name) =
                    extract_attr(attrs, "ri:username").or_else(|| extract_attr(attrs, "ri:userkey"))
                {
                    self.push_str(&format!("@{}", name));
                }
            }
            "ri:page" => {
                if let Some(title) = extract_attr(attrs, "ri:content-title") {
                    self.push_str(&format!("[Page: {}]", title));
                }
            }
            "ri:attachment" => {
                if let Some(filename) = extract_attr(attrs, "ri:filename") {
                    self.push_str(&format!("[Attachment: {}]", filename));
                }
            }
            "hr" => {
                self.ensure_newline();
                self.out.push_str("---\n");
            }
            "strong" | "b" => self.push_str("**"),
            "em" | "i" => self.push_char('*'),
            "u" => self.push_str("__"),
            "s" | "del" | "strike" => self.push_str("~~"),
            "sup" => self.push_char('^'),
            "sub" => self.push_char('~'),
            "blockquote" => {
                self.ensure_newline();
                self.push_str("> ");
            }
            "time" => {
                if let Some(dt) = extract_attr(attrs, "datetime") {
                    self.push_str(&format!("[Date: {}]", dt));
                }
            }
            "ac:emoticon" => {
                if let Some(name) = extract_attr(attrs, "ac:name") {
                    self.push_str(&format!(":{}: ", name));
                }
            }
            _ => {}
        }
    }
745
746    fn handle_close_tag(&mut self, tag: &str) {
747        match tag {
748            "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
749                self.push_char('\n');
750            }
751            "p" => self.ensure_blank_line(),
752            "a" => {
753                if let Some(last_link) = self.links.last() {
754                    let url = last_link.url.clone();
755                    self.push_str(&format!("]({})", url));
756                }
757            }
758            "ul" => {
759                self.list_depth = self.list_depth.saturating_sub(1);
760                if self.list_depth == 0 {
761                    self.ensure_newline();
762                }
763            }
764            "ol" => {
765                self.list_depth = self.list_depth.saturating_sub(1);
766                self.ordered_list_counters.pop();
767                if self.list_depth == 0 {
768                    self.ensure_newline();
769                }
770            }
771            "li" => {}
772            "th" | "td" => {
773                let cell = self.current_cell.trim().replace('\n', " ").to_string();
774                self.current_row.push(cell);
775                self.current_cell.clear();
776            }
777            "tr" => {
778                if !self.current_row.is_empty() {
779                    self.table_rows.push(self.current_row.clone());
780                }
781                self.current_row.clear();
782            }
783            "thead" => {
784                self.in_header_row = false;
785            }
786            "table" => {
787                self.in_table = false;
788                self.render_table();
789            }
790            "code" => {
791                if self.in_code_block && self.in_pre {
792                    // handled by /pre
793                } else {
794                    self.push_char('`');
795                }
796            }
797            "pre" => {
798                self.in_pre = false;
799                if self.in_code_block || !self.code_content.is_empty() {
800                    self.in_code_block = false;
801                    self.ensure_blank_line();
802                    self.out.push_str(&format!("```{}\n", self.code_language));
803                    self.out.push_str(self.code_content.trim());
804                    self.out.push_str("\n```\n");
805                    self.code_content.clear();
806                    self.code_language.clear();
807                }
808            }
809            "div" | "ac:structured-macro" if self.in_macro => {
810                if self.in_code_block {
811                    self.in_code_block = false;
812                    self.ensure_blank_line();
813                    self.out.push_str(&format!("```{}\n", self.code_language));
814                    self.out.push_str(self.code_content.trim());
815                    self.out.push_str("\n```\n");
816                    self.code_content.clear();
817                    self.code_language.clear();
818                }
819                self.in_macro = false;
820                self.macro_name.clear();
821            }
822            "strong" | "b" => self.push_str("**"),
823            "em" | "i" => self.push_char('*'),
824            "u" => self.push_str("__"),
825            "s" | "del" | "strike" => self.push_str("~~"),
826            "sup" => self.push_char('^'),
827            "sub" => self.push_char('~'),
828            "blockquote" => self.ensure_newline(),
829            _ => {}
830        }
831    }
832
833    fn render_table(&mut self) {
834        if self.table_rows.is_empty() {
835            return;
836        }
837
838        let max_cols = self.table_rows.iter().map(|r| r.len()).max().unwrap_or(0);
839        if max_cols == 0 {
840            return;
841        }
842
843        let mut widths = vec![3usize; max_cols];
844        for row in &self.table_rows {
845            for (i, cell) in row.iter().enumerate() {
846                widths[i] = widths[i].max(cell.len());
847            }
848        }
849
850        self.ensure_blank_line();
851
852        if let Some(header) = self.table_rows.first() {
853            self.out.push('|');
854            for (i, cell) in header.iter().enumerate() {
855                let w = widths.get(i).copied().unwrap_or(3);
856                self.out.push_str(&format!(" {:width$} |", cell, width = w));
857            }
858            for i in header.len()..max_cols {
859                let w = widths.get(i).copied().unwrap_or(3);
860                self.out.push_str(&format!(" {:width$} |", "", width = w));
861            }
862            self.out.push('\n');
863
864            self.out.push('|');
865            for w in &widths {
866                self.out.push_str(&format!(" {} |", "-".repeat(*w)));
867            }
868            self.out.push('\n');
869        }
870
871        for row in self.table_rows.iter().skip(1) {
872            self.out.push('|');
873            for (i, cell) in row.iter().enumerate() {
874                let w = widths.get(i).copied().unwrap_or(3);
875                self.out.push_str(&format!(" {:width$} |", cell, width = w));
876            }
877            for i in row.len()..max_cols {
878                let w = widths.get(i).copied().unwrap_or(3);
879                self.out.push_str(&format!(" {:width$} |", "", width = w));
880            }
881            self.out.push('\n');
882        }
883
884        self.out.push('\n');
885        self.table_rows.clear();
886    }
887
888    fn ensure_newline(&mut self) {
889        if !self.out.ends_with('\n') && !self.out.is_empty() {
890            self.out.push('\n');
891        }
892    }
893
894    fn ensure_blank_line(&mut self) {
895        self.ensure_newline();
896        if !self.out.ends_with("\n\n") && !self.out.is_empty() {
897            self.out.push('\n');
898        }
899    }
900
901    fn finish(self) -> String {
902        let mut result = String::new();
903        let mut blank_count = 0;
904        for line in self.out.lines() {
905            if line.trim().is_empty() {
906                blank_count += 1;
907                if blank_count <= 2 {
908                    result.push('\n');
909                }
910            } else {
911                blank_count = 0;
912                result.push_str(line);
913                result.push('\n');
914            }
915        }
916        result.trim().to_string()
917    }
918}
919
920// ---------------------------------------------------------------------------
921// Helpers
922// ---------------------------------------------------------------------------
923
/// Splits the inside of a tag (the text between `<` and `>`) into its name and
/// its raw attribute string.
///
/// Surrounding whitespace and a trailing self-closing `/` are discarded, so
/// `"br /"` yields `("br", "")`. The attribute string is returned trimmed and
/// is empty when the tag has no attributes.
fn split_tag(s: &str) -> (&str, &str) {
    // Trim whitespace *before* stripping the slash: otherwise whitespace after
    // the slash (`"br / "`) shields it and it ends up in the attribute string.
    let s = s.trim().trim_end_matches('/').trim_end();
    match s.split_once(char::is_whitespace) {
        Some((name, attrs)) => (name, attrs.trim()),
        None => (s, ""),
    }
}
931
/// Returns the value of attribute `name` from a raw attribute string, or
/// `None` when the attribute is absent or its value is not closed by a quote.
///
/// Both `name="value"` and `name='value'` are recognized; a double-quoted
/// occurrence anywhere in `attrs` takes precedence over a single-quoted one.
/// The value is returned verbatim (no entity decoding).
///
/// A match must start at an attribute-name boundary, so asking for `href`
/// does not pick up `data-href` and `title` does not pick up
/// `ri:content-title`. A preceding `:` still counts as a boundary, so a local
/// name such as `name` keeps matching a namespaced `ac:name`.
fn extract_attr(attrs: &str, name: &str) -> Option<String> {
    /// Characters that mean the match sits inside a longer attribute name.
    fn is_name_char(c: char) -> bool {
        c.is_alphanumeric() || matches!(c, '-' | '_' | '.')
    }

    for quote in ['"', '\''] {
        let pat = format!("{}={}", name, quote);
        for (start, _) in attrs.match_indices(pat.as_str()) {
            let inside_longer_name = attrs[..start]
                .chars()
                .next_back()
                .map_or(false, is_name_char);
            if inside_longer_name {
                continue;
            }
            let value = &attrs[start + pat.len()..];
            if let Some(end) = value.find(quote) {
                return Some(value[..end].to_string());
            }
        }
    }
    None
}
945
946fn classify_link(href: &str) -> PageLink {
947    if href.contains("/pages/") && href.contains("/wiki/spaces/") {
948        let parts: Vec<&str> = href.split('/').collect();
949        if let Some(idx) = parts.iter().position(|&p| p == "pages") {
950            if let Some(id) = parts.get(idx + 1) {
951                if id.chars().all(|c| c.is_ascii_digit()) {
952                    return PageLink {
953                        page_id: Some(id.to_string()),
954                        url: href.to_string(),
955                        link_type: "confluence_page".to_string(),
956                    };
957                }
958            }
959        }
960    }
961
962    if href.contains("pageId=") {
963        if let Some(id) = href.split("pageId=").nth(1) {
964            let id = id.split('&').next().unwrap_or(id);
965            if id.chars().all(|c| c.is_ascii_digit()) {
966                return PageLink {
967                    page_id: Some(id.to_string()),
968                    url: href.to_string(),
969                    link_type: "confluence_page".to_string(),
970                };
971            }
972        }
973    }
974
975    if href.contains("/browse/") || href.contains("jira") {
976        let key = href.rsplit('/').next().unwrap_or("");
977        if key.contains('-')
978            && key
979                .split('-')
980                .next()
981                .map(|p| p.chars().all(|c| c.is_ascii_uppercase()))
982                .unwrap_or(false)
983        {
984            return PageLink {
985                page_id: None,
986                url: href.to_string(),
987                link_type: "jira".to_string(),
988            };
989        }
990    }
991
992    PageLink {
993        page_id: None,
994        url: href.to_string(),
995        link_type: "external".to_string(),
996    }
997}
998
999fn load_cursor(path: &Path) -> Option<SyncCursor> {
1000    let data = std::fs::read_to_string(path).ok()?;
1001    serde_json::from_str(&data).ok()
1002}
1003
1004fn save_cursor(path: &Path, cursor: &SyncCursor) -> Result<()> {
1005    if let Some(parent) = path.parent() {
1006        std::fs::create_dir_all(parent)?;
1007    }
1008    let json = serde_json::to_string_pretty(cursor)?;
1009    std::fs::write(path, json).context("write sync cursor")?;
1010    Ok(())
1011}
infigraph_confluence/sync.rs

infigraph_confluence/
sync.rs