1use std::collections::{HashSet, VecDeque};
2use std::path::Path;
3
4use anyhow::{Context, Result};
5use sha2::{Digest, Sha256};
6
7use infigraph_docs::chunk::{chunk_document, Chunk, ChunkStrategy};
8use infigraph_docs::extract::{DocFormat, ExtractedDoc};
9use infigraph_docs::store::DocStore;
10
11use crate::client::{ConfluenceClient, ConfluencePage};
12
13#[derive(Debug, serde::Serialize, serde::Deserialize)]
14pub struct SyncCursor {
15 pub last_synced: String,
16 pub source_id: String,
17 pub space_key: String,
18 pub base_url: String,
19 pub page_ids: Vec<String>,
20}
21
/// Controls whether and how far a sync follows links out of its seed pages.
///
/// The derived `Default` has every flag off and every limit at zero.
#[derive(Debug, Clone, Default)]
pub struct CrawlOptions {
    /// Follow page links found in fetched content.
    pub follow_links: bool,
    /// Pages at this link distance from a seed (seeds are depth 0) are no
    /// longer expanded.
    pub follow_depth: usize,
    /// The crawl stops once it holds this many pages, seeds included.
    pub max_pages: usize,
    /// When set, the crawler skips links it considers to lie outside the
    /// current space.
    pub same_space_only: bool,
}

impl CrawlOptions {
    /// Preset that follows links one hop deep, capped at 100 pages.
    pub fn default_follow() -> Self {
        let mut opts = Self::no_follow();
        opts.follow_links = true;
        opts.follow_depth = 1;
        opts.max_pages = 100;
        opts
    }

    /// Preset that disables link following.
    pub fn no_follow() -> Self {
        Self {
            same_space_only: true,
            ..Self::default()
        }
    }
}
49
/// Counters describing what one sync pass did.
#[derive(Debug)]
pub struct SyncResult {
    /// Pages held after fetching (seed pages plus any crawled pages).
    pub pages_fetched: usize,
    /// Pages that produced non-empty content and were written to the store.
    pub pages_indexed: usize,
    /// Stored documents removed because their page is gone from the space.
    pub pages_deleted: usize,
    /// Chunks produced from the indexed pages.
    pub chunks_created: usize,
    /// Page-to-page links written to the store.
    pub links_created: usize,
}
58
59pub struct ConfluenceSync {
60 client: ConfluenceClient,
61 space_key: String,
62 source_id: String,
63}
64
65#[derive(Debug, Clone)]
66struct ParsedPage {
67 content: String,
68 links: Vec<PageLink>,
69}
70
/// A link found in page content.
#[derive(Debug, Clone)]
struct PageLink {
    /// Confluence page id, when the link target could be identified as a page.
    page_id: Option<String>,
    /// The link's href as written in the page (for Jira macros, the issue key).
    url: String,
    /// One of `"confluence_page"`, `"jira"` or `"external"`.
    link_type: String,
}
77
78impl ConfluenceSync {
79 pub fn new(client: ConfluenceClient, space_key: &str) -> Self {
80 let source_id = format!("confluence::{}", space_key);
81 Self {
82 client,
83 space_key: space_key.to_string(),
84 source_id,
85 }
86 }
87
88 pub fn sync(
89 &self,
90 store: &DocStore,
91 root: &Path,
92 page_ids: Option<&[String]>,
93 ) -> Result<SyncResult> {
94 self.sync_with_options(store, root, page_ids, &CrawlOptions::no_follow())
95 }
96
97 pub fn sync_with_options(
98 &self,
99 store: &DocStore,
100 root: &Path,
101 page_ids: Option<&[String]>,
102 crawl: &CrawlOptions,
103 ) -> Result<SyncResult> {
104 let cursor_path = root.join(".infigraph").join("confluence_sync.json");
105 let cursor = load_cursor(&cursor_path);
106
107 store.upsert_source(
108 &self.source_id,
109 "confluence",
110 self.client.base_url(),
111 &self.space_key,
112 )?;
113
114 let seed_pages = self.fetch_pages(page_ids, cursor.as_ref())?;
115
116 let (all_pages, link_map) = if crawl.follow_links {
117 self.crawl_links(&seed_pages, crawl)?
118 } else {
119 let link_map: Vec<(String, Vec<PageLink>)> = Vec::new();
120 (seed_pages, link_map)
121 };
122
123 let fetched = all_pages.len();
124 let (docs, all_chunks, page_links) = self.convert_pages(&all_pages);
125 let indexed = docs.len();
126 let chunks_created = all_chunks.len();
127
128 if !docs.is_empty() {
129 let doc_refs: Vec<&ExtractedDoc> = docs.iter().collect();
130 let chunk_refs: Vec<&Chunk> = all_chunks.iter().collect();
131 store.upsert_all_parquet(&doc_refs, &chunk_refs)?;
132
133 for doc in &docs {
134 store.link_doc_to_source(&doc.file, &self.source_id)?;
135 }
136 }
137
138 if !all_chunks.is_empty() {
139 let chunk_refs: Vec<&Chunk> = all_chunks.iter().collect();
140 let changed_files: Vec<&str> = docs.iter().map(|d| d.file.as_str()).collect();
141 infigraph_docs::embed::update_doc_embeddings(store, root, &chunk_refs, &changed_files)?;
142 }
143
144 let mut links_created = 0;
146 let all_link_data: Vec<(String, Vec<PageLink>)> =
147 page_links.into_iter().chain(link_map).collect();
148
149 let indexed_ids: HashSet<&str> = docs.iter().map(|d| d.file.as_str()).collect();
150
151 for (from_file_id, links) in &all_link_data {
152 if !indexed_ids.contains(from_file_id.as_str()) {
153 continue;
154 }
155 store.delete_links_from(from_file_id)?;
156 for link in links {
157 if let Some(ref pid) = link.page_id {
158 let to_file_id = format!("confluence://{}/{}", self.space_key, pid);
159 if indexed_ids.contains(to_file_id.as_str()) {
160 store.create_link(from_file_id, &to_file_id, &link.url, &link.link_type)?;
161 links_created += 1;
162 }
163 }
164 }
165 }
166
167 let deleted = self.remove_deleted_pages(store, page_ids)?;
168
169 let remote_ids: Vec<String> = all_pages.iter().map(|p| p.id.clone()).collect();
170 save_cursor(
171 &cursor_path,
172 &SyncCursor {
173 last_synced: chrono::Utc::now().to_rfc3339(),
174 source_id: self.source_id.clone(),
175 space_key: self.space_key.clone(),
176 base_url: self.client.base_url().to_string(),
177 page_ids: if let Some(ids) = page_ids {
178 ids.to_vec()
179 } else {
180 remote_ids
181 },
182 },
183 )?;
184
185 Ok(SyncResult {
186 pages_fetched: fetched,
187 pages_indexed: indexed,
188 pages_deleted: deleted,
189 chunks_created,
190 links_created,
191 })
192 }
193
194 #[allow(clippy::type_complexity)]
195 fn crawl_links(
196 &self,
197 seed_pages: &[ConfluencePage],
198 crawl: &CrawlOptions,
199 ) -> Result<(Vec<ConfluencePage>, Vec<(String, Vec<PageLink>)>)> {
200 let mut visited: HashSet<String> = HashSet::new();
201 let mut queue: VecDeque<(String, usize)> = VecDeque::new();
202 let mut all_pages: Vec<ConfluencePage> = Vec::new();
203 let mut all_links: Vec<(String, Vec<PageLink>)> = Vec::new();
204
205 for page in seed_pages {
206 visited.insert(page.id.clone());
207 queue.push_back((page.id.clone(), 0));
208 all_pages.push(page.clone());
209 }
210
211 while let Some((page_id, depth)) = queue.pop_front() {
212 if all_pages.len() >= crawl.max_pages {
213 eprintln!("Crawl: hit max_pages cap ({}), stopping", crawl.max_pages);
214 break;
215 }
216
217 let page = if depth == 0 {
218 all_pages.iter().find(|p| p.id == page_id).cloned()
219 } else {
220 match self.client.get_page(&page_id) {
221 Ok(p) => {
222 all_pages.push(p.clone());
223 Some(p)
224 }
225 Err(e) => {
226 eprintln!("Crawl: failed to fetch page {}: {}", page_id, e);
227 continue;
228 }
229 }
230 };
231
232 let Some(page) = page else { continue };
233 let parsed = parse_confluence_html(&page);
234 let file_id = format!("confluence://{}/{}", self.space_key, page.id);
235 all_links.push((file_id, parsed.links.clone()));
236
237 if depth >= crawl.follow_depth {
238 continue;
239 }
240
241 for link in &parsed.links {
242 if let Some(ref linked_id) = link.page_id {
243 if visited.contains(linked_id) {
244 continue;
245 }
246 if crawl.same_space_only && link.link_type == "external" {
247 continue;
248 }
249 visited.insert(linked_id.clone());
250 queue.push_back((linked_id.clone(), depth + 1));
251 eprintln!("Crawl: queued page {} (depth {})", linked_id, depth + 1);
252 }
253 }
254 }
255
256 Ok((all_pages, all_links))
257 }
258
259 fn fetch_pages(
260 &self,
261 page_ids: Option<&[String]>,
262 cursor: Option<&SyncCursor>,
263 ) -> Result<Vec<ConfluencePage>> {
264 if let Some(ids) = page_ids {
265 let mut pages = Vec::new();
266 for id in ids {
267 match self.client.get_page(id) {
268 Ok(page) => pages.push(page),
269 Err(e) => eprintln!("Warning: failed to fetch page {}: {}", id, e),
270 }
271 }
272 return Ok(pages);
273 }
274
275 if let Some(c) = cursor {
276 let pages =
277 self.client
278 .get_pages_modified_since(&self.space_key, &c.last_synced, 1000)?;
279 if !pages.is_empty() {
280 return Ok(pages);
281 }
282 }
283
284 self.client.get_pages_in_space(&self.space_key, 1000)
285 }
286
287 #[allow(clippy::type_complexity)]
288 fn convert_pages(
289 &self,
290 pages: &[ConfluencePage],
291 ) -> (Vec<ExtractedDoc>, Vec<Chunk>, Vec<(String, Vec<PageLink>)>) {
292 let mut docs = Vec::new();
293 let mut all_chunks = Vec::new();
294 let mut page_links = Vec::new();
295
296 for page in pages {
297 let parsed = parse_confluence_html(page);
298 if parsed.content.is_empty() {
299 continue;
300 }
301
302 let file_id = format!("confluence://{}/{}", self.space_key, page.id);
303 let hash = {
304 let mut h = Sha256::new();
305 h.update(parsed.content.as_bytes());
306 format!("{:x}", h.finalize())
307 };
308
309 let doc = ExtractedDoc {
310 file: file_id.clone(),
311 title: Some(page.title.clone()),
312 content_hash: hash.clone(),
313 format: DocFormat::Markdown,
314 text: parsed.content,
315 page_count: Some(1),
316 };
317
318 let chunks = chunk_document(&doc, &file_id, &hash, ChunkStrategy::HeadingBounded);
319 all_chunks.extend(chunks);
320 page_links.push((file_id, parsed.links));
321 docs.push(doc);
322 }
323
324 (docs, all_chunks, page_links)
325 }
326
327 fn remove_deleted_pages(&self, store: &DocStore, page_ids: Option<&[String]>) -> Result<usize> {
328 if page_ids.is_some() {
329 return Ok(0);
330 }
331
332 let remote_ids = self.client.get_all_page_ids_in_space(&self.space_key)?;
333 let remote_set: HashSet<String> = remote_ids.into_iter().collect();
334
335 let existing_docs = store.get_docs_by_source(&self.source_id)?;
336 let mut to_delete = Vec::new();
337
338 for doc_id in &existing_docs {
339 if let Some(page_id) = doc_id.strip_prefix(&format!("confluence://{}/", self.space_key))
340 {
341 if !remote_set.contains(page_id) {
342 to_delete.push(doc_id.as_str());
343 }
344 }
345 }
346
347 let count = to_delete.len();
348 if !to_delete.is_empty() {
349 store.delete_docs_by_ids(&to_delete)?;
350 }
351 Ok(count)
352 }
353}
354
355fn parse_confluence_html(page: &ConfluencePage) -> ParsedPage {
360 let html = if let Some(body) = &page.body {
361 if let Some(view) = &body.view {
362 if !view.value.is_empty() {
363 &view.value
364 } else if let Some(storage) = &body.storage {
365 &storage.value
366 } else {
367 return ParsedPage {
368 content: String::new(),
369 links: Vec::new(),
370 };
371 }
372 } else if let Some(storage) = &body.storage {
373 &storage.value
374 } else {
375 return ParsedPage {
376 content: String::new(),
377 links: Vec::new(),
378 };
379 }
380 } else {
381 return ParsedPage {
382 content: String::new(),
383 links: Vec::new(),
384 };
385 };
386
387 let mut parser = HtmlParser::new(html);
388 parser.parse();
389 let links = std::mem::take(&mut parser.links);
390 ParsedPage {
391 content: parser.finish(),
392 links,
393 }
394}
395
396struct HtmlParser<'a> {
397 input: &'a str,
398 pos: usize,
399 out: String,
400 links: Vec<PageLink>,
401 in_table: bool,
402 table_rows: Vec<Vec<String>>,
403 current_row: Vec<String>,
404 current_cell: String,
405 in_header_row: bool,
406 list_depth: usize,
407 ordered_list_counters: Vec<usize>,
408 in_code_block: bool,
409 code_language: String,
410 code_content: String,
411 in_pre: bool,
412 macro_name: String,
413 in_macro: bool,
414}
415
416impl<'a> HtmlParser<'a> {
417 fn new(input: &'a str) -> Self {
418 Self {
419 input,
420 pos: 0,
421 out: String::new(),
422 links: Vec::new(),
423 in_table: false,
424 table_rows: Vec::new(),
425 current_row: Vec::new(),
426 current_cell: String::new(),
427 in_header_row: false,
428 list_depth: 0,
429 ordered_list_counters: Vec::new(),
430 in_code_block: false,
431 code_language: String::new(),
432 code_content: String::new(),
433 in_pre: false,
434 macro_name: String::new(),
435 in_macro: false,
436 }
437 }
438
439 fn parse(&mut self) {
440 while self.pos < self.input.len() {
441 if self.input[self.pos..].starts_with('<') {
442 self.parse_tag();
443 } else if self.input[self.pos..].starts_with('&') {
444 self.parse_entity();
445 } else {
446 let ch = self.input[self.pos..].chars().next().unwrap();
447 if self.in_code_block || self.in_pre {
448 self.code_content.push(ch);
449 } else if self.in_table {
450 self.current_cell.push(ch);
451 } else {
452 self.out.push(ch);
453 }
454 self.pos += ch.len_utf8();
455 }
456 }
457 }
458
459 fn parse_entity(&mut self) {
460 let rest = &self.input[self.pos..];
461 let end = rest.find(';').unwrap_or(0);
462 if end == 0 {
463 self.push_char('&');
464 self.pos += 1;
465 return;
466 }
467 let entity = &rest[..end + 1];
468 let decoded = match entity {
469 "&" => "&",
470 "<" => "<",
471 ">" => ">",
472 """ => "\"",
473 "'" | "'" => "'",
474 " " => " ",
475 "–" => "–",
476 "—" => "—",
477 "…" => "…",
478 "→" => "→",
479 "←" => "←",
480 "×" => "×",
481 "•" => "•",
482 _ => {
483 if entity.starts_with("&#x") {
484 let hex = &entity[3..entity.len() - 1];
485 if let Ok(n) = u32::from_str_radix(hex, 16) {
486 if let Some(ch) = char::from_u32(n) {
487 self.push_char(ch);
488 self.pos += entity.len();
489 return;
490 }
491 }
492 } else if entity.starts_with("&#") {
493 let num = &entity[2..entity.len() - 1];
494 if let Ok(n) = num.parse::<u32>() {
495 if let Some(ch) = char::from_u32(n) {
496 self.push_char(ch);
497 self.pos += entity.len();
498 return;
499 }
500 }
501 }
502 self.push_str(entity);
503 self.pos += entity.len();
504 return;
505 }
506 };
507 self.push_str(decoded);
508 self.pos += entity.len();
509 }
510
511 fn push_char(&mut self, ch: char) {
512 if self.in_code_block || self.in_pre {
513 self.code_content.push(ch);
514 } else if self.in_table {
515 self.current_cell.push(ch);
516 } else {
517 self.out.push(ch);
518 }
519 }
520
521 fn push_str(&mut self, s: &str) {
522 if self.in_code_block || self.in_pre {
523 self.code_content.push_str(s);
524 } else if self.in_table {
525 self.current_cell.push_str(s);
526 } else {
527 self.out.push_str(s);
528 }
529 }
530
531 fn parse_tag(&mut self) {
532 let rest = &self.input[self.pos..];
533 let end = match rest.find('>') {
534 Some(e) => e,
535 None => {
536 self.pos = self.input.len();
537 return;
538 }
539 };
540 let tag_content = &rest[1..end];
541 self.pos += end + 1;
542
543 let is_closing = tag_content.starts_with('/');
544 let tag_str = if is_closing {
545 &tag_content[1..]
546 } else {
547 tag_content
548 };
549
550 let (tag_name, attrs) = split_tag(tag_str);
551 let tag_lower = tag_name.to_lowercase();
552
553 if is_closing {
554 self.handle_close_tag(&tag_lower);
555 } else {
556 self.handle_open_tag(&tag_lower, attrs);
557 }
558 }
559
560 fn handle_open_tag(&mut self, tag: &str, attrs: &str) {
561 match tag {
562 "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
563 self.ensure_newline();
564 let level: usize = tag[1..].parse().unwrap_or(1);
565 for _ in 0..level {
566 self.out.push('#');
567 }
568 self.out.push(' ');
569 }
570 "p" => self.ensure_blank_line(),
571 "br" => self.push_char('\n'),
572 "a" => {
573 if let Some(href) = extract_attr(attrs, "href") {
574 let link = classify_link(&href);
575 self.links.push(link.clone());
576 match link.link_type.as_str() {
577 "confluence_page" => {
578 self.push_char('[');
579 }
580 "jira" => {
581 self.push_str("[JIRA: ");
582 }
583 _ => {
584 self.push_char('[');
585 }
586 }
587 }
588 }
589 "img" => {
590 let alt = extract_attr(attrs, "alt").unwrap_or_default();
591 let src = extract_attr(attrs, "src").unwrap_or_default();
592 if !alt.is_empty() || !src.is_empty() {
593 self.push_str(&format!("", alt, src));
594 }
595 }
596 "ul" => {
597 self.list_depth += 1;
598 self.ensure_newline();
599 }
600 "ol" => {
601 self.list_depth += 1;
602 self.ordered_list_counters.push(0);
603 self.ensure_newline();
604 }
605 "li" => {
606 self.ensure_newline();
607 let indent = " ".repeat(self.list_depth.saturating_sub(1));
608 if !self.ordered_list_counters.is_empty() {
609 if let Some(counter) = self.ordered_list_counters.last_mut() {
610 *counter += 1;
611 let prefix = format!("{}{}. ", indent, counter);
612 self.push_str(&prefix);
613 }
614 } else {
615 let prefix = format!("{}- ", indent);
616 self.push_str(&prefix);
617 }
618 }
619 "table" => {
620 self.in_table = true;
621 self.table_rows.clear();
622 self.ensure_blank_line();
623 }
624 "thead" => {
625 self.in_header_row = true;
626 }
627 "tbody" => {
628 self.in_header_row = false;
629 }
630 "tr" => {
631 self.current_row.clear();
632 self.current_cell.clear();
633 }
634 "th" => {
635 self.current_cell.clear();
636 self.in_header_row = true;
637 }
638 "td" => {
639 self.current_cell.clear();
640 }
641 "pre" => {
642 self.in_pre = true;
643 self.code_content.clear();
644 }
645 "code" => {
646 if self.in_pre {
647 self.in_code_block = true;
648 self.code_language = extract_attr(attrs, "class")
649 .map(|c| c.replace("language-", "").replace("confluence-", ""))
650 .unwrap_or_default();
651 self.code_content.clear();
652 } else {
653 self.push_char('`');
654 }
655 }
656 "div" | "ac:structured-macro" => {
657 if let Some(name) = extract_attr(attrs, "ac:name")
658 .or_else(|| extract_attr(attrs, "data-macro-name"))
659 {
660 self.in_macro = true;
661 self.macro_name = name.to_lowercase();
662 match self.macro_name.as_str() {
663 "info" | "note" | "warning" | "tip" => {
664 self.ensure_blank_line();
665 let label = self.macro_name.to_uppercase();
666 self.out.push_str(&format!("> **{}:** ", label));
667 }
668 "code" | "noformat" => {
669 self.in_code_block = true;
670 self.code_content.clear();
671 self.code_language =
672 extract_attr(attrs, "language").unwrap_or_default();
673 }
674 "expand" => {
675 self.ensure_blank_line();
676 }
677 "status" => {
678 let color = extract_attr(attrs, "colour")
679 .or_else(|| extract_attr(attrs, "color"))
680 .unwrap_or_default();
681 let title = extract_attr(attrs, "title").unwrap_or_default();
682 if !title.is_empty() {
683 self.push_str(&format!("[STATUS: {} ({})]", title, color));
684 }
685 }
686 "jira" => {
687 if let Some(key) = extract_attr(attrs, "key") {
688 self.push_str(&format!("[JIRA: {}]", key));
689 self.links.push(PageLink {
690 page_id: None,
691 url: key.to_string(),
692 link_type: "jira".to_string(),
693 });
694 }
695 }
696 _ => {}
697 }
698 }
699 }
700 "ac:link" => {}
701 "ri:user" => {
702 if let Some(name) =
703 extract_attr(attrs, "ri:username").or_else(|| extract_attr(attrs, "ri:userkey"))
704 {
705 self.push_str(&format!("@{}", name));
706 }
707 }
708 "ri:page" => {
709 if let Some(title) = extract_attr(attrs, "ri:content-title") {
710 self.push_str(&format!("[Page: {}]", title));
711 }
712 }
713 "ri:attachment" => {
714 if let Some(filename) = extract_attr(attrs, "ri:filename") {
715 self.push_str(&format!("[Attachment: {}]", filename));
716 }
717 }
718 "hr" => {
719 self.ensure_newline();
720 self.out.push_str("---\n");
721 }
722 "strong" | "b" => self.push_str("**"),
723 "em" | "i" => self.push_char('*'),
724 "u" => self.push_str("__"),
725 "s" | "del" | "strike" => self.push_str("~~"),
726 "sup" => self.push_char('^'),
727 "sub" => self.push_char('~'),
728 "blockquote" => {
729 self.ensure_newline();
730 self.push_str("> ");
731 }
732 "time" => {
733 if let Some(dt) = extract_attr(attrs, "datetime") {
734 self.push_str(&format!("[Date: {}]", dt));
735 }
736 }
737 "ac:emoticon" => {
738 if let Some(name) = extract_attr(attrs, "ac:name") {
739 self.push_str(&format!(":{}: ", name));
740 }
741 }
742 _ => {}
743 }
744 }
745
746 fn handle_close_tag(&mut self, tag: &str) {
747 match tag {
748 "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
749 self.push_char('\n');
750 }
751 "p" => self.ensure_blank_line(),
752 "a" => {
753 if let Some(last_link) = self.links.last() {
754 let url = last_link.url.clone();
755 self.push_str(&format!("]({})", url));
756 }
757 }
758 "ul" => {
759 self.list_depth = self.list_depth.saturating_sub(1);
760 if self.list_depth == 0 {
761 self.ensure_newline();
762 }
763 }
764 "ol" => {
765 self.list_depth = self.list_depth.saturating_sub(1);
766 self.ordered_list_counters.pop();
767 if self.list_depth == 0 {
768 self.ensure_newline();
769 }
770 }
771 "li" => {}
772 "th" | "td" => {
773 let cell = self.current_cell.trim().replace('\n', " ").to_string();
774 self.current_row.push(cell);
775 self.current_cell.clear();
776 }
777 "tr" => {
778 if !self.current_row.is_empty() {
779 self.table_rows.push(self.current_row.clone());
780 }
781 self.current_row.clear();
782 }
783 "thead" => {
784 self.in_header_row = false;
785 }
786 "table" => {
787 self.in_table = false;
788 self.render_table();
789 }
790 "code" => {
791 if self.in_code_block && self.in_pre {
792 } else {
794 self.push_char('`');
795 }
796 }
797 "pre" => {
798 self.in_pre = false;
799 if self.in_code_block || !self.code_content.is_empty() {
800 self.in_code_block = false;
801 self.ensure_blank_line();
802 self.out.push_str(&format!("```{}\n", self.code_language));
803 self.out.push_str(self.code_content.trim());
804 self.out.push_str("\n```\n");
805 self.code_content.clear();
806 self.code_language.clear();
807 }
808 }
809 "div" | "ac:structured-macro" if self.in_macro => {
810 if self.in_code_block {
811 self.in_code_block = false;
812 self.ensure_blank_line();
813 self.out.push_str(&format!("```{}\n", self.code_language));
814 self.out.push_str(self.code_content.trim());
815 self.out.push_str("\n```\n");
816 self.code_content.clear();
817 self.code_language.clear();
818 }
819 self.in_macro = false;
820 self.macro_name.clear();
821 }
822 "strong" | "b" => self.push_str("**"),
823 "em" | "i" => self.push_char('*'),
824 "u" => self.push_str("__"),
825 "s" | "del" | "strike" => self.push_str("~~"),
826 "sup" => self.push_char('^'),
827 "sub" => self.push_char('~'),
828 "blockquote" => self.ensure_newline(),
829 _ => {}
830 }
831 }
832
833 fn render_table(&mut self) {
834 if self.table_rows.is_empty() {
835 return;
836 }
837
838 let max_cols = self.table_rows.iter().map(|r| r.len()).max().unwrap_or(0);
839 if max_cols == 0 {
840 return;
841 }
842
843 let mut widths = vec![3usize; max_cols];
844 for row in &self.table_rows {
845 for (i, cell) in row.iter().enumerate() {
846 widths[i] = widths[i].max(cell.len());
847 }
848 }
849
850 self.ensure_blank_line();
851
852 if let Some(header) = self.table_rows.first() {
853 self.out.push('|');
854 for (i, cell) in header.iter().enumerate() {
855 let w = widths.get(i).copied().unwrap_or(3);
856 self.out.push_str(&format!(" {:width$} |", cell, width = w));
857 }
858 for i in header.len()..max_cols {
859 let w = widths.get(i).copied().unwrap_or(3);
860 self.out.push_str(&format!(" {:width$} |", "", width = w));
861 }
862 self.out.push('\n');
863
864 self.out.push('|');
865 for w in &widths {
866 self.out.push_str(&format!(" {} |", "-".repeat(*w)));
867 }
868 self.out.push('\n');
869 }
870
871 for row in self.table_rows.iter().skip(1) {
872 self.out.push('|');
873 for (i, cell) in row.iter().enumerate() {
874 let w = widths.get(i).copied().unwrap_or(3);
875 self.out.push_str(&format!(" {:width$} |", cell, width = w));
876 }
877 for i in row.len()..max_cols {
878 let w = widths.get(i).copied().unwrap_or(3);
879 self.out.push_str(&format!(" {:width$} |", "", width = w));
880 }
881 self.out.push('\n');
882 }
883
884 self.out.push('\n');
885 self.table_rows.clear();
886 }
887
888 fn ensure_newline(&mut self) {
889 if !self.out.ends_with('\n') && !self.out.is_empty() {
890 self.out.push('\n');
891 }
892 }
893
894 fn ensure_blank_line(&mut self) {
895 self.ensure_newline();
896 if !self.out.ends_with("\n\n") && !self.out.is_empty() {
897 self.out.push('\n');
898 }
899 }
900
901 fn finish(self) -> String {
902 let mut result = String::new();
903 let mut blank_count = 0;
904 for line in self.out.lines() {
905 if line.trim().is_empty() {
906 blank_count += 1;
907 if blank_count <= 2 {
908 result.push('\n');
909 }
910 } else {
911 blank_count = 0;
912 result.push_str(line);
913 result.push('\n');
914 }
915 }
916 result.trim().to_string()
917 }
918}
919
/// Splits the inside of a tag into `(name, attributes)`.
///
/// Trailing `/` characters (self-closing syntax) and surrounding whitespace
/// are removed first. The name is everything up to the first whitespace; the
/// attributes are the trimmed remainder, or `""` when there is none.
fn split_tag(s: &str) -> (&str, &str) {
    let inner = s.trim_end_matches('/').trim();
    inner
        .split_once(char::is_whitespace)
        .map_or((inner, ""), |(name, rest)| (name, rest.trim()))
}
931
/// Returns the value of attribute `name` from raw attribute text.
///
/// Both `name="value"` and `name='value'` are recognised; the double-quoted
/// form is looked for first. A match only counts when `name` starts the text
/// or follows whitespace or a closing quote, so asking for `src` does not
/// return the value of `data-image-src`. The value is returned as written (no
/// entity decoding). Returns `None` when the attribute is absent or its value
/// has no closing quote.
fn extract_attr(attrs: &str, name: &str) -> Option<String> {
    for quote in ['"', '\''] {
        let pattern = format!("{}={}", name, quote);
        let mut search_from = 0;

        while let Some(offset) = attrs[search_from..].find(pattern.as_str()) {
            let start = search_from + offset;
            let value_start = start + pattern.len();

            let on_boundary = attrs[..start]
                .chars()
                .next_back()
                .map_or(true, |prev| prev.is_whitespace() || prev == '"' || prev == '\'');

            if on_boundary {
                if let Some(len) = attrs[value_start..].find(quote) {
                    return Some(attrs[value_start..value_start + len].to_string());
                }
                // No closing quote from here on, so later matches with this
                // quote style cannot succeed either.
                break;
            }

            // Matched inside a longer attribute name; keep looking.
            search_from = value_start;
        }
    }
    None
}
945
946fn classify_link(href: &str) -> PageLink {
947 if href.contains("/pages/") && href.contains("/wiki/spaces/") {
948 let parts: Vec<&str> = href.split('/').collect();
949 if let Some(idx) = parts.iter().position(|&p| p == "pages") {
950 if let Some(id) = parts.get(idx + 1) {
951 if id.chars().all(|c| c.is_ascii_digit()) {
952 return PageLink {
953 page_id: Some(id.to_string()),
954 url: href.to_string(),
955 link_type: "confluence_page".to_string(),
956 };
957 }
958 }
959 }
960 }
961
962 if href.contains("pageId=") {
963 if let Some(id) = href.split("pageId=").nth(1) {
964 let id = id.split('&').next().unwrap_or(id);
965 if id.chars().all(|c| c.is_ascii_digit()) {
966 return PageLink {
967 page_id: Some(id.to_string()),
968 url: href.to_string(),
969 link_type: "confluence_page".to_string(),
970 };
971 }
972 }
973 }
974
975 if href.contains("/browse/") || href.contains("jira") {
976 let key = href.rsplit('/').next().unwrap_or("");
977 if key.contains('-')
978 && key
979 .split('-')
980 .next()
981 .map(|p| p.chars().all(|c| c.is_ascii_uppercase()))
982 .unwrap_or(false)
983 {
984 return PageLink {
985 page_id: None,
986 url: href.to_string(),
987 link_type: "jira".to_string(),
988 };
989 }
990 }
991
992 PageLink {
993 page_id: None,
994 url: href.to_string(),
995 link_type: "external".to_string(),
996 }
997}
998
999fn load_cursor(path: &Path) -> Option<SyncCursor> {
1000 let data = std::fs::read_to_string(path).ok()?;
1001 serde_json::from_str(&data).ok()
1002}
1003
1004fn save_cursor(path: &Path, cursor: &SyncCursor) -> Result<()> {
1005 if let Some(parent) = path.parent() {
1006 std::fs::create_dir_all(parent)?;
1007 }
1008 let json = serde_json::to_string_pretty(cursor)?;
1009 std::fs::write(path, json).context("write sync cursor")?;
1010 Ok(())
1011}