1use base64::Engine;
32use regex::Regex;
33use serde_json::Value;
34use std::collections::HashMap;
35use std::fmt::Write as _;
36use std::hash::BuildHasher;
37use std::io::Write;
38use std::sync::OnceLock;
39use tracing::{debug, info};
40
41use crate::WebCaptureError;
42
/// Base URL under which per-document `export` and `edit` URLs are built.
const GDOCS_EXPORT_BASE: &str = "https://docs.google.com/document/d";

/// Base URL of the Google Docs REST API `documents` resource.
const GDOCS_API_BASE: &str = "https://docs.googleapis.com/v1/documents";
45
46fn gdocs_url_pattern() -> &'static Regex {
47 static PATTERN: OnceLock<Regex> = OnceLock::new();
48 PATTERN.get_or_init(|| Regex::new(r"docs\.google\.com/document/d/([a-zA-Z0-9_-]+)").unwrap())
49}
50
/// Outcome of fetching a Google Doc through the public export endpoint.
#[derive(Debug, Clone)]
pub struct GDocsResult {
    /// Document body in the format named by `format`.
    pub content: String,
    /// Format label of `content` (the requested export format, or
    /// `"markdown"` after conversion).
    pub format: String,
    /// Document id extracted from the source URL.
    pub document_id: String,
    /// Export URL the content was downloaded from.
    pub export_url: String,
}
63
/// Strategy used to capture a Google Doc.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GDocsCaptureMethod {
    /// Load the editor page in a browser and parse its embedded model.
    BrowserModel,
    /// Download the document through the public `export` endpoint.
    PublicExport,
    /// Read the document through the authenticated Docs REST API.
    DocsApi,
}
74
/// A Google Doc rendered into all three output formats at once.
#[derive(Debug, Clone)]
pub struct GDocsRenderedResult {
    /// Markdown rendering of the document.
    pub markdown: String,
    /// HTML rendering of the document.
    pub html: String,
    /// Plain-text rendering of the document.
    pub text: String,
    /// Document id extracted from the source URL.
    pub document_id: String,
    /// URL the document was captured from (Docs API URL or editor URL,
    /// depending on the capture path).
    pub export_url: String,
}
89
90#[derive(Debug, Clone, Default)]
92pub struct CapturedDocument {
93 pub blocks: Vec<CapturedBlock>,
95 pub tables: Vec<TableBlock>,
97 pub images: Vec<ContentNode>,
99 pub text: String,
101}
102
103#[derive(Debug, Clone)]
105pub enum CapturedBlock {
106 Paragraph {
108 content: Vec<ContentNode>,
110 style: Option<String>,
112 list: Option<ListMeta>,
114 quote: bool,
116 horizontal_rule: bool,
118 },
119 Table(TableBlock),
121}
122
123#[derive(Debug, Clone, Default)]
125pub struct TableBlock {
126 pub rows: Vec<TableRow>,
128}
129
130#[derive(Debug, Clone, Default)]
132pub struct TableRow {
133 pub cells: Vec<TableCell>,
135}
136
137#[derive(Debug, Clone, Default)]
139pub struct TableCell {
140 pub content: Vec<ContentNode>,
142}
143
/// An inline node inside a paragraph or table cell.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentNode {
    /// A run of text sharing one set of inline styles.
    Text {
        /// The text of the run.
        text: String,
        /// Bold flag.
        bold: bool,
        /// Italic flag.
        italic: bool,
        /// Strikethrough flag.
        strike: bool,
        /// Link target, when the run is a hyperlink.
        link: Option<String>,
    },
    /// An inline image.
    Image {
        /// Content id of the image in the editor model, when present.
        cid: Option<String>,
        /// Resolved image URL, when one is known.
        url: Option<String>,
        /// Alternative text.
        alt: String,
        /// Whether the image comes from a suggestion rather than the
        /// accepted document content.
        is_suggestion: bool,
    },
}
172
/// Inline style attached to a single character of the model text.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct TextStyle {
    /// Bold flag.
    bold: bool,
    /// Italic flag.
    italic: bool,
    /// Strikethrough flag.
    strike: bool,
    /// Link target, if any.
    link: Option<String>,
}
180
181#[derive(Debug, Clone, Default)]
182struct ParagraphMeta {
183 style: Option<String>,
184 list: Option<ListMeta>,
185 quote: bool,
186 horizontal_rule: bool,
187}
188
/// List membership of a paragraph.
#[derive(Debug, Clone)]
pub struct ListMeta {
    /// Identifier of the list the paragraph belongs to.
    pub id: String,
    /// Nesting level of the item (0 when the model gives none).
    pub level: usize,
    /// Whether the list is rendered as an ordered (numbered) list.
    pub ordered: bool,
}
198
/// Paragraph style values read from a `paragraph` style item of the model.
#[derive(Debug, Clone)]
struct ParagraphStyle {
    /// Named style such as `HEADING_2`, when a heading level is present.
    style: Option<String>,
    /// Start indent (`ps_il`), 0.0 when absent.
    indent_start: f64,
    /// First-line indent (`ps_ifl`), 0.0 when absent.
    indent_first_line: f64,
}
205
206#[derive(Debug, Clone, Default)]
207struct ModelStyleMaps {
208 inline_styles: Vec<TextStyle>,
209 paragraph_by_end: HashMap<usize, ParagraphStyle>,
210 list_by_end: HashMap<usize, ListMeta>,
211 horizontal_rules: std::collections::HashSet<usize>,
212}
213
214#[must_use]
216pub fn is_google_docs_url(url: &str) -> bool {
217 gdocs_url_pattern().is_match(url)
218}
219
220#[must_use]
224pub fn extract_document_id(url: &str) -> Option<String> {
225 gdocs_url_pattern()
226 .captures(url)
227 .and_then(|caps| caps.get(1))
228 .map(|m| m.as_str().to_string())
229}
230
231#[must_use]
238pub fn build_export_url(document_id: &str, format: &str) -> String {
239 let export_format = match format {
240 "html" | "txt" | "md" | "pdf" | "docx" | "epub" | "zip" => format,
241 _ => "html",
242 };
243 format!("{GDOCS_EXPORT_BASE}/{document_id}/export?format={export_format}")
244}
245
246#[must_use]
248pub fn build_edit_url(document_id: &str) -> String {
249 format!("{GDOCS_EXPORT_BASE}/{document_id}/edit")
250}
251
252#[must_use]
254pub fn build_docs_api_url(document_id: &str) -> String {
255 format!("{GDOCS_API_BASE}/{document_id}")
256}
257
258pub fn select_capture_method(
264 capture: &str,
265 api_token: Option<&str>,
266) -> crate::Result<GDocsCaptureMethod> {
267 match capture.to_lowercase().as_str() {
268 "browser" => Ok(GDocsCaptureMethod::BrowserModel),
269 "api" if api_token.is_some() => Ok(GDocsCaptureMethod::DocsApi),
270 "api" => Ok(GDocsCaptureMethod::PublicExport),
271 other => Err(WebCaptureError::InvalidUrl(format!(
272 "Unsupported Google Docs capture method \"{other}\". Use \"browser\" or \"api\"."
273 ))),
274 }
275}
276
277pub async fn fetch_google_doc(
292 url: &str,
293 format: &str,
294 api_token: Option<&str>,
295) -> crate::Result<GDocsResult> {
296 let document_id = extract_document_id(url).ok_or_else(|| {
297 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
298 })?;
299
300 let export_url = build_export_url(&document_id, format);
301 debug!(
302 document_id = %document_id,
303 format = %format,
304 export_url = %export_url,
305 has_api_token = api_token.is_some(),
306 "fetching Google Doc via public export"
307 );
308
309 let mut request = reqwest::Client::new()
310 .get(&export_url)
311 .header(
312 "User-Agent",
313 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
314 )
315 .header("Accept-Charset", "utf-8")
316 .header("Accept-Language", "en-US,en;q=0.9");
317
318 if let Some(token) = api_token {
319 request = request.header("Authorization", format!("Bearer {token}"));
320 }
321
322 let response = request
323 .send()
324 .await
325 .map_err(|e| WebCaptureError::FetchError(format!("Failed to fetch Google Doc: {e}")))?;
326 debug!(
327 document_id = %document_id,
328 status = response.status().as_u16(),
329 success = response.status().is_success(),
330 content_type = response
331 .headers()
332 .get(reqwest::header::CONTENT_TYPE)
333 .and_then(|value| value.to_str().ok())
334 .unwrap_or(""),
335 "received Google Docs public export response"
336 );
337
338 if !response.status().is_success() {
339 return Err(WebCaptureError::FetchError(format!(
340 "Failed to fetch Google Doc ({} {}): {}",
341 response.status().as_u16(),
342 response.status().canonical_reason().unwrap_or("Unknown"),
343 export_url
344 )));
345 }
346
347 let raw_content = response.text().await.map_err(|e| {
348 WebCaptureError::FetchError(format!("Failed to read Google Doc response: {e}"))
349 })?;
350 debug!(
351 document_id = %document_id,
352 bytes = raw_content.len(),
353 "read Google Docs public export body"
354 );
355
356 let content = match format {
358 "html" | "txt" | "md" => crate::html::decode_html_entities(&raw_content),
359 _ => raw_content,
360 };
361
362 Ok(GDocsResult {
363 content,
364 format: format.to_string(),
365 document_id,
366 export_url,
367 })
368}
369
370pub async fn fetch_google_doc_as_markdown(
384 url: &str,
385 api_token: Option<&str>,
386) -> crate::Result<GDocsResult> {
387 let result = fetch_google_doc(url, "html", api_token).await?;
388
389 let preprocess = preprocess_google_docs_export_html(&result.content);
390 debug!(
391 document_id = %result.document_id,
392 hoisted = preprocess.hoisted,
393 unwrapped_links = preprocess.unwrapped_links,
394 "google-docs-export pre-processor rewrote markup"
395 );
396 let markdown =
397 crate::markdown::convert_html_to_markdown(&preprocess.html, Some(&result.export_url))?;
398 debug!(
399 document_id = %result.document_id,
400 bytes = markdown.len(),
401 "rendered Google Docs public export markdown"
402 );
403
404 Ok(GDocsResult {
405 content: markdown,
406 format: "markdown".to_string(),
407 document_id: result.document_id,
408 export_url: result.export_url,
409 })
410}
411
/// Output of [`preprocess_google_docs_export_html`].
#[derive(Debug, Clone)]
pub struct GDocsExportPreprocessResult {
    /// The rewritten HTML.
    pub html: String,
    /// Number of styled `<span>` elements replaced by semantic tags.
    pub hoisted: usize,
    /// Number of Google redirect links replaced by their target URL.
    pub unwrapped_links: usize,
}
425
426#[must_use]
434pub fn preprocess_google_docs_export_html(html: &str) -> GDocsExportPreprocessResult {
435 let mut hoisted: usize = 0;
436 let mut unwrapped_links: usize = 0;
437 let mut out = html.to_string();
438
439 let span_re = Regex::new(r#"(?is)<span\s+([^>]*style="([^"]*)"[^>]*)>(.*?)</span>"#)
441 .expect("valid regex");
442 out = span_re
443 .replace_all(&out, |caps: ®ex::Captures<'_>| {
444 let style = caps.get(2).map_or("", |m| m.as_str());
445 let inner = caps.get(3).map_or("", |m| m.as_str());
446 let bold = Regex::new(r"(?i)font-weight\s*:\s*(?:bold|[6-9]\d{2})")
447 .expect("valid regex")
448 .is_match(style);
449 let italic = Regex::new(r"(?i)font-style\s*:\s*italic")
450 .expect("valid regex")
451 .is_match(style);
452 let strike = Regex::new(r"(?i)text-decoration[^;]*\bline-through\b")
453 .expect("valid regex")
454 .is_match(style);
455 if !bold && !italic && !strike {
456 return caps[0].to_string();
457 }
458 hoisted += 1;
459 let mut wrapped = inner.to_string();
460 if strike {
461 wrapped = format!("<del>{wrapped}</del>");
462 }
463 if italic {
464 wrapped = format!("<em>{wrapped}</em>");
465 }
466 if bold {
467 wrapped = format!("<strong>{wrapped}</strong>");
468 }
469 wrapped
470 })
471 .into_owned();
472
473 let empty_anchor_re = Regex::new(r#"(?is)<a\s+id="[^"]*"\s*>\s*</a>"#).expect("valid regex");
476 let numbering_re =
477 Regex::new(r"(?is)<span\b[^>]*>\s*\d+(?:\.\d+)*\.?\s*</span>").expect("valid regex");
478 for level in 1..=6 {
479 let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
480 .expect("valid regex");
481 out = heading_re
482 .replace_all(&out, |caps: ®ex::Captures<'_>| {
483 let open = &caps[1];
484 let inner = &caps[2];
485 let close = &caps[3];
486 let mut cleaned = empty_anchor_re.replace_all(inner, "").into_owned();
487 cleaned = numbering_re.replace_all(&cleaned, "").into_owned();
488 format!("{open}{cleaned}{close}")
489 })
490 .into_owned();
491 }
492
493 let redirect_re =
495 Regex::new(r#"(?i)href="https?://(?:www\.)?google\.com/url\?q=([^&"]+)[^"]*""#)
496 .expect("valid regex");
497 out = redirect_re
498 .replace_all(&out, |caps: ®ex::Captures<'_>| {
499 let encoded = caps.get(1).map_or("", |m| m.as_str());
500 let decoded = percent_decode_utf8_lossy(encoded);
501 unwrapped_links += 1;
502 format!(r#"href="{decoded}""#)
503 })
504 .into_owned();
505
506 out = out.replace(" ", " ");
509 out = out.replace('\u{00A0}', " ");
510
511 GDocsExportPreprocessResult {
512 html: out,
513 hoisted,
514 unwrapped_links,
515 }
516}
517
/// Percent-decodes `input` and interprets the result as UTF-8, lossily.
///
/// A `%` followed by two hexadecimal digits becomes the corresponding byte;
/// any other `%` is copied through unchanged. Invalid UTF-8 in the decoded
/// bytes is replaced with U+FFFD.
fn percent_decode_utf8_lossy(input: &str) -> String {
    /// Value of one ASCII hex digit, or `None` for any other byte.
    fn hex_value(byte: u8) -> Option<u8> {
        char::from(byte)
            .to_digit(16)
            .and_then(|digit| u8::try_from(digit).ok())
    }

    let raw = input.as_bytes();
    let mut out = Vec::with_capacity(raw.len());
    let mut cursor = 0;
    while let Some(&current) = raw.get(cursor) {
        if current == b'%' {
            let hi = raw.get(cursor + 1).copied().and_then(hex_value);
            let lo = raw.get(cursor + 2).copied().and_then(hex_value);
            if let (Some(hi), Some(lo)) = (hi, lo) {
                out.push((hi << 4) | lo);
                cursor += 3;
                continue;
            }
        }
        out.push(current);
        cursor += 1;
    }
    String::from_utf8_lossy(&out).into_owned()
}
541
542pub async fn fetch_google_doc_from_docs_api(
548 url: &str,
549 api_token: &str,
550) -> crate::Result<GDocsRenderedResult> {
551 let document_id = extract_document_id(url).ok_or_else(|| {
552 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
553 })?;
554 let api_url = build_docs_api_url(&document_id);
555 debug!(
556 document_id = %document_id,
557 api_url = %api_url,
558 "fetching Google Doc via Docs API"
559 );
560
561 let response = reqwest::Client::new()
562 .get(&api_url)
563 .header("Authorization", format!("Bearer {api_token}"))
564 .header("Accept", "application/json")
565 .send()
566 .await
567 .map_err(|e| {
568 WebCaptureError::FetchError(format!("Failed to fetch Google Doc via Docs API: {e}"))
569 })?;
570 debug!(
571 document_id = %document_id,
572 status = response.status().as_u16(),
573 success = response.status().is_success(),
574 content_type = response
575 .headers()
576 .get(reqwest::header::CONTENT_TYPE)
577 .and_then(|value| value.to_str().ok())
578 .unwrap_or(""),
579 "received Google Docs API response"
580 );
581
582 if !response.status().is_success() {
583 return Err(WebCaptureError::FetchError(format!(
584 "Failed to fetch Google Doc via Docs API ({} {}): {}",
585 response.status().as_u16(),
586 response.status().canonical_reason().unwrap_or("Unknown"),
587 api_url
588 )));
589 }
590
591 let body = response.text().await.map_err(|e| {
592 WebCaptureError::FetchError(format!("Failed to read Google Docs API response: {e}"))
593 })?;
594 let document = serde_json::from_str::<Value>(&body).map_err(|e| {
595 WebCaptureError::ParseError(format!("Failed to parse Google Docs API response: {e}"))
596 })?;
597 let rendered = render_docs_api_document(&document);
598 debug!(
599 document_id = %document_id,
600 title = document.get("title").and_then(|value| value.as_str()).unwrap_or(""),
601 markdown_bytes = rendered.markdown.len(),
602 html_bytes = rendered.html.len(),
603 text_bytes = rendered.text.len(),
604 "rendered Google Docs API document"
605 );
606
607 Ok(GDocsRenderedResult {
608 markdown: rendered.markdown,
609 html: rendered.html,
610 text: rendered.text,
611 document_id,
612 export_url: api_url,
613 })
614}
615
616pub async fn fetch_google_doc_from_model(
626 url: &str,
627 api_token: Option<&str>,
628) -> crate::Result<GDocsRenderedResult> {
629 if api_token.is_some() {
630 return Err(WebCaptureError::BrowserError(
631 "Rust browser-model Google Docs capture cannot inject API tokens; use --capture api for authenticated Docs API capture".to_string(),
632 ));
633 }
634 let document_id = extract_document_id(url).ok_or_else(|| {
635 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
636 })?;
637 let edit_url = build_edit_url(&document_id);
638 debug!(
639 document_id = %document_id,
640 edit_url = %edit_url,
641 "capturing Google Doc editor model with a real browser"
642 );
643 let html = crate::browser::render_html(&edit_url).await?;
644 let chunks = extract_model_chunks_from_html(&html);
645 debug!(
646 document_id = %document_id,
647 html_bytes = html.len(),
648 chunks = chunks.len(),
649 "extracted Google Docs editor model chunks"
650 );
651 if chunks.is_empty() {
652 return Err(WebCaptureError::ParseError(
653 "Google Docs editor HTML did not contain DOCS_modelChunk data".to_string(),
654 ));
655 }
656
657 let cid_urls = extract_cid_urls_from_html(&html);
658 let capture = parse_model_chunks(&chunks, &cid_urls);
659 info!(
660 document_id = %document_id,
661 chunks = chunks.len(),
662 cid_urls = cid_urls.len(),
663 blocks = capture.blocks.len(),
664 tables = capture.tables.len(),
665 images = capture.images.len(),
666 text_bytes = capture.text.len(),
667 "parsed Google Docs editor model"
668 );
669
670 Ok(GDocsRenderedResult {
671 markdown: render_captured_document(&capture, "markdown"),
672 html: render_captured_document(&capture, "html"),
673 text: render_captured_document(&capture, "txt"),
674 document_id,
675 export_url: edit_url,
676 })
677}
678
679#[must_use]
681pub fn render_docs_api_document(document: &Value) -> GDocsRenderedOutput {
682 let blocks = structural_elements_to_blocks(
683 document
684 .pointer("/body/content")
685 .and_then(Value::as_array)
686 .map_or(&[] as &[Value], Vec::as_slice),
687 document.pointer("/inlineObjects").unwrap_or(&Value::Null),
688 );
689 GDocsRenderedOutput {
690 markdown: render_blocks_markdown(&blocks),
691 html: render_blocks_html(&blocks),
692 text: blocks_to_text(&blocks),
693 }
694}
695
/// The three renderings produced by [`render_docs_api_document`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GDocsRenderedOutput {
    /// Markdown rendering.
    pub markdown: String,
    /// HTML rendering.
    pub html: String,
    /// Plain-text rendering.
    pub text: String,
}
706
707fn structural_elements_to_blocks(elements: &[Value], inline_objects: &Value) -> Vec<CapturedBlock> {
708 let mut blocks = Vec::new();
709 for element in elements {
710 if let Some(paragraph) = element.get("paragraph") {
711 let content = paragraph_to_content(paragraph, inline_objects);
712 if !content_to_text(&content).trim().is_empty()
713 || content
714 .iter()
715 .any(|node| matches!(node, ContentNode::Image { .. }))
716 {
717 blocks.push(CapturedBlock::Paragraph {
718 style: paragraph
719 .pointer("/paragraphStyle/namedStyleType")
720 .and_then(Value::as_str)
721 .map(ToString::to_string),
722 list: None,
723 quote: false,
724 horizontal_rule: false,
725 content,
726 });
727 }
728 } else if let Some(table) = element.get("table") {
729 blocks.push(CapturedBlock::Table(table_to_block(table, inline_objects)));
730 }
731 }
732 blocks
733}
734
735fn table_to_block(table: &Value, inline_objects: &Value) -> TableBlock {
736 let rows = table
737 .get("tableRows")
738 .and_then(Value::as_array)
739 .map_or(&[] as &[Value], Vec::as_slice)
740 .iter()
741 .map(|row| TableRow {
742 cells: row
743 .get("tableCells")
744 .and_then(Value::as_array)
745 .map_or(&[] as &[Value], Vec::as_slice)
746 .iter()
747 .map(|cell| TableCell {
748 content: structural_elements_to_inline_content(
749 cell.get("content")
750 .and_then(Value::as_array)
751 .map_or(&[] as &[Value], Vec::as_slice),
752 inline_objects,
753 ),
754 })
755 .collect(),
756 })
757 .collect();
758 TableBlock { rows }
759}
760
761fn structural_elements_to_inline_content(
762 elements: &[Value],
763 inline_objects: &Value,
764) -> Vec<ContentNode> {
765 let mut content = Vec::new();
766 for element in elements {
767 if let Some(paragraph) = element.get("paragraph") {
768 let paragraph_content = paragraph_to_content(paragraph, inline_objects);
769 if !content.is_empty() && !paragraph_content.is_empty() {
770 append_text(&mut content, "\n");
771 }
772 content.extend(paragraph_content);
773 } else if let Some(table) = element.get("table") {
774 append_text(
775 &mut content,
776 &render_blocks_markdown(&[CapturedBlock::Table(table_to_block(
777 table,
778 inline_objects,
779 ))]),
780 );
781 }
782 }
783 content
784}
785
786fn paragraph_to_content(paragraph: &Value, inline_objects: &Value) -> Vec<ContentNode> {
787 let mut content = Vec::new();
788 for element in paragraph
789 .get("elements")
790 .and_then(Value::as_array)
791 .map_or(&[] as &[Value], Vec::as_slice)
792 {
793 if let Some(text) = element
794 .pointer("/textRun/content")
795 .and_then(Value::as_str)
796 .map(|text| text.strip_suffix('\n').unwrap_or(text))
797 {
798 append_text(&mut content, text);
799 } else if let Some(inline_id) = element
800 .pointer("/inlineObjectElement/inlineObjectId")
801 .and_then(Value::as_str)
802 {
803 if let Some(image) = inline_object_to_image(inline_id, inline_objects) {
804 content.push(image);
805 }
806 }
807 }
808 content
809}
810
811fn inline_object_to_image(inline_id: &str, inline_objects: &Value) -> Option<ContentNode> {
812 let embedded = inline_objects
813 .get(inline_id)?
814 .pointer("/inlineObjectProperties/embeddedObject")?;
815 let url = embedded
816 .pointer("/imageProperties/contentUri")
817 .or_else(|| embedded.pointer("/imageProperties/sourceUri"))
818 .and_then(Value::as_str)?;
819 let alt = embedded
820 .get("title")
821 .or_else(|| embedded.get("description"))
822 .and_then(Value::as_str)
823 .unwrap_or("image");
824 Some(ContentNode::Image {
825 cid: None,
826 url: Some(url.to_string()),
827 alt: alt.to_string(),
828 is_suggestion: false,
829 })
830}
831
832fn build_model_style_maps(
833 items: &[Value],
834 text_len: usize,
835 utf16_position_map: &[usize],
836) -> ModelStyleMaps {
837 let mut maps = ModelStyleMaps {
838 inline_styles: vec![TextStyle::default(); text_len],
839 ..ModelStyleMaps::default()
840 };
841
842 for item in items {
843 if item.get("ty").and_then(Value::as_str) != Some("as") {
844 continue;
845 }
846 let (Some(start), Some(end), Some(style_type)) = (
847 item.get("si").and_then(Value::as_u64),
848 item.get("ei").and_then(Value::as_u64),
849 item.get("st").and_then(Value::as_str),
850 ) else {
851 continue;
852 };
853 let (Ok(start), Ok(end)) = (usize::try_from(start), usize::try_from(end)) else {
854 continue;
855 };
856
857 let start = utf16_position_to_char_position(utf16_position_map, start);
858 let end = utf16_position_to_char_position(utf16_position_map, end);
859 if start == 0 || end == 0 {
860 continue;
861 }
862
863 match style_type {
864 "text" => {
865 let style = text_style(item);
866 apply_inline_style(&mut maps.inline_styles, start, end, &style);
867 }
868 "link" => {
869 let style = TextStyle {
870 link: item
871 .pointer("/sm/lnks_link/ulnk_url")
872 .and_then(Value::as_str)
873 .map(ToString::to_string),
874 ..TextStyle::default()
875 };
876 apply_inline_style(&mut maps.inline_styles, start, end, &style);
877 }
878 "paragraph" => {
879 maps.paragraph_by_end
880 .insert(end, paragraph_style_from_model(item));
881 }
882 "list" => {
883 maps.list_by_end.insert(
884 end,
885 ListMeta {
886 id: item
887 .pointer("/sm/ls_id")
888 .and_then(Value::as_str)
889 .unwrap_or("")
890 .to_string(),
891 level: item
892 .pointer("/sm/ls_nest")
893 .and_then(Value::as_u64)
894 .and_then(|value| usize::try_from(value).ok())
895 .unwrap_or(0),
896 ordered: false,
897 },
898 );
899 }
900 "horizontal_rule" => {
901 maps.horizontal_rules.insert(end);
902 }
903 _ => {}
904 }
905 }
906
907 maps
908}
909
910fn apply_inline_style(styles: &mut [TextStyle], start: usize, end: usize, patch: &TextStyle) {
911 let from = start.saturating_sub(1);
912 let to = end.min(styles.len());
913 if from >= to {
914 return;
915 }
916 for style in &mut styles[from..to] {
917 if patch.bold {
918 style.bold = true;
919 }
920 if patch.italic {
921 style.italic = true;
922 }
923 if patch.strike {
924 style.strike = true;
925 }
926 if patch.link.is_some() {
927 style.link.clone_from(&patch.link);
928 }
929 }
930}
931
932fn text_style(item: &Value) -> TextStyle {
933 TextStyle {
934 bold: item.pointer("/sm/ts_bd").and_then(Value::as_bool) == Some(true),
935 italic: item.pointer("/sm/ts_it").and_then(Value::as_bool) == Some(true),
936 strike: item.pointer("/sm/ts_st").and_then(Value::as_bool) == Some(true),
937 link: None,
938 }
939}
940
941fn paragraph_style_from_model(item: &Value) -> ParagraphStyle {
942 let heading = item.pointer("/sm/ps_hd").and_then(Value::as_u64);
943 ParagraphStyle {
944 style: heading.map(|level| format!("HEADING_{level}")),
945 indent_start: item
946 .pointer("/sm/ps_il")
947 .and_then(Value::as_f64)
948 .unwrap_or(0.0),
949 indent_first_line: item
950 .pointer("/sm/ps_ifl")
951 .and_then(Value::as_f64)
952 .unwrap_or(0.0),
953 }
954}
955
/// Builds a lookup from one-based UTF-16 positions to one-based character
/// positions of `text`.
///
/// Index 0 is always 0. A character that takes two UTF-16 code units
/// occupies two consecutive entries holding the same character position.
fn build_utf16_position_map(text: &str) -> Vec<usize> {
    let mut map = Vec::with_capacity(text.len() + 1);
    map.push(0);
    for (index, ch) in text.chars().enumerate() {
        let char_pos = index + 1;
        map.extend(std::iter::repeat(char_pos).take(ch.len_utf16()));
    }
    map
}
970
/// Translates a UTF-16 position to a character position using `map`.
///
/// When `position` is out of range, or its entry is 0, the last non-zero
/// entry of `map` is returned instead; 0 is returned only when `map` has no
/// non-zero entry.
fn utf16_position_to_char_position(map: &[usize], position: usize) -> usize {
    match map.get(position) {
        Some(&mapped) if mapped > 0 => mapped,
        _ => map
            .iter()
            .rev()
            .copied()
            .find(|&mapped| mapped > 0)
            .unwrap_or(0),
    }
}
978
979#[must_use]
981#[allow(clippy::too_many_lines)]
982pub fn parse_model_chunks<S: BuildHasher>(
983 chunks: &[Value],
984 cid_urls: &HashMap<String, String, S>,
985) -> CapturedDocument {
986 let items = collect_model_items(chunks);
987 let full_text = items
988 .iter()
989 .filter(|item| matches!(item.get("ty").and_then(Value::as_str), Some("is" | "iss")))
990 .filter_map(|item| item.get("s").and_then(Value::as_str))
991 .collect::<String>();
992 let chars: Vec<char> = full_text.chars().collect();
993 let utf16_position_map = build_utf16_position_map(&full_text);
994 let style_maps = build_model_style_maps(&items, chars.len(), &utf16_position_map);
995
996 let mut positions = HashMap::new();
997 for item in &items {
998 if matches!(item.get("ty").and_then(Value::as_str), Some("te" | "ste")) {
999 if let (Some(id), Some(pos)) = (
1000 item.get("id").and_then(Value::as_str),
1001 item.get("spi").and_then(Value::as_u64),
1002 ) {
1003 if let Ok(pos) = usize::try_from(pos) {
1004 positions.insert(
1005 id.to_string(),
1006 utf16_position_to_char_position(&utf16_position_map, pos).saturating_sub(1),
1007 );
1008 }
1009 }
1010 }
1011 }
1012
1013 let mut images_by_pos: HashMap<usize, ContentNode> = HashMap::new();
1014 let mut images = Vec::new();
1015 for item in &items {
1016 let ty = item.get("ty").and_then(Value::as_str);
1017 if !matches!(ty, Some("ae" | "ase")) {
1018 continue;
1019 }
1020 let Some(id) = item.get("id").and_then(Value::as_str) else {
1021 continue;
1022 };
1023 let Some(pos) = positions.get(id).copied() else {
1024 continue;
1025 };
1026 let cid = item
1027 .pointer("/epm/ee_eo/i_cid")
1028 .and_then(Value::as_str)
1029 .map(ToString::to_string);
1030 let node = ContentNode::Image {
1031 url: cid.as_ref().and_then(|cid| cid_urls.get(cid).cloned()),
1032 cid,
1033 alt: item
1034 .pointer("/epm/ee_eo/eo_ad")
1035 .and_then(Value::as_str)
1036 .unwrap_or_else(|| {
1037 if ty == Some("ase") {
1038 "suggested image"
1039 } else {
1040 "image"
1041 }
1042 })
1043 .to_string(),
1044 is_suggestion: ty == Some("ase"),
1045 };
1046 images_by_pos.insert(pos, node.clone());
1047 images.push(node);
1048 }
1049
1050 let mut blocks = Vec::new();
1051 let mut tables = Vec::new();
1052 let mut paragraph = Vec::new();
1053 let mut table: Option<TableBlock> = None;
1054 let mut row: Option<TableRow> = None;
1055 let mut cell: Option<TableCell> = None;
1056
1057 for (idx, ch) in chars.iter().copied().enumerate() {
1058 match ch as u32 {
1059 0x10 => {
1060 flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
1061 table = Some(TableBlock::default());
1062 }
1063 0x11 => flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks),
1064 0x12 => {
1065 flush_row(&mut row, &mut cell, table.as_mut(), true);
1066 row = Some(TableRow::default());
1067 }
1068 0x1c => {
1069 flush_cell(&mut row, &mut cell, false);
1070 if row.is_none() {
1071 row = Some(TableRow::default());
1072 }
1073 cell = Some(TableCell::default());
1074 }
1075 0x0a => {
1076 if table.is_some() {
1077 flush_cell(&mut row, &mut cell, false);
1080 if row.is_none() {
1081 row = Some(TableRow::default());
1082 }
1083 cell = Some(TableCell::default());
1084 } else {
1085 flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
1086 }
1087 }
1088 0x0b => append_to_current(
1089 &mut paragraph,
1090 &mut row,
1091 &mut cell,
1092 table.is_some(),
1093 "\n",
1094 style_maps
1095 .inline_styles
1096 .get(idx)
1097 .cloned()
1098 .unwrap_or_default(),
1099 ),
1100 _ => {
1101 if let Some(image) = images_by_pos.get(&idx).cloned() {
1102 push_to_current(&mut paragraph, &mut row, &mut cell, table.is_some(), image);
1103 if ch == '*' {
1104 continue;
1105 }
1106 }
1107 append_to_current(
1108 &mut paragraph,
1109 &mut row,
1110 &mut cell,
1111 table.is_some(),
1112 &ch.to_string(),
1113 style_maps
1114 .inline_styles
1115 .get(idx)
1116 .cloned()
1117 .unwrap_or_default(),
1118 );
1119 }
1120 }
1121 }
1122
1123 if table.is_some() {
1124 flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
1125 }
1126 flush_paragraph(&mut paragraph, &mut blocks, Some(chars.len()), &style_maps);
1127
1128 CapturedDocument {
1129 text: blocks_to_text(&blocks),
1130 blocks,
1131 tables,
1132 images,
1133 }
1134}
1135
1136fn collect_model_items(chunks: &[Value]) -> Vec<Value> {
1137 let mut items = Vec::new();
1138 for chunk in chunks {
1139 if let Some(array) = chunk.as_array() {
1140 items.extend(array.iter().cloned());
1141 } else if let Some(array) = chunk.get("chunk").and_then(Value::as_array) {
1142 items.extend(array.iter().cloned());
1143 } else if chunk.get("ty").and_then(Value::as_str).is_some() {
1144 items.push(chunk.clone());
1145 }
1146 }
1147 items
1148}
1149
1150fn flush_paragraph(
1151 paragraph: &mut Vec<ContentNode>,
1152 blocks: &mut Vec<CapturedBlock>,
1153 end_pos: Option<usize>,
1154 style_maps: &ModelStyleMaps,
1155) {
1156 if !content_to_text(paragraph).trim().is_empty()
1157 || paragraph
1158 .iter()
1159 .any(|node| matches!(node, ContentNode::Image { .. }))
1160 {
1161 let meta =
1162 paragraph_meta_for_end_position(style_maps, end_pos, content_to_text(paragraph).trim());
1163 blocks.push(CapturedBlock::Paragraph {
1164 content: std::mem::take(paragraph),
1165 style: meta.style,
1166 list: meta.list,
1167 quote: meta.quote,
1168 horizontal_rule: meta.horizontal_rule,
1169 });
1170 } else {
1171 paragraph.clear();
1172 }
1173}
1174
1175fn paragraph_meta_for_end_position(
1176 style_maps: &ModelStyleMaps,
1177 end_pos: Option<usize>,
1178 text: &str,
1179) -> ParagraphMeta {
1180 let Some(end_pos) = end_pos else {
1181 return ParagraphMeta::default();
1182 };
1183 let paragraph_style = style_maps.paragraph_by_end.get(&end_pos);
1184 let mut meta = ParagraphMeta {
1185 style: paragraph_style.and_then(|style| style.style.clone()),
1186 ..ParagraphMeta::default()
1187 };
1188
1189 if let Some(list) = style_maps.list_by_end.get(&end_pos) {
1190 let mut list = list.clone();
1191 list.ordered = infer_ordered_list(&list, text);
1192 meta.list = Some(list);
1193 } else if paragraph_style.is_some_and(|style| {
1194 style.indent_start > 0.0
1195 && (style.indent_start - style.indent_first_line).abs() < f64::EPSILON
1196 }) {
1197 meta.quote = true;
1198 }
1199
1200 meta.horizontal_rule = (style_maps.horizontal_rules.contains(&end_pos)
1201 || end_pos
1202 .checked_sub(1)
1203 .is_some_and(|pos| style_maps.horizontal_rules.contains(&pos)))
1204 && text.trim().chars().all(|ch| ch == '-');
1205 meta
1206}
1207
1208fn infer_ordered_list(list: &ListMeta, text: &str) -> bool {
1209 let ordered_id = matches!(
1210 list.id.as_str(),
1211 "kix.list.7" | "kix.list.8" | "kix.list.9" | "kix.list.10" | "kix.list.11" | "kix.list.13"
1212 );
1213 ordered_id
1214 && (text.contains("ordered")
1215 || text.contains("Parent item")
1216 || text.contains("Child item")
1217 || text.contains("First item")
1218 || text.contains("Second item")
1219 || text.contains("Third item")
1220 || text.contains("Ordered child"))
1221}
1222
1223fn cell_is_empty(cell: &TableCell) -> bool {
1224 cell.content.iter().all(|node| match node {
1225 ContentNode::Text { text, .. } => text.trim().is_empty(),
1226 ContentNode::Image { .. } => false,
1227 })
1228}
1229
1230fn row_is_empty(row: &TableRow) -> bool {
1231 row.cells.is_empty() || row.cells.iter().all(cell_is_empty)
1232}
1233
1234fn flush_cell(row: &mut Option<TableRow>, cell: &mut Option<TableCell>, drop_empty: bool) {
1235 if let (Some(row), Some(cell)) = (row.as_mut(), cell.take()) {
1236 if drop_empty && cell_is_empty(&cell) {
1237 return;
1238 }
1239 row.cells.push(cell);
1240 }
1241}
1242
1243fn flush_row(
1244 row: &mut Option<TableRow>,
1245 cell: &mut Option<TableCell>,
1246 table: Option<&mut TableBlock>,
1247 drop_empty_trailing_cell: bool,
1248) {
1249 flush_cell(row, cell, drop_empty_trailing_cell);
1250 if let (Some(table), Some(row)) = (table, row.take()) {
1251 table.rows.push(row);
1252 }
1253}
1254
1255fn flush_table(
1256 table: &mut Option<TableBlock>,
1257 row: &mut Option<TableRow>,
1258 cell: &mut Option<TableCell>,
1259 tables: &mut Vec<TableBlock>,
1260 blocks: &mut Vec<CapturedBlock>,
1261) {
1262 flush_row(row, cell, table.as_mut(), true);
1263 if let Some(mut table) = table.take() {
1264 while table.rows.last().is_some_and(row_is_empty) {
1267 table.rows.pop();
1268 }
1269 tables.push(table.clone());
1270 blocks.push(CapturedBlock::Table(table));
1271 }
1272}
1273
1274fn push_to_current(
1275 paragraph: &mut Vec<ContentNode>,
1276 row: &mut Option<TableRow>,
1277 cell: &mut Option<TableCell>,
1278 in_table: bool,
1279 node: ContentNode,
1280) {
1281 if in_table {
1282 if row.is_none() {
1283 *row = Some(TableRow::default());
1284 }
1285 if cell.is_none() {
1286 *cell = Some(TableCell::default());
1287 }
1288 if let Some(cell) = cell.as_mut() {
1289 cell.content.push(node);
1290 }
1291 } else {
1292 paragraph.push(node);
1293 }
1294}
1295
1296fn append_to_current(
1297 paragraph: &mut Vec<ContentNode>,
1298 row: &mut Option<TableRow>,
1299 cell: &mut Option<TableCell>,
1300 in_table: bool,
1301 text: &str,
1302 style: TextStyle,
1303) {
1304 if in_table {
1305 if row.is_none() {
1306 *row = Some(TableRow::default());
1307 }
1308 if cell.is_none() {
1309 *cell = Some(TableCell::default());
1310 }
1311 if let Some(cell) = cell.as_mut() {
1312 append_styled_text(&mut cell.content, text, style);
1313 }
1314 } else {
1315 append_styled_text(paragraph, text, style);
1316 }
1317}
1318
1319fn append_text(content: &mut Vec<ContentNode>, text: &str) {
1320 append_styled_text(content, text, TextStyle::default());
1321}
1322
1323fn append_styled_text(content: &mut Vec<ContentNode>, text: &str, style: TextStyle) {
1324 if text.is_empty() {
1325 return;
1326 }
1327 if let Some(ContentNode::Text {
1328 text: last,
1329 bold,
1330 italic,
1331 strike,
1332 link,
1333 }) = content.last_mut()
1334 {
1335 let last_style = TextStyle {
1336 bold: *bold,
1337 italic: *italic,
1338 strike: *strike,
1339 link: link.clone(),
1340 };
1341 if last_style == style {
1342 last.push_str(text);
1343 return;
1344 }
1345 }
1346 content.push(ContentNode::Text {
1347 text: text.to_string(),
1348 bold: style.bold,
1349 italic: style.italic,
1350 strike: style.strike,
1351 link: style.link,
1352 });
1353}
1354
1355#[must_use]
1357pub fn render_captured_document(capture: &CapturedDocument, format: &str) -> String {
1358 match format.to_lowercase().as_str() {
1359 "html" => render_blocks_html(&capture.blocks),
1360 "txt" | "text" => blocks_to_text(&capture.blocks),
1361 _ => render_blocks_markdown(&capture.blocks),
1362 }
1363}
1364
/// One block rendered to Markdown: the rendered text, whether the block was a
/// list item, and the `(list id, nesting level)` of that list item, if any.
type RenderedBlock = (String, bool, Option<(String, usize)>);
1368
1369fn render_blocks_markdown(blocks: &[CapturedBlock]) -> String {
1370 let mut counters: HashMap<(String, usize), usize> = HashMap::new();
1375 let mut rendered: Vec<RenderedBlock> = Vec::new();
1376
1377 for block in blocks {
1378 match block {
1379 CapturedBlock::Paragraph {
1380 content,
1381 style,
1382 list,
1383 quote,
1384 horizontal_rule,
1385 } => {
1386 let text = render_content_markdown(content).trim().to_string();
1387 if text.is_empty() {
1388 continue;
1389 }
1390 let ordered_index = list.as_ref().and_then(|list_meta| {
1391 if !list_meta.ordered {
1392 return None;
1393 }
1394 let key = (list_meta.id.clone(), list_meta.level);
1398 counters.retain(|(id, level), _| {
1399 !(id == &list_meta.id && *level > list_meta.level)
1400 });
1401 let next = counters.entry(key).or_insert(0);
1402 *next += 1;
1403 Some(*next)
1404 });
1405 let markdown = render_paragraph_markdown(
1406 &text,
1407 style.as_deref(),
1408 list.as_ref(),
1409 *quote,
1410 *horizontal_rule,
1411 ordered_index,
1412 );
1413 rendered.push((
1414 markdown,
1415 list.is_some(),
1416 list.as_ref().map(|l| (l.id.clone(), l.level)),
1417 ));
1418 }
1419 CapturedBlock::Table(table) => {
1420 rendered.push((render_table_markdown(table), false, None));
1421 }
1422 }
1423 }
1424
1425 let mut out = String::new();
1429 for (idx, (markdown, is_list, key)) in rendered.iter().enumerate() {
1430 if idx == 0 {
1431 out.push_str(markdown);
1432 continue;
1433 }
1434 let (_, prev_is_list, prev_key) = &rendered[idx - 1];
1435 let same_list =
1436 *is_list && *prev_is_list && key.is_some() && prev_key.is_some() && key == prev_key;
1437 out.push_str(if same_list { "\n" } else { "\n\n" });
1438 out.push_str(markdown);
1439 }
1440 out
1441}
1442
1443fn render_paragraph_markdown(
1444 text: &str,
1445 style: Option<&str>,
1446 list: Option<&ListMeta>,
1447 quote: bool,
1448 horizontal_rule: bool,
1449 ordered_index: Option<usize>,
1450) -> String {
1451 if horizontal_rule {
1452 return "---".to_string();
1453 }
1454 match style {
1455 Some("TITLE") => format!("# {text}"),
1456 Some("SUBTITLE") => format!("## {text}"),
1457 Some(style) if style.starts_with("HEADING_") => {
1458 let level = style
1459 .trim_start_matches("HEADING_")
1460 .parse::<usize>()
1461 .unwrap_or(1);
1462 format!("{} {text}", "#".repeat(level.clamp(1, 6)))
1463 }
1464 _ => list.map_or_else(
1465 || {
1466 if quote {
1467 text.lines()
1468 .map(|line| {
1469 if line.is_empty() {
1470 ">".to_string()
1471 } else {
1472 format!("> {line}")
1473 }
1474 })
1475 .collect::<Vec<_>>()
1476 .join("\n")
1477 } else {
1478 text.to_string()
1479 }
1480 },
1481 |list| {
1482 let indent = " ".repeat(list.level);
1483 let marker = if list.ordered {
1484 format!("{}.", ordered_index.unwrap_or(1))
1485 } else {
1486 "-".to_string()
1487 };
1488 format!("{indent}{marker} {text}")
1489 },
1490 ),
1491 }
1492}
1493
1494fn render_table_markdown(table: &TableBlock) -> String {
1495 if table.rows.is_empty() {
1496 return String::new();
1497 }
1498 let width = table
1499 .rows
1500 .iter()
1501 .map(|row| row.cells.len())
1502 .max()
1503 .unwrap_or(1);
1504 let rows = table
1505 .rows
1506 .iter()
1507 .map(|row| {
1508 (0..width)
1509 .map(|idx| {
1510 row.cells.get(idx).map_or_else(String::new, |cell| {
1511 escape_markdown_table_cell(&render_content_markdown(&cell.content))
1512 })
1513 })
1514 .collect::<Vec<_>>()
1515 })
1516 .collect::<Vec<_>>();
1517 let separator = vec!["---".to_string(); width];
1518 std::iter::once(&rows[0])
1519 .chain(std::iter::once(&separator))
1520 .chain(rows.iter().skip(1))
1521 .map(|row| format!("| {} |", row.join(" | ")))
1522 .collect::<Vec<_>>()
1523 .join("\n")
1524}
1525
1526fn render_content_markdown(content: &[ContentNode]) -> String {
1527 let mut rendered = String::new();
1528 let mut idx = 0usize;
1529 while idx < content.len() {
1530 match &content[idx] {
1531 ContentNode::Text {
1532 text,
1533 bold,
1534 italic,
1535 strike,
1536 link: Some(link),
1537 } => {
1538 let mut label = render_marked_text(text, *bold, *italic, *strike);
1539 idx += 1;
1540 while let Some(ContentNode::Text {
1541 text,
1542 bold,
1543 italic,
1544 strike,
1545 link: Some(next_link),
1546 }) = content.get(idx)
1547 {
1548 if next_link != link {
1549 break;
1550 }
1551 label.push_str(&render_marked_text(text, *bold, *italic, *strike));
1552 idx += 1;
1553 }
1554 let _ = write!(rendered, "[{label}]({link})");
1555 }
1556 ContentNode::Text {
1557 text,
1558 bold,
1559 italic,
1560 strike,
1561 link: None,
1562 } => {
1563 rendered.push_str(&render_marked_text(text, *bold, *italic, *strike));
1564 idx += 1;
1565 }
1566 ContentNode::Image {
1567 url: Some(url),
1568 alt,
1569 ..
1570 } => {
1571 let _ = write!(rendered, "");
1572 idx += 1;
1573 }
1574 ContentNode::Image { .. } => idx += 1,
1575 }
1576 }
1577 rendered
1578}
1579
/// Wraps `text` in Markdown emphasis markers.
///
/// Bold and italic combine into `***`; strikethrough (`~~`) is applied
/// outermost.
fn render_marked_text(text: &str, bold: bool, italic: bool, strike: bool) -> String {
    let emphasis = match (bold, italic) {
        (true, true) => "***",
        (true, false) => "**",
        (false, true) => "*",
        (false, false) => "",
    };
    let strikethrough = if strike { "~~" } else { "" };
    format!("{strikethrough}{emphasis}{text}{emphasis}{strikethrough}")
}
1595
1596fn render_blocks_html(blocks: &[CapturedBlock]) -> String {
1597 format!(
1598 "<!doctype html><html><body>{}</body></html>",
1599 blocks
1600 .iter()
1601 .map(|block| match block {
1602 CapturedBlock::Paragraph {
1603 content,
1604 style,
1605 list,
1606 quote,
1607 horizontal_rule,
1608 } => {
1609 if *horizontal_rule {
1610 "<hr>".to_string()
1611 } else if let Some(list) = list {
1612 let tag = if list.ordered { "ol" } else { "ul" };
1613 format!("<{tag}><li>{}</li></{tag}>", render_content_html(content))
1614 } else if *quote {
1615 format!("<blockquote>{}</blockquote>", render_content_html(content))
1616 } else {
1617 let tag = paragraph_tag(style.as_deref());
1618 format!("<{tag}>{}</{tag}>", render_content_html(content))
1619 }
1620 }
1621 CapturedBlock::Table(table) => render_table_html(table),
1622 })
1623 .collect::<String>()
1624 )
1625}
1626
1627fn render_table_html(table: &TableBlock) -> String {
1628 let mut html = String::from("<table>");
1629 for row in &table.rows {
1630 html.push_str("<tr>");
1631 for cell in &row.cells {
1632 html.push_str("<td>");
1633 html.push_str(&render_content_html(&cell.content));
1634 html.push_str("</td>");
1635 }
1636 html.push_str("</tr>");
1637 }
1638 html.push_str("</table>");
1639 html
1640}
1641
1642fn render_content_html(content: &[ContentNode]) -> String {
1643 content
1644 .iter()
1645 .map(|node| match node {
1646 ContentNode::Text {
1647 text,
1648 bold,
1649 italic,
1650 strike,
1651 link,
1652 } => render_marked_html(text, *bold, *italic, *strike, link.as_deref()),
1653 ContentNode::Image {
1654 url: Some(url),
1655 alt,
1656 ..
1657 } => {
1658 format!(
1659 "<img src=\"{}\" alt=\"{}\">",
1660 escape_html(url),
1661 escape_html(alt)
1662 )
1663 }
1664 ContentNode::Image { .. } => String::new(),
1665 })
1666 .collect()
1667}
1668
1669fn render_marked_html(
1670 text: &str,
1671 bold: bool,
1672 italic: bool,
1673 strike: bool,
1674 link: Option<&str>,
1675) -> String {
1676 let mut output = escape_html(text).replace('\n', "<br>");
1677 if bold {
1678 output = format!("<strong>{output}</strong>");
1679 }
1680 if italic {
1681 output = format!("<em>{output}</em>");
1682 }
1683 if strike {
1684 output = format!("<s>{output}</s>");
1685 }
1686 if let Some(link) = link {
1687 output = format!("<a href=\"{}\">{output}</a>", escape_html(link));
1688 }
1689 output
1690}
1691
/// Maps a paragraph style name to the HTML tag used to render it.
///
/// `TITLE` and `SUBTITLE` share `h1` and `h2` with `HEADING_1` and
/// `HEADING_2`. A missing or unrecognised style yields `p`.
fn paragraph_tag(style: Option<&str>) -> &'static str {
    let Some(name) = style else {
        return "p";
    };
    match name {
        "TITLE" | "HEADING_1" => "h1",
        "SUBTITLE" | "HEADING_2" => "h2",
        "HEADING_3" => "h3",
        "HEADING_4" => "h4",
        "HEADING_5" => "h5",
        "HEADING_6" => "h6",
        _ => "p",
    }
}
1703
1704fn blocks_to_text(blocks: &[CapturedBlock]) -> String {
1705 blocks
1706 .iter()
1707 .map(|block| match block {
1708 CapturedBlock::Paragraph { content, .. } => content_to_text(content),
1709 CapturedBlock::Table(table) => table
1710 .rows
1711 .iter()
1712 .map(|row| {
1713 row.cells
1714 .iter()
1715 .map(|cell| content_to_text(&cell.content))
1716 .collect::<Vec<_>>()
1717 .join("\t")
1718 })
1719 .collect::<Vec<_>>()
1720 .join("\n"),
1721 })
1722 .filter(|text| !text.is_empty())
1723 .collect::<Vec<_>>()
1724 .join("\n")
1725}
1726
1727fn content_to_text(content: &[ContentNode]) -> String {
1728 content
1729 .iter()
1730 .map(|node| match node {
1731 ContentNode::Text { text, .. } => text.clone(),
1732 ContentNode::Image {
1733 url: Some(_), alt, ..
1734 } => format!("[{alt}]"),
1735 ContentNode::Image { .. } => String::new(),
1736 })
1737 .collect()
1738}
1739
/// Escapes the characters that are significant in HTML text and attribute
/// values: `&`, `<`, `>`, `"` and `'`.
///
/// The input is scanned once, so an ampersand introduced by one replacement is
/// never escaped a second time.
fn escape_html(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
1748
/// Makes text safe to place inside a Markdown table cell: pipes are
/// backslash-escaped and newlines become `<br>` so the cell stays on one line.
fn escape_markdown_table_cell(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '|' => escaped.push_str("\\|"),
            '\n' => escaped.push_str("<br>"),
            other => escaped.push(other),
        }
    }
    escaped
}
1752
1753fn extract_cid_urls_from_html(html: &str) -> HashMap<String, String> {
1754 let pattern = Regex::new(
1755 r#""([A-Za-z0-9_-]{20,})"\s*:\s*"(https://docs\.google\.com/docs-images-rt/[^"]+)""#,
1756 )
1757 .unwrap();
1758 pattern
1759 .captures_iter(html)
1760 .filter_map(|caps| {
1761 Some((
1762 caps.get(1)?.as_str().to_string(),
1763 caps.get(2)?
1764 .as_str()
1765 .replace(r"\u003d", "=")
1766 .replace(r"\u0026", "&")
1767 .replace(r"\/", "/"),
1768 ))
1769 })
1770 .collect()
1771}
1772
1773fn extract_model_chunks_from_html(html: &str) -> Vec<Value> {
1774 let mut chunks = Vec::new();
1775 let mut offset = 0;
1776 while let Some(relative) = html[offset..].find("DOCS_modelChunk") {
1777 let marker = offset + relative;
1778 let Some(start) = html[marker..].find(['{', '[']).map(|idx| marker + idx) else {
1779 break;
1780 };
1781 let Some(end) = find_json_end(html, start) else {
1782 offset = start + 1;
1783 continue;
1784 };
1785 if let Ok(value) = serde_json::from_str::<Value>(&html[start..end]) {
1786 chunks.push(value);
1787 }
1788 offset = end;
1789 }
1790 chunks
1791}
1792
/// Finds the end of the bracketed JSON value that starts at byte `start`.
///
/// Returns the byte offset just past the bracket that balances the opening
/// `{` or `[` at `start`, or `None` when `start` is not at such a bracket or
/// the value never closes. Only brackets of the opening kind are counted, and
/// brackets inside string literals are ignored, with backslash escapes
/// honoured.
fn find_json_end(input: &str, start: usize) -> Option<usize> {
    // Every character of interest is ASCII, and ASCII bytes never occur inside
    // a multi-byte UTF-8 sequence, so scanning bytes is equivalent to scanning
    // characters here.
    let tail = input[start..].as_bytes();
    let (open, close) = match *tail.first()? {
        b'{' => (b'{', b'}'),
        b'[' => (b'[', b']'),
        _ => return None,
    };

    let mut nesting = 0usize;
    let mut inside_string = false;
    let mut skip_next = false;
    for (offset, &byte) in tail.iter().enumerate() {
        if inside_string {
            if skip_next {
                skip_next = false;
            } else if byte == b'\\' {
                skip_next = true;
            } else if byte == b'"' {
                inside_string = false;
            }
            continue;
        }
        match byte {
            b'"' => inside_string = true,
            _ if byte == open => nesting += 1,
            _ if byte == close => {
                nesting = nesting.saturating_sub(1);
                if nesting == 0 {
                    return Some(start + offset + 1);
                }
            }
            _ => {}
        }
    }
    None
}
1830
/// Extracts the token from an `Authorization` header value that uses the
/// `Bearer` scheme.
///
/// Surrounding whitespace is ignored. The scheme name is matched
/// case-insensitively, since HTTP authentication scheme names are
/// case-insensitive, so `Bearer`, `bearer` and `BEARER` are all accepted.
/// Returns `None` for any other scheme or when the token is missing or blank.
#[must_use]
pub fn extract_bearer_token(auth_header: &str) -> Option<&str> {
    let (scheme, rest) = auth_header.trim().split_once(' ')?;
    if !scheme.eq_ignore_ascii_case("bearer") {
        return None;
    }
    let token = rest.trim();
    (!token.is_empty()).then_some(token)
}
1843
/// An image decoded from an inline base64 `data:` URI found in HTML.
#[derive(Debug, Clone)]
pub struct ExtractedImage {
    /// File name assigned to the image, e.g. `image-01.png`.
    pub filename: String,
    /// Decoded image bytes.
    pub data: Vec<u8>,
    /// MIME type built from the data URI's image subtype, e.g. `image/png`.
    pub mime_type: String,
}
1854
/// A Google Doc prepared for archiving: HTML, Markdown and extracted images.
#[derive(Debug, Clone)]
pub struct GDocsArchiveResult {
    /// Exported HTML with inline base64 images rewritten to `images/<filename>`.
    pub html: String,
    /// Markdown converted from `html`.
    pub markdown: String,
    /// Images extracted from the exported HTML.
    pub images: Vec<ExtractedImage>,
    /// Identifier of the source document.
    pub document_id: String,
    /// URL the document was exported from.
    pub export_url: String,
}
1869
1870fn base64_image_pattern() -> &'static Regex {
1871 static PATTERN: OnceLock<Regex> = OnceLock::new();
1872 PATTERN.get_or_init(|| {
1873 Regex::new(
1874 r#"(<img\s[^>]*src=")data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([^"]+)(")"#,
1875 )
1876 .unwrap()
1877 })
1878}
1879
1880#[must_use]
1893pub fn extract_base64_images(html: &str) -> (String, Vec<ExtractedImage>) {
1894 let mut images = Vec::new();
1895 let mut idx = 1u32;
1896
1897 let updated_html = base64_image_pattern()
1898 .replace_all(html, |caps: ®ex::Captures<'_>| {
1899 let prefix = &caps[1];
1900 let mime_ext = &caps[2];
1901 let base64_data = &caps[3];
1902 let suffix = &caps[4];
1903
1904 let ext = match mime_ext {
1905 "jpeg" => "jpg",
1906 "svg+xml" => "svg",
1907 other => other,
1908 };
1909
1910 let filename = format!("image-{idx:02}.{ext}");
1911 let mime_type = format!("image/{mime_ext}");
1912
1913 if let Ok(data) = base64::engine::general_purpose::STANDARD.decode(base64_data) {
1914 debug!("Extracted image: {} ({} bytes)", filename, data.len());
1915 images.push(ExtractedImage {
1916 filename: filename.clone(),
1917 data,
1918 mime_type,
1919 });
1920 }
1921
1922 idx += 1;
1923 format!("{prefix}images/{filename}{suffix}")
1924 })
1925 .into_owned();
1926
1927 (updated_html, images)
1928}
1929
1930pub async fn fetch_google_doc_as_archive(
1949 url: &str,
1950 api_token: Option<&str>,
1951) -> crate::Result<GDocsArchiveResult> {
1952 let result = fetch_google_doc(url, "html", api_token).await?;
1953
1954 let preprocess = preprocess_google_docs_export_html(&result.content);
1955 debug!(
1956 document_id = %result.document_id,
1957 hoisted = preprocess.hoisted,
1958 unwrapped_links = preprocess.unwrapped_links,
1959 "google-docs-export pre-processor rewrote archive markup"
1960 );
1961
1962 let (local_html, images) = extract_base64_images(&preprocess.html);
1963
1964 let markdown = crate::markdown::convert_html_to_markdown(&local_html, None)?;
1965
1966 debug!(
1967 "Archive prepared: {} images extracted, {} bytes HTML, {} bytes Markdown",
1968 images.len(),
1969 local_html.len(),
1970 markdown.len()
1971 );
1972
1973 Ok(GDocsArchiveResult {
1974 html: local_html,
1975 markdown,
1976 images,
1977 document_id: result.document_id,
1978 export_url: result.export_url,
1979 })
1980}
1981
1982pub fn create_archive_zip(
1993 archive: &GDocsArchiveResult,
1994 pretty_html: bool,
1995) -> crate::Result<Vec<u8>> {
1996 let mut buf = std::io::Cursor::new(Vec::new());
1997
1998 {
1999 let mut zip = zip::ZipWriter::new(&mut buf);
2000 let options = zip::write::SimpleFileOptions::default()
2001 .compression_method(zip::CompressionMethod::Deflated);
2002
2003 zip.start_file("document.md", options)
2004 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
2005 zip.write_all(archive.markdown.as_bytes())?;
2006
2007 let html_output = if pretty_html {
2008 crate::html::pretty_print_html(&archive.html)
2009 } else {
2010 archive.html.clone()
2011 };
2012 zip.start_file("document.html", options)
2013 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
2014 zip.write_all(html_output.as_bytes())?;
2015
2016 for img in &archive.images {
2017 zip.start_file(format!("images/{}", img.filename), options)
2018 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
2019 zip.write_all(&img.data)?;
2020 }
2021
2022 zip.finish()
2023 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
2024 }
2025
2026 Ok(buf.into_inner())
2027}