1use async_tungstenite::tokio::{connect_async, ConnectStream};
32use async_tungstenite::tungstenite::Message;
33use async_tungstenite::WebSocketStream;
34use base64::Engine;
35use futures::{SinkExt, StreamExt};
36use regex::Regex;
37use serde_json::Value;
38use std::collections::HashMap;
39use std::fmt::Write as _;
40use std::hash::BuildHasher;
41use std::io::Write;
42use std::process::Stdio;
43use std::sync::OnceLock;
44use std::time::{Duration, Instant};
45use tokio::io::{AsyncBufReadExt, BufReader};
46use tokio::process::{Child, Command};
47use tracing::{debug, info, warn};
48
49use crate::WebCaptureError;
50
/// Base URL for document-scoped Google Docs pages; export and edit URLs are built from it.
const GDOCS_EXPORT_BASE: &str = "https://docs.google.com/document/d";
/// Base URL of the Google Docs REST API `documents` resource.
const GDOCS_API_BASE: &str = "https://docs.googleapis.com/v1/documents";
/// Desktop Chrome user-agent string; passed to the headless browser via `--user-agent`.
const GDOCS_USER_AGENT: &str =
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
/// Upper bound on how long the editor page is polled for captured model chunks.
const GDOCS_EDITOR_MODEL_WAIT: Duration = Duration::from_secs(30);
/// Upper bound on waiting for Chrome to print its DevTools websocket URL on stderr.
const GDOCS_BROWSER_LAUNCH_TIMEOUT: Duration = Duration::from_secs(20);

/// Websocket stream type used for the Chrome DevTools Protocol (CDP) connection.
type CdpWebSocket = WebSocketStream<ConnectStream>;
59
/// JavaScript registered with `Page.addScriptToEvaluateOnNewDocument` before navigation.
///
/// It creates `window.__captured_chunks` and installs a non-configurable accessor
/// property `DOCS_modelChunk` on `window`. The setter records every assigned value
/// (arrays are flattened item by item, values are deep-copied through JSON when
/// possible) and wraps assigned arrays so that later `push` calls are recorded too.
/// The getter returns the most recently assigned value.
///
/// NOTE(review): when an array is assigned, the setter calls `captureChunk(value)` and
/// then `wrapChunkArray(value)`, which iterates the same items again — the initial
/// items look like they are recorded twice. Confirm that downstream parsing tolerates
/// duplicates before changing this.
const GDOCS_MODEL_CAPTURE_INIT_SCRIPT: &str = r"
window.__captured_chunks = [];
const captureChunk = (value) => {
 if (!value) {
 return;
 }
 if (Array.isArray(value)) {
 for (const item of value) {
 captureChunk(item);
 }
 return;
 }
 try {
 window.__captured_chunks.push(JSON.parse(JSON.stringify(value)));
 } catch {
 window.__captured_chunks.push(value);
 }
};
const wrapChunkArray = (value) => {
 if (!Array.isArray(value) || value.__webCaptureDocsModelWrapped) {
 return value;
 }
 const originalPush = value.push;
 Object.defineProperty(value, '__webCaptureDocsModelWrapped', {
 value: true,
 enumerable: false,
 });
 Object.defineProperty(value, 'push', {
 value(...items) {
 for (const item of items) {
 captureChunk(item);
 }
 return originalPush.apply(this, items);
 },
 writable: true,
 configurable: true,
 });
 for (const item of value) {
 captureChunk(item);
 }
 return value;
};
Object.defineProperty(window, 'DOCS_modelChunk', {
 set(value) {
 captureChunk(value);
 window.__DOCS_modelChunk_latest = wrapChunkArray(value);
 },
 get() {
 return window.__DOCS_modelChunk_latest;
 },
 configurable: false,
});
";
113
/// Source of a JavaScript arrow function that is evaluated in the page (the caller
/// wraps it as `(<script>)()`).
///
/// It returns `{ chunks, cidUrlMap }`:
/// - `chunks` is a copy of `window.__captured_chunks`; when that is empty and
///   `window.DOCS_modelChunk` is truthy, the latter is pushed as the only entry.
/// - `cidUrlMap` maps keys of 20+ characters from `[A-Za-z0-9_-]` to
///   `https://docs.google.com/docs-images-rt/...` URLs found in inline `<script>`
///   text, with `\u003d`, `\u0026` and `\/` escapes replaced by `=`, `&` and `/`.
const GDOCS_MODEL_EXTRACT_SCRIPT: &str = r#"() => {
 const chunks = [...(window.__captured_chunks || [])];
 if (
 window.DOCS_modelChunk &&
 chunks.length === 0 &&
 !chunks.includes(window.DOCS_modelChunk)
 ) {
 chunks.push(window.DOCS_modelChunk);
 }
 const cidUrlMap = {};
 const scripts = document.querySelectorAll('script');
 for (const script of scripts) {
 const text = script.textContent || '';
 if (!text.includes('docs-images-rt')) {
 continue;
 }
 const regex =
 /"([A-Za-z0-9_-]{20,})"\s*:\s*"(https:\/\/docs\.google\.com\/docs-images-rt\/[^"]+)"/g;
 let match;
 while ((match = regex.exec(text)) !== null) {
 cidUrlMap[match[1]] = match[2]
 .replace(/\\u003d/g, '=')
 .replace(/\\u0026/g, '&')
 .replace(/\\\//g, '/');
 }
 }
 return { chunks, cidUrlMap };
}"#;
142
143fn gdocs_url_pattern() -> &'static Regex {
144 static PATTERN: OnceLock<Regex> = OnceLock::new();
145 PATTERN.get_or_init(|| Regex::new(r"docs\.google\.com/document/d/([a-zA-Z0-9_-]+)").unwrap())
146}
147
/// Result of fetching a Google Doc through the public export endpoint.
#[derive(Debug, Clone)]
pub struct GDocsResult {
    /// Document body as text (the export response, or Markdown rendered from it).
    pub content: String,
    /// Format label of `content`: the requested export format, or `"markdown"`.
    pub format: String,
    /// Document id extracted from the source URL.
    pub document_id: String,
    /// Export URL the content was downloaded from.
    pub export_url: String,
}
160
/// Strategy used to capture a Google Doc, as chosen by `select_capture_method`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GDocsCaptureMethod {
    /// Selected for the `"browser"` capture mode.
    BrowserModel,
    /// Selected for the `"api"` capture mode when no API token is supplied.
    PublicExport,
    /// Selected for the `"api"` capture mode when an API token is supplied.
    DocsApi,
}
171
/// A Google Doc rendered into several output formats at once.
#[derive(Debug, Clone)]
pub struct GDocsRenderedResult {
    /// Markdown rendering of the document.
    pub markdown: String,
    /// HTML rendering of the document.
    pub html: String,
    /// Plain-text rendering of the document.
    pub text: String,
    /// Document id extracted from the source URL.
    pub document_id: String,
    /// URL the document was captured from (the Docs API URL or the editor URL,
    /// depending on the capture path).
    pub export_url: String,
    /// Images that carry a resolved remote URL; left empty by the Docs API path.
    pub remote_images: Vec<RemoteImage>,
}
188
/// An image referenced by the document through a remote URL.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteImage {
    /// Remote URL of the image.
    pub url: String,
    /// Alternative text of the image.
    pub alt: String,
}
197
/// Data pulled out of the editor page by the model extraction script.
#[derive(Debug, Clone)]
struct BrowserModelData {
    /// Entries of the script result's `chunks` array (empty when absent).
    chunks: Vec<Value>,
    /// String-valued entries of the script result's `cidUrlMap` object.
    cid_urls: HashMap<String, String>,
}
203
/// Parsed representation of a captured Google Doc.
///
/// NOTE(review): the code that fills this struct is not visible in this part of the
/// file; the field notes below follow the field types and their visible readers.
#[derive(Debug, Clone, Default)]
pub struct CapturedDocument {
    /// Paragraph and table blocks of the document.
    pub blocks: Vec<CapturedBlock>,
    /// Tables of the document (presumably also present in `blocks` — confirm).
    pub tables: Vec<TableBlock>,
    /// Image nodes of the document; `remote_images_from_capture` reads the ones with a URL.
    pub images: Vec<ContentNode>,
    /// Text of the document.
    pub text: String,
}
216
/// A block-level element of a captured document.
#[derive(Debug, Clone)]
pub enum CapturedBlock {
    /// A paragraph with inline content and paragraph-level metadata.
    Paragraph {
        /// Inline text and image nodes of the paragraph.
        content: Vec<ContentNode>,
        /// Named paragraph style, when known (the Docs API path stores
        /// `paragraphStyle.namedStyleType` here).
        style: Option<String>,
        /// List membership of the paragraph, if any.
        list: Option<ListMeta>,
        /// Whether the paragraph is a quotation.
        quote: bool,
        /// Whether the paragraph stands for a horizontal rule.
        horizontal_rule: bool,
    },
    /// A table.
    Table(TableBlock),
}
236
/// A table made of rows.
#[derive(Debug, Clone, Default)]
pub struct TableBlock {
    /// Rows of the table.
    pub rows: Vec<TableRow>,
}
243
/// A single table row.
#[derive(Debug, Clone, Default)]
pub struct TableRow {
    /// Cells of the row.
    pub cells: Vec<TableCell>,
}
250
/// A single table cell.
#[derive(Debug, Clone, Default)]
pub struct TableCell {
    /// Inline content of the cell.
    pub content: Vec<ContentNode>,
}
257
/// An inline content node: a styled text run or an image.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentNode {
    /// A run of text with its inline formatting.
    Text {
        /// The text of the run.
        text: String,
        /// Whether the run is bold.
        bold: bool,
        /// Whether the run is italic.
        italic: bool,
        /// Whether the run is struck through.
        strike: bool,
        /// Link target of the run, if it is a link.
        link: Option<String>,
    },
    /// An inline image.
    Image {
        /// Content id of the image, if known (presumably a key of the cid-to-URL map — confirm).
        cid: Option<String>,
        /// Resolved remote URL of the image, if known.
        url: Option<String>,
        /// Alternative text of the image.
        alt: String,
        /// Whether the image belongs to a suggestion (semantics defined by the parser — confirm).
        is_suggestion: bool,
    },
}
286
/// Inline formatting flags for a span of text; mirrors the styling fields of
/// `ContentNode::Text`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct TextStyle {
    /// Bold formatting.
    bold: bool,
    /// Italic formatting.
    italic: bool,
    /// Strike-through formatting.
    strike: bool,
    /// Link target, if the span is a link.
    link: Option<String>,
}
294
/// Paragraph-level metadata; mirrors the non-content fields of
/// `CapturedBlock::Paragraph`.
#[derive(Debug, Clone, Default)]
struct ParagraphMeta {
    /// Named paragraph style, when known.
    style: Option<String>,
    /// List membership, if any.
    list: Option<ListMeta>,
    /// Whether the paragraph is a quotation.
    quote: bool,
    /// Whether the paragraph stands for a horizontal rule.
    horizontal_rule: bool,
}
302
/// List membership information for a paragraph.
#[derive(Debug, Clone)]
pub struct ListMeta {
    /// Identifier of the list the paragraph belongs to.
    pub id: String,
    /// Nesting level within the list (presumably zero-based — confirm against the parser).
    pub level: usize,
    /// Whether the list is ordered (numbered) rather than bulleted.
    pub ordered: bool,
}
312
/// Paragraph style details read from the editor model.
///
/// NOTE(review): the units of the indent fields are not established in this part of
/// the file — confirm against the model parser.
#[derive(Debug, Clone)]
struct ParagraphStyle {
    /// Named paragraph style, when known.
    style: Option<String>,
    /// Start indent of the paragraph.
    indent_start: f64,
    /// First-line indent of the paragraph.
    indent_first_line: f64,
}
319
/// Style lookup tables built while parsing the editor model.
///
/// NOTE(review): the `usize` keys look like character offsets into the model text
/// (the `_by_end` maps presumably keyed by a paragraph's end offset) — confirm
/// against the parser, which is not visible in this part of the file.
#[derive(Debug, Clone, Default)]
struct ModelStyleMaps {
    /// Inline text styles (presumably one entry per character position — confirm).
    inline_styles: Vec<TextStyle>,
    /// Paragraph styles keyed by position.
    paragraph_by_end: HashMap<usize, ParagraphStyle>,
    /// List metadata keyed by position.
    list_by_end: HashMap<usize, ListMeta>,
    /// Positions that are horizontal rules.
    horizontal_rules: std::collections::HashSet<usize>,
}
327
328#[must_use]
330pub fn is_google_docs_url(url: &str) -> bool {
331 gdocs_url_pattern().is_match(url)
332}
333
334#[must_use]
338pub fn extract_document_id(url: &str) -> Option<String> {
339 gdocs_url_pattern()
340 .captures(url)
341 .and_then(|caps| caps.get(1))
342 .map(|m| m.as_str().to_string())
343}
344
345#[must_use]
352pub fn build_export_url(document_id: &str, format: &str) -> String {
353 let export_format = match format {
354 "html" | "txt" | "md" | "pdf" | "docx" | "epub" | "zip" => format,
355 _ => "html",
356 };
357 format!("{GDOCS_EXPORT_BASE}/{document_id}/export?format={export_format}")
358}
359
360#[must_use]
362pub fn build_edit_url(document_id: &str) -> String {
363 format!("{GDOCS_EXPORT_BASE}/{document_id}/edit")
364}
365
366#[must_use]
368pub fn build_docs_api_url(document_id: &str) -> String {
369 format!("{GDOCS_API_BASE}/{document_id}")
370}
371
372pub fn select_capture_method(
378 capture: &str,
379 api_token: Option<&str>,
380) -> crate::Result<GDocsCaptureMethod> {
381 match capture.to_lowercase().as_str() {
382 "browser" => Ok(GDocsCaptureMethod::BrowserModel),
383 "api" if api_token.is_some() => Ok(GDocsCaptureMethod::DocsApi),
384 "api" => Ok(GDocsCaptureMethod::PublicExport),
385 other => Err(WebCaptureError::InvalidUrl(format!(
386 "Unsupported Google Docs capture method \"{other}\". Use \"browser\" or \"api\"."
387 ))),
388 }
389}
390
391pub async fn fetch_google_doc(
406 url: &str,
407 format: &str,
408 api_token: Option<&str>,
409) -> crate::Result<GDocsResult> {
410 let document_id = extract_document_id(url).ok_or_else(|| {
411 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
412 })?;
413
414 let export_url = build_export_url(&document_id, format);
415 debug!(
416 document_id = %document_id,
417 format = %format,
418 export_url = %export_url,
419 has_api_token = api_token.is_some(),
420 "fetching Google Doc via public export"
421 );
422
423 let mut request = reqwest::Client::new()
424 .get(&export_url)
425 .header(
426 "User-Agent",
427 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
428 )
429 .header("Accept-Charset", "utf-8")
430 .header("Accept-Language", "en-US,en;q=0.9");
431
432 if let Some(token) = api_token {
433 request = request.header("Authorization", format!("Bearer {token}"));
434 }
435
436 let response = request
437 .send()
438 .await
439 .map_err(|e| WebCaptureError::FetchError(format!("Failed to fetch Google Doc: {e}")))?;
440 debug!(
441 document_id = %document_id,
442 status = response.status().as_u16(),
443 success = response.status().is_success(),
444 content_type = response
445 .headers()
446 .get(reqwest::header::CONTENT_TYPE)
447 .and_then(|value| value.to_str().ok())
448 .unwrap_or(""),
449 "received Google Docs public export response"
450 );
451
452 if !response.status().is_success() {
453 return Err(WebCaptureError::FetchError(format!(
454 "Failed to fetch Google Doc ({} {}): {}",
455 response.status().as_u16(),
456 response.status().canonical_reason().unwrap_or("Unknown"),
457 export_url
458 )));
459 }
460
461 let raw_content = response.text().await.map_err(|e| {
462 WebCaptureError::FetchError(format!("Failed to read Google Doc response: {e}"))
463 })?;
464 debug!(
465 document_id = %document_id,
466 bytes = raw_content.len(),
467 "read Google Docs public export body"
468 );
469
470 let content = match format {
472 "html" | "txt" | "md" => crate::html::decode_html_entities(&raw_content),
473 _ => raw_content,
474 };
475
476 Ok(GDocsResult {
477 content,
478 format: format.to_string(),
479 document_id,
480 export_url,
481 })
482}
483
484pub async fn fetch_google_doc_as_markdown(
498 url: &str,
499 api_token: Option<&str>,
500) -> crate::Result<GDocsResult> {
501 let result = fetch_google_doc(url, "html", api_token).await?;
502
503 let preprocess = preprocess_google_docs_export_html(&result.content);
504 debug!(
505 document_id = %result.document_id,
506 hoisted = preprocess.hoisted,
507 unwrapped_links = preprocess.unwrapped_links,
508 "google-docs-export pre-processor rewrote markup"
509 );
510 let markdown =
511 crate::markdown::convert_html_to_markdown(&preprocess.html, Some(&result.export_url))?;
512 debug!(
513 document_id = %result.document_id,
514 bytes = markdown.len(),
515 "rendered Google Docs public export markdown"
516 );
517
518 Ok(GDocsResult {
519 content: markdown,
520 format: "markdown".to_string(),
521 document_id: result.document_id,
522 export_url: result.export_url,
523 })
524}
525
/// Output of `preprocess_google_docs_export_html`.
#[derive(Debug, Clone)]
pub struct GDocsExportPreprocessResult {
    /// The rewritten HTML.
    pub html: String,
    /// Number of styled `<span>` elements replaced by semantic tags.
    pub hoisted: usize,
    /// Number of Google redirect links rewritten to their target URL.
    pub unwrapped_links: usize,
}
539
540#[must_use]
548pub fn preprocess_google_docs_export_html(html: &str) -> GDocsExportPreprocessResult {
549 let mut hoisted: usize = 0;
550 let mut unwrapped_links: usize = 0;
551 let class_styles = extract_css_class_styles(html);
552
553 let mut out = hoist_inline_style_spans(html, &mut hoisted);
554 out = hoist_class_style_spans(&out, &class_styles, &mut hoisted);
555 out = convert_class_indented_blockquotes(&out, &class_styles);
556 out = strip_google_docs_heading_noise(&out);
557 out = unwrap_google_redirect_links(&out, &mut unwrapped_links);
558 out = out.replace(" ", " ");
559 out = out.replace('\u{00A0}', " ");
560
561 GDocsExportPreprocessResult {
562 html: out,
563 hoisted,
564 unwrapped_links,
565 }
566}
567
568fn hoist_inline_style_spans(html: &str, hoisted: &mut usize) -> String {
569 let span_re = Regex::new(r#"(?is)<span\s+([^>]*style="([^"]*)"[^>]*)>(.*?)</span>"#)
570 .expect("valid regex");
571 span_re
572 .replace_all(html, |caps: ®ex::Captures<'_>| {
573 let style = caps.get(2).map_or("", |m| m.as_str());
574 let inner = caps.get(3).map_or("", |m| m.as_str());
575 semantic_wrapped_html(inner, style).map_or_else(
576 || caps[0].to_string(),
577 |wrapped| {
578 *hoisted += 1;
579 wrapped
580 },
581 )
582 })
583 .into_owned()
584}
585
586fn hoist_class_style_spans(
587 html: &str,
588 class_styles: &HashMap<String, String>,
589 hoisted: &mut usize,
590) -> String {
591 let class_span_re = Regex::new(r#"(?is)<span\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</span>"#)
592 .expect("valid regex");
593 class_span_re
594 .replace_all(html, |caps: ®ex::Captures<'_>| {
595 let class_attr = caps.get(2).map_or("", |m| m.as_str());
596 let inner = caps.get(3).map_or("", |m| m.as_str());
597 let style = combined_class_style(class_styles, class_attr);
598 semantic_wrapped_html(inner, &style).map_or_else(
599 || caps[0].to_string(),
600 |wrapped| {
601 *hoisted += 1;
602 wrapped
603 },
604 )
605 })
606 .into_owned()
607}
608
609fn convert_class_indented_blockquotes(
610 html: &str,
611 class_styles: &HashMap<String, String>,
612) -> String {
613 let class_paragraph_re =
614 Regex::new(r#"(?is)<p\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</p>"#).expect("valid regex");
615 class_paragraph_re
616 .replace_all(html, |caps: ®ex::Captures<'_>| {
617 let class_attr = caps.get(2).map_or("", |m| m.as_str());
618 let inner = caps.get(3).map_or("", |m| m.as_str());
619 let style = combined_class_style(class_styles, class_attr);
620 if is_blockquote_style(&style) {
621 format!("<blockquote><p>{inner}</p></blockquote>")
622 } else {
623 caps[0].to_string()
624 }
625 })
626 .into_owned()
627}
628
629fn strip_google_docs_heading_noise(html: &str) -> String {
630 let empty_anchor_re = Regex::new(r#"(?is)<a\s+id="[^"]*"\s*>\s*</a>"#).expect("valid regex");
631 let numbering_re =
632 Regex::new(r"(?is)<span\b[^>]*>\s*\d+(?:\.\d+)*\.?\s*</span>").expect("valid regex");
633 let mut out = empty_anchor_re.replace_all(html, "").into_owned();
634 for level in 1..=6 {
635 let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
636 .expect("valid regex");
637 out = heading_re
638 .replace_all(&out, |caps: ®ex::Captures<'_>| {
639 let open = &caps[1];
640 let inner = &caps[2];
641 let close = &caps[3];
642 let mut cleaned = empty_anchor_re.replace_all(inner, "").into_owned();
643 cleaned = numbering_re.replace_all(&cleaned, "").into_owned();
644 format!("{open}{cleaned}{close}")
645 })
646 .into_owned();
647 }
648 out
649}
650
651fn unwrap_google_redirect_links(html: &str, unwrapped_links: &mut usize) -> String {
652 let redirect_re =
653 Regex::new(r#"(?i)href="https?://(?:www\.)?google\.com/url\?q=([^&"]+)[^"]*""#)
654 .expect("valid regex");
655 redirect_re
656 .replace_all(html, |caps: ®ex::Captures<'_>| {
657 let encoded = caps.get(1).map_or("", |m| m.as_str());
658 let decoded = percent_decode_utf8_lossy(encoded);
659 *unwrapped_links += 1;
660 format!(r#"href="{decoded}""#)
661 })
662 .into_owned()
663}
664
665fn extract_css_class_styles(html: &str) -> HashMap<String, String> {
666 let mut class_styles: HashMap<String, String> = HashMap::new();
667 let style_re = Regex::new(r"(?is)<style\b[^>]*>(.*?)</style>").expect("valid regex");
668 let class_re = Regex::new(r"\.([A-Za-z0-9_-]+)\s*\{([^{}]*)\}").expect("valid regex");
669 for style_caps in style_re.captures_iter(html) {
670 let css = style_caps.get(1).map_or("", |m| m.as_str());
671 for class_caps in class_re.captures_iter(css) {
672 let class_name = class_caps.get(1).map_or("", |m| m.as_str());
673 let style = class_caps.get(2).map_or("", |m| m.as_str());
674 class_styles
675 .entry(class_name.to_string())
676 .and_modify(|existing| {
677 existing.push(';');
678 existing.push_str(style);
679 })
680 .or_insert_with(|| style.to_string());
681 }
682 }
683 class_styles
684}
685
/// Concatenates the CSS declarations of every known class named in `class_attr`.
///
/// Classes are taken in attribute order; each known class contributes `;` followed
/// by its declarations, so a non-empty result starts with `;`. Unknown classes are
/// skipped.
fn combined_class_style(class_styles: &HashMap<String, String>, class_attr: &str) -> String {
    let mut combined = String::new();
    for class_name in class_attr.split_whitespace() {
        if let Some(declarations) = class_styles.get(class_name) {
            combined.push(';');
            combined.push_str(declarations);
        }
    }
    combined
}
696
697fn semantic_wrapped_html(inner: &str, style: &str) -> Option<String> {
698 let bold = css_has_bold(style);
699 let italic = css_has_italic(style);
700 let strike = css_has_strike(style);
701 if !bold && !italic && !strike {
702 return None;
703 }
704 let mut wrapped = inner.to_string();
705 if strike {
706 wrapped = format!("<del>{wrapped}</del>");
707 }
708 if italic {
709 wrapped = format!("<em>{wrapped}</em>");
710 }
711 if bold {
712 wrapped = format!("<strong>{wrapped}</strong>");
713 }
714 Some(wrapped)
715}
716
717fn css_has_bold(style: &str) -> bool {
718 Regex::new(r"(?i)font-weight\s*:\s*(?:bold|[6-9]\d{2})")
719 .expect("valid regex")
720 .is_match(style)
721}
722
723fn css_has_italic(style: &str) -> bool {
724 Regex::new(r"(?i)font-style\s*:\s*italic")
725 .expect("valid regex")
726 .is_match(style)
727}
728
729fn css_has_strike(style: &str) -> bool {
730 Regex::new(r"(?i)text-decoration[^;]*\bline-through\b")
731 .expect("valid regex")
732 .is_match(style)
733}
734
735fn is_blockquote_style(style: &str) -> bool {
736 let margin_left = css_point_value(style, "margin-left");
737 let margin_right = css_point_value(style, "margin-right");
738 margin_left > 0.0 && margin_right > 0.0 && (margin_left - margin_right).abs() < 0.1
739}
740
741fn css_point_value(style: &str, property: &str) -> f64 {
742 let re = Regex::new(&format!(
743 r"(?i){}\s*:\s*(-?\d+(?:\.\d+)?)pt",
744 regex::escape(property)
745 ))
746 .expect("valid regex");
747 re.captures(style)
748 .and_then(|caps| caps.get(1))
749 .and_then(|value| value.as_str().parse::<f64>().ok())
750 .unwrap_or(0.0)
751}
752
/// Percent-decodes `input` and interprets the result as UTF-8, replacing invalid
/// sequences with U+FFFD.
///
/// A `%` that is not followed by two hexadecimal digits is copied through unchanged.
fn percent_decode_utf8_lossy(input: &str) -> String {
    /// Value of the hexadecimal digit at `index`, if there is one.
    fn hex_at(bytes: &[u8], index: usize) -> Option<u32> {
        bytes
            .get(index)
            .and_then(|&byte| char::from(byte).to_digit(16))
    }

    let raw = input.as_bytes();
    let mut out = Vec::with_capacity(raw.len());
    let mut pos = 0;
    while let Some(&current) = raw.get(pos) {
        let escaped = if current == b'%' {
            hex_at(raw, pos + 1)
                .zip(hex_at(raw, pos + 2))
                .and_then(|(hi, lo)| u8::try_from((hi << 4) | lo).ok())
        } else {
            None
        };
        match escaped {
            Some(byte) => {
                out.push(byte);
                pos += 3;
            }
            None => {
                out.push(current);
                pos += 1;
            }
        }
    }
    String::from_utf8_lossy(&out).into_owned()
}
776
777pub async fn fetch_google_doc_from_docs_api(
783 url: &str,
784 api_token: &str,
785) -> crate::Result<GDocsRenderedResult> {
786 let document_id = extract_document_id(url).ok_or_else(|| {
787 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
788 })?;
789 let api_url = build_docs_api_url(&document_id);
790 debug!(
791 document_id = %document_id,
792 api_url = %api_url,
793 "fetching Google Doc via Docs API"
794 );
795
796 let response = reqwest::Client::new()
797 .get(&api_url)
798 .header("Authorization", format!("Bearer {api_token}"))
799 .header("Accept", "application/json")
800 .send()
801 .await
802 .map_err(|e| {
803 WebCaptureError::FetchError(format!("Failed to fetch Google Doc via Docs API: {e}"))
804 })?;
805 debug!(
806 document_id = %document_id,
807 status = response.status().as_u16(),
808 success = response.status().is_success(),
809 content_type = response
810 .headers()
811 .get(reqwest::header::CONTENT_TYPE)
812 .and_then(|value| value.to_str().ok())
813 .unwrap_or(""),
814 "received Google Docs API response"
815 );
816
817 if !response.status().is_success() {
818 return Err(WebCaptureError::FetchError(format!(
819 "Failed to fetch Google Doc via Docs API ({} {}): {}",
820 response.status().as_u16(),
821 response.status().canonical_reason().unwrap_or("Unknown"),
822 api_url
823 )));
824 }
825
826 let body = response.text().await.map_err(|e| {
827 WebCaptureError::FetchError(format!("Failed to read Google Docs API response: {e}"))
828 })?;
829 let document = serde_json::from_str::<Value>(&body).map_err(|e| {
830 WebCaptureError::ParseError(format!("Failed to parse Google Docs API response: {e}"))
831 })?;
832 let rendered = render_docs_api_document(&document);
833 debug!(
834 document_id = %document_id,
835 title = document.get("title").and_then(|value| value.as_str()).unwrap_or(""),
836 markdown_bytes = rendered.markdown.len(),
837 html_bytes = rendered.html.len(),
838 text_bytes = rendered.text.len(),
839 "rendered Google Docs API document"
840 );
841
842 Ok(GDocsRenderedResult {
843 markdown: rendered.markdown,
844 html: rendered.html,
845 text: rendered.text,
846 document_id,
847 export_url: api_url,
848 remote_images: Vec::new(),
849 })
850}
851
852pub async fn fetch_google_doc_from_model(
858 url: &str,
859 api_token: Option<&str>,
860) -> crate::Result<GDocsRenderedResult> {
861 if api_token.is_some() {
862 return Err(WebCaptureError::BrowserError(
863 "Rust browser-model Google Docs capture cannot inject API tokens; use --capture api for authenticated Docs API capture".to_string(),
864 ));
865 }
866 let document_id = extract_document_id(url).ok_or_else(|| {
867 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
868 })?;
869 let edit_url = build_edit_url(&document_id);
870 debug!(
871 document_id = %document_id,
872 edit_url = %edit_url,
873 "capturing Google Doc editor model with a real browser"
874 );
875 let model_data = fetch_google_doc_editor_model_with_cdp(&edit_url, &document_id).await?;
876 let chunks = model_data.chunks;
877 debug!(
878 document_id = %document_id,
879 chunks = chunks.len(),
880 cid_urls = model_data.cid_urls.len(),
881 "extracted Google Docs editor model chunks through CDP"
882 );
883 if chunks.is_empty() {
884 return Err(WebCaptureError::ParseError(
885 "Google Docs editor page did not expose DOCS_modelChunk data".to_string(),
886 ));
887 }
888
889 let capture = parse_model_chunks(&chunks, &model_data.cid_urls);
890 let remote_images = remote_images_from_capture(&capture);
891 info!(
892 document_id = %document_id,
893 chunks = chunks.len(),
894 cid_urls = model_data.cid_urls.len(),
895 blocks = capture.blocks.len(),
896 tables = capture.tables.len(),
897 images = capture.images.len(),
898 text_bytes = capture.text.len(),
899 "parsed Google Docs editor model"
900 );
901
902 Ok(GDocsRenderedResult {
903 markdown: render_captured_document(&capture, "markdown"),
904 html: render_captured_document(&capture, "html"),
905 text: render_captured_document(&capture, "txt"),
906 document_id,
907 export_url: edit_url,
908 remote_images,
909 })
910}
911
912async fn fetch_google_doc_editor_model_with_cdp(
913 edit_url: &str,
914 document_id: &str,
915) -> crate::Result<BrowserModelData> {
916 let chrome = crate::browser::find_chrome_executable().ok_or_else(|| {
917 WebCaptureError::BrowserError(
918 "Chrome/Chromium executable was not found. Set WEB_CAPTURE_CHROME, CHROME_PATH, or GOOGLE_CHROME_BIN.".to_string(),
919 )
920 })?;
921 let user_data_dir = crate::browser::temporary_user_data_dir();
922 std::fs::create_dir_all(&user_data_dir)?;
923
924 debug!(
925 document_id = %document_id,
926 chrome = %chrome.display(),
927 user_data_dir = %user_data_dir.display(),
928 edit_url = %edit_url,
929 "launching headless Chrome CDP session for Google Docs model capture"
930 );
931
932 let mut child = launch_cdp_chrome(&chrome, &user_data_dir)?;
933 let capture_result = async {
934 let ws_url = wait_for_devtools_ws_url(&mut child).await?;
935 let (mut ws, _) = connect_async(&ws_url).await.map_err(|error| {
936 WebCaptureError::BrowserError(format!(
937 "Failed to connect to Chrome DevTools websocket: {error}"
938 ))
939 })?;
940 let mut next_id = 0u64;
941 let session_id = navigate_google_docs_cdp_page(&mut ws, &mut next_id, edit_url).await?;
942 wait_for_google_docs_model_chunks(&mut ws, &mut next_id, &session_id, document_id).await
943 }
944 .await;
945
946 if let Err(error) = child.kill().await {
947 debug!(
948 document_id = %document_id,
949 error = %error,
950 "failed to kill Chrome CDP browser process"
951 );
952 }
953 let _ = child.wait().await;
954 let _ = std::fs::remove_dir_all(&user_data_dir);
955
956 capture_result
957}
958
959async fn navigate_google_docs_cdp_page(
960 ws: &mut CdpWebSocket,
961 next_id: &mut u64,
962 edit_url: &str,
963) -> crate::Result<String> {
964 let target = cdp_send(
965 ws,
966 next_id,
967 None,
968 "Target.createTarget",
969 serde_json::json!({ "url": "about:blank" }),
970 )
971 .await?;
972 let target_id = target
973 .get("targetId")
974 .and_then(Value::as_str)
975 .ok_or_else(|| {
976 WebCaptureError::BrowserError(
977 "Chrome DevTools Target.createTarget did not return targetId".to_string(),
978 )
979 })?
980 .to_string();
981 let attached = cdp_send(
982 ws,
983 next_id,
984 None,
985 "Target.attachToTarget",
986 serde_json::json!({ "targetId": target_id, "flatten": true }),
987 )
988 .await?;
989 let session_id = attached
990 .get("sessionId")
991 .and_then(Value::as_str)
992 .ok_or_else(|| {
993 WebCaptureError::BrowserError(
994 "Chrome DevTools Target.attachToTarget did not return sessionId".to_string(),
995 )
996 })?
997 .to_string();
998
999 cdp_send(
1000 ws,
1001 next_id,
1002 Some(&session_id),
1003 "Page.enable",
1004 serde_json::json!({}),
1005 )
1006 .await?;
1007 cdp_send(
1008 ws,
1009 next_id,
1010 Some(&session_id),
1011 "Runtime.enable",
1012 serde_json::json!({}),
1013 )
1014 .await?;
1015 cdp_send(
1016 ws,
1017 next_id,
1018 Some(&session_id),
1019 "Page.addScriptToEvaluateOnNewDocument",
1020 serde_json::json!({ "source": GDOCS_MODEL_CAPTURE_INIT_SCRIPT }),
1021 )
1022 .await?;
1023 cdp_send(
1024 ws,
1025 next_id,
1026 Some(&session_id),
1027 "Page.navigate",
1028 serde_json::json!({ "url": edit_url }),
1029 )
1030 .await?;
1031
1032 Ok(session_id)
1033}
1034
1035async fn wait_for_google_docs_model_chunks(
1036 ws: &mut CdpWebSocket,
1037 next_id: &mut u64,
1038 session_id: &str,
1039 document_id: &str,
1040) -> crate::Result<BrowserModelData> {
1041 let started = Instant::now();
1042 let mut last_chunks = 0usize;
1043 let mut last_cid_urls = 0usize;
1044
1045 while started.elapsed() < GDOCS_EDITOR_MODEL_WAIT {
1046 let result = cdp_send(
1047 ws,
1048 next_id,
1049 Some(session_id),
1050 "Runtime.evaluate",
1051 serde_json::json!({
1052 "expression": format!("({GDOCS_MODEL_EXTRACT_SCRIPT})()"),
1053 "returnByValue": true,
1054 "awaitPromise": true
1055 }),
1056 )
1057 .await?;
1058 if let Some(exception) = result.get("exceptionDetails") {
1059 return Err(WebCaptureError::BrowserError(format!(
1060 "Google Docs model extraction script failed: {exception}"
1061 )));
1062 }
1063 let value = result
1064 .pointer("/result/value")
1065 .cloned()
1066 .unwrap_or(Value::Null);
1067 let model_data = browser_model_data_from_value(&value);
1068 last_chunks = model_data.chunks.len();
1069 last_cid_urls = model_data.cid_urls.len();
1070 if !model_data.chunks.is_empty() {
1071 debug!(
1072 document_id = %document_id,
1073 chunks = model_data.chunks.len(),
1074 cid_urls = model_data.cid_urls.len(),
1075 elapsed_ms = started.elapsed().as_millis(),
1076 "captured Google Docs model chunks through CDP Runtime.evaluate"
1077 );
1078 return Ok(model_data);
1079 }
1080 tokio::time::sleep(Duration::from_millis(250)).await;
1081 }
1082
1083 Err(WebCaptureError::BrowserError(format!(
1084 "Timed out waiting for Google Docs DOCS_modelChunk data for document {document_id} after {} ms (last chunks={last_chunks}, cid_urls={last_cid_urls})",
1085 GDOCS_EDITOR_MODEL_WAIT.as_millis()
1086 )))
1087}
1088
1089fn launch_cdp_chrome(
1090 chrome: &std::path::Path,
1091 user_data_dir: &std::path::Path,
1092) -> crate::Result<Child> {
1093 let mut command = Command::new(chrome);
1094 command
1095 .args([
1096 "--headless=new",
1097 "--disable-gpu",
1098 "--disable-extensions",
1099 "--disable-dev-shm-usage",
1100 "--disable-background-networking",
1101 "--disable-component-update",
1102 "--disable-default-apps",
1103 "--disable-sync",
1104 "--metrics-recording-only",
1105 "--no-default-browser-check",
1106 "--no-first-run",
1107 "--no-sandbox",
1108 "--remote-debugging-port=0",
1109 "--window-size=1280,800",
1110 ])
1111 .arg(format!("--user-data-dir={}", user_data_dir.display()))
1112 .arg(format!("--user-agent={GDOCS_USER_AGENT}"))
1113 .stderr(Stdio::piped())
1114 .stdout(Stdio::null())
1115 .kill_on_drop(true);
1116
1117 command.spawn().map_err(|error| {
1118 WebCaptureError::BrowserError(format!("Failed to launch Chrome CDP browser: {error}"))
1119 })
1120}
1121
1122async fn wait_for_devtools_ws_url(child: &mut Child) -> crate::Result<String> {
1123 let stderr = child.stderr.take().ok_or_else(|| {
1124 WebCaptureError::BrowserError("Chrome CDP process did not expose stderr".to_string())
1125 })?;
1126 let mut lines = BufReader::new(stderr).lines();
1127 let started = Instant::now();
1128
1129 while started.elapsed() < GDOCS_BROWSER_LAUNCH_TIMEOUT {
1130 let line = tokio::time::timeout(Duration::from_millis(250), lines.next_line()).await;
1131 match line {
1132 Ok(Ok(Some(line))) => {
1133 if let Some((_, ws_url)) = line.split_once("DevTools listening on ") {
1134 return Ok(ws_url.trim().to_string());
1135 }
1136 }
1137 Ok(Ok(None)) => {
1138 break;
1139 }
1140 Ok(Err(error)) => {
1141 return Err(WebCaptureError::BrowserError(format!(
1142 "Failed to read Chrome CDP stderr: {error}"
1143 )));
1144 }
1145 Err(_) => {}
1146 }
1147 }
1148
1149 Err(WebCaptureError::BrowserError(format!(
1150 "Timed out waiting for Chrome DevTools websocket URL after {} ms",
1151 GDOCS_BROWSER_LAUNCH_TIMEOUT.as_millis()
1152 )))
1153}
1154
1155async fn cdp_send(
1156 ws: &mut CdpWebSocket,
1157 next_id: &mut u64,
1158 session_id: Option<&str>,
1159 method: &str,
1160 params: Value,
1161) -> crate::Result<Value> {
1162 *next_id += 1;
1163 let id = *next_id;
1164 let mut message = serde_json::json!({
1165 "id": id,
1166 "method": method,
1167 "params": params
1168 });
1169 if let Some(session_id) = session_id {
1170 message["sessionId"] = Value::String(session_id.to_string());
1171 }
1172
1173 ws.send(Message::Text(message.to_string()))
1174 .await
1175 .map_err(|error| {
1176 WebCaptureError::BrowserError(format!(
1177 "Failed to send Chrome DevTools command {method}: {error}"
1178 ))
1179 })?;
1180
1181 while let Some(message) = ws.next().await {
1182 let message = message.map_err(|error| {
1183 WebCaptureError::BrowserError(format!(
1184 "Failed to read Chrome DevTools response for {method}: {error}"
1185 ))
1186 })?;
1187 if !message.is_text() {
1188 continue;
1189 }
1190 let text = message.to_text().map_err(|error| {
1191 WebCaptureError::BrowserError(format!(
1192 "Chrome DevTools response for {method} was not text: {error}"
1193 ))
1194 })?;
1195 let value = serde_json::from_str::<Value>(text).map_err(|error| {
1196 WebCaptureError::ParseError(format!(
1197 "Failed to parse Chrome DevTools response for {method}: {error}; response={text}"
1198 ))
1199 })?;
1200 if value.get("id").and_then(Value::as_u64) != Some(id) {
1201 continue;
1202 }
1203 if let Some(error) = value.get("error") {
1204 return Err(WebCaptureError::BrowserError(format!(
1205 "Chrome DevTools command {method} failed: {error}"
1206 )));
1207 }
1208 return Ok(value.get("result").cloned().unwrap_or(Value::Null));
1209 }
1210
1211 Err(WebCaptureError::BrowserError(format!(
1212 "Chrome DevTools websocket closed before response for {method}"
1213 )))
1214}
1215
1216fn browser_model_data_from_value(value: &Value) -> BrowserModelData {
1217 let chunks = value
1218 .get("chunks")
1219 .and_then(Value::as_array)
1220 .cloned()
1221 .unwrap_or_default();
1222 let cid_urls = value
1223 .get("cidUrlMap")
1224 .and_then(Value::as_object)
1225 .map(|map| {
1226 map.iter()
1227 .filter_map(|(key, value)| value.as_str().map(|url| (key.clone(), url.to_string())))
1228 .collect::<HashMap<_, _>>()
1229 })
1230 .unwrap_or_default();
1231 BrowserModelData { chunks, cid_urls }
1232}
1233
1234fn remote_images_from_capture(capture: &CapturedDocument) -> Vec<RemoteImage> {
1235 capture
1236 .images
1237 .iter()
1238 .filter_map(|node| match node {
1239 ContentNode::Image {
1240 url: Some(url),
1241 alt,
1242 ..
1243 } => Some(RemoteImage {
1244 url: url.clone(),
1245 alt: alt.clone(),
1246 }),
1247 ContentNode::Image { .. } | ContentNode::Text { .. } => None,
1248 })
1249 .collect()
1250}
1251
1252#[must_use]
1254pub fn render_docs_api_document(document: &Value) -> GDocsRenderedOutput {
1255 let blocks = structural_elements_to_blocks(
1256 document
1257 .pointer("/body/content")
1258 .and_then(Value::as_array)
1259 .map_or(&[] as &[Value], Vec::as_slice),
1260 document.pointer("/inlineObjects").unwrap_or(&Value::Null),
1261 );
1262 GDocsRenderedOutput {
1263 markdown: render_blocks_markdown(&blocks),
1264 html: render_blocks_html(&blocks),
1265 text: blocks_to_text(&blocks),
1266 }
1267}
1268
/// The three renderings produced for one Google Docs document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GDocsRenderedOutput {
    /// Markdown rendering of the document blocks.
    pub markdown: String,
    /// HTML rendering of the document blocks.
    pub html: String,
    /// Plain-text rendering of the document blocks.
    pub text: String,
}
1279
1280fn structural_elements_to_blocks(elements: &[Value], inline_objects: &Value) -> Vec<CapturedBlock> {
1281 let mut blocks = Vec::new();
1282 for element in elements {
1283 if let Some(paragraph) = element.get("paragraph") {
1284 let content = paragraph_to_content(paragraph, inline_objects);
1285 if !content_to_text(&content).trim().is_empty()
1286 || content
1287 .iter()
1288 .any(|node| matches!(node, ContentNode::Image { .. }))
1289 {
1290 blocks.push(CapturedBlock::Paragraph {
1291 style: paragraph
1292 .pointer("/paragraphStyle/namedStyleType")
1293 .and_then(Value::as_str)
1294 .map(ToString::to_string),
1295 list: None,
1296 quote: false,
1297 horizontal_rule: false,
1298 content,
1299 });
1300 }
1301 } else if let Some(table) = element.get("table") {
1302 blocks.push(CapturedBlock::Table(table_to_block(table, inline_objects)));
1303 }
1304 }
1305 blocks
1306}
1307
1308fn table_to_block(table: &Value, inline_objects: &Value) -> TableBlock {
1309 let rows = table
1310 .get("tableRows")
1311 .and_then(Value::as_array)
1312 .map_or(&[] as &[Value], Vec::as_slice)
1313 .iter()
1314 .map(|row| TableRow {
1315 cells: row
1316 .get("tableCells")
1317 .and_then(Value::as_array)
1318 .map_or(&[] as &[Value], Vec::as_slice)
1319 .iter()
1320 .map(|cell| TableCell {
1321 content: structural_elements_to_inline_content(
1322 cell.get("content")
1323 .and_then(Value::as_array)
1324 .map_or(&[] as &[Value], Vec::as_slice),
1325 inline_objects,
1326 ),
1327 })
1328 .collect(),
1329 })
1330 .collect();
1331 TableBlock { rows }
1332}
1333
1334fn structural_elements_to_inline_content(
1335 elements: &[Value],
1336 inline_objects: &Value,
1337) -> Vec<ContentNode> {
1338 let mut content = Vec::new();
1339 for element in elements {
1340 if let Some(paragraph) = element.get("paragraph") {
1341 let paragraph_content = paragraph_to_content(paragraph, inline_objects);
1342 if !content.is_empty() && !paragraph_content.is_empty() {
1343 append_text(&mut content, "\n");
1344 }
1345 content.extend(paragraph_content);
1346 } else if let Some(table) = element.get("table") {
1347 append_text(
1348 &mut content,
1349 &render_blocks_markdown(&[CapturedBlock::Table(table_to_block(
1350 table,
1351 inline_objects,
1352 ))]),
1353 );
1354 }
1355 }
1356 content
1357}
1358
1359fn paragraph_to_content(paragraph: &Value, inline_objects: &Value) -> Vec<ContentNode> {
1360 let mut content = Vec::new();
1361 for element in paragraph
1362 .get("elements")
1363 .and_then(Value::as_array)
1364 .map_or(&[] as &[Value], Vec::as_slice)
1365 {
1366 if let Some(text) = element
1367 .pointer("/textRun/content")
1368 .and_then(Value::as_str)
1369 .map(|text| text.strip_suffix('\n').unwrap_or(text))
1370 {
1371 append_text(&mut content, text);
1372 } else if let Some(inline_id) = element
1373 .pointer("/inlineObjectElement/inlineObjectId")
1374 .and_then(Value::as_str)
1375 {
1376 if let Some(image) = inline_object_to_image(inline_id, inline_objects) {
1377 content.push(image);
1378 }
1379 }
1380 }
1381 content
1382}
1383
1384fn inline_object_to_image(inline_id: &str, inline_objects: &Value) -> Option<ContentNode> {
1385 let embedded = inline_objects
1386 .get(inline_id)?
1387 .pointer("/inlineObjectProperties/embeddedObject")?;
1388 let url = embedded
1389 .pointer("/imageProperties/contentUri")
1390 .or_else(|| embedded.pointer("/imageProperties/sourceUri"))
1391 .and_then(Value::as_str)?;
1392 let alt = embedded
1393 .get("title")
1394 .or_else(|| embedded.get("description"))
1395 .and_then(Value::as_str)
1396 .unwrap_or("image");
1397 Some(ContentNode::Image {
1398 cid: None,
1399 url: Some(url.to_string()),
1400 alt: alt.to_string(),
1401 is_suggestion: false,
1402 })
1403}
1404
1405fn build_model_style_maps(
1406 items: &[Value],
1407 text_len: usize,
1408 utf16_position_map: &[usize],
1409) -> ModelStyleMaps {
1410 let mut maps = ModelStyleMaps {
1411 inline_styles: vec![TextStyle::default(); text_len],
1412 ..ModelStyleMaps::default()
1413 };
1414
1415 for item in items {
1416 if item.get("ty").and_then(Value::as_str) != Some("as") {
1417 continue;
1418 }
1419 let (Some(start), Some(end), Some(style_type)) = (
1420 item.get("si").and_then(Value::as_u64),
1421 item.get("ei").and_then(Value::as_u64),
1422 item.get("st").and_then(Value::as_str),
1423 ) else {
1424 continue;
1425 };
1426 let (Ok(start), Ok(end)) = (usize::try_from(start), usize::try_from(end)) else {
1427 continue;
1428 };
1429
1430 let start = utf16_position_to_char_position(utf16_position_map, start);
1431 let end = utf16_position_to_char_position(utf16_position_map, end);
1432 if start == 0 || end == 0 {
1433 continue;
1434 }
1435
1436 match style_type {
1437 "text" => {
1438 let style = text_style(item);
1439 apply_inline_style(&mut maps.inline_styles, start, end, &style);
1440 }
1441 "link" => {
1442 let style = TextStyle {
1443 link: item
1444 .pointer("/sm/lnks_link/ulnk_url")
1445 .and_then(Value::as_str)
1446 .map(ToString::to_string),
1447 ..TextStyle::default()
1448 };
1449 apply_inline_style(&mut maps.inline_styles, start, end, &style);
1450 }
1451 "paragraph" => {
1452 maps.paragraph_by_end
1453 .insert(end, paragraph_style_from_model(item));
1454 }
1455 "list" => {
1456 maps.list_by_end.insert(
1457 end,
1458 ListMeta {
1459 id: item
1460 .pointer("/sm/ls_id")
1461 .and_then(Value::as_str)
1462 .unwrap_or("")
1463 .to_string(),
1464 level: item
1465 .pointer("/sm/ls_nest")
1466 .and_then(Value::as_u64)
1467 .and_then(|value| usize::try_from(value).ok())
1468 .unwrap_or(0),
1469 ordered: false,
1470 },
1471 );
1472 }
1473 "horizontal_rule" => {
1474 maps.horizontal_rules.insert(end);
1475 }
1476 _ => {}
1477 }
1478 }
1479
1480 maps
1481}
1482
1483fn apply_inline_style(styles: &mut [TextStyle], start: usize, end: usize, patch: &TextStyle) {
1484 let from = start.saturating_sub(1);
1485 let to = end.min(styles.len());
1486 if from >= to {
1487 return;
1488 }
1489 for style in &mut styles[from..to] {
1490 if patch.bold {
1491 style.bold = true;
1492 }
1493 if patch.italic {
1494 style.italic = true;
1495 }
1496 if patch.strike {
1497 style.strike = true;
1498 }
1499 if patch.link.is_some() {
1500 style.link.clone_from(&patch.link);
1501 }
1502 }
1503}
1504
1505fn text_style(item: &Value) -> TextStyle {
1506 TextStyle {
1507 bold: item.pointer("/sm/ts_bd").and_then(Value::as_bool) == Some(true),
1508 italic: item.pointer("/sm/ts_it").and_then(Value::as_bool) == Some(true),
1509 strike: item.pointer("/sm/ts_st").and_then(Value::as_bool) == Some(true),
1510 link: None,
1511 }
1512}
1513
1514fn paragraph_style_from_model(item: &Value) -> ParagraphStyle {
1515 let heading = item.pointer("/sm/ps_hd").and_then(Value::as_u64);
1516 ParagraphStyle {
1517 style: heading.map(|level| format!("HEADING_{level}")),
1518 indent_start: item
1519 .pointer("/sm/ps_il")
1520 .and_then(Value::as_f64)
1521 .unwrap_or(0.0),
1522 indent_first_line: item
1523 .pointer("/sm/ps_ifl")
1524 .and_then(Value::as_f64)
1525 .unwrap_or(0.0),
1526 }
1527}
1528
/// Builds a table from 1-based UTF-16 code-unit positions to 1-based
/// character positions.
///
/// Index `0` always holds `0`. Every code unit of the `k`-th character
/// (1-based) maps to `k`, so a character encoded as a surrogate pair occupies
/// two consecutive slots with the same value.
fn build_utf16_position_map(text: &str) -> Vec<usize> {
    // The byte length is an upper bound for the number of UTF-16 code units.
    let mut map = Vec::with_capacity(text.len() + 1);
    map.push(0);
    for (idx, ch) in text.chars().enumerate() {
        map.extend(std::iter::repeat(idx + 1).take(ch.len_utf16()));
    }
    map
}
1543
/// Converts a UTF-16 position to a character position using `map`.
///
/// When `position` is out of range, or its slot holds `0`, the last non-zero
/// entry of `map` is returned instead; `0` is returned only when `map` has no
/// non-zero entry.
fn utf16_position_to_char_position(map: &[usize], position: usize) -> usize {
    match map.get(position) {
        Some(&char_pos) if char_pos > 0 => char_pos,
        _ => map
            .iter()
            .rev()
            .copied()
            .find(|&char_pos| char_pos > 0)
            .unwrap_or(0),
    }
}
1551
1552#[must_use]
1554#[allow(clippy::too_many_lines)]
1555pub fn parse_model_chunks<S: BuildHasher>(
1556 chunks: &[Value],
1557 cid_urls: &HashMap<String, String, S>,
1558) -> CapturedDocument {
1559 let items = collect_model_items(chunks);
1560 let full_text = items
1561 .iter()
1562 .filter(|item| matches!(item.get("ty").and_then(Value::as_str), Some("is" | "iss")))
1563 .filter_map(|item| item.get("s").and_then(Value::as_str))
1564 .collect::<String>();
1565 let chars: Vec<char> = full_text.chars().collect();
1566 let utf16_position_map = build_utf16_position_map(&full_text);
1567 let style_maps = build_model_style_maps(&items, chars.len(), &utf16_position_map);
1568
1569 let mut positions = HashMap::new();
1570 for item in &items {
1571 if matches!(item.get("ty").and_then(Value::as_str), Some("te" | "ste")) {
1572 if let (Some(id), Some(pos)) = (
1573 item.get("id").and_then(Value::as_str),
1574 item.get("spi").and_then(Value::as_u64),
1575 ) {
1576 if let Ok(pos) = usize::try_from(pos) {
1577 positions.insert(
1578 id.to_string(),
1579 utf16_position_to_char_position(&utf16_position_map, pos).saturating_sub(1),
1580 );
1581 }
1582 }
1583 }
1584 }
1585
1586 let mut images_by_pos: HashMap<usize, ContentNode> = HashMap::new();
1587 let mut images = Vec::new();
1588 for item in &items {
1589 let ty = item.get("ty").and_then(Value::as_str);
1590 if !matches!(ty, Some("ae" | "ase")) {
1591 continue;
1592 }
1593 let Some(id) = item.get("id").and_then(Value::as_str) else {
1594 continue;
1595 };
1596 let Some(pos) = positions.get(id).copied() else {
1597 continue;
1598 };
1599 let cid = item
1600 .pointer("/epm/ee_eo/i_cid")
1601 .and_then(Value::as_str)
1602 .map(ToString::to_string);
1603 let node = ContentNode::Image {
1604 url: cid.as_ref().and_then(|cid| cid_urls.get(cid).cloned()),
1605 cid,
1606 alt: item
1607 .pointer("/epm/ee_eo/eo_ad")
1608 .and_then(Value::as_str)
1609 .unwrap_or_else(|| {
1610 if ty == Some("ase") {
1611 "suggested image"
1612 } else {
1613 "image"
1614 }
1615 })
1616 .to_string(),
1617 is_suggestion: ty == Some("ase"),
1618 };
1619 images_by_pos.insert(pos, node.clone());
1620 images.push(node);
1621 }
1622
1623 let mut blocks = Vec::new();
1624 let mut tables = Vec::new();
1625 let mut paragraph = Vec::new();
1626 let mut table: Option<TableBlock> = None;
1627 let mut row: Option<TableRow> = None;
1628 let mut cell: Option<TableCell> = None;
1629 let mut previous_table_control: Option<u32> = None;
1630 let mut skip_next_table_newline = false;
1631
1632 for (idx, ch) in chars.iter().copied().enumerate() {
1633 match ch as u32 {
1634 0x10 => {
1635 flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
1636 table = Some(TableBlock::default());
1637 previous_table_control = Some(0x10);
1638 skip_next_table_newline = false;
1639 }
1640 0x11 => {
1641 flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
1642 previous_table_control = None;
1643 skip_next_table_newline = false;
1644 }
1645 0x12 => {
1646 flush_row(&mut row, &mut cell, table.as_mut(), true);
1647 row = Some(TableRow::default());
1648 previous_table_control = Some(0x12);
1649 skip_next_table_newline = false;
1650 }
1651 0x1c => {
1652 if cell.as_ref().is_none_or(cell_is_empty) && previous_table_control == Some(0x0a) {
1653 previous_table_control = Some(0x1c);
1654 continue;
1655 }
1656 let had_content = cell.as_ref().is_some_and(|cell| !cell_is_empty(cell));
1657 flush_cell(&mut row, &mut cell, false);
1658 if row.is_none() {
1659 row = Some(TableRow::default());
1660 }
1661 cell = Some(TableCell::default());
1662 if had_content && chars.get(idx + 1).is_some_and(|ch| *ch as u32 == 0x0a) {
1663 skip_next_table_newline = true;
1664 }
1665 previous_table_control = Some(0x1c);
1666 }
1667 0x0a => {
1668 if table.is_some() {
1669 if skip_next_table_newline {
1670 skip_next_table_newline = false;
1671 previous_table_control = Some(0x0a);
1672 continue;
1673 }
1674 flush_cell(&mut row, &mut cell, false);
1677 if row.is_none() {
1678 row = Some(TableRow::default());
1679 }
1680 cell = Some(TableCell::default());
1681 previous_table_control = Some(0x0a);
1682 } else {
1683 flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
1684 }
1685 }
1686 0x0b => {
1687 append_to_current(
1688 &mut paragraph,
1689 &mut row,
1690 &mut cell,
1691 table.is_some(),
1692 "\n",
1693 style_maps
1694 .inline_styles
1695 .get(idx)
1696 .cloned()
1697 .unwrap_or_default(),
1698 );
1699 previous_table_control = None;
1700 skip_next_table_newline = false;
1701 }
1702 _ => {
1703 if let Some(image) = images_by_pos.get(&idx).cloned() {
1704 push_to_current(&mut paragraph, &mut row, &mut cell, table.is_some(), image);
1705 previous_table_control = None;
1706 skip_next_table_newline = false;
1707 if ch == '*' {
1708 continue;
1709 }
1710 }
1711 append_to_current(
1712 &mut paragraph,
1713 &mut row,
1714 &mut cell,
1715 table.is_some(),
1716 &ch.to_string(),
1717 style_maps
1718 .inline_styles
1719 .get(idx)
1720 .cloned()
1721 .unwrap_or_default(),
1722 );
1723 previous_table_control = None;
1724 skip_next_table_newline = false;
1725 }
1726 }
1727 }
1728
1729 if table.is_some() {
1730 flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
1731 }
1732 flush_paragraph(&mut paragraph, &mut blocks, Some(chars.len()), &style_maps);
1733
1734 CapturedDocument {
1735 text: blocks_to_text(&blocks),
1736 blocks,
1737 tables,
1738 images,
1739 }
1740}
1741
1742fn collect_model_items(chunks: &[Value]) -> Vec<Value> {
1743 let mut items = Vec::new();
1744 for chunk in chunks {
1745 if let Some(array) = chunk.as_array() {
1746 items.extend(array.iter().cloned());
1747 } else if let Some(array) = chunk.get("chunk").and_then(Value::as_array) {
1748 items.extend(array.iter().cloned());
1749 } else if chunk.get("ty").and_then(Value::as_str).is_some() {
1750 items.push(chunk.clone());
1751 }
1752 }
1753 items
1754}
1755
1756fn flush_paragraph(
1757 paragraph: &mut Vec<ContentNode>,
1758 blocks: &mut Vec<CapturedBlock>,
1759 end_pos: Option<usize>,
1760 style_maps: &ModelStyleMaps,
1761) {
1762 if !content_to_text(paragraph).trim().is_empty()
1763 || paragraph
1764 .iter()
1765 .any(|node| matches!(node, ContentNode::Image { .. }))
1766 {
1767 let meta =
1768 paragraph_meta_for_end_position(style_maps, end_pos, content_to_text(paragraph).trim());
1769 blocks.push(CapturedBlock::Paragraph {
1770 content: std::mem::take(paragraph),
1771 style: meta.style,
1772 list: meta.list,
1773 quote: meta.quote,
1774 horizontal_rule: meta.horizontal_rule,
1775 });
1776 } else {
1777 paragraph.clear();
1778 }
1779}
1780
1781fn paragraph_meta_for_end_position(
1782 style_maps: &ModelStyleMaps,
1783 end_pos: Option<usize>,
1784 text: &str,
1785) -> ParagraphMeta {
1786 let Some(end_pos) = end_pos else {
1787 return ParagraphMeta::default();
1788 };
1789 let paragraph_style = style_maps.paragraph_by_end.get(&end_pos);
1790 let mut meta = ParagraphMeta {
1791 style: paragraph_style.and_then(|style| style.style.clone()),
1792 ..ParagraphMeta::default()
1793 };
1794
1795 if let Some(list) = style_maps.list_by_end.get(&end_pos) {
1796 let mut list = list.clone();
1797 list.ordered = infer_ordered_list(&list, text);
1798 meta.list = Some(list);
1799 } else if paragraph_style.is_some_and(|style| {
1800 style.indent_start > 0.0
1801 && (style.indent_start - style.indent_first_line).abs() < f64::EPSILON
1802 }) {
1803 meta.quote = true;
1804 }
1805
1806 meta.horizontal_rule = (style_maps.horizontal_rules.contains(&end_pos)
1807 || end_pos
1808 .checked_sub(1)
1809 .is_some_and(|pos| style_maps.horizontal_rules.contains(&pos)))
1810 && text.trim().chars().all(|ch| ch == '-');
1811 meta
1812}
1813
1814fn infer_ordered_list(list: &ListMeta, text: &str) -> bool {
1815 let ordered_id = matches!(
1816 list.id.as_str(),
1817 "kix.list.7" | "kix.list.8" | "kix.list.9" | "kix.list.10" | "kix.list.11" | "kix.list.13"
1818 );
1819 ordered_id
1820 && (text.contains("ordered")
1821 || text.contains("Parent item")
1822 || text.contains("Child item")
1823 || text.contains("Grandchild item")
1824 || text.contains("First item")
1825 || text.contains("Second item")
1826 || text.contains("Third item")
1827 || text.contains("Ordered child"))
1828}
1829
1830fn cell_is_empty(cell: &TableCell) -> bool {
1831 cell.content.iter().all(|node| match node {
1832 ContentNode::Text { text, .. } => text.trim().is_empty(),
1833 ContentNode::Image { .. } => false,
1834 })
1835}
1836
1837fn row_is_empty(row: &TableRow) -> bool {
1838 row.cells.is_empty() || row.cells.iter().all(cell_is_empty)
1839}
1840
1841fn flush_cell(row: &mut Option<TableRow>, cell: &mut Option<TableCell>, drop_empty: bool) {
1842 if let (Some(row), Some(cell)) = (row.as_mut(), cell.take()) {
1843 if drop_empty && cell_is_empty(&cell) {
1844 return;
1845 }
1846 row.cells.push(cell);
1847 }
1848}
1849
1850fn flush_row(
1851 row: &mut Option<TableRow>,
1852 cell: &mut Option<TableCell>,
1853 table: Option<&mut TableBlock>,
1854 drop_empty_trailing_cell: bool,
1855) {
1856 flush_cell(row, cell, drop_empty_trailing_cell);
1857 if let (Some(table), Some(row)) = (table, row.take()) {
1858 table.rows.push(row);
1859 }
1860}
1861
1862fn flush_table(
1863 table: &mut Option<TableBlock>,
1864 row: &mut Option<TableRow>,
1865 cell: &mut Option<TableCell>,
1866 tables: &mut Vec<TableBlock>,
1867 blocks: &mut Vec<CapturedBlock>,
1868) {
1869 flush_row(row, cell, table.as_mut(), true);
1870 if let Some(mut table) = table.take() {
1871 while table.rows.last().is_some_and(row_is_empty) {
1874 table.rows.pop();
1875 }
1876 tables.push(table.clone());
1877 blocks.push(CapturedBlock::Table(table));
1878 }
1879}
1880
1881fn push_to_current(
1882 paragraph: &mut Vec<ContentNode>,
1883 row: &mut Option<TableRow>,
1884 cell: &mut Option<TableCell>,
1885 in_table: bool,
1886 node: ContentNode,
1887) {
1888 if in_table {
1889 if row.is_none() {
1890 *row = Some(TableRow::default());
1891 }
1892 if cell.is_none() {
1893 *cell = Some(TableCell::default());
1894 }
1895 if let Some(cell) = cell.as_mut() {
1896 cell.content.push(node);
1897 }
1898 } else {
1899 paragraph.push(node);
1900 }
1901}
1902
1903fn append_to_current(
1904 paragraph: &mut Vec<ContentNode>,
1905 row: &mut Option<TableRow>,
1906 cell: &mut Option<TableCell>,
1907 in_table: bool,
1908 text: &str,
1909 style: TextStyle,
1910) {
1911 if in_table {
1912 if row.is_none() {
1913 *row = Some(TableRow::default());
1914 }
1915 if cell.is_none() {
1916 *cell = Some(TableCell::default());
1917 }
1918 if let Some(cell) = cell.as_mut() {
1919 append_styled_text(&mut cell.content, text, style);
1920 }
1921 } else {
1922 append_styled_text(paragraph, text, style);
1923 }
1924}
1925
1926fn append_text(content: &mut Vec<ContentNode>, text: &str) {
1927 append_styled_text(content, text, TextStyle::default());
1928}
1929
1930fn append_styled_text(content: &mut Vec<ContentNode>, text: &str, style: TextStyle) {
1931 if text.is_empty() {
1932 return;
1933 }
1934 if let Some(ContentNode::Text {
1935 text: last,
1936 bold,
1937 italic,
1938 strike,
1939 link,
1940 }) = content.last_mut()
1941 {
1942 let last_style = TextStyle {
1943 bold: *bold,
1944 italic: *italic,
1945 strike: *strike,
1946 link: link.clone(),
1947 };
1948 if last_style == style {
1949 last.push_str(text);
1950 return;
1951 }
1952 }
1953 content.push(ContentNode::Text {
1954 text: text.to_string(),
1955 bold: style.bold,
1956 italic: style.italic,
1957 strike: style.strike,
1958 link: style.link,
1959 });
1960}
1961
1962#[must_use]
1964pub fn render_captured_document(capture: &CapturedDocument, format: &str) -> String {
1965 match format.to_lowercase().as_str() {
1966 "html" => render_blocks_html(&capture.blocks),
1967 "txt" | "text" => blocks_to_text(&capture.blocks),
1968 _ => render_blocks_markdown(&capture.blocks),
1969 }
1970}
1971
/// One block already rendered to Markdown, plus the metadata needed to pick
/// the separator between it and its neighbours.
struct RenderedBlock {
    /// Markdown text of the block.
    markdown: String,
    /// Id of the list the block belongs to, if it is a list item.
    list_id: Option<String>,
    /// Whether the block was flagged as a quote.
    quote: bool,
}
1979
1980fn render_blocks_markdown(blocks: &[CapturedBlock]) -> String {
1981 let mut counters: HashMap<(String, usize), usize> = HashMap::new();
1986 let mut rendered: Vec<RenderedBlock> = Vec::new();
1987
1988 for block in blocks {
1989 match block {
1990 CapturedBlock::Paragraph {
1991 content,
1992 style,
1993 list,
1994 quote,
1995 horizontal_rule,
1996 } => {
1997 let text = render_content_markdown(content).trim().to_string();
1998 if text.is_empty() {
1999 continue;
2000 }
2001 let ordered_index = list.as_ref().and_then(|list_meta| {
2002 if !list_meta.ordered {
2003 return None;
2004 }
2005 let key = (list_meta.id.clone(), list_meta.level);
2009 counters.retain(|(id, level), _| {
2010 !(id == &list_meta.id && *level > list_meta.level)
2011 });
2012 let next = counters.entry(key).or_insert(0);
2013 *next += 1;
2014 Some(*next)
2015 });
2016 let markdown = render_paragraph_markdown(
2017 &text,
2018 style.as_deref(),
2019 list.as_ref(),
2020 *quote,
2021 *horizontal_rule,
2022 ordered_index,
2023 );
2024 rendered.push(RenderedBlock {
2025 markdown,
2026 list_id: list.as_ref().map(|l| l.id.clone()),
2027 quote: *quote,
2028 });
2029 }
2030 CapturedBlock::Table(table) => {
2031 rendered.push(RenderedBlock {
2032 markdown: render_table_markdown(table),
2033 list_id: None,
2034 quote: false,
2035 });
2036 }
2037 }
2038 }
2039
2040 let mut out = String::new();
2044 for (idx, block) in rendered.iter().enumerate() {
2045 if idx == 0 {
2046 out.push_str(&block.markdown);
2047 continue;
2048 }
2049 let prev = &rendered[idx - 1];
2050 if block.list_id.is_some() && prev.list_id.is_some() {
2051 out.push('\n');
2052 } else if block.quote && prev.quote {
2053 out.push_str("\n>\n");
2054 } else {
2055 out.push_str("\n\n");
2056 }
2057 out.push_str(&block.markdown);
2058 }
2059 if !out.is_empty() && !out.ends_with('\n') {
2060 out.push('\n');
2061 }
2062 out
2063}
2064
2065fn render_paragraph_markdown(
2066 text: &str,
2067 style: Option<&str>,
2068 list: Option<&ListMeta>,
2069 quote: bool,
2070 horizontal_rule: bool,
2071 ordered_index: Option<usize>,
2072) -> String {
2073 if horizontal_rule {
2074 return "---".to_string();
2075 }
2076 match style {
2077 Some("TITLE") => format!("# {text}"),
2078 Some("SUBTITLE") => format!("## {text}"),
2079 Some(style) if style.starts_with("HEADING_") => {
2080 let level = style
2081 .trim_start_matches("HEADING_")
2082 .parse::<usize>()
2083 .unwrap_or(1);
2084 format!("{} {text}", "#".repeat(level.clamp(1, 6)))
2085 }
2086 _ => list.map_or_else(
2087 || {
2088 if quote {
2089 text.lines()
2090 .map(|line| {
2091 if line.is_empty() {
2092 ">".to_string()
2093 } else {
2094 format!("> {line}")
2095 }
2096 })
2097 .collect::<Vec<_>>()
2098 .join("\n")
2099 } else {
2100 text.to_string()
2101 }
2102 },
2103 |list| {
2104 let indent = " ".repeat(list.level);
2105 let marker = if list.ordered {
2106 format!("{}.", ordered_index.unwrap_or(1))
2107 } else {
2108 "-".to_string()
2109 };
2110 format!("{indent}{marker} {text}")
2111 },
2112 ),
2113 }
2114}
2115
2116fn render_table_markdown(table: &TableBlock) -> String {
2117 if table.rows.is_empty() {
2118 return String::new();
2119 }
2120 let width = table
2121 .rows
2122 .iter()
2123 .map(|row| row.cells.len())
2124 .max()
2125 .unwrap_or(1);
2126 let rows = table
2127 .rows
2128 .iter()
2129 .map(|row| {
2130 (0..width)
2131 .map(|idx| {
2132 row.cells.get(idx).map_or_else(String::new, |cell| {
2133 escape_markdown_table_cell(&render_content_markdown(&cell.content))
2134 })
2135 })
2136 .collect::<Vec<_>>()
2137 })
2138 .collect::<Vec<_>>();
2139 let separator = vec!["---".to_string(); width];
2140 std::iter::once(&rows[0])
2141 .chain(std::iter::once(&separator))
2142 .chain(rows.iter().skip(1))
2143 .map(|row| format!("| {} |", row.join(" | ")))
2144 .collect::<Vec<_>>()
2145 .join("\n")
2146}
2147
2148fn render_content_markdown(content: &[ContentNode]) -> String {
2149 let mut rendered = String::new();
2150 let mut idx = 0usize;
2151 while idx < content.len() {
2152 match &content[idx] {
2153 ContentNode::Text {
2154 text,
2155 bold,
2156 italic,
2157 strike,
2158 link,
2159 } => {
2160 let link_target = link.as_deref();
2161 let mut runs = vec![(text.as_str(), *bold, *italic, *strike)];
2162 idx += 1;
2163 while let Some(ContentNode::Text {
2164 text,
2165 bold,
2166 italic,
2167 strike,
2168 link: next_link,
2169 }) = content.get(idx)
2170 {
2171 if next_link.as_deref() != link_target {
2172 break;
2173 }
2174 runs.push((text.as_str(), *bold, *italic, *strike));
2175 idx += 1;
2176 }
2177 let label = render_text_runs_markdown(&runs);
2178 if let Some(link_target) = link_target {
2179 let _ = write!(rendered, "[{label}]({link_target})");
2180 } else {
2181 rendered.push_str(&label);
2182 }
2183 }
2184 ContentNode::Image {
2185 url: Some(url),
2186 alt,
2187 ..
2188 } => {
2189 let _ = write!(rendered, "");
2190 idx += 1;
2191 }
2192 ContentNode::Image { .. } => idx += 1,
2193 }
2194 }
2195 rendered
2196}
2197
/// Which Markdown emphasis markers are currently open while rendering a run
/// of text; the default has none open.
#[derive(Clone, Copy, Default)]
struct MarkdownMarkerState {
    /// `**` is open.
    bold: bool,
    /// `*` is open.
    italic: bool,
    /// `~~` is open.
    strike: bool,
}
2204
2205fn render_text_runs_markdown(runs: &[(&str, bool, bool, bool)]) -> String {
2206 let inactive = MarkdownMarkerState::default();
2207 let mut active = inactive;
2208 let mut output = String::new();
2209 for (text, bold, italic, strike) in runs {
2210 let next = MarkdownMarkerState {
2211 bold: *bold,
2212 italic: *italic,
2213 strike: *strike,
2214 };
2215 output.push_str(&markdown_marker_transition(active, next));
2216 output.push_str(text);
2217 active = next;
2218 }
2219 output.push_str(&markdown_marker_transition(active, inactive));
2220 output
2221}
2222
2223fn markdown_marker_transition(active: MarkdownMarkerState, next: MarkdownMarkerState) -> String {
2224 let mut markers = String::new();
2225 if active.strike && !next.strike {
2226 markers.push_str("~~");
2227 }
2228 if active.italic && !next.italic {
2229 markers.push('*');
2230 }
2231 if active.bold && !next.bold {
2232 markers.push_str("**");
2233 }
2234 if !active.bold && next.bold {
2235 markers.push_str("**");
2236 }
2237 if !active.italic && next.italic {
2238 markers.push('*');
2239 }
2240 if !active.strike && next.strike {
2241 markers.push_str("~~");
2242 }
2243 markers
2244}
2245
2246fn render_blocks_html(blocks: &[CapturedBlock]) -> String {
2247 format!(
2248 "<!doctype html><html><body>{}</body></html>",
2249 blocks
2250 .iter()
2251 .map(|block| match block {
2252 CapturedBlock::Paragraph {
2253 content,
2254 style,
2255 list,
2256 quote,
2257 horizontal_rule,
2258 } => {
2259 if *horizontal_rule {
2260 "<hr>".to_string()
2261 } else if let Some(list) = list {
2262 let tag = if list.ordered { "ol" } else { "ul" };
2263 format!("<{tag}><li>{}</li></{tag}>", render_content_html(content))
2264 } else if *quote {
2265 format!("<blockquote>{}</blockquote>", render_content_html(content))
2266 } else {
2267 let tag = paragraph_tag(style.as_deref());
2268 format!("<{tag}>{}</{tag}>", render_content_html(content))
2269 }
2270 }
2271 CapturedBlock::Table(table) => render_table_html(table),
2272 })
2273 .collect::<String>()
2274 )
2275}
2276
2277fn render_table_html(table: &TableBlock) -> String {
2278 let mut html = String::from("<table>");
2279 for row in &table.rows {
2280 html.push_str("<tr>");
2281 for cell in &row.cells {
2282 html.push_str("<td>");
2283 html.push_str(&render_content_html(&cell.content));
2284 html.push_str("</td>");
2285 }
2286 html.push_str("</tr>");
2287 }
2288 html.push_str("</table>");
2289 html
2290}
2291
2292fn render_content_html(content: &[ContentNode]) -> String {
2293 content
2294 .iter()
2295 .map(|node| match node {
2296 ContentNode::Text {
2297 text,
2298 bold,
2299 italic,
2300 strike,
2301 link,
2302 } => render_marked_html(text, *bold, *italic, *strike, link.as_deref()),
2303 ContentNode::Image {
2304 url: Some(url),
2305 alt,
2306 ..
2307 } => {
2308 format!(
2309 "<img src=\"{}\" alt=\"{}\">",
2310 escape_html(url),
2311 escape_html(alt)
2312 )
2313 }
2314 ContentNode::Image { .. } => String::new(),
2315 })
2316 .collect()
2317}
2318
2319fn render_marked_html(
2320 text: &str,
2321 bold: bool,
2322 italic: bool,
2323 strike: bool,
2324 link: Option<&str>,
2325) -> String {
2326 let mut output = escape_html(text).replace('\n', "<br>");
2327 if bold {
2328 output = format!("<strong>{output}</strong>");
2329 }
2330 if italic {
2331 output = format!("<em>{output}</em>");
2332 }
2333 if strike {
2334 output = format!("<s>{output}</s>");
2335 }
2336 if let Some(link) = link {
2337 output = format!("<a href=\"{}\">{output}</a>", escape_html(link));
2338 }
2339 output
2340}
2341
/// Maps a named paragraph style to the HTML tag used to render it.
///
/// `TITLE` and `SUBTITLE` share `h1` and `h2` with `HEADING_1` and
/// `HEADING_2`; any unknown or missing style maps to `p`.
fn paragraph_tag(style: Option<&str>) -> &'static str {
    const TAGS: [(&str, &str); 8] = [
        ("TITLE", "h1"),
        ("HEADING_1", "h1"),
        ("SUBTITLE", "h2"),
        ("HEADING_2", "h2"),
        ("HEADING_3", "h3"),
        ("HEADING_4", "h4"),
        ("HEADING_5", "h5"),
        ("HEADING_6", "h6"),
    ];

    let Some(style) = style else {
        return "p";
    };
    for (name, tag) in TAGS {
        if name == style {
            return tag;
        }
    }
    "p"
}
2353
2354fn blocks_to_text(blocks: &[CapturedBlock]) -> String {
2355 blocks
2356 .iter()
2357 .map(|block| match block {
2358 CapturedBlock::Paragraph { content, .. } => content_to_text(content),
2359 CapturedBlock::Table(table) => table
2360 .rows
2361 .iter()
2362 .map(|row| {
2363 row.cells
2364 .iter()
2365 .map(|cell| content_to_text(&cell.content))
2366 .collect::<Vec<_>>()
2367 .join("\t")
2368 })
2369 .collect::<Vec<_>>()
2370 .join("\n"),
2371 })
2372 .filter(|text| !text.is_empty())
2373 .collect::<Vec<_>>()
2374 .join("\n")
2375}
2376
2377fn content_to_text(content: &[ContentNode]) -> String {
2378 content
2379 .iter()
2380 .map(|node| match node {
2381 ContentNode::Text { text, .. } => text.clone(),
2382 ContentNode::Image {
2383 url: Some(_), alt, ..
2384 } => format!("[{alt}]"),
2385 ContentNode::Image { .. } => String::new(),
2386 })
2387 .collect()
2388}
2389
/// Escapes text for safe inclusion in HTML element content and in
/// double- or single-quoted attribute values.
///
/// `&`, `<`, `>`, `"` and `'` are replaced by `&amp;`, `&lt;`, `&gt;`,
/// `&quot;` and `&#39;`; every other character is copied unchanged. The
/// input is escaped exactly once, so an existing entity such as `&amp;`
/// becomes `&amp;amp;`.
fn escape_html(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
2398
/// Escapes text for use inside a Markdown table cell: `|` becomes `\|` and
/// each newline becomes `<br>`.
fn escape_markdown_table_cell(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '|' => escaped.push_str("\\|"),
            '\n' => escaped.push_str("<br>"),
            other => escaped.push(other),
        }
    }
    escaped
}
2402
/// Extracts the token from a `Bearer` authorization header value.
///
/// Surrounding whitespace is ignored and the scheme is matched
/// case-insensitively (`Bearer`, `bearer`, `BEARER`, ...), as HTTP
/// authentication scheme names are case-insensitive. The scheme must be
/// followed by a space; the remainder, trimmed, is the token.
///
/// Returns `None` when the value uses another scheme or the token is empty.
#[must_use]
pub fn extract_bearer_token(auth_header: &str) -> Option<&str> {
    let (scheme, credentials) = auth_header.trim().split_once(' ')?;
    if !scheme.eq_ignore_ascii_case("bearer") {
        return None;
    }
    let token = credentials.trim();
    (!token.is_empty()).then_some(token)
}
2415
/// An image stored alongside an archived document.
#[derive(Debug, Clone)]
pub struct ExtractedImage {
    /// File name of the image inside the archive's image directory.
    pub filename: String,
    /// Raw image bytes.
    pub data: Vec<u8>,
    /// MIME type of the image.
    pub mime_type: String,
}
2426
2427#[derive(Debug, Clone)]
2429pub struct GDocsArchiveResult {
2430 pub html: String,
2432 pub markdown: String,
2434 pub images: Vec<ExtractedImage>,
2436 pub document_id: String,
2438 pub export_url: String,
2440}
2441
2442pub async fn localize_rendered_remote_images_for_archive(
2454 rendered: &GDocsRenderedResult,
2455) -> crate::Result<GDocsArchiveResult> {
2456 let client = reqwest::Client::builder().build().map_err(|error| {
2457 WebCaptureError::FetchError(format!("Failed to create image download client: {error}"))
2458 })?;
2459 let mut seen = HashMap::new();
2460 let mut images = Vec::new();
2461 let mut next_index = 1usize;
2462
2463 for image in &rendered.remote_images {
2464 if seen.contains_key(&image.url) {
2465 continue;
2466 }
2467 let filename = remote_image_filename(&image.url, next_index);
2468 next_index += 1;
2469 seen.insert(image.url.clone(), filename.clone());
2470
2471 match client
2472 .get(&image.url)
2473 .header("User-Agent", GDOCS_USER_AGENT)
2474 .header("Accept", "image/*,*/*;q=0.8")
2475 .send()
2476 .await
2477 {
2478 Ok(response) if response.status().is_success() => {
2479 let mime_type = response
2480 .headers()
2481 .get(reqwest::header::CONTENT_TYPE)
2482 .and_then(|value| value.to_str().ok())
2483 .map_or_else(|| mime_type_for_filename(&filename), ToString::to_string);
2484 let data = response.bytes().await.map_err(|error| {
2485 WebCaptureError::FetchError(format!(
2486 "Failed to read Google Docs image {}: {error}",
2487 image.url
2488 ))
2489 })?;
2490 debug!(
2491 url = %image.url,
2492 filename = %filename,
2493 bytes = data.len(),
2494 mime_type = %mime_type,
2495 "downloaded Google Docs browser-model archive image"
2496 );
2497 images.push(ExtractedImage {
2498 filename,
2499 data: data.to_vec(),
2500 mime_type,
2501 });
2502 }
2503 Ok(response) => {
2504 warn!(
2505 url = %image.url,
2506 status = response.status().as_u16(),
2507 "failed to download Google Docs browser-model archive image"
2508 );
2509 }
2510 Err(error) => {
2511 warn!(
2512 url = %image.url,
2513 error = %error,
2514 "failed to download Google Docs browser-model archive image"
2515 );
2516 }
2517 }
2518 }
2519
2520 let mut markdown = rendered.markdown.clone();
2521 let mut html = rendered.html.clone();
2522 for (url, filename) in seen {
2523 let local_path = format!("images/{filename}");
2524 markdown = markdown.replace(&url, &local_path);
2525 html = html.replace(&url, &local_path);
2526 }
2527
2528 Ok(GDocsArchiveResult {
2529 html,
2530 markdown,
2531 images,
2532 document_id: rendered.document_id.clone(),
2533 export_url: rendered.export_url.clone(),
2534 })
2535}
2536
2537fn remote_image_filename(url: &str, index: usize) -> String {
2538 let ext = crate::localize_images::get_extension_from_url(url);
2539 format!("image-{index:02}{ext}")
2540}
2541
/// Guesses an image MIME type from the extension of `filename`.
///
/// The text after the last `.` is lower-cased and looked up in a small table;
/// a name without any dot is looked up as a whole. Anything not in the table
/// maps to `image/png`.
fn mime_type_for_filename(filename: &str) -> String {
    const KNOWN_TYPES: [(&str, &str); 5] = [
        ("jpg", "image/jpeg"),
        ("jpeg", "image/jpeg"),
        ("gif", "image/gif"),
        ("webp", "image/webp"),
        ("svg", "image/svg+xml"),
    ];

    // `rsplit` always yields at least one piece, so the "png" fallback is
    // never actually taken; unknown extensions fall through to the default
    // below instead.
    let extension = filename.rsplit('.').next().unwrap_or("png").to_lowercase();

    KNOWN_TYPES
        .iter()
        .find(|&&(known, _)| known == extension)
        .map_or("image/png", |&(_, mime)| mime)
        .to_string()
}
2558
2559fn base64_image_pattern() -> &'static Regex {
2560 static PATTERN: OnceLock<Regex> = OnceLock::new();
2561 PATTERN.get_or_init(|| {
2562 Regex::new(
2563 r#"(<img\s[^>]*src=")data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([^"]+)(")"#,
2564 )
2565 .unwrap()
2566 })
2567}
2568
2569#[must_use]
2582pub fn extract_base64_images(html: &str) -> (String, Vec<ExtractedImage>) {
2583 let mut images = Vec::new();
2584 let mut idx = 1u32;
2585
2586 let updated_html = base64_image_pattern()
2587 .replace_all(html, |caps: ®ex::Captures<'_>| {
2588 let prefix = &caps[1];
2589 let mime_ext = &caps[2];
2590 let base64_data = &caps[3];
2591 let suffix = &caps[4];
2592
2593 let ext = match mime_ext {
2594 "jpeg" => "jpg",
2595 "svg+xml" => "svg",
2596 other => other,
2597 };
2598
2599 let filename = format!("image-{idx:02}.{ext}");
2600 let mime_type = format!("image/{mime_ext}");
2601
2602 if let Ok(data) = base64::engine::general_purpose::STANDARD.decode(base64_data) {
2603 debug!("Extracted image: {} ({} bytes)", filename, data.len());
2604 images.push(ExtractedImage {
2605 filename: filename.clone(),
2606 data,
2607 mime_type,
2608 });
2609 }
2610
2611 idx += 1;
2612 format!("{prefix}images/{filename}{suffix}")
2613 })
2614 .into_owned();
2615
2616 (updated_html, images)
2617}
2618
2619pub async fn fetch_google_doc_as_archive(
2638 url: &str,
2639 api_token: Option<&str>,
2640) -> crate::Result<GDocsArchiveResult> {
2641 let result = fetch_google_doc(url, "html", api_token).await?;
2642
2643 let preprocess = preprocess_google_docs_export_html(&result.content);
2644 debug!(
2645 document_id = %result.document_id,
2646 hoisted = preprocess.hoisted,
2647 unwrapped_links = preprocess.unwrapped_links,
2648 "google-docs-export pre-processor rewrote archive markup"
2649 );
2650
2651 let (local_html, images) = extract_base64_images(&preprocess.html);
2652
2653 let markdown = crate::markdown::convert_html_to_markdown(&local_html, None)?;
2654
2655 debug!(
2656 "Archive prepared: {} images extracted, {} bytes HTML, {} bytes Markdown",
2657 images.len(),
2658 local_html.len(),
2659 markdown.len()
2660 );
2661
2662 Ok(GDocsArchiveResult {
2663 html: local_html,
2664 markdown,
2665 images,
2666 document_id: result.document_id,
2667 export_url: result.export_url,
2668 })
2669}
2670
2671pub fn create_archive_zip(
2682 archive: &GDocsArchiveResult,
2683 pretty_html: bool,
2684) -> crate::Result<Vec<u8>> {
2685 let mut buf = std::io::Cursor::new(Vec::new());
2686
2687 {
2688 let mut zip = zip::ZipWriter::new(&mut buf);
2689 let options = zip::write::SimpleFileOptions::default()
2690 .compression_method(zip::CompressionMethod::Deflated);
2691
2692 zip.start_file("document.md", options)
2693 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
2694 zip.write_all(archive.markdown.as_bytes())?;
2695
2696 let html_output = if pretty_html {
2697 crate::html::pretty_print_html(&archive.html)
2698 } else {
2699 archive.html.clone()
2700 };
2701 zip.start_file("document.html", options)
2702 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
2703 zip.write_all(html_output.as_bytes())?;
2704
2705 for img in &archive.images {
2706 zip.start_file(format!("images/{}", img.filename), options)
2707 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
2708 zip.write_all(&img.data)?;
2709 }
2710
2711 zip.finish()
2712 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
2713 }
2714
2715 Ok(buf.into_inner())
2716}