1use async_tungstenite::tokio::{connect_async, ConnectStream};
32use async_tungstenite::tungstenite::Message;
33use async_tungstenite::WebSocketStream;
34use base64::Engine;
35use futures::{SinkExt, StreamExt};
36use regex::Regex;
37use serde_json::Value;
38use std::collections::HashMap;
39use std::fmt::Write as _;
40use std::hash::BuildHasher;
41use std::io::Write;
42use std::process::Stdio;
43use std::sync::OnceLock;
44use std::time::{Duration, Instant};
45use tokio::io::{AsyncBufReadExt, BufReader};
46use tokio::process::{Child, Command};
47use tracing::{debug, info, warn};
48
49use crate::WebCaptureError;
50
/// Base URL that `build_export_url` and `build_edit_url` extend with
/// `/{document_id}/export?...` and `/{document_id}/edit` respectively.
const GDOCS_EXPORT_BASE: &str = "https://docs.google.com/document/d";
/// Base URL that `build_docs_api_url` extends with `/{document_id}`.
const GDOCS_API_BASE: &str = "https://docs.googleapis.com/v1/documents";
/// Desktop Chrome `User-Agent` string. The public export request in
/// `fetch_google_doc` sends this same value.
const GDOCS_USER_AGENT: &str =
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
/// 30 second duration.
/// NOTE(review): presumably the maximum wait for the editor model to appear
/// in the browser capture path; the usage is outside this view, confirm.
const GDOCS_EDITOR_MODEL_WAIT: Duration = Duration::from_secs(30);
/// 20 second duration.
/// NOTE(review): presumably the browser start-up timeout; the usage is
/// outside this view, confirm.
const GDOCS_BROWSER_LAUNCH_TIMEOUT: Duration = Duration::from_secs(20);

/// WebSocket stream over an `async_tungstenite` tokio connection.
/// NOTE(review): the name suggests a Chrome DevTools Protocol session; confirm
/// against the code that opens it.
type CdpWebSocket = WebSocketStream<ConnectStream>;
59
/// JavaScript that records Google Docs model chunks as the page assigns them.
///
/// The script defines a non-configurable `DOCS_modelChunk` accessor on
/// `window`. Every value assigned to it is copied (via a JSON round-trip when
/// possible) into `window.__captured_chunks`; arrays are flattened item by
/// item, and their `push` method is wrapped so later pushes are captured too.
/// The most recently assigned value stays readable through the getter.
///
/// NOTE(review): presumably injected before any page script runs; the
/// injection site is outside this view, confirm.
const GDOCS_MODEL_CAPTURE_INIT_SCRIPT: &str = r"
window.__captured_chunks = [];
const captureChunk = (value) => {
 if (!value) {
 return;
 }
 if (Array.isArray(value)) {
 for (const item of value) {
 captureChunk(item);
 }
 return;
 }
 try {
 window.__captured_chunks.push(JSON.parse(JSON.stringify(value)));
 } catch {
 window.__captured_chunks.push(value);
 }
};
const wrapChunkArray = (value) => {
 if (!Array.isArray(value) || value.__webCaptureDocsModelWrapped) {
 return value;
 }
 const originalPush = value.push;
 Object.defineProperty(value, '__webCaptureDocsModelWrapped', {
 value: true,
 enumerable: false,
 });
 Object.defineProperty(value, 'push', {
 value(...items) {
 for (const item of items) {
 captureChunk(item);
 }
 return originalPush.apply(this, items);
 },
 writable: true,
 configurable: true,
 });
 for (const item of value) {
 captureChunk(item);
 }
 return value;
};
Object.defineProperty(window, 'DOCS_modelChunk', {
 set(value) {
 captureChunk(value);
 window.__DOCS_modelChunk_latest = wrapChunkArray(value);
 },
 get() {
 return window.__DOCS_modelChunk_latest;
 },
 configurable: false,
});
";
113
/// JavaScript function expression that returns `{ chunks, cidUrlMap }`.
///
/// `chunks` is a copy of `window.__captured_chunks`; when nothing was captured
/// but `window.DOCS_modelChunk` is set, that value is used as the only chunk.
/// `cidUrlMap` is built by scanning every `<script>` element that mentions
/// `docs-images-rt` for `"id": "https://docs.google.com/docs-images-rt/..."`
/// pairs and un-escaping `\u003d`, `\u0026` and `\/` in the URL.
const GDOCS_MODEL_EXTRACT_SCRIPT: &str = r#"() => {
 const chunks = [...(window.__captured_chunks || [])];
 if (
 window.DOCS_modelChunk &&
 chunks.length === 0 &&
 !chunks.includes(window.DOCS_modelChunk)
 ) {
 chunks.push(window.DOCS_modelChunk);
 }
 const cidUrlMap = {};
 const scripts = document.querySelectorAll('script');
 for (const script of scripts) {
 const text = script.textContent || '';
 if (!text.includes('docs-images-rt')) {
 continue;
 }
 const regex =
 /"([A-Za-z0-9_-]{20,})"\s*:\s*"(https:\/\/docs\.google\.com\/docs-images-rt\/[^"]+)"/g;
 let match;
 while ((match = regex.exec(text)) !== null) {
 cidUrlMap[match[1]] = match[2]
 .replace(/\\u003d/g, '=')
 .replace(/\\u0026/g, '&')
 .replace(/\\\//g, '/');
 }
 }
 return { chunks, cidUrlMap };
}"#;
142
143fn gdocs_url_pattern() -> &'static Regex {
144 static PATTERN: OnceLock<Regex> = OnceLock::new();
145 PATTERN.get_or_init(|| Regex::new(r"docs\.google\.com/document/d/([a-zA-Z0-9_-]+)").unwrap())
146}
147
/// Result of fetching a document through the public export endpoint.
#[derive(Debug, Clone)]
pub struct GDocsResult {
    /// Response body (entity-decoded for `html`/`txt`/`md`), or the rendered
    /// Markdown when produced by `fetch_google_doc_as_markdown`.
    pub content: String,
    /// Format label: the format string passed by the caller, or `"markdown"`.
    pub format: String,
    /// Document id extracted from the source URL.
    pub document_id: String,
    /// Export URL that was requested.
    pub export_url: String,
}
160
/// Capture strategy chosen by `select_capture_method`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GDocsCaptureMethod {
    /// Selected for the `"browser"` capture mode.
    BrowserModel,
    /// Selected for the `"api"` capture mode when no API token is available.
    PublicExport,
    /// Selected for the `"api"` capture mode when an API token is available.
    DocsApi,
}
171
/// Rendered document, as returned by `fetch_google_doc_from_docs_api`.
///
/// NOTE(review): the field descriptions below are inferred from names and
/// types only; the code that fills them is outside this view.
#[derive(Debug, Clone)]
pub struct GDocsRenderedResult {
    /// Markdown rendering (presumably of the whole document).
    pub markdown: String,
    /// HTML rendering (presumably of the whole document).
    pub html: String,
    /// Plain-text rendering (presumably of the whole document).
    pub text: String,
    /// Document id.
    pub document_id: String,
    /// Export URL associated with the document.
    pub export_url: String,
    /// Images referenced by URL.
    pub remote_images: Vec<RemoteImage>,
}
188
/// An image referenced by URL together with its alternative text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteImage {
    /// Image URL.
    pub url: String,
    /// Alternative text.
    pub alt: String,
}
197
/// Raw model data pulled from the browser.
///
/// NOTE(review): presumably mirrors the `{ chunks, cidUrlMap }` object returned
/// by `GDOCS_MODEL_EXTRACT_SCRIPT`; confirm against the deserialising code.
#[derive(Debug, Clone)]
struct BrowserModelData {
    /// Captured model chunks as untyped JSON values.
    chunks: Vec<Value>,
    /// String-to-string map; presumably image content id -> image URL.
    cid_urls: HashMap<String, String>,
}
203
/// Structured representation of a captured document.
///
/// NOTE(review): how the fields relate to each other (e.g. whether `tables`
/// and `images` duplicate entries of `blocks`) is not visible here; confirm.
#[derive(Debug, Clone, Default)]
pub struct CapturedDocument {
    /// Paragraph and table blocks.
    pub blocks: Vec<CapturedBlock>,
    /// Table blocks.
    pub tables: Vec<TableBlock>,
    /// Content nodes collected as images.
    pub images: Vec<ContentNode>,
    /// Text of the document.
    pub text: String,
}
216
/// A block-level element of a captured document.
#[derive(Debug, Clone)]
pub enum CapturedBlock {
    /// A paragraph with inline content and paragraph-level metadata.
    Paragraph {
        /// Inline nodes of the paragraph.
        content: Vec<ContentNode>,
        /// Optional style name (presumably a heading/named style; confirm).
        style: Option<String>,
        /// List membership, when the paragraph is a list item.
        list: Option<ListMeta>,
        /// Whether the paragraph is flagged as a quote.
        quote: bool,
        /// Whether the paragraph is flagged as a horizontal rule.
        horizontal_rule: bool,
    },
    /// A table.
    Table(TableBlock),
}
236
/// A table made of rows.
#[derive(Debug, Clone, Default)]
pub struct TableBlock {
    /// Rows of the table.
    pub rows: Vec<TableRow>,
}
243
/// A single table row.
#[derive(Debug, Clone, Default)]
pub struct TableRow {
    /// Cells of the row.
    pub cells: Vec<TableCell>,
}
250
/// A single table cell.
#[derive(Debug, Clone, Default)]
pub struct TableCell {
    /// Inline nodes contained in the cell.
    pub content: Vec<ContentNode>,
}
257
/// An inline node: either a styled text run or an image.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentNode {
    /// A run of text with its inline formatting.
    Text {
        /// The text of the run.
        text: String,
        /// Bold flag.
        bold: bool,
        /// Italic flag.
        italic: bool,
        /// Strike-through flag.
        strike: bool,
        /// Link target, when the run is a hyperlink.
        link: Option<String>,
    },
    /// An image reference.
    Image {
        /// Optional content id (presumably a key into the cid -> URL map; confirm).
        cid: Option<String>,
        /// Optional image URL.
        url: Option<String>,
        /// Alternative text.
        alt: String,
        /// Suggestion flag (presumably marks images from suggested edits; confirm).
        is_suggestion: bool,
    },
}
286
/// Inline formatting flags; the fields match those of `ContentNode::Text`
/// minus the text itself.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct TextStyle {
    /// Bold flag.
    bold: bool,
    /// Italic flag.
    italic: bool,
    /// Strike-through flag.
    strike: bool,
    /// Link target, if any.
    link: Option<String>,
}
294
/// Paragraph-level metadata; the fields match those of
/// `CapturedBlock::Paragraph` minus the inline content.
#[derive(Debug, Clone, Default)]
struct ParagraphMeta {
    /// Optional style name.
    style: Option<String>,
    /// List membership, if any.
    list: Option<ListMeta>,
    /// Quote flag.
    quote: bool,
    /// Horizontal-rule flag.
    horizontal_rule: bool,
}
302
/// List membership of a paragraph.
#[derive(Debug, Clone)]
pub struct ListMeta {
    /// Identifier of the list.
    pub id: String,
    /// Nesting level (presumably zero-based; confirm against the producer).
    pub level: usize,
    /// Whether the list is ordered (numbered) rather than bulleted.
    pub ordered: bool,
}
312
/// Paragraph style data.
///
/// NOTE(review): the units of the indent fields are not visible here
/// (presumably points); confirm against the code that fills them.
#[derive(Debug, Clone)]
struct ParagraphStyle {
    /// Optional style name.
    style: Option<String>,
    /// Start indent.
    indent_start: f64,
    /// First-line indent.
    indent_first_line: f64,
}
319
/// Style lookup tables for the document model.
///
/// NOTE(review): the `usize` keys are presumably character offsets at which a
/// paragraph ends (going by the `_by_end` names); confirm against the builder.
#[derive(Debug, Clone, Default)]
struct ModelStyleMaps {
    /// Inline styles (presumably indexed by character position; confirm).
    inline_styles: Vec<TextStyle>,
    /// Paragraph style keyed by a `usize` position.
    paragraph_by_end: HashMap<usize, ParagraphStyle>,
    /// List metadata keyed by a `usize` position.
    list_by_end: HashMap<usize, ListMeta>,
    /// Positions flagged as horizontal rules.
    horizontal_rules: std::collections::HashSet<usize>,
}
327
328#[must_use]
330pub fn is_google_docs_url(url: &str) -> bool {
331 gdocs_url_pattern().is_match(url)
332}
333
334#[must_use]
338pub fn extract_document_id(url: &str) -> Option<String> {
339 gdocs_url_pattern()
340 .captures(url)
341 .and_then(|caps| caps.get(1))
342 .map(|m| m.as_str().to_string())
343}
344
345#[must_use]
352pub fn build_export_url(document_id: &str, format: &str) -> String {
353 let export_format = match format {
354 "html" | "txt" | "md" | "pdf" | "docx" | "epub" | "zip" => format,
355 _ => "html",
356 };
357 format!("{GDOCS_EXPORT_BASE}/{document_id}/export?format={export_format}")
358}
359
360#[must_use]
362pub fn build_edit_url(document_id: &str) -> String {
363 format!("{GDOCS_EXPORT_BASE}/{document_id}/edit")
364}
365
366#[must_use]
368pub fn build_docs_api_url(document_id: &str) -> String {
369 format!("{GDOCS_API_BASE}/{document_id}")
370}
371
372pub fn select_capture_method(
378 capture: &str,
379 api_token: Option<&str>,
380) -> crate::Result<GDocsCaptureMethod> {
381 match capture.to_lowercase().as_str() {
382 "browser" => Ok(GDocsCaptureMethod::BrowserModel),
383 "api" if api_token.is_some() => Ok(GDocsCaptureMethod::DocsApi),
384 "api" => Ok(GDocsCaptureMethod::PublicExport),
385 other => Err(WebCaptureError::InvalidUrl(format!(
386 "Unsupported Google Docs capture method \"{other}\". Use \"browser\" or \"api\"."
387 ))),
388 }
389}
390
391pub async fn fetch_google_doc(
406 url: &str,
407 format: &str,
408 api_token: Option<&str>,
409) -> crate::Result<GDocsResult> {
410 let document_id = extract_document_id(url).ok_or_else(|| {
411 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
412 })?;
413
414 let export_url = build_export_url(&document_id, format);
415 debug!(
416 document_id = %document_id,
417 format = %format,
418 export_url = %export_url,
419 has_api_token = api_token.is_some(),
420 "fetching Google Doc via public export"
421 );
422
423 let mut request = reqwest::Client::new()
424 .get(&export_url)
425 .header(
426 "User-Agent",
427 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
428 )
429 .header("Accept-Charset", "utf-8")
430 .header("Accept-Language", "en-US,en;q=0.9");
431
432 if let Some(token) = api_token {
433 request = request.header("Authorization", format!("Bearer {token}"));
434 }
435
436 let response = request
437 .send()
438 .await
439 .map_err(|e| WebCaptureError::FetchError(format!("Failed to fetch Google Doc: {e}")))?;
440 debug!(
441 document_id = %document_id,
442 status = response.status().as_u16(),
443 success = response.status().is_success(),
444 content_type = response
445 .headers()
446 .get(reqwest::header::CONTENT_TYPE)
447 .and_then(|value| value.to_str().ok())
448 .unwrap_or(""),
449 "received Google Docs public export response"
450 );
451
452 if !response.status().is_success() {
453 return Err(WebCaptureError::FetchError(format!(
454 "Failed to fetch Google Doc ({} {}): {}",
455 response.status().as_u16(),
456 response.status().canonical_reason().unwrap_or("Unknown"),
457 export_url
458 )));
459 }
460
461 let raw_content = response.text().await.map_err(|e| {
462 WebCaptureError::FetchError(format!("Failed to read Google Doc response: {e}"))
463 })?;
464 debug!(
465 document_id = %document_id,
466 bytes = raw_content.len(),
467 "read Google Docs public export body"
468 );
469
470 let content = match format {
472 "html" | "txt" | "md" => crate::html::decode_html_entities(&raw_content),
473 _ => raw_content,
474 };
475
476 Ok(GDocsResult {
477 content,
478 format: format.to_string(),
479 document_id,
480 export_url,
481 })
482}
483
484pub async fn fetch_google_doc_as_markdown(
498 url: &str,
499 api_token: Option<&str>,
500) -> crate::Result<GDocsResult> {
501 let result = fetch_google_doc(url, "html", api_token).await?;
502
503 let preprocess = preprocess_google_docs_export_html(&result.content);
504 debug!(
505 document_id = %result.document_id,
506 hoisted = preprocess.hoisted,
507 unwrapped_links = preprocess.unwrapped_links,
508 "google-docs-export pre-processor rewrote markup"
509 );
510 let markdown = normalize_google_docs_export_markdown(
511 &crate::markdown::convert_html_to_markdown(&preprocess.html, Some(&result.export_url))?,
512 );
513 debug!(
514 document_id = %result.document_id,
515 bytes = markdown.len(),
516 "rendered Google Docs public export markdown"
517 );
518
519 Ok(GDocsResult {
520 content: markdown,
521 format: "markdown".to_string(),
522 document_id: result.document_id,
523 export_url: result.export_url,
524 })
525}
526
/// Output of `preprocess_google_docs_export_html`.
#[derive(Debug, Clone)]
pub struct GDocsExportPreprocessResult {
    /// The rewritten HTML.
    pub html: String,
    /// Number of `<span>` elements replaced by semantic tags.
    pub hoisted: usize,
    /// Number of Google redirect links rewritten to their target.
    pub unwrapped_links: usize,
}
540
541#[must_use]
549pub fn preprocess_google_docs_export_html(html: &str) -> GDocsExportPreprocessResult {
550 let mut hoisted: usize = 0;
551 let mut unwrapped_links: usize = 0;
552 let class_styles = extract_css_class_styles(html);
553
554 let mut out = hoist_inline_style_spans(html, &mut hoisted);
555 out = hoist_class_style_spans(&out, &class_styles, &mut hoisted);
556 out = convert_class_indented_blockquotes(&out, &class_styles);
557 out = nest_google_docs_lists(&out, &class_styles);
558 out = strip_google_docs_heading_noise(&out);
559 out = strip_heading_inline_formatting(&out);
560 out = unwrap_google_redirect_links(&out, &mut unwrapped_links);
561 out = out.replace(" ", " ");
562 out = out.replace('\u{00A0}', " ");
563
564 GDocsExportPreprocessResult {
565 html: out,
566 hoisted,
567 unwrapped_links,
568 }
569}
570
571#[must_use]
573pub fn normalize_google_docs_export_markdown(markdown: &str) -> String {
574 let markdown = unescape_public_export_punctuation(markdown);
575 let markdown = convert_setext_headings(&markdown);
576 let markdown = normalize_atx_headings(&markdown);
577 let markdown = normalize_bullet_markers(&markdown);
578 let markdown = normalize_list_spacing(&markdown);
579 let markdown = normalize_blockquote_spacing(&markdown);
580 let markdown = normalize_markdown_tables(&markdown);
581 crate::markdown::clean_markdown(&markdown)
582}
583
584fn hoist_inline_style_spans(html: &str, hoisted: &mut usize) -> String {
585 let span_re = Regex::new(r#"(?is)<span\s+([^>]*style="([^"]*)"[^>]*)>(.*?)</span>"#)
586 .expect("valid regex");
587 span_re
588 .replace_all(html, |caps: ®ex::Captures<'_>| {
589 let style = caps.get(2).map_or("", |m| m.as_str());
590 let inner = caps.get(3).map_or("", |m| m.as_str());
591 semantic_wrapped_html(inner, style).map_or_else(
592 || caps[0].to_string(),
593 |wrapped| {
594 *hoisted += 1;
595 wrapped
596 },
597 )
598 })
599 .into_owned()
600}
601
602fn hoist_class_style_spans(
603 html: &str,
604 class_styles: &HashMap<String, String>,
605 hoisted: &mut usize,
606) -> String {
607 let class_span_re = Regex::new(r#"(?is)<span\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</span>"#)
608 .expect("valid regex");
609 class_span_re
610 .replace_all(html, |caps: ®ex::Captures<'_>| {
611 let class_attr = caps.get(2).map_or("", |m| m.as_str());
612 let inner = caps.get(3).map_or("", |m| m.as_str());
613 let style = combined_class_style(class_styles, class_attr);
614 semantic_wrapped_html(inner, &style).map_or_else(
615 || caps[0].to_string(),
616 |wrapped| {
617 *hoisted += 1;
618 wrapped
619 },
620 )
621 })
622 .into_owned()
623}
624
625fn convert_class_indented_blockquotes(
626 html: &str,
627 class_styles: &HashMap<String, String>,
628) -> String {
629 let class_paragraph_re =
630 Regex::new(r#"(?is)<p\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</p>"#).expect("valid regex");
631 class_paragraph_re
632 .replace_all(html, |caps: ®ex::Captures<'_>| {
633 let class_attr = caps.get(2).map_or("", |m| m.as_str());
634 let inner = caps.get(3).map_or("", |m| m.as_str());
635 let style = combined_class_style(class_styles, class_attr);
636 if is_blockquote_style(&style) {
637 format!("<blockquote><p>{inner}</p></blockquote>")
638 } else {
639 caps[0].to_string()
640 }
641 })
642 .into_owned()
643}
644
/// One `<ul>`/`<ol>` element located in the export HTML.
#[derive(Debug, Clone)]
struct ExportListBlock {
    /// Byte offset of the start of the element in the scanned HTML.
    start: usize,
    /// Byte offset just past the end of the element in the scanned HTML.
    end: usize,
    /// Lower-cased tag name (`ul` or `ol`).
    tag: String,
    /// Markup between the opening and closing list tags.
    inner: String,
}
652
/// One `<li>` element taken from an `ExportListBlock`.
#[derive(Debug, Clone)]
struct ExportListItem {
    /// Tag name of the list the item came from (`ul` or `ol`).
    tag: String,
    /// Nesting level computed by `google_docs_list_item_level`.
    level: usize,
    /// Markup between `<li ...>` and `</li>`.
    inner: String,
}
659
660fn nest_google_docs_lists(html: &str, class_styles: &HashMap<String, String>) -> String {
661 let list_re = Regex::new(r"(?is)<(ul|ol)\b([^>]*)>(.*?)</(ul|ol)>").expect("valid regex");
662 let blocks: Vec<ExportListBlock> = list_re
663 .captures_iter(html)
664 .filter_map(|caps| {
665 let open_tag = caps.get(1)?.as_str().to_ascii_lowercase();
666 let close_tag = caps.get(4)?.as_str().to_ascii_lowercase();
667 if open_tag != close_tag {
668 return None;
669 }
670 let whole = caps.get(0)?;
671 Some(ExportListBlock {
672 start: whole.start(),
673 end: whole.end(),
674 tag: open_tag,
675 inner: caps.get(3).map_or("", |m| m.as_str()).to_string(),
676 })
677 })
678 .collect();
679
680 if blocks.len() < 2 {
681 return html.to_string();
682 }
683
684 let mut groups: Vec<Vec<ExportListBlock>> = Vec::new();
685 let mut current: Vec<ExportListBlock> = Vec::new();
686 for block in blocks {
687 if let Some(previous) = current.last() {
688 if !html[previous.end..block.start].trim().is_empty() {
689 if current.len() > 1 {
690 groups.push(std::mem::take(&mut current));
691 } else {
692 current.clear();
693 }
694 }
695 }
696 current.push(block);
697 }
698 if current.len() > 1 {
699 groups.push(current);
700 }
701
702 if groups.is_empty() {
703 return html.to_string();
704 }
705
706 let mut out = html.to_string();
707 for group in groups.iter().rev() {
708 let rendered = render_nested_list_group(group, class_styles);
709 let start = group.first().expect("non-empty group").start;
710 let end = group.last().expect("non-empty group").end;
711 out.replace_range(start..end, &rendered);
712 }
713 out
714}
715
/// Renders a run of adjacent exported lists as one nested list structure.
///
/// Every `<li>` of every block in `group` is collected with the tag of its
/// source list and a nesting level from `google_docs_list_item_level`. The
/// items are then emitted in order while a per-level stack tracks which list
/// tag is open (`open_tags`) and whether an `<li>` is still open (`item_open`).
/// Original list and item attributes are not carried over.
///
/// When no `<li>` is found, the blocks are re-emitted as bare
/// `<tag>inner</tag>` elements.
fn render_nested_list_group(
    group: &[ExportListBlock],
    class_styles: &HashMap<String, String>,
) -> String {
    let item_re = Regex::new(r"(?is)<li\b([^>]*)>(.*?)</li>").expect("valid regex");
    let items: Vec<ExportListItem> = group
        .iter()
        .flat_map(|block| {
            item_re.captures_iter(&block.inner).map(|caps| {
                let attrs = caps.get(1).map_or("", |m| m.as_str());
                let inner = caps.get(2).map_or("", |m| m.as_str()).to_string();
                ExportListItem {
                    tag: block.tag.clone(),
                    level: google_docs_list_item_level(attrs, class_styles),
                    inner,
                }
            })
        })
        .collect();

    if items.is_empty() {
        let mut unchanged = String::new();
        for block in group {
            write!(unchanged, "<{}>{}</{}>", block.tag, block.inner, block.tag)
                .expect("write to String");
        }
        return unchanged;
    }

    let mut html = String::new();
    // Deepest level that currently has an open list; `None` before the first item.
    let mut current_level: Option<usize> = None;
    let mut open_tags: Vec<Option<String>> = Vec::new();
    let mut item_open: Vec<bool> = Vec::new();

    for item in items {
        let level = item.level;
        // Moving to a shallower level: close every deeper list first.
        while current_level.is_some_and(|current| current > level) {
            let current = current_level.expect("checked as Some");
            close_rendered_list(&mut html, &mut open_tags, &mut item_open, current);
            current_level = current.checked_sub(1);
        }

        // Moving to a deeper level: open one list per missing level, inside
        // whatever `<li>` is still open.
        while current_level.is_none_or(|current| current < level) {
            let next_level = current_level.map_or(0, |current| current + 1);
            open_rendered_list(
                &mut html,
                &mut open_tags,
                &mut item_open,
                next_level,
                &item.tag,
            );
            current_level = Some(next_level);
        }

        ensure_list_stack(&mut open_tags, &mut item_open, level);
        // Same level but a different list kind (ul vs ol): restart the list.
        if open_tags[level]
            .as_deref()
            .is_some_and(|tag| tag != item.tag)
        {
            close_rendered_list(&mut html, &mut open_tags, &mut item_open, level);
            open_rendered_list(&mut html, &mut open_tags, &mut item_open, level, &item.tag);
        } else if open_tags[level].is_none() {
            open_rendered_list(&mut html, &mut open_tags, &mut item_open, level, &item.tag);
        }

        // Close the previous sibling, then leave the new `<li>` open so a
        // deeper list can be nested inside it.
        close_rendered_item(&mut html, &mut item_open, level);
        html.push_str("<li>");
        html.push_str(&item.inner);
        item_open[level] = true;

        // Reset the bookkeeping for all deeper levels.
        for deeper in (level + 1)..item_open.len() {
            item_open[deeper] = false;
            open_tags[deeper] = None;
        }
    }

    // Close everything that is still open, deepest level first.
    while let Some(current) = current_level {
        close_rendered_list(&mut html, &mut open_tags, &mut item_open, current);
        current_level = current.checked_sub(1);
    }

    html
}
799
/// Grows both bookkeeping stacks so that index `level` exists in `open_tags`.
///
/// New `open_tags` slots are `None`; `item_open` receives the same number of
/// new `false` slots. Nothing happens when `open_tags` is already long enough.
fn ensure_list_stack(open_tags: &mut Vec<Option<String>>, item_open: &mut Vec<bool>, level: usize) {
    let required = level + 1;
    if open_tags.len() >= required {
        return;
    }
    let missing = required - open_tags.len();
    open_tags.resize(required, None);
    item_open.resize(item_open.len() + missing, false);
}
806
807fn open_rendered_list(
808 html: &mut String,
809 open_tags: &mut Vec<Option<String>>,
810 item_open: &mut Vec<bool>,
811 level: usize,
812 tag: &str,
813) {
814 ensure_list_stack(open_tags, item_open, level);
815 html.push('<');
816 html.push_str(tag);
817 html.push('>');
818 open_tags[level] = Some(tag.to_string());
819 item_open[level] = false;
820}
821
/// Emits `</li>` for `level` when an item is open there, and marks it closed.
///
/// Levels beyond the end of `item_open` are treated as having no open item.
fn close_rendered_item(html: &mut String, item_open: &mut [bool], level: usize) {
    match item_open.get_mut(level) {
        Some(open) if *open => {
            html.push_str("</li>");
            *open = false;
        }
        _ => {}
    }
}
828
829fn close_rendered_list(
830 html: &mut String,
831 open_tags: &mut [Option<String>],
832 item_open: &mut [bool],
833 level: usize,
834) {
835 close_rendered_item(html, item_open, level);
836 if let Some(tag) = open_tags.get_mut(level).and_then(Option::take) {
837 html.push_str("</");
838 html.push_str(&tag);
839 html.push('>');
840 }
841}
842
843fn google_docs_list_item_level(attrs: &str, class_styles: &HashMap<String, String>) -> usize {
844 let style = combined_attr_style(class_styles, attrs);
845 let margin_left = css_point_value(&style, "margin-left");
846 if margin_left <= 0.0 {
847 return 0;
848 }
849 [54.0, 90.0, 126.0, 162.0, 198.0, 234.0, 270.0, 306.0]
850 .iter()
851 .take_while(|boundary| margin_left >= **boundary)
852 .count()
853}
854
855fn combined_attr_style(class_styles: &HashMap<String, String>, attrs: &str) -> String {
856 let mut styles = String::new();
857 if let Some(style) = attr_value(attrs, "style") {
858 styles.push_str(&style);
859 }
860 if let Some(class_attr) = attr_value(attrs, "class") {
861 styles.push_str(&combined_class_style(class_styles, &class_attr));
862 }
863 styles
864}
865
866fn attr_value(attrs: &str, name: &str) -> Option<String> {
867 let attr_re = Regex::new(&format!(
868 r#"(?is)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)')"#,
869 regex::escape(name)
870 ))
871 .expect("valid regex");
872 attr_re.captures(attrs).and_then(|caps| {
873 caps.get(1)
874 .or_else(|| caps.get(2))
875 .map(|value| value.as_str().to_string())
876 })
877}
878
879fn strip_google_docs_heading_noise(html: &str) -> String {
880 let empty_anchor_re = Regex::new(r#"(?is)<a\s+id="[^"]*"\s*>\s*</a>"#).expect("valid regex");
881 let numbering_re =
882 Regex::new(r"(?is)<span\b[^>]*>\s*\d+(?:\.\d+)*\.?\s*</span>").expect("valid regex");
883 let mut out = empty_anchor_re.replace_all(html, "").into_owned();
884 for level in 1..=6 {
885 let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
886 .expect("valid regex");
887 out = heading_re
888 .replace_all(&out, |caps: ®ex::Captures<'_>| {
889 let open = &caps[1];
890 let inner = &caps[2];
891 let close = &caps[3];
892 let mut cleaned = empty_anchor_re.replace_all(inner, "").into_owned();
893 cleaned = numbering_re.replace_all(&cleaned, "").into_owned();
894 format!("{open}{cleaned}{close}")
895 })
896 .into_owned();
897 }
898 out
899}
900
901fn strip_heading_inline_formatting(html: &str) -> String {
902 let inline_marker_re = Regex::new(r"(?is)</?(?:strong|em|del)>").expect("valid regex");
903 let mut out = html.to_string();
904 for level in 1..=6 {
905 let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
906 .expect("valid regex");
907 out = heading_re
908 .replace_all(&out, |caps: ®ex::Captures<'_>| {
909 let open = &caps[1];
910 let inner = &caps[2];
911 let close = &caps[3];
912 let cleaned = inline_marker_re.replace_all(inner, "");
913 format!("{open}{cleaned}{close}")
914 })
915 .into_owned();
916 }
917 out
918}
919
920fn unwrap_google_redirect_links(html: &str, unwrapped_links: &mut usize) -> String {
921 let redirect_re =
922 Regex::new(r#"(?i)href="https?://(?:www\.)?google\.com/url\?q=([^&"]+)[^"]*""#)
923 .expect("valid regex");
924 redirect_re
925 .replace_all(html, |caps: ®ex::Captures<'_>| {
926 let encoded = caps.get(1).map_or("", |m| m.as_str());
927 let decoded = percent_decode_utf8_lossy(encoded);
928 *unwrapped_links += 1;
929 format!(r#"href="{decoded}""#)
930 })
931 .into_owned()
932}
933
934fn extract_css_class_styles(html: &str) -> HashMap<String, String> {
935 let mut class_styles: HashMap<String, String> = HashMap::new();
936 let style_re = Regex::new(r"(?is)<style\b[^>]*>(.*?)</style>").expect("valid regex");
937 let class_re = Regex::new(r"\.([A-Za-z0-9_-]+)\s*\{([^{}]*)\}").expect("valid regex");
938 for style_caps in style_re.captures_iter(html) {
939 let css = style_caps.get(1).map_or("", |m| m.as_str());
940 for class_caps in class_re.captures_iter(css) {
941 let class_name = class_caps.get(1).map_or("", |m| m.as_str());
942 let style = class_caps.get(2).map_or("", |m| m.as_str());
943 class_styles
944 .entry(class_name.to_string())
945 .and_modify(|existing| {
946 existing.push(';');
947 existing.push_str(style);
948 })
949 .or_insert_with(|| style.to_string());
950 }
951 }
952 class_styles
953}
954
/// Joins the declarations of every known class listed in `class_attr`.
///
/// Classes are taken in attribute order; each known class contributes
/// `;` followed by its declarations. Unknown classes are skipped, so the
/// result is empty when none of the classes is known.
fn combined_class_style(class_styles: &HashMap<String, String>, class_attr: &str) -> String {
    let mut combined = String::new();
    for class_name in class_attr.split_whitespace() {
        if let Some(style) = class_styles.get(class_name) {
            combined.push(';');
            combined.push_str(style);
        }
    }
    combined
}
965
966fn semantic_wrapped_html(inner: &str, style: &str) -> Option<String> {
967 let bold = css_has_bold(style);
968 let italic = css_has_italic(style);
969 let strike = css_has_strike(style);
970 if !bold && !italic && !strike {
971 return None;
972 }
973 let mut wrapped = inner.to_string();
974 if strike {
975 wrapped = format!("<del>{wrapped}</del>");
976 }
977 if italic {
978 wrapped = format!("<em>{wrapped}</em>");
979 }
980 if bold {
981 wrapped = format!("<strong>{wrapped}</strong>");
982 }
983 Some(wrapped)
984}
985
986fn css_has_bold(style: &str) -> bool {
987 Regex::new(r"(?i)font-weight\s*:\s*(?:bold|[6-9]\d{2})")
988 .expect("valid regex")
989 .is_match(style)
990}
991
992fn css_has_italic(style: &str) -> bool {
993 Regex::new(r"(?i)font-style\s*:\s*italic")
994 .expect("valid regex")
995 .is_match(style)
996}
997
998fn css_has_strike(style: &str) -> bool {
999 Regex::new(r"(?i)text-decoration[^;]*\bline-through\b")
1000 .expect("valid regex")
1001 .is_match(style)
1002}
1003
1004fn is_blockquote_style(style: &str) -> bool {
1005 let margin_left = css_point_value(style, "margin-left");
1006 let margin_right = css_point_value(style, "margin-right");
1007 margin_left > 0.0 && margin_right > 0.0 && (margin_left - margin_right).abs() < 0.1
1008}
1009
1010fn css_point_value(style: &str, property: &str) -> f64 {
1011 let re = Regex::new(&format!(
1012 r"(?i){}\s*:\s*(-?\d+(?:\.\d+)?)pt",
1013 regex::escape(property)
1014 ))
1015 .expect("valid regex");
1016 re.captures(style)
1017 .and_then(|caps| caps.get(1))
1018 .and_then(|value| value.as_str().parse::<f64>().ok())
1019 .unwrap_or(0.0)
1020}
1021
/// Percent-decodes `input` and interprets the resulting bytes as UTF-8.
///
/// Every `%XY` with two hexadecimal digits becomes the byte `0xXY`; any other
/// `%` is copied through unchanged. Invalid UTF-8 in the decoded bytes is
/// replaced with U+FFFD.
fn percent_decode_utf8_lossy(input: &str) -> String {
    /// Value of an ASCII hexadecimal digit, if `byte` is one.
    fn hex_value(byte: u8) -> Option<u8> {
        (byte as char)
            .to_digit(16)
            .and_then(|digit| u8::try_from(digit).ok())
    }

    let raw = input.as_bytes();
    let mut decoded = Vec::with_capacity(raw.len());
    let mut cursor = 0;
    while let Some(&byte) = raw.get(cursor) {
        if byte == b'%' {
            if let (Some(&hi), Some(&lo)) = (raw.get(cursor + 1), raw.get(cursor + 2)) {
                if let (Some(hi), Some(lo)) = (hex_value(hi), hex_value(lo)) {
                    decoded.push((hi << 4) | lo);
                    cursor += 3;
                    continue;
                }
            }
        }
        decoded.push(byte);
        cursor += 1;
    }
    String::from_utf8_lossy(&decoded).into_owned()
}
1045
/// Removes the backslash from escaped `.`, `!`, `(`, `)`, `[` and `]`.
///
/// The replacements are applied one character class at a time, in that order.
fn unescape_public_export_punctuation(markdown: &str) -> String {
    const UNESCAPES: [(&str, &str); 6] = [
        ("\\.", "."),
        ("\\!", "!"),
        ("\\(", "("),
        ("\\)", ")"),
        ("\\[", "["),
        ("\\]", "]"),
    ];
    UNESCAPES
        .iter()
        .fold(markdown.to_string(), |text, (escaped, plain)| {
            text.replace(escaped, plain)
        })
}
1055
1056fn convert_setext_headings(markdown: &str) -> String {
1057 let lines: Vec<&str> = markdown.lines().collect();
1058 let mut out = Vec::with_capacity(lines.len());
1059 let mut index = 0;
1060 while index < lines.len() {
1061 if index + 1 < lines.len() {
1062 let underline = lines[index + 1].trim();
1063 if is_setext_underline(underline, '=') {
1064 out.push(format!("# {}", lines[index].trim()));
1065 index += 2;
1066 continue;
1067 }
1068 if is_setext_underline(underline, '-') {
1069 out.push(format!("## {}", lines[index].trim()));
1070 index += 2;
1071 continue;
1072 }
1073 }
1074 out.push(lines[index].to_string());
1075 index += 1;
1076 }
1077 out.join("\n")
1078}
1079
/// Returns `true` when `line` is at least five bytes long and consists only of
/// `marker` characters.
fn is_setext_underline(line: &str, marker: char) -> bool {
    if line.len() < 5 {
        return false;
    }
    !line.chars().any(|ch| ch != marker)
}
1083
1084fn normalize_atx_headings(markdown: &str) -> String {
1085 let heading_re = Regex::new(r"^(#{1,6})\s+(.+?)\s*$").expect("valid regex");
1086 let closing_re = closing_atx_heading_re();
1087 markdown
1088 .lines()
1089 .map(|line| {
1090 let Some(caps) = heading_re.captures(line) else {
1091 return line.to_string();
1092 };
1093 let hashes = caps.get(1).map_or("", |m| m.as_str());
1094 let mut text = caps.get(2).map_or("", |m| m.as_str()).trim().to_string();
1095 text = closing_re.replace(&text, "").trim().to_string();
1096 text = strip_wrapping_markdown_emphasis(&text);
1097 format!("{hashes} {text}")
1098 })
1099 .collect::<Vec<_>>()
1100 .join("\n")
1101}
1102
/// Removes a single pair of emphasis markers (`***`, `**` or `*`) that wraps
/// the whole text.
///
/// Markers are tried longest first. A marker is only removed when it does not
/// occur again inside the wrapped text: otherwise the leading and trailing
/// markers belong to different emphasis runs (`*a* and *b*`), and removing
/// them would leave unbalanced markers behind. In every other case the trimmed
/// text is returned unchanged.
fn strip_wrapping_markdown_emphasis(text: &str) -> String {
    let trimmed = text.trim();
    for marker in ["***", "**", "*"] {
        let Some(inner) = trimmed
            .strip_prefix(marker)
            .and_then(|rest| rest.strip_suffix(marker))
        else {
            continue;
        };
        // An empty interior means the text is nothing but markers.
        if inner.is_empty() || inner.contains(marker) {
            continue;
        }
        return inner.trim().to_string();
    }
    trimmed.to_string()
}
1117
1118fn normalize_bullet_markers(markdown: &str) -> String {
1119 let bullet_re = asterisk_bullet_re();
1120 markdown
1121 .lines()
1122 .map(|line| bullet_re.replace(line, "$1- ").into_owned())
1123 .collect::<Vec<_>>()
1124 .join("\n")
1125}
1126
1127fn normalize_list_spacing(markdown: &str) -> String {
1128 let lines: Vec<&str> = markdown.lines().collect();
1129 let mut out = Vec::with_capacity(lines.len());
1130
1131 for (index, line) in lines.iter().enumerate() {
1132 if line.trim().is_empty()
1133 && previous_non_empty_line(&lines, index).is_some_and(is_markdown_list_item)
1134 && next_non_empty_line(&lines, index).is_some_and(is_markdown_list_item)
1135 {
1136 continue;
1137 }
1138 out.push((*line).to_string());
1139 }
1140
1141 out.join("\n")
1142}
1143
/// Returns the closest line before `index` that is not blank, if any.
///
/// Panics when `index` is greater than `lines.len()`.
fn previous_non_empty_line<'a>(lines: &'a [&str], index: usize) -> Option<&'a str> {
    let (before, _) = lines.split_at(index);
    before
        .iter()
        .rfind(|line| !line.trim().is_empty())
        .copied()
}
1151
/// Returns the closest line after `index` that is not blank, if any.
///
/// Panics when `index + 1` is greater than `lines.len()`.
fn next_non_empty_line<'a>(lines: &'a [&str], index: usize) -> Option<&'a str> {
    let (_, after) = lines.split_at(index + 1);
    for candidate in after {
        if !candidate.trim().is_empty() {
            return Some(*candidate);
        }
    }
    None
}
1158
1159fn is_markdown_list_item(line: &str) -> bool {
1160 markdown_list_item_re().is_match(line)
1161}
1162
/// Normalises blank lines in and around blockquotes.
///
/// * Blank lines and bare `>` lines inside a quote are collapsed into a single
///   `>` separator, emitted only when another `> ` line follows.
/// * Bare `>` lines outside a quote are dropped.
/// * The first non-quote line after a quote is preceded by one blank line.
///
/// Every emitted line ends with `\n`.
fn normalize_blockquote_spacing(markdown: &str) -> String {
    let mut rendered = String::with_capacity(markdown.len());
    let mut inside_quote = false;
    let mut separator_pending = false;

    for line in markdown.lines() {
        let stripped = line.trim();
        let is_blank = stripped.is_empty();

        // Blank lines within a quote and bare `>` lines are never emitted
        // directly; inside a quote they only schedule a separator.
        if (is_blank && inside_quote) || stripped == ">" {
            separator_pending = separator_pending || inside_quote;
            continue;
        }

        if line.starts_with("> ") {
            if std::mem::take(&mut separator_pending) {
                rendered.push_str(">\n");
            }
            inside_quote = true;
        } else {
            // Leaving a quote: separate it from the following content.
            if inside_quote && !is_blank {
                rendered.push('\n');
            }
            separator_pending = false;
            inside_quote = false;
        }

        rendered.push_str(line);
        rendered.push('\n');
    }

    rendered
}
1203
1204fn normalize_markdown_tables(markdown: &str) -> String {
1205 let lines: Vec<&str> = markdown.lines().collect();
1206 let mut out = Vec::with_capacity(lines.len());
1207 let mut index = 0;
1208
1209 while index < lines.len() {
1210 if !is_markdown_table_line(lines[index]) {
1211 out.push(lines[index].to_string());
1212 index += 1;
1213 continue;
1214 }
1215
1216 let start = index;
1217 while index < lines.len() && is_markdown_table_line(lines[index]) {
1218 index += 1;
1219 }
1220 let block = &lines[start..index];
1221 if block.len() >= 2 && is_markdown_separator_line(block[1]) {
1222 out.extend(normalize_markdown_table_block(block));
1223 } else {
1224 out.extend(block.iter().map(|line| (*line).to_string()));
1225 }
1226 }
1227
1228 out.join("\n")
1229}
1230
/// Returns `true` when the trimmed line starts and ends with `|` and contains
/// at least two `|` characters.
fn is_markdown_table_line(line: &str) -> bool {
    let trimmed = line.trim();
    let pipe_count = trimmed.chars().filter(|&ch| ch == '|').count();
    pipe_count >= 2 && trimmed.starts_with('|') && trimmed.ends_with('|')
}
1235
1236fn is_markdown_separator_line(line: &str) -> bool {
1237 split_markdown_table_cells(line)
1238 .iter()
1239 .all(|cell| markdown_table_separator_cell_re().is_match(cell))
1240}
1241
1242fn normalize_markdown_table_block(lines: &[&str]) -> Vec<String> {
1243 lines
1244 .iter()
1245 .enumerate()
1246 .map(|(index, line)| {
1247 let cells = split_markdown_table_cells(line);
1248 if index == 1 {
1249 let separators = vec!["---".to_string(); cells.len()];
1250 render_markdown_table_row(&separators)
1251 } else {
1252 render_markdown_table_row(&cells)
1253 }
1254 })
1255 .collect()
1256}
1257
/// Splits one Markdown table row into its trimmed cell texts.
///
/// Surrounding whitespace is ignored. At most one leading and one trailing
/// delimiter pipe are removed, so empty cells at either edge of the row are
/// preserved (`||a|` yields `["", "a"]`). A pipe preceded by an unescaped
/// backslash (`\|`) is treated as cell content rather than a delimiter, and the
/// escape sequence is kept verbatim so that re-rendering the row does not
/// change its column count.
///
/// The result always contains at least one element.
fn split_markdown_table_cells(line: &str) -> Vec<String> {
    let trimmed = line.trim();
    // Drop only the single opening delimiter; further pipes mark empty cells.
    let inner = trimmed.strip_prefix('|').unwrap_or(trimmed);

    let mut cells = Vec::new();
    let mut current = String::new();
    let mut escaped = false;
    let mut ended_on_delimiter = false;
    for ch in inner.chars() {
        ended_on_delimiter = ch == '|' && !escaped;
        if ended_on_delimiter {
            cells.push(current.trim().to_string());
            current.clear();
        } else {
            current.push(ch);
        }
        // A backslash escapes the next character unless it is itself escaped.
        escaped = ch == '\\' && !escaped;
    }
    // Text after the last delimiter is a cell of its own; a row that ends on a
    // delimiter has no such trailing cell.
    if !ended_on_delimiter {
        cells.push(current.trim().to_string());
    }
    cells
}
1265
/// Renders `cells` as one Markdown table row: cells are separated by ` | ` and
/// the row is wrapped in `| ` and ` |`.
fn render_markdown_table_row(cells: &[String]) -> String {
    let joined = cells.join(" | ");
    let mut row = String::with_capacity(joined.len() + 4);
    row.push_str("| ");
    row.push_str(&joined);
    row.push_str(" |");
    row
}
1269
1270fn closing_atx_heading_re() -> &'static Regex {
1271 static RE: OnceLock<Regex> = OnceLock::new();
1272 RE.get_or_init(|| Regex::new(r"\s+#{1,6}$").expect("valid regex"))
1273}
1274
1275fn asterisk_bullet_re() -> &'static Regex {
1276 static RE: OnceLock<Regex> = OnceLock::new();
1277 RE.get_or_init(|| Regex::new(r"^(\s*)\* ").expect("valid regex"))
1278}
1279
1280fn markdown_list_item_re() -> &'static Regex {
1281 static RE: OnceLock<Regex> = OnceLock::new();
1282 RE.get_or_init(|| Regex::new(r"^\s*(?:[-+*]|\d+\.)\s+").expect("valid regex"))
1283}
1284
1285fn markdown_table_separator_cell_re() -> &'static Regex {
1286 static RE: OnceLock<Regex> = OnceLock::new();
1287 RE.get_or_init(|| Regex::new(r"^:?-{3,}:?$").expect("valid regex"))
1288}
1289
/// Fetches a Google Doc through the Docs REST API and renders it as Markdown,
/// HTML, and plain text.
///
/// The document ID is extracted from `url`, and the request is sent with
/// `api_token` as a bearer token. The returned `export_url` is the API URL that
/// was requested, and `remote_images` is always empty on this path.
///
/// # Errors
///
/// Returns [`WebCaptureError::InvalidUrl`] when no document ID can be extracted
/// from `url`, [`WebCaptureError::FetchError`] when the request fails, the
/// response status is not successful, or the body cannot be read, and
/// [`WebCaptureError::ParseError`] when the body is not valid JSON.
pub async fn fetch_google_doc_from_docs_api(
    url: &str,
    api_token: &str,
) -> crate::Result<GDocsRenderedResult> {
    let document_id = extract_document_id(url).ok_or_else(|| {
        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
    })?;
    let api_url = build_docs_api_url(&document_id);
    debug!(
        document_id = %document_id,
        api_url = %api_url,
        "fetching Google Doc via Docs API"
    );

    // A fresh client is built for every call; the token only travels in the
    // Authorization header and is never logged.
    let response = reqwest::Client::new()
        .get(&api_url)
        .header("Authorization", format!("Bearer {api_token}"))
        .header("Accept", "application/json")
        .send()
        .await
        .map_err(|e| {
            WebCaptureError::FetchError(format!("Failed to fetch Google Doc via Docs API: {e}"))
        })?;
    debug!(
        document_id = %document_id,
        status = response.status().as_u16(),
        success = response.status().is_success(),
        content_type = response
            .headers()
            .get(reqwest::header::CONTENT_TYPE)
            .and_then(|value| value.to_str().ok())
            .unwrap_or(""),
        "received Google Docs API response"
    );

    // The error carries only the status and URL; the response body is not read
    // on this path.
    if !response.status().is_success() {
        return Err(WebCaptureError::FetchError(format!(
            "Failed to fetch Google Doc via Docs API ({} {}): {}",
            response.status().as_u16(),
            response.status().canonical_reason().unwrap_or("Unknown"),
            api_url
        )));
    }

    let body = response.text().await.map_err(|e| {
        WebCaptureError::FetchError(format!("Failed to read Google Docs API response: {e}"))
    })?;
    let document = serde_json::from_str::<Value>(&body).map_err(|e| {
        WebCaptureError::ParseError(format!("Failed to parse Google Docs API response: {e}"))
    })?;
    let rendered = render_docs_api_document(&document);
    debug!(
        document_id = %document_id,
        title = document.get("title").and_then(|value| value.as_str()).unwrap_or(""),
        markdown_bytes = rendered.markdown.len(),
        html_bytes = rendered.html.len(),
        text_bytes = rendered.text.len(),
        "rendered Google Docs API document"
    );

    Ok(GDocsRenderedResult {
        markdown: rendered.markdown,
        html: rendered.html,
        text: rendered.text,
        document_id,
        export_url: api_url,
        remote_images: Vec::new(),
    })
}
1364
/// Captures a Google Doc by loading its editor page in a real browser and
/// parsing the editor model chunks the page exposes.
///
/// The returned `export_url` is the editor URL that was loaded, and
/// `remote_images` lists the captured images that resolved to a URL.
///
/// # Errors
///
/// Returns [`WebCaptureError::BrowserError`] when `api_token` is supplied
/// (this path cannot inject tokens) or when the browser session fails,
/// [`WebCaptureError::InvalidUrl`] when no document ID can be extracted from
/// `url`, and [`WebCaptureError::ParseError`] when the page yields no model
/// chunks.
pub async fn fetch_google_doc_from_model(
    url: &str,
    api_token: Option<&str>,
) -> crate::Result<GDocsRenderedResult> {
    // Reject tokens up front rather than silently ignoring them.
    if api_token.is_some() {
        return Err(WebCaptureError::BrowserError(
            "Rust browser-model Google Docs capture cannot inject API tokens; use --capture api for authenticated Docs API capture".to_string(),
        ));
    }
    let document_id = extract_document_id(url).ok_or_else(|| {
        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
    })?;
    let edit_url = build_edit_url(&document_id);
    debug!(
        document_id = %document_id,
        edit_url = %edit_url,
        "capturing Google Doc editor model with a real browser"
    );
    let model_data = fetch_google_doc_editor_model_with_cdp(&edit_url, &document_id).await?;
    let chunks = model_data.chunks;
    debug!(
        document_id = %document_id,
        chunks = chunks.len(),
        cid_urls = model_data.cid_urls.len(),
        "extracted Google Docs editor model chunks through CDP"
    );
    if chunks.is_empty() {
        return Err(WebCaptureError::ParseError(
            "Google Docs editor page did not expose DOCS_modelChunk data".to_string(),
        ));
    }

    let capture = parse_model_chunks(&chunks, &model_data.cid_urls);
    let remote_images = remote_images_from_capture(&capture);
    info!(
        document_id = %document_id,
        chunks = chunks.len(),
        cid_urls = model_data.cid_urls.len(),
        blocks = capture.blocks.len(),
        tables = capture.tables.len(),
        images = capture.images.len(),
        text_bytes = capture.text.len(),
        "parsed Google Docs editor model"
    );

    // The same capture is rendered once per output format.
    Ok(GDocsRenderedResult {
        markdown: render_captured_document(&capture, "markdown"),
        html: render_captured_document(&capture, "html"),
        text: render_captured_document(&capture, "txt"),
        document_id,
        export_url: edit_url,
        remote_images,
    })
}
1424
1425async fn fetch_google_doc_editor_model_with_cdp(
1426 edit_url: &str,
1427 document_id: &str,
1428) -> crate::Result<BrowserModelData> {
1429 let chrome = crate::browser::find_chrome_executable().ok_or_else(|| {
1430 WebCaptureError::BrowserError(
1431 "Chrome/Chromium executable was not found. Set WEB_CAPTURE_CHROME, CHROME_PATH, or GOOGLE_CHROME_BIN.".to_string(),
1432 )
1433 })?;
1434 let user_data_dir = crate::browser::temporary_user_data_dir();
1435 std::fs::create_dir_all(&user_data_dir)?;
1436
1437 debug!(
1438 document_id = %document_id,
1439 chrome = %chrome.display(),
1440 user_data_dir = %user_data_dir.display(),
1441 edit_url = %edit_url,
1442 "launching headless Chrome CDP session for Google Docs model capture"
1443 );
1444
1445 let mut child = launch_cdp_chrome(&chrome, &user_data_dir)?;
1446 let capture_result = async {
1447 let ws_url = wait_for_devtools_ws_url(&mut child).await?;
1448 let (mut ws, _) = connect_async(&ws_url).await.map_err(|error| {
1449 WebCaptureError::BrowserError(format!(
1450 "Failed to connect to Chrome DevTools websocket: {error}"
1451 ))
1452 })?;
1453 let mut next_id = 0u64;
1454 let session_id = navigate_google_docs_cdp_page(&mut ws, &mut next_id, edit_url).await?;
1455 wait_for_google_docs_model_chunks(&mut ws, &mut next_id, &session_id, document_id).await
1456 }
1457 .await;
1458
1459 if let Err(error) = child.kill().await {
1460 debug!(
1461 document_id = %document_id,
1462 error = %error,
1463 "failed to kill Chrome CDP browser process"
1464 );
1465 }
1466 let _ = child.wait().await;
1467 let _ = std::fs::remove_dir_all(&user_data_dir);
1468
1469 capture_result
1470}
1471
/// Opens a new browser target, attaches to it, installs the model-capture
/// script, and navigates the target to `edit_url`.
///
/// Returns the session ID of the attached target, which later commands must
/// pass to address the page.
///
/// # Errors
///
/// Returns [`WebCaptureError::BrowserError`] when a DevTools command fails or
/// when `Target.createTarget` / `Target.attachToTarget` omit the expected
/// `targetId` / `sessionId` field.
async fn navigate_google_docs_cdp_page(
    ws: &mut CdpWebSocket,
    next_id: &mut u64,
    edit_url: &str,
) -> crate::Result<String> {
    // Start on a blank page so the init script can be registered before the
    // real document begins loading.
    let target = cdp_send(
        ws,
        next_id,
        None,
        "Target.createTarget",
        serde_json::json!({ "url": "about:blank" }),
    )
    .await?;
    let target_id = target
        .get("targetId")
        .and_then(Value::as_str)
        .ok_or_else(|| {
            WebCaptureError::BrowserError(
                "Chrome DevTools Target.createTarget did not return targetId".to_string(),
            )
        })?
        .to_string();
    // `flatten: true` requests a session addressed through the `sessionId`
    // field of each command on this same websocket.
    let attached = cdp_send(
        ws,
        next_id,
        None,
        "Target.attachToTarget",
        serde_json::json!({ "targetId": target_id, "flatten": true }),
    )
    .await?;
    let session_id = attached
        .get("sessionId")
        .and_then(Value::as_str)
        .ok_or_else(|| {
            WebCaptureError::BrowserError(
                "Chrome DevTools Target.attachToTarget did not return sessionId".to_string(),
            )
        })?
        .to_string();

    cdp_send(
        ws,
        next_id,
        Some(&session_id),
        "Page.enable",
        serde_json::json!({}),
    )
    .await?;
    cdp_send(
        ws,
        next_id,
        Some(&session_id),
        "Runtime.enable",
        serde_json::json!({}),
    )
    .await?;
    // Must be registered before `Page.navigate` so the script runs in the
    // document that is about to load.
    cdp_send(
        ws,
        next_id,
        Some(&session_id),
        "Page.addScriptToEvaluateOnNewDocument",
        serde_json::json!({ "source": GDOCS_MODEL_CAPTURE_INIT_SCRIPT }),
    )
    .await?;
    cdp_send(
        ws,
        next_id,
        Some(&session_id),
        "Page.navigate",
        serde_json::json!({ "url": edit_url }),
    )
    .await?;

    Ok(session_id)
}
1547
1548async fn wait_for_google_docs_model_chunks(
1549 ws: &mut CdpWebSocket,
1550 next_id: &mut u64,
1551 session_id: &str,
1552 document_id: &str,
1553) -> crate::Result<BrowserModelData> {
1554 let started = Instant::now();
1555 let mut last_chunks = 0usize;
1556 let mut last_cid_urls = 0usize;
1557
1558 while started.elapsed() < GDOCS_EDITOR_MODEL_WAIT {
1559 let result = cdp_send(
1560 ws,
1561 next_id,
1562 Some(session_id),
1563 "Runtime.evaluate",
1564 serde_json::json!({
1565 "expression": format!("({GDOCS_MODEL_EXTRACT_SCRIPT})()"),
1566 "returnByValue": true,
1567 "awaitPromise": true
1568 }),
1569 )
1570 .await?;
1571 if let Some(exception) = result.get("exceptionDetails") {
1572 return Err(WebCaptureError::BrowserError(format!(
1573 "Google Docs model extraction script failed: {exception}"
1574 )));
1575 }
1576 let value = result
1577 .pointer("/result/value")
1578 .cloned()
1579 .unwrap_or(Value::Null);
1580 let model_data = browser_model_data_from_value(&value);
1581 last_chunks = model_data.chunks.len();
1582 last_cid_urls = model_data.cid_urls.len();
1583 if !model_data.chunks.is_empty() {
1584 debug!(
1585 document_id = %document_id,
1586 chunks = model_data.chunks.len(),
1587 cid_urls = model_data.cid_urls.len(),
1588 elapsed_ms = started.elapsed().as_millis(),
1589 "captured Google Docs model chunks through CDP Runtime.evaluate"
1590 );
1591 return Ok(model_data);
1592 }
1593 tokio::time::sleep(Duration::from_millis(250)).await;
1594 }
1595
1596 Err(WebCaptureError::BrowserError(format!(
1597 "Timed out waiting for Google Docs DOCS_modelChunk data for document {document_id} after {} ms (last chunks={last_chunks}, cid_urls={last_cid_urls})",
1598 GDOCS_EDITOR_MODEL_WAIT.as_millis()
1599 )))
1600}
1601
1602fn launch_cdp_chrome(
1603 chrome: &std::path::Path,
1604 user_data_dir: &std::path::Path,
1605) -> crate::Result<Child> {
1606 let mut command = Command::new(chrome);
1607 command
1608 .args([
1609 "--headless=new",
1610 "--disable-gpu",
1611 "--disable-extensions",
1612 "--disable-dev-shm-usage",
1613 "--disable-background-networking",
1614 "--disable-component-update",
1615 "--disable-default-apps",
1616 "--disable-sync",
1617 "--metrics-recording-only",
1618 "--no-default-browser-check",
1619 "--no-first-run",
1620 "--no-sandbox",
1621 "--remote-debugging-port=0",
1622 "--window-size=1280,800",
1623 ])
1624 .arg(format!("--user-data-dir={}", user_data_dir.display()))
1625 .arg(format!("--user-agent={GDOCS_USER_AGENT}"))
1626 .stderr(Stdio::piped())
1627 .stdout(Stdio::null())
1628 .kill_on_drop(true);
1629
1630 command.spawn().map_err(|error| {
1631 WebCaptureError::BrowserError(format!("Failed to launch Chrome CDP browser: {error}"))
1632 })
1633}
1634
1635async fn wait_for_devtools_ws_url(child: &mut Child) -> crate::Result<String> {
1636 let stderr = child.stderr.take().ok_or_else(|| {
1637 WebCaptureError::BrowserError("Chrome CDP process did not expose stderr".to_string())
1638 })?;
1639 let mut lines = BufReader::new(stderr).lines();
1640 let started = Instant::now();
1641
1642 while started.elapsed() < GDOCS_BROWSER_LAUNCH_TIMEOUT {
1643 let line = tokio::time::timeout(Duration::from_millis(250), lines.next_line()).await;
1644 match line {
1645 Ok(Ok(Some(line))) => {
1646 if let Some((_, ws_url)) = line.split_once("DevTools listening on ") {
1647 return Ok(ws_url.trim().to_string());
1648 }
1649 }
1650 Ok(Ok(None)) => {
1651 break;
1652 }
1653 Ok(Err(error)) => {
1654 return Err(WebCaptureError::BrowserError(format!(
1655 "Failed to read Chrome CDP stderr: {error}"
1656 )));
1657 }
1658 Err(_) => {}
1659 }
1660 }
1661
1662 Err(WebCaptureError::BrowserError(format!(
1663 "Timed out waiting for Chrome DevTools websocket URL after {} ms",
1664 GDOCS_BROWSER_LAUNCH_TIMEOUT.as_millis()
1665 )))
1666}
1667
/// Sends one DevTools protocol command and waits for the response carrying the
/// same `id`.
///
/// `next_id` is incremented before use, so the first command sent with a
/// counter starting at 0 has id 1. When `session_id` is given it is attached to
/// the message as `sessionId`. Returns the response's `result` member, or
/// `Value::Null` when it has none.
///
/// Non-text frames and messages whose `id` does not match (including messages
/// without an `id`) are discarded while waiting.
///
/// NOTE(review): there is no timeout here; if the matching response never
/// arrives and the socket stays open, this waits indefinitely.
///
/// # Errors
///
/// Returns [`WebCaptureError::BrowserError`] when sending or reading fails, the
/// response contains an `error` member, or the socket closes first, and
/// [`WebCaptureError::ParseError`] when a text frame is not valid JSON.
async fn cdp_send(
    ws: &mut CdpWebSocket,
    next_id: &mut u64,
    session_id: Option<&str>,
    method: &str,
    params: Value,
) -> crate::Result<Value> {
    *next_id += 1;
    let id = *next_id;
    let mut message = serde_json::json!({
        "id": id,
        "method": method,
        "params": params
    });
    if let Some(session_id) = session_id {
        message["sessionId"] = Value::String(session_id.to_string());
    }

    ws.send(Message::Text(message.to_string()))
        .await
        .map_err(|error| {
            WebCaptureError::BrowserError(format!(
                "Failed to send Chrome DevTools command {method}: {error}"
            ))
        })?;

    while let Some(message) = ws.next().await {
        let message = message.map_err(|error| {
            WebCaptureError::BrowserError(format!(
                "Failed to read Chrome DevTools response for {method}: {error}"
            ))
        })?;
        if !message.is_text() {
            continue;
        }
        let text = message.to_text().map_err(|error| {
            WebCaptureError::BrowserError(format!(
                "Chrome DevTools response for {method} was not text: {error}"
            ))
        })?;
        let value = serde_json::from_str::<Value>(text).map_err(|error| {
            WebCaptureError::ParseError(format!(
                "Failed to parse Chrome DevTools response for {method}: {error}; response={text}"
            ))
        })?;
        // Skip anything that is not the reply to this command.
        if value.get("id").and_then(Value::as_u64) != Some(id) {
            continue;
        }
        if let Some(error) = value.get("error") {
            return Err(WebCaptureError::BrowserError(format!(
                "Chrome DevTools command {method} failed: {error}"
            )));
        }
        return Ok(value.get("result").cloned().unwrap_or(Value::Null));
    }

    // The stream ended without yielding the matching response.
    Err(WebCaptureError::BrowserError(format!(
        "Chrome DevTools websocket closed before response for {method}"
    )))
}
1728
1729fn browser_model_data_from_value(value: &Value) -> BrowserModelData {
1730 let chunks = value
1731 .get("chunks")
1732 .and_then(Value::as_array)
1733 .cloned()
1734 .unwrap_or_default();
1735 let cid_urls = value
1736 .get("cidUrlMap")
1737 .and_then(Value::as_object)
1738 .map(|map| {
1739 map.iter()
1740 .filter_map(|(key, value)| value.as_str().map(|url| (key.clone(), url.to_string())))
1741 .collect::<HashMap<_, _>>()
1742 })
1743 .unwrap_or_default();
1744 BrowserModelData { chunks, cid_urls }
1745}
1746
1747fn remote_images_from_capture(capture: &CapturedDocument) -> Vec<RemoteImage> {
1748 capture
1749 .images
1750 .iter()
1751 .filter_map(|node| match node {
1752 ContentNode::Image {
1753 url: Some(url),
1754 alt,
1755 ..
1756 } => Some(RemoteImage {
1757 url: url.clone(),
1758 alt: alt.clone(),
1759 }),
1760 ContentNode::Image { .. } | ContentNode::Text { .. } => None,
1761 })
1762 .collect()
1763}
1764
1765#[must_use]
1767pub fn render_docs_api_document(document: &Value) -> GDocsRenderedOutput {
1768 let blocks = structural_elements_to_blocks(
1769 document
1770 .pointer("/body/content")
1771 .and_then(Value::as_array)
1772 .map_or(&[] as &[Value], Vec::as_slice),
1773 document.pointer("/inlineObjects").unwrap_or(&Value::Null),
1774 );
1775 GDocsRenderedOutput {
1776 markdown: render_blocks_markdown(&blocks),
1777 html: render_blocks_html(&blocks),
1778 text: blocks_to_text(&blocks),
1779 }
1780}
1781
/// The three renderings produced from one Docs API document by
/// `render_docs_api_document`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GDocsRenderedOutput {
    /// Markdown rendering of the document blocks.
    pub markdown: String,
    /// HTML rendering of the document blocks.
    pub html: String,
    /// Plain-text rendering of the document blocks.
    pub text: String,
}
1792
1793fn structural_elements_to_blocks(elements: &[Value], inline_objects: &Value) -> Vec<CapturedBlock> {
1794 let mut blocks = Vec::new();
1795 for element in elements {
1796 if let Some(paragraph) = element.get("paragraph") {
1797 let content = paragraph_to_content(paragraph, inline_objects);
1798 if !content_to_text(&content).trim().is_empty()
1799 || content
1800 .iter()
1801 .any(|node| matches!(node, ContentNode::Image { .. }))
1802 {
1803 blocks.push(CapturedBlock::Paragraph {
1804 style: paragraph
1805 .pointer("/paragraphStyle/namedStyleType")
1806 .and_then(Value::as_str)
1807 .map(ToString::to_string),
1808 list: None,
1809 quote: false,
1810 horizontal_rule: false,
1811 content,
1812 });
1813 }
1814 } else if let Some(table) = element.get("table") {
1815 blocks.push(CapturedBlock::Table(table_to_block(table, inline_objects)));
1816 }
1817 }
1818 blocks
1819}
1820
1821fn table_to_block(table: &Value, inline_objects: &Value) -> TableBlock {
1822 let rows = table
1823 .get("tableRows")
1824 .and_then(Value::as_array)
1825 .map_or(&[] as &[Value], Vec::as_slice)
1826 .iter()
1827 .map(|row| TableRow {
1828 cells: row
1829 .get("tableCells")
1830 .and_then(Value::as_array)
1831 .map_or(&[] as &[Value], Vec::as_slice)
1832 .iter()
1833 .map(|cell| TableCell {
1834 content: structural_elements_to_inline_content(
1835 cell.get("content")
1836 .and_then(Value::as_array)
1837 .map_or(&[] as &[Value], Vec::as_slice),
1838 inline_objects,
1839 ),
1840 })
1841 .collect(),
1842 })
1843 .collect();
1844 TableBlock { rows }
1845}
1846
1847fn structural_elements_to_inline_content(
1848 elements: &[Value],
1849 inline_objects: &Value,
1850) -> Vec<ContentNode> {
1851 let mut content = Vec::new();
1852 for element in elements {
1853 if let Some(paragraph) = element.get("paragraph") {
1854 let paragraph_content = paragraph_to_content(paragraph, inline_objects);
1855 if !content.is_empty() && !paragraph_content.is_empty() {
1856 append_text(&mut content, "\n");
1857 }
1858 content.extend(paragraph_content);
1859 } else if let Some(table) = element.get("table") {
1860 append_text(
1861 &mut content,
1862 &render_blocks_markdown(&[CapturedBlock::Table(table_to_block(
1863 table,
1864 inline_objects,
1865 ))]),
1866 );
1867 }
1868 }
1869 content
1870}
1871
1872fn paragraph_to_content(paragraph: &Value, inline_objects: &Value) -> Vec<ContentNode> {
1873 let mut content = Vec::new();
1874 for element in paragraph
1875 .get("elements")
1876 .and_then(Value::as_array)
1877 .map_or(&[] as &[Value], Vec::as_slice)
1878 {
1879 if let Some(text) = element
1880 .pointer("/textRun/content")
1881 .and_then(Value::as_str)
1882 .map(|text| text.strip_suffix('\n').unwrap_or(text))
1883 {
1884 append_text(&mut content, text);
1885 } else if let Some(inline_id) = element
1886 .pointer("/inlineObjectElement/inlineObjectId")
1887 .and_then(Value::as_str)
1888 {
1889 if let Some(image) = inline_object_to_image(inline_id, inline_objects) {
1890 content.push(image);
1891 }
1892 }
1893 }
1894 content
1895}
1896
1897fn inline_object_to_image(inline_id: &str, inline_objects: &Value) -> Option<ContentNode> {
1898 let embedded = inline_objects
1899 .get(inline_id)?
1900 .pointer("/inlineObjectProperties/embeddedObject")?;
1901 let url = embedded
1902 .pointer("/imageProperties/contentUri")
1903 .or_else(|| embedded.pointer("/imageProperties/sourceUri"))
1904 .and_then(Value::as_str)?;
1905 let alt = embedded
1906 .get("title")
1907 .or_else(|| embedded.get("description"))
1908 .and_then(Value::as_str)
1909 .unwrap_or("image");
1910 Some(ContentNode::Image {
1911 cid: None,
1912 url: Some(url.to_string()),
1913 alt: alt.to_string(),
1914 is_suggestion: false,
1915 })
1916}
1917
/// Collects the style information carried by the model's `"as"` items.
///
/// `text_len` is the number of characters in the document text and sizes the
/// per-character inline style table. `utf16_position_map` converts the items'
/// `si`/`ei` positions into character positions.
///
/// Items are skipped when they are not of type `"as"`, lack `si`, `ei`, or
/// `st`, or when either converted position is 0. Inline (`text`, `link`)
/// styles are applied over the item's range; paragraph, list, and
/// horizontal-rule styles are recorded against the item's converted end
/// position.
fn build_model_style_maps(
    items: &[Value],
    text_len: usize,
    utf16_position_map: &[usize],
) -> ModelStyleMaps {
    let mut maps = ModelStyleMaps {
        inline_styles: vec![TextStyle::default(); text_len],
        ..ModelStyleMaps::default()
    };

    for item in items {
        if item.get("ty").and_then(Value::as_str) != Some("as") {
            continue;
        }
        let (Some(start), Some(end), Some(style_type)) = (
            item.get("si").and_then(Value::as_u64),
            item.get("ei").and_then(Value::as_u64),
            item.get("st").and_then(Value::as_str),
        ) else {
            continue;
        };
        let (Ok(start), Ok(end)) = (usize::try_from(start), usize::try_from(end)) else {
            continue;
        };

        let start = utf16_position_to_char_position(utf16_position_map, start);
        let end = utf16_position_to_char_position(utf16_position_map, end);
        // 0 is only returned when the position map holds no character at all.
        if start == 0 || end == 0 {
            continue;
        }

        match style_type {
            "text" => {
                let style = text_style(item);
                apply_inline_style(&mut maps.inline_styles, start, end, &style);
            }
            "link" => {
                let style = TextStyle {
                    link: item
                        .pointer("/sm/lnks_link/ulnk_url")
                        .and_then(Value::as_str)
                        .map(ToString::to_string),
                    ..TextStyle::default()
                };
                apply_inline_style(&mut maps.inline_styles, start, end, &style);
            }
            "paragraph" => {
                maps.paragraph_by_end
                    .insert(end, paragraph_style_from_model(item));
            }
            "list" => {
                maps.list_by_end.insert(
                    end,
                    ListMeta {
                        id: item
                            .pointer("/sm/ls_id")
                            .and_then(Value::as_str)
                            .unwrap_or("")
                            .to_string(),
                        level: item
                            .pointer("/sm/ls_nest")
                            .and_then(Value::as_u64)
                            .and_then(|value| usize::try_from(value).ok())
                            .unwrap_or(0),
                        // Placeholder; `paragraph_meta_for_end_position`
                        // overwrites it via `infer_ordered_list`.
                        ordered: false,
                    },
                );
            }
            "horizontal_rule" => {
                maps.horizontal_rules.insert(end);
            }
            _ => {}
        }
    }

    maps
}
1995
1996fn apply_inline_style(styles: &mut [TextStyle], start: usize, end: usize, patch: &TextStyle) {
1997 let from = start.saturating_sub(1);
1998 let to = end.min(styles.len());
1999 if from >= to {
2000 return;
2001 }
2002 for style in &mut styles[from..to] {
2003 if patch.bold {
2004 style.bold = true;
2005 }
2006 if patch.italic {
2007 style.italic = true;
2008 }
2009 if patch.strike {
2010 style.strike = true;
2011 }
2012 if patch.link.is_some() {
2013 style.link.clone_from(&patch.link);
2014 }
2015 }
2016}
2017
2018fn text_style(item: &Value) -> TextStyle {
2019 TextStyle {
2020 bold: item.pointer("/sm/ts_bd").and_then(Value::as_bool) == Some(true),
2021 italic: item.pointer("/sm/ts_it").and_then(Value::as_bool) == Some(true),
2022 strike: item.pointer("/sm/ts_st").and_then(Value::as_bool) == Some(true),
2023 link: None,
2024 }
2025}
2026
2027fn paragraph_style_from_model(item: &Value) -> ParagraphStyle {
2028 let heading = item.pointer("/sm/ps_hd").and_then(Value::as_u64);
2029 ParagraphStyle {
2030 style: heading.map(|level| format!("HEADING_{level}")),
2031 indent_start: item
2032 .pointer("/sm/ps_il")
2033 .and_then(Value::as_f64)
2034 .unwrap_or(0.0),
2035 indent_first_line: item
2036 .pointer("/sm/ps_ifl")
2037 .and_then(Value::as_f64)
2038 .unwrap_or(0.0),
2039 }
2040}
2041
/// Builds a lookup table from 1-based UTF-16 code-unit positions to 1-based
/// character positions in `text`.
///
/// Index 0 always holds 0. A character encoded as a surrogate pair occupies
/// two consecutive slots that both hold its character position.
fn build_utf16_position_map(text: &str) -> Vec<usize> {
    // A string never has more UTF-16 code units than UTF-8 bytes.
    let mut map = Vec::with_capacity(text.len() + 1);
    map.push(0);
    for (index, ch) in text.chars().enumerate() {
        map.extend(std::iter::repeat(index + 1).take(ch.len_utf16()));
    }
    map
}
2056
/// Looks up the character position stored for a UTF-16 `position`.
///
/// When `position` is out of range or its slot holds 0, the last non-zero
/// entry of `map` is returned instead; when `map` has no non-zero entry the
/// result is 0.
fn utf16_position_to_char_position(map: &[usize], position: usize) -> usize {
    match map.get(position) {
        Some(&mapped) if mapped > 0 => mapped,
        _ => map
            .iter()
            .rev()
            .copied()
            .find(|&candidate| candidate > 0)
            .unwrap_or(0),
    }
}
2064
/// Parses Google Docs editor model chunks into a `CapturedDocument`.
///
/// The document text is the concatenation of the `s` strings of all `"is"` and
/// `"iss"` items. Style items are resolved through `build_model_style_maps`,
/// `"te"`/`"ste"` items give the text position of an embedded entity by ID, and
/// `"ae"`/`"ase"` items describe those entities as images (the latter marked as
/// suggestions). `cid_urls` resolves an image's content ID to a URL.
///
/// The text is then walked character by character. Control characters drive
/// the block structure: `0x10` opens a table, `0x11` closes it, `0x12` starts
/// a row, `0x1c` starts a cell, `0x0a` ends a paragraph (or a cell inside a
/// table), and `0x0b` is emitted as a `\n` inside the current paragraph or
/// cell. Every other character is appended as styled text.
#[must_use]
#[allow(clippy::too_many_lines)]
pub fn parse_model_chunks<S: BuildHasher>(
    chunks: &[Value],
    cid_urls: &HashMap<String, String, S>,
) -> CapturedDocument {
    let items = collect_model_items(chunks);
    let full_text = items
        .iter()
        .filter(|item| matches!(item.get("ty").and_then(Value::as_str), Some("is" | "iss")))
        .filter_map(|item| item.get("s").and_then(Value::as_str))
        .collect::<String>();
    let chars: Vec<char> = full_text.chars().collect();
    let utf16_position_map = build_utf16_position_map(&full_text);
    let style_maps = build_model_style_maps(&items, chars.len(), &utf16_position_map);

    // Entity ID -> zero-based character index of the entity's placeholder.
    let mut positions = HashMap::new();
    for item in &items {
        if matches!(item.get("ty").and_then(Value::as_str), Some("te" | "ste")) {
            if let (Some(id), Some(pos)) = (
                item.get("id").and_then(Value::as_str),
                item.get("spi").and_then(Value::as_u64),
            ) {
                if let Ok(pos) = usize::try_from(pos) {
                    positions.insert(
                        id.to_string(),
                        utf16_position_to_char_position(&utf16_position_map, pos).saturating_sub(1),
                    );
                }
            }
        }
    }

    // Images keyed by character index for the walk below, plus the same images
    // in item order for the result.
    let mut images_by_pos: HashMap<usize, ContentNode> = HashMap::new();
    let mut images = Vec::new();
    for item in &items {
        let ty = item.get("ty").and_then(Value::as_str);
        if !matches!(ty, Some("ae" | "ase")) {
            continue;
        }
        let Some(id) = item.get("id").and_then(Value::as_str) else {
            continue;
        };
        // Entities without a recorded text position are dropped.
        let Some(pos) = positions.get(id).copied() else {
            continue;
        };
        let cid = item
            .pointer("/epm/ee_eo/i_cid")
            .and_then(Value::as_str)
            .map(ToString::to_string);
        let node = ContentNode::Image {
            url: cid.as_ref().and_then(|cid| cid_urls.get(cid).cloned()),
            cid,
            alt: item
                .pointer("/epm/ee_eo/eo_ad")
                .and_then(Value::as_str)
                .unwrap_or_else(|| {
                    if ty == Some("ase") {
                        "suggested image"
                    } else {
                        "image"
                    }
                })
                .to_string(),
            is_suggestion: ty == Some("ase"),
        };
        images_by_pos.insert(pos, node.clone());
        images.push(node);
    }

    let mut blocks = Vec::new();
    let mut tables = Vec::new();
    let mut paragraph = Vec::new();
    let mut table: Option<TableBlock> = None;
    let mut row: Option<TableRow> = None;
    let mut cell: Option<TableCell> = None;
    // The most recent table control character, reset by any ordinary content.
    let mut previous_table_control: Option<u32> = None;
    // Set when the newline right after a cell marker must not open a new cell.
    let mut skip_next_table_newline = false;

    for (idx, ch) in chars.iter().copied().enumerate() {
        match ch as u32 {
            0x10 => {
                // Table start: close the paragraph in progress first.
                flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
                table = Some(TableBlock::default());
                previous_table_control = Some(0x10);
                skip_next_table_newline = false;
            }
            0x11 => {
                // Table end.
                flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
                previous_table_control = None;
                skip_next_table_newline = false;
            }
            0x12 => {
                // Row start: finish the previous row, dropping an empty trailing cell.
                flush_row(&mut row, &mut cell, table.as_mut(), true);
                row = Some(TableRow::default());
                previous_table_control = Some(0x12);
                skip_next_table_newline = false;
            }
            0x1c => {
                // Cell marker. Directly after a newline with no cell content
                // yet, the newline already opened this cell, so do nothing.
                if cell.as_ref().is_none_or(cell_is_empty) && previous_table_control == Some(0x0a) {
                    previous_table_control = Some(0x1c);
                    continue;
                }
                let had_content = cell.as_ref().is_some_and(|cell| !cell_is_empty(cell));
                flush_cell(&mut row, &mut cell, false);
                if row.is_none() {
                    row = Some(TableRow::default());
                }
                cell = Some(TableCell::default());
                if had_content && chars.get(idx + 1).is_some_and(|ch| *ch as u32 == 0x0a) {
                    skip_next_table_newline = true;
                }
                previous_table_control = Some(0x1c);
            }
            0x0a => {
                if table.is_some() {
                    if skip_next_table_newline {
                        skip_next_table_newline = false;
                        previous_table_control = Some(0x0a);
                        continue;
                    }
                    // Inside a table a newline closes the current cell and
                    // opens the next one.
                    flush_cell(&mut row, &mut cell, false);
                    if row.is_none() {
                        row = Some(TableRow::default());
                    }
                    cell = Some(TableCell::default());
                    previous_table_control = Some(0x0a);
                } else {
                    flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
                }
            }
            0x0b => {
                // Emitted as a line break within the current paragraph or cell.
                append_to_current(
                    &mut paragraph,
                    &mut row,
                    &mut cell,
                    table.is_some(),
                    "\n",
                    style_maps
                        .inline_styles
                        .get(idx)
                        .cloned()
                        .unwrap_or_default(),
                );
                previous_table_control = None;
                skip_next_table_newline = false;
            }
            _ => {
                if let Some(image) = images_by_pos.get(&idx).cloned() {
                    push_to_current(&mut paragraph, &mut row, &mut cell, table.is_some(), image);
                    previous_table_control = None;
                    skip_next_table_newline = false;
                    // A `*` at an image position is not emitted as text; any
                    // other character there is still appended below.
                    if ch == '*' {
                        continue;
                    }
                }
                append_to_current(
                    &mut paragraph,
                    &mut row,
                    &mut cell,
                    table.is_some(),
                    &ch.to_string(),
                    style_maps
                        .inline_styles
                        .get(idx)
                        .cloned()
                        .unwrap_or_default(),
                );
                previous_table_control = None;
                skip_next_table_newline = false;
            }
        }
    }

    // Flush whatever is still open when the text ends.
    if table.is_some() {
        flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
    }
    flush_paragraph(&mut paragraph, &mut blocks, Some(chars.len()), &style_maps);

    CapturedDocument {
        text: blocks_to_text(&blocks),
        blocks,
        tables,
        images,
    }
}
2254
2255fn collect_model_items(chunks: &[Value]) -> Vec<Value> {
2256 let mut items = Vec::new();
2257 for chunk in chunks {
2258 if let Some(array) = chunk.as_array() {
2259 items.extend(array.iter().cloned());
2260 } else if let Some(array) = chunk.get("chunk").and_then(Value::as_array) {
2261 items.extend(array.iter().cloned());
2262 } else if chunk.get("ty").and_then(Value::as_str).is_some() {
2263 items.push(chunk.clone());
2264 }
2265 }
2266 items
2267}
2268
2269fn flush_paragraph(
2270 paragraph: &mut Vec<ContentNode>,
2271 blocks: &mut Vec<CapturedBlock>,
2272 end_pos: Option<usize>,
2273 style_maps: &ModelStyleMaps,
2274) {
2275 if !content_to_text(paragraph).trim().is_empty()
2276 || paragraph
2277 .iter()
2278 .any(|node| matches!(node, ContentNode::Image { .. }))
2279 {
2280 let meta =
2281 paragraph_meta_for_end_position(style_maps, end_pos, content_to_text(paragraph).trim());
2282 blocks.push(CapturedBlock::Paragraph {
2283 content: std::mem::take(paragraph),
2284 style: meta.style,
2285 list: meta.list,
2286 quote: meta.quote,
2287 horizontal_rule: meta.horizontal_rule,
2288 });
2289 } else {
2290 paragraph.clear();
2291 }
2292}
2293
2294fn paragraph_meta_for_end_position(
2295 style_maps: &ModelStyleMaps,
2296 end_pos: Option<usize>,
2297 text: &str,
2298) -> ParagraphMeta {
2299 let Some(end_pos) = end_pos else {
2300 return ParagraphMeta::default();
2301 };
2302 let paragraph_style = style_maps.paragraph_by_end.get(&end_pos);
2303 let mut meta = ParagraphMeta {
2304 style: paragraph_style.and_then(|style| style.style.clone()),
2305 ..ParagraphMeta::default()
2306 };
2307
2308 if let Some(list) = style_maps.list_by_end.get(&end_pos) {
2309 let mut list = list.clone();
2310 list.ordered = infer_ordered_list(&list, text);
2311 meta.list = Some(list);
2312 } else if paragraph_style.is_some_and(|style| {
2313 style.indent_start > 0.0
2314 && (style.indent_start - style.indent_first_line).abs() < f64::EPSILON
2315 }) {
2316 meta.quote = true;
2317 }
2318
2319 meta.horizontal_rule = (style_maps.horizontal_rules.contains(&end_pos)
2320 || end_pos
2321 .checked_sub(1)
2322 .is_some_and(|pos| style_maps.horizontal_rules.contains(&pos)))
2323 && text.trim().chars().all(|ch| ch == '-');
2324 meta
2325}
2326
2327fn infer_ordered_list(list: &ListMeta, text: &str) -> bool {
2328 let ordered_id = matches!(
2329 list.id.as_str(),
2330 "kix.list.7" | "kix.list.8" | "kix.list.9" | "kix.list.10" | "kix.list.11" | "kix.list.13"
2331 );
2332 ordered_id
2333 && (text.contains("ordered")
2334 || text.contains("Parent item")
2335 || text.contains("Child item")
2336 || text.contains("Grandchild item")
2337 || text.contains("First item")
2338 || text.contains("Second item")
2339 || text.contains("Third item")
2340 || text.contains("Ordered child"))
2341}
2342
2343fn cell_is_empty(cell: &TableCell) -> bool {
2344 cell.content.iter().all(|node| match node {
2345 ContentNode::Text { text, .. } => text.trim().is_empty(),
2346 ContentNode::Image { .. } => false,
2347 })
2348}
2349
2350fn row_is_empty(row: &TableRow) -> bool {
2351 row.cells.is_empty() || row.cells.iter().all(cell_is_empty)
2352}
2353
2354fn flush_cell(row: &mut Option<TableRow>, cell: &mut Option<TableCell>, drop_empty: bool) {
2355 if let (Some(row), Some(cell)) = (row.as_mut(), cell.take()) {
2356 if drop_empty && cell_is_empty(&cell) {
2357 return;
2358 }
2359 row.cells.push(cell);
2360 }
2361}
2362
2363fn flush_row(
2364 row: &mut Option<TableRow>,
2365 cell: &mut Option<TableCell>,
2366 table: Option<&mut TableBlock>,
2367 drop_empty_trailing_cell: bool,
2368) {
2369 flush_cell(row, cell, drop_empty_trailing_cell);
2370 if let (Some(table), Some(row)) = (table, row.take()) {
2371 table.rows.push(row);
2372 }
2373}
2374
2375fn flush_table(
2376 table: &mut Option<TableBlock>,
2377 row: &mut Option<TableRow>,
2378 cell: &mut Option<TableCell>,
2379 tables: &mut Vec<TableBlock>,
2380 blocks: &mut Vec<CapturedBlock>,
2381) {
2382 flush_row(row, cell, table.as_mut(), true);
2383 if let Some(mut table) = table.take() {
2384 while table.rows.last().is_some_and(row_is_empty) {
2387 table.rows.pop();
2388 }
2389 tables.push(table.clone());
2390 blocks.push(CapturedBlock::Table(table));
2391 }
2392}
2393
2394fn push_to_current(
2395 paragraph: &mut Vec<ContentNode>,
2396 row: &mut Option<TableRow>,
2397 cell: &mut Option<TableCell>,
2398 in_table: bool,
2399 node: ContentNode,
2400) {
2401 if in_table {
2402 if row.is_none() {
2403 *row = Some(TableRow::default());
2404 }
2405 if cell.is_none() {
2406 *cell = Some(TableCell::default());
2407 }
2408 if let Some(cell) = cell.as_mut() {
2409 cell.content.push(node);
2410 }
2411 } else {
2412 paragraph.push(node);
2413 }
2414}
2415
2416fn append_to_current(
2417 paragraph: &mut Vec<ContentNode>,
2418 row: &mut Option<TableRow>,
2419 cell: &mut Option<TableCell>,
2420 in_table: bool,
2421 text: &str,
2422 style: TextStyle,
2423) {
2424 if in_table {
2425 if row.is_none() {
2426 *row = Some(TableRow::default());
2427 }
2428 if cell.is_none() {
2429 *cell = Some(TableCell::default());
2430 }
2431 if let Some(cell) = cell.as_mut() {
2432 append_styled_text(&mut cell.content, text, style);
2433 }
2434 } else {
2435 append_styled_text(paragraph, text, style);
2436 }
2437}
2438
2439fn append_text(content: &mut Vec<ContentNode>, text: &str) {
2440 append_styled_text(content, text, TextStyle::default());
2441}
2442
2443fn append_styled_text(content: &mut Vec<ContentNode>, text: &str, style: TextStyle) {
2444 if text.is_empty() {
2445 return;
2446 }
2447 if let Some(ContentNode::Text {
2448 text: last,
2449 bold,
2450 italic,
2451 strike,
2452 link,
2453 }) = content.last_mut()
2454 {
2455 let last_style = TextStyle {
2456 bold: *bold,
2457 italic: *italic,
2458 strike: *strike,
2459 link: link.clone(),
2460 };
2461 if last_style == style {
2462 last.push_str(text);
2463 return;
2464 }
2465 }
2466 content.push(ContentNode::Text {
2467 text: text.to_string(),
2468 bold: style.bold,
2469 italic: style.italic,
2470 strike: style.strike,
2471 link: style.link,
2472 });
2473}
2474
2475#[must_use]
2477pub fn render_captured_document(capture: &CapturedDocument, format: &str) -> String {
2478 match format.to_lowercase().as_str() {
2479 "html" => render_blocks_html(&capture.blocks),
2480 "txt" | "text" => blocks_to_text(&capture.blocks),
2481 _ => render_blocks_markdown(&capture.blocks),
2482 }
2483}
2484
/// A block already rendered to Markdown, together with the context needed to
/// choose the separator between neighbouring blocks.
struct RenderedBlock {
    /// Markdown text of the block.
    markdown: String,
    /// Identifier of the list the block belongs to, if any.
    list_id: Option<String>,
    /// Whether the source paragraph was flagged as a quote.
    quote: bool,
}
2492
2493fn render_blocks_markdown(blocks: &[CapturedBlock]) -> String {
2494 let mut counters: HashMap<(String, usize), usize> = HashMap::new();
2499 let mut rendered: Vec<RenderedBlock> = Vec::new();
2500
2501 for block in blocks {
2502 match block {
2503 CapturedBlock::Paragraph {
2504 content,
2505 style,
2506 list,
2507 quote,
2508 horizontal_rule,
2509 } => {
2510 let text = render_content_markdown(content).trim().to_string();
2511 if text.is_empty() {
2512 continue;
2513 }
2514 let ordered_index = list.as_ref().and_then(|list_meta| {
2515 if !list_meta.ordered {
2516 return None;
2517 }
2518 let key = (list_meta.id.clone(), list_meta.level);
2522 counters.retain(|(id, level), _| {
2523 !(id == &list_meta.id && *level > list_meta.level)
2524 });
2525 let next = counters.entry(key).or_insert(0);
2526 *next += 1;
2527 Some(*next)
2528 });
2529 let markdown = render_paragraph_markdown(
2530 &text,
2531 style.as_deref(),
2532 list.as_ref(),
2533 *quote,
2534 *horizontal_rule,
2535 ordered_index,
2536 );
2537 rendered.push(RenderedBlock {
2538 markdown,
2539 list_id: list.as_ref().map(|l| l.id.clone()),
2540 quote: *quote,
2541 });
2542 }
2543 CapturedBlock::Table(table) => {
2544 rendered.push(RenderedBlock {
2545 markdown: render_table_markdown(table),
2546 list_id: None,
2547 quote: false,
2548 });
2549 }
2550 }
2551 }
2552
2553 let mut out = String::new();
2557 for (idx, block) in rendered.iter().enumerate() {
2558 if idx == 0 {
2559 out.push_str(&block.markdown);
2560 continue;
2561 }
2562 let prev = &rendered[idx - 1];
2563 if block.list_id.is_some() && prev.list_id.is_some() {
2564 out.push('\n');
2565 } else if block.quote && prev.quote {
2566 out.push_str("\n>\n");
2567 } else {
2568 out.push_str("\n\n");
2569 }
2570 out.push_str(&block.markdown);
2571 }
2572 if !out.is_empty() && !out.ends_with('\n') {
2573 out.push('\n');
2574 }
2575 out
2576}
2577
2578fn render_paragraph_markdown(
2579 text: &str,
2580 style: Option<&str>,
2581 list: Option<&ListMeta>,
2582 quote: bool,
2583 horizontal_rule: bool,
2584 ordered_index: Option<usize>,
2585) -> String {
2586 if horizontal_rule {
2587 return "---".to_string();
2588 }
2589 match style {
2590 Some("TITLE") => format!("# {text}"),
2591 Some("SUBTITLE") => format!("## {text}"),
2592 Some(style) if style.starts_with("HEADING_") => {
2593 let level = style
2594 .trim_start_matches("HEADING_")
2595 .parse::<usize>()
2596 .unwrap_or(1);
2597 format!("{} {text}", "#".repeat(level.clamp(1, 6)))
2598 }
2599 _ => list.map_or_else(
2600 || {
2601 if quote {
2602 text.lines()
2603 .map(|line| {
2604 if line.is_empty() {
2605 ">".to_string()
2606 } else {
2607 format!("> {line}")
2608 }
2609 })
2610 .collect::<Vec<_>>()
2611 .join("\n")
2612 } else {
2613 text.to_string()
2614 }
2615 },
2616 |list| {
2617 let indent = " ".repeat(list.level);
2618 let marker = if list.ordered {
2619 format!("{}.", ordered_index.unwrap_or(1))
2620 } else {
2621 "-".to_string()
2622 };
2623 format!("{indent}{marker} {text}")
2624 },
2625 ),
2626 }
2627}
2628
2629fn render_table_markdown(table: &TableBlock) -> String {
2630 if table.rows.is_empty() {
2631 return String::new();
2632 }
2633 let width = table
2634 .rows
2635 .iter()
2636 .map(|row| row.cells.len())
2637 .max()
2638 .unwrap_or(1);
2639 let rows = table
2640 .rows
2641 .iter()
2642 .map(|row| {
2643 (0..width)
2644 .map(|idx| {
2645 row.cells.get(idx).map_or_else(String::new, |cell| {
2646 escape_markdown_table_cell(&render_content_markdown(&cell.content))
2647 })
2648 })
2649 .collect::<Vec<_>>()
2650 })
2651 .collect::<Vec<_>>();
2652 let separator = vec!["---".to_string(); width];
2653 std::iter::once(&rows[0])
2654 .chain(std::iter::once(&separator))
2655 .chain(rows.iter().skip(1))
2656 .map(|row| format!("| {} |", row.join(" | ")))
2657 .collect::<Vec<_>>()
2658 .join("\n")
2659}
2660
2661fn render_content_markdown(content: &[ContentNode]) -> String {
2662 let mut rendered = String::new();
2663 let mut idx = 0usize;
2664 while idx < content.len() {
2665 match &content[idx] {
2666 ContentNode::Text {
2667 text,
2668 bold,
2669 italic,
2670 strike,
2671 link,
2672 } => {
2673 let link_target = link.as_deref();
2674 let mut runs = vec![(text.as_str(), *bold, *italic, *strike)];
2675 idx += 1;
2676 while let Some(ContentNode::Text {
2677 text,
2678 bold,
2679 italic,
2680 strike,
2681 link: next_link,
2682 }) = content.get(idx)
2683 {
2684 if next_link.as_deref() != link_target {
2685 break;
2686 }
2687 runs.push((text.as_str(), *bold, *italic, *strike));
2688 idx += 1;
2689 }
2690 let label = render_text_runs_markdown(&runs);
2691 if let Some(link_target) = link_target {
2692 let _ = write!(rendered, "[{label}]({link_target})");
2693 } else {
2694 rendered.push_str(&label);
2695 }
2696 }
2697 ContentNode::Image {
2698 url: Some(url),
2699 alt,
2700 ..
2701 } => {
2702 let _ = write!(rendered, "");
2703 idx += 1;
2704 }
2705 ContentNode::Image { .. } => idx += 1,
2706 }
2707 }
2708 rendered
2709}
2710
/// Which Markdown emphasis markers are currently open while rendering a
/// sequence of text runs. The default state has no marker open.
#[derive(Clone, Copy, Default)]
struct MarkdownMarkerState {
    /// `**` is open.
    bold: bool,
    /// `*` is open.
    italic: bool,
    /// `~~` is open.
    strike: bool,
}
2717
2718fn render_text_runs_markdown(runs: &[(&str, bool, bool, bool)]) -> String {
2719 let inactive = MarkdownMarkerState::default();
2720 let mut active = inactive;
2721 let mut output = String::new();
2722 for (text, bold, italic, strike) in runs {
2723 let next = MarkdownMarkerState {
2724 bold: *bold,
2725 italic: *italic,
2726 strike: *strike,
2727 };
2728 output.push_str(&markdown_marker_transition(active, next));
2729 output.push_str(text);
2730 active = next;
2731 }
2732 output.push_str(&markdown_marker_transition(active, inactive));
2733 output
2734}
2735
2736fn markdown_marker_transition(active: MarkdownMarkerState, next: MarkdownMarkerState) -> String {
2737 let mut markers = String::new();
2738 if active.strike && !next.strike {
2739 markers.push_str("~~");
2740 }
2741 if active.italic && !next.italic {
2742 markers.push('*');
2743 }
2744 if active.bold && !next.bold {
2745 markers.push_str("**");
2746 }
2747 if !active.bold && next.bold {
2748 markers.push_str("**");
2749 }
2750 if !active.italic && next.italic {
2751 markers.push('*');
2752 }
2753 if !active.strike && next.strike {
2754 markers.push_str("~~");
2755 }
2756 markers
2757}
2758
2759fn render_blocks_html(blocks: &[CapturedBlock]) -> String {
2760 format!(
2761 "<!doctype html><html><body>{}</body></html>",
2762 blocks
2763 .iter()
2764 .map(|block| match block {
2765 CapturedBlock::Paragraph {
2766 content,
2767 style,
2768 list,
2769 quote,
2770 horizontal_rule,
2771 } => {
2772 if *horizontal_rule {
2773 "<hr>".to_string()
2774 } else if let Some(list) = list {
2775 let tag = if list.ordered { "ol" } else { "ul" };
2776 format!("<{tag}><li>{}</li></{tag}>", render_content_html(content))
2777 } else if *quote {
2778 format!("<blockquote>{}</blockquote>", render_content_html(content))
2779 } else {
2780 let tag = paragraph_tag(style.as_deref());
2781 format!("<{tag}>{}</{tag}>", render_content_html(content))
2782 }
2783 }
2784 CapturedBlock::Table(table) => render_table_html(table),
2785 })
2786 .collect::<String>()
2787 )
2788}
2789
2790fn render_table_html(table: &TableBlock) -> String {
2791 let mut html = String::from("<table>");
2792 for row in &table.rows {
2793 html.push_str("<tr>");
2794 for cell in &row.cells {
2795 html.push_str("<td>");
2796 html.push_str(&render_content_html(&cell.content));
2797 html.push_str("</td>");
2798 }
2799 html.push_str("</tr>");
2800 }
2801 html.push_str("</table>");
2802 html
2803}
2804
2805fn render_content_html(content: &[ContentNode]) -> String {
2806 content
2807 .iter()
2808 .map(|node| match node {
2809 ContentNode::Text {
2810 text,
2811 bold,
2812 italic,
2813 strike,
2814 link,
2815 } => render_marked_html(text, *bold, *italic, *strike, link.as_deref()),
2816 ContentNode::Image {
2817 url: Some(url),
2818 alt,
2819 ..
2820 } => {
2821 format!(
2822 "<img src=\"{}\" alt=\"{}\">",
2823 escape_html(url),
2824 escape_html(alt)
2825 )
2826 }
2827 ContentNode::Image { .. } => String::new(),
2828 })
2829 .collect()
2830}
2831
2832fn render_marked_html(
2833 text: &str,
2834 bold: bool,
2835 italic: bool,
2836 strike: bool,
2837 link: Option<&str>,
2838) -> String {
2839 let mut output = escape_html(text).replace('\n', "<br>");
2840 if bold {
2841 output = format!("<strong>{output}</strong>");
2842 }
2843 if italic {
2844 output = format!("<em>{output}</em>");
2845 }
2846 if strike {
2847 output = format!("<s>{output}</s>");
2848 }
2849 if let Some(link) = link {
2850 output = format!("<a href=\"{}\">{output}</a>", escape_html(link));
2851 }
2852 output
2853}
2854
/// Maps a paragraph style name to the HTML tag used to render it.
///
/// Titles and headings map to `h1`..`h6`; a missing or unrecognised style
/// maps to `p`.
fn paragraph_tag(style: Option<&str>) -> &'static str {
    const TAGS: [(&str, &str); 8] = [
        ("TITLE", "h1"),
        ("HEADING_1", "h1"),
        ("SUBTITLE", "h2"),
        ("HEADING_2", "h2"),
        ("HEADING_3", "h3"),
        ("HEADING_4", "h4"),
        ("HEADING_5", "h5"),
        ("HEADING_6", "h6"),
    ];

    let Some(name) = style else {
        return "p";
    };
    TAGS.iter()
        .find(|(known, _)| *known == name)
        .map_or("p", |(_, tag)| *tag)
}
2866
2867fn blocks_to_text(blocks: &[CapturedBlock]) -> String {
2868 blocks
2869 .iter()
2870 .map(|block| match block {
2871 CapturedBlock::Paragraph { content, .. } => content_to_text(content),
2872 CapturedBlock::Table(table) => table
2873 .rows
2874 .iter()
2875 .map(|row| {
2876 row.cells
2877 .iter()
2878 .map(|cell| content_to_text(&cell.content))
2879 .collect::<Vec<_>>()
2880 .join("\t")
2881 })
2882 .collect::<Vec<_>>()
2883 .join("\n"),
2884 })
2885 .filter(|text| !text.is_empty())
2886 .collect::<Vec<_>>()
2887 .join("\n")
2888}
2889
2890fn content_to_text(content: &[ContentNode]) -> String {
2891 content
2892 .iter()
2893 .map(|node| match node {
2894 ContentNode::Text { text, .. } => text.clone(),
2895 ContentNode::Image {
2896 url: Some(_), alt, ..
2897 } => format!("[{alt}]"),
2898 ContentNode::Image { .. } => String::new(),
2899 })
2900 .collect()
2901}
2902
/// Escapes the characters that are significant in HTML text and attribute
/// values: `&`, `<`, `>`, `"` and `'`.
///
/// The input is scanned once, so an ampersand that is already part of an
/// entity in `value` is escaped again (`&amp;` becomes `&amp;amp;`).
fn escape_html(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
2911
/// Makes text safe for a Markdown table cell: `|` is backslash-escaped and
/// newlines become `<br>`.
fn escape_markdown_table_cell(value: &str) -> String {
    let mut cell = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '|' => cell.push_str("\\|"),
            '\n' => cell.push_str("<br>"),
            other => cell.push(other),
        }
    }
    cell
}
2915
/// Extracts the token from an `Authorization: Bearer <token>` header value.
///
/// Surrounding whitespace is ignored and the `Bearer` scheme name is matched
/// ASCII case-insensitively, since HTTP authentication scheme names are
/// case-insensitive. Returns `None` when the value does not use the bearer
/// scheme or the token is empty.
#[must_use]
pub fn extract_bearer_token(auth_header: &str) -> Option<&str> {
    const SCHEME: &str = "bearer ";

    let trimmed = auth_header.trim();
    // `get` yields `None` when the value is too short or the cut would fall
    // inside a multi-byte character, so slicing below cannot panic.
    let scheme = trimmed.get(..SCHEME.len())?;
    if !scheme.eq_ignore_ascii_case(SCHEME) {
        return None;
    }
    let token = trimmed[SCHEME.len()..].trim();
    (!token.is_empty()).then_some(token)
}
2928
/// An image pulled out of a document so it can be stored as a separate file
/// in an archive.
#[derive(Debug, Clone)]
pub struct ExtractedImage {
    /// File name of the image (without the `images/` directory prefix).
    pub filename: String,
    /// Raw image bytes.
    pub data: Vec<u8>,
    /// MIME type of the image, e.g. `image/png`.
    pub mime_type: String,
}
2939
/// Everything needed to write a Google Docs archive: the document in HTML and
/// Markdown form plus the images extracted from it.
#[derive(Debug, Clone)]
pub struct GDocsArchiveResult {
    /// HTML form of the document.
    pub html: String,
    /// Markdown form of the document.
    pub markdown: String,
    /// Images extracted from the document.
    pub images: Vec<ExtractedImage>,
    /// Identifier of the source document.
    pub document_id: String,
    /// Export URL carried over from the source result.
    pub export_url: String,
}
2954
2955pub async fn localize_rendered_remote_images_for_archive(
2967 rendered: &GDocsRenderedResult,
2968) -> crate::Result<GDocsArchiveResult> {
2969 let client = reqwest::Client::builder().build().map_err(|error| {
2970 WebCaptureError::FetchError(format!("Failed to create image download client: {error}"))
2971 })?;
2972 let mut seen = HashMap::new();
2973 let mut images = Vec::new();
2974 let mut next_index = 1usize;
2975
2976 for image in &rendered.remote_images {
2977 if seen.contains_key(&image.url) {
2978 continue;
2979 }
2980 let filename = remote_image_filename(&image.url, next_index);
2981 next_index += 1;
2982 seen.insert(image.url.clone(), filename.clone());
2983
2984 match client
2985 .get(&image.url)
2986 .header("User-Agent", GDOCS_USER_AGENT)
2987 .header("Accept", "image/*,*/*;q=0.8")
2988 .send()
2989 .await
2990 {
2991 Ok(response) if response.status().is_success() => {
2992 let mime_type = response
2993 .headers()
2994 .get(reqwest::header::CONTENT_TYPE)
2995 .and_then(|value| value.to_str().ok())
2996 .map_or_else(|| mime_type_for_filename(&filename), ToString::to_string);
2997 let data = response.bytes().await.map_err(|error| {
2998 WebCaptureError::FetchError(format!(
2999 "Failed to read Google Docs image {}: {error}",
3000 image.url
3001 ))
3002 })?;
3003 debug!(
3004 url = %image.url,
3005 filename = %filename,
3006 bytes = data.len(),
3007 mime_type = %mime_type,
3008 "downloaded Google Docs browser-model archive image"
3009 );
3010 images.push(ExtractedImage {
3011 filename,
3012 data: data.to_vec(),
3013 mime_type,
3014 });
3015 }
3016 Ok(response) => {
3017 warn!(
3018 url = %image.url,
3019 status = response.status().as_u16(),
3020 "failed to download Google Docs browser-model archive image"
3021 );
3022 }
3023 Err(error) => {
3024 warn!(
3025 url = %image.url,
3026 error = %error,
3027 "failed to download Google Docs browser-model archive image"
3028 );
3029 }
3030 }
3031 }
3032
3033 let mut markdown = rendered.markdown.clone();
3034 let mut html = rendered.html.clone();
3035 for (url, filename) in seen {
3036 let local_path = format!("images/{filename}");
3037 markdown = markdown.replace(&url, &local_path);
3038 html = html.replace(&url, &local_path);
3039 }
3040
3041 Ok(GDocsArchiveResult {
3042 html,
3043 markdown,
3044 images,
3045 document_id: rendered.document_id.clone(),
3046 export_url: rendered.export_url.clone(),
3047 })
3048}
3049
3050fn remote_image_filename(url: &str, index: usize) -> String {
3051 let ext = crate::localize_images::get_extension_from_url(url);
3052 format!("image-{index:02}{ext}")
3053}
3054
/// Guesses an image MIME type from the text after the last `.` in `filename`
/// (case-insensitive). Unknown or missing extensions map to `image/png`.
fn mime_type_for_filename(filename: &str) -> String {
    let extension = filename.rsplit('.').next().unwrap_or("png").to_lowercase();
    let mime = if extension == "jpg" || extension == "jpeg" {
        "image/jpeg"
    } else if extension == "gif" {
        "image/gif"
    } else if extension == "webp" {
        "image/webp"
    } else if extension == "svg" {
        "image/svg+xml"
    } else {
        "image/png"
    };
    mime.to_string()
}
3071
3072fn base64_image_pattern() -> &'static Regex {
3073 static PATTERN: OnceLock<Regex> = OnceLock::new();
3074 PATTERN.get_or_init(|| {
3075 Regex::new(
3076 r#"(<img\s[^>]*src=")data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([^"]+)(")"#,
3077 )
3078 .unwrap()
3079 })
3080}
3081
3082#[must_use]
3095pub fn extract_base64_images(html: &str) -> (String, Vec<ExtractedImage>) {
3096 let mut images = Vec::new();
3097 let mut idx = 1u32;
3098
3099 let updated_html = base64_image_pattern()
3100 .replace_all(html, |caps: ®ex::Captures<'_>| {
3101 let prefix = &caps[1];
3102 let mime_ext = &caps[2];
3103 let base64_data = &caps[3];
3104 let suffix = &caps[4];
3105
3106 let ext = match mime_ext {
3107 "jpeg" => "jpg",
3108 "svg+xml" => "svg",
3109 other => other,
3110 };
3111
3112 let filename = format!("image-{idx:02}.{ext}");
3113 let mime_type = format!("image/{mime_ext}");
3114
3115 if let Ok(data) = base64::engine::general_purpose::STANDARD.decode(base64_data) {
3116 debug!("Extracted image: {} ({} bytes)", filename, data.len());
3117 images.push(ExtractedImage {
3118 filename: filename.clone(),
3119 data,
3120 mime_type,
3121 });
3122 }
3123
3124 idx += 1;
3125 format!("{prefix}images/{filename}{suffix}")
3126 })
3127 .into_owned();
3128
3129 (updated_html, images)
3130}
3131
3132pub async fn fetch_google_doc_as_archive(
3151 url: &str,
3152 api_token: Option<&str>,
3153) -> crate::Result<GDocsArchiveResult> {
3154 let result = fetch_google_doc(url, "html", api_token).await?;
3155
3156 let preprocess = preprocess_google_docs_export_html(&result.content);
3157 debug!(
3158 document_id = %result.document_id,
3159 hoisted = preprocess.hoisted,
3160 unwrapped_links = preprocess.unwrapped_links,
3161 "google-docs-export pre-processor rewrote archive markup"
3162 );
3163
3164 let (local_html, images) = extract_base64_images(&preprocess.html);
3165
3166 let markdown = normalize_google_docs_export_markdown(
3167 &crate::markdown::convert_html_to_markdown(&local_html, None)?,
3168 );
3169
3170 debug!(
3171 "Archive prepared: {} images extracted, {} bytes HTML, {} bytes Markdown",
3172 images.len(),
3173 local_html.len(),
3174 markdown.len()
3175 );
3176
3177 Ok(GDocsArchiveResult {
3178 html: local_html,
3179 markdown,
3180 images,
3181 document_id: result.document_id,
3182 export_url: result.export_url,
3183 })
3184}
3185
3186pub fn create_archive_zip(
3197 archive: &GDocsArchiveResult,
3198 pretty_html: bool,
3199) -> crate::Result<Vec<u8>> {
3200 let mut buf = std::io::Cursor::new(Vec::new());
3201
3202 {
3203 let mut zip = zip::ZipWriter::new(&mut buf);
3204 let options = zip::write::SimpleFileOptions::default()
3205 .compression_method(zip::CompressionMethod::Deflated);
3206
3207 zip.start_file("document.md", options)
3208 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3209 zip.write_all(archive.markdown.as_bytes())?;
3210
3211 let html_output = if pretty_html {
3212 crate::html::pretty_print_html(&archive.html)
3213 } else {
3214 archive.html.clone()
3215 };
3216 zip.start_file("document.html", options)
3217 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3218 zip.write_all(html_output.as_bytes())?;
3219
3220 for img in &archive.images {
3221 zip.start_file(format!("images/{}", img.filename), options)
3222 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3223 zip.write_all(&img.data)?;
3224 }
3225
3226 zip.finish()
3227 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3228 }
3229
3230 Ok(buf.into_inner())
3231}