1use async_tungstenite::tokio::{connect_async, ConnectStream};
32use async_tungstenite::tungstenite::Message;
33use async_tungstenite::WebSocketStream;
34use base64::Engine;
35use futures::{SinkExt, StreamExt};
36use regex::Regex;
37use serde_json::Value;
38use std::collections::HashMap;
39use std::fmt::Write as _;
40use std::hash::BuildHasher;
41use std::io::Write;
42use std::process::Stdio;
43use std::sync::OnceLock;
44use std::time::{Duration, Instant};
45use tokio::io::{AsyncBufReadExt, BufReader};
46use tokio::process::{Child, Command};
47use tracing::{debug, info, warn};
48
49use crate::WebCaptureError;
50
/// Base URL of Google Docs document pages; the document ID and an action
/// (`export`, `edit`) are appended to it by the URL builders in this file.
const GDOCS_EXPORT_BASE: &str = "https://docs.google.com/document/d";
/// Base URL of the Google Docs REST API `documents` resource.
const GDOCS_API_BASE: &str = "https://docs.googleapis.com/v1/documents";
/// Desktop Chrome `User-Agent` header value.
const GDOCS_USER_AGENT: &str =
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
/// 30-second duration.
/// NOTE(review): presumably the maximum wait for the editor model to appear in
/// the browser — the use site is outside this excerpt, confirm there.
const GDOCS_EDITOR_MODEL_WAIT: Duration = Duration::from_secs(30);
/// 20-second duration.
/// NOTE(review): presumably the browser launch timeout — confirm at the use site.
const GDOCS_BROWSER_LAUNCH_TIMEOUT: Duration = Duration::from_secs(20);

/// WebSocket stream over the tokio connect stream.
/// NOTE(review): the name suggests a Chrome DevTools Protocol connection — confirm.
type CdpWebSocket = WebSocketStream<ConnectStream>;
59
/// JavaScript source that records every value assigned to `window.DOCS_modelChunk`.
///
/// The script:
/// - initialises `window.__captured_chunks` to an empty array;
/// - defines `captureChunk`, which ignores falsy values, recurses into arrays,
///   and stores a JSON round-tripped copy of each value (the raw value when the
///   round trip throws);
/// - defines `wrapChunkArray`, which replaces an array's `push` so later pushes
///   are captured as well, marking the array so it is wrapped only once;
/// - installs a non-configurable accessor property `DOCS_modelChunk` on
///   `window` whose setter captures the assigned value and whose getter returns
///   the most recently assigned (wrapped) value.
///
/// NOTE(review): presumably injected before the page's own scripts run — the
/// injection site is outside this excerpt.
const GDOCS_MODEL_CAPTURE_INIT_SCRIPT: &str = r"
window.__captured_chunks = [];
const captureChunk = (value) => {
 if (!value) {
 return;
 }
 if (Array.isArray(value)) {
 for (const item of value) {
 captureChunk(item);
 }
 return;
 }
 try {
 window.__captured_chunks.push(JSON.parse(JSON.stringify(value)));
 } catch {
 window.__captured_chunks.push(value);
 }
};
const wrapChunkArray = (value) => {
 if (!Array.isArray(value) || value.__webCaptureDocsModelWrapped) {
 return value;
 }
 const originalPush = value.push;
 Object.defineProperty(value, '__webCaptureDocsModelWrapped', {
 value: true,
 enumerable: false,
 });
 Object.defineProperty(value, 'push', {
 value(...items) {
 for (const item of items) {
 captureChunk(item);
 }
 return originalPush.apply(this, items);
 },
 writable: true,
 configurable: true,
 });
 for (const item of value) {
 captureChunk(item);
 }
 return value;
};
Object.defineProperty(window, 'DOCS_modelChunk', {
 set(value) {
 captureChunk(value);
 window.__DOCS_modelChunk_latest = wrapChunkArray(value);
 },
 get() {
 return window.__DOCS_modelChunk_latest;
 },
 configurable: false,
});
";
113
/// JavaScript arrow function that returns `{ chunks, cidUrlMap }` from the page.
///
/// `chunks` is a copy of `window.__captured_chunks`; when nothing was captured
/// but `window.DOCS_modelChunk` is set, that value is used as the only chunk.
/// `cidUrlMap` is built by scanning every `<script>` element whose text
/// mentions `docs-images-rt` for `"<id>": "<docs-images-rt URL>"` pairs (ids of
/// 20+ characters), un-escaping `\u003d`, `\u0026` and `\/` in each URL.
const GDOCS_MODEL_EXTRACT_SCRIPT: &str = r#"() => {
 const chunks = [...(window.__captured_chunks || [])];
 if (
 window.DOCS_modelChunk &&
 chunks.length === 0 &&
 !chunks.includes(window.DOCS_modelChunk)
 ) {
 chunks.push(window.DOCS_modelChunk);
 }
 const cidUrlMap = {};
 const scripts = document.querySelectorAll('script');
 for (const script of scripts) {
 const text = script.textContent || '';
 if (!text.includes('docs-images-rt')) {
 continue;
 }
 const regex =
 /"([A-Za-z0-9_-]{20,})"\s*:\s*"(https:\/\/docs\.google\.com\/docs-images-rt\/[^"]+)"/g;
 let match;
 while ((match = regex.exec(text)) !== null) {
 cidUrlMap[match[1]] = match[2]
 .replace(/\\u003d/g, '=')
 .replace(/\\u0026/g, '&')
 .replace(/\\\//g, '/');
 }
 }
 return { chunks, cidUrlMap };
}"#;
142
143fn gdocs_url_pattern() -> &'static Regex {
144 static PATTERN: OnceLock<Regex> = OnceLock::new();
145 PATTERN.get_or_init(|| Regex::new(r"docs\.google\.com/document/d/([a-zA-Z0-9_-]+)").unwrap())
146}
147
/// Result of fetching a Google Doc through the export endpoint.
#[derive(Debug, Clone)]
pub struct GDocsResult {
    /// Document body, in the format named by `format`.
    pub content: String,
    /// Label of the format `content` is in (the requested export format, or
    /// `"markdown"` after conversion).
    pub format: String,
    /// Document ID extracted from the source URL.
    pub document_id: String,
    /// Export URL the content was downloaded from.
    pub export_url: String,
}
160
/// Strategy used to capture a Google Doc.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GDocsCaptureMethod {
    /// Selected for the `"browser"` capture mode.
    BrowserModel,
    /// Selected for the `"api"` capture mode when no API token is available.
    PublicExport,
    /// Selected for the `"api"` capture mode when an API token is available.
    DocsApi,
}
171
/// A Google Doc rendered into several output formats.
///
/// NOTE(review): the code that fills this struct is outside this excerpt; the
/// field notes below follow the field names and types only — confirm.
#[derive(Debug, Clone)]
pub struct GDocsRenderedResult {
    /// Markdown rendering of the document.
    pub markdown: String,
    /// HTML rendering of the document.
    pub html: String,
    /// Text rendering of the document.
    pub text: String,
    /// ID of the document.
    pub document_id: String,
    /// Export URL associated with the document.
    pub export_url: String,
    /// Remote images referenced by the document.
    pub remote_images: Vec<RemoteImage>,
}
188
/// An image referenced by URL together with its alternative text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteImage {
    /// Image URL.
    pub url: String,
    /// Alternative text for the image.
    pub alt: String,
}
197
/// Raw model data taken from the browser.
///
/// NOTE(review): presumably the deserialised `{ chunks, cidUrlMap }` object
/// returned by `GDOCS_MODEL_EXTRACT_SCRIPT` — confirm at the construction site.
#[derive(Debug, Clone)]
struct BrowserModelData {
    /// Captured model chunks as untyped JSON values.
    chunks: Vec<Value>,
    /// Map from string key to URL (presumably image content id → image URL).
    cid_urls: HashMap<String, String>,
}
203
/// Structured representation of a captured document.
///
/// NOTE(review): how the fields relate to each other (for example whether
/// `tables` and `images` duplicate entries of `blocks`) is not visible in this
/// excerpt — confirm at the construction site.
#[derive(Debug, Clone, Default)]
pub struct CapturedDocument {
    /// Block-level content (paragraphs and tables).
    pub blocks: Vec<CapturedBlock>,
    /// Tables of the document.
    pub tables: Vec<TableBlock>,
    /// Image nodes of the document.
    pub images: Vec<ContentNode>,
    /// Text of the document.
    pub text: String,
}
216
/// A block-level element of a captured document.
#[derive(Debug, Clone)]
pub enum CapturedBlock {
    /// A paragraph with its inline content and paragraph-level attributes.
    Paragraph {
        /// Inline nodes (text runs and images) of the paragraph.
        content: Vec<ContentNode>,
        /// Optional paragraph style name.
        style: Option<String>,
        /// List membership, when the paragraph is a list item.
        list: Option<ListMeta>,
        /// Whether the paragraph is a quote.
        quote: bool,
        /// Whether the paragraph is a horizontal rule.
        horizontal_rule: bool,
    },
    /// A table.
    Table(TableBlock),
}
236
/// A table, stored as an ordered list of rows.
#[derive(Debug, Clone, Default)]
pub struct TableBlock {
    /// Rows of the table.
    pub rows: Vec<TableRow>,
}
243
/// A single table row.
#[derive(Debug, Clone, Default)]
pub struct TableRow {
    /// Cells of the row.
    pub cells: Vec<TableCell>,
}
250
/// A single table cell.
#[derive(Debug, Clone, Default)]
pub struct TableCell {
    /// Inline nodes contained in the cell.
    pub content: Vec<ContentNode>,
}
257
/// An inline node of captured content: a styled text run or an image.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentNode {
    /// A run of text with uniform inline formatting.
    Text {
        /// Text of the run.
        text: String,
        /// Whether the run is bold.
        bold: bool,
        /// Whether the run is italic.
        italic: bool,
        /// Whether the run is struck through.
        strike: bool,
        /// Link target, when the run is a hyperlink.
        link: Option<String>,
    },
    /// An image.
    Image {
        /// Optional content id of the image (presumably a key into the
        /// cid → URL map — confirm).
        cid: Option<String>,
        /// Optional image URL.
        url: Option<String>,
        /// Alternative text.
        alt: String,
        /// Optional width, kept as a string.
        width: Option<String>,
        /// Optional height, kept as a string.
        height: Option<String>,
        /// Whether the image belongs to a suggestion.
        is_suggestion: bool,
    },
}
290
/// Inline formatting flags of a text run; the default is unformatted, no link.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct TextStyle {
    /// Bold.
    bold: bool,
    /// Italic.
    italic: bool,
    /// Strikethrough.
    strike: bool,
    /// Link target, if any.
    link: Option<String>,
}
298
/// Paragraph-level attributes; mirrors the non-content fields of
/// `CapturedBlock::Paragraph`.
#[derive(Debug, Clone, Default)]
struct ParagraphMeta {
    /// Optional paragraph style name.
    style: Option<String>,
    /// List membership, when the paragraph is a list item.
    list: Option<ListMeta>,
    /// Whether the paragraph is a quote.
    quote: bool,
    /// Whether the paragraph is a horizontal rule.
    horizontal_rule: bool,
}
306
/// List membership of a paragraph.
#[derive(Debug, Clone)]
pub struct ListMeta {
    /// Identifier of the list the item belongs to.
    pub id: String,
    /// Nesting level of the item (presumably 0 for top level — confirm).
    pub level: usize,
    /// Whether the list is ordered (numbered) rather than bulleted.
    pub ordered: bool,
}
316
/// Paragraph style information taken from the document model.
#[derive(Debug, Clone)]
struct ParagraphStyle {
    /// Optional paragraph style name.
    style: Option<String>,
    /// Start indent (units not visible in this excerpt — presumably points).
    indent_start: f64,
    /// First-line indent (same unit as `indent_start`).
    indent_first_line: f64,
}
323
/// Style lookup tables derived from the document model.
///
/// NOTE(review): the indexing scheme is inferred from the field names only
/// (`*_by_end` presumably keyed by the end offset of a paragraph) — confirm at
/// the code that builds and reads these maps.
#[derive(Debug, Clone, Default)]
struct ModelStyleMaps {
    /// Inline text styles (presumably indexed by character position).
    inline_styles: Vec<TextStyle>,
    /// Paragraph styles keyed by a `usize` position.
    paragraph_by_end: HashMap<usize, ParagraphStyle>,
    /// List metadata keyed by a `usize` position.
    list_by_end: HashMap<usize, ListMeta>,
    /// Positions at which a horizontal rule occurs.
    horizontal_rules: std::collections::HashSet<usize>,
}
331
332#[must_use]
334pub fn is_google_docs_url(url: &str) -> bool {
335 gdocs_url_pattern().is_match(url)
336}
337
338#[must_use]
342pub fn extract_document_id(url: &str) -> Option<String> {
343 gdocs_url_pattern()
344 .captures(url)
345 .and_then(|caps| caps.get(1))
346 .map(|m| m.as_str().to_string())
347}
348
349#[must_use]
356pub fn build_export_url(document_id: &str, format: &str) -> String {
357 let export_format = match format {
358 "html" | "txt" | "md" | "pdf" | "docx" | "epub" | "zip" => format,
359 _ => "html",
360 };
361 format!("{GDOCS_EXPORT_BASE}/{document_id}/export?format={export_format}")
362}
363
364#[must_use]
366pub fn build_edit_url(document_id: &str) -> String {
367 format!("{GDOCS_EXPORT_BASE}/{document_id}/edit")
368}
369
370#[must_use]
372pub fn build_docs_api_url(document_id: &str) -> String {
373 format!("{GDOCS_API_BASE}/{document_id}")
374}
375
376pub fn select_capture_method(
382 capture: &str,
383 api_token: Option<&str>,
384) -> crate::Result<GDocsCaptureMethod> {
385 match capture.to_lowercase().as_str() {
386 "browser" => Ok(GDocsCaptureMethod::BrowserModel),
387 "api" if api_token.is_some() => Ok(GDocsCaptureMethod::DocsApi),
388 "api" => Ok(GDocsCaptureMethod::PublicExport),
389 other => Err(WebCaptureError::InvalidUrl(format!(
390 "Unsupported Google Docs capture method \"{other}\". Use \"browser\" or \"api\"."
391 ))),
392 }
393}
394
395pub async fn fetch_google_doc(
410 url: &str,
411 format: &str,
412 api_token: Option<&str>,
413) -> crate::Result<GDocsResult> {
414 let document_id = extract_document_id(url).ok_or_else(|| {
415 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
416 })?;
417
418 let export_url = build_export_url(&document_id, format);
419 debug!(
420 document_id = %document_id,
421 format = %format,
422 export_url = %export_url,
423 has_api_token = api_token.is_some(),
424 "fetching Google Doc via public export"
425 );
426
427 let mut request = reqwest::Client::new()
428 .get(&export_url)
429 .header(
430 "User-Agent",
431 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
432 )
433 .header("Accept-Charset", "utf-8")
434 .header("Accept-Language", "en-US,en;q=0.9");
435
436 if let Some(token) = api_token {
437 request = request.header("Authorization", format!("Bearer {token}"));
438 }
439
440 let response = request
441 .send()
442 .await
443 .map_err(|e| WebCaptureError::FetchError(format!("Failed to fetch Google Doc: {e}")))?;
444 debug!(
445 document_id = %document_id,
446 status = response.status().as_u16(),
447 success = response.status().is_success(),
448 content_type = response
449 .headers()
450 .get(reqwest::header::CONTENT_TYPE)
451 .and_then(|value| value.to_str().ok())
452 .unwrap_or(""),
453 "received Google Docs public export response"
454 );
455
456 if !response.status().is_success() {
457 return Err(WebCaptureError::FetchError(format!(
458 "Failed to fetch Google Doc ({} {}): {}",
459 response.status().as_u16(),
460 response.status().canonical_reason().unwrap_or("Unknown"),
461 export_url
462 )));
463 }
464
465 let raw_content = response.text().await.map_err(|e| {
466 WebCaptureError::FetchError(format!("Failed to read Google Doc response: {e}"))
467 })?;
468 debug!(
469 document_id = %document_id,
470 bytes = raw_content.len(),
471 "read Google Docs public export body"
472 );
473
474 let content = match format {
476 "html" | "txt" | "md" => crate::html::decode_html_entities(&raw_content),
477 _ => raw_content,
478 };
479
480 Ok(GDocsResult {
481 content,
482 format: format.to_string(),
483 document_id,
484 export_url,
485 })
486}
487
488pub async fn fetch_google_doc_as_markdown(
502 url: &str,
503 api_token: Option<&str>,
504) -> crate::Result<GDocsResult> {
505 let result = fetch_google_doc(url, "html", api_token).await?;
506
507 let preprocess = preprocess_google_docs_export_html(&result.content);
508 debug!(
509 document_id = %result.document_id,
510 hoisted = preprocess.hoisted,
511 unwrapped_links = preprocess.unwrapped_links,
512 "google-docs-export pre-processor rewrote markup"
513 );
514 let markdown = normalize_google_docs_export_markdown(
515 &crate::markdown::convert_html_to_markdown(&preprocess.html, Some(&result.export_url))?,
516 );
517 debug!(
518 document_id = %result.document_id,
519 bytes = markdown.len(),
520 "rendered Google Docs public export markdown"
521 );
522
523 Ok(GDocsResult {
524 content: markdown,
525 format: "markdown".to_string(),
526 document_id: result.document_id,
527 export_url: result.export_url,
528 })
529}
530
/// Output of [`preprocess_google_docs_export_html`].
#[derive(Debug, Clone)]
pub struct GDocsExportPreprocessResult {
    /// The rewritten HTML.
    pub html: String,
    /// Number of `<span>` elements replaced by semantic tags.
    pub hoisted: usize,
    /// Number of Google redirect links rewritten to their target URL.
    pub unwrapped_links: usize,
}
544
545#[must_use]
553pub fn preprocess_google_docs_export_html(html: &str) -> GDocsExportPreprocessResult {
554 let mut hoisted: usize = 0;
555 let mut unwrapped_links: usize = 0;
556 let class_styles = extract_css_class_styles(html);
557
558 let mut out = hoist_inline_style_spans(html, &mut hoisted);
559 out = hoist_class_style_spans(&out, &class_styles, &mut hoisted);
560 out = convert_class_indented_blockquotes(&out, &class_styles);
561 out = nest_google_docs_lists(&out, &class_styles);
562 out = strip_google_docs_heading_noise(&out);
563 out = strip_heading_inline_formatting(&out);
564 out = unwrap_google_redirect_links(&out, &mut unwrapped_links);
565 out = out.replace(" ", " ");
566 out = out.replace('\u{00A0}', " ");
567
568 GDocsExportPreprocessResult {
569 html: out,
570 hoisted,
571 unwrapped_links,
572 }
573}
574
575#[must_use]
577pub fn normalize_google_docs_export_markdown(markdown: &str) -> String {
578 let markdown = unescape_public_export_punctuation(markdown);
579 let markdown = convert_setext_headings(&markdown);
580 let markdown = normalize_atx_headings(&markdown);
581 let markdown = normalize_bullet_markers(&markdown);
582 let markdown = normalize_list_spacing(&markdown);
583 let markdown = normalize_blockquote_spacing(&markdown);
584 let markdown = normalize_markdown_tables(&markdown);
585 crate::markdown::clean_markdown(&markdown)
586}
587
588fn hoist_inline_style_spans(html: &str, hoisted: &mut usize) -> String {
589 let span_re = Regex::new(r#"(?is)<span\s+([^>]*style="([^"]*)"[^>]*)>(.*?)</span>"#)
590 .expect("valid regex");
591 span_re
592 .replace_all(html, |caps: ®ex::Captures<'_>| {
593 let style = caps.get(2).map_or("", |m| m.as_str());
594 let inner = caps.get(3).map_or("", |m| m.as_str());
595 semantic_wrapped_html(inner, style).map_or_else(
596 || caps[0].to_string(),
597 |wrapped| {
598 *hoisted += 1;
599 wrapped
600 },
601 )
602 })
603 .into_owned()
604}
605
606fn hoist_class_style_spans(
607 html: &str,
608 class_styles: &HashMap<String, String>,
609 hoisted: &mut usize,
610) -> String {
611 let class_span_re = Regex::new(r#"(?is)<span\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</span>"#)
612 .expect("valid regex");
613 class_span_re
614 .replace_all(html, |caps: ®ex::Captures<'_>| {
615 let class_attr = caps.get(2).map_or("", |m| m.as_str());
616 let inner = caps.get(3).map_or("", |m| m.as_str());
617 let style = combined_class_style(class_styles, class_attr);
618 semantic_wrapped_html(inner, &style).map_or_else(
619 || caps[0].to_string(),
620 |wrapped| {
621 *hoisted += 1;
622 wrapped
623 },
624 )
625 })
626 .into_owned()
627}
628
629fn convert_class_indented_blockquotes(
630 html: &str,
631 class_styles: &HashMap<String, String>,
632) -> String {
633 let class_paragraph_re =
634 Regex::new(r#"(?is)<p\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</p>"#).expect("valid regex");
635 class_paragraph_re
636 .replace_all(html, |caps: ®ex::Captures<'_>| {
637 let class_attr = caps.get(2).map_or("", |m| m.as_str());
638 let inner = caps.get(3).map_or("", |m| m.as_str());
639 let style = combined_class_style(class_styles, class_attr);
640 if is_blockquote_style(&style) {
641 format!("<blockquote><p>{inner}</p></blockquote>")
642 } else {
643 caps[0].to_string()
644 }
645 })
646 .into_owned()
647}
648
/// One `<ul>`/`<ol>` element located in the export HTML.
#[derive(Debug, Clone)]
struct ExportListBlock {
    /// Byte offset of the start of the element in the source HTML.
    start: usize,
    /// Byte offset just past the end of the element in the source HTML.
    end: usize,
    /// Lower-cased tag name (`ul` or `ol`).
    tag: String,
    /// Markup between the opening and closing list tags.
    inner: String,
}
656
/// One `<li>` element taken from an [`ExportListBlock`].
#[derive(Debug, Clone)]
struct ExportListItem {
    /// Tag name of the list the item came from (`ul` or `ol`).
    tag: String,
    /// Nesting level derived from the item's left margin (0 = top level).
    level: usize,
    /// Markup between `<li …>` and `</li>`.
    inner: String,
}
663
664fn nest_google_docs_lists(html: &str, class_styles: &HashMap<String, String>) -> String {
665 let list_re = Regex::new(r"(?is)<(ul|ol)\b([^>]*)>(.*?)</(ul|ol)>").expect("valid regex");
666 let blocks: Vec<ExportListBlock> = list_re
667 .captures_iter(html)
668 .filter_map(|caps| {
669 let open_tag = caps.get(1)?.as_str().to_ascii_lowercase();
670 let close_tag = caps.get(4)?.as_str().to_ascii_lowercase();
671 if open_tag != close_tag {
672 return None;
673 }
674 let whole = caps.get(0)?;
675 Some(ExportListBlock {
676 start: whole.start(),
677 end: whole.end(),
678 tag: open_tag,
679 inner: caps.get(3).map_or("", |m| m.as_str()).to_string(),
680 })
681 })
682 .collect();
683
684 if blocks.len() < 2 {
685 return html.to_string();
686 }
687
688 let mut groups: Vec<Vec<ExportListBlock>> = Vec::new();
689 let mut current: Vec<ExportListBlock> = Vec::new();
690 for block in blocks {
691 if let Some(previous) = current.last() {
692 if !html[previous.end..block.start].trim().is_empty() {
693 if current.len() > 1 {
694 groups.push(std::mem::take(&mut current));
695 } else {
696 current.clear();
697 }
698 }
699 }
700 current.push(block);
701 }
702 if current.len() > 1 {
703 groups.push(current);
704 }
705
706 if groups.is_empty() {
707 return html.to_string();
708 }
709
710 let mut out = html.to_string();
711 for group in groups.iter().rev() {
712 let rendered = render_nested_list_group(group, class_styles);
713 let start = group.first().expect("non-empty group").start;
714 let end = group.last().expect("non-empty group").end;
715 out.replace_range(start..end, &rendered);
716 }
717 out
718}
719
720fn render_nested_list_group(
721 group: &[ExportListBlock],
722 class_styles: &HashMap<String, String>,
723) -> String {
724 let item_re = Regex::new(r"(?is)<li\b([^>]*)>(.*?)</li>").expect("valid regex");
725 let items: Vec<ExportListItem> = group
726 .iter()
727 .flat_map(|block| {
728 item_re.captures_iter(&block.inner).map(|caps| {
729 let attrs = caps.get(1).map_or("", |m| m.as_str());
730 let inner = caps.get(2).map_or("", |m| m.as_str()).to_string();
731 ExportListItem {
732 tag: block.tag.clone(),
733 level: google_docs_list_item_level(attrs, class_styles),
734 inner,
735 }
736 })
737 })
738 .collect();
739
740 if items.is_empty() {
741 let mut unchanged = String::new();
742 for block in group {
743 write!(unchanged, "<{}>{}</{}>", block.tag, block.inner, block.tag)
744 .expect("write to String");
745 }
746 return unchanged;
747 }
748
749 let mut html = String::new();
750 let mut current_level: Option<usize> = None;
751 let mut open_tags: Vec<Option<String>> = Vec::new();
752 let mut item_open: Vec<bool> = Vec::new();
753
754 for item in items {
755 let level = item.level;
756 while current_level.is_some_and(|current| current > level) {
757 let current = current_level.expect("checked as Some");
758 close_rendered_list(&mut html, &mut open_tags, &mut item_open, current);
759 current_level = current.checked_sub(1);
760 }
761
762 while current_level.is_none_or(|current| current < level) {
763 let next_level = current_level.map_or(0, |current| current + 1);
764 open_rendered_list(
765 &mut html,
766 &mut open_tags,
767 &mut item_open,
768 next_level,
769 &item.tag,
770 );
771 current_level = Some(next_level);
772 }
773
774 ensure_list_stack(&mut open_tags, &mut item_open, level);
775 if open_tags[level]
776 .as_deref()
777 .is_some_and(|tag| tag != item.tag)
778 {
779 close_rendered_list(&mut html, &mut open_tags, &mut item_open, level);
780 open_rendered_list(&mut html, &mut open_tags, &mut item_open, level, &item.tag);
781 } else if open_tags[level].is_none() {
782 open_rendered_list(&mut html, &mut open_tags, &mut item_open, level, &item.tag);
783 }
784
785 close_rendered_item(&mut html, &mut item_open, level);
786 html.push_str("<li>");
787 html.push_str(&item.inner);
788 item_open[level] = true;
789
790 for deeper in (level + 1)..item_open.len() {
791 item_open[deeper] = false;
792 open_tags[deeper] = None;
793 }
794 }
795
796 while let Some(current) = current_level {
797 close_rendered_list(&mut html, &mut open_tags, &mut item_open, current);
798 current_level = current.checked_sub(1);
799 }
800
801 html
802}
803
/// Grows both per-level stacks until `open_tags` has an entry for `level`.
///
/// The same number of default entries (`None` / `false`) is appended to both
/// vectors; existing entries are never modified.
fn ensure_list_stack(open_tags: &mut Vec<Option<String>>, item_open: &mut Vec<bool>, level: usize) {
    let missing = (level + 1).saturating_sub(open_tags.len());
    open_tags.extend(std::iter::repeat_with(|| None).take(missing));
    item_open.extend(std::iter::repeat(false).take(missing));
}
810
811fn open_rendered_list(
812 html: &mut String,
813 open_tags: &mut Vec<Option<String>>,
814 item_open: &mut Vec<bool>,
815 level: usize,
816 tag: &str,
817) {
818 ensure_list_stack(open_tags, item_open, level);
819 html.push('<');
820 html.push_str(tag);
821 html.push('>');
822 open_tags[level] = Some(tag.to_string());
823 item_open[level] = false;
824}
825
/// Emits `</li>` for `level` when an item is open there and clears the flag.
///
/// Does nothing when no item is open at `level` or `level` is out of range.
fn close_rendered_item(html: &mut String, item_open: &mut [bool], level: usize) {
    match item_open.get_mut(level) {
        Some(flag) if *flag => {
            html.push_str("</li>");
            *flag = false;
        }
        _ => {}
    }
}
832
833fn close_rendered_list(
834 html: &mut String,
835 open_tags: &mut [Option<String>],
836 item_open: &mut [bool],
837 level: usize,
838) {
839 close_rendered_item(html, item_open, level);
840 if let Some(tag) = open_tags.get_mut(level).and_then(Option::take) {
841 html.push_str("</");
842 html.push_str(&tag);
843 html.push('>');
844 }
845}
846
847fn google_docs_list_item_level(attrs: &str, class_styles: &HashMap<String, String>) -> usize {
848 let style = combined_attr_style(class_styles, attrs);
849 let margin_left = css_point_value(&style, "margin-left");
850 if margin_left <= 0.0 {
851 return 0;
852 }
853 [54.0, 90.0, 126.0, 162.0, 198.0, 234.0, 270.0, 306.0]
854 .iter()
855 .take_while(|boundary| margin_left >= **boundary)
856 .count()
857}
858
859fn combined_attr_style(class_styles: &HashMap<String, String>, attrs: &str) -> String {
860 let mut styles = String::new();
861 if let Some(style) = attr_value(attrs, "style") {
862 styles.push_str(&style);
863 }
864 if let Some(class_attr) = attr_value(attrs, "class") {
865 styles.push_str(&combined_class_style(class_styles, &class_attr));
866 }
867 styles
868}
869
870fn attr_value(attrs: &str, name: &str) -> Option<String> {
871 let attr_re = Regex::new(&format!(
872 r#"(?is)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)')"#,
873 regex::escape(name)
874 ))
875 .expect("valid regex");
876 attr_re.captures(attrs).and_then(|caps| {
877 caps.get(1)
878 .or_else(|| caps.get(2))
879 .map(|value| value.as_str().to_string())
880 })
881}
882
883fn strip_google_docs_heading_noise(html: &str) -> String {
884 let empty_anchor_re = Regex::new(r#"(?is)<a\s+id="[^"]*"\s*>\s*</a>"#).expect("valid regex");
885 let numbering_re =
886 Regex::new(r"(?is)<span\b[^>]*>\s*\d+(?:\.\d+)*\.?\s*</span>").expect("valid regex");
887 let mut out = empty_anchor_re.replace_all(html, "").into_owned();
888 for level in 1..=6 {
889 let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
890 .expect("valid regex");
891 out = heading_re
892 .replace_all(&out, |caps: ®ex::Captures<'_>| {
893 let open = &caps[1];
894 let inner = &caps[2];
895 let close = &caps[3];
896 let mut cleaned = empty_anchor_re.replace_all(inner, "").into_owned();
897 cleaned = numbering_re.replace_all(&cleaned, "").into_owned();
898 format!("{open}{cleaned}{close}")
899 })
900 .into_owned();
901 }
902 out
903}
904
905fn strip_heading_inline_formatting(html: &str) -> String {
906 let inline_marker_re = Regex::new(r"(?is)</?(?:strong|em|del)>").expect("valid regex");
907 let mut out = html.to_string();
908 for level in 1..=6 {
909 let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
910 .expect("valid regex");
911 out = heading_re
912 .replace_all(&out, |caps: ®ex::Captures<'_>| {
913 let open = &caps[1];
914 let inner = &caps[2];
915 let close = &caps[3];
916 let cleaned = inline_marker_re.replace_all(inner, "");
917 format!("{open}{cleaned}{close}")
918 })
919 .into_owned();
920 }
921 out
922}
923
924fn unwrap_google_redirect_links(html: &str, unwrapped_links: &mut usize) -> String {
925 let redirect_re =
926 Regex::new(r#"(?i)href="https?://(?:www\.)?google\.com/url\?q=([^&"]+)[^"]*""#)
927 .expect("valid regex");
928 redirect_re
929 .replace_all(html, |caps: ®ex::Captures<'_>| {
930 let encoded = caps.get(1).map_or("", |m| m.as_str());
931 let decoded = percent_decode_utf8_lossy(encoded);
932 *unwrapped_links += 1;
933 format!(r#"href="{decoded}""#)
934 })
935 .into_owned()
936}
937
938fn extract_css_class_styles(html: &str) -> HashMap<String, String> {
939 let mut class_styles: HashMap<String, String> = HashMap::new();
940 let style_re = Regex::new(r"(?is)<style\b[^>]*>(.*?)</style>").expect("valid regex");
941 let class_re = Regex::new(r"\.([A-Za-z0-9_-]+)\s*\{([^{}]*)\}").expect("valid regex");
942 for style_caps in style_re.captures_iter(html) {
943 let css = style_caps.get(1).map_or("", |m| m.as_str());
944 for class_caps in class_re.captures_iter(css) {
945 let class_name = class_caps.get(1).map_or("", |m| m.as_str());
946 let style = class_caps.get(2).map_or("", |m| m.as_str());
947 class_styles
948 .entry(class_name.to_string())
949 .and_modify(|existing| {
950 existing.push(';');
951 existing.push_str(style);
952 })
953 .or_insert_with(|| style.to_string());
954 }
955 }
956 class_styles
957}
958
/// Concatenates the CSS declarations of every known class in `class_attr`.
///
/// `class_attr` is a whitespace-separated class list. Each known class
/// contributes `;` followed by its declarations, in attribute order; unknown
/// classes are skipped. Returns an empty string when no class is known.
fn combined_class_style(class_styles: &HashMap<String, String>, class_attr: &str) -> String {
    let mut combined = String::new();
    for class_name in class_attr.split_whitespace() {
        if let Some(declarations) = class_styles.get(class_name) {
            combined.push(';');
            combined.push_str(declarations);
        }
    }
    combined
}
969
970fn semantic_wrapped_html(inner: &str, style: &str) -> Option<String> {
971 let bold = css_has_bold(style);
972 let italic = css_has_italic(style);
973 let strike = css_has_strike(style);
974 if !bold && !italic && !strike {
975 return None;
976 }
977 let mut wrapped = inner.to_string();
978 if strike {
979 wrapped = format!("<del>{wrapped}</del>");
980 }
981 if italic {
982 wrapped = format!("<em>{wrapped}</em>");
983 }
984 if bold {
985 wrapped = format!("<strong>{wrapped}</strong>");
986 }
987 Some(wrapped)
988}
989
990fn css_has_bold(style: &str) -> bool {
991 Regex::new(r"(?i)font-weight\s*:\s*(?:bold|[6-9]\d{2})")
992 .expect("valid regex")
993 .is_match(style)
994}
995
996fn css_has_italic(style: &str) -> bool {
997 Regex::new(r"(?i)font-style\s*:\s*italic")
998 .expect("valid regex")
999 .is_match(style)
1000}
1001
1002fn css_has_strike(style: &str) -> bool {
1003 Regex::new(r"(?i)text-decoration[^;]*\bline-through\b")
1004 .expect("valid regex")
1005 .is_match(style)
1006}
1007
1008fn is_blockquote_style(style: &str) -> bool {
1009 let margin_left = css_point_value(style, "margin-left");
1010 let margin_right = css_point_value(style, "margin-right");
1011 margin_left > 0.0 && margin_right > 0.0 && (margin_left - margin_right).abs() < 0.1
1012}
1013
1014fn css_point_value(style: &str, property: &str) -> f64 {
1015 let re = Regex::new(&format!(
1016 r"(?i){}\s*:\s*(-?\d+(?:\.\d+)?)pt",
1017 regex::escape(property)
1018 ))
1019 .expect("valid regex");
1020 re.captures(style)
1021 .and_then(|caps| caps.get(1))
1022 .and_then(|value| value.as_str().parse::<f64>().ok())
1023 .unwrap_or(0.0)
1024}
1025
/// Decodes `%XX` escapes in `input` and interprets the bytes as UTF-8.
///
/// A `%` that is not followed by two hexadecimal digits is copied through
/// unchanged. Byte sequences that are not valid UTF-8 after decoding are
/// replaced with U+FFFD. `+` is not treated as a space.
fn percent_decode_utf8_lossy(input: &str) -> String {
    fn hex_value(byte: u8) -> Option<u8> {
        match byte {
            b'0'..=b'9' => Some(byte - b'0'),
            b'a'..=b'f' => Some(byte - b'a' + 10),
            b'A'..=b'F' => Some(byte - b'A' + 10),
            _ => None,
        }
    }

    let mut decoded = Vec::with_capacity(input.len());
    let mut rest = input.as_bytes();
    while let Some((&first, tail)) = rest.split_first() {
        if first == b'%' {
            if let [hi, lo, after @ ..] = tail {
                if let (Some(hi), Some(lo)) = (hex_value(*hi), hex_value(*lo)) {
                    decoded.push((hi << 4) | lo);
                    rest = after;
                    continue;
                }
            }
        }
        decoded.push(first);
        rest = tail;
    }
    String::from_utf8_lossy(&decoded).into_owned()
}
1049
/// Removes the backslash escapes in front of `.`, `!`, `(`, `)`, `[` and `]`.
///
/// Other backslash sequences are left untouched.
fn unescape_public_export_punctuation(markdown: &str) -> String {
    const ESCAPES: [(&str, &str); 6] = [
        ("\\.", "."),
        ("\\!", "!"),
        ("\\(", "("),
        ("\\)", ")"),
        ("\\[", "["),
        ("\\]", "]"),
    ];

    ESCAPES
        .iter()
        .fold(markdown.to_owned(), |text, &(escaped, plain)| {
            text.replace(escaped, plain)
        })
}
1059
/// Converts setext headings (a title line underlined with `=` or `-`) to ATX
/// headings (`# Title` / `## Title`).
///
/// An underline counts only when it is at least five characters long and the
/// line above it is not blank. An underline below a blank line is a thematic
/// break, not a heading, so both lines are kept as they are. The result is
/// joined with `\n` and has no trailing newline.
fn convert_setext_headings(markdown: &str) -> String {
    let lines: Vec<&str> = markdown.lines().collect();
    let mut out: Vec<String> = Vec::with_capacity(lines.len());
    let mut index = 0;
    while index < lines.len() {
        let title = lines[index].trim();
        let hashes = lines
            .get(index + 1)
            .map(|next| next.trim())
            // A blank "title" means the next line is a rule, not an underline.
            .filter(|_| !title.is_empty())
            .and_then(|underline| {
                if is_setext_underline(underline, '=') {
                    Some("#")
                } else if is_setext_underline(underline, '-') {
                    Some("##")
                } else {
                    None
                }
            });

        if let Some(hashes) = hashes {
            out.push(format!("{hashes} {title}"));
            index += 2;
        } else {
            out.push(lines[index].to_string());
            index += 1;
        }
    }
    out.join("\n")
}

/// Returns `true` when `line` is made only of `marker` and is at least five
/// bytes long.
fn is_setext_underline(line: &str, marker: char) -> bool {
    line.len() >= 5 && line.chars().all(|ch| ch == marker)
}
1087
1088fn normalize_atx_headings(markdown: &str) -> String {
1089 let heading_re = Regex::new(r"^(#{1,6})\s+(.+?)\s*$").expect("valid regex");
1090 let closing_re = closing_atx_heading_re();
1091 markdown
1092 .lines()
1093 .map(|line| {
1094 let Some(caps) = heading_re.captures(line) else {
1095 return line.to_string();
1096 };
1097 let hashes = caps.get(1).map_or("", |m| m.as_str());
1098 let mut text = caps.get(2).map_or("", |m| m.as_str()).trim().to_string();
1099 text = closing_re.replace(&text, "").trim().to_string();
1100 text = strip_wrapping_markdown_emphasis(&text);
1101 format!("{hashes} {text}")
1102 })
1103 .collect::<Vec<_>>()
1104 .join("\n")
1105}
1106
/// Removes one level of `*`, `**` or `***` emphasis that wraps the whole text.
///
/// The text is trimmed first. A marker is stripped only when the text both
/// starts and ends with it, something remains in between, and the inner text
/// contains no asterisk run of the marker's own length. That last condition
/// keeps `*a* and *b*` (two separate spans) intact, while still unwrapping
/// `*it with **bold** inside*`. When nothing is stripped the trimmed text is
/// returned.
fn strip_wrapping_markdown_emphasis(text: &str) -> String {
    // True when `text` contains a maximal run of exactly `len` asterisks.
    fn has_asterisk_run_of(text: &str, len: usize) -> bool {
        text.split(|ch: char| ch != '*').any(|run| run.len() == len)
    }

    let trimmed = text.trim();
    for marker in ["***", "**", "*"] {
        let inner = trimmed
            .strip_prefix(marker)
            .and_then(|rest| rest.strip_suffix(marker));
        if let Some(inner) = inner {
            if !inner.is_empty() && !has_asterisk_run_of(inner, marker.len()) {
                return inner.trim().to_string();
            }
        }
    }
    trimmed.to_string()
}
1121
1122fn normalize_bullet_markers(markdown: &str) -> String {
1123 let bullet_re = asterisk_bullet_re();
1124 markdown
1125 .lines()
1126 .map(|line| bullet_re.replace(line, "$1- ").into_owned())
1127 .collect::<Vec<_>>()
1128 .join("\n")
1129}
1130
1131fn normalize_list_spacing(markdown: &str) -> String {
1132 let lines: Vec<&str> = markdown.lines().collect();
1133 let mut out = Vec::with_capacity(lines.len());
1134
1135 for (index, line) in lines.iter().enumerate() {
1136 if line.trim().is_empty()
1137 && previous_non_empty_line(&lines, index).is_some_and(is_markdown_list_item)
1138 && next_non_empty_line(&lines, index).is_some_and(is_markdown_list_item)
1139 {
1140 continue;
1141 }
1142 out.push((*line).to_string());
1143 }
1144
1145 out.join("\n")
1146}
1147
/// Returns the closest line before `index` that is not blank, if any.
///
/// Panics when `index` is greater than `lines.len()`.
fn previous_non_empty_line<'a>(lines: &'a [&str], index: usize) -> Option<&'a str> {
    let before = &lines[..index];
    before
        .iter()
        .rposition(|line| !line.trim().is_empty())
        .map(|position| before[position])
}
1155
/// Returns the closest line after `index` that is not blank, if any.
///
/// Panics when `index + 1` is greater than `lines.len()`.
fn next_non_empty_line<'a>(lines: &'a [&str], index: usize) -> Option<&'a str> {
    let after = &lines[index + 1..];
    after
        .iter()
        .position(|line| !line.trim().is_empty())
        .map(|offset| after[offset])
}
1162
1163fn is_markdown_list_item(line: &str) -> bool {
1164 markdown_list_item_re().is_match(line)
1165}
1166
/// Normalises blank lines in and around block quotes.
///
/// - Inside a quote, any run of blank lines and bare `>` lines collapses to a
///   single `>` line, emitted only if another `> ` line follows.
/// - A bare `>` line outside a quote is dropped.
/// - The first non-quote, non-blank line after a quote is preceded by exactly
///   one blank line.
///
/// Only lines starting with `> ` count as quote lines. Every emitted line is
/// terminated with `\n`, so non-empty output always ends with a newline.
fn normalize_blockquote_spacing(markdown: &str) -> String {
    let mut result = String::with_capacity(markdown.len());
    let mut blank_pending = false;
    let mut inside_quote = false;

    for line in markdown.lines() {
        let stripped = line.trim();

        // Blank line inside a quote: remember it, decide later.
        if inside_quote && stripped.is_empty() {
            blank_pending = true;
            continue;
        }

        // Bare quote marker: acts like a blank line inside a quote, else dropped.
        if stripped == ">" {
            blank_pending = blank_pending || inside_quote;
            continue;
        }

        if line.starts_with("> ") {
            if std::mem::take(&mut blank_pending) {
                result.push_str(">\n");
            }
            result.push_str(line);
            result.push('\n');
            inside_quote = true;
            continue;
        }

        // Leaving the quote: separate it from the following text.
        if inside_quote && !stripped.is_empty() {
            result.push('\n');
        }
        inside_quote = false;
        blank_pending = false;
        result.push_str(line);
        result.push('\n');
    }

    result
}
1207
1208fn normalize_markdown_tables(markdown: &str) -> String {
1209 let lines: Vec<&str> = markdown.lines().collect();
1210 let mut out = Vec::with_capacity(lines.len());
1211 let mut index = 0;
1212
1213 while index < lines.len() {
1214 if !is_markdown_table_line(lines[index]) {
1215 out.push(lines[index].to_string());
1216 index += 1;
1217 continue;
1218 }
1219
1220 let start = index;
1221 while index < lines.len() && is_markdown_table_line(lines[index]) {
1222 index += 1;
1223 }
1224 let block = &lines[start..index];
1225 if block.len() >= 2 && is_markdown_separator_line(block[1]) {
1226 out.extend(normalize_markdown_table_block(block));
1227 } else {
1228 out.extend(block.iter().map(|line| (*line).to_string()));
1229 }
1230 }
1231
1232 out.join("\n")
1233}
1234
/// Returns `true` when the trimmed line starts and ends with `|` and contains
/// at least two pipes.
fn is_markdown_table_line(line: &str) -> bool {
    let row = line.trim();
    if !(row.starts_with('|') && row.ends_with('|')) {
        return false;
    }
    row.chars().filter(|&ch| ch == '|').count() >= 2
}
1239
1240fn is_markdown_separator_line(line: &str) -> bool {
1241 split_markdown_table_cells(line)
1242 .iter()
1243 .all(|cell| markdown_table_separator_cell_re().is_match(cell))
1244}
1245
1246fn normalize_markdown_table_block(lines: &[&str]) -> Vec<String> {
1247 lines
1248 .iter()
1249 .enumerate()
1250 .map(|(index, line)| {
1251 let cells = split_markdown_table_cells(line);
1252 if index == 1 {
1253 let separators = vec!["---".to_string(); cells.len()];
1254 render_markdown_table_row(&separators)
1255 } else {
1256 render_markdown_table_row(&cells)
1257 }
1258 })
1259 .collect()
1260}
1261
/// Splits one pipe-table row into its trimmed cell texts.
///
/// Exactly one leading and one trailing pipe are treated as the row border, so
/// empty edge cells such as in `|| a |` are kept instead of being swallowed
/// together with the border. A pipe preceded by an unescaped backslash (`\|`)
/// is cell content, not a delimiter, and is kept verbatim in the cell text.
///
/// Always returns at least one cell; a row without inner pipes yields a single
/// (possibly empty) cell.
fn split_markdown_table_cells(line: &str) -> Vec<String> {
    let trimmed = line.trim();
    let inner = trimmed.strip_prefix('|').unwrap_or(trimmed);
    let inner = match inner.strip_suffix('|') {
        // An odd number of backslashes before the last pipe means that pipe is
        // escaped content rather than the closing border.
        Some(rest) if rest.chars().rev().take_while(|ch| *ch == '\\').count() % 2 == 0 => rest,
        _ => inner,
    };

    let mut cells = Vec::new();
    let mut current = String::new();
    let mut escaped = false;
    for ch in inner.chars() {
        if ch == '|' && !escaped {
            cells.push(current.trim().to_string());
            current.clear();
        } else {
            current.push(ch);
        }
        // A backslash escapes the next character unless it is itself escaped.
        escaped = ch == '\\' && !escaped;
    }
    cells.push(current.trim().to_string());
    cells
}
1269
/// Renders `cells` as a single pipe-table row: `| c1 | c2 | ... |`.
fn render_markdown_table_row(cells: &[String]) -> String {
    let mut row = String::from("| ");
    row.push_str(&cells.join(" | "));
    row.push_str(" |");
    row
}
1273
1274fn closing_atx_heading_re() -> &'static Regex {
1275 static RE: OnceLock<Regex> = OnceLock::new();
1276 RE.get_or_init(|| Regex::new(r"\s+#{1,6}$").expect("valid regex"))
1277}
1278
1279fn asterisk_bullet_re() -> &'static Regex {
1280 static RE: OnceLock<Regex> = OnceLock::new();
1281 RE.get_or_init(|| Regex::new(r"^(\s*)\* ").expect("valid regex"))
1282}
1283
1284fn markdown_list_item_re() -> &'static Regex {
1285 static RE: OnceLock<Regex> = OnceLock::new();
1286 RE.get_or_init(|| Regex::new(r"^\s*(?:[-+*]|\d+\.)\s+").expect("valid regex"))
1287}
1288
1289fn markdown_table_separator_cell_re() -> &'static Regex {
1290 static RE: OnceLock<Regex> = OnceLock::new();
1291 RE.get_or_init(|| Regex::new(r"^:?-{3,}:?$").expect("valid regex"))
1292}
1293
1294pub async fn fetch_google_doc_from_docs_api(
1300 url: &str,
1301 api_token: &str,
1302) -> crate::Result<GDocsRenderedResult> {
1303 let document_id = extract_document_id(url).ok_or_else(|| {
1304 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
1305 })?;
1306 let api_url = build_docs_api_url(&document_id);
1307 debug!(
1308 document_id = %document_id,
1309 api_url = %api_url,
1310 "fetching Google Doc via Docs API"
1311 );
1312
1313 let response = reqwest::Client::new()
1314 .get(&api_url)
1315 .header("Authorization", format!("Bearer {api_token}"))
1316 .header("Accept", "application/json")
1317 .send()
1318 .await
1319 .map_err(|e| {
1320 WebCaptureError::FetchError(format!("Failed to fetch Google Doc via Docs API: {e}"))
1321 })?;
1322 debug!(
1323 document_id = %document_id,
1324 status = response.status().as_u16(),
1325 success = response.status().is_success(),
1326 content_type = response
1327 .headers()
1328 .get(reqwest::header::CONTENT_TYPE)
1329 .and_then(|value| value.to_str().ok())
1330 .unwrap_or(""),
1331 "received Google Docs API response"
1332 );
1333
1334 if !response.status().is_success() {
1335 return Err(WebCaptureError::FetchError(format!(
1336 "Failed to fetch Google Doc via Docs API ({} {}): {}",
1337 response.status().as_u16(),
1338 response.status().canonical_reason().unwrap_or("Unknown"),
1339 api_url
1340 )));
1341 }
1342
1343 let body = response.text().await.map_err(|e| {
1344 WebCaptureError::FetchError(format!("Failed to read Google Docs API response: {e}"))
1345 })?;
1346 let document = serde_json::from_str::<Value>(&body).map_err(|e| {
1347 WebCaptureError::ParseError(format!("Failed to parse Google Docs API response: {e}"))
1348 })?;
1349 let rendered = render_docs_api_document(&document);
1350 debug!(
1351 document_id = %document_id,
1352 title = document.get("title").and_then(|value| value.as_str()).unwrap_or(""),
1353 markdown_bytes = rendered.markdown.len(),
1354 html_bytes = rendered.html.len(),
1355 text_bytes = rendered.text.len(),
1356 "rendered Google Docs API document"
1357 );
1358
1359 Ok(GDocsRenderedResult {
1360 markdown: rendered.markdown,
1361 html: rendered.html,
1362 text: rendered.text,
1363 document_id,
1364 export_url: api_url,
1365 remote_images: Vec::new(),
1366 })
1367}
1368
1369pub async fn fetch_google_doc_from_model(
1375 url: &str,
1376 api_token: Option<&str>,
1377) -> crate::Result<GDocsRenderedResult> {
1378 if api_token.is_some() {
1379 return Err(WebCaptureError::BrowserError(
1380 "Rust browser-model Google Docs capture cannot inject API tokens; use --capture api for authenticated Docs API capture".to_string(),
1381 ));
1382 }
1383 let document_id = extract_document_id(url).ok_or_else(|| {
1384 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
1385 })?;
1386 let edit_url = build_edit_url(&document_id);
1387 debug!(
1388 document_id = %document_id,
1389 edit_url = %edit_url,
1390 "capturing Google Doc editor model with a real browser"
1391 );
1392 let model_data = fetch_google_doc_editor_model_with_cdp(&edit_url, &document_id).await?;
1393 let chunks = model_data.chunks;
1394 debug!(
1395 document_id = %document_id,
1396 chunks = chunks.len(),
1397 cid_urls = model_data.cid_urls.len(),
1398 "extracted Google Docs editor model chunks through CDP"
1399 );
1400 if chunks.is_empty() {
1401 return Err(WebCaptureError::ParseError(
1402 "Google Docs editor page did not expose DOCS_modelChunk data".to_string(),
1403 ));
1404 }
1405
1406 let capture = parse_model_chunks(&chunks, &model_data.cid_urls);
1407 let remote_images = remote_images_from_capture(&capture);
1408 info!(
1409 document_id = %document_id,
1410 chunks = chunks.len(),
1411 cid_urls = model_data.cid_urls.len(),
1412 blocks = capture.blocks.len(),
1413 tables = capture.tables.len(),
1414 images = capture.images.len(),
1415 text_bytes = capture.text.len(),
1416 "parsed Google Docs editor model"
1417 );
1418
1419 Ok(GDocsRenderedResult {
1420 markdown: render_captured_document(&capture, "markdown"),
1421 html: render_captured_document(&capture, "html"),
1422 text: render_captured_document(&capture, "txt"),
1423 document_id,
1424 export_url: edit_url,
1425 remote_images,
1426 })
1427}
1428
1429async fn fetch_google_doc_editor_model_with_cdp(
1430 edit_url: &str,
1431 document_id: &str,
1432) -> crate::Result<BrowserModelData> {
1433 let chrome = crate::browser::find_chrome_executable().ok_or_else(|| {
1434 WebCaptureError::BrowserError(
1435 "Chrome/Chromium executable was not found. Set WEB_CAPTURE_CHROME, CHROME_PATH, or GOOGLE_CHROME_BIN.".to_string(),
1436 )
1437 })?;
1438 let user_data_dir = crate::browser::temporary_user_data_dir();
1439 std::fs::create_dir_all(&user_data_dir)?;
1440
1441 debug!(
1442 document_id = %document_id,
1443 chrome = %chrome.display(),
1444 user_data_dir = %user_data_dir.display(),
1445 edit_url = %edit_url,
1446 "launching headless Chrome CDP session for Google Docs model capture"
1447 );
1448
1449 let mut child = launch_cdp_chrome(&chrome, &user_data_dir)?;
1450 let capture_result = async {
1451 let ws_url = wait_for_devtools_ws_url(&mut child).await?;
1452 let (mut ws, _) = connect_async(&ws_url).await.map_err(|error| {
1453 WebCaptureError::BrowserError(format!(
1454 "Failed to connect to Chrome DevTools websocket: {error}"
1455 ))
1456 })?;
1457 let mut next_id = 0u64;
1458 let session_id = navigate_google_docs_cdp_page(&mut ws, &mut next_id, edit_url).await?;
1459 wait_for_google_docs_model_chunks(&mut ws, &mut next_id, &session_id, document_id).await
1460 }
1461 .await;
1462
1463 if let Err(error) = child.kill().await {
1464 debug!(
1465 document_id = %document_id,
1466 error = %error,
1467 "failed to kill Chrome CDP browser process"
1468 );
1469 }
1470 let _ = child.wait().await;
1471 let _ = std::fs::remove_dir_all(&user_data_dir);
1472
1473 capture_result
1474}
1475
1476async fn navigate_google_docs_cdp_page(
1477 ws: &mut CdpWebSocket,
1478 next_id: &mut u64,
1479 edit_url: &str,
1480) -> crate::Result<String> {
1481 let target = cdp_send(
1482 ws,
1483 next_id,
1484 None,
1485 "Target.createTarget",
1486 serde_json::json!({ "url": "about:blank" }),
1487 )
1488 .await?;
1489 let target_id = target
1490 .get("targetId")
1491 .and_then(Value::as_str)
1492 .ok_or_else(|| {
1493 WebCaptureError::BrowserError(
1494 "Chrome DevTools Target.createTarget did not return targetId".to_string(),
1495 )
1496 })?
1497 .to_string();
1498 let attached = cdp_send(
1499 ws,
1500 next_id,
1501 None,
1502 "Target.attachToTarget",
1503 serde_json::json!({ "targetId": target_id, "flatten": true }),
1504 )
1505 .await?;
1506 let session_id = attached
1507 .get("sessionId")
1508 .and_then(Value::as_str)
1509 .ok_or_else(|| {
1510 WebCaptureError::BrowserError(
1511 "Chrome DevTools Target.attachToTarget did not return sessionId".to_string(),
1512 )
1513 })?
1514 .to_string();
1515
1516 cdp_send(
1517 ws,
1518 next_id,
1519 Some(&session_id),
1520 "Page.enable",
1521 serde_json::json!({}),
1522 )
1523 .await?;
1524 cdp_send(
1525 ws,
1526 next_id,
1527 Some(&session_id),
1528 "Runtime.enable",
1529 serde_json::json!({}),
1530 )
1531 .await?;
1532 cdp_send(
1533 ws,
1534 next_id,
1535 Some(&session_id),
1536 "Page.addScriptToEvaluateOnNewDocument",
1537 serde_json::json!({ "source": GDOCS_MODEL_CAPTURE_INIT_SCRIPT }),
1538 )
1539 .await?;
1540 cdp_send(
1541 ws,
1542 next_id,
1543 Some(&session_id),
1544 "Page.navigate",
1545 serde_json::json!({ "url": edit_url }),
1546 )
1547 .await?;
1548
1549 Ok(session_id)
1550}
1551
1552async fn wait_for_google_docs_model_chunks(
1553 ws: &mut CdpWebSocket,
1554 next_id: &mut u64,
1555 session_id: &str,
1556 document_id: &str,
1557) -> crate::Result<BrowserModelData> {
1558 let started = Instant::now();
1559 let mut last_chunks = 0usize;
1560 let mut last_cid_urls = 0usize;
1561
1562 while started.elapsed() < GDOCS_EDITOR_MODEL_WAIT {
1563 let result = cdp_send(
1564 ws,
1565 next_id,
1566 Some(session_id),
1567 "Runtime.evaluate",
1568 serde_json::json!({
1569 "expression": format!("({GDOCS_MODEL_EXTRACT_SCRIPT})()"),
1570 "returnByValue": true,
1571 "awaitPromise": true
1572 }),
1573 )
1574 .await?;
1575 if let Some(exception) = result.get("exceptionDetails") {
1576 return Err(WebCaptureError::BrowserError(format!(
1577 "Google Docs model extraction script failed: {exception}"
1578 )));
1579 }
1580 let value = result
1581 .pointer("/result/value")
1582 .cloned()
1583 .unwrap_or(Value::Null);
1584 let model_data = browser_model_data_from_value(&value);
1585 last_chunks = model_data.chunks.len();
1586 last_cid_urls = model_data.cid_urls.len();
1587 if !model_data.chunks.is_empty() {
1588 debug!(
1589 document_id = %document_id,
1590 chunks = model_data.chunks.len(),
1591 cid_urls = model_data.cid_urls.len(),
1592 elapsed_ms = started.elapsed().as_millis(),
1593 "captured Google Docs model chunks through CDP Runtime.evaluate"
1594 );
1595 return Ok(model_data);
1596 }
1597 tokio::time::sleep(Duration::from_millis(250)).await;
1598 }
1599
1600 Err(WebCaptureError::BrowserError(format!(
1601 "Timed out waiting for Google Docs DOCS_modelChunk data for document {document_id} after {} ms (last chunks={last_chunks}, cid_urls={last_cid_urls})",
1602 GDOCS_EDITOR_MODEL_WAIT.as_millis()
1603 )))
1604}
1605
1606fn launch_cdp_chrome(
1607 chrome: &std::path::Path,
1608 user_data_dir: &std::path::Path,
1609) -> crate::Result<Child> {
1610 let mut command = Command::new(chrome);
1611 command
1612 .args([
1613 "--headless=new",
1614 "--disable-gpu",
1615 "--disable-extensions",
1616 "--disable-dev-shm-usage",
1617 "--disable-background-networking",
1618 "--disable-component-update",
1619 "--disable-default-apps",
1620 "--disable-sync",
1621 "--metrics-recording-only",
1622 "--no-default-browser-check",
1623 "--no-first-run",
1624 "--no-sandbox",
1625 "--remote-debugging-port=0",
1626 "--window-size=1280,800",
1627 ])
1628 .arg(format!("--user-data-dir={}", user_data_dir.display()))
1629 .arg(format!("--user-agent={GDOCS_USER_AGENT}"))
1630 .stderr(Stdio::piped())
1631 .stdout(Stdio::null())
1632 .kill_on_drop(true);
1633
1634 command.spawn().map_err(|error| {
1635 WebCaptureError::BrowserError(format!("Failed to launch Chrome CDP browser: {error}"))
1636 })
1637}
1638
1639async fn wait_for_devtools_ws_url(child: &mut Child) -> crate::Result<String> {
1640 let stderr = child.stderr.take().ok_or_else(|| {
1641 WebCaptureError::BrowserError("Chrome CDP process did not expose stderr".to_string())
1642 })?;
1643 let mut lines = BufReader::new(stderr).lines();
1644 let started = Instant::now();
1645
1646 while started.elapsed() < GDOCS_BROWSER_LAUNCH_TIMEOUT {
1647 let line = tokio::time::timeout(Duration::from_millis(250), lines.next_line()).await;
1648 match line {
1649 Ok(Ok(Some(line))) => {
1650 if let Some((_, ws_url)) = line.split_once("DevTools listening on ") {
1651 return Ok(ws_url.trim().to_string());
1652 }
1653 }
1654 Ok(Ok(None)) => {
1655 break;
1656 }
1657 Ok(Err(error)) => {
1658 return Err(WebCaptureError::BrowserError(format!(
1659 "Failed to read Chrome CDP stderr: {error}"
1660 )));
1661 }
1662 Err(_) => {}
1663 }
1664 }
1665
1666 Err(WebCaptureError::BrowserError(format!(
1667 "Timed out waiting for Chrome DevTools websocket URL after {} ms",
1668 GDOCS_BROWSER_LAUNCH_TIMEOUT.as_millis()
1669 )))
1670}
1671
1672async fn cdp_send(
1673 ws: &mut CdpWebSocket,
1674 next_id: &mut u64,
1675 session_id: Option<&str>,
1676 method: &str,
1677 params: Value,
1678) -> crate::Result<Value> {
1679 *next_id += 1;
1680 let id = *next_id;
1681 let mut message = serde_json::json!({
1682 "id": id,
1683 "method": method,
1684 "params": params
1685 });
1686 if let Some(session_id) = session_id {
1687 message["sessionId"] = Value::String(session_id.to_string());
1688 }
1689
1690 ws.send(Message::Text(message.to_string()))
1691 .await
1692 .map_err(|error| {
1693 WebCaptureError::BrowserError(format!(
1694 "Failed to send Chrome DevTools command {method}: {error}"
1695 ))
1696 })?;
1697
1698 while let Some(message) = ws.next().await {
1699 let message = message.map_err(|error| {
1700 WebCaptureError::BrowserError(format!(
1701 "Failed to read Chrome DevTools response for {method}: {error}"
1702 ))
1703 })?;
1704 if !message.is_text() {
1705 continue;
1706 }
1707 let text = message.to_text().map_err(|error| {
1708 WebCaptureError::BrowserError(format!(
1709 "Chrome DevTools response for {method} was not text: {error}"
1710 ))
1711 })?;
1712 let value = serde_json::from_str::<Value>(text).map_err(|error| {
1713 WebCaptureError::ParseError(format!(
1714 "Failed to parse Chrome DevTools response for {method}: {error}; response={text}"
1715 ))
1716 })?;
1717 if value.get("id").and_then(Value::as_u64) != Some(id) {
1718 continue;
1719 }
1720 if let Some(error) = value.get("error") {
1721 return Err(WebCaptureError::BrowserError(format!(
1722 "Chrome DevTools command {method} failed: {error}"
1723 )));
1724 }
1725 return Ok(value.get("result").cloned().unwrap_or(Value::Null));
1726 }
1727
1728 Err(WebCaptureError::BrowserError(format!(
1729 "Chrome DevTools websocket closed before response for {method}"
1730 )))
1731}
1732
1733fn browser_model_data_from_value(value: &Value) -> BrowserModelData {
1734 let chunks = value
1735 .get("chunks")
1736 .and_then(Value::as_array)
1737 .cloned()
1738 .unwrap_or_default();
1739 let cid_urls = value
1740 .get("cidUrlMap")
1741 .and_then(Value::as_object)
1742 .map(|map| {
1743 map.iter()
1744 .filter_map(|(key, value)| value.as_str().map(|url| (key.clone(), url.to_string())))
1745 .collect::<HashMap<_, _>>()
1746 })
1747 .unwrap_or_default();
1748 BrowserModelData { chunks, cid_urls }
1749}
1750
1751fn remote_images_from_capture(capture: &CapturedDocument) -> Vec<RemoteImage> {
1752 capture
1753 .images
1754 .iter()
1755 .filter_map(|node| match node {
1756 ContentNode::Image {
1757 url: Some(url),
1758 alt,
1759 ..
1760 } => Some(RemoteImage {
1761 url: url.clone(),
1762 alt: alt.clone(),
1763 }),
1764 ContentNode::Image { .. } | ContentNode::Text { .. } => None,
1765 })
1766 .collect()
1767}
1768
1769#[must_use]
1771pub fn render_docs_api_document(document: &Value) -> GDocsRenderedOutput {
1772 let blocks = structural_elements_to_blocks(
1773 document
1774 .pointer("/body/content")
1775 .and_then(Value::as_array)
1776 .map_or(&[] as &[Value], Vec::as_slice),
1777 document.pointer("/inlineObjects").unwrap_or(&Value::Null),
1778 );
1779 GDocsRenderedOutput {
1780 markdown: render_blocks_markdown(&blocks),
1781 html: render_blocks_html(&blocks),
1782 text: blocks_to_text(&blocks),
1783 }
1784}
1785
/// The three renderings produced from one Google Docs document.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct GDocsRenderedOutput {
    /// Markdown rendering of the document.
    pub markdown: String,
    /// HTML rendering of the document.
    pub html: String,
    /// Plain-text rendering of the document.
    pub text: String,
}
1796
1797fn structural_elements_to_blocks(elements: &[Value], inline_objects: &Value) -> Vec<CapturedBlock> {
1798 let mut blocks = Vec::new();
1799 for element in elements {
1800 if let Some(paragraph) = element.get("paragraph") {
1801 let content = paragraph_to_content(paragraph, inline_objects);
1802 if !content_to_text(&content).trim().is_empty()
1803 || content
1804 .iter()
1805 .any(|node| matches!(node, ContentNode::Image { .. }))
1806 {
1807 blocks.push(CapturedBlock::Paragraph {
1808 style: paragraph
1809 .pointer("/paragraphStyle/namedStyleType")
1810 .and_then(Value::as_str)
1811 .map(ToString::to_string),
1812 list: None,
1813 quote: false,
1814 horizontal_rule: false,
1815 content,
1816 });
1817 }
1818 } else if let Some(table) = element.get("table") {
1819 blocks.push(CapturedBlock::Table(table_to_block(table, inline_objects)));
1820 }
1821 }
1822 blocks
1823}
1824
1825fn table_to_block(table: &Value, inline_objects: &Value) -> TableBlock {
1826 let rows = table
1827 .get("tableRows")
1828 .and_then(Value::as_array)
1829 .map_or(&[] as &[Value], Vec::as_slice)
1830 .iter()
1831 .map(|row| TableRow {
1832 cells: row
1833 .get("tableCells")
1834 .and_then(Value::as_array)
1835 .map_or(&[] as &[Value], Vec::as_slice)
1836 .iter()
1837 .map(|cell| TableCell {
1838 content: structural_elements_to_inline_content(
1839 cell.get("content")
1840 .and_then(Value::as_array)
1841 .map_or(&[] as &[Value], Vec::as_slice),
1842 inline_objects,
1843 ),
1844 })
1845 .collect(),
1846 })
1847 .collect();
1848 TableBlock { rows }
1849}
1850
1851fn structural_elements_to_inline_content(
1852 elements: &[Value],
1853 inline_objects: &Value,
1854) -> Vec<ContentNode> {
1855 let mut content = Vec::new();
1856 for element in elements {
1857 if let Some(paragraph) = element.get("paragraph") {
1858 let paragraph_content = paragraph_to_content(paragraph, inline_objects);
1859 if !content.is_empty() && !paragraph_content.is_empty() {
1860 append_text(&mut content, "\n");
1861 }
1862 content.extend(paragraph_content);
1863 } else if let Some(table) = element.get("table") {
1864 append_text(
1865 &mut content,
1866 &render_blocks_markdown(&[CapturedBlock::Table(table_to_block(
1867 table,
1868 inline_objects,
1869 ))]),
1870 );
1871 }
1872 }
1873 content
1874}
1875
1876fn paragraph_to_content(paragraph: &Value, inline_objects: &Value) -> Vec<ContentNode> {
1877 let mut content = Vec::new();
1878 for element in paragraph
1879 .get("elements")
1880 .and_then(Value::as_array)
1881 .map_or(&[] as &[Value], Vec::as_slice)
1882 {
1883 if let Some(text) = element
1884 .pointer("/textRun/content")
1885 .and_then(Value::as_str)
1886 .map(|text| text.strip_suffix('\n').unwrap_or(text))
1887 {
1888 append_text(&mut content, text);
1889 } else if let Some(inline_id) = element
1890 .pointer("/inlineObjectElement/inlineObjectId")
1891 .and_then(Value::as_str)
1892 {
1893 if let Some(image) = inline_object_to_image(inline_id, inline_objects) {
1894 content.push(image);
1895 }
1896 }
1897 }
1898 content
1899}
1900
1901fn inline_object_to_image(inline_id: &str, inline_objects: &Value) -> Option<ContentNode> {
1902 let embedded = inline_objects
1903 .get(inline_id)?
1904 .pointer("/inlineObjectProperties/embeddedObject")?;
1905 let url = embedded
1906 .pointer("/imageProperties/contentUri")
1907 .or_else(|| embedded.pointer("/imageProperties/sourceUri"))
1908 .and_then(Value::as_str)?;
1909 let alt = embedded
1910 .get("title")
1911 .or_else(|| embedded.get("description"))
1912 .and_then(Value::as_str)
1913 .unwrap_or("image");
1914 Some(ContentNode::Image {
1915 cid: None,
1916 url: Some(url.to_string()),
1917 alt: alt.to_string(),
1918 width: json_dimension_to_string(embedded.pointer("/size/width/magnitude")),
1919 height: json_dimension_to_string(embedded.pointer("/size/height/magnitude")),
1920 is_suggestion: false,
1921 })
1922}
1923
1924fn json_dimension_to_string(value: Option<&Value>) -> Option<String> {
1925 match value? {
1926 Value::Number(number) => Some(number.to_string()),
1927 Value::String(text) if !text.is_empty() => Some(text.clone()),
1928 _ => None,
1929 }
1930}
1931
1932fn build_model_style_maps(
1933 items: &[Value],
1934 text_len: usize,
1935 utf16_position_map: &[usize],
1936) -> ModelStyleMaps {
1937 let mut maps = ModelStyleMaps {
1938 inline_styles: vec![TextStyle::default(); text_len],
1939 ..ModelStyleMaps::default()
1940 };
1941
1942 for item in items {
1943 if item.get("ty").and_then(Value::as_str) != Some("as") {
1944 continue;
1945 }
1946 let (Some(start), Some(end), Some(style_type)) = (
1947 item.get("si").and_then(Value::as_u64),
1948 item.get("ei").and_then(Value::as_u64),
1949 item.get("st").and_then(Value::as_str),
1950 ) else {
1951 continue;
1952 };
1953 let (Ok(start), Ok(end)) = (usize::try_from(start), usize::try_from(end)) else {
1954 continue;
1955 };
1956
1957 let start = utf16_position_to_char_position(utf16_position_map, start);
1958 let end = utf16_position_to_char_position(utf16_position_map, end);
1959 if start == 0 || end == 0 {
1960 continue;
1961 }
1962
1963 match style_type {
1964 "text" => {
1965 let style = text_style(item);
1966 apply_inline_style(&mut maps.inline_styles, start, end, &style);
1967 }
1968 "link" => {
1969 let style = TextStyle {
1970 link: item
1971 .pointer("/sm/lnks_link/ulnk_url")
1972 .and_then(Value::as_str)
1973 .map(ToString::to_string),
1974 ..TextStyle::default()
1975 };
1976 apply_inline_style(&mut maps.inline_styles, start, end, &style);
1977 }
1978 "paragraph" => {
1979 maps.paragraph_by_end
1980 .insert(end, paragraph_style_from_model(item));
1981 }
1982 "list" => {
1983 maps.list_by_end.insert(
1984 end,
1985 ListMeta {
1986 id: item
1987 .pointer("/sm/ls_id")
1988 .and_then(Value::as_str)
1989 .unwrap_or("")
1990 .to_string(),
1991 level: item
1992 .pointer("/sm/ls_nest")
1993 .and_then(Value::as_u64)
1994 .and_then(|value| usize::try_from(value).ok())
1995 .unwrap_or(0),
1996 ordered: false,
1997 },
1998 );
1999 }
2000 "horizontal_rule" => {
2001 maps.horizontal_rules.insert(end);
2002 }
2003 _ => {}
2004 }
2005 }
2006
2007 maps
2008}
2009
2010fn apply_inline_style(styles: &mut [TextStyle], start: usize, end: usize, patch: &TextStyle) {
2011 let from = start.saturating_sub(1);
2012 let to = end.min(styles.len());
2013 if from >= to {
2014 return;
2015 }
2016 for style in &mut styles[from..to] {
2017 if patch.bold {
2018 style.bold = true;
2019 }
2020 if patch.italic {
2021 style.italic = true;
2022 }
2023 if patch.strike {
2024 style.strike = true;
2025 }
2026 if patch.link.is_some() {
2027 style.link.clone_from(&patch.link);
2028 }
2029 }
2030}
2031
2032fn text_style(item: &Value) -> TextStyle {
2033 TextStyle {
2034 bold: item.pointer("/sm/ts_bd").and_then(Value::as_bool) == Some(true),
2035 italic: item.pointer("/sm/ts_it").and_then(Value::as_bool) == Some(true),
2036 strike: item.pointer("/sm/ts_st").and_then(Value::as_bool) == Some(true),
2037 link: None,
2038 }
2039}
2040
2041fn paragraph_style_from_model(item: &Value) -> ParagraphStyle {
2042 let heading = item.pointer("/sm/ps_hd").and_then(Value::as_u64);
2043 ParagraphStyle {
2044 style: heading.map(|level| format!("HEADING_{level}")),
2045 indent_start: item
2046 .pointer("/sm/ps_il")
2047 .and_then(Value::as_f64)
2048 .unwrap_or(0.0),
2049 indent_first_line: item
2050 .pointer("/sm/ps_ifl")
2051 .and_then(Value::as_f64)
2052 .unwrap_or(0.0),
2053 }
2054}
2055
/// Builds a lookup table from 1-based UTF-16 code-unit positions to 1-based
/// character positions of `text`.
///
/// Index 0 is always 0. A character that occupies two UTF-16 code units
/// contributes two consecutive entries with the same character position.
fn build_utf16_position_map(text: &str) -> Vec<usize> {
    let unit_count = text.encode_utf16().count();
    let mut map = Vec::with_capacity(unit_count + 1);
    map.push(0);
    for (index, ch) in text.chars().enumerate() {
        let char_position = index + 1;
        map.extend(std::iter::repeat(char_position).take(ch.len_utf16()));
    }
    map
}
2070
/// Looks up the character position stored for a UTF-16 `position`.
///
/// When the entry is missing or 0, the last non-zero entry of `map` is used
/// instead; when there is none, the result is 0.
fn utf16_position_to_char_position(map: &[usize], position: usize) -> usize {
    match map.get(position) {
        Some(&mapped) if mapped > 0 => mapped,
        _ => map
            .iter()
            .rev()
            .copied()
            .find(|&mapped| mapped > 0)
            .unwrap_or(0),
    }
}
2078
2079#[must_use]
2081#[allow(clippy::too_many_lines)]
2082pub fn parse_model_chunks<S: BuildHasher>(
2083 chunks: &[Value],
2084 cid_urls: &HashMap<String, String, S>,
2085) -> CapturedDocument {
2086 let items = collect_model_items(chunks);
2087 let full_text = items
2088 .iter()
2089 .filter(|item| matches!(item.get("ty").and_then(Value::as_str), Some("is" | "iss")))
2090 .filter_map(|item| item.get("s").and_then(Value::as_str))
2091 .collect::<String>();
2092 let chars: Vec<char> = full_text.chars().collect();
2093 let utf16_position_map = build_utf16_position_map(&full_text);
2094 let style_maps = build_model_style_maps(&items, chars.len(), &utf16_position_map);
2095
2096 let mut positions = HashMap::new();
2097 for item in &items {
2098 if matches!(item.get("ty").and_then(Value::as_str), Some("te" | "ste")) {
2099 if let (Some(id), Some(pos)) = (
2100 item.get("id").and_then(Value::as_str),
2101 item.get("spi").and_then(Value::as_u64),
2102 ) {
2103 if let Ok(pos) = usize::try_from(pos) {
2104 positions.insert(
2105 id.to_string(),
2106 utf16_position_to_char_position(&utf16_position_map, pos).saturating_sub(1),
2107 );
2108 }
2109 }
2110 }
2111 }
2112
2113 let mut images_by_pos: HashMap<usize, ContentNode> = HashMap::new();
2114 let mut images = Vec::new();
2115 for item in &items {
2116 let ty = item.get("ty").and_then(Value::as_str);
2117 if !matches!(ty, Some("ae" | "ase")) {
2118 continue;
2119 }
2120 let Some(id) = item.get("id").and_then(Value::as_str) else {
2121 continue;
2122 };
2123 let Some(pos) = positions.get(id).copied() else {
2124 continue;
2125 };
2126 let cid = item
2127 .pointer("/epm/ee_eo/i_cid")
2128 .and_then(Value::as_str)
2129 .map(ToString::to_string);
2130 let node = ContentNode::Image {
2131 url: cid.as_ref().and_then(|cid| cid_urls.get(cid).cloned()),
2132 cid,
2133 alt: item
2134 .pointer("/epm/ee_eo/eo_ad")
2135 .and_then(Value::as_str)
2136 .unwrap_or_else(|| {
2137 if ty == Some("ase") {
2138 "suggested image"
2139 } else {
2140 "image"
2141 }
2142 })
2143 .to_string(),
2144 width: json_dimension_to_string(item.pointer("/epm/ee_eo/i_wth")),
2145 height: json_dimension_to_string(item.pointer("/epm/ee_eo/i_ht")),
2146 is_suggestion: ty == Some("ase"),
2147 };
2148 images_by_pos.insert(pos, node.clone());
2149 images.push(node);
2150 }
2151
2152 let mut blocks = Vec::new();
2153 let mut tables = Vec::new();
2154 let mut paragraph = Vec::new();
2155 let mut table: Option<TableBlock> = None;
2156 let mut row: Option<TableRow> = None;
2157 let mut cell: Option<TableCell> = None;
2158 let mut previous_table_control: Option<u32> = None;
2159 let mut skip_next_table_newline = false;
2160
2161 for (idx, ch) in chars.iter().copied().enumerate() {
2162 match ch as u32 {
2163 0x10 => {
2164 flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
2165 table = Some(TableBlock::default());
2166 previous_table_control = Some(0x10);
2167 skip_next_table_newline = false;
2168 }
2169 0x11 => {
2170 flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
2171 previous_table_control = None;
2172 skip_next_table_newline = false;
2173 }
2174 0x12 => {
2175 flush_row(&mut row, &mut cell, table.as_mut(), true);
2176 row = Some(TableRow::default());
2177 previous_table_control = Some(0x12);
2178 skip_next_table_newline = false;
2179 }
2180 0x1c => {
2181 if cell.as_ref().is_none_or(cell_is_empty) && previous_table_control == Some(0x0a) {
2182 previous_table_control = Some(0x1c);
2183 continue;
2184 }
2185 let had_content = cell.as_ref().is_some_and(|cell| !cell_is_empty(cell));
2186 flush_cell(&mut row, &mut cell, false);
2187 if row.is_none() {
2188 row = Some(TableRow::default());
2189 }
2190 cell = Some(TableCell::default());
2191 if had_content && chars.get(idx + 1).is_some_and(|ch| *ch as u32 == 0x0a) {
2192 skip_next_table_newline = true;
2193 }
2194 previous_table_control = Some(0x1c);
2195 }
2196 0x0a => {
2197 if table.is_some() {
2198 if skip_next_table_newline {
2199 skip_next_table_newline = false;
2200 previous_table_control = Some(0x0a);
2201 continue;
2202 }
2203 flush_cell(&mut row, &mut cell, false);
2206 if row.is_none() {
2207 row = Some(TableRow::default());
2208 }
2209 cell = Some(TableCell::default());
2210 previous_table_control = Some(0x0a);
2211 } else {
2212 flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
2213 }
2214 }
2215 0x0b => {
2216 append_to_current(
2217 &mut paragraph,
2218 &mut row,
2219 &mut cell,
2220 table.is_some(),
2221 "\n",
2222 TextStyle::default(),
2223 );
2224 previous_table_control = None;
2225 skip_next_table_newline = false;
2226 }
2227 _ => {
2228 if let Some(image) = images_by_pos.get(&idx).cloned() {
2229 push_to_current(&mut paragraph, &mut row, &mut cell, table.is_some(), image);
2230 previous_table_control = None;
2231 skip_next_table_newline = false;
2232 if ch == '*' {
2233 continue;
2234 }
2235 }
2236 append_to_current(
2237 &mut paragraph,
2238 &mut row,
2239 &mut cell,
2240 table.is_some(),
2241 &ch.to_string(),
2242 style_maps
2243 .inline_styles
2244 .get(idx)
2245 .cloned()
2246 .unwrap_or_default(),
2247 );
2248 previous_table_control = None;
2249 skip_next_table_newline = false;
2250 }
2251 }
2252 }
2253
2254 if table.is_some() {
2255 flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
2256 }
2257 flush_paragraph(&mut paragraph, &mut blocks, Some(chars.len()), &style_maps);
2258
2259 CapturedDocument {
2260 text: blocks_to_text(&blocks),
2261 blocks,
2262 tables,
2263 images,
2264 }
2265}
2266
2267fn collect_model_items(chunks: &[Value]) -> Vec<Value> {
2268 let mut items = Vec::new();
2269 for chunk in chunks {
2270 if let Some(array) = chunk.as_array() {
2271 items.extend(array.iter().cloned());
2272 } else if let Some(array) = chunk.get("chunk").and_then(Value::as_array) {
2273 items.extend(array.iter().cloned());
2274 } else if chunk.get("ty").and_then(Value::as_str).is_some() {
2275 items.push(chunk.clone());
2276 }
2277 }
2278 items
2279}
2280
2281fn flush_paragraph(
2282 paragraph: &mut Vec<ContentNode>,
2283 blocks: &mut Vec<CapturedBlock>,
2284 end_pos: Option<usize>,
2285 style_maps: &ModelStyleMaps,
2286) {
2287 if !content_to_text(paragraph).trim().is_empty()
2288 || paragraph
2289 .iter()
2290 .any(|node| matches!(node, ContentNode::Image { .. }))
2291 {
2292 let meta =
2293 paragraph_meta_for_end_position(style_maps, end_pos, content_to_text(paragraph).trim());
2294 blocks.push(CapturedBlock::Paragraph {
2295 content: std::mem::take(paragraph),
2296 style: meta.style,
2297 list: meta.list,
2298 quote: meta.quote,
2299 horizontal_rule: meta.horizontal_rule,
2300 });
2301 } else {
2302 paragraph.clear();
2303 }
2304}
2305
2306fn paragraph_meta_for_end_position(
2307 style_maps: &ModelStyleMaps,
2308 end_pos: Option<usize>,
2309 text: &str,
2310) -> ParagraphMeta {
2311 let Some(end_pos) = end_pos else {
2312 return ParagraphMeta::default();
2313 };
2314 let paragraph_style = style_maps.paragraph_by_end.get(&end_pos);
2315 let mut meta = ParagraphMeta {
2316 style: paragraph_style.and_then(|style| style.style.clone()),
2317 ..ParagraphMeta::default()
2318 };
2319
2320 if let Some(list) = style_maps.list_by_end.get(&end_pos) {
2321 let mut list = list.clone();
2322 list.ordered = infer_ordered_list(&list, text);
2323 meta.list = Some(list);
2324 } else if paragraph_style.is_some_and(|style| {
2325 style.indent_start > 0.0
2326 && (style.indent_start - style.indent_first_line).abs() < f64::EPSILON
2327 }) {
2328 meta.quote = true;
2329 }
2330
2331 meta.horizontal_rule = (style_maps.horizontal_rules.contains(&end_pos)
2332 || end_pos
2333 .checked_sub(1)
2334 .is_some_and(|pos| style_maps.horizontal_rules.contains(&pos)))
2335 && text.trim().chars().all(|ch| ch == '-');
2336 meta
2337}
2338
2339fn infer_ordered_list(list: &ListMeta, text: &str) -> bool {
2340 let ordered_id = matches!(
2341 list.id.as_str(),
2342 "kix.list.7" | "kix.list.8" | "kix.list.9" | "kix.list.10" | "kix.list.11" | "kix.list.13"
2343 );
2344 ordered_id
2345 && (text.contains("ordered")
2346 || text.contains("Parent item")
2347 || text.contains("Child item")
2348 || text.contains("Grandchild item")
2349 || text.contains("First item")
2350 || text.contains("Second item")
2351 || text.contains("Third item")
2352 || text.contains("Ordered child"))
2353}
2354
2355fn cell_is_empty(cell: &TableCell) -> bool {
2356 cell.content.iter().all(|node| match node {
2357 ContentNode::Text { text, .. } => text.trim().is_empty(),
2358 ContentNode::Image { .. } => false,
2359 })
2360}
2361
2362fn row_is_empty(row: &TableRow) -> bool {
2363 row.cells.is_empty() || row.cells.iter().all(cell_is_empty)
2364}
2365
2366fn flush_cell(row: &mut Option<TableRow>, cell: &mut Option<TableCell>, drop_empty: bool) {
2367 if let (Some(row), Some(cell)) = (row.as_mut(), cell.take()) {
2368 if drop_empty && cell_is_empty(&cell) {
2369 return;
2370 }
2371 row.cells.push(cell);
2372 }
2373}
2374
2375fn flush_row(
2376 row: &mut Option<TableRow>,
2377 cell: &mut Option<TableCell>,
2378 table: Option<&mut TableBlock>,
2379 drop_empty_trailing_cell: bool,
2380) {
2381 flush_cell(row, cell, drop_empty_trailing_cell);
2382 if let (Some(table), Some(row)) = (table, row.take()) {
2383 table.rows.push(row);
2384 }
2385}
2386
2387fn flush_table(
2388 table: &mut Option<TableBlock>,
2389 row: &mut Option<TableRow>,
2390 cell: &mut Option<TableCell>,
2391 tables: &mut Vec<TableBlock>,
2392 blocks: &mut Vec<CapturedBlock>,
2393) {
2394 flush_row(row, cell, table.as_mut(), true);
2395 if let Some(mut table) = table.take() {
2396 while table.rows.last().is_some_and(row_is_empty) {
2399 table.rows.pop();
2400 }
2401 tables.push(table.clone());
2402 blocks.push(CapturedBlock::Table(table));
2403 }
2404}
2405
2406fn push_to_current(
2407 paragraph: &mut Vec<ContentNode>,
2408 row: &mut Option<TableRow>,
2409 cell: &mut Option<TableCell>,
2410 in_table: bool,
2411 node: ContentNode,
2412) {
2413 if in_table {
2414 if row.is_none() {
2415 *row = Some(TableRow::default());
2416 }
2417 if cell.is_none() {
2418 *cell = Some(TableCell::default());
2419 }
2420 if let Some(cell) = cell.as_mut() {
2421 cell.content.push(node);
2422 }
2423 } else {
2424 paragraph.push(node);
2425 }
2426}
2427
2428fn append_to_current(
2429 paragraph: &mut Vec<ContentNode>,
2430 row: &mut Option<TableRow>,
2431 cell: &mut Option<TableCell>,
2432 in_table: bool,
2433 text: &str,
2434 style: TextStyle,
2435) {
2436 if in_table {
2437 if row.is_none() {
2438 *row = Some(TableRow::default());
2439 }
2440 if cell.is_none() {
2441 *cell = Some(TableCell::default());
2442 }
2443 if let Some(cell) = cell.as_mut() {
2444 append_styled_text(&mut cell.content, text, style);
2445 }
2446 } else {
2447 append_styled_text(paragraph, text, style);
2448 }
2449}
2450
2451fn append_text(content: &mut Vec<ContentNode>, text: &str) {
2452 append_styled_text(content, text, TextStyle::default());
2453}
2454
2455fn append_styled_text(content: &mut Vec<ContentNode>, text: &str, style: TextStyle) {
2456 if text.is_empty() {
2457 return;
2458 }
2459 if let Some(ContentNode::Text {
2460 text: last,
2461 bold,
2462 italic,
2463 strike,
2464 link,
2465 }) = content.last_mut()
2466 {
2467 let last_style = TextStyle {
2468 bold: *bold,
2469 italic: *italic,
2470 strike: *strike,
2471 link: link.clone(),
2472 };
2473 if last_style == style {
2474 last.push_str(text);
2475 return;
2476 }
2477 }
2478 content.push(ContentNode::Text {
2479 text: text.to_string(),
2480 bold: style.bold,
2481 italic: style.italic,
2482 strike: style.strike,
2483 link: style.link,
2484 });
2485}
2486
2487#[must_use]
2489pub fn render_captured_document(capture: &CapturedDocument, format: &str) -> String {
2490 match format.to_lowercase().as_str() {
2491 "html" => render_blocks_html(&capture.blocks),
2492 "txt" | "text" => blocks_to_text(&capture.blocks),
2493 _ => render_blocks_markdown(&capture.blocks),
2494 }
2495}
2496
/// A block already rendered to Markdown, plus the bits of metadata that
/// `render_blocks_markdown` needs to choose the separator between neighbours.
struct RenderedBlock {
    /// Markdown text of the block (no trailing separator).
    markdown: String,
    /// Id of the list the block belongs to; `None` for non-list blocks.
    list_id: Option<String>,
    /// Whether the block was rendered as a quote paragraph.
    quote: bool,
}
2504
2505fn render_blocks_markdown(blocks: &[CapturedBlock]) -> String {
2506 let mut counters: HashMap<(String, usize), usize> = HashMap::new();
2511 let mut rendered: Vec<RenderedBlock> = Vec::new();
2512
2513 for block in blocks {
2514 match block {
2515 CapturedBlock::Paragraph {
2516 content,
2517 style,
2518 list,
2519 quote,
2520 horizontal_rule,
2521 } => {
2522 let text = render_content_markdown(content).trim().to_string();
2523 if text.is_empty() {
2524 continue;
2525 }
2526 let ordered_index = list.as_ref().and_then(|list_meta| {
2527 if !list_meta.ordered {
2528 return None;
2529 }
2530 let key = (list_meta.id.clone(), list_meta.level);
2534 counters.retain(|(id, level), _| {
2535 !(id == &list_meta.id && *level > list_meta.level)
2536 });
2537 let next = counters.entry(key).or_insert(0);
2538 *next += 1;
2539 Some(*next)
2540 });
2541 let markdown = render_paragraph_markdown(
2542 &text,
2543 style.as_deref(),
2544 list.as_ref(),
2545 *quote,
2546 *horizontal_rule,
2547 ordered_index,
2548 );
2549 rendered.push(RenderedBlock {
2550 markdown,
2551 list_id: list.as_ref().map(|l| l.id.clone()),
2552 quote: *quote,
2553 });
2554 }
2555 CapturedBlock::Table(table) => {
2556 rendered.push(RenderedBlock {
2557 markdown: render_table_markdown(table),
2558 list_id: None,
2559 quote: false,
2560 });
2561 }
2562 }
2563 }
2564
2565 let mut out = String::new();
2569 for (idx, block) in rendered.iter().enumerate() {
2570 if idx == 0 {
2571 out.push_str(&block.markdown);
2572 continue;
2573 }
2574 let prev = &rendered[idx - 1];
2575 if block.list_id.is_some() && prev.list_id.is_some() {
2576 out.push('\n');
2577 } else if block.quote && prev.quote {
2578 out.push_str("\n>\n");
2579 } else {
2580 out.push_str("\n\n");
2581 }
2582 out.push_str(&block.markdown);
2583 }
2584 if !out.is_empty() && !out.ends_with('\n') {
2585 out.push('\n');
2586 }
2587 out
2588}
2589
2590fn render_paragraph_markdown(
2591 text: &str,
2592 style: Option<&str>,
2593 list: Option<&ListMeta>,
2594 quote: bool,
2595 horizontal_rule: bool,
2596 ordered_index: Option<usize>,
2597) -> String {
2598 if horizontal_rule {
2599 return "---".to_string();
2600 }
2601 match style {
2602 Some("TITLE") => format!("# {text}"),
2603 Some("SUBTITLE") => format!("## {text}"),
2604 Some(style) if style.starts_with("HEADING_") => {
2605 let level = style
2606 .trim_start_matches("HEADING_")
2607 .parse::<usize>()
2608 .unwrap_or(1);
2609 format!("{} {text}", "#".repeat(level.clamp(1, 6)))
2610 }
2611 _ => list.map_or_else(
2612 || {
2613 if quote {
2614 text.lines()
2615 .map(|line| {
2616 if line.is_empty() {
2617 ">".to_string()
2618 } else {
2619 format!("> {line}")
2620 }
2621 })
2622 .collect::<Vec<_>>()
2623 .join("\n")
2624 } else {
2625 text.to_string()
2626 }
2627 },
2628 |list| {
2629 let indent = " ".repeat(list.level);
2630 let marker = if list.ordered {
2631 format!("{}.", ordered_index.unwrap_or(1))
2632 } else {
2633 "-".to_string()
2634 };
2635 format!("{indent}{marker} {text}")
2636 },
2637 ),
2638 }
2639}
2640
2641fn render_table_markdown(table: &TableBlock) -> String {
2642 if table.rows.is_empty() {
2643 return String::new();
2644 }
2645 let width = table
2646 .rows
2647 .iter()
2648 .map(|row| row.cells.len())
2649 .max()
2650 .unwrap_or(1);
2651 let rows = table
2652 .rows
2653 .iter()
2654 .map(|row| {
2655 (0..width)
2656 .map(|idx| {
2657 row.cells.get(idx).map_or_else(String::new, |cell| {
2658 escape_markdown_table_cell(&render_content_markdown(&cell.content))
2659 })
2660 })
2661 .collect::<Vec<_>>()
2662 })
2663 .collect::<Vec<_>>();
2664 let separator = vec!["---".to_string(); width];
2665 std::iter::once(&rows[0])
2666 .chain(std::iter::once(&separator))
2667 .chain(rows.iter().skip(1))
2668 .map(|row| format!("| {} |", row.join(" | ")))
2669 .collect::<Vec<_>>()
2670 .join("\n")
2671}
2672
2673fn render_content_markdown(content: &[ContentNode]) -> String {
2674 let mut rendered = String::new();
2675 let mut idx = 0usize;
2676 while idx < content.len() {
2677 match &content[idx] {
2678 ContentNode::Text {
2679 text,
2680 bold,
2681 italic,
2682 strike,
2683 link,
2684 } => {
2685 let link_target = link.as_deref();
2686 let mut runs = vec![(text.as_str(), *bold, *italic, *strike)];
2687 idx += 1;
2688 while let Some(ContentNode::Text {
2689 text,
2690 bold,
2691 italic,
2692 strike,
2693 link: next_link,
2694 }) = content.get(idx)
2695 {
2696 if next_link.as_deref() != link_target {
2697 break;
2698 }
2699 runs.push((text.as_str(), *bold, *italic, *strike));
2700 idx += 1;
2701 }
2702 let label = render_text_runs_markdown(&runs);
2703 if let Some(link_target) = link_target {
2704 let _ = write!(rendered, "[{label}]({link_target})");
2705 } else {
2706 rendered.push_str(&label);
2707 }
2708 }
2709 ContentNode::Image {
2710 url: Some(url),
2711 alt,
2712 ..
2713 } => {
2714 let _ = write!(rendered, "");
2715 idx += 1;
2716 }
2717 ContentNode::Image { .. } => idx += 1,
2718 }
2719 }
2720 rendered
2721}
2722
/// Set of inline Markdown emphasis markers that are open at a given point
/// while text runs are being rendered. The default value has every marker
/// closed.
#[derive(Clone, Copy, Default)]
struct MarkdownMarkerState {
    /// `**` is open.
    bold: bool,
    /// `*` is open.
    italic: bool,
    /// `~~` is open.
    strike: bool,
}
2729
2730fn render_text_runs_markdown(runs: &[(&str, bool, bool, bool)]) -> String {
2731 let inactive = MarkdownMarkerState::default();
2732 let mut active = inactive;
2733 let mut output = String::new();
2734 for (text, bold, italic, strike) in runs {
2735 let next = MarkdownMarkerState {
2736 bold: *bold,
2737 italic: *italic,
2738 strike: *strike,
2739 };
2740 let mut start = 0usize;
2741 for (offset, ch) in text.char_indices() {
2742 if ch != '\n' {
2743 continue;
2744 }
2745 if offset > start {
2746 output.push_str(&markdown_marker_transition(active, next));
2747 output.push_str(&text[start..offset]);
2748 active = next;
2749 }
2750 output.push_str(&markdown_marker_transition(active, inactive));
2751 output.push('\n');
2752 active = inactive;
2753 start = offset + ch.len_utf8();
2754 }
2755 if start < text.len() {
2756 output.push_str(&markdown_marker_transition(active, next));
2757 output.push_str(&text[start..]);
2758 active = next;
2759 }
2760 }
2761 output.push_str(&markdown_marker_transition(active, inactive));
2762 output
2763}
2764
2765fn markdown_marker_transition(active: MarkdownMarkerState, next: MarkdownMarkerState) -> String {
2766 let mut markers = String::new();
2767 if active.strike && !next.strike {
2768 markers.push_str("~~");
2769 }
2770 if active.italic && !next.italic {
2771 markers.push('*');
2772 }
2773 if active.bold && !next.bold {
2774 markers.push_str("**");
2775 }
2776 if !active.bold && next.bold {
2777 markers.push_str("**");
2778 }
2779 if !active.italic && next.italic {
2780 markers.push('*');
2781 }
2782 if !active.strike && next.strike {
2783 markers.push_str("~~");
2784 }
2785 markers
2786}
2787
2788fn render_blocks_html(blocks: &[CapturedBlock]) -> String {
2789 format!(
2790 "<!doctype html><html><body>{}</body></html>",
2791 blocks
2792 .iter()
2793 .map(|block| match block {
2794 CapturedBlock::Paragraph {
2795 content,
2796 style,
2797 list,
2798 quote,
2799 horizontal_rule,
2800 } => {
2801 if *horizontal_rule {
2802 "<hr>".to_string()
2803 } else if let Some(list) = list {
2804 let tag = if list.ordered { "ol" } else { "ul" };
2805 format!("<{tag}><li>{}</li></{tag}>", render_content_html(content))
2806 } else if *quote {
2807 format!("<blockquote>{}</blockquote>", render_content_html(content))
2808 } else {
2809 let tag = paragraph_tag(style.as_deref());
2810 format!("<{tag}>{}</{tag}>", render_content_html(content))
2811 }
2812 }
2813 CapturedBlock::Table(table) => render_table_html(table),
2814 })
2815 .collect::<String>()
2816 )
2817}
2818
2819fn render_table_html(table: &TableBlock) -> String {
2820 let mut html = String::from("<table>");
2821 for row in &table.rows {
2822 html.push_str("<tr>");
2823 for cell in &row.cells {
2824 html.push_str("<td>");
2825 html.push_str(&render_content_html(&cell.content));
2826 html.push_str("</td>");
2827 }
2828 html.push_str("</tr>");
2829 }
2830 html.push_str("</table>");
2831 html
2832}
2833
2834fn render_content_html(content: &[ContentNode]) -> String {
2835 content
2836 .iter()
2837 .map(|node| match node {
2838 ContentNode::Text {
2839 text,
2840 bold,
2841 italic,
2842 strike,
2843 link,
2844 } => render_marked_html(text, *bold, *italic, *strike, link.as_deref()),
2845 ContentNode::Image {
2846 url: Some(url),
2847 alt,
2848 width,
2849 height,
2850 ..
2851 } => render_image_html(url, alt, width.as_deref(), height.as_deref()),
2852 ContentNode::Image { .. } => String::new(),
2853 })
2854 .collect()
2855}
2856
2857fn render_image_html(url: &str, alt: &str, width: Option<&str>, height: Option<&str>) -> String {
2858 let mut html = format!(
2859 "<img src=\"{}\" alt=\"{}\"",
2860 escape_html(url),
2861 escape_html(alt)
2862 );
2863 if let Some(width) = width.filter(|value| !value.is_empty()) {
2864 let _ = write!(html, " width=\"{}\"", escape_html(width));
2865 }
2866 if let Some(height) = height.filter(|value| !value.is_empty()) {
2867 let _ = write!(html, " height=\"{}\"", escape_html(height));
2868 }
2869 html.push('>');
2870 html
2871}
2872
2873fn render_marked_html(
2874 text: &str,
2875 bold: bool,
2876 italic: bool,
2877 strike: bool,
2878 link: Option<&str>,
2879) -> String {
2880 text.split('\n')
2881 .map(|segment| render_marked_html_segment(segment, bold, italic, strike, link))
2882 .collect::<Vec<_>>()
2883 .join("<br>")
2884}
2885
2886fn render_marked_html_segment(
2887 text: &str,
2888 bold: bool,
2889 italic: bool,
2890 strike: bool,
2891 link: Option<&str>,
2892) -> String {
2893 if text.is_empty() {
2894 return String::new();
2895 }
2896 let mut output = escape_html(text);
2897 if bold {
2898 output = format!("<strong>{output}</strong>");
2899 }
2900 if italic {
2901 output = format!("<em>{output}</em>");
2902 }
2903 if strike {
2904 output = format!("<s>{output}</s>");
2905 }
2906 if let Some(link) = link {
2907 output = format!("<a href=\"{}\">{output}</a>", escape_html(link));
2908 }
2909 output
2910}
2911
/// Maps a paragraph style name to the HTML tag used to render it.
///
/// `TITLE` shares `h1` with `HEADING_1`, `SUBTITLE` shares `h2` with
/// `HEADING_2`; unknown or missing styles map to `p`.
fn paragraph_tag(style: Option<&str>) -> &'static str {
    let Some(name) = style else {
        return "p";
    };
    match name {
        "TITLE" | "HEADING_1" => "h1",
        "SUBTITLE" | "HEADING_2" => "h2",
        "HEADING_3" => "h3",
        "HEADING_4" => "h4",
        "HEADING_5" => "h5",
        "HEADING_6" => "h6",
        _ => "p",
    }
}
2923
2924fn blocks_to_text(blocks: &[CapturedBlock]) -> String {
2925 blocks
2926 .iter()
2927 .map(|block| match block {
2928 CapturedBlock::Paragraph { content, .. } => content_to_text(content),
2929 CapturedBlock::Table(table) => table
2930 .rows
2931 .iter()
2932 .map(|row| {
2933 row.cells
2934 .iter()
2935 .map(|cell| content_to_text(&cell.content))
2936 .collect::<Vec<_>>()
2937 .join("\t")
2938 })
2939 .collect::<Vec<_>>()
2940 .join("\n"),
2941 })
2942 .filter(|text| !text.is_empty())
2943 .collect::<Vec<_>>()
2944 .join("\n")
2945}
2946
2947fn content_to_text(content: &[ContentNode]) -> String {
2948 content
2949 .iter()
2950 .map(|node| match node {
2951 ContentNode::Text { text, .. } => text.clone(),
2952 ContentNode::Image {
2953 url: Some(_), alt, ..
2954 } => format!("[{alt}]"),
2955 ContentNode::Image { .. } => String::new(),
2956 })
2957 .collect()
2958}
2959
/// Escapes the characters that are significant in HTML text and in quoted
/// attribute values: `&`, `<`, `>`, `"` and `'`.
///
/// The input is scanned once, so entities produced by the escaping are never
/// escaped again within the same call (an `&` already present in the input is
/// still escaped, e.g. `&amp;` becomes `&amp;amp;`).
fn escape_html(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
2968
/// Makes text safe to place inside a Markdown pipe-table cell: `|` is
/// backslash-escaped and newlines become `<br>`.
fn escape_markdown_table_cell(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '|' => escaped.push_str("\\|"),
            '\n' => escaped.push_str("<br>"),
            other => escaped.push(other),
        }
    }
    escaped
}
2972
/// Extracts the token from an `Authorization: Bearer <token>` header value.
///
/// Surrounding whitespace is ignored. The `Bearer` scheme name is matched
/// case-insensitively (HTTP authentication scheme names are case-insensitive)
/// and must be followed by at least one space. Returns `None` when the scheme
/// does not match or the token is empty.
#[must_use]
pub fn extract_bearer_token(auth_header: &str) -> Option<&str> {
    const SCHEME: &str = "bearer";

    let trimmed = auth_header.trim();
    // `get` yields `None` for inputs that are too short or where the cut would
    // fall inside a multi-byte character, so the slicing below cannot panic.
    let scheme = trimmed.get(..SCHEME.len())?;
    if !scheme.eq_ignore_ascii_case(SCHEME) {
        return None;
    }

    let token = trimmed[SCHEME.len()..].strip_prefix(' ')?.trim();
    (!token.is_empty()).then_some(token)
}
2985
/// An image pulled out of a document so it can be stored as a separate file
/// in an archive.
#[derive(Debug, Clone)]
pub struct ExtractedImage {
    /// File name of the image (e.g. `image-01.png`); `create_archive_zip`
    /// stores it under `images/`.
    pub filename: String,
    /// Raw image bytes.
    pub data: Vec<u8>,
    /// MIME type of the image, e.g. `image/png`.
    pub mime_type: String,
}
2996
/// Everything needed to write a self-contained archive of a Google Doc.
#[derive(Debug, Clone)]
pub struct GDocsArchiveResult {
    /// HTML of the document, with image references rewritten to `images/…`.
    pub html: String,
    /// Markdown rendering of the document.
    pub markdown: String,
    /// Images to be stored alongside the document.
    pub images: Vec<ExtractedImage>,
    /// Identifier of the source document.
    pub document_id: String,
    /// Export URL reported by the fetch/render step that produced this archive.
    pub export_url: String,
}
3011
3012pub async fn localize_rendered_remote_images_for_archive(
3024 rendered: &GDocsRenderedResult,
3025) -> crate::Result<GDocsArchiveResult> {
3026 let client = reqwest::Client::builder().build().map_err(|error| {
3027 WebCaptureError::FetchError(format!("Failed to create image download client: {error}"))
3028 })?;
3029 let mut seen = HashMap::new();
3030 let mut images = Vec::new();
3031 let mut next_index = 1usize;
3032
3033 for image in &rendered.remote_images {
3034 if seen.contains_key(&image.url) {
3035 continue;
3036 }
3037 let filename = remote_image_filename(&image.url, next_index);
3038 next_index += 1;
3039 seen.insert(image.url.clone(), filename.clone());
3040
3041 match client
3042 .get(&image.url)
3043 .header("User-Agent", GDOCS_USER_AGENT)
3044 .header("Accept", "image/*,*/*;q=0.8")
3045 .send()
3046 .await
3047 {
3048 Ok(response) if response.status().is_success() => {
3049 let mime_type = response
3050 .headers()
3051 .get(reqwest::header::CONTENT_TYPE)
3052 .and_then(|value| value.to_str().ok())
3053 .map_or_else(|| mime_type_for_filename(&filename), ToString::to_string);
3054 let data = response.bytes().await.map_err(|error| {
3055 WebCaptureError::FetchError(format!(
3056 "Failed to read Google Docs image {}: {error}",
3057 image.url
3058 ))
3059 })?;
3060 debug!(
3061 url = %image.url,
3062 filename = %filename,
3063 bytes = data.len(),
3064 mime_type = %mime_type,
3065 "downloaded Google Docs browser-model archive image"
3066 );
3067 images.push(ExtractedImage {
3068 filename,
3069 data: data.to_vec(),
3070 mime_type,
3071 });
3072 }
3073 Ok(response) => {
3074 warn!(
3075 url = %image.url,
3076 status = response.status().as_u16(),
3077 "failed to download Google Docs browser-model archive image"
3078 );
3079 }
3080 Err(error) => {
3081 warn!(
3082 url = %image.url,
3083 error = %error,
3084 "failed to download Google Docs browser-model archive image"
3085 );
3086 }
3087 }
3088 }
3089
3090 let mut markdown = rendered.markdown.clone();
3091 let mut html = rendered.html.clone();
3092 for (url, filename) in seen {
3093 let local_path = format!("images/{filename}");
3094 markdown = markdown.replace(&url, &local_path);
3095 html = html.replace(&url, &local_path);
3096 }
3097
3098 Ok(GDocsArchiveResult {
3099 html,
3100 markdown,
3101 images,
3102 document_id: rendered.document_id.clone(),
3103 export_url: rendered.export_url.clone(),
3104 })
3105}
3106
3107fn remote_image_filename(url: &str, index: usize) -> String {
3108 let ext = crate::localize_images::get_extension_from_url(url);
3109 format!("image-{index:02}{ext}")
3110}
3111
/// Guesses an image MIME type from the text after the last `.` of `filename`
/// (case-insensitively). Unknown extensions fall back to `image/png`.
fn mime_type_for_filename(filename: &str) -> String {
    // `rsplit` always yields at least one item, so a name without a dot is
    // treated as if the whole name were the extension.
    let extension = filename.rsplit('.').next().unwrap_or("png").to_lowercase();
    let mime = match extension.as_str() {
        "jpg" | "jpeg" => "image/jpeg",
        "gif" => "image/gif",
        "webp" => "image/webp",
        "svg" => "image/svg+xml",
        _ => "image/png",
    };
    mime.to_string()
}
3128
3129fn base64_image_pattern() -> &'static Regex {
3130 static PATTERN: OnceLock<Regex> = OnceLock::new();
3131 PATTERN.get_or_init(|| {
3132 Regex::new(
3133 r#"(<img\s[^>]*src=")data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([^"]+)(")"#,
3134 )
3135 .unwrap()
3136 })
3137}
3138
3139#[must_use]
3152pub fn extract_base64_images(html: &str) -> (String, Vec<ExtractedImage>) {
3153 let mut images = Vec::new();
3154 let mut idx = 1u32;
3155
3156 let updated_html = base64_image_pattern()
3157 .replace_all(html, |caps: ®ex::Captures<'_>| {
3158 let prefix = &caps[1];
3159 let mime_ext = &caps[2];
3160 let base64_data = &caps[3];
3161 let suffix = &caps[4];
3162
3163 let ext = match mime_ext {
3164 "jpeg" => "jpg",
3165 "svg+xml" => "svg",
3166 other => other,
3167 };
3168
3169 let filename = format!("image-{idx:02}.{ext}");
3170 let mime_type = format!("image/{mime_ext}");
3171
3172 if let Ok(data) = base64::engine::general_purpose::STANDARD.decode(base64_data) {
3173 debug!("Extracted image: {} ({} bytes)", filename, data.len());
3174 images.push(ExtractedImage {
3175 filename: filename.clone(),
3176 data,
3177 mime_type,
3178 });
3179 }
3180
3181 idx += 1;
3182 format!("{prefix}images/{filename}{suffix}")
3183 })
3184 .into_owned();
3185
3186 (updated_html, images)
3187}
3188
3189pub async fn fetch_google_doc_as_archive(
3208 url: &str,
3209 api_token: Option<&str>,
3210) -> crate::Result<GDocsArchiveResult> {
3211 let result = fetch_google_doc(url, "html", api_token).await?;
3212
3213 let preprocess = preprocess_google_docs_export_html(&result.content);
3214 debug!(
3215 document_id = %result.document_id,
3216 hoisted = preprocess.hoisted,
3217 unwrapped_links = preprocess.unwrapped_links,
3218 "google-docs-export pre-processor rewrote archive markup"
3219 );
3220
3221 let (local_html, images) = extract_base64_images(&preprocess.html);
3222
3223 let markdown = normalize_google_docs_export_markdown(
3224 &crate::markdown::convert_html_to_markdown(&local_html, None)?,
3225 );
3226
3227 debug!(
3228 "Archive prepared: {} images extracted, {} bytes HTML, {} bytes Markdown",
3229 images.len(),
3230 local_html.len(),
3231 markdown.len()
3232 );
3233
3234 Ok(GDocsArchiveResult {
3235 html: local_html,
3236 markdown,
3237 images,
3238 document_id: result.document_id,
3239 export_url: result.export_url,
3240 })
3241}
3242
3243pub fn create_archive_zip(
3254 archive: &GDocsArchiveResult,
3255 pretty_html: bool,
3256) -> crate::Result<Vec<u8>> {
3257 let mut buf = std::io::Cursor::new(Vec::new());
3258
3259 {
3260 let mut zip = zip::ZipWriter::new(&mut buf);
3261 let options = zip::write::SimpleFileOptions::default()
3262 .compression_method(zip::CompressionMethod::Deflated);
3263
3264 zip.start_file("document.md", options)
3265 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3266 zip.write_all(archive.markdown.as_bytes())?;
3267
3268 let html_output = if pretty_html {
3269 crate::html::pretty_print_html(&archive.html)
3270 } else {
3271 archive.html.clone()
3272 };
3273 zip.start_file("document.html", options)
3274 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3275 zip.write_all(html_output.as_bytes())?;
3276
3277 for img in &archive.images {
3278 zip.start_file(format!("images/{}", img.filename), options)
3279 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3280 zip.write_all(&img.data)?;
3281 }
3282
3283 zip.finish()
3284 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3285 }
3286
3287 Ok(buf.into_inner())
3288}