1use async_tungstenite::tokio::{connect_async, ConnectStream};
32use async_tungstenite::tungstenite::Message;
33use async_tungstenite::WebSocketStream;
34use base64::Engine;
35use futures::{SinkExt, StreamExt};
36use regex::Regex;
37use scraper::{node::Node, ElementRef, Html, Selector};
38use serde_json::Value;
39use std::collections::HashMap;
40use std::fmt::Write as _;
41use std::hash::BuildHasher;
42use std::io::Write;
43use std::process::Stdio;
44use std::sync::OnceLock;
45use std::time::{Duration, Instant};
46use tokio::io::{AsyncBufReadExt, BufReader};
47use tokio::process::{Child, Command};
48use tracing::{debug, info, warn};
49
50use crate::WebCaptureError;
51
/// Base URL of a Google Docs document; `build_export_url` and `build_edit_url`
/// append `/{document_id}/export?...` and `/{document_id}/edit` to it.
const GDOCS_EXPORT_BASE: &str = "https://docs.google.com/document/d";
/// Base URL of the Docs REST endpoint; `build_docs_api_url` appends `/{document_id}`.
const GDOCS_API_BASE: &str = "https://docs.googleapis.com/v1/documents";
/// Desktop Chrome `User-Agent` value (the same string `fetch_google_doc` sends).
const GDOCS_USER_AGENT: &str =
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
/// Default 30 s limit — presumably the maximum wait for the editor model; the
/// consumer is outside this section (TODO confirm).
const GDOCS_EDITOR_MODEL_MAX_WAIT_DEFAULT: Duration = Duration::from_secs(30);
/// Default 1.5 s window — presumably the `stability_window` handed to
/// `BrowserModelQuiescence::observe` (TODO confirm against the caller).
const GDOCS_EDITOR_MODEL_STABILITY_DEFAULT: Duration = Duration::from_millis(1500);
/// 250 ms interval — presumably the delay between editor-model polls (TODO confirm).
const GDOCS_EDITOR_MODEL_POLL_INTERVAL: Duration = Duration::from_millis(250);
/// 20 s timeout — presumably applied while launching the browser (TODO confirm).
const GDOCS_BROWSER_LAUNCH_TIMEOUT: Duration = Duration::from_secs(20);

/// WebSocket stream type produced by `async_tungstenite::tokio::connect_async`;
/// judging by the name it carries the Chrome DevTools Protocol connection.
type CdpWebSocket = WebSocketStream<ConnectStream>;
62
/// JavaScript that records every value assigned to `window.DOCS_modelChunk`.
///
/// The script:
/// * creates `window.__captured_chunks`;
/// * `captureChunk` ignores falsy values, recurses into arrays, and stores a
///   JSON deep copy of anything else (falling back to the raw value when the
///   JSON round-trip throws);
/// * `wrapChunkArray` tags an array with the non-enumerable
///   `__webCaptureDocsModelWrapped` flag, replaces its `push` so later items are
///   captured too, and captures the items already present;
/// * defines a non-configurable `DOCS_modelChunk` accessor on `window` whose
///   setter captures the value and stores the wrapped value in
///   `window.__DOCS_modelChunk_latest`, which the getter returns.
///
/// NOTE(review): for an array assignment the setter calls `captureChunk` and then
/// `wrapChunkArray`, which captures the existing items again — the items present
/// at assignment time appear to be recorded twice; confirm whether consumers
/// de-duplicate.
///
/// Presumably injected before the page's own scripts run (the injection site is
/// outside this section).
const GDOCS_MODEL_CAPTURE_INIT_SCRIPT: &str = r"
window.__captured_chunks = [];
const captureChunk = (value) => {
 if (!value) {
 return;
 }
 if (Array.isArray(value)) {
 for (const item of value) {
 captureChunk(item);
 }
 return;
 }
 try {
 window.__captured_chunks.push(JSON.parse(JSON.stringify(value)));
 } catch {
 window.__captured_chunks.push(value);
 }
};
const wrapChunkArray = (value) => {
 if (!Array.isArray(value) || value.__webCaptureDocsModelWrapped) {
 return value;
 }
 const originalPush = value.push;
 Object.defineProperty(value, '__webCaptureDocsModelWrapped', {
 value: true,
 enumerable: false,
 });
 Object.defineProperty(value, 'push', {
 value(...items) {
 for (const item of items) {
 captureChunk(item);
 }
 return originalPush.apply(this, items);
 },
 writable: true,
 configurable: true,
 });
 for (const item of value) {
 captureChunk(item);
 }
 return value;
};
Object.defineProperty(window, 'DOCS_modelChunk', {
 set(value) {
 captureChunk(value);
 window.__DOCS_modelChunk_latest = wrapChunkArray(value);
 },
 get() {
 return window.__DOCS_modelChunk_latest;
 },
 configurable: false,
});
";
116
/// JavaScript function expression that returns `{ chunks, cidUrlMap }`.
///
/// * `chunks` is a copy of `window.__captured_chunks`; when nothing was captured
///   but `window.DOCS_modelChunk` is set, that value is used as the only chunk.
/// * `cidUrlMap` is built by scanning every `<script>` whose text mentions
///   `docs-images-rt` for `"<id>": "https://docs.google.com/docs-images-rt/..."`
///   pairs (ids of 20+ characters from `[A-Za-z0-9_-]`), un-escaping `\u003d`,
///   `\u0026` and `\/` in the URL.
const GDOCS_MODEL_EXTRACT_SCRIPT: &str = r#"() => {
 const chunks = [...(window.__captured_chunks || [])];
 if (
 window.DOCS_modelChunk &&
 chunks.length === 0 &&
 !chunks.includes(window.DOCS_modelChunk)
 ) {
 chunks.push(window.DOCS_modelChunk);
 }
 const cidUrlMap = {};
 const scripts = document.querySelectorAll('script');
 for (const script of scripts) {
 const text = script.textContent || '';
 if (!text.includes('docs-images-rt')) {
 continue;
 }
 const regex =
 /"([A-Za-z0-9_-]{20,})"\s*:\s*"(https:\/\/docs\.google\.com\/docs-images-rt\/[^"]+)"/g;
 let match;
 while ((match = regex.exec(text)) !== null) {
 cidUrlMap[match[1]] = match[2]
 .replace(/\\u003d/g, '=')
 .replace(/\\u0026/g, '&')
 .replace(/\\\//g, '/');
 }
 }
 return { chunks, cidUrlMap };
}"#;
145
146fn gdocs_url_pattern() -> &'static Regex {
147 static PATTERN: OnceLock<Regex> = OnceLock::new();
148 PATTERN.get_or_init(|| Regex::new(r"docs\.google\.com/document/d/([a-zA-Z0-9_-]+)").unwrap())
149}
150
/// Outcome of fetching a Google Doc through the public export endpoint.
#[derive(Debug, Clone)]
pub struct GDocsResult {
    /// Document body: the export response text, or rendered Markdown when
    /// produced by `fetch_google_doc_as_markdown`.
    pub content: String,
    /// Format label for `content`: the requested export format, or `"markdown"`.
    pub format: String,
    /// Document id extracted from the source URL.
    pub document_id: String,
    /// Export URL that was requested.
    pub export_url: String,
}
163
/// Strategy used to capture a Google Doc (chosen by `select_capture_method`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GDocsCaptureMethod {
    /// Selected for the `"browser"` capture mode.
    BrowserModel,
    /// Selected for the `"api"` capture mode when no API token is supplied.
    PublicExport,
    /// Selected for the `"api"` capture mode when an API token is supplied.
    DocsApi,
}
174
/// A captured document rendered into several output formats.
///
/// NOTE(review): the code that builds this value is outside the visible section;
/// field descriptions follow the field names and should be confirmed there.
#[derive(Debug, Clone)]
pub struct GDocsRenderedResult {
    /// Markdown rendering.
    pub markdown: String,
    /// HTML rendering.
    pub html: String,
    /// Plain-text rendering.
    pub text: String,
    /// Id of the source document.
    pub document_id: String,
    /// Export URL associated with the document.
    pub export_url: String,
    /// Images referenced by URL (presumably not embedded in the output — confirm).
    pub remote_images: Vec<RemoteImage>,
}
191
/// An image identified by URL together with its alternative text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteImage {
    /// Location of the image.
    pub url: String,
    /// Alternative text for the image.
    pub alt: String,
}
200
/// Snapshot of the editor model data pulled from the browser.
///
/// NOTE(review): populated outside the visible section; the field notes below are
/// inferred from names plus the `fingerprint` method and need confirmation.
#[derive(Debug, Clone)]
struct BrowserModelData {
    /// Captured model chunks as JSON values.
    chunks: Vec<Value>,
    /// String-to-string map; presumably image content id to image URL
    /// (cf. `cidUrlMap` in the extract script).
    cid_urls: HashMap<String, String>,
    /// Size figure for the chunk payload; part of the fingerprint.
    chunk_payload_bytes: usize,
    /// Presumably the number of polls performed before this snapshot.
    poll_count: usize,
    /// Presumably how long the model had been unchanged when captured.
    stable_for: Duration,
}
209
/// Cheap identity of a model snapshot, compared between observations to detect
/// change (see `BrowserModelQuiescence::observe`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct BrowserModelFingerprint {
    /// Number of chunks in the snapshot.
    chunks: usize,
    /// Payload size recorded for the snapshot.
    payload_bytes: usize,
}
215
/// Tracks how long successive model fingerprints have stayed identical.
#[derive(Debug, Default)]
struct BrowserModelQuiescence {
    /// Fingerprint seen by the most recent `observe` call, if any.
    last_fingerprint: Option<BrowserModelFingerprint>,
    /// Instant at which the current fingerprint was first seen repeated;
    /// `None` while the fingerprint is new, changing, or empty.
    stable_since: Option<Instant>,
}
221
222impl BrowserModelData {
223 const fn fingerprint(&self) -> BrowserModelFingerprint {
224 BrowserModelFingerprint {
225 chunks: self.chunks.len(),
226 payload_bytes: self.chunk_payload_bytes,
227 }
228 }
229}
230
231impl BrowserModelQuiescence {
232 fn observe(
233 &mut self,
234 fingerprint: BrowserModelFingerprint,
235 now: Instant,
236 stability_window: Duration,
237 ) -> Option<Duration> {
238 if fingerprint.chunks == 0 {
239 self.last_fingerprint = Some(fingerprint);
240 self.stable_since = None;
241 return None;
242 }
243
244 if self.last_fingerprint == Some(fingerprint) {
245 let stable_since = *self.stable_since.get_or_insert(now);
246 let stable_for = now.saturating_duration_since(stable_since);
247 if stable_for >= stability_window {
248 return Some(stable_for);
249 }
250 } else {
251 self.last_fingerprint = Some(fingerprint);
252 self.stable_since = None;
253 }
254
255 None
256 }
257
258 fn stable_for(&self, now: Instant) -> Duration {
259 self.stable_since.map_or(Duration::ZERO, |stable_since| {
260 now.saturating_duration_since(stable_since)
261 })
262 }
263}
264
/// Structured representation of a captured document.
///
/// NOTE(review): built outside the visible section; field notes follow the
/// declared types and names.
#[derive(Debug, Clone, Default)]
pub struct CapturedDocument {
    /// Block-level content (paragraphs and tables).
    pub blocks: Vec<CapturedBlock>,
    /// Tables of the document.
    pub tables: Vec<TableBlock>,
    /// Content nodes for images (presumably only `ContentNode::Image` — confirm).
    pub images: Vec<ContentNode>,
    /// Plain text of the document.
    pub text: String,
}
277
/// A block-level element of a captured document.
#[derive(Debug, Clone)]
pub enum CapturedBlock {
    /// A paragraph with inline content and paragraph-level attributes.
    Paragraph {
        /// Inline nodes (text runs and images).
        content: Vec<ContentNode>,
        /// Optional paragraph style name (presumably a heading/normal-text style id).
        style: Option<String>,
        /// List membership, when the paragraph is a list item.
        list: Option<ListMeta>,
        /// Whether the paragraph is a block quote.
        quote: bool,
        /// Whether the paragraph represents a horizontal rule.
        horizontal_rule: bool,
    },
    /// A table.
    Table(TableBlock),
}
297
/// A table made of rows.
#[derive(Debug, Clone, Default)]
pub struct TableBlock {
    /// Rows of the table.
    pub rows: Vec<TableRow>,
}
304
/// One row of a [`TableBlock`].
#[derive(Debug, Clone, Default)]
pub struct TableRow {
    /// Cells of the row.
    pub cells: Vec<TableCell>,
}
311
/// One cell of a [`TableRow`].
#[derive(Debug, Clone, Default)]
pub struct TableCell {
    /// Inline content of the cell.
    pub content: Vec<ContentNode>,
}
318
/// Inline content: either a styled text run or an image.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentNode {
    /// A run of text sharing one set of inline styles.
    Text {
        /// The text of the run.
        text: String,
        /// Bold flag.
        bold: bool,
        /// Italic flag.
        italic: bool,
        /// Strikethrough flag.
        strike: bool,
        /// Link target, when the run is a hyperlink.
        link: Option<String>,
    },
    /// An inline image.
    Image {
        /// Content id of the image, when known (presumably a key of the cid → URL map).
        cid: Option<String>,
        /// Image URL, when known.
        url: Option<String>,
        /// Alternative text.
        alt: String,
        /// Width as a string (unit not visible here).
        width: Option<String>,
        /// Height as a string (unit not visible here).
        height: Option<String>,
        /// Presumably marks images that belong to a suggested edit — confirm.
        is_suggestion: bool,
    },
}
351
/// Inline style flags for a text run (same attributes as `ContentNode::Text`).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct TextStyle {
    /// Bold flag.
    bold: bool,
    /// Italic flag.
    italic: bool,
    /// Strikethrough flag.
    strike: bool,
    /// Link target, if any.
    link: Option<String>,
}
359
/// Paragraph-level attributes (mirrors the non-content fields of
/// `CapturedBlock::Paragraph`).
#[derive(Debug, Clone, Default)]
struct ParagraphMeta {
    /// Optional paragraph style name.
    style: Option<String>,
    /// List membership, if any.
    list: Option<ListMeta>,
    /// Block-quote flag.
    quote: bool,
    /// Horizontal-rule flag.
    horizontal_rule: bool,
}
367
/// List membership information for a paragraph.
#[derive(Debug, Clone)]
pub struct ListMeta {
    /// Identifier of the list the item belongs to.
    pub id: String,
    /// Nesting level (presumably zero-based — confirm).
    pub level: usize,
    /// `true` for ordered (numbered) lists.
    pub ordered: bool,
}
377
/// Paragraph style details taken from the model.
#[derive(Debug, Clone)]
struct ParagraphStyle {
    /// Optional paragraph style name.
    style: Option<String>,
    /// Start indent (unit not visible here; presumably points).
    indent_start: f64,
    /// First-line indent (unit not visible here; presumably points).
    indent_first_line: f64,
}
384
/// Semantic hint for a piece of text — judging by the name, derived from the
/// export markup (the producer is outside this section).
#[derive(Debug, Clone)]
struct ExportSemanticHint {
    /// Text the hint applies to.
    text: String,
    /// `Some(true)`/`Some(false)` for ordered/unordered list items; `None`
    /// presumably means "not a list item".
    list_ordered: Option<bool>,
    /// Whether the text is a block quote.
    quote: bool,
}
391
/// Style lookups extracted from the editor model.
///
/// NOTE(review): populated outside the visible section; the `*_by_end` keys are
/// presumably paragraph end offsets — confirm against the builder.
#[derive(Debug, Clone, Default)]
struct ModelStyleMaps {
    /// Inline styles (presumably indexed by character position).
    inline_styles: Vec<TextStyle>,
    /// Paragraph styles keyed by an end position.
    paragraph_by_end: HashMap<usize, ParagraphStyle>,
    /// List metadata keyed by an end position.
    list_by_end: HashMap<usize, ListMeta>,
    /// Positions that hold horizontal rules.
    horizontal_rules: std::collections::HashSet<usize>,
}
399
400#[must_use]
402pub fn is_google_docs_url(url: &str) -> bool {
403 gdocs_url_pattern().is_match(url)
404}
405
406#[must_use]
410pub fn extract_document_id(url: &str) -> Option<String> {
411 gdocs_url_pattern()
412 .captures(url)
413 .and_then(|caps| caps.get(1))
414 .map(|m| m.as_str().to_string())
415}
416
417#[must_use]
424pub fn build_export_url(document_id: &str, format: &str) -> String {
425 let export_format = match format {
426 "html" | "txt" | "md" | "pdf" | "docx" | "epub" | "zip" => format,
427 _ => "html",
428 };
429 format!("{GDOCS_EXPORT_BASE}/{document_id}/export?format={export_format}")
430}
431
432#[must_use]
434pub fn build_edit_url(document_id: &str) -> String {
435 format!("{GDOCS_EXPORT_BASE}/{document_id}/edit")
436}
437
438#[must_use]
440pub fn build_docs_api_url(document_id: &str) -> String {
441 format!("{GDOCS_API_BASE}/{document_id}")
442}
443
444pub fn select_capture_method(
450 capture: &str,
451 api_token: Option<&str>,
452) -> crate::Result<GDocsCaptureMethod> {
453 match capture.to_lowercase().as_str() {
454 "browser" => Ok(GDocsCaptureMethod::BrowserModel),
455 "api" if api_token.is_some() => Ok(GDocsCaptureMethod::DocsApi),
456 "api" => Ok(GDocsCaptureMethod::PublicExport),
457 other => Err(WebCaptureError::InvalidUrl(format!(
458 "Unsupported Google Docs capture method \"{other}\". Use \"browser\" or \"api\"."
459 ))),
460 }
461}
462
463pub async fn fetch_google_doc(
478 url: &str,
479 format: &str,
480 api_token: Option<&str>,
481) -> crate::Result<GDocsResult> {
482 let document_id = extract_document_id(url).ok_or_else(|| {
483 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
484 })?;
485
486 let export_url = build_export_url(&document_id, format);
487 debug!(
488 document_id = %document_id,
489 format = %format,
490 export_url = %export_url,
491 has_api_token = api_token.is_some(),
492 "fetching Google Doc via public export"
493 );
494
495 let mut request = reqwest::Client::new()
496 .get(&export_url)
497 .header(
498 "User-Agent",
499 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
500 )
501 .header("Accept-Charset", "utf-8")
502 .header("Accept-Language", "en-US,en;q=0.9");
503
504 if let Some(token) = api_token {
505 request = request.header("Authorization", format!("Bearer {token}"));
506 }
507
508 let response = request
509 .send()
510 .await
511 .map_err(|e| WebCaptureError::FetchError(format!("Failed to fetch Google Doc: {e}")))?;
512 debug!(
513 document_id = %document_id,
514 status = response.status().as_u16(),
515 success = response.status().is_success(),
516 content_type = response
517 .headers()
518 .get(reqwest::header::CONTENT_TYPE)
519 .and_then(|value| value.to_str().ok())
520 .unwrap_or(""),
521 "received Google Docs public export response"
522 );
523
524 if !response.status().is_success() {
525 return Err(WebCaptureError::FetchError(format!(
526 "Failed to fetch Google Doc ({} {}): {}",
527 response.status().as_u16(),
528 response.status().canonical_reason().unwrap_or("Unknown"),
529 export_url
530 )));
531 }
532
533 let raw_content = response.text().await.map_err(|e| {
534 WebCaptureError::FetchError(format!("Failed to read Google Doc response: {e}"))
535 })?;
536 debug!(
537 document_id = %document_id,
538 bytes = raw_content.len(),
539 "read Google Docs public export body"
540 );
541
542 let content = match format {
545 "txt" | "md" => crate::html::decode_html_entities(&raw_content),
546 _ => raw_content,
547 };
548
549 Ok(GDocsResult {
550 content,
551 format: format.to_string(),
552 document_id,
553 export_url,
554 })
555}
556
557pub async fn fetch_google_doc_as_markdown(
571 url: &str,
572 api_token: Option<&str>,
573) -> crate::Result<GDocsResult> {
574 let result = fetch_google_doc(url, "html", api_token).await?;
575
576 let preprocess = preprocess_google_docs_export_html(&result.content);
577 debug!(
578 document_id = %result.document_id,
579 hoisted = preprocess.hoisted,
580 unwrapped_links = preprocess.unwrapped_links,
581 "google-docs-export pre-processor rewrote markup"
582 );
583 let markdown = normalize_google_docs_export_markdown(
584 &crate::markdown::convert_html_to_markdown(&preprocess.html, Some(&result.export_url))?,
585 );
586 debug!(
587 document_id = %result.document_id,
588 bytes = markdown.len(),
589 "rendered Google Docs public export markdown"
590 );
591
592 Ok(GDocsResult {
593 content: markdown,
594 format: "markdown".to_string(),
595 document_id: result.document_id,
596 export_url: result.export_url,
597 })
598}
599
/// Output of `preprocess_google_docs_export_html`.
#[derive(Debug, Clone)]
pub struct GDocsExportPreprocessResult {
    /// The rewritten HTML.
    pub html: String,
    /// Number of styled `<span>` elements replaced by semantic tags.
    pub hoisted: usize,
    /// Number of Google redirect links rewritten to their target URL.
    pub unwrapped_links: usize,
}
613
614#[must_use]
622pub fn preprocess_google_docs_export_html(html: &str) -> GDocsExportPreprocessResult {
623 let mut hoisted: usize = 0;
624 let mut unwrapped_links: usize = 0;
625 let class_styles = extract_css_class_styles(html);
626
627 let mut out = hoist_inline_style_spans(html, &mut hoisted);
628 out = hoist_class_style_spans(&out, &class_styles, &mut hoisted);
629 out = convert_class_indented_blockquotes(&out, &class_styles);
630 out = nest_google_docs_lists(&out, &class_styles);
631 out = strip_google_docs_heading_noise(&out);
632 out = strip_heading_inline_formatting(&out);
633 out = unwrap_google_redirect_links(&out, &mut unwrapped_links);
634 out = out.replace(" ", " ");
635 out = out.replace('\u{00A0}', " ");
636
637 GDocsExportPreprocessResult {
638 html: out,
639 hoisted,
640 unwrapped_links,
641 }
642}
643
644#[must_use]
646pub fn normalize_google_docs_export_markdown(markdown: &str) -> String {
647 let markdown = unescape_public_export_punctuation(markdown);
648 let markdown = convert_setext_headings(&markdown);
649 let markdown = normalize_atx_headings(&markdown);
650 let markdown = normalize_bullet_markers(&markdown);
651 let markdown = normalize_list_spacing(&markdown);
652 let markdown = normalize_blockquote_spacing(&markdown);
653 let markdown = normalize_markdown_tables(&markdown);
654 crate::markdown::clean_markdown(&markdown)
655}
656
657fn hoist_inline_style_spans(html: &str, hoisted: &mut usize) -> String {
658 let span_re = Regex::new(r#"(?is)<span\s+([^>]*style="([^"]*)"[^>]*)>(.*?)</span>"#)
659 .expect("valid regex");
660 span_re
661 .replace_all(html, |caps: ®ex::Captures<'_>| {
662 let style = caps.get(2).map_or("", |m| m.as_str());
663 let inner = caps.get(3).map_or("", |m| m.as_str());
664 semantic_wrapped_html(inner, style).map_or_else(
665 || caps[0].to_string(),
666 |wrapped| {
667 *hoisted += 1;
668 wrapped
669 },
670 )
671 })
672 .into_owned()
673}
674
675fn hoist_class_style_spans(
676 html: &str,
677 class_styles: &HashMap<String, String>,
678 hoisted: &mut usize,
679) -> String {
680 let class_span_re = Regex::new(r#"(?is)<span\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</span>"#)
681 .expect("valid regex");
682 class_span_re
683 .replace_all(html, |caps: ®ex::Captures<'_>| {
684 let class_attr = caps.get(2).map_or("", |m| m.as_str());
685 let inner = caps.get(3).map_or("", |m| m.as_str());
686 let style = combined_class_style(class_styles, class_attr);
687 semantic_wrapped_html(inner, &style).map_or_else(
688 || caps[0].to_string(),
689 |wrapped| {
690 *hoisted += 1;
691 wrapped
692 },
693 )
694 })
695 .into_owned()
696}
697
698fn convert_class_indented_blockquotes(
699 html: &str,
700 class_styles: &HashMap<String, String>,
701) -> String {
702 let class_paragraph_re =
703 Regex::new(r#"(?is)<p\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</p>"#).expect("valid regex");
704 class_paragraph_re
705 .replace_all(html, |caps: ®ex::Captures<'_>| {
706 let class_attr = caps.get(2).map_or("", |m| m.as_str());
707 let inner = caps.get(3).map_or("", |m| m.as_str());
708 let style = combined_class_style(class_styles, class_attr);
709 if is_blockquote_style(&style) {
710 format!("<blockquote><p>{inner}</p></blockquote>")
711 } else {
712 caps[0].to_string()
713 }
714 })
715 .into_owned()
716}
717
/// One `<ul>`/`<ol>` element located in the export HTML.
#[derive(Debug, Clone)]
struct ExportListBlock {
    /// Byte offset where the element starts in the scanned HTML.
    start: usize,
    /// Byte offset just past the element's closing tag.
    end: usize,
    /// Lower-cased tag name (`ul` or `ol`).
    tag: String,
    /// Markup between the opening and closing tags.
    inner: String,
}
725
/// One `<li>` extracted from an [`ExportListBlock`].
#[derive(Debug, Clone)]
struct ExportListItem {
    /// Tag of the list the item came from (`ul` or `ol`).
    tag: String,
    /// Nesting level computed by `google_docs_list_item_level`.
    level: usize,
    /// Markup inside the `<li>`.
    inner: String,
}
732
733fn nest_google_docs_lists(html: &str, class_styles: &HashMap<String, String>) -> String {
734 let list_re = Regex::new(r"(?is)<(ul|ol)\b([^>]*)>(.*?)</(ul|ol)>").expect("valid regex");
735 let blocks: Vec<ExportListBlock> = list_re
736 .captures_iter(html)
737 .filter_map(|caps| {
738 let open_tag = caps.get(1)?.as_str().to_ascii_lowercase();
739 let close_tag = caps.get(4)?.as_str().to_ascii_lowercase();
740 if open_tag != close_tag {
741 return None;
742 }
743 let whole = caps.get(0)?;
744 Some(ExportListBlock {
745 start: whole.start(),
746 end: whole.end(),
747 tag: open_tag,
748 inner: caps.get(3).map_or("", |m| m.as_str()).to_string(),
749 })
750 })
751 .collect();
752
753 if blocks.len() < 2 {
754 return html.to_string();
755 }
756
757 let mut groups: Vec<Vec<ExportListBlock>> = Vec::new();
758 let mut current: Vec<ExportListBlock> = Vec::new();
759 for block in blocks {
760 if let Some(previous) = current.last() {
761 if !html[previous.end..block.start].trim().is_empty() {
762 if current.len() > 1 {
763 groups.push(std::mem::take(&mut current));
764 } else {
765 current.clear();
766 }
767 }
768 }
769 current.push(block);
770 }
771 if current.len() > 1 {
772 groups.push(current);
773 }
774
775 if groups.is_empty() {
776 return html.to_string();
777 }
778
779 let mut out = html.to_string();
780 for group in groups.iter().rev() {
781 let rendered = render_nested_list_group(group, class_styles);
782 let start = group.first().expect("non-empty group").start;
783 let end = group.last().expect("non-empty group").end;
784 out.replace_range(start..end, &rendered);
785 }
786 out
787}
788
/// Renders a run of adjacent flat lists as one nested list structure.
///
/// Every `<li>` of every block in `group` is collected with its nesting level
/// (from `google_docs_list_item_level`) and the tag of the list it came from.
/// Items are then emitted in order, opening and closing `<ul>`/`<ol>` elements
/// as the level rises and falls. Nested lists are emitted inside the still-open
/// parent `<li>`. When no `<li>` is found, the blocks are re-emitted as
/// attribute-less lists with their original inner markup.
fn render_nested_list_group(
    group: &[ExportListBlock],
    class_styles: &HashMap<String, String>,
) -> String {
    let item_re = Regex::new(r"(?is)<li\b([^>]*)>(.*?)</li>").expect("valid regex");
    let items: Vec<ExportListItem> = group
        .iter()
        .flat_map(|block| {
            item_re.captures_iter(&block.inner).map(|caps| {
                let attrs = caps.get(1).map_or("", |m| m.as_str());
                let inner = caps.get(2).map_or("", |m| m.as_str()).to_string();
                ExportListItem {
                    tag: block.tag.clone(),
                    level: google_docs_list_item_level(attrs, class_styles),
                    inner,
                }
            })
        })
        .collect();

    // Nothing to nest: emit the lists back-to-back without attributes.
    if items.is_empty() {
        let mut unchanged = String::new();
        for block in group {
            write!(unchanged, "<{}>{}</{}>", block.tag, block.inner, block.tag)
                .expect("write to String");
        }
        return unchanged;
    }

    let mut html = String::new();
    // Deepest level with an open list; `None` until the first list is opened.
    let mut current_level: Option<usize> = None;
    // Per level: tag of the open list, and whether an `<li>` is still open there.
    let mut open_tags: Vec<Option<String>> = Vec::new();
    let mut item_open: Vec<bool> = Vec::new();

    for item in items {
        let level = item.level;
        // Climb out of lists deeper than this item (closing a list also closes
        // its open `<li>`).
        while current_level.is_some_and(|current| current > level) {
            let current = current_level.expect("checked as Some");
            close_rendered_list(&mut html, &mut open_tags, &mut item_open, current);
            current_level = current.checked_sub(1);
        }

        // Descend one level at a time until this item's level is reached.
        while current_level.is_none_or(|current| current < level) {
            let next_level = current_level.map_or(0, |current| current + 1);
            open_rendered_list(
                &mut html,
                &mut open_tags,
                &mut item_open,
                next_level,
                &item.tag,
            );
            current_level = Some(next_level);
        }

        ensure_list_stack(&mut open_tags, &mut item_open, level);
        // Same level but a different list type: restart the list with the new tag.
        if open_tags[level]
            .as_deref()
            .is_some_and(|tag| tag != item.tag)
        {
            close_rendered_list(&mut html, &mut open_tags, &mut item_open, level);
            open_rendered_list(&mut html, &mut open_tags, &mut item_open, level, &item.tag);
        } else if open_tags[level].is_none() {
            open_rendered_list(&mut html, &mut open_tags, &mut item_open, level, &item.tag);
        }

        // Close the previous sibling, then leave this `<li>` open so a deeper
        // list can be nested inside it.
        close_rendered_item(&mut html, &mut item_open, level);
        html.push_str("<li>");
        html.push_str(&item.inner);
        item_open[level] = true;

        // Reset bookkeeping for all deeper levels.
        for deeper in (level + 1)..item_open.len() {
            item_open[deeper] = false;
            open_tags[deeper] = None;
        }
    }

    // Close whatever is still open, innermost first.
    while let Some(current) = current_level {
        close_rendered_list(&mut html, &mut open_tags, &mut item_open, current);
        current_level = current.checked_sub(1);
    }

    html
}
872
/// Grows both per-level stacks so that index `level` is valid.
///
/// New slots are `None` / `false`. Nothing happens when `open_tags` already
/// covers `level`.
fn ensure_list_stack(open_tags: &mut Vec<Option<String>>, item_open: &mut Vec<bool>, level: usize) {
    if open_tags.len() > level {
        return;
    }
    // Both stacks grow by the same number of slots, driven by `open_tags`.
    let missing = level + 1 - open_tags.len();
    open_tags.resize(level + 1, None);
    item_open.resize(item_open.len() + missing, false);
}
879
880fn open_rendered_list(
881 html: &mut String,
882 open_tags: &mut Vec<Option<String>>,
883 item_open: &mut Vec<bool>,
884 level: usize,
885 tag: &str,
886) {
887 ensure_list_stack(open_tags, item_open, level);
888 html.push('<');
889 html.push_str(tag);
890 html.push('>');
891 open_tags[level] = Some(tag.to_string());
892 item_open[level] = false;
893}
894
/// Emits `</li>` when an item is open at `level` and clears its open flag.
///
/// Does nothing for levels beyond the stack or without an open item.
fn close_rendered_item(html: &mut String, item_open: &mut [bool], level: usize) {
    // `mem::take` reads the flag and resets it to `false` in one step.
    let was_open = item_open
        .get_mut(level)
        .is_some_and(|flag| std::mem::take(flag));
    if was_open {
        html.push_str("</li>");
    }
}
901
902fn close_rendered_list(
903 html: &mut String,
904 open_tags: &mut [Option<String>],
905 item_open: &mut [bool],
906 level: usize,
907) {
908 close_rendered_item(html, item_open, level);
909 if let Some(tag) = open_tags.get_mut(level).and_then(Option::take) {
910 html.push_str("</");
911 html.push_str(&tag);
912 html.push('>');
913 }
914}
915
916fn google_docs_list_item_level(attrs: &str, class_styles: &HashMap<String, String>) -> usize {
917 let style = combined_attr_style(class_styles, attrs);
918 let margin_left = css_point_value(&style, "margin-left");
919 if margin_left <= 0.0 {
920 return 0;
921 }
922 [54.0, 90.0, 126.0, 162.0, 198.0, 234.0, 270.0, 306.0]
923 .iter()
924 .take_while(|boundary| margin_left >= **boundary)
925 .count()
926}
927
928fn combined_attr_style(class_styles: &HashMap<String, String>, attrs: &str) -> String {
929 let mut styles = String::new();
930 if let Some(style) = attr_value(attrs, "style") {
931 styles.push_str(&style);
932 }
933 if let Some(class_attr) = attr_value(attrs, "class") {
934 styles.push_str(&combined_class_style(class_styles, &class_attr));
935 }
936 styles
937}
938
939fn attr_value(attrs: &str, name: &str) -> Option<String> {
940 let attr_re = Regex::new(&format!(
941 r#"(?is)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)')"#,
942 regex::escape(name)
943 ))
944 .expect("valid regex");
945 attr_re.captures(attrs).and_then(|caps| {
946 caps.get(1)
947 .or_else(|| caps.get(2))
948 .map(|value| value.as_str().to_string())
949 })
950}
951
952fn strip_google_docs_heading_noise(html: &str) -> String {
953 let empty_anchor_re = Regex::new(r#"(?is)<a\s+id="[^"]*"\s*>\s*</a>"#).expect("valid regex");
954 let numbering_re =
955 Regex::new(r"(?is)<span\b[^>]*>\s*\d+(?:\.\d+)*\.?\s*</span>").expect("valid regex");
956 let mut out = empty_anchor_re.replace_all(html, "").into_owned();
957 for level in 1..=6 {
958 let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
959 .expect("valid regex");
960 out = heading_re
961 .replace_all(&out, |caps: ®ex::Captures<'_>| {
962 let open = &caps[1];
963 let inner = &caps[2];
964 let close = &caps[3];
965 let mut cleaned = empty_anchor_re.replace_all(inner, "").into_owned();
966 cleaned = numbering_re.replace_all(&cleaned, "").into_owned();
967 format!("{open}{cleaned}{close}")
968 })
969 .into_owned();
970 }
971 out
972}
973
974fn strip_heading_inline_formatting(html: &str) -> String {
975 let inline_marker_re = Regex::new(r"(?is)</?(?:strong|em|del)>").expect("valid regex");
976 let mut out = html.to_string();
977 for level in 1..=6 {
978 let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
979 .expect("valid regex");
980 out = heading_re
981 .replace_all(&out, |caps: ®ex::Captures<'_>| {
982 let open = &caps[1];
983 let inner = &caps[2];
984 let close = &caps[3];
985 let cleaned = inline_marker_re.replace_all(inner, "");
986 format!("{open}{cleaned}{close}")
987 })
988 .into_owned();
989 }
990 out
991}
992
993fn unwrap_google_redirect_links(html: &str, unwrapped_links: &mut usize) -> String {
994 let redirect_re =
995 Regex::new(r#"(?i)href="https?://(?:www\.)?google\.com/url\?q=([^&"]+)[^"]*""#)
996 .expect("valid regex");
997 redirect_re
998 .replace_all(html, |caps: ®ex::Captures<'_>| {
999 let encoded = caps.get(1).map_or("", |m| m.as_str());
1000 let decoded = percent_decode_utf8_lossy(encoded);
1001 *unwrapped_links += 1;
1002 format!(r#"href="{decoded}""#)
1003 })
1004 .into_owned()
1005}
1006
1007fn extract_css_class_styles(html: &str) -> HashMap<String, String> {
1008 let mut class_styles: HashMap<String, String> = HashMap::new();
1009 let style_re = Regex::new(r"(?is)<style\b[^>]*>(.*?)</style>").expect("valid regex");
1010 let class_re = Regex::new(r"\.([A-Za-z0-9_-]+)\s*\{([^{}]*)\}").expect("valid regex");
1011 for style_caps in style_re.captures_iter(html) {
1012 let css = style_caps.get(1).map_or("", |m| m.as_str());
1013 for class_caps in class_re.captures_iter(css) {
1014 let class_name = class_caps.get(1).map_or("", |m| m.as_str());
1015 let style = class_caps.get(2).map_or("", |m| m.as_str());
1016 class_styles
1017 .entry(class_name.to_string())
1018 .and_modify(|existing| {
1019 existing.push(';');
1020 existing.push_str(style);
1021 })
1022 .or_insert_with(|| style.to_string());
1023 }
1024 }
1025 class_styles
1026}
1027
/// Joins the declarations of every known class named in `class_attr`.
///
/// Each contribution is prefixed with `;`; unknown class names are skipped, so
/// the result is empty when no class is known.
fn combined_class_style(class_styles: &HashMap<String, String>, class_attr: &str) -> String {
    let mut combined = String::new();
    for class_name in class_attr.split_whitespace() {
        if let Some(style) = class_styles.get(class_name) {
            combined.push(';');
            combined.push_str(style);
        }
    }
    combined
}
1038
1039fn semantic_wrapped_html(inner: &str, style: &str) -> Option<String> {
1040 let bold = css_has_bold(style);
1041 let italic = css_has_italic(style);
1042 let strike = css_has_strike(style);
1043 if !bold && !italic && !strike {
1044 return None;
1045 }
1046 let mut wrapped = inner.to_string();
1047 if strike {
1048 wrapped = format!("<del>{wrapped}</del>");
1049 }
1050 if italic {
1051 wrapped = format!("<em>{wrapped}</em>");
1052 }
1053 if bold {
1054 wrapped = format!("<strong>{wrapped}</strong>");
1055 }
1056 Some(wrapped)
1057}
1058
1059fn css_has_bold(style: &str) -> bool {
1060 Regex::new(r"(?i)font-weight\s*:\s*(?:bold|[6-9]\d{2})")
1061 .expect("valid regex")
1062 .is_match(style)
1063}
1064
1065fn css_has_italic(style: &str) -> bool {
1066 Regex::new(r"(?i)font-style\s*:\s*italic")
1067 .expect("valid regex")
1068 .is_match(style)
1069}
1070
1071fn css_has_strike(style: &str) -> bool {
1072 Regex::new(r"(?i)text-decoration[^;]*\bline-through\b")
1073 .expect("valid regex")
1074 .is_match(style)
1075}
1076
1077fn is_blockquote_style(style: &str) -> bool {
1078 let margin_left = css_point_value(style, "margin-left");
1079 let margin_right = css_point_value(style, "margin-right");
1080 margin_left > 0.0 && margin_right > 0.0 && (margin_left - margin_right).abs() < 0.1
1081}
1082
1083fn css_point_value(style: &str, property: &str) -> f64 {
1084 let re = Regex::new(&format!(
1085 r"(?i){}\s*:\s*(-?\d+(?:\.\d+)?)pt",
1086 regex::escape(property)
1087 ))
1088 .expect("valid regex");
1089 re.captures(style)
1090 .and_then(|caps| caps.get(1))
1091 .and_then(|value| value.as_str().parse::<f64>().ok())
1092 .unwrap_or(0.0)
1093}
1094
/// Decodes `%XX` escapes in `input` and returns the result as UTF-8 text.
///
/// A `%` that is not followed by two hexadecimal digits is copied through
/// unchanged. Byte sequences that are not valid UTF-8 after decoding are
/// replaced with U+FFFD.
fn percent_decode_utf8_lossy(input: &str) -> String {
    let hex_value = |byte: u8| (byte as char).to_digit(16);

    let mut decoded = Vec::with_capacity(input.len());
    let mut rest = input.as_bytes();
    while let Some((&first, tail)) = rest.split_first() {
        if first == b'%' {
            if let [hi, lo, ..] = tail {
                if let (Some(hi), Some(lo)) = (hex_value(*hi), hex_value(*lo)) {
                    if let Ok(byte) = u8::try_from((hi << 4) | lo) {
                        decoded.push(byte);
                        rest = &tail[2..];
                        continue;
                    }
                }
            }
        }
        decoded.push(first);
        rest = tail;
    }
    String::from_utf8_lossy(&decoded).into_owned()
}
1118
/// Removes the backslash from escaped `.`, `!`, `(`, `)`, `[` and `]`.
fn unescape_public_export_punctuation(markdown: &str) -> String {
    const ESCAPES: [(&str, &str); 6] = [
        ("\\.", "."),
        ("\\!", "!"),
        ("\\(", "("),
        ("\\)", ")"),
        ("\\[", "["),
        ("\\]", "]"),
    ];
    ESCAPES
        .iter()
        .fold(markdown.to_string(), |text, (escaped, plain)| {
            text.replace(escaped, plain)
        })
}
1128
/// Converts setext headings (a text line underlined with `=` or `-`) to ATX form.
///
/// A non-blank line followed by an underline of at least five `=` becomes
/// `# text`; with at least five `-` it becomes `## text`. An underline that
/// follows a blank line is left alone, because it is a thematic break rather
/// than a heading and converting it would yield an empty heading. Lines are
/// re-joined with `\n`, so a trailing newline is not preserved.
fn convert_setext_headings(markdown: &str) -> String {
    // Same rule as the file's underline check: five or more marker characters.
    fn underlined_with(line: &str, marker: char) -> bool {
        line.len() >= 5 && line.chars().all(|ch| ch == marker)
    }

    let lines: Vec<&str> = markdown.lines().collect();
    let mut out = Vec::with_capacity(lines.len());
    let mut index = 0;
    while index < lines.len() {
        let title = lines[index].trim();
        let underline = lines.get(index + 1).map_or("", |next| next.trim());
        let prefix = if title.is_empty() {
            // Nothing to promote: the "underline" is a standalone rule.
            None
        } else if underlined_with(underline, '=') {
            Some("# ")
        } else if underlined_with(underline, '-') {
            Some("## ")
        } else {
            None
        };

        match prefix {
            Some(prefix) => {
                out.push(format!("{prefix}{title}"));
                // Skip the underline as well.
                index += 2;
            }
            None => {
                out.push(lines[index].to_string());
                index += 1;
            }
        }
    }
    out.join("\n")
}
1152
/// Returns `true` when `line` is at least five bytes long and consists solely
/// of `marker` characters.
fn is_setext_underline(line: &str, marker: char) -> bool {
    if line.len() < 5 {
        return false;
    }
    !line.chars().any(|ch| ch != marker)
}
1156
1157fn normalize_atx_headings(markdown: &str) -> String {
1158 let heading_re = Regex::new(r"^(#{1,6})\s+(.+?)\s*$").expect("valid regex");
1159 let closing_re = closing_atx_heading_re();
1160 markdown
1161 .lines()
1162 .map(|line| {
1163 let Some(caps) = heading_re.captures(line) else {
1164 return line.to_string();
1165 };
1166 let hashes = caps.get(1).map_or("", |m| m.as_str());
1167 let mut text = caps.get(2).map_or("", |m| m.as_str()).trim().to_string();
1168 text = closing_re.replace(&text, "").trim().to_string();
1169 text = strip_wrapping_markdown_emphasis(&text);
1170 format!("{hashes} {text}")
1171 })
1172 .collect::<Vec<_>>()
1173 .join("\n")
1174}
1175
/// Removes one pair of emphasis markers (`***`, `**` or `*`, tried in that
/// order) that both starts and ends the trimmed text.
///
/// The text must be longer than the two markers combined. When no marker
/// matches, the trimmed text is returned.
fn strip_wrapping_markdown_emphasis(text: &str) -> String {
    let trimmed = text.trim();
    ["***", "**", "*"]
        .iter()
        .copied()
        .find_map(|marker| {
            if trimmed.len() <= marker.len() * 2 {
                return None;
            }
            let inner = trimmed.strip_prefix(marker)?.strip_suffix(marker)?;
            Some(inner.trim().to_string())
        })
        .unwrap_or_else(|| trimmed.to_string())
}
1190
1191fn normalize_bullet_markers(markdown: &str) -> String {
1192 let bullet_re = asterisk_bullet_re();
1193 markdown
1194 .lines()
1195 .map(|line| bullet_re.replace(line, "$1- ").into_owned())
1196 .collect::<Vec<_>>()
1197 .join("\n")
1198}
1199
1200fn normalize_list_spacing(markdown: &str) -> String {
1201 let lines: Vec<&str> = markdown.lines().collect();
1202 let mut out = Vec::with_capacity(lines.len());
1203
1204 for (index, line) in lines.iter().enumerate() {
1205 if line.trim().is_empty()
1206 && previous_non_empty_line(&lines, index).is_some_and(is_markdown_list_item)
1207 && next_non_empty_line(&lines, index).is_some_and(is_markdown_list_item)
1208 {
1209 continue;
1210 }
1211 out.push((*line).to_string());
1212 }
1213
1214 out.join("\n")
1215}
1216
/// Returns the closest line before `index` that is not blank, if any.
///
/// Panics when `index` is greater than `lines.len()`.
fn previous_non_empty_line<'a>(lines: &'a [&str], index: usize) -> Option<&'a str> {
    let before = &lines[..index];
    before
        .iter()
        .rfind(|line| !line.trim().is_empty())
        .copied()
}
1224
/// Returns the closest line after `index` that is not blank, if any.
///
/// Panics when `index + 1` is greater than `lines.len()`.
fn next_non_empty_line<'a>(lines: &'a [&str], index: usize) -> Option<&'a str> {
    let (_, after) = lines.split_at(index + 1);
    after
        .iter()
        .find(|line| !line.trim().is_empty())
        .copied()
}
1231
1232fn is_markdown_list_item(line: &str) -> bool {
1233 markdown_list_item_re().is_match(line)
1234}
1235
/// Normalises blank lines in and around block quotes.
///
/// * Blank lines and bare `>` lines inside a quote collapse into a single `>`
///   separator, emitted only when another `> ` line follows.
/// * Bare `>` lines outside a quote are dropped.
/// * The first non-quote line after a quote is preceded by one blank line.
///
/// Every emitted line ends with `\n`.
fn normalize_blockquote_spacing(markdown: &str) -> String {
    let mut out = String::with_capacity(markdown.len());
    let mut in_quote = false;
    // Set when a gap was seen inside a quote and a `>` separator is still owed.
    let mut separator_owed = false;

    for line in markdown.lines() {
        let trimmed = line.trim();
        let is_blank = trimmed.is_empty();

        // Gaps inside a quote and bare `>` lines are swallowed; they only
        // matter as a pending separator while a quote is open.
        if (is_blank && in_quote) || trimmed == ">" {
            if in_quote {
                separator_owed = true;
            }
            continue;
        }

        if line.starts_with("> ") {
            if std::mem::take(&mut separator_owed) {
                out.push_str(">\n");
            }
            in_quote = true;
        } else {
            // Leaving a quote: separate the following text with a blank line.
            if in_quote && !is_blank {
                out.push('\n');
            }
            separator_owed = false;
            in_quote = false;
        }

        out.push_str(line);
        out.push('\n');
    }

    out
}
1276
1277fn normalize_markdown_tables(markdown: &str) -> String {
1278 let lines: Vec<&str> = markdown.lines().collect();
1279 let mut out = Vec::with_capacity(lines.len());
1280 let mut index = 0;
1281
1282 while index < lines.len() {
1283 if !is_markdown_table_line(lines[index]) {
1284 out.push(lines[index].to_string());
1285 index += 1;
1286 continue;
1287 }
1288
1289 let start = index;
1290 while index < lines.len() && is_markdown_table_line(lines[index]) {
1291 index += 1;
1292 }
1293 let block = &lines[start..index];
1294 if block.len() >= 2 && is_markdown_separator_line(block[1]) {
1295 out.extend(normalize_markdown_table_block(block));
1296 } else {
1297 out.extend(block.iter().map(|line| (*line).to_string()));
1298 }
1299 }
1300
1301 out.join("\n")
1302}
1303
/// Returns `true` when the trimmed line starts and ends with `|` and contains at
/// least two pipe characters.
fn is_markdown_table_line(line: &str) -> bool {
    // After removing the leading pipe, a trailing pipe proves a second one exists.
    line.trim()
        .strip_prefix('|')
        .is_some_and(|rest| rest.ends_with('|'))
}
1308
1309fn is_markdown_separator_line(line: &str) -> bool {
1310 split_markdown_table_cells(line)
1311 .iter()
1312 .all(|cell| markdown_table_separator_cell_re().is_match(cell))
1313}
1314
1315fn normalize_markdown_table_block(lines: &[&str]) -> Vec<String> {
1316 lines
1317 .iter()
1318 .enumerate()
1319 .map(|(index, line)| {
1320 let cells = split_markdown_table_cells(line);
1321 if index == 1 {
1322 let separators = vec!["---".to_string(); cells.len()];
1323 render_markdown_table_row(&separators)
1324 } else {
1325 render_markdown_table_row(&cells)
1326 }
1327 })
1328 .collect()
1329}
1330
/// Splits a Markdown table line into its trimmed cell texts.
///
/// Surrounding whitespace and all leading/trailing `|` characters are removed before
/// splitting on `|`. The result always contains at least one (possibly empty) cell.
fn split_markdown_table_cells(line: &str) -> Vec<String> {
    let inner = line.trim().trim_matches('|');
    let mut cells = Vec::new();
    for raw_cell in inner.split('|') {
        cells.push(raw_cell.trim().to_owned());
    }
    cells
}
1338
/// Renders cells as one Markdown table row: `| a | b |`.
///
/// Cells are separated by ` | ` and the row is wrapped in `| ` and ` |`.
fn render_markdown_table_row(cells: &[String]) -> String {
    let mut row = String::from("| ");
    for (position, cell) in cells.iter().enumerate() {
        if position > 0 {
            row.push_str(" | ");
        }
        row.push_str(cell);
    }
    row.push_str(" |");
    row
}
1342
1343fn closing_atx_heading_re() -> &'static Regex {
1344 static RE: OnceLock<Regex> = OnceLock::new();
1345 RE.get_or_init(|| Regex::new(r"\s+#{1,6}$").expect("valid regex"))
1346}
1347
1348fn asterisk_bullet_re() -> &'static Regex {
1349 static RE: OnceLock<Regex> = OnceLock::new();
1350 RE.get_or_init(|| Regex::new(r"^(\s*)\* ").expect("valid regex"))
1351}
1352
1353fn markdown_list_item_re() -> &'static Regex {
1354 static RE: OnceLock<Regex> = OnceLock::new();
1355 RE.get_or_init(|| Regex::new(r"^\s*(?:[-+*]|\d+\.)\s+").expect("valid regex"))
1356}
1357
1358fn markdown_table_separator_cell_re() -> &'static Regex {
1359 static RE: OnceLock<Regex> = OnceLock::new();
1360 RE.get_or_init(|| Regex::new(r"^:?-{3,}:?$").expect("valid regex"))
1361}
1362
1363pub async fn fetch_google_doc_from_docs_api(
1369 url: &str,
1370 api_token: &str,
1371) -> crate::Result<GDocsRenderedResult> {
1372 let document_id = extract_document_id(url).ok_or_else(|| {
1373 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
1374 })?;
1375 let api_url = build_docs_api_url(&document_id);
1376 debug!(
1377 document_id = %document_id,
1378 api_url = %api_url,
1379 "fetching Google Doc via Docs API"
1380 );
1381
1382 let response = reqwest::Client::new()
1383 .get(&api_url)
1384 .header("Authorization", format!("Bearer {api_token}"))
1385 .header("Accept", "application/json")
1386 .send()
1387 .await
1388 .map_err(|e| {
1389 WebCaptureError::FetchError(format!("Failed to fetch Google Doc via Docs API: {e}"))
1390 })?;
1391 debug!(
1392 document_id = %document_id,
1393 status = response.status().as_u16(),
1394 success = response.status().is_success(),
1395 content_type = response
1396 .headers()
1397 .get(reqwest::header::CONTENT_TYPE)
1398 .and_then(|value| value.to_str().ok())
1399 .unwrap_or(""),
1400 "received Google Docs API response"
1401 );
1402
1403 if !response.status().is_success() {
1404 return Err(WebCaptureError::FetchError(format!(
1405 "Failed to fetch Google Doc via Docs API ({} {}): {}",
1406 response.status().as_u16(),
1407 response.status().canonical_reason().unwrap_or("Unknown"),
1408 api_url
1409 )));
1410 }
1411
1412 let body = response.text().await.map_err(|e| {
1413 WebCaptureError::FetchError(format!("Failed to read Google Docs API response: {e}"))
1414 })?;
1415 let document = serde_json::from_str::<Value>(&body).map_err(|e| {
1416 WebCaptureError::ParseError(format!("Failed to parse Google Docs API response: {e}"))
1417 })?;
1418 let rendered = render_docs_api_document(&document);
1419 debug!(
1420 document_id = %document_id,
1421 title = document.get("title").and_then(|value| value.as_str()).unwrap_or(""),
1422 markdown_bytes = rendered.markdown.len(),
1423 html_bytes = rendered.html.len(),
1424 text_bytes = rendered.text.len(),
1425 "rendered Google Docs API document"
1426 );
1427
1428 Ok(GDocsRenderedResult {
1429 markdown: rendered.markdown,
1430 html: rendered.html,
1431 text: rendered.text,
1432 document_id,
1433 export_url: api_url,
1434 remote_images: Vec::new(),
1435 })
1436}
1437
1438pub async fn fetch_google_doc_from_model(
1444 url: &str,
1445 api_token: Option<&str>,
1446) -> crate::Result<GDocsRenderedResult> {
1447 if api_token.is_some() {
1448 return Err(WebCaptureError::BrowserError(
1449 "Rust browser-model Google Docs capture cannot inject API tokens; use --capture api for authenticated Docs API capture".to_string(),
1450 ));
1451 }
1452 let document_id = extract_document_id(url).ok_or_else(|| {
1453 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
1454 })?;
1455 let edit_url = build_edit_url(&document_id);
1456 debug!(
1457 document_id = %document_id,
1458 edit_url = %edit_url,
1459 "capturing Google Doc editor model with a real browser"
1460 );
1461 let model_data = fetch_google_doc_editor_model_with_cdp(&edit_url, &document_id).await?;
1462 let BrowserModelData {
1463 chunks,
1464 cid_urls,
1465 chunk_payload_bytes,
1466 poll_count,
1467 stable_for,
1468 } = model_data;
1469 debug!(
1470 document_id = %document_id,
1471 chunks = chunks.len(),
1472 cid_urls = cid_urls.len(),
1473 chunk_payload_bytes,
1474 poll_count,
1475 stable_for_ms = stable_for.as_millis(),
1476 "extracted Google Docs editor model chunks through CDP"
1477 );
1478 if chunks.is_empty() {
1479 return Err(WebCaptureError::ParseError(
1480 "Google Docs editor page did not expose DOCS_modelChunk data".to_string(),
1481 ));
1482 }
1483
1484 let export_html = match fetch_google_doc(url, "html", None).await {
1485 Ok(result) => Some(result.content),
1486 Err(error) => {
1487 warn!(
1488 document_id = %document_id,
1489 error = %error,
1490 "failed to fetch Google Docs export HTML for browser-model semantic hints"
1491 );
1492 None
1493 }
1494 };
1495 let capture = parse_model_chunks_with_export_html(&chunks, &cid_urls, export_html.as_deref());
1496 let remote_images = remote_images_from_capture(&capture);
1497 info!(
1498 document_id = %document_id,
1499 chunks = chunks.len(),
1500 cid_urls = cid_urls.len(),
1501 chunk_payload_bytes,
1502 poll_count,
1503 stable_for_ms = stable_for.as_millis(),
1504 blocks = capture.blocks.len(),
1505 tables = capture.tables.len(),
1506 images = capture.images.len(),
1507 text_bytes = capture.text.len(),
1508 "parsed Google Docs editor model"
1509 );
1510
1511 Ok(GDocsRenderedResult {
1512 markdown: render_captured_document(&capture, "markdown"),
1513 html: render_captured_document(&capture, "html"),
1514 text: render_captured_document(&capture, "txt"),
1515 document_id,
1516 export_url: edit_url,
1517 remote_images,
1518 })
1519}
1520
1521async fn fetch_google_doc_editor_model_with_cdp(
1522 edit_url: &str,
1523 document_id: &str,
1524) -> crate::Result<BrowserModelData> {
1525 let chrome = crate::browser::find_chrome_executable().ok_or_else(|| {
1526 WebCaptureError::BrowserError(
1527 "Chrome/Chromium executable was not found. Set WEB_CAPTURE_CHROME, CHROME_PATH, or GOOGLE_CHROME_BIN.".to_string(),
1528 )
1529 })?;
1530 let user_data_dir = crate::browser::temporary_user_data_dir();
1531 std::fs::create_dir_all(&user_data_dir)?;
1532
1533 debug!(
1534 document_id = %document_id,
1535 chrome = %chrome.display(),
1536 user_data_dir = %user_data_dir.display(),
1537 edit_url = %edit_url,
1538 "launching headless Chrome CDP session for Google Docs model capture"
1539 );
1540
1541 let mut child = launch_cdp_chrome(&chrome, &user_data_dir)?;
1542 let capture_result = async {
1543 let ws_url = wait_for_devtools_ws_url(&mut child).await?;
1544 let (mut ws, _) = connect_async(&ws_url).await.map_err(|error| {
1545 WebCaptureError::BrowserError(format!(
1546 "Failed to connect to Chrome DevTools websocket: {error}"
1547 ))
1548 })?;
1549 let mut next_id = 0u64;
1550 let session_id = navigate_google_docs_cdp_page(&mut ws, &mut next_id, edit_url).await?;
1551 wait_for_google_docs_model_chunks(&mut ws, &mut next_id, &session_id, document_id).await
1552 }
1553 .await;
1554
1555 if let Err(error) = child.kill().await {
1556 debug!(
1557 document_id = %document_id,
1558 error = %error,
1559 "failed to kill Chrome CDP browser process"
1560 );
1561 }
1562 let _ = child.wait().await;
1563 let _ = std::fs::remove_dir_all(&user_data_dir);
1564
1565 capture_result
1566}
1567
1568async fn navigate_google_docs_cdp_page(
1569 ws: &mut CdpWebSocket,
1570 next_id: &mut u64,
1571 edit_url: &str,
1572) -> crate::Result<String> {
1573 let target = cdp_send(
1574 ws,
1575 next_id,
1576 None,
1577 "Target.createTarget",
1578 serde_json::json!({ "url": "about:blank" }),
1579 )
1580 .await?;
1581 let target_id = target
1582 .get("targetId")
1583 .and_then(Value::as_str)
1584 .ok_or_else(|| {
1585 WebCaptureError::BrowserError(
1586 "Chrome DevTools Target.createTarget did not return targetId".to_string(),
1587 )
1588 })?
1589 .to_string();
1590 let attached = cdp_send(
1591 ws,
1592 next_id,
1593 None,
1594 "Target.attachToTarget",
1595 serde_json::json!({ "targetId": target_id, "flatten": true }),
1596 )
1597 .await?;
1598 let session_id = attached
1599 .get("sessionId")
1600 .and_then(Value::as_str)
1601 .ok_or_else(|| {
1602 WebCaptureError::BrowserError(
1603 "Chrome DevTools Target.attachToTarget did not return sessionId".to_string(),
1604 )
1605 })?
1606 .to_string();
1607
1608 cdp_send(
1609 ws,
1610 next_id,
1611 Some(&session_id),
1612 "Page.enable",
1613 serde_json::json!({}),
1614 )
1615 .await?;
1616 cdp_send(
1617 ws,
1618 next_id,
1619 Some(&session_id),
1620 "Runtime.enable",
1621 serde_json::json!({}),
1622 )
1623 .await?;
1624 cdp_send(
1625 ws,
1626 next_id,
1627 Some(&session_id),
1628 "Page.addScriptToEvaluateOnNewDocument",
1629 serde_json::json!({ "source": GDOCS_MODEL_CAPTURE_INIT_SCRIPT }),
1630 )
1631 .await?;
1632 cdp_send(
1633 ws,
1634 next_id,
1635 Some(&session_id),
1636 "Page.navigate",
1637 serde_json::json!({ "url": edit_url }),
1638 )
1639 .await?;
1640
1641 Ok(session_id)
1642}
1643
1644async fn wait_for_google_docs_model_chunks(
1645 ws: &mut CdpWebSocket,
1646 next_id: &mut u64,
1647 session_id: &str,
1648 document_id: &str,
1649) -> crate::Result<BrowserModelData> {
1650 let started = Instant::now();
1651 let max_wait = gdocs_editor_model_max_wait();
1652 let stability_window = gdocs_editor_model_stability_window();
1653 let mut quiescence = BrowserModelQuiescence::default();
1654 let mut last_chunks = 0usize;
1655 let mut last_cid_urls = 0usize;
1656 let mut last_payload_bytes = 0usize;
1657 let mut last_stable_for = Duration::ZERO;
1658 let mut poll_count = 0usize;
1659
1660 while started.elapsed() < max_wait {
1661 let result = cdp_send(
1662 ws,
1663 next_id,
1664 Some(session_id),
1665 "Runtime.evaluate",
1666 serde_json::json!({
1667 "expression": format!("({GDOCS_MODEL_EXTRACT_SCRIPT})()"),
1668 "returnByValue": true,
1669 "awaitPromise": true
1670 }),
1671 )
1672 .await?;
1673 if let Some(exception) = result.get("exceptionDetails") {
1674 return Err(WebCaptureError::BrowserError(format!(
1675 "Google Docs model extraction script failed: {exception}"
1676 )));
1677 }
1678 let value = result
1679 .pointer("/result/value")
1680 .cloned()
1681 .unwrap_or(Value::Null);
1682 let model_data = browser_model_data_from_value(&value);
1683 poll_count += 1;
1684 let fingerprint = model_data.fingerprint();
1685 last_chunks = model_data.chunks.len();
1686 last_cid_urls = model_data.cid_urls.len();
1687 last_payload_bytes = model_data.chunk_payload_bytes;
1688 let now = Instant::now();
1689 if let Some(stable_for) = quiescence.observe(fingerprint, now, stability_window) {
1690 let mut model_data = model_data;
1691 model_data.poll_count = poll_count;
1692 model_data.stable_for = stable_for;
1693 debug!(
1694 document_id = %document_id,
1695 chunks = model_data.chunks.len(),
1696 cid_urls = model_data.cid_urls.len(),
1697 chunk_payload_bytes = model_data.chunk_payload_bytes,
1698 poll_count,
1699 stable_for_ms = stable_for.as_millis(),
1700 elapsed_ms = started.elapsed().as_millis(),
1701 "captured quiesced Google Docs model chunks through CDP Runtime.evaluate"
1702 );
1703 return Ok(model_data);
1704 }
1705 last_stable_for = quiescence.stable_for(now);
1706 tokio::time::sleep(GDOCS_EDITOR_MODEL_POLL_INTERVAL).await;
1707 }
1708
1709 Err(WebCaptureError::BrowserError(format!(
1710 "Timed out waiting for Google Docs DOCS_modelChunk stream to quiesce for document {document_id} after {} ms (last chunks={last_chunks}, payload_bytes={last_payload_bytes}, cid_urls={last_cid_urls}, poll_count={poll_count}, stable_for_ms={})",
1711 max_wait.as_millis(),
1712 last_stable_for.as_millis()
1713 )))
1714}
1715
1716fn launch_cdp_chrome(
1717 chrome: &std::path::Path,
1718 user_data_dir: &std::path::Path,
1719) -> crate::Result<Child> {
1720 let mut command = Command::new(chrome);
1721 command
1722 .args([
1723 "--headless=new",
1724 "--disable-gpu",
1725 "--disable-extensions",
1726 "--disable-dev-shm-usage",
1727 "--disable-background-networking",
1728 "--disable-component-update",
1729 "--disable-default-apps",
1730 "--disable-sync",
1731 "--metrics-recording-only",
1732 "--no-default-browser-check",
1733 "--no-first-run",
1734 "--no-sandbox",
1735 "--remote-debugging-port=0",
1736 "--window-size=1280,800",
1737 ])
1738 .arg(format!("--user-data-dir={}", user_data_dir.display()))
1739 .arg(format!("--user-agent={GDOCS_USER_AGENT}"))
1740 .stderr(Stdio::piped())
1741 .stdout(Stdio::null())
1742 .kill_on_drop(true);
1743
1744 command.spawn().map_err(|error| {
1745 WebCaptureError::BrowserError(format!("Failed to launch Chrome CDP browser: {error}"))
1746 })
1747}
1748
1749async fn wait_for_devtools_ws_url(child: &mut Child) -> crate::Result<String> {
1750 let stderr = child.stderr.take().ok_or_else(|| {
1751 WebCaptureError::BrowserError("Chrome CDP process did not expose stderr".to_string())
1752 })?;
1753 let mut lines = BufReader::new(stderr).lines();
1754 let started = Instant::now();
1755
1756 while started.elapsed() < GDOCS_BROWSER_LAUNCH_TIMEOUT {
1757 let line = tokio::time::timeout(Duration::from_millis(250), lines.next_line()).await;
1758 match line {
1759 Ok(Ok(Some(line))) => {
1760 if let Some((_, ws_url)) = line.split_once("DevTools listening on ") {
1761 return Ok(ws_url.trim().to_string());
1762 }
1763 }
1764 Ok(Ok(None)) => {
1765 break;
1766 }
1767 Ok(Err(error)) => {
1768 return Err(WebCaptureError::BrowserError(format!(
1769 "Failed to read Chrome CDP stderr: {error}"
1770 )));
1771 }
1772 Err(_) => {}
1773 }
1774 }
1775
1776 Err(WebCaptureError::BrowserError(format!(
1777 "Timed out waiting for Chrome DevTools websocket URL after {} ms",
1778 GDOCS_BROWSER_LAUNCH_TIMEOUT.as_millis()
1779 )))
1780}
1781
1782async fn cdp_send(
1783 ws: &mut CdpWebSocket,
1784 next_id: &mut u64,
1785 session_id: Option<&str>,
1786 method: &str,
1787 params: Value,
1788) -> crate::Result<Value> {
1789 *next_id += 1;
1790 let id = *next_id;
1791 let mut message = serde_json::json!({
1792 "id": id,
1793 "method": method,
1794 "params": params
1795 });
1796 if let Some(session_id) = session_id {
1797 message["sessionId"] = Value::String(session_id.to_string());
1798 }
1799
1800 ws.send(Message::Text(message.to_string()))
1801 .await
1802 .map_err(|error| {
1803 WebCaptureError::BrowserError(format!(
1804 "Failed to send Chrome DevTools command {method}: {error}"
1805 ))
1806 })?;
1807
1808 while let Some(message) = ws.next().await {
1809 let message = message.map_err(|error| {
1810 WebCaptureError::BrowserError(format!(
1811 "Failed to read Chrome DevTools response for {method}: {error}"
1812 ))
1813 })?;
1814 if !message.is_text() {
1815 continue;
1816 }
1817 let text = message.to_text().map_err(|error| {
1818 WebCaptureError::BrowserError(format!(
1819 "Chrome DevTools response for {method} was not text: {error}"
1820 ))
1821 })?;
1822 let value = serde_json::from_str::<Value>(text).map_err(|error| {
1823 WebCaptureError::ParseError(format!(
1824 "Failed to parse Chrome DevTools response for {method}: {error}; response={text}"
1825 ))
1826 })?;
1827 if value.get("id").and_then(Value::as_u64) != Some(id) {
1828 continue;
1829 }
1830 if let Some(error) = value.get("error") {
1831 return Err(WebCaptureError::BrowserError(format!(
1832 "Chrome DevTools command {method} failed: {error}"
1833 )));
1834 }
1835 return Ok(value.get("result").cloned().unwrap_or(Value::Null));
1836 }
1837
1838 Err(WebCaptureError::BrowserError(format!(
1839 "Chrome DevTools websocket closed before response for {method}"
1840 )))
1841}
1842
1843fn browser_model_data_from_value(value: &Value) -> BrowserModelData {
1844 let chunks = value
1845 .get("chunks")
1846 .and_then(Value::as_array)
1847 .cloned()
1848 .unwrap_or_default();
1849 let chunk_payload_bytes = model_chunk_payload_bytes(&chunks);
1850 let cid_urls = value
1851 .get("cidUrlMap")
1852 .and_then(Value::as_object)
1853 .map(|map| {
1854 map.iter()
1855 .filter_map(|(key, value)| value.as_str().map(|url| (key.clone(), url.to_string())))
1856 .collect::<HashMap<_, _>>()
1857 })
1858 .unwrap_or_default();
1859 BrowserModelData {
1860 chunks,
1861 cid_urls,
1862 chunk_payload_bytes,
1863 poll_count: 0,
1864 stable_for: Duration::ZERO,
1865 }
1866}
1867
1868fn model_chunk_payload_bytes(chunks: &[Value]) -> usize {
1869 chunks
1870 .iter()
1871 .map(|chunk| serde_json::to_vec(chunk).map_or(0, |encoded| encoded.len()))
1872 .sum()
1873}
1874
1875fn gdocs_editor_model_max_wait() -> Duration {
1876 duration_from_env_ms(
1877 "WEB_CAPTURE_GDOCS_MAX_WAIT_MS",
1878 GDOCS_EDITOR_MODEL_MAX_WAIT_DEFAULT,
1879 )
1880}
1881
1882fn gdocs_editor_model_stability_window() -> Duration {
1883 duration_from_env_ms(
1884 "WEB_CAPTURE_GDOCS_STABILITY_MS",
1885 GDOCS_EDITOR_MODEL_STABILITY_DEFAULT,
1886 )
1887}
1888
1889fn duration_from_env_ms(name: &str, default: Duration) -> Duration {
1890 std::env::var(name).map_or(default, |value| match value.trim().parse::<u64>() {
1891 Ok(ms) => Duration::from_millis(ms),
1892 Err(error) => {
1893 warn!(
1894 name,
1895 value,
1896 error = %error,
1897 default_ms = default.as_millis(),
1898 "ignoring invalid Google Docs model wait environment variable"
1899 );
1900 default
1901 }
1902 })
1903}
1904
1905fn remote_images_from_capture(capture: &CapturedDocument) -> Vec<RemoteImage> {
1906 capture
1907 .images
1908 .iter()
1909 .filter_map(|node| match node {
1910 ContentNode::Image {
1911 url: Some(url),
1912 alt,
1913 ..
1914 } => Some(RemoteImage {
1915 url: url.clone(),
1916 alt: alt.clone(),
1917 }),
1918 ContentNode::Image { .. } | ContentNode::Text { .. } => None,
1919 })
1920 .collect()
1921}
1922
1923#[must_use]
1925pub fn render_docs_api_document(document: &Value) -> GDocsRenderedOutput {
1926 let blocks = structural_elements_to_blocks(
1927 document
1928 .pointer("/body/content")
1929 .and_then(Value::as_array)
1930 .map_or(&[] as &[Value], Vec::as_slice),
1931 document.pointer("/inlineObjects").unwrap_or(&Value::Null),
1932 );
1933 GDocsRenderedOutput {
1934 markdown: render_blocks_markdown(&blocks),
1935 html: render_blocks_html(&blocks),
1936 text: blocks_to_text(&blocks),
1937 }
1938}
1939
/// A Google Docs document rendered in three output formats.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GDocsRenderedOutput {
    /// Markdown rendering of the document blocks.
    pub markdown: String,
    /// HTML rendering of the document blocks.
    pub html: String,
    /// Plain-text rendering of the document blocks.
    pub text: String,
}
1950
1951fn structural_elements_to_blocks(elements: &[Value], inline_objects: &Value) -> Vec<CapturedBlock> {
1952 let mut blocks = Vec::new();
1953 for element in elements {
1954 if let Some(paragraph) = element.get("paragraph") {
1955 let content = paragraph_to_content(paragraph, inline_objects);
1956 if !content_to_text(&content).trim().is_empty()
1957 || content
1958 .iter()
1959 .any(|node| matches!(node, ContentNode::Image { .. }))
1960 {
1961 blocks.push(CapturedBlock::Paragraph {
1962 style: paragraph
1963 .pointer("/paragraphStyle/namedStyleType")
1964 .and_then(Value::as_str)
1965 .map(ToString::to_string),
1966 list: None,
1967 quote: false,
1968 horizontal_rule: false,
1969 content,
1970 });
1971 }
1972 } else if let Some(table) = element.get("table") {
1973 blocks.push(CapturedBlock::Table(table_to_block(table, inline_objects)));
1974 }
1975 }
1976 blocks
1977}
1978
1979fn table_to_block(table: &Value, inline_objects: &Value) -> TableBlock {
1980 let rows = table
1981 .get("tableRows")
1982 .and_then(Value::as_array)
1983 .map_or(&[] as &[Value], Vec::as_slice)
1984 .iter()
1985 .map(|row| TableRow {
1986 cells: row
1987 .get("tableCells")
1988 .and_then(Value::as_array)
1989 .map_or(&[] as &[Value], Vec::as_slice)
1990 .iter()
1991 .map(|cell| TableCell {
1992 content: structural_elements_to_inline_content(
1993 cell.get("content")
1994 .and_then(Value::as_array)
1995 .map_or(&[] as &[Value], Vec::as_slice),
1996 inline_objects,
1997 ),
1998 })
1999 .collect(),
2000 })
2001 .collect();
2002 TableBlock { rows }
2003}
2004
2005fn structural_elements_to_inline_content(
2006 elements: &[Value],
2007 inline_objects: &Value,
2008) -> Vec<ContentNode> {
2009 let mut content = Vec::new();
2010 for element in elements {
2011 if let Some(paragraph) = element.get("paragraph") {
2012 let paragraph_content = paragraph_to_content(paragraph, inline_objects);
2013 if !content.is_empty() && !paragraph_content.is_empty() {
2014 append_text(&mut content, "\n");
2015 }
2016 content.extend(paragraph_content);
2017 } else if let Some(table) = element.get("table") {
2018 append_text(
2019 &mut content,
2020 &render_blocks_markdown(&[CapturedBlock::Table(table_to_block(
2021 table,
2022 inline_objects,
2023 ))]),
2024 );
2025 }
2026 }
2027 content
2028}
2029
2030fn paragraph_to_content(paragraph: &Value, inline_objects: &Value) -> Vec<ContentNode> {
2031 let mut content = Vec::new();
2032 for element in paragraph
2033 .get("elements")
2034 .and_then(Value::as_array)
2035 .map_or(&[] as &[Value], Vec::as_slice)
2036 {
2037 if let Some(text) = element
2038 .pointer("/textRun/content")
2039 .and_then(Value::as_str)
2040 .map(|text| text.strip_suffix('\n').unwrap_or(text))
2041 {
2042 append_text(&mut content, text);
2043 } else if let Some(inline_id) = element
2044 .pointer("/inlineObjectElement/inlineObjectId")
2045 .and_then(Value::as_str)
2046 {
2047 if let Some(image) = inline_object_to_image(inline_id, inline_objects) {
2048 content.push(image);
2049 }
2050 }
2051 }
2052 content
2053}
2054
2055fn inline_object_to_image(inline_id: &str, inline_objects: &Value) -> Option<ContentNode> {
2056 let embedded = inline_objects
2057 .get(inline_id)?
2058 .pointer("/inlineObjectProperties/embeddedObject")?;
2059 let url = embedded
2060 .pointer("/imageProperties/contentUri")
2061 .or_else(|| embedded.pointer("/imageProperties/sourceUri"))
2062 .and_then(Value::as_str)?;
2063 let alt = embedded
2064 .get("title")
2065 .or_else(|| embedded.get("description"))
2066 .and_then(Value::as_str)
2067 .unwrap_or("image");
2068 Some(ContentNode::Image {
2069 cid: None,
2070 url: Some(url.to_string()),
2071 alt: alt.to_string(),
2072 width: json_dimension_to_string(embedded.pointer("/size/width/magnitude")),
2073 height: json_dimension_to_string(embedded.pointer("/size/height/magnitude")),
2074 is_suggestion: false,
2075 })
2076}
2077
2078fn json_dimension_to_string(value: Option<&Value>) -> Option<String> {
2079 match value? {
2080 Value::Number(number) => Some(number.to_string()),
2081 Value::String(text) if !text.is_empty() => Some(text.clone()),
2082 _ => None,
2083 }
2084}
2085
2086fn build_model_style_maps(
2087 items: &[Value],
2088 text_len: usize,
2089 utf16_position_map: &[usize],
2090) -> ModelStyleMaps {
2091 let mut maps = ModelStyleMaps {
2092 inline_styles: vec![TextStyle::default(); text_len],
2093 ..ModelStyleMaps::default()
2094 };
2095
2096 for item in items {
2097 if item.get("ty").and_then(Value::as_str) != Some("as") {
2098 continue;
2099 }
2100 let (Some(start), Some(end), Some(style_type)) = (
2101 item.get("si").and_then(Value::as_u64),
2102 item.get("ei").and_then(Value::as_u64),
2103 item.get("st").and_then(Value::as_str),
2104 ) else {
2105 continue;
2106 };
2107 let (Ok(start), Ok(end)) = (usize::try_from(start), usize::try_from(end)) else {
2108 continue;
2109 };
2110
2111 let start = utf16_position_to_char_position(utf16_position_map, start);
2112 let end = utf16_position_to_char_position(utf16_position_map, end);
2113 if start == 0 || end == 0 {
2114 continue;
2115 }
2116
2117 match style_type {
2118 "text" => {
2119 let style = text_style(item);
2120 apply_inline_style(&mut maps.inline_styles, start, end, &style);
2121 }
2122 "link" => {
2123 let style = TextStyle {
2124 link: item
2125 .pointer("/sm/lnks_link/ulnk_url")
2126 .and_then(Value::as_str)
2127 .map(ToString::to_string),
2128 ..TextStyle::default()
2129 };
2130 apply_inline_style(&mut maps.inline_styles, start, end, &style);
2131 }
2132 "paragraph" => {
2133 maps.paragraph_by_end
2134 .insert(end, paragraph_style_from_model(item));
2135 }
2136 "list" => {
2137 maps.list_by_end.insert(
2138 end,
2139 ListMeta {
2140 id: item
2141 .pointer("/sm/ls_id")
2142 .and_then(Value::as_str)
2143 .unwrap_or("")
2144 .to_string(),
2145 level: item
2146 .pointer("/sm/ls_nest")
2147 .and_then(Value::as_u64)
2148 .and_then(|value| usize::try_from(value).ok())
2149 .unwrap_or(0),
2150 ordered: false,
2151 },
2152 );
2153 }
2154 "horizontal_rule" => {
2155 maps.horizontal_rules.insert(end);
2156 }
2157 _ => {}
2158 }
2159 }
2160
2161 maps
2162}
2163
2164fn apply_inline_style(styles: &mut [TextStyle], start: usize, end: usize, patch: &TextStyle) {
2165 let from = start.saturating_sub(1);
2166 let to = end.min(styles.len());
2167 if from >= to {
2168 return;
2169 }
2170 for style in &mut styles[from..to] {
2171 if patch.bold {
2172 style.bold = true;
2173 }
2174 if patch.italic {
2175 style.italic = true;
2176 }
2177 if patch.strike {
2178 style.strike = true;
2179 }
2180 if patch.link.is_some() {
2181 style.link.clone_from(&patch.link);
2182 }
2183 }
2184}
2185
2186fn text_style(item: &Value) -> TextStyle {
2187 TextStyle {
2188 bold: item.pointer("/sm/ts_bd").and_then(Value::as_bool) == Some(true)
2189 && item.pointer("/sm/ts_bd_i").and_then(Value::as_bool) != Some(true),
2190 italic: item.pointer("/sm/ts_it").and_then(Value::as_bool) == Some(true)
2191 && item.pointer("/sm/ts_it_i").and_then(Value::as_bool) != Some(true),
2192 strike: item.pointer("/sm/ts_st").and_then(Value::as_bool) == Some(true)
2193 && item.pointer("/sm/ts_st_i").and_then(Value::as_bool) != Some(true),
2194 link: None,
2195 }
2196}
2197
2198fn paragraph_style_from_model(item: &Value) -> ParagraphStyle {
2199 let heading = item.pointer("/sm/ps_hd").and_then(Value::as_u64);
2200 ParagraphStyle {
2201 style: heading.map(|level| format!("HEADING_{level}")),
2202 indent_start: item
2203 .pointer("/sm/ps_il")
2204 .and_then(Value::as_f64)
2205 .unwrap_or(0.0),
2206 indent_first_line: item
2207 .pointer("/sm/ps_ifl")
2208 .and_then(Value::as_f64)
2209 .unwrap_or(0.0),
2210 }
2211}
2212
/// Builds a lookup table from 1-based UTF-16 code-unit positions to 1-based
/// character positions.
///
/// The result has one entry per UTF-16 code unit of `text` plus a leading entry at
/// index `0`, which is always `0`. A character encoded as a surrogate pair occupies
/// two consecutive entries holding the same character position.
fn build_utf16_position_map(text: &str) -> Vec<usize> {
    let mut map = Vec::with_capacity(text.encode_utf16().count() + 1);
    map.push(0);
    for (char_index, ch) in text.chars().enumerate() {
        // Append one entry per code unit, all pointing at this character.
        let filled = map.len() + ch.len_utf16();
        map.resize(filled, char_index + 1);
    }
    map
}
2227
/// Translates a UTF-16 position into a character position using `map`.
///
/// Returns the entry at `position` when it exists and is non-zero. Otherwise
/// (position out of range, or an entry of `0`) it falls back to the last non-zero
/// entry of `map`, or `0` when there is none.
fn utf16_position_to_char_position(map: &[usize], position: usize) -> usize {
    match map.get(position) {
        Some(&char_position) if char_position > 0 => char_position,
        _ => map
            .iter()
            .rev()
            .copied()
            .find(|&candidate| candidate > 0)
            .unwrap_or(0),
    }
}
2235
2236#[must_use]
2238pub fn parse_model_chunks<S: BuildHasher>(
2239 chunks: &[Value],
2240 cid_urls: &HashMap<String, String, S>,
2241) -> CapturedDocument {
2242 parse_model_chunks_with_export_html(chunks, cid_urls, None)
2243}
2244
2245#[must_use]
2248#[allow(clippy::too_many_lines)]
2249pub fn parse_model_chunks_with_export_html<S: BuildHasher>(
2250 chunks: &[Value],
2251 cid_urls: &HashMap<String, String, S>,
2252 export_html: Option<&str>,
2253) -> CapturedDocument {
2254 let items = collect_model_items(chunks);
2255 let full_text = items
2256 .iter()
2257 .filter(|item| matches!(item.get("ty").and_then(Value::as_str), Some("is" | "iss")))
2258 .filter_map(|item| item.get("s").and_then(Value::as_str))
2259 .collect::<String>();
2260 let chars: Vec<char> = full_text.chars().collect();
2261 let utf16_position_map = build_utf16_position_map(&full_text);
2262 let style_maps = build_model_style_maps(&items, chars.len(), &utf16_position_map);
2263
2264 let mut positions = HashMap::new();
2265 for item in &items {
2266 if matches!(item.get("ty").and_then(Value::as_str), Some("te" | "ste")) {
2267 if let (Some(id), Some(pos)) = (
2268 item.get("id").and_then(Value::as_str),
2269 item.get("spi").and_then(Value::as_u64),
2270 ) {
2271 if let Ok(pos) = usize::try_from(pos) {
2272 positions.insert(
2273 id.to_string(),
2274 utf16_position_to_char_position(&utf16_position_map, pos).saturating_sub(1),
2275 );
2276 }
2277 }
2278 }
2279 }
2280
2281 let mut images_by_pos: HashMap<usize, ContentNode> = HashMap::new();
2282 let mut images = Vec::new();
2283 for item in &items {
2284 let ty = item.get("ty").and_then(Value::as_str);
2285 if !matches!(ty, Some("ae" | "ase")) {
2286 continue;
2287 }
2288 let Some(id) = item.get("id").and_then(Value::as_str) else {
2289 continue;
2290 };
2291 let Some(pos) = positions.get(id).copied() else {
2292 continue;
2293 };
2294 let cid = item
2295 .pointer("/epm/ee_eo/i_cid")
2296 .and_then(Value::as_str)
2297 .map(ToString::to_string);
2298 let node = ContentNode::Image {
2299 url: cid.as_ref().and_then(|cid| cid_urls.get(cid).cloned()),
2300 cid,
2301 alt: item
2302 .pointer("/epm/ee_eo/eo_ad")
2303 .and_then(Value::as_str)
2304 .unwrap_or_else(|| {
2305 if ty == Some("ase") {
2306 "suggested image"
2307 } else {
2308 "image"
2309 }
2310 })
2311 .to_string(),
2312 width: json_dimension_to_string(item.pointer("/epm/ee_eo/i_wth")),
2313 height: json_dimension_to_string(item.pointer("/epm/ee_eo/i_ht")),
2314 is_suggestion: ty == Some("ase"),
2315 };
2316 images_by_pos.insert(pos, node.clone());
2317 images.push(node);
2318 }
2319
2320 let mut blocks = Vec::new();
2321 let mut tables = Vec::new();
2322 let mut paragraph = Vec::new();
2323 let mut table: Option<TableBlock> = None;
2324 let mut row: Option<TableRow> = None;
2325 let mut cell: Option<TableCell> = None;
2326 let mut previous_table_control: Option<u32> = None;
2327 let mut skip_next_table_newline = false;
2328
2329 for (idx, ch) in chars.iter().copied().enumerate() {
2330 match ch as u32 {
2331 0x10 => {
2332 flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
2333 table = Some(TableBlock::default());
2334 previous_table_control = Some(0x10);
2335 skip_next_table_newline = false;
2336 }
2337 0x11 => {
2338 flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
2339 previous_table_control = None;
2340 skip_next_table_newline = false;
2341 }
2342 0x12 => {
2343 flush_row(&mut row, &mut cell, table.as_mut(), true);
2344 row = Some(TableRow::default());
2345 previous_table_control = Some(0x12);
2346 skip_next_table_newline = false;
2347 }
2348 0x1c => {
2349 if cell.as_ref().is_none_or(cell_is_empty) && previous_table_control == Some(0x0a) {
2350 previous_table_control = Some(0x1c);
2351 continue;
2352 }
2353 let had_content = cell.as_ref().is_some_and(|cell| !cell_is_empty(cell));
2354 flush_cell(&mut row, &mut cell, false);
2355 if row.is_none() {
2356 row = Some(TableRow::default());
2357 }
2358 cell = Some(TableCell::default());
2359 if had_content && chars.get(idx + 1).is_some_and(|ch| *ch as u32 == 0x0a) {
2360 skip_next_table_newline = true;
2361 }
2362 previous_table_control = Some(0x1c);
2363 }
2364 0x0a => {
2365 if table.is_some() {
2366 if skip_next_table_newline {
2367 skip_next_table_newline = false;
2368 previous_table_control = Some(0x0a);
2369 continue;
2370 }
2371 flush_cell(&mut row, &mut cell, false);
2374 if row.is_none() {
2375 row = Some(TableRow::default());
2376 }
2377 cell = Some(TableCell::default());
2378 previous_table_control = Some(0x0a);
2379 } else {
2380 flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
2381 }
2382 }
2383 0x0b => {
2384 append_to_current(
2385 &mut paragraph,
2386 &mut row,
2387 &mut cell,
2388 table.is_some(),
2389 "\n",
2390 TextStyle::default(),
2391 );
2392 previous_table_control = None;
2393 skip_next_table_newline = false;
2394 }
2395 _ => {
2396 if let Some(image) = images_by_pos.get(&idx).cloned() {
2397 push_to_current(&mut paragraph, &mut row, &mut cell, table.is_some(), image);
2398 previous_table_control = None;
2399 skip_next_table_newline = false;
2400 if ch == '*' {
2401 continue;
2402 }
2403 }
2404 append_to_current(
2405 &mut paragraph,
2406 &mut row,
2407 &mut cell,
2408 table.is_some(),
2409 &ch.to_string(),
2410 style_maps
2411 .inline_styles
2412 .get(idx)
2413 .cloned()
2414 .unwrap_or_default(),
2415 );
2416 previous_table_control = None;
2417 skip_next_table_newline = false;
2418 }
2419 }
2420 }
2421
2422 if table.is_some() {
2423 flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
2424 }
2425 flush_paragraph(&mut paragraph, &mut blocks, Some(chars.len()), &style_maps);
2426
2427 let mut capture = CapturedDocument {
2428 text: blocks_to_text(&blocks),
2429 blocks,
2430 tables,
2431 images,
2432 };
2433 if let Some(export_html) = export_html {
2434 apply_export_semantic_hints(&mut capture.blocks, export_html);
2435 capture.text = blocks_to_text(&capture.blocks);
2436 }
2437 capture
2438}
2439
2440fn collect_model_items(chunks: &[Value]) -> Vec<Value> {
2441 let mut items = Vec::new();
2442 for chunk in chunks {
2443 if let Some(array) = chunk.as_array() {
2444 items.extend(array.iter().cloned());
2445 } else if let Some(array) = chunk.get("chunk").and_then(Value::as_array) {
2446 items.extend(array.iter().cloned());
2447 } else if chunk.get("ty").and_then(Value::as_str).is_some() {
2448 items.push(chunk.clone());
2449 }
2450 }
2451 items
2452}
2453
2454fn flush_paragraph(
2455 paragraph: &mut Vec<ContentNode>,
2456 blocks: &mut Vec<CapturedBlock>,
2457 end_pos: Option<usize>,
2458 style_maps: &ModelStyleMaps,
2459) {
2460 if !content_to_text(paragraph).trim().is_empty()
2461 || paragraph
2462 .iter()
2463 .any(|node| matches!(node, ContentNode::Image { .. }))
2464 {
2465 let meta =
2466 paragraph_meta_for_end_position(style_maps, end_pos, content_to_text(paragraph).trim());
2467 blocks.push(CapturedBlock::Paragraph {
2468 content: std::mem::take(paragraph),
2469 style: meta.style,
2470 list: meta.list,
2471 quote: meta.quote,
2472 horizontal_rule: meta.horizontal_rule,
2473 });
2474 } else {
2475 paragraph.clear();
2476 }
2477}
2478
2479fn paragraph_meta_for_end_position(
2480 style_maps: &ModelStyleMaps,
2481 end_pos: Option<usize>,
2482 text: &str,
2483) -> ParagraphMeta {
2484 let Some(end_pos) = end_pos else {
2485 return ParagraphMeta::default();
2486 };
2487 let paragraph_style = style_maps.paragraph_by_end.get(&end_pos);
2488 let mut meta = ParagraphMeta {
2489 style: paragraph_style.and_then(|style| style.style.clone()),
2490 ..ParagraphMeta::default()
2491 };
2492
2493 if let Some(list) = style_maps.list_by_end.get(&end_pos) {
2494 let mut list = list.clone();
2495 list.ordered = infer_ordered_list(&list, text);
2496 meta.list = Some(list);
2497 } else if paragraph_style.is_some_and(|style| {
2498 style.indent_start > 0.0
2499 && (style.indent_start - style.indent_first_line).abs() < f64::EPSILON
2500 }) {
2501 meta.quote = true;
2502 }
2503
2504 meta.horizontal_rule = (style_maps.horizontal_rules.contains(&end_pos)
2505 || end_pos
2506 .checked_sub(1)
2507 .is_some_and(|pos| style_maps.horizontal_rules.contains(&pos)))
2508 && text.trim().chars().all(|ch| ch == '-');
2509 meta
2510}
2511
2512const fn infer_ordered_list(_list: &ListMeta, _text: &str) -> bool {
2513 false
2514}
2515
2516fn apply_export_semantic_hints(blocks: &mut [CapturedBlock], export_html: &str) {
2517 let hints = extract_export_semantic_hints(export_html);
2518 let mut cursor = 0usize;
2519 for block in blocks {
2520 let CapturedBlock::Paragraph {
2521 content,
2522 list,
2523 quote,
2524 ..
2525 } = block
2526 else {
2527 continue;
2528 };
2529 let text = normalize_semantic_text(&content_to_text(content));
2530 if text.is_empty() {
2531 continue;
2532 }
2533 let Some((index, hint)) = find_next_semantic_hint(&hints, &text, cursor, list.is_some())
2534 else {
2535 continue;
2536 };
2537 cursor = index + 1;
2538 if let Some(list) = list.as_mut() {
2539 if let Some(ordered) = hint.list_ordered {
2540 list.ordered = ordered;
2541 }
2542 } else {
2543 *quote = hint.quote;
2544 }
2545 }
2546}
2547
2548fn find_next_semantic_hint<'a>(
2549 hints: &'a [ExportSemanticHint],
2550 text: &str,
2551 cursor: usize,
2552 needs_list_hint: bool,
2553) -> Option<(usize, &'a ExportSemanticHint)> {
2554 hints.iter().enumerate().skip(cursor).find(|(_, hint)| {
2555 hint.text == text
2556 && if needs_list_hint {
2557 hint.list_ordered.is_some()
2558 } else {
2559 hint.list_ordered.is_none()
2560 }
2561 })
2562}
2563
2564fn extract_export_semantic_hints(export_html: &str) -> Vec<ExportSemanticHint> {
2565 let preprocessed = preprocess_google_docs_export_html(export_html).html;
2566 let document = Html::parse_document(&preprocessed);
2567 let selector =
2568 Selector::parse("body h1,body h2,body h3,body h4,body h5,body h6,body p,body li")
2569 .expect("valid semantic hint selector");
2570 document
2571 .select(&selector)
2572 .filter_map(|element| {
2573 let tag = element.value().name();
2574 let text = export_element_semantic_text(&element);
2575 if text.is_empty() {
2576 return None;
2577 }
2578 let list_ordered = if tag == "li" {
2579 nearest_list_is_ordered(&element)
2580 } else {
2581 None
2582 };
2583 Some(ExportSemanticHint {
2584 text,
2585 list_ordered,
2586 quote: tag != "li" && has_ancestor_tag(&element, "blockquote"),
2587 })
2588 })
2589 .collect()
2590}
2591
2592fn export_element_semantic_text(element: &ElementRef<'_>) -> String {
2593 let raw_text = if element.value().name() == "li" {
2594 list_item_own_text(element)
2595 } else {
2596 element.text().collect()
2597 };
2598 normalize_semantic_text(&raw_text)
2599}
2600
2601fn list_item_own_text(element: &ElementRef<'_>) -> String {
2602 let mut text = String::new();
2603 let mut stack: Vec<_> = element.children().collect();
2604 stack.reverse();
2605
2606 while let Some(node) = stack.pop() {
2607 match node.value() {
2608 Node::Text(value) => text.push_str(value),
2609 Node::Element(child) if matches!(child.name(), "ol" | "ul") => {}
2610 Node::Element(_) => {
2611 let mut children: Vec<_> = node.children().collect();
2612 children.reverse();
2613 stack.extend(children);
2614 }
2615 _ => {}
2616 }
2617 }
2618
2619 text
2620}
2621
2622fn nearest_list_is_ordered(element: &ElementRef<'_>) -> Option<bool> {
2623 element
2624 .ancestors()
2625 .filter_map(ElementRef::wrap)
2626 .find_map(|ancestor| match ancestor.value().name() {
2627 "ol" => Some(true),
2628 "ul" => Some(false),
2629 _ => None,
2630 })
2631}
2632
2633fn has_ancestor_tag(element: &ElementRef<'_>, tag: &str) -> bool {
2634 element
2635 .ancestors()
2636 .filter_map(ElementRef::wrap)
2637 .any(|ancestor| ancestor.value().name() == tag)
2638}
2639
/// Normalizes text for comparison: non-breaking spaces count as whitespace,
/// leading/trailing whitespace is dropped, and every internal whitespace run is
/// collapsed to a single ASCII space.
fn normalize_semantic_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    let words = text
        .split(|ch: char| ch == '\u{a0}' || ch.is_whitespace())
        .filter(|word| !word.is_empty());
    for word in words {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
2646
2647fn cell_is_empty(cell: &TableCell) -> bool {
2648 cell.content.iter().all(|node| match node {
2649 ContentNode::Text { text, .. } => text.trim().is_empty(),
2650 ContentNode::Image { .. } => false,
2651 })
2652}
2653
2654fn row_is_empty(row: &TableRow) -> bool {
2655 row.cells.is_empty() || row.cells.iter().all(cell_is_empty)
2656}
2657
2658fn flush_cell(row: &mut Option<TableRow>, cell: &mut Option<TableCell>, drop_empty: bool) {
2659 if let (Some(row), Some(cell)) = (row.as_mut(), cell.take()) {
2660 if drop_empty && cell_is_empty(&cell) {
2661 return;
2662 }
2663 row.cells.push(cell);
2664 }
2665}
2666
2667fn flush_row(
2668 row: &mut Option<TableRow>,
2669 cell: &mut Option<TableCell>,
2670 table: Option<&mut TableBlock>,
2671 drop_empty_trailing_cell: bool,
2672) {
2673 flush_cell(row, cell, drop_empty_trailing_cell);
2674 if let (Some(table), Some(row)) = (table, row.take()) {
2675 table.rows.push(row);
2676 }
2677}
2678
2679fn flush_table(
2680 table: &mut Option<TableBlock>,
2681 row: &mut Option<TableRow>,
2682 cell: &mut Option<TableCell>,
2683 tables: &mut Vec<TableBlock>,
2684 blocks: &mut Vec<CapturedBlock>,
2685) {
2686 flush_row(row, cell, table.as_mut(), true);
2687 if let Some(mut table) = table.take() {
2688 while table.rows.last().is_some_and(row_is_empty) {
2691 table.rows.pop();
2692 }
2693 tables.push(table.clone());
2694 blocks.push(CapturedBlock::Table(table));
2695 }
2696}
2697
2698fn push_to_current(
2699 paragraph: &mut Vec<ContentNode>,
2700 row: &mut Option<TableRow>,
2701 cell: &mut Option<TableCell>,
2702 in_table: bool,
2703 node: ContentNode,
2704) {
2705 if in_table {
2706 if row.is_none() {
2707 *row = Some(TableRow::default());
2708 }
2709 if cell.is_none() {
2710 *cell = Some(TableCell::default());
2711 }
2712 if let Some(cell) = cell.as_mut() {
2713 cell.content.push(node);
2714 }
2715 } else {
2716 paragraph.push(node);
2717 }
2718}
2719
2720fn append_to_current(
2721 paragraph: &mut Vec<ContentNode>,
2722 row: &mut Option<TableRow>,
2723 cell: &mut Option<TableCell>,
2724 in_table: bool,
2725 text: &str,
2726 style: TextStyle,
2727) {
2728 if in_table {
2729 if row.is_none() {
2730 *row = Some(TableRow::default());
2731 }
2732 if cell.is_none() {
2733 *cell = Some(TableCell::default());
2734 }
2735 if let Some(cell) = cell.as_mut() {
2736 append_styled_text(&mut cell.content, text, style);
2737 }
2738 } else {
2739 append_styled_text(paragraph, text, style);
2740 }
2741}
2742
2743fn append_text(content: &mut Vec<ContentNode>, text: &str) {
2744 append_styled_text(content, text, TextStyle::default());
2745}
2746
2747fn append_styled_text(content: &mut Vec<ContentNode>, text: &str, style: TextStyle) {
2748 if text.is_empty() {
2749 return;
2750 }
2751 if let Some(ContentNode::Text {
2752 text: last,
2753 bold,
2754 italic,
2755 strike,
2756 link,
2757 }) = content.last_mut()
2758 {
2759 let last_style = TextStyle {
2760 bold: *bold,
2761 italic: *italic,
2762 strike: *strike,
2763 link: link.clone(),
2764 };
2765 if last_style == style {
2766 last.push_str(text);
2767 return;
2768 }
2769 }
2770 content.push(ContentNode::Text {
2771 text: text.to_string(),
2772 bold: style.bold,
2773 italic: style.italic,
2774 strike: style.strike,
2775 link: style.link,
2776 });
2777}
2778
2779#[must_use]
2781pub fn render_captured_document(capture: &CapturedDocument, format: &str) -> String {
2782 match format.to_lowercase().as_str() {
2783 "html" => render_blocks_html(&capture.blocks),
2784 "txt" | "text" => blocks_to_text(&capture.blocks),
2785 _ => render_blocks_markdown(&capture.blocks),
2786 }
2787}
2788
/// A block already rendered to Markdown, plus the facts needed to pick the
/// separator between it and its neighbours.
struct RenderedBlock {
    /// Markdown text of the block, without a trailing separator.
    markdown: String,
    /// Identifier of the list the block belongs to, if it is a list item.
    list_id: Option<String>,
    /// Whether the block was rendered as a block quote.
    quote: bool,
}
2796
2797fn render_blocks_markdown(blocks: &[CapturedBlock]) -> String {
2798 let mut counters: HashMap<(String, usize), usize> = HashMap::new();
2803 let mut rendered: Vec<RenderedBlock> = Vec::new();
2804
2805 for block in blocks {
2806 match block {
2807 CapturedBlock::Paragraph {
2808 content,
2809 style,
2810 list,
2811 quote,
2812 horizontal_rule,
2813 } => {
2814 let text = render_content_markdown(content).trim().to_string();
2815 if text.is_empty() {
2816 continue;
2817 }
2818 let ordered_index = list.as_ref().and_then(|list_meta| {
2819 if !list_meta.ordered {
2820 return None;
2821 }
2822 let key = (list_meta.id.clone(), list_meta.level);
2826 counters.retain(|(id, level), _| {
2827 !(id == &list_meta.id && *level > list_meta.level)
2828 });
2829 let next = counters.entry(key).or_insert(0);
2830 *next += 1;
2831 Some(*next)
2832 });
2833 let markdown = render_paragraph_markdown(
2834 &text,
2835 style.as_deref(),
2836 list.as_ref(),
2837 *quote,
2838 *horizontal_rule,
2839 ordered_index,
2840 );
2841 rendered.push(RenderedBlock {
2842 markdown,
2843 list_id: list.as_ref().map(|l| l.id.clone()),
2844 quote: *quote,
2845 });
2846 }
2847 CapturedBlock::Table(table) => {
2848 rendered.push(RenderedBlock {
2849 markdown: render_table_markdown(table),
2850 list_id: None,
2851 quote: false,
2852 });
2853 }
2854 }
2855 }
2856
2857 let mut out = String::new();
2861 for (idx, block) in rendered.iter().enumerate() {
2862 if idx == 0 {
2863 out.push_str(&block.markdown);
2864 continue;
2865 }
2866 let prev = &rendered[idx - 1];
2867 if block.list_id.is_some() && prev.list_id.is_some() {
2868 out.push('\n');
2869 } else if block.quote && prev.quote {
2870 out.push_str("\n>\n");
2871 } else {
2872 out.push_str("\n\n");
2873 }
2874 out.push_str(&block.markdown);
2875 }
2876 if !out.is_empty() && !out.ends_with('\n') {
2877 out.push('\n');
2878 }
2879 out
2880}
2881
2882fn render_paragraph_markdown(
2883 text: &str,
2884 style: Option<&str>,
2885 list: Option<&ListMeta>,
2886 quote: bool,
2887 horizontal_rule: bool,
2888 ordered_index: Option<usize>,
2889) -> String {
2890 if horizontal_rule {
2891 return "---".to_string();
2892 }
2893 match style {
2894 Some("TITLE") => format!("# {text}"),
2895 Some("SUBTITLE") => format!("## {text}"),
2896 Some(style) if style.starts_with("HEADING_") => {
2897 let level = style
2898 .trim_start_matches("HEADING_")
2899 .parse::<usize>()
2900 .unwrap_or(1);
2901 format!("{} {text}", "#".repeat(level.clamp(1, 6)))
2902 }
2903 _ => list.map_or_else(
2904 || {
2905 if quote {
2906 text.lines()
2907 .map(|line| {
2908 if line.is_empty() {
2909 ">".to_string()
2910 } else {
2911 format!("> {line}")
2912 }
2913 })
2914 .collect::<Vec<_>>()
2915 .join("\n")
2916 } else {
2917 text.to_string()
2918 }
2919 },
2920 |list| {
2921 let indent = " ".repeat(list.level);
2922 let marker = if list.ordered {
2923 format!("{}.", ordered_index.unwrap_or(1))
2924 } else {
2925 "-".to_string()
2926 };
2927 format!("{indent}{marker} {text}")
2928 },
2929 ),
2930 }
2931}
2932
2933fn render_table_markdown(table: &TableBlock) -> String {
2934 if table.rows.is_empty() {
2935 return String::new();
2936 }
2937 let width = table
2938 .rows
2939 .iter()
2940 .map(|row| row.cells.len())
2941 .max()
2942 .unwrap_or(1);
2943 let rows = table
2944 .rows
2945 .iter()
2946 .map(|row| {
2947 (0..width)
2948 .map(|idx| {
2949 row.cells.get(idx).map_or_else(String::new, |cell| {
2950 escape_markdown_table_cell(&render_content_markdown(&cell.content))
2951 })
2952 })
2953 .collect::<Vec<_>>()
2954 })
2955 .collect::<Vec<_>>();
2956 let separator = vec!["---".to_string(); width];
2957 std::iter::once(&rows[0])
2958 .chain(std::iter::once(&separator))
2959 .chain(rows.iter().skip(1))
2960 .map(|row| format!("| {} |", row.join(" | ")))
2961 .collect::<Vec<_>>()
2962 .join("\n")
2963}
2964
2965fn render_content_markdown(content: &[ContentNode]) -> String {
2966 let mut rendered = String::new();
2967 let mut idx = 0usize;
2968 while idx < content.len() {
2969 match &content[idx] {
2970 ContentNode::Text {
2971 text,
2972 bold,
2973 italic,
2974 strike,
2975 link,
2976 } => {
2977 let link_target = link.as_deref();
2978 let mut runs = vec![(text.as_str(), *bold, *italic, *strike)];
2979 idx += 1;
2980 while let Some(ContentNode::Text {
2981 text,
2982 bold,
2983 italic,
2984 strike,
2985 link: next_link,
2986 }) = content.get(idx)
2987 {
2988 if next_link.as_deref() != link_target {
2989 break;
2990 }
2991 runs.push((text.as_str(), *bold, *italic, *strike));
2992 idx += 1;
2993 }
2994 let label = render_text_runs_markdown(&runs);
2995 if let Some(link_target) = link_target {
2996 let _ = write!(rendered, "[{label}]({link_target})");
2997 } else {
2998 rendered.push_str(&label);
2999 }
3000 }
3001 ContentNode::Image {
3002 url: Some(url),
3003 alt,
3004 ..
3005 } => {
3006 let _ = write!(rendered, "");
3007 idx += 1;
3008 }
3009 ContentNode::Image { .. } => idx += 1,
3010 }
3011 }
3012 rendered
3013}
3014
/// Which inline Markdown emphasis markers are currently open.
/// The default value has no marker open.
#[derive(Clone, Copy, Default)]
struct MarkdownMarkerState {
    /// `**` is open.
    bold: bool,
    /// `*` is open.
    italic: bool,
    /// `~~` is open.
    strike: bool,
}
3021
3022fn render_text_runs_markdown(runs: &[(&str, bool, bool, bool)]) -> String {
3023 let inactive = MarkdownMarkerState::default();
3024 let mut active = inactive;
3025 let mut output = String::new();
3026 for (text, bold, italic, strike) in runs {
3027 let next = MarkdownMarkerState {
3028 bold: *bold,
3029 italic: *italic,
3030 strike: *strike,
3031 };
3032 let mut start = 0usize;
3033 for (offset, ch) in text.char_indices() {
3034 if ch != '\n' {
3035 continue;
3036 }
3037 if offset > start {
3038 output.push_str(&markdown_marker_transition(active, next));
3039 output.push_str(&text[start..offset]);
3040 active = next;
3041 }
3042 output.push_str(&markdown_marker_transition(active, inactive));
3043 output.push('\n');
3044 active = inactive;
3045 start = offset + ch.len_utf8();
3046 }
3047 if start < text.len() {
3048 output.push_str(&markdown_marker_transition(active, next));
3049 output.push_str(&text[start..]);
3050 active = next;
3051 }
3052 }
3053 output.push_str(&markdown_marker_transition(active, inactive));
3054 output
3055}
3056
3057fn markdown_marker_transition(active: MarkdownMarkerState, next: MarkdownMarkerState) -> String {
3058 let mut markers = String::new();
3059 if active.strike && !next.strike {
3060 markers.push_str("~~");
3061 }
3062 if active.italic && !next.italic {
3063 markers.push('*');
3064 }
3065 if active.bold && !next.bold {
3066 markers.push_str("**");
3067 }
3068 if !active.bold && next.bold {
3069 markers.push_str("**");
3070 }
3071 if !active.italic && next.italic {
3072 markers.push('*');
3073 }
3074 if !active.strike && next.strike {
3075 markers.push_str("~~");
3076 }
3077 markers
3078}
3079
3080fn render_blocks_html(blocks: &[CapturedBlock]) -> String {
3081 format!(
3082 "<!doctype html><html><body>{}</body></html>",
3083 blocks
3084 .iter()
3085 .map(|block| match block {
3086 CapturedBlock::Paragraph {
3087 content,
3088 style,
3089 list,
3090 quote,
3091 horizontal_rule,
3092 } => {
3093 if *horizontal_rule {
3094 "<hr>".to_string()
3095 } else if let Some(list) = list {
3096 let tag = if list.ordered { "ol" } else { "ul" };
3097 format!("<{tag}><li>{}</li></{tag}>", render_content_html(content))
3098 } else if *quote {
3099 format!("<blockquote>{}</blockquote>", render_content_html(content))
3100 } else {
3101 let tag = paragraph_tag(style.as_deref());
3102 format!("<{tag}>{}</{tag}>", render_content_html(content))
3103 }
3104 }
3105 CapturedBlock::Table(table) => render_table_html(table),
3106 })
3107 .collect::<String>()
3108 )
3109}
3110
3111fn render_table_html(table: &TableBlock) -> String {
3112 let mut html = String::from("<table>");
3113 for row in &table.rows {
3114 html.push_str("<tr>");
3115 for cell in &row.cells {
3116 html.push_str("<td>");
3117 html.push_str(&render_content_html(&cell.content));
3118 html.push_str("</td>");
3119 }
3120 html.push_str("</tr>");
3121 }
3122 html.push_str("</table>");
3123 html
3124}
3125
3126fn render_content_html(content: &[ContentNode]) -> String {
3127 content
3128 .iter()
3129 .map(|node| match node {
3130 ContentNode::Text {
3131 text,
3132 bold,
3133 italic,
3134 strike,
3135 link,
3136 } => render_marked_html(text, *bold, *italic, *strike, link.as_deref()),
3137 ContentNode::Image {
3138 url: Some(url),
3139 alt,
3140 width,
3141 height,
3142 ..
3143 } => render_image_html(url, alt, width.as_deref(), height.as_deref()),
3144 ContentNode::Image { .. } => String::new(),
3145 })
3146 .collect()
3147}
3148
3149fn render_image_html(url: &str, alt: &str, width: Option<&str>, height: Option<&str>) -> String {
3150 let mut html = format!(
3151 "<img src=\"{}\" alt=\"{}\"",
3152 escape_html(url),
3153 escape_html(alt)
3154 );
3155 if let Some(width) = width.filter(|value| !value.is_empty()) {
3156 let _ = write!(html, " width=\"{}\"", escape_html(width));
3157 }
3158 if let Some(height) = height.filter(|value| !value.is_empty()) {
3159 let _ = write!(html, " height=\"{}\"", escape_html(height));
3160 }
3161 html.push('>');
3162 html
3163}
3164
3165fn render_marked_html(
3166 text: &str,
3167 bold: bool,
3168 italic: bool,
3169 strike: bool,
3170 link: Option<&str>,
3171) -> String {
3172 text.split('\n')
3173 .map(|segment| render_marked_html_segment(segment, bold, italic, strike, link))
3174 .collect::<Vec<_>>()
3175 .join("<br>")
3176}
3177
3178fn render_marked_html_segment(
3179 text: &str,
3180 bold: bool,
3181 italic: bool,
3182 strike: bool,
3183 link: Option<&str>,
3184) -> String {
3185 if text.is_empty() {
3186 return String::new();
3187 }
3188 let mut output = escape_html(text);
3189 if bold {
3190 output = format!("<strong>{output}</strong>");
3191 }
3192 if italic {
3193 output = format!("<em>{output}</em>");
3194 }
3195 if strike {
3196 output = format!("<s>{output}</s>");
3197 }
3198 if let Some(link) = link {
3199 output = format!("<a href=\"{}\">{output}</a>", escape_html(link));
3200 }
3201 output
3202}
3203
/// Maps a paragraph style name to the HTML tag used to render it.
///
/// `TITLE` shares `h1` with `HEADING_1` and `SUBTITLE` shares `h2` with
/// `HEADING_2`; a missing or unrecognized style yields `p`.
fn paragraph_tag(style: Option<&str>) -> &'static str {
    let Some(name) = style else {
        return "p";
    };
    match name {
        "TITLE" | "HEADING_1" => "h1",
        "SUBTITLE" | "HEADING_2" => "h2",
        "HEADING_3" => "h3",
        "HEADING_4" => "h4",
        "HEADING_5" => "h5",
        "HEADING_6" => "h6",
        _ => "p",
    }
}
3215
3216fn blocks_to_text(blocks: &[CapturedBlock]) -> String {
3217 blocks
3218 .iter()
3219 .map(|block| match block {
3220 CapturedBlock::Paragraph { content, .. } => content_to_text(content),
3221 CapturedBlock::Table(table) => table
3222 .rows
3223 .iter()
3224 .map(|row| {
3225 row.cells
3226 .iter()
3227 .map(|cell| content_to_text(&cell.content))
3228 .collect::<Vec<_>>()
3229 .join("\t")
3230 })
3231 .collect::<Vec<_>>()
3232 .join("\n"),
3233 })
3234 .filter(|text| !text.is_empty())
3235 .collect::<Vec<_>>()
3236 .join("\n")
3237}
3238
3239fn content_to_text(content: &[ContentNode]) -> String {
3240 content
3241 .iter()
3242 .map(|node| match node {
3243 ContentNode::Text { text, .. } => text.clone(),
3244 ContentNode::Image {
3245 url: Some(_), alt, ..
3246 } => format!("[{alt}]"),
3247 ContentNode::Image { .. } => String::new(),
3248 })
3249 .collect()
3250}
3251
/// Escapes the characters that are significant in HTML text and in quoted
/// attribute values: `&`, `<`, `>`, `"` and `'`.
///
/// The input is scanned once, so an `&` introduced by an escape is never
/// escaped again.
fn escape_html(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
3260
/// Makes text safe to place inside a Markdown pipe-table cell: `|` is
/// backslash-escaped and newlines become `<br>`.
fn escape_markdown_table_cell(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '|' => escaped.push_str("\\|"),
            '\n' => escaped.push_str("<br>"),
            other => escaped.push(other),
        }
    }
    escaped
}
3264
/// Extracts the token from an `Authorization: Bearer <token>` header value.
///
/// Surrounding whitespace is ignored and the `Bearer` scheme is matched
/// case-insensitively (HTTP authentication scheme names are case-insensitive).
/// Returns `None` when the scheme is missing or different, or when the token is
/// empty.
#[must_use]
pub fn extract_bearer_token(auth_header: &str) -> Option<&str> {
    const SCHEME: &str = "Bearer ";

    let trimmed = auth_header.trim();
    // `get` yields `None` for inputs that are too short or that would be split
    // inside a multi-byte character, so the slice below cannot panic.
    let scheme = trimmed.get(..SCHEME.len())?;
    if !scheme.eq_ignore_ascii_case(SCHEME) {
        return None;
    }
    let token = trimmed[SCHEME.len()..].trim();
    (!token.is_empty()).then_some(token)
}
3277
/// An image pulled out of a document so it can be stored as a separate file.
#[derive(Debug, Clone)]
pub struct ExtractedImage {
    /// File name of the image inside the archive's `images/` directory.
    pub filename: String,
    /// Raw image bytes.
    pub data: Vec<u8>,
    /// MIME type of the image (for example `image/png`).
    pub mime_type: String,
}
3288
3289#[derive(Debug, Clone)]
3291pub struct GDocsArchiveResult {
3292 pub html: String,
3294 pub markdown: String,
3296 pub images: Vec<ExtractedImage>,
3298 pub document_id: String,
3300 pub export_url: String,
3302}
3303
3304pub async fn localize_rendered_remote_images_for_archive(
3316 rendered: &GDocsRenderedResult,
3317) -> crate::Result<GDocsArchiveResult> {
3318 let client = reqwest::Client::builder().build().map_err(|error| {
3319 WebCaptureError::FetchError(format!("Failed to create image download client: {error}"))
3320 })?;
3321 let mut seen = HashMap::new();
3322 let mut images = Vec::new();
3323 let mut next_index = 1usize;
3324
3325 for image in &rendered.remote_images {
3326 if seen.contains_key(&image.url) {
3327 continue;
3328 }
3329 let filename = remote_image_filename(&image.url, next_index);
3330 next_index += 1;
3331 seen.insert(image.url.clone(), filename.clone());
3332
3333 match client
3334 .get(&image.url)
3335 .header("User-Agent", GDOCS_USER_AGENT)
3336 .header("Accept", "image/*,*/*;q=0.8")
3337 .send()
3338 .await
3339 {
3340 Ok(response) if response.status().is_success() => {
3341 let mime_type = response
3342 .headers()
3343 .get(reqwest::header::CONTENT_TYPE)
3344 .and_then(|value| value.to_str().ok())
3345 .map_or_else(|| mime_type_for_filename(&filename), ToString::to_string);
3346 let data = response.bytes().await.map_err(|error| {
3347 WebCaptureError::FetchError(format!(
3348 "Failed to read Google Docs image {}: {error}",
3349 image.url
3350 ))
3351 })?;
3352 debug!(
3353 url = %image.url,
3354 filename = %filename,
3355 bytes = data.len(),
3356 mime_type = %mime_type,
3357 "downloaded Google Docs browser-model archive image"
3358 );
3359 images.push(ExtractedImage {
3360 filename,
3361 data: data.to_vec(),
3362 mime_type,
3363 });
3364 }
3365 Ok(response) => {
3366 warn!(
3367 url = %image.url,
3368 status = response.status().as_u16(),
3369 "failed to download Google Docs browser-model archive image"
3370 );
3371 }
3372 Err(error) => {
3373 warn!(
3374 url = %image.url,
3375 error = %error,
3376 "failed to download Google Docs browser-model archive image"
3377 );
3378 }
3379 }
3380 }
3381
3382 let mut markdown = rendered.markdown.clone();
3383 let mut html = rendered.html.clone();
3384 for (url, filename) in seen {
3385 let local_path = format!("images/{filename}");
3386 markdown = markdown.replace(&url, &local_path);
3387 html = html.replace(&url, &local_path);
3388 }
3389
3390 Ok(GDocsArchiveResult {
3391 html,
3392 markdown,
3393 images,
3394 document_id: rendered.document_id.clone(),
3395 export_url: rendered.export_url.clone(),
3396 })
3397}
3398
3399fn remote_image_filename(url: &str, index: usize) -> String {
3400 let ext = crate::localize_images::get_extension_from_url(url);
3401 format!("image-{index:02}{ext}")
3402}
3403
/// Guesses an image MIME type from a file name's extension (case-insensitive).
///
/// Recognizes `jpg`/`jpeg`, `gif`, `webp` and `svg`; anything else, including
/// a name without an extension, is reported as `image/png`.
fn mime_type_for_filename(filename: &str) -> String {
    let extension = filename
        .rsplit_once('.')
        .map_or(filename, |(_, extension)| extension)
        .to_lowercase();
    let mime = match extension.as_str() {
        "jpg" | "jpeg" => "image/jpeg",
        "gif" => "image/gif",
        "webp" => "image/webp",
        "svg" => "image/svg+xml",
        _ => "image/png",
    };
    mime.to_string()
}
3420
3421fn base64_image_pattern() -> &'static Regex {
3422 static PATTERN: OnceLock<Regex> = OnceLock::new();
3423 PATTERN.get_or_init(|| {
3424 Regex::new(
3425 r#"(<img\s[^>]*src=")data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([^"]+)(")"#,
3426 )
3427 .unwrap()
3428 })
3429}
3430
3431#[must_use]
3444pub fn extract_base64_images(html: &str) -> (String, Vec<ExtractedImage>) {
3445 let mut images = Vec::new();
3446 let mut idx = 1u32;
3447
3448 let updated_html = base64_image_pattern()
3449 .replace_all(html, |caps: ®ex::Captures<'_>| {
3450 let prefix = &caps[1];
3451 let mime_ext = &caps[2];
3452 let base64_data = &caps[3];
3453 let suffix = &caps[4];
3454
3455 let ext = match mime_ext {
3456 "jpeg" => "jpg",
3457 "svg+xml" => "svg",
3458 other => other,
3459 };
3460
3461 let filename = format!("image-{idx:02}.{ext}");
3462 let mime_type = format!("image/{mime_ext}");
3463
3464 if let Ok(data) = base64::engine::general_purpose::STANDARD.decode(base64_data) {
3465 debug!("Extracted image: {} ({} bytes)", filename, data.len());
3466 images.push(ExtractedImage {
3467 filename: filename.clone(),
3468 data,
3469 mime_type,
3470 });
3471 }
3472
3473 idx += 1;
3474 format!("{prefix}images/{filename}{suffix}")
3475 })
3476 .into_owned();
3477
3478 (updated_html, images)
3479}
3480
3481pub async fn fetch_google_doc_as_archive(
3500 url: &str,
3501 api_token: Option<&str>,
3502) -> crate::Result<GDocsArchiveResult> {
3503 let result = fetch_google_doc(url, "html", api_token).await?;
3504
3505 let preprocess = preprocess_google_docs_export_html(&result.content);
3506 debug!(
3507 document_id = %result.document_id,
3508 hoisted = preprocess.hoisted,
3509 unwrapped_links = preprocess.unwrapped_links,
3510 "google-docs-export pre-processor rewrote archive markup"
3511 );
3512
3513 let (local_html, images) = extract_base64_images(&preprocess.html);
3514
3515 let markdown = normalize_google_docs_export_markdown(
3516 &crate::markdown::convert_html_to_markdown(&local_html, None)?,
3517 );
3518
3519 debug!(
3520 "Archive prepared: {} images extracted, {} bytes HTML, {} bytes Markdown",
3521 images.len(),
3522 local_html.len(),
3523 markdown.len()
3524 );
3525
3526 Ok(GDocsArchiveResult {
3527 html: local_html,
3528 markdown,
3529 images,
3530 document_id: result.document_id,
3531 export_url: result.export_url,
3532 })
3533}
3534
3535pub fn create_archive_zip(
3546 archive: &GDocsArchiveResult,
3547 pretty_html: bool,
3548) -> crate::Result<Vec<u8>> {
3549 let mut buf = std::io::Cursor::new(Vec::new());
3550
3551 {
3552 let mut zip = zip::ZipWriter::new(&mut buf);
3553 let options = zip::write::SimpleFileOptions::default()
3554 .compression_method(zip::CompressionMethod::Deflated);
3555
3556 zip.start_file("document.md", options)
3557 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3558 zip.write_all(archive.markdown.as_bytes())?;
3559
3560 let html_output = if pretty_html {
3561 crate::html::pretty_print_html(&archive.html)
3562 } else {
3563 archive.html.clone()
3564 };
3565 zip.start_file("document.html", options)
3566 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3567 zip.write_all(html_output.as_bytes())?;
3568
3569 for img in &archive.images {
3570 zip.start_file(format!("images/{}", img.filename), options)
3571 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3572 zip.write_all(&img.data)?;
3573 }
3574
3575 zip.finish()
3576 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3577 }
3578
3579 Ok(buf.into_inner())
3580}
3581
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Two payloads with the same chunk count but different text must differ in
    /// the byte-size part of the fingerprint.
    #[test]
    fn browser_model_fingerprint_includes_payload_size() {
        let small = browser_model_data_from_value(&json!({
            "chunks": [{ "chunk": [{ "ty": "is", "s": "first" }] }],
            "cidUrlMap": {}
        }));
        let larger = browser_model_data_from_value(&json!({
            "chunks": [{ "chunk": [{ "ty": "is", "s": "first and later text" }] }],
            "cidUrlMap": {}
        }));

        let small_fingerprint = small.fingerprint();
        let larger_fingerprint = larger.fingerprint();

        assert_eq!(small_fingerprint.chunks, larger_fingerprint.chunks);
        assert_ne!(
            small_fingerprint.payload_bytes,
            larger_fingerprint.payload_bytes
        );
    }

    /// Feeds a fixed timeline of fingerprints to the quiescence tracker and
    /// checks what each observation reports.
    #[test]
    fn browser_model_quiescence_resets_when_chunks_change() {
        let start = Instant::now();
        let stability_window = Duration::from_millis(1500);
        let one_chunk = BrowserModelFingerprint {
            chunks: 1,
            payload_bytes: 100,
        };
        let two_chunks = BrowserModelFingerprint {
            chunks: 2,
            payload_bytes: 200,
        };
        let mut quiescence = BrowserModelQuiescence::default();

        // (fingerprint, milliseconds since `start`, expected observation)
        let timeline = [
            (one_chunk, 0, None),
            (one_chunk, 250, None),
            (two_chunks, 500, None),
            (two_chunks, 750, None),
            (two_chunks, 2300, Some(Duration::from_millis(1550))),
        ];

        for (fingerprint, offset_ms, expected) in timeline {
            let observed_at = start + Duration::from_millis(offset_ms);
            assert_eq!(
                quiescence.observe(fingerprint, observed_at, stability_window),
                expected
            );
        }
    }
}