1use async_tungstenite::tokio::{connect_async, ConnectStream};
32use async_tungstenite::tungstenite::Message;
33use async_tungstenite::WebSocketStream;
34use base64::Engine;
35use futures::{SinkExt, StreamExt};
36use regex::Regex;
37use scraper::{node::Node, ElementRef, Html, Selector};
38use serde_json::Value;
39use std::collections::HashMap;
40use std::fmt::Write as _;
41use std::hash::BuildHasher;
42use std::io::Write;
43use std::process::Stdio;
44use std::sync::OnceLock;
45use std::time::{Duration, Instant};
46use tokio::io::{AsyncBufReadExt, BufReader};
47use tokio::process::{Child, Command};
48use tracing::{debug, info, warn};
49
50use crate::WebCaptureError;
51
/// Base URL to which `/{document_id}/export?...` and `/{document_id}/edit` are appended.
const GDOCS_EXPORT_BASE: &str = "https://docs.google.com/document/d";
/// Base URL to which `/{document_id}` is appended to address a document via the Docs REST API.
const GDOCS_API_BASE: &str = "https://docs.googleapis.com/v1/documents";
/// Desktop Chrome user-agent string.
// NOTE(review): presumably sent with every Google Docs HTTP request — confirm at the use sites.
const GDOCS_USER_AGENT: &str =
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
/// Default 30 s limit; by its name, the maximum wait for the editor model — TODO confirm at use site.
const GDOCS_EDITOR_MODEL_MAX_WAIT_DEFAULT: Duration = Duration::from_secs(30);
/// Default 1.5 s window; by its name, how long the model must stay unchanged — TODO confirm at use site.
const GDOCS_EDITOR_MODEL_STABILITY_DEFAULT: Duration = Duration::from_millis(1500);
/// 250 ms interval; by its name, the delay between editor-model polls — TODO confirm at use site.
const GDOCS_EDITOR_MODEL_POLL_INTERVAL: Duration = Duration::from_millis(250);
/// 20 s limit; by its name, the time allowed for the browser to launch — TODO confirm at use site.
const GDOCS_BROWSER_LAUNCH_TIMEOUT: Duration = Duration::from_secs(20);

/// WebSocket stream over an `async_tungstenite` tokio connection.
// NOTE(review): the name suggests this carries Chrome DevTools Protocol traffic — confirm.
type CdpWebSocket = WebSocketStream<ConnectStream>;
62
/// JavaScript that records every value assigned to or pushed onto `window.DOCS_modelChunk`.
///
/// The script creates `window.__captured_chunks`, then redefines `DOCS_modelChunk` as an
/// accessor property on `window`:
/// - the setter captures the assigned value (arrays are flattened item by item, and each item
///   is deep-copied through JSON when possible) and stores the value in
///   `window.__DOCS_modelChunk_latest`;
/// - assigned arrays get a patched `push` so later appends are captured too; a hidden
///   `__webCaptureDocsModelWrapped` marker prevents wrapping the same array twice.
///
/// The accessor is defined with `configurable: false`, so running the script a second time in
/// the same page would throw when it tries to redefine the property.
// NOTE(review): presumably injected before any page script runs — confirm at the call site.
const GDOCS_MODEL_CAPTURE_INIT_SCRIPT: &str = r"
window.__captured_chunks = [];
const captureChunk = (value) => {
  if (!value) {
    return;
  }
  if (Array.isArray(value)) {
    for (const item of value) {
      captureChunk(item);
    }
    return;
  }
  try {
    window.__captured_chunks.push(JSON.parse(JSON.stringify(value)));
  } catch {
    window.__captured_chunks.push(value);
  }
};
const wrapChunkArray = (value) => {
  if (!Array.isArray(value) || value.__webCaptureDocsModelWrapped) {
    return value;
  }
  const originalPush = value.push;
  Object.defineProperty(value, '__webCaptureDocsModelWrapped', {
    value: true,
    enumerable: false,
  });
  Object.defineProperty(value, 'push', {
    value(...items) {
      for (const item of items) {
        captureChunk(item);
      }
      return originalPush.apply(this, items);
    },
    writable: true,
    configurable: true,
  });
  for (const item of value) {
    captureChunk(item);
  }
  return value;
};
Object.defineProperty(window, 'DOCS_modelChunk', {
  set(value) {
    captureChunk(value);
    window.__DOCS_modelChunk_latest = wrapChunkArray(value);
  },
  get() {
    return window.__DOCS_modelChunk_latest;
  },
  configurable: false,
});
";
116
/// JavaScript function expression that returns `{ chunks, cidUrlMap }` from the page.
///
/// - `chunks` is a copy of `window.__captured_chunks`; when nothing was captured, the current
///   `window.DOCS_modelChunk` value (if truthy) is used as the only entry.
/// - `cidUrlMap` is built by scanning every `<script>` whose text mentions `docs-images-rt`
///   for `"<id of 20+ chars>": "https://docs.google.com/docs-images-rt/..."` pairs; the
///   escaped sequences `\u003d`, `\u0026` and `\/` in each URL are turned back into
///   `=`, `&` and `/`.
const GDOCS_MODEL_EXTRACT_SCRIPT: &str = r#"() => {
  const chunks = [...(window.__captured_chunks || [])];
  if (
    window.DOCS_modelChunk &&
    chunks.length === 0 &&
    !chunks.includes(window.DOCS_modelChunk)
  ) {
    chunks.push(window.DOCS_modelChunk);
  }
  const cidUrlMap = {};
  const scripts = document.querySelectorAll('script');
  for (const script of scripts) {
    const text = script.textContent || '';
    if (!text.includes('docs-images-rt')) {
      continue;
    }
    const regex =
      /"([A-Za-z0-9_-]{20,})"\s*:\s*"(https:\/\/docs\.google\.com\/docs-images-rt\/[^"]+)"/g;
    let match;
    while ((match = regex.exec(text)) !== null) {
      cidUrlMap[match[1]] = match[2]
        .replace(/\\u003d/g, '=')
        .replace(/\\u0026/g, '&')
        .replace(/\\\//g, '/');
    }
  }
  return { chunks, cidUrlMap };
}"#;
145
146fn gdocs_url_pattern() -> &'static Regex {
147 static PATTERN: OnceLock<Regex> = OnceLock::new();
148 PATTERN.get_or_init(|| Regex::new(r"docs\.google\.com/document/d/([a-zA-Z0-9_-]+)").unwrap())
149}
150
/// Outcome of fetching a Google Doc through the export endpoint.
#[derive(Debug, Clone)]
pub struct GDocsResult {
    /// Document body: the export response text, or Markdown once it has been rendered.
    pub content: String,
    /// Format label: the requested export format, or `"markdown"` for rendered Markdown.
    pub format: String,
    /// Document id extracted from the source URL.
    pub document_id: String,
    /// Export URL the content was downloaded from.
    pub export_url: String,
}
163
/// Strategy used to capture a Google Doc, as chosen by `select_capture_method`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GDocsCaptureMethod {
    /// Selected for the `"browser"` capture mode.
    BrowserModel,
    /// Selected for the `"api"` capture mode when no API token is supplied.
    PublicExport,
    /// Selected for the `"api"` capture mode when an API token is supplied.
    DocsApi,
}
174
/// A Google Doc rendered into several output representations.
// NOTE(review): the producer of this struct is outside the visible code; the field notes
// below follow the field names only — confirm against the constructor.
#[derive(Debug, Clone)]
pub struct GDocsRenderedResult {
    /// Markdown rendering.
    pub markdown: String,
    /// HTML rendering.
    pub html: String,
    /// Plain-text rendering.
    pub text: String,
    /// Document id.
    pub document_id: String,
    /// Export URL associated with the document.
    pub export_url: String,
    /// Images referenced by remote URL.
    pub remote_images: Vec<RemoteImage>,
}
191
/// An image identified by URL together with its alternative text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteImage {
    /// Image URL.
    pub url: String,
    /// Alternative text for the image.
    pub alt: String,
}
200
/// Snapshot of editor-model data read from the browser.
// NOTE(review): only `chunks` and `chunk_payload_bytes` are read in the visible code (by
// `fingerprint`); the remaining field notes are inferred from names — confirm at the producer.
#[derive(Debug, Clone)]
struct BrowserModelData {
    /// Captured model chunks as raw JSON values.
    chunks: Vec<Value>,
    /// Presumably maps image content ids to their URLs.
    cid_urls: HashMap<String, String>,
    /// Byte size of the chunk payload; one half of the fingerprint.
    chunk_payload_bytes: usize,
    /// Presumably the number of polls performed before this snapshot.
    poll_count: usize,
    /// Presumably how long the model had been unchanged when captured.
    stable_for: Duration,
}
209
/// Cheap summary of a model snapshot used to detect whether it changed between observations.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct BrowserModelFingerprint {
    /// Number of captured chunks.
    chunks: usize,
    /// Payload size in bytes.
    payload_bytes: usize,
}

/// Tracks how long consecutive observations have reported the same fingerprint.
#[derive(Debug, Default)]
struct BrowserModelQuiescence {
    /// Fingerprint seen by the most recent observation, if any.
    last_fingerprint: Option<BrowserModelFingerprint>,
    /// Instant at which the current fingerprint was first seen repeated; `None` while changing.
    stable_since: Option<Instant>,
}
221
222impl BrowserModelData {
223 const fn fingerprint(&self) -> BrowserModelFingerprint {
224 BrowserModelFingerprint {
225 chunks: self.chunks.len(),
226 payload_bytes: self.chunk_payload_bytes,
227 }
228 }
229}
230
231impl BrowserModelQuiescence {
232 fn observe(
233 &mut self,
234 fingerprint: BrowserModelFingerprint,
235 now: Instant,
236 stability_window: Duration,
237 ) -> Option<Duration> {
238 if fingerprint.chunks == 0 {
239 self.last_fingerprint = Some(fingerprint);
240 self.stable_since = None;
241 return None;
242 }
243
244 if self.last_fingerprint == Some(fingerprint) {
245 let stable_since = *self.stable_since.get_or_insert(now);
246 let stable_for = now.saturating_duration_since(stable_since);
247 if stable_for >= stability_window {
248 return Some(stable_for);
249 }
250 } else {
251 self.last_fingerprint = Some(fingerprint);
252 self.stable_since = None;
253 }
254
255 None
256 }
257
258 fn stable_for(&self, now: Instant) -> Duration {
259 self.stable_since.map_or(Duration::ZERO, |stable_since| {
260 now.saturating_duration_since(stable_since)
261 })
262 }
263}
264
/// Structured content captured from a document.
// NOTE(review): the code that fills this struct is not visible here; field notes follow
// the field names and types only.
#[derive(Debug, Clone, Default)]
pub struct CapturedDocument {
    /// Paragraph and table blocks.
    pub blocks: Vec<CapturedBlock>,
    /// Table blocks.
    pub tables: Vec<TableBlock>,
    /// Content nodes for images.
    pub images: Vec<ContentNode>,
    /// Plain text of the document.
    pub text: String,
}
277
/// A top-level block of a captured document: either a paragraph or a table.
#[derive(Debug, Clone)]
pub enum CapturedBlock {
    /// A paragraph and its block-level attributes.
    Paragraph {
        /// Inline content of the paragraph.
        content: Vec<ContentNode>,
        /// Optional paragraph style name.
        style: Option<String>,
        /// List membership, when the paragraph is a list item.
        list: Option<ListMeta>,
        /// Whether the paragraph is a quote.
        quote: bool,
        /// Whether the paragraph represents a horizontal rule.
        horizontal_rule: bool,
    },
    /// A table.
    Table(TableBlock),
}
297
/// A table made of rows.
#[derive(Debug, Clone, Default)]
pub struct TableBlock {
    /// Rows of the table.
    pub rows: Vec<TableRow>,
}

/// One table row.
#[derive(Debug, Clone, Default)]
pub struct TableRow {
    /// Cells of the row.
    pub cells: Vec<TableCell>,
}

/// One table cell.
#[derive(Debug, Clone, Default)]
pub struct TableCell {
    /// Inline content of the cell.
    pub content: Vec<ContentNode>,
}
318
/// An inline piece of content: a styled text run or an image.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentNode {
    /// A run of text with inline formatting flags.
    Text {
        /// The text itself.
        text: String,
        /// Bold flag.
        bold: bool,
        /// Italic flag.
        italic: bool,
        /// Strikethrough flag.
        strike: bool,
        /// Link target, if the run is a link.
        link: Option<String>,
    },
    /// An image reference.
    Image {
        /// Content id of the image, if known.
        cid: Option<String>,
        /// Image URL, if known.
        url: Option<String>,
        /// Alternative text.
        alt: String,
        /// Width, kept as a string.
        width: Option<String>,
        /// Height, kept as a string.
        height: Option<String>,
        /// Suggestion flag.
        // NOTE(review): presumably marks images that belong to a pending suggestion — confirm.
        is_suggestion: bool,
    },
}
351
/// Inline formatting of a text run; the fields mirror those of `ContentNode::Text`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct TextStyle {
    /// Bold flag.
    bold: bool,
    /// Italic flag.
    italic: bool,
    /// Strikethrough flag.
    strike: bool,
    /// Link target, if any.
    link: Option<String>,
}
359
/// Block-level paragraph attributes; the fields mirror those of `CapturedBlock::Paragraph`.
#[derive(Debug, Clone, Default)]
struct ParagraphMeta {
    /// Optional paragraph style name.
    style: Option<String>,
    /// List membership, when the paragraph is a list item.
    list: Option<ListMeta>,
    /// Whether the paragraph is a quote.
    quote: bool,
    /// Whether the paragraph represents a horizontal rule.
    horizontal_rule: bool,
}
367
/// List membership details for a paragraph.
#[derive(Debug, Clone)]
pub struct ListMeta {
    /// Identifier of the list.
    pub id: String,
    /// Nesting level.
    // NOTE(review): presumably zero-based — confirm against the code that fills it.
    pub level: usize,
    /// Whether the list is ordered (numbered) rather than bulleted.
    pub ordered: bool,
}
377
/// Paragraph style name together with indentation values.
// NOTE(review): the units of the indent fields are not established by the visible code.
#[derive(Debug, Clone)]
struct ParagraphStyle {
    /// Optional paragraph style name.
    style: Option<String>,
    /// Start indentation.
    indent_start: f64,
    /// First-line indentation.
    indent_first_line: f64,
}
384
/// Semantic hint associated with a piece of text.
// NOTE(review): by its name, presumably derived from the public export to enrich another
// capture path — confirm where it is built and consumed.
#[derive(Debug, Clone)]
struct ExportSemanticHint {
    /// Text the hint applies to.
    text: String,
    /// `Some(true)`/`Some(false)` for ordered/unordered list items; `None` when not set.
    list_ordered: Option<bool>,
    /// Whether the text is a quote.
    quote: bool,
}
391
/// Style lookups collected from the editor model.
// NOTE(review): the `usize` keys are presumably character offsets (the `_by_end` names
// suggest paragraph end positions) — confirm against the code that fills these maps.
#[derive(Debug, Clone, Default)]
struct ModelStyleMaps {
    /// Inline text styles.
    inline_styles: Vec<TextStyle>,
    /// Paragraph styles keyed by a `usize` position.
    paragraph_by_end: HashMap<usize, ParagraphStyle>,
    /// List metadata keyed by a `usize` position.
    list_by_end: HashMap<usize, ListMeta>,
    /// Positions of horizontal rules.
    horizontal_rules: std::collections::HashSet<usize>,
}
399
400#[must_use]
402pub fn is_google_docs_url(url: &str) -> bool {
403 gdocs_url_pattern().is_match(url)
404}
405
406#[must_use]
410pub fn extract_document_id(url: &str) -> Option<String> {
411 gdocs_url_pattern()
412 .captures(url)
413 .and_then(|caps| caps.get(1))
414 .map(|m| m.as_str().to_string())
415}
416
417#[must_use]
424pub fn build_export_url(document_id: &str, format: &str) -> String {
425 let export_format = match format {
426 "html" | "txt" | "md" | "pdf" | "docx" | "epub" | "zip" => format,
427 _ => "html",
428 };
429 format!("{GDOCS_EXPORT_BASE}/{document_id}/export?format={export_format}")
430}
431
432#[must_use]
434pub fn build_edit_url(document_id: &str) -> String {
435 format!("{GDOCS_EXPORT_BASE}/{document_id}/edit")
436}
437
438#[must_use]
440pub fn build_docs_api_url(document_id: &str) -> String {
441 format!("{GDOCS_API_BASE}/{document_id}")
442}
443
444pub fn select_capture_method(
450 capture: &str,
451 api_token: Option<&str>,
452) -> crate::Result<GDocsCaptureMethod> {
453 match capture.to_lowercase().as_str() {
454 "browser" => Ok(GDocsCaptureMethod::BrowserModel),
455 "api" if api_token.is_some() => Ok(GDocsCaptureMethod::DocsApi),
456 "api" => Ok(GDocsCaptureMethod::PublicExport),
457 other => Err(WebCaptureError::InvalidUrl(format!(
458 "Unsupported Google Docs capture method \"{other}\". Use \"browser\" or \"api\"."
459 ))),
460 }
461}
462
463pub async fn fetch_google_doc(
478 url: &str,
479 format: &str,
480 api_token: Option<&str>,
481) -> crate::Result<GDocsResult> {
482 let document_id = extract_document_id(url).ok_or_else(|| {
483 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
484 })?;
485
486 let export_url = build_export_url(&document_id, format);
487 debug!(
488 document_id = %document_id,
489 format = %format,
490 export_url = %export_url,
491 has_api_token = api_token.is_some(),
492 "fetching Google Doc via public export"
493 );
494
495 let mut request = reqwest::Client::new()
496 .get(&export_url)
497 .header(
498 "User-Agent",
499 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
500 )
501 .header("Accept-Charset", "utf-8")
502 .header("Accept-Language", "en-US,en;q=0.9");
503
504 if let Some(token) = api_token {
505 request = request.header("Authorization", format!("Bearer {token}"));
506 }
507
508 let response = request
509 .send()
510 .await
511 .map_err(|e| WebCaptureError::FetchError(format!("Failed to fetch Google Doc: {e}")))?;
512 debug!(
513 document_id = %document_id,
514 status = response.status().as_u16(),
515 success = response.status().is_success(),
516 content_type = response
517 .headers()
518 .get(reqwest::header::CONTENT_TYPE)
519 .and_then(|value| value.to_str().ok())
520 .unwrap_or(""),
521 "received Google Docs public export response"
522 );
523
524 if !response.status().is_success() {
525 return Err(WebCaptureError::FetchError(format!(
526 "Failed to fetch Google Doc ({} {}): {}",
527 response.status().as_u16(),
528 response.status().canonical_reason().unwrap_or("Unknown"),
529 export_url
530 )));
531 }
532
533 let raw_content = response.text().await.map_err(|e| {
534 WebCaptureError::FetchError(format!("Failed to read Google Doc response: {e}"))
535 })?;
536 debug!(
537 document_id = %document_id,
538 bytes = raw_content.len(),
539 "read Google Docs public export body"
540 );
541
542 let content = match format {
545 "txt" | "md" => crate::html::decode_html_entities(&raw_content),
546 _ => raw_content,
547 };
548
549 Ok(GDocsResult {
550 content,
551 format: format.to_string(),
552 document_id,
553 export_url,
554 })
555}
556
557pub async fn fetch_google_doc_as_markdown(
571 url: &str,
572 api_token: Option<&str>,
573) -> crate::Result<GDocsResult> {
574 let result = fetch_google_doc(url, "html", api_token).await?;
575
576 let preprocess = preprocess_google_docs_export_html(&result.content);
577 debug!(
578 document_id = %result.document_id,
579 hoisted = preprocess.hoisted,
580 unwrapped_links = preprocess.unwrapped_links,
581 "google-docs-export pre-processor rewrote markup"
582 );
583 let markdown = normalize_google_docs_export_markdown(
584 &crate::markdown::convert_html_to_markdown(&preprocess.html, Some(&result.export_url))?,
585 );
586 debug!(
587 document_id = %result.document_id,
588 bytes = markdown.len(),
589 "rendered Google Docs public export markdown"
590 );
591
592 Ok(GDocsResult {
593 content: markdown,
594 format: "markdown".to_string(),
595 document_id: result.document_id,
596 export_url: result.export_url,
597 })
598}
599
/// Output of `preprocess_google_docs_export_html`.
#[derive(Debug, Clone)]
pub struct GDocsExportPreprocessResult {
    /// The rewritten HTML.
    pub html: String,
    /// Number of styled `<span>` elements replaced by semantic tags.
    pub hoisted: usize,
    /// Number of Google redirect links replaced by their target URL.
    pub unwrapped_links: usize,
}
613
614#[must_use]
622pub fn preprocess_google_docs_export_html(html: &str) -> GDocsExportPreprocessResult {
623 let mut hoisted: usize = 0;
624 let mut unwrapped_links: usize = 0;
625 let class_styles = extract_css_class_styles(html);
626
627 let mut out = hoist_inline_style_spans(html, &mut hoisted);
628 out = hoist_class_style_spans(&out, &class_styles, &mut hoisted);
629 out = convert_class_indented_blockquotes(&out, &class_styles);
630 out = nest_google_docs_lists(&out, &class_styles);
631 out = strip_google_docs_heading_noise(&out);
632 out = strip_heading_inline_formatting(&out);
633 out = unwrap_google_redirect_links(&out, &mut unwrapped_links);
634 out = out.replace(" ", " ");
635 out = out.replace('\u{00A0}', " ");
636
637 GDocsExportPreprocessResult {
638 html: out,
639 hoisted,
640 unwrapped_links,
641 }
642}
643
644#[must_use]
646pub fn normalize_google_docs_export_markdown(markdown: &str) -> String {
647 let markdown = unescape_public_export_punctuation(markdown);
648 let markdown = convert_setext_headings(&markdown);
649 let markdown = normalize_atx_headings(&markdown);
650 let markdown = normalize_bullet_markers(&markdown);
651 let markdown = normalize_list_spacing(&markdown);
652 let markdown = normalize_blockquote_spacing(&markdown);
653 let markdown = normalize_markdown_tables(&markdown);
654 crate::markdown::clean_markdown(&markdown)
655}
656
657fn hoist_inline_style_spans(html: &str, hoisted: &mut usize) -> String {
658 let span_re = Regex::new(r#"(?is)<span\s+([^>]*style="([^"]*)"[^>]*)>(.*?)</span>"#)
659 .expect("valid regex");
660 span_re
661 .replace_all(html, |caps: ®ex::Captures<'_>| {
662 let style = caps.get(2).map_or("", |m| m.as_str());
663 let inner = caps.get(3).map_or("", |m| m.as_str());
664 semantic_wrapped_html(inner, style).map_or_else(
665 || caps[0].to_string(),
666 |wrapped| {
667 *hoisted += 1;
668 wrapped
669 },
670 )
671 })
672 .into_owned()
673}
674
675fn hoist_class_style_spans(
676 html: &str,
677 class_styles: &HashMap<String, String>,
678 hoisted: &mut usize,
679) -> String {
680 let class_span_re = Regex::new(r#"(?is)<span\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</span>"#)
681 .expect("valid regex");
682 class_span_re
683 .replace_all(html, |caps: ®ex::Captures<'_>| {
684 let class_attr = caps.get(2).map_or("", |m| m.as_str());
685 let inner = caps.get(3).map_or("", |m| m.as_str());
686 let style = combined_class_style(class_styles, class_attr);
687 semantic_wrapped_html(inner, &style).map_or_else(
688 || caps[0].to_string(),
689 |wrapped| {
690 *hoisted += 1;
691 wrapped
692 },
693 )
694 })
695 .into_owned()
696}
697
698fn convert_class_indented_blockquotes(
699 html: &str,
700 class_styles: &HashMap<String, String>,
701) -> String {
702 let class_paragraph_re =
703 Regex::new(r#"(?is)<p\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</p>"#).expect("valid regex");
704 class_paragraph_re
705 .replace_all(html, |caps: ®ex::Captures<'_>| {
706 let class_attr = caps.get(2).map_or("", |m| m.as_str());
707 let inner = caps.get(3).map_or("", |m| m.as_str());
708 let style = combined_class_style(class_styles, class_attr);
709 if is_blockquote_style(&style) {
710 format!("<blockquote><p>{inner}</p></blockquote>")
711 } else {
712 caps[0].to_string()
713 }
714 })
715 .into_owned()
716}
717
/// One `<ul>`/`<ol>` element located in the export HTML.
#[derive(Debug, Clone)]
struct ExportListBlock {
    /// Byte offset in the source HTML where the element starts.
    start: usize,
    /// Byte offset in the source HTML just past the element's closing tag.
    end: usize,
    /// Lower-cased tag name: `"ul"` or `"ol"`.
    tag: String,
    /// Markup between the opening and closing tags.
    inner: String,
    /// Value of the `start` attribute; only captured for `<ol>` elements.
    start_attr: Option<String>,
}

/// One `<li>` taken from an [`ExportListBlock`].
#[derive(Debug, Clone)]
struct ExportListItem {
    /// Tag of the list the item came from (`"ul"` or `"ol"`).
    tag: String,
    /// Nesting level computed from the item's left margin (0 is the outermost level).
    level: usize,
    /// Markup inside the `<li>`.
    inner: String,
}
733
/// Merges runs of adjacent flat lists into properly nested lists.
///
/// Every `<ul>`/`<ol>` element is located with a regex. Lists separated only by whitespace
/// form a group; each group of two or more lists is replaced by the output of
/// [`render_nested_list_group`], which derives nesting from the items' left margins.
/// Lists that stand alone are left untouched, and the input is returned unchanged when
/// there is nothing to merge.
fn nest_google_docs_lists(html: &str, class_styles: &HashMap<String, String>) -> String {
    let list_re = Regex::new(r"(?is)<(ul|ol)\b([^>]*)>(.*?)</(ul|ol)>").expect("valid regex");
    let start_attr_re = Regex::new(r#"(?i)\bstart\s*=\s*"([^"]*)""#).expect("valid regex");
    let blocks: Vec<ExportListBlock> = list_re
        .captures_iter(html)
        .filter_map(|caps| {
            let open_tag = caps.get(1)?.as_str().to_ascii_lowercase();
            let close_tag = caps.get(4)?.as_str().to_ascii_lowercase();
            // The regex cannot tie the closing tag to the opening one, so mismatched
            // pairs (e.g. `<ul>...</ol>`) are discarded here.
            if open_tag != close_tag {
                return None;
            }
            let whole = caps.get(0)?;
            let attrs = caps.get(2).map_or("", |m| m.as_str());
            // `start` is only meaningful on ordered lists.
            let start_attr = if open_tag == "ol" {
                start_attr_re
                    .captures(attrs)
                    .and_then(|c| c.get(1).map(|m| m.as_str().to_string()))
            } else {
                None
            };
            Some(ExportListBlock {
                start: whole.start(),
                end: whole.end(),
                tag: open_tag,
                inner: caps.get(3).map_or("", |m| m.as_str()).to_string(),
                start_attr,
            })
        })
        .collect();

    // Nesting needs at least two lists to merge.
    if blocks.len() < 2 {
        return html.to_string();
    }

    // Group consecutive lists that have nothing but whitespace between them.
    let mut groups: Vec<Vec<ExportListBlock>> = Vec::new();
    let mut current: Vec<ExportListBlock> = Vec::new();
    for block in blocks {
        if let Some(previous) = current.last() {
            if !html[previous.end..block.start].trim().is_empty() {
                // Other content sits between the lists: close the running group, keeping
                // it only when it holds more than one list.
                if current.len() > 1 {
                    groups.push(std::mem::take(&mut current));
                } else {
                    current.clear();
                }
            }
        }
        current.push(block);
    }
    if current.len() > 1 {
        groups.push(current);
    }

    if groups.is_empty() {
        return html.to_string();
    }

    let mut out = html.to_string();
    // Replace from the last group to the first so the byte offsets of earlier groups
    // remain valid while the string is being edited.
    for group in groups.iter().rev() {
        let rendered = render_nested_list_group(group, class_styles);
        let start = group.first().expect("non-empty group").start;
        let end = group.last().expect("non-empty group").end;
        out.replace_range(start..end, &rendered);
    }
    out
}
799
/// Renders a group of adjacent flat lists as one nested list structure.
///
/// All `<li>` items of the group are collected in document order, each tagged with the list
/// type it came from and a nesting level computed by [`google_docs_list_item_level`]. The
/// items are then re-emitted, opening and closing `<ul>`/`<ol>` elements as the level rises
/// and falls. Nested lists are placed inside the still-open `<li>` of the parent level.
///
/// Only the first list opened at level 0 inherits the `start` attribute of the group's
/// first block. Emitted `<li>` and list tags carry no other attributes.
///
/// When the group contains no `<li>` at all, the blocks are re-emitted as bare
/// `<tag>inner</tag>` elements.
// The function is one sequential state machine over the item list; splitting it would
// scatter the shared stack state.
#[allow(clippy::too_many_lines)]
fn render_nested_list_group(
    group: &[ExportListBlock],
    class_styles: &HashMap<String, String>,
) -> String {
    let item_re = Regex::new(r"(?is)<li\b([^>]*)>(.*?)</li>").expect("valid regex");
    let items: Vec<ExportListItem> = group
        .iter()
        .flat_map(|block| {
            item_re.captures_iter(&block.inner).map(|caps| {
                let attrs = caps.get(1).map_or("", |m| m.as_str());
                let inner = caps.get(2).map_or("", |m| m.as_str()).to_string();
                ExportListItem {
                    tag: block.tag.clone(),
                    level: google_docs_list_item_level(attrs, class_styles),
                    inner,
                }
            })
        })
        .collect();

    if items.is_empty() {
        let mut unchanged = String::new();
        for block in group {
            write!(unchanged, "<{}>{}</{}>", block.tag, block.inner, block.tag)
                .expect("write to String");
        }
        return unchanged;
    }

    let top_level_start = group.first().and_then(|block| block.start_attr.clone());

    let mut html = String::new();
    // Deepest level that currently has an open list; `None` before anything is opened.
    let mut current_level: Option<usize> = None;
    // Per level: tag of the open list (`None` when closed) and whether an `<li>` is open.
    let mut open_tags: Vec<Option<String>> = Vec::new();
    let mut item_open: Vec<bool> = Vec::new();
    // Ensures the `start` attribute is emitted at most once.
    let mut top_level_opened = false;

    for item in items {
        let level = item.level;
        // Moving outwards: close every list deeper than the item's level.
        while current_level.is_some_and(|current| current > level) {
            let current = current_level.expect("checked as Some");
            close_rendered_list(&mut html, &mut open_tags, &mut item_open, current);
            current_level = current.checked_sub(1);
        }

        // Moving inwards: open one list per missing level, inside the parent's open item.
        while current_level.is_none_or(|current| current < level) {
            let next_level = current_level.map_or(0, |current| current + 1);
            let start_attr = if next_level == 0 && !top_level_opened {
                top_level_opened = true;
                top_level_start.as_deref()
            } else {
                None
            };
            open_rendered_list(
                &mut html,
                &mut open_tags,
                &mut item_open,
                next_level,
                &item.tag,
                start_attr,
            );
            current_level = Some(next_level);
        }

        ensure_list_stack(&mut open_tags, &mut item_open, level);
        if open_tags[level]
            .as_deref()
            .is_some_and(|tag| tag != item.tag)
        {
            // Same level but a different list type (ul <-> ol): restart the list.
            close_rendered_list(&mut html, &mut open_tags, &mut item_open, level);
            let start_attr = if level == 0 && !top_level_opened {
                top_level_opened = true;
                top_level_start.as_deref()
            } else {
                None
            };
            open_rendered_list(
                &mut html,
                &mut open_tags,
                &mut item_open,
                level,
                &item.tag,
                start_attr,
            );
        } else if open_tags[level].is_none() {
            // No list is open at this level: open one.
            let start_attr = if level == 0 && !top_level_opened {
                top_level_opened = true;
                top_level_start.as_deref()
            } else {
                None
            };
            open_rendered_list(
                &mut html,
                &mut open_tags,
                &mut item_open,
                level,
                &item.tag,
                start_attr,
            );
        }

        // Close the previous sibling, then leave the new `<li>` open so a deeper list can
        // be nested inside it.
        close_rendered_item(&mut html, &mut item_open, level);
        html.push_str("<li>");
        html.push_str(&item.inner);
        item_open[level] = true;

        // Reset the bookkeeping of all deeper levels.
        for deeper in (level + 1)..item_open.len() {
            item_open[deeper] = false;
            open_tags[deeper] = None;
        }
    }

    // Close everything that is still open, innermost first.
    while let Some(current) = current_level {
        close_rendered_list(&mut html, &mut open_tags, &mut item_open, current);
        current_level = current.checked_sub(1);
    }

    html
}
920
/// Grows the per-level bookkeeping so that `open_tags[level]` is addressable.
///
/// The number of entries missing from `open_tags` is appended to *both* vectors:
/// `None` to `open_tags` and `false` to `item_open`.
fn ensure_list_stack(open_tags: &mut Vec<Option<String>>, item_open: &mut Vec<bool>, level: usize) {
    let Some(gap) = level.checked_sub(open_tags.len()) else {
        // `open_tags` already reaches `level`.
        return;
    };
    let missing = gap + 1;
    open_tags.extend(std::iter::repeat_with(|| None).take(missing));
    item_open.extend(std::iter::repeat(false).take(missing));
}
927
928fn open_rendered_list(
929 html: &mut String,
930 open_tags: &mut Vec<Option<String>>,
931 item_open: &mut Vec<bool>,
932 level: usize,
933 tag: &str,
934 start_attr: Option<&str>,
935) {
936 ensure_list_stack(open_tags, item_open, level);
937 html.push('<');
938 html.push_str(tag);
939 if let Some(start) = start_attr {
940 if tag == "ol" && !start.is_empty() {
941 write!(html, r#" start="{start}""#).expect("write to String");
942 }
943 }
944 html.push('>');
945 open_tags[level] = Some(tag.to_string());
946 item_open[level] = false;
947}
948
/// Emits `</li>` when an item is open at `level` and marks it closed.
///
/// Does nothing when `level` is out of range or no item is open there.
fn close_rendered_item(html: &mut String, item_open: &mut [bool], level: usize) {
    if let Some(flag) = item_open.get_mut(level) {
        if *flag {
            html.push_str("</li>");
            *flag = false;
        }
    }
}
955
956fn close_rendered_list(
957 html: &mut String,
958 open_tags: &mut [Option<String>],
959 item_open: &mut [bool],
960 level: usize,
961) {
962 close_rendered_item(html, item_open, level);
963 if let Some(tag) = open_tags.get_mut(level).and_then(Option::take) {
964 html.push_str("</");
965 html.push_str(&tag);
966 html.push('>');
967 }
968}
969
970fn google_docs_list_item_level(attrs: &str, class_styles: &HashMap<String, String>) -> usize {
971 let style = combined_attr_style(class_styles, attrs);
972 let margin_left = css_point_value(&style, "margin-left");
973 if margin_left <= 0.0 {
974 return 0;
975 }
976 [54.0, 90.0, 126.0, 162.0, 198.0, 234.0, 270.0, 306.0]
977 .iter()
978 .take_while(|boundary| margin_left >= **boundary)
979 .count()
980}
981
982fn combined_attr_style(class_styles: &HashMap<String, String>, attrs: &str) -> String {
983 let mut styles = String::new();
984 if let Some(style) = attr_value(attrs, "style") {
985 styles.push_str(&style);
986 }
987 if let Some(class_attr) = attr_value(attrs, "class") {
988 styles.push_str(&combined_class_style(class_styles, &class_attr));
989 }
990 styles
991}
992
993fn attr_value(attrs: &str, name: &str) -> Option<String> {
994 let attr_re = Regex::new(&format!(
995 r#"(?is)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)')"#,
996 regex::escape(name)
997 ))
998 .expect("valid regex");
999 attr_re.captures(attrs).and_then(|caps| {
1000 caps.get(1)
1001 .or_else(|| caps.get(2))
1002 .map(|value| value.as_str().to_string())
1003 })
1004}
1005
1006fn strip_google_docs_heading_noise(html: &str) -> String {
1007 let empty_anchor_re = Regex::new(r#"(?is)<a\s+id="[^"]*"\s*>\s*</a>"#).expect("valid regex");
1008 let numbering_re =
1009 Regex::new(r"(?is)<span\b[^>]*>\s*\d+(?:\.\d+)*\.?\s*</span>").expect("valid regex");
1010 let mut out = empty_anchor_re.replace_all(html, "").into_owned();
1011 for level in 1..=6 {
1012 let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
1013 .expect("valid regex");
1014 out = heading_re
1015 .replace_all(&out, |caps: ®ex::Captures<'_>| {
1016 let open = &caps[1];
1017 let inner = &caps[2];
1018 let close = &caps[3];
1019 let mut cleaned = empty_anchor_re.replace_all(inner, "").into_owned();
1020 cleaned = numbering_re.replace_all(&cleaned, "").into_owned();
1021 format!("{open}{cleaned}{close}")
1022 })
1023 .into_owned();
1024 }
1025 out
1026}
1027
1028fn strip_heading_inline_formatting(html: &str) -> String {
1029 let inline_marker_re = Regex::new(r"(?is)</?(?:strong|em|del)>").expect("valid regex");
1030 let mut out = html.to_string();
1031 for level in 1..=6 {
1032 let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
1033 .expect("valid regex");
1034 out = heading_re
1035 .replace_all(&out, |caps: ®ex::Captures<'_>| {
1036 let open = &caps[1];
1037 let inner = &caps[2];
1038 let close = &caps[3];
1039 let cleaned = inline_marker_re.replace_all(inner, "");
1040 format!("{open}{cleaned}{close}")
1041 })
1042 .into_owned();
1043 }
1044 out
1045}
1046
1047fn unwrap_google_redirect_links(html: &str, unwrapped_links: &mut usize) -> String {
1048 let redirect_re =
1049 Regex::new(r#"(?i)href="https?://(?:www\.)?google\.com/url\?q=([^&"]+)[^"]*""#)
1050 .expect("valid regex");
1051 redirect_re
1052 .replace_all(html, |caps: ®ex::Captures<'_>| {
1053 let encoded = caps.get(1).map_or("", |m| m.as_str());
1054 let decoded = percent_decode_utf8_lossy(encoded);
1055 *unwrapped_links += 1;
1056 format!(r#"href="{decoded}""#)
1057 })
1058 .into_owned()
1059}
1060
1061fn extract_css_class_styles(html: &str) -> HashMap<String, String> {
1062 let mut class_styles: HashMap<String, String> = HashMap::new();
1063 let style_re = Regex::new(r"(?is)<style\b[^>]*>(.*?)</style>").expect("valid regex");
1064 let class_re = Regex::new(r"\.([A-Za-z0-9_-]+)\s*\{([^{}]*)\}").expect("valid regex");
1065 for style_caps in style_re.captures_iter(html) {
1066 let css = style_caps.get(1).map_or("", |m| m.as_str());
1067 for class_caps in class_re.captures_iter(css) {
1068 let class_name = class_caps.get(1).map_or("", |m| m.as_str());
1069 let style = class_caps.get(2).map_or("", |m| m.as_str());
1070 class_styles
1071 .entry(class_name.to_string())
1072 .and_modify(|existing| {
1073 existing.push(';');
1074 existing.push_str(style);
1075 })
1076 .or_insert_with(|| style.to_string());
1077 }
1078 }
1079 class_styles
1080}
1081
/// Concatenates the CSS declarations of every known class named in `class_attr`.
///
/// Each contribution is prefixed with `;`. Unknown class names are skipped, so the result
/// is empty when none of the classes is known.
fn combined_class_style(class_styles: &HashMap<String, String>, class_attr: &str) -> String {
    let mut merged = String::new();
    for class_name in class_attr.split_whitespace() {
        if let Some(declarations) = class_styles.get(class_name) {
            merged.push(';');
            merged.push_str(declarations);
        }
    }
    merged
}
1092
1093fn semantic_wrapped_html(inner: &str, style: &str) -> Option<String> {
1094 let bold = css_has_bold(style);
1095 let italic = css_has_italic(style);
1096 let strike = css_has_strike(style);
1097 if !bold && !italic && !strike {
1098 return None;
1099 }
1100 let mut wrapped = inner.to_string();
1101 if strike {
1102 wrapped = format!("<del>{wrapped}</del>");
1103 }
1104 if italic {
1105 wrapped = format!("<em>{wrapped}</em>");
1106 }
1107 if bold {
1108 wrapped = format!("<strong>{wrapped}</strong>");
1109 }
1110 Some(wrapped)
1111}
1112
1113fn css_has_bold(style: &str) -> bool {
1114 Regex::new(r"(?i)font-weight\s*:\s*(?:bold|[6-9]\d{2})")
1115 .expect("valid regex")
1116 .is_match(style)
1117}
1118
1119fn css_has_italic(style: &str) -> bool {
1120 Regex::new(r"(?i)font-style\s*:\s*italic")
1121 .expect("valid regex")
1122 .is_match(style)
1123}
1124
1125fn css_has_strike(style: &str) -> bool {
1126 Regex::new(r"(?i)text-decoration[^;]*\bline-through\b")
1127 .expect("valid regex")
1128 .is_match(style)
1129}
1130
1131fn is_blockquote_style(style: &str) -> bool {
1132 let margin_left = css_point_value(style, "margin-left");
1133 let margin_right = css_point_value(style, "margin-right");
1134 margin_left > 0.0 && margin_right > 0.0 && (margin_left - margin_right).abs() < 0.1
1135}
1136
1137fn css_point_value(style: &str, property: &str) -> f64 {
1138 let re = Regex::new(&format!(
1139 r"(?i){}\s*:\s*(-?\d+(?:\.\d+)?)pt",
1140 regex::escape(property)
1141 ))
1142 .expect("valid regex");
1143 re.captures(style)
1144 .and_then(|caps| caps.get(1))
1145 .and_then(|value| value.as_str().parse::<f64>().ok())
1146 .unwrap_or(0.0)
1147}
1148
/// Percent-decodes `input` and interprets the resulting bytes as UTF-8.
///
/// Every `%XY` with two hexadecimal digits becomes the byte `0xXY`; malformed or truncated
/// escapes are copied through unchanged. Invalid UTF-8 in the decoded bytes is replaced
/// with U+FFFD. `+` is not treated as a space.
fn percent_decode_utf8_lossy(input: &str) -> String {
    let hex_digit = |byte: u8| char::from(byte).to_digit(16);

    let mut decoded = Vec::with_capacity(input.len());
    let mut rest = input.as_bytes();
    while let Some((&first, tail)) = rest.split_first() {
        if first == b'%' {
            if let [hi, lo, after @ ..] = tail {
                if let (Some(hi), Some(lo)) = (hex_digit(*hi), hex_digit(*lo)) {
                    if let Ok(byte) = u8::try_from((hi << 4) | lo) {
                        decoded.push(byte);
                        rest = after;
                        continue;
                    }
                }
            }
        }
        // Not a complete escape: keep the byte as it is.
        decoded.push(first);
        rest = tail;
    }
    String::from_utf8_lossy(&decoded).into_owned()
}
1172
/// Removes the backslash escapes in front of `.`, `!`, `(`, `)`, `[` and `]`.
///
/// The replacements are applied one after another in that order.
fn unescape_public_export_punctuation(markdown: &str) -> String {
    const UNESCAPES: [(&str, &str); 6] = [
        ("\\.", "."),
        ("\\!", "!"),
        ("\\(", "("),
        ("\\)", ")"),
        ("\\[", "["),
        ("\\]", "]"),
    ];

    let mut cleaned = markdown.to_string();
    for (escaped, plain) in UNESCAPES {
        cleaned = cleaned.replace(escaped, plain);
    }
    cleaned
}
1182
/// Converts setext headings (text underlined with `=====` or `-----`) to ATX headings.
///
/// A line followed by an underline of `=` becomes `# text`; one followed by an underline of
/// `-` becomes `## text`. The heading text is trimmed and the underline line is dropped.
///
/// A blank line is never treated as heading text: a run of dashes after a blank line is a
/// thematic break, not an underline, and is left as it is.
///
/// Lines are re-joined with `\n`, so a trailing newline in the input is not preserved.
fn convert_setext_headings(markdown: &str) -> String {
    let lines: Vec<&str> = markdown.lines().collect();
    let mut out: Vec<String> = Vec::with_capacity(lines.len());
    let mut index = 0;
    while index < lines.len() {
        let text = lines[index].trim();
        let heading_prefix = lines
            .get(index + 1)
            .map(|next| next.trim())
            // An underline needs non-empty text above it to form a heading.
            .filter(|_| !text.is_empty())
            .and_then(|underline| {
                if is_setext_underline(underline, '=') {
                    Some("# ")
                } else if is_setext_underline(underline, '-') {
                    Some("## ")
                } else {
                    None
                }
            });

        match heading_prefix {
            Some(prefix) => {
                out.push(format!("{prefix}{text}"));
                // Skip the underline as well.
                index += 2;
            }
            None => {
                out.push(lines[index].to_string());
                index += 1;
            }
        }
    }
    out.join("\n")
}

/// Returns `true` when `line` consists of at least five `marker` characters and nothing
/// else.
///
/// The minimum length keeps short rules such as `---` from being read as underlines.
fn is_setext_underline(line: &str, marker: char) -> bool {
    line.len() >= 5 && line.chars().all(|ch| ch == marker)
}
1210
1211fn normalize_atx_headings(markdown: &str) -> String {
1212 let heading_re = Regex::new(r"^(#{1,6})\s+(.+?)\s*$").expect("valid regex");
1213 let closing_re = closing_atx_heading_re();
1214 markdown
1215 .lines()
1216 .map(|line| {
1217 let Some(caps) = heading_re.captures(line) else {
1218 return line.to_string();
1219 };
1220 let hashes = caps.get(1).map_or("", |m| m.as_str());
1221 let mut text = caps.get(2).map_or("", |m| m.as_str()).trim().to_string();
1222 text = closing_re.replace(&text, "").trim().to_string();
1223 text = strip_wrapping_markdown_emphasis(&text);
1224 format!("{hashes} {text}")
1225 })
1226 .collect::<Vec<_>>()
1227 .join("\n")
1228}
1229
/// Removes one layer of `***`, `**` or `*` that both starts and ends the trimmed text.
///
/// The longest marker is tried first, and the text must be longer than the two markers
/// combined. The check is purely textual: it does not verify that the leading and trailing
/// markers belong to the same emphasis span.
fn strip_wrapping_markdown_emphasis(text: &str) -> String {
    let trimmed = text.trim();
    for marker in ["***", "**", "*"] {
        if trimmed.len() <= marker.len() * 2 {
            continue;
        }
        let unwrapped = trimmed
            .strip_prefix(marker)
            .and_then(|rest| rest.strip_suffix(marker));
        if let Some(inner) = unwrapped {
            return inner.trim().to_string();
        }
    }
    trimmed.to_string()
}
1244
1245fn normalize_bullet_markers(markdown: &str) -> String {
1246 let bullet_re = asterisk_bullet_re();
1247 markdown
1248 .lines()
1249 .map(|line| bullet_re.replace(line, "$1- ").into_owned())
1250 .collect::<Vec<_>>()
1251 .join("\n")
1252}
1253
1254fn normalize_list_spacing(markdown: &str) -> String {
1255 let lines: Vec<&str> = markdown.lines().collect();
1256 let mut out = Vec::with_capacity(lines.len());
1257
1258 for (index, line) in lines.iter().enumerate() {
1259 if line.trim().is_empty()
1260 && previous_non_empty_line(&lines, index).is_some_and(is_markdown_list_item)
1261 && next_non_empty_line(&lines, index).is_some_and(is_markdown_list_item)
1262 {
1263 continue;
1264 }
1265 out.push((*line).to_string());
1266 }
1267
1268 out.join("\n")
1269}
1270
/// Returns the closest line before `index` that is not blank after trimming,
/// or `None` when every earlier line is blank (or there is none).
fn previous_non_empty_line<'a>(lines: &'a [&str], index: usize) -> Option<&'a str> {
    let preceding = &lines[..index];
    let mut cursor = preceding.len();
    while cursor > 0 {
        cursor -= 1;
        let candidate = preceding[cursor];
        if !candidate.trim().is_empty() {
            return Some(candidate);
        }
    }
    None
}
1278
/// Returns the closest line after `index` that is not blank after trimming,
/// or `None` when every later line is blank (or there is none).
fn next_non_empty_line<'a>(lines: &'a [&str], index: usize) -> Option<&'a str> {
    let following = &lines[index + 1..];
    for candidate in following {
        if !candidate.trim().is_empty() {
            return Some(*candidate);
        }
    }
    None
}
1285
1286fn is_markdown_list_item(line: &str) -> bool {
1287 markdown_list_item_re().is_match(line)
1288}
1289
/// Normalizes blank lines in and around blockquotes.
///
/// * Blank lines and bare `>` lines inside a quote collapse into a single
///   `>` separator, emitted only if another `> ` line follows.
/// * Bare `>` lines outside a quote are dropped.
/// * The first non-blank, non-quote line after a quote is preceded by one
///   empty line.
///
/// Every emitted line is terminated with `\n`, so non-empty output always
/// ends with a newline.
fn normalize_blockquote_spacing(markdown: &str) -> String {
    let mut normalized = String::with_capacity(markdown.len());
    let mut inside_quote = false;
    let mut separator_owed = false;

    for raw in markdown.lines() {
        let stripped = raw.trim();
        let is_blank = stripped.is_empty();

        if is_blank && inside_quote {
            // Remember the gap; it only materializes if the quote continues.
            separator_owed = true;
        } else if stripped == ">" {
            separator_owed |= inside_quote;
        } else if raw.starts_with("> ") {
            if std::mem::take(&mut separator_owed) {
                normalized.push_str(">\n");
            }
            normalized.push_str(raw);
            normalized.push('\n');
            inside_quote = true;
        } else {
            if inside_quote && !is_blank {
                // Separate the finished quote from the following content.
                normalized.push('\n');
            }
            separator_owed = false;
            inside_quote = false;
            normalized.push_str(raw);
            normalized.push('\n');
        }
    }

    normalized
}
1330
1331fn normalize_markdown_tables(markdown: &str) -> String {
1332 let lines: Vec<&str> = markdown.lines().collect();
1333 let mut out = Vec::with_capacity(lines.len());
1334 let mut index = 0;
1335
1336 while index < lines.len() {
1337 if !is_markdown_table_line(lines[index]) {
1338 out.push(lines[index].to_string());
1339 index += 1;
1340 continue;
1341 }
1342
1343 let start = index;
1344 while index < lines.len() && is_markdown_table_line(lines[index]) {
1345 index += 1;
1346 }
1347 let block = &lines[start..index];
1348 if block.len() >= 2 && is_markdown_separator_line(block[1]) {
1349 out.extend(normalize_markdown_table_block(block));
1350 } else {
1351 out.extend(block.iter().map(|line| (*line).to_string()));
1352 }
1353 }
1354
1355 out.join("\n")
1356}
1357
/// Reports whether the trimmed line begins and ends with two distinct `|`
/// characters, i.e. it looks like a pipe-delimited table row.
fn is_markdown_table_line(line: &str) -> bool {
    // After removing the leading pipe, a remaining trailing pipe guarantees
    // the line holds at least two pipes.
    line.trim()
        .strip_prefix('|')
        .is_some_and(|rest| rest.ends_with('|'))
}
1362
1363fn is_markdown_separator_line(line: &str) -> bool {
1364 split_markdown_table_cells(line)
1365 .iter()
1366 .all(|cell| markdown_table_separator_cell_re().is_match(cell))
1367}
1368
1369fn normalize_markdown_table_block(lines: &[&str]) -> Vec<String> {
1370 lines
1371 .iter()
1372 .enumerate()
1373 .map(|(index, line)| {
1374 let cells = split_markdown_table_cells(line);
1375 if index == 1 {
1376 let separators = vec!["---".to_string(); cells.len()];
1377 render_markdown_table_row(&separators)
1378 } else {
1379 render_markdown_table_row(&cells)
1380 }
1381 })
1382 .collect()
1383}
1384
/// Splits a pipe-delimited table row into trimmed cell strings.
///
/// Exactly one leading and one trailing `|` are treated as the row border.
/// Any further pipes delimit cells, so empty first or last cells (as in
/// `|| b |`) are preserved and the column count of the row is not changed.
/// Rows without border pipes are split on `|` as-is.
fn split_markdown_table_cells(line: &str) -> Vec<String> {
    let trimmed = line.trim();
    let without_leading = trimmed.strip_prefix('|').unwrap_or(trimmed);
    let inner = without_leading
        .strip_suffix('|')
        .unwrap_or(without_leading);
    inner
        .split('|')
        .map(|cell| cell.trim().to_string())
        .collect()
}
1392
/// Renders cells as a table row of the form `| a | b |`.
fn render_markdown_table_row(cells: &[String]) -> String {
    let mut row = String::from("| ");
    for (position, cell) in cells.iter().enumerate() {
        if position > 0 {
            row.push_str(" | ");
        }
        row.push_str(cell);
    }
    row.push_str(" |");
    row
}
1396
1397fn closing_atx_heading_re() -> &'static Regex {
1398 static RE: OnceLock<Regex> = OnceLock::new();
1399 RE.get_or_init(|| Regex::new(r"\s+#{1,6}$").expect("valid regex"))
1400}
1401
1402fn asterisk_bullet_re() -> &'static Regex {
1403 static RE: OnceLock<Regex> = OnceLock::new();
1404 RE.get_or_init(|| Regex::new(r"^(\s*)\* ").expect("valid regex"))
1405}
1406
1407fn markdown_list_item_re() -> &'static Regex {
1408 static RE: OnceLock<Regex> = OnceLock::new();
1409 RE.get_or_init(|| Regex::new(r"^\s*(?:[-+*]|\d+\.)\s+").expect("valid regex"))
1410}
1411
1412fn markdown_table_separator_cell_re() -> &'static Regex {
1413 static RE: OnceLock<Regex> = OnceLock::new();
1414 RE.get_or_init(|| Regex::new(r"^:?-{3,}:?$").expect("valid regex"))
1415}
1416
1417pub async fn fetch_google_doc_from_docs_api(
1423 url: &str,
1424 api_token: &str,
1425) -> crate::Result<GDocsRenderedResult> {
1426 let document_id = extract_document_id(url).ok_or_else(|| {
1427 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
1428 })?;
1429 let api_url = build_docs_api_url(&document_id);
1430 debug!(
1431 document_id = %document_id,
1432 api_url = %api_url,
1433 "fetching Google Doc via Docs API"
1434 );
1435
1436 let response = reqwest::Client::new()
1437 .get(&api_url)
1438 .header("Authorization", format!("Bearer {api_token}"))
1439 .header("Accept", "application/json")
1440 .send()
1441 .await
1442 .map_err(|e| {
1443 WebCaptureError::FetchError(format!("Failed to fetch Google Doc via Docs API: {e}"))
1444 })?;
1445 debug!(
1446 document_id = %document_id,
1447 status = response.status().as_u16(),
1448 success = response.status().is_success(),
1449 content_type = response
1450 .headers()
1451 .get(reqwest::header::CONTENT_TYPE)
1452 .and_then(|value| value.to_str().ok())
1453 .unwrap_or(""),
1454 "received Google Docs API response"
1455 );
1456
1457 if !response.status().is_success() {
1458 return Err(WebCaptureError::FetchError(format!(
1459 "Failed to fetch Google Doc via Docs API ({} {}): {}",
1460 response.status().as_u16(),
1461 response.status().canonical_reason().unwrap_or("Unknown"),
1462 api_url
1463 )));
1464 }
1465
1466 let body = response.text().await.map_err(|e| {
1467 WebCaptureError::FetchError(format!("Failed to read Google Docs API response: {e}"))
1468 })?;
1469 let document = serde_json::from_str::<Value>(&body).map_err(|e| {
1470 WebCaptureError::ParseError(format!("Failed to parse Google Docs API response: {e}"))
1471 })?;
1472 let rendered = render_docs_api_document(&document);
1473 debug!(
1474 document_id = %document_id,
1475 title = document.get("title").and_then(|value| value.as_str()).unwrap_or(""),
1476 markdown_bytes = rendered.markdown.len(),
1477 html_bytes = rendered.html.len(),
1478 text_bytes = rendered.text.len(),
1479 "rendered Google Docs API document"
1480 );
1481
1482 Ok(GDocsRenderedResult {
1483 markdown: rendered.markdown,
1484 html: rendered.html,
1485 text: rendered.text,
1486 document_id,
1487 export_url: api_url,
1488 remote_images: Vec::new(),
1489 })
1490}
1491
1492pub async fn fetch_google_doc_from_model(
1498 url: &str,
1499 api_token: Option<&str>,
1500) -> crate::Result<GDocsRenderedResult> {
1501 if api_token.is_some() {
1502 return Err(WebCaptureError::BrowserError(
1503 "Rust browser-model Google Docs capture cannot inject API tokens; use --capture api for authenticated Docs API capture".to_string(),
1504 ));
1505 }
1506 let document_id = extract_document_id(url).ok_or_else(|| {
1507 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
1508 })?;
1509 let edit_url = build_edit_url(&document_id);
1510 debug!(
1511 document_id = %document_id,
1512 edit_url = %edit_url,
1513 "capturing Google Doc editor model with a real browser"
1514 );
1515 let model_data = fetch_google_doc_editor_model_with_cdp(&edit_url, &document_id).await?;
1516 let BrowserModelData {
1517 chunks,
1518 cid_urls,
1519 chunk_payload_bytes,
1520 poll_count,
1521 stable_for,
1522 } = model_data;
1523 debug!(
1524 document_id = %document_id,
1525 chunks = chunks.len(),
1526 cid_urls = cid_urls.len(),
1527 chunk_payload_bytes,
1528 poll_count,
1529 stable_for_ms = stable_for.as_millis(),
1530 "extracted Google Docs editor model chunks through CDP"
1531 );
1532 if chunks.is_empty() {
1533 return Err(WebCaptureError::ParseError(
1534 "Google Docs editor page did not expose DOCS_modelChunk data".to_string(),
1535 ));
1536 }
1537
1538 let export_html = match fetch_google_doc(url, "html", None).await {
1539 Ok(result) => Some(result.content),
1540 Err(error) => {
1541 warn!(
1542 document_id = %document_id,
1543 error = %error,
1544 "failed to fetch Google Docs export HTML for browser-model semantic hints"
1545 );
1546 None
1547 }
1548 };
1549 let capture = parse_model_chunks_with_export_html(&chunks, &cid_urls, export_html.as_deref());
1550 let remote_images = remote_images_from_capture(&capture);
1551 info!(
1552 document_id = %document_id,
1553 chunks = chunks.len(),
1554 cid_urls = cid_urls.len(),
1555 chunk_payload_bytes,
1556 poll_count,
1557 stable_for_ms = stable_for.as_millis(),
1558 blocks = capture.blocks.len(),
1559 tables = capture.tables.len(),
1560 images = capture.images.len(),
1561 text_bytes = capture.text.len(),
1562 "parsed Google Docs editor model"
1563 );
1564
1565 Ok(GDocsRenderedResult {
1566 markdown: render_captured_document(&capture, "markdown"),
1567 html: render_captured_document(&capture, "html"),
1568 text: render_captured_document(&capture, "txt"),
1569 document_id,
1570 export_url: edit_url,
1571 remote_images,
1572 })
1573}
1574
1575async fn fetch_google_doc_editor_model_with_cdp(
1576 edit_url: &str,
1577 document_id: &str,
1578) -> crate::Result<BrowserModelData> {
1579 let chrome = crate::browser::find_chrome_executable().ok_or_else(|| {
1580 WebCaptureError::BrowserError(
1581 "Chrome/Chromium executable was not found. Set WEB_CAPTURE_CHROME, CHROME_PATH, or GOOGLE_CHROME_BIN.".to_string(),
1582 )
1583 })?;
1584 let user_data_dir = crate::browser::temporary_user_data_dir();
1585 std::fs::create_dir_all(&user_data_dir)?;
1586
1587 debug!(
1588 document_id = %document_id,
1589 chrome = %chrome.display(),
1590 user_data_dir = %user_data_dir.display(),
1591 edit_url = %edit_url,
1592 "launching headless Chrome CDP session for Google Docs model capture"
1593 );
1594
1595 let mut child = launch_cdp_chrome(&chrome, &user_data_dir)?;
1596 let capture_result = async {
1597 let ws_url = wait_for_devtools_ws_url(&mut child).await?;
1598 let (mut ws, _) = connect_async(&ws_url).await.map_err(|error| {
1599 WebCaptureError::BrowserError(format!(
1600 "Failed to connect to Chrome DevTools websocket: {error}"
1601 ))
1602 })?;
1603 let mut next_id = 0u64;
1604 let session_id = navigate_google_docs_cdp_page(&mut ws, &mut next_id, edit_url).await?;
1605 wait_for_google_docs_model_chunks(&mut ws, &mut next_id, &session_id, document_id).await
1606 }
1607 .await;
1608
1609 if let Err(error) = child.kill().await {
1610 debug!(
1611 document_id = %document_id,
1612 error = %error,
1613 "failed to kill Chrome CDP browser process"
1614 );
1615 }
1616 let _ = child.wait().await;
1617 let _ = std::fs::remove_dir_all(&user_data_dir);
1618
1619 capture_result
1620}
1621
1622async fn navigate_google_docs_cdp_page(
1623 ws: &mut CdpWebSocket,
1624 next_id: &mut u64,
1625 edit_url: &str,
1626) -> crate::Result<String> {
1627 let target = cdp_send(
1628 ws,
1629 next_id,
1630 None,
1631 "Target.createTarget",
1632 serde_json::json!({ "url": "about:blank" }),
1633 )
1634 .await?;
1635 let target_id = target
1636 .get("targetId")
1637 .and_then(Value::as_str)
1638 .ok_or_else(|| {
1639 WebCaptureError::BrowserError(
1640 "Chrome DevTools Target.createTarget did not return targetId".to_string(),
1641 )
1642 })?
1643 .to_string();
1644 let attached = cdp_send(
1645 ws,
1646 next_id,
1647 None,
1648 "Target.attachToTarget",
1649 serde_json::json!({ "targetId": target_id, "flatten": true }),
1650 )
1651 .await?;
1652 let session_id = attached
1653 .get("sessionId")
1654 .and_then(Value::as_str)
1655 .ok_or_else(|| {
1656 WebCaptureError::BrowserError(
1657 "Chrome DevTools Target.attachToTarget did not return sessionId".to_string(),
1658 )
1659 })?
1660 .to_string();
1661
1662 cdp_send(
1663 ws,
1664 next_id,
1665 Some(&session_id),
1666 "Page.enable",
1667 serde_json::json!({}),
1668 )
1669 .await?;
1670 cdp_send(
1671 ws,
1672 next_id,
1673 Some(&session_id),
1674 "Runtime.enable",
1675 serde_json::json!({}),
1676 )
1677 .await?;
1678 cdp_send(
1679 ws,
1680 next_id,
1681 Some(&session_id),
1682 "Page.addScriptToEvaluateOnNewDocument",
1683 serde_json::json!({ "source": GDOCS_MODEL_CAPTURE_INIT_SCRIPT }),
1684 )
1685 .await?;
1686 cdp_send(
1687 ws,
1688 next_id,
1689 Some(&session_id),
1690 "Page.navigate",
1691 serde_json::json!({ "url": edit_url }),
1692 )
1693 .await?;
1694
1695 Ok(session_id)
1696}
1697
1698async fn wait_for_google_docs_model_chunks(
1699 ws: &mut CdpWebSocket,
1700 next_id: &mut u64,
1701 session_id: &str,
1702 document_id: &str,
1703) -> crate::Result<BrowserModelData> {
1704 let started = Instant::now();
1705 let max_wait = gdocs_editor_model_max_wait();
1706 let stability_window = gdocs_editor_model_stability_window();
1707 let mut quiescence = BrowserModelQuiescence::default();
1708 let mut last_chunks = 0usize;
1709 let mut last_cid_urls = 0usize;
1710 let mut last_payload_bytes = 0usize;
1711 let mut last_stable_for = Duration::ZERO;
1712 let mut poll_count = 0usize;
1713
1714 while started.elapsed() < max_wait {
1715 let result = cdp_send(
1716 ws,
1717 next_id,
1718 Some(session_id),
1719 "Runtime.evaluate",
1720 serde_json::json!({
1721 "expression": format!("({GDOCS_MODEL_EXTRACT_SCRIPT})()"),
1722 "returnByValue": true,
1723 "awaitPromise": true
1724 }),
1725 )
1726 .await?;
1727 if let Some(exception) = result.get("exceptionDetails") {
1728 return Err(WebCaptureError::BrowserError(format!(
1729 "Google Docs model extraction script failed: {exception}"
1730 )));
1731 }
1732 let value = result
1733 .pointer("/result/value")
1734 .cloned()
1735 .unwrap_or(Value::Null);
1736 let model_data = browser_model_data_from_value(&value);
1737 poll_count += 1;
1738 let fingerprint = model_data.fingerprint();
1739 last_chunks = model_data.chunks.len();
1740 last_cid_urls = model_data.cid_urls.len();
1741 last_payload_bytes = model_data.chunk_payload_bytes;
1742 let now = Instant::now();
1743 if let Some(stable_for) = quiescence.observe(fingerprint, now, stability_window) {
1744 let mut model_data = model_data;
1745 model_data.poll_count = poll_count;
1746 model_data.stable_for = stable_for;
1747 debug!(
1748 document_id = %document_id,
1749 chunks = model_data.chunks.len(),
1750 cid_urls = model_data.cid_urls.len(),
1751 chunk_payload_bytes = model_data.chunk_payload_bytes,
1752 poll_count,
1753 stable_for_ms = stable_for.as_millis(),
1754 elapsed_ms = started.elapsed().as_millis(),
1755 "captured quiesced Google Docs model chunks through CDP Runtime.evaluate"
1756 );
1757 return Ok(model_data);
1758 }
1759 last_stable_for = quiescence.stable_for(now);
1760 tokio::time::sleep(GDOCS_EDITOR_MODEL_POLL_INTERVAL).await;
1761 }
1762
1763 Err(WebCaptureError::BrowserError(format!(
1764 "Timed out waiting for Google Docs DOCS_modelChunk stream to quiesce for document {document_id} after {} ms (last chunks={last_chunks}, payload_bytes={last_payload_bytes}, cid_urls={last_cid_urls}, poll_count={poll_count}, stable_for_ms={})",
1765 max_wait.as_millis(),
1766 last_stable_for.as_millis()
1767 )))
1768}
1769
1770fn launch_cdp_chrome(
1771 chrome: &std::path::Path,
1772 user_data_dir: &std::path::Path,
1773) -> crate::Result<Child> {
1774 let mut command = Command::new(chrome);
1775 command
1776 .args([
1777 "--headless=new",
1778 "--disable-gpu",
1779 "--disable-extensions",
1780 "--disable-dev-shm-usage",
1781 "--disable-background-networking",
1782 "--disable-component-update",
1783 "--disable-default-apps",
1784 "--disable-sync",
1785 "--metrics-recording-only",
1786 "--no-default-browser-check",
1787 "--no-first-run",
1788 "--no-sandbox",
1789 "--remote-debugging-port=0",
1790 "--window-size=1280,800",
1791 ])
1792 .arg(format!("--user-data-dir={}", user_data_dir.display()))
1793 .arg(format!("--user-agent={GDOCS_USER_AGENT}"))
1794 .stderr(Stdio::piped())
1795 .stdout(Stdio::null())
1796 .kill_on_drop(true);
1797
1798 command.spawn().map_err(|error| {
1799 WebCaptureError::BrowserError(format!("Failed to launch Chrome CDP browser: {error}"))
1800 })
1801}
1802
1803async fn wait_for_devtools_ws_url(child: &mut Child) -> crate::Result<String> {
1804 let stderr = child.stderr.take().ok_or_else(|| {
1805 WebCaptureError::BrowserError("Chrome CDP process did not expose stderr".to_string())
1806 })?;
1807 let mut lines = BufReader::new(stderr).lines();
1808 let started = Instant::now();
1809
1810 while started.elapsed() < GDOCS_BROWSER_LAUNCH_TIMEOUT {
1811 let line = tokio::time::timeout(Duration::from_millis(250), lines.next_line()).await;
1812 match line {
1813 Ok(Ok(Some(line))) => {
1814 if let Some((_, ws_url)) = line.split_once("DevTools listening on ") {
1815 return Ok(ws_url.trim().to_string());
1816 }
1817 }
1818 Ok(Ok(None)) => {
1819 break;
1820 }
1821 Ok(Err(error)) => {
1822 return Err(WebCaptureError::BrowserError(format!(
1823 "Failed to read Chrome CDP stderr: {error}"
1824 )));
1825 }
1826 Err(_) => {}
1827 }
1828 }
1829
1830 Err(WebCaptureError::BrowserError(format!(
1831 "Timed out waiting for Chrome DevTools websocket URL after {} ms",
1832 GDOCS_BROWSER_LAUNCH_TIMEOUT.as_millis()
1833 )))
1834}
1835
1836async fn cdp_send(
1837 ws: &mut CdpWebSocket,
1838 next_id: &mut u64,
1839 session_id: Option<&str>,
1840 method: &str,
1841 params: Value,
1842) -> crate::Result<Value> {
1843 *next_id += 1;
1844 let id = *next_id;
1845 let mut message = serde_json::json!({
1846 "id": id,
1847 "method": method,
1848 "params": params
1849 });
1850 if let Some(session_id) = session_id {
1851 message["sessionId"] = Value::String(session_id.to_string());
1852 }
1853
1854 ws.send(Message::Text(message.to_string()))
1855 .await
1856 .map_err(|error| {
1857 WebCaptureError::BrowserError(format!(
1858 "Failed to send Chrome DevTools command {method}: {error}"
1859 ))
1860 })?;
1861
1862 while let Some(message) = ws.next().await {
1863 let message = message.map_err(|error| {
1864 WebCaptureError::BrowserError(format!(
1865 "Failed to read Chrome DevTools response for {method}: {error}"
1866 ))
1867 })?;
1868 if !message.is_text() {
1869 continue;
1870 }
1871 let text = message.to_text().map_err(|error| {
1872 WebCaptureError::BrowserError(format!(
1873 "Chrome DevTools response for {method} was not text: {error}"
1874 ))
1875 })?;
1876 let value = serde_json::from_str::<Value>(text).map_err(|error| {
1877 WebCaptureError::ParseError(format!(
1878 "Failed to parse Chrome DevTools response for {method}: {error}; response={text}"
1879 ))
1880 })?;
1881 if value.get("id").and_then(Value::as_u64) != Some(id) {
1882 continue;
1883 }
1884 if let Some(error) = value.get("error") {
1885 return Err(WebCaptureError::BrowserError(format!(
1886 "Chrome DevTools command {method} failed: {error}"
1887 )));
1888 }
1889 return Ok(value.get("result").cloned().unwrap_or(Value::Null));
1890 }
1891
1892 Err(WebCaptureError::BrowserError(format!(
1893 "Chrome DevTools websocket closed before response for {method}"
1894 )))
1895}
1896
1897fn browser_model_data_from_value(value: &Value) -> BrowserModelData {
1898 let chunks = value
1899 .get("chunks")
1900 .and_then(Value::as_array)
1901 .cloned()
1902 .unwrap_or_default();
1903 let chunk_payload_bytes = model_chunk_payload_bytes(&chunks);
1904 let cid_urls = value
1905 .get("cidUrlMap")
1906 .and_then(Value::as_object)
1907 .map(|map| {
1908 map.iter()
1909 .filter_map(|(key, value)| value.as_str().map(|url| (key.clone(), url.to_string())))
1910 .collect::<HashMap<_, _>>()
1911 })
1912 .unwrap_or_default();
1913 BrowserModelData {
1914 chunks,
1915 cid_urls,
1916 chunk_payload_bytes,
1917 poll_count: 0,
1918 stable_for: Duration::ZERO,
1919 }
1920}
1921
1922fn model_chunk_payload_bytes(chunks: &[Value]) -> usize {
1923 chunks
1924 .iter()
1925 .map(|chunk| serde_json::to_vec(chunk).map_or(0, |encoded| encoded.len()))
1926 .sum()
1927}
1928
1929fn gdocs_editor_model_max_wait() -> Duration {
1930 duration_from_env_ms(
1931 "WEB_CAPTURE_GDOCS_MAX_WAIT_MS",
1932 GDOCS_EDITOR_MODEL_MAX_WAIT_DEFAULT,
1933 )
1934}
1935
1936fn gdocs_editor_model_stability_window() -> Duration {
1937 duration_from_env_ms(
1938 "WEB_CAPTURE_GDOCS_STABILITY_MS",
1939 GDOCS_EDITOR_MODEL_STABILITY_DEFAULT,
1940 )
1941}
1942
1943fn duration_from_env_ms(name: &str, default: Duration) -> Duration {
1944 std::env::var(name).map_or(default, |value| match value.trim().parse::<u64>() {
1945 Ok(ms) => Duration::from_millis(ms),
1946 Err(error) => {
1947 warn!(
1948 name,
1949 value,
1950 error = %error,
1951 default_ms = default.as_millis(),
1952 "ignoring invalid Google Docs model wait environment variable"
1953 );
1954 default
1955 }
1956 })
1957}
1958
1959fn remote_images_from_capture(capture: &CapturedDocument) -> Vec<RemoteImage> {
1960 capture
1961 .images
1962 .iter()
1963 .filter_map(|node| match node {
1964 ContentNode::Image {
1965 url: Some(url),
1966 alt,
1967 ..
1968 } => Some(RemoteImage {
1969 url: url.clone(),
1970 alt: alt.clone(),
1971 }),
1972 ContentNode::Image { .. } | ContentNode::Text { .. } => None,
1973 })
1974 .collect()
1975}
1976
1977#[must_use]
1979pub fn render_docs_api_document(document: &Value) -> GDocsRenderedOutput {
1980 let blocks = structural_elements_to_blocks(
1981 document
1982 .pointer("/body/content")
1983 .and_then(Value::as_array)
1984 .map_or(&[] as &[Value], Vec::as_slice),
1985 document.pointer("/inlineObjects").unwrap_or(&Value::Null),
1986 );
1987 GDocsRenderedOutput {
1988 markdown: render_blocks_markdown(&blocks),
1989 html: render_blocks_html(&blocks),
1990 text: blocks_to_text(&blocks),
1991 }
1992}
1993
/// The three renderings of a Docs API document, as produced by
/// `render_docs_api_document`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GDocsRenderedOutput {
    /// Markdown rendering of the document blocks.
    pub markdown: String,
    /// HTML rendering of the document blocks.
    pub html: String,
    /// Plain-text rendering of the document blocks.
    pub text: String,
}
2004
2005fn structural_elements_to_blocks(elements: &[Value], inline_objects: &Value) -> Vec<CapturedBlock> {
2006 let mut blocks = Vec::new();
2007 for element in elements {
2008 if let Some(paragraph) = element.get("paragraph") {
2009 let content = paragraph_to_content(paragraph, inline_objects);
2010 if !content_to_text(&content).trim().is_empty()
2011 || content
2012 .iter()
2013 .any(|node| matches!(node, ContentNode::Image { .. }))
2014 {
2015 blocks.push(CapturedBlock::Paragraph {
2016 style: paragraph
2017 .pointer("/paragraphStyle/namedStyleType")
2018 .and_then(Value::as_str)
2019 .map(ToString::to_string),
2020 list: None,
2021 quote: false,
2022 horizontal_rule: false,
2023 content,
2024 });
2025 }
2026 } else if let Some(table) = element.get("table") {
2027 blocks.push(CapturedBlock::Table(table_to_block(table, inline_objects)));
2028 }
2029 }
2030 blocks
2031}
2032
2033fn table_to_block(table: &Value, inline_objects: &Value) -> TableBlock {
2034 let rows = table
2035 .get("tableRows")
2036 .and_then(Value::as_array)
2037 .map_or(&[] as &[Value], Vec::as_slice)
2038 .iter()
2039 .map(|row| TableRow {
2040 cells: row
2041 .get("tableCells")
2042 .and_then(Value::as_array)
2043 .map_or(&[] as &[Value], Vec::as_slice)
2044 .iter()
2045 .map(|cell| TableCell {
2046 content: structural_elements_to_inline_content(
2047 cell.get("content")
2048 .and_then(Value::as_array)
2049 .map_or(&[] as &[Value], Vec::as_slice),
2050 inline_objects,
2051 ),
2052 })
2053 .collect(),
2054 })
2055 .collect();
2056 TableBlock { rows }
2057}
2058
2059fn structural_elements_to_inline_content(
2060 elements: &[Value],
2061 inline_objects: &Value,
2062) -> Vec<ContentNode> {
2063 let mut content = Vec::new();
2064 for element in elements {
2065 if let Some(paragraph) = element.get("paragraph") {
2066 let paragraph_content = paragraph_to_content(paragraph, inline_objects);
2067 if !content.is_empty() && !paragraph_content.is_empty() {
2068 append_text(&mut content, "\n");
2069 }
2070 content.extend(paragraph_content);
2071 } else if let Some(table) = element.get("table") {
2072 append_text(
2073 &mut content,
2074 &render_blocks_markdown(&[CapturedBlock::Table(table_to_block(
2075 table,
2076 inline_objects,
2077 ))]),
2078 );
2079 }
2080 }
2081 content
2082}
2083
2084fn paragraph_to_content(paragraph: &Value, inline_objects: &Value) -> Vec<ContentNode> {
2085 let mut content = Vec::new();
2086 for element in paragraph
2087 .get("elements")
2088 .and_then(Value::as_array)
2089 .map_or(&[] as &[Value], Vec::as_slice)
2090 {
2091 if let Some(text) = element
2092 .pointer("/textRun/content")
2093 .and_then(Value::as_str)
2094 .map(|text| text.strip_suffix('\n').unwrap_or(text))
2095 {
2096 append_text(&mut content, text);
2097 } else if let Some(inline_id) = element
2098 .pointer("/inlineObjectElement/inlineObjectId")
2099 .and_then(Value::as_str)
2100 {
2101 if let Some(image) = inline_object_to_image(inline_id, inline_objects) {
2102 content.push(image);
2103 }
2104 }
2105 }
2106 content
2107}
2108
2109fn inline_object_to_image(inline_id: &str, inline_objects: &Value) -> Option<ContentNode> {
2110 let embedded = inline_objects
2111 .get(inline_id)?
2112 .pointer("/inlineObjectProperties/embeddedObject")?;
2113 let url = embedded
2114 .pointer("/imageProperties/contentUri")
2115 .or_else(|| embedded.pointer("/imageProperties/sourceUri"))
2116 .and_then(Value::as_str)?;
2117 let alt = embedded
2118 .get("title")
2119 .or_else(|| embedded.get("description"))
2120 .and_then(Value::as_str)
2121 .unwrap_or("image");
2122 Some(ContentNode::Image {
2123 cid: None,
2124 url: Some(url.to_string()),
2125 alt: alt.to_string(),
2126 width: json_dimension_to_string(embedded.pointer("/size/width/magnitude")),
2127 height: json_dimension_to_string(embedded.pointer("/size/height/magnitude")),
2128 is_suggestion: false,
2129 })
2130}
2131
2132fn json_dimension_to_string(value: Option<&Value>) -> Option<String> {
2133 match value? {
2134 Value::Number(number) => Some(number.to_string()),
2135 Value::String(text) if !text.is_empty() => Some(text.clone()),
2136 _ => None,
2137 }
2138}
2139
2140fn build_model_style_maps(
2141 items: &[Value],
2142 text_len: usize,
2143 utf16_position_map: &[usize],
2144) -> ModelStyleMaps {
2145 let mut maps = ModelStyleMaps {
2146 inline_styles: vec![TextStyle::default(); text_len],
2147 ..ModelStyleMaps::default()
2148 };
2149
2150 for item in items {
2151 if item.get("ty").and_then(Value::as_str) != Some("as") {
2152 continue;
2153 }
2154 let (Some(start), Some(end), Some(style_type)) = (
2155 item.get("si").and_then(Value::as_u64),
2156 item.get("ei").and_then(Value::as_u64),
2157 item.get("st").and_then(Value::as_str),
2158 ) else {
2159 continue;
2160 };
2161 let (Ok(start), Ok(end)) = (usize::try_from(start), usize::try_from(end)) else {
2162 continue;
2163 };
2164
2165 let start = utf16_position_to_char_position(utf16_position_map, start);
2166 let end = utf16_position_to_char_position(utf16_position_map, end);
2167 if start == 0 || end == 0 {
2168 continue;
2169 }
2170
2171 match style_type {
2172 "text" => {
2173 let style = text_style(item);
2174 apply_inline_style(&mut maps.inline_styles, start, end, &style);
2175 }
2176 "link" => {
2177 let style = TextStyle {
2178 link: item
2179 .pointer("/sm/lnks_link/ulnk_url")
2180 .and_then(Value::as_str)
2181 .map(ToString::to_string),
2182 ..TextStyle::default()
2183 };
2184 apply_inline_style(&mut maps.inline_styles, start, end, &style);
2185 }
2186 "paragraph" => {
2187 maps.paragraph_by_end
2188 .insert(end, paragraph_style_from_model(item));
2189 }
2190 "list" => {
2191 maps.list_by_end.insert(
2192 end,
2193 ListMeta {
2194 id: item
2195 .pointer("/sm/ls_id")
2196 .and_then(Value::as_str)
2197 .unwrap_or("")
2198 .to_string(),
2199 level: item
2200 .pointer("/sm/ls_nest")
2201 .and_then(Value::as_u64)
2202 .and_then(|value| usize::try_from(value).ok())
2203 .unwrap_or(0),
2204 ordered: false,
2205 },
2206 );
2207 }
2208 "horizontal_rule" => {
2209 maps.horizontal_rules.insert(end);
2210 }
2211 _ => {}
2212 }
2213 }
2214
2215 maps
2216}
2217
2218fn apply_inline_style(styles: &mut [TextStyle], start: usize, end: usize, patch: &TextStyle) {
2219 let from = start.saturating_sub(1);
2220 let to = end.min(styles.len());
2221 if from >= to {
2222 return;
2223 }
2224 for style in &mut styles[from..to] {
2225 if patch.bold {
2226 style.bold = true;
2227 }
2228 if patch.italic {
2229 style.italic = true;
2230 }
2231 if patch.strike {
2232 style.strike = true;
2233 }
2234 if patch.link.is_some() {
2235 style.link.clone_from(&patch.link);
2236 }
2237 }
2238}
2239
2240fn text_style(item: &Value) -> TextStyle {
2241 TextStyle {
2242 bold: item.pointer("/sm/ts_bd").and_then(Value::as_bool) == Some(true)
2243 && item.pointer("/sm/ts_bd_i").and_then(Value::as_bool) != Some(true),
2244 italic: item.pointer("/sm/ts_it").and_then(Value::as_bool) == Some(true)
2245 && item.pointer("/sm/ts_it_i").and_then(Value::as_bool) != Some(true),
2246 strike: item.pointer("/sm/ts_st").and_then(Value::as_bool) == Some(true)
2247 && item.pointer("/sm/ts_st_i").and_then(Value::as_bool) != Some(true),
2248 link: None,
2249 }
2250}
2251
2252fn paragraph_style_from_model(item: &Value) -> ParagraphStyle {
2253 let heading = item.pointer("/sm/ps_hd").and_then(Value::as_u64);
2254 ParagraphStyle {
2255 style: heading.map(|level| format!("HEADING_{level}")),
2256 indent_start: item
2257 .pointer("/sm/ps_il")
2258 .and_then(Value::as_f64)
2259 .unwrap_or(0.0),
2260 indent_first_line: item
2261 .pointer("/sm/ps_ifl")
2262 .and_then(Value::as_f64)
2263 .unwrap_or(0.0),
2264 }
2265}
2266
/// Builds a lookup from 1-based UTF-16 code-unit positions to 1-based
/// character positions. Index 0 is always `0`; a character encoded as a
/// surrogate pair occupies two consecutive slots holding the same character
/// position.
fn build_utf16_position_map(text: &str) -> Vec<usize> {
    let mut map = Vec::with_capacity(text.encode_utf16().count() + 1);
    map.push(0);
    for (idx, ch) in text.chars().enumerate() {
        let char_pos = idx + 1;
        map.extend(std::iter::repeat(char_pos).take(ch.len_utf16()));
    }
    map
}
2281
/// Translates a UTF-16 position to a character position using `map`.
///
/// When the slot is missing or holds `0` (which includes position 0), the last
/// non-zero entry of the map is returned instead; `0` is returned only when
/// the map has no non-zero entry.
fn utf16_position_to_char_position(map: &[usize], position: usize) -> usize {
    match map.get(position) {
        Some(&mapped) if mapped > 0 => mapped,
        _ => map
            .iter()
            .rev()
            .copied()
            .find(|&candidate| candidate > 0)
            .unwrap_or(0),
    }
}
2289
2290#[must_use]
2292pub fn parse_model_chunks<S: BuildHasher>(
2293 chunks: &[Value],
2294 cid_urls: &HashMap<String, String, S>,
2295) -> CapturedDocument {
2296 parse_model_chunks_with_export_html(chunks, cid_urls, None)
2297}
2298
/// Parses captured editor-model chunks into a [`CapturedDocument`], optionally
/// refining list/quote semantics from the document's export HTML.
///
/// * `chunks` - raw model chunks; flattened with `collect_model_items`.
/// * `cid_urls` - maps an image content id (`i_cid`) to a downloadable URL.
/// * `export_html` - when `Some`, `apply_export_semantic_hints` is run on the
///   parsed blocks and the plain text is recomputed afterwards.
///
/// The document text is the concatenation of the `s` strings of all `is`/`iss`
/// items. It is then walked character by character; control characters drive
/// the table/paragraph state machine below.
#[must_use]
#[allow(clippy::too_many_lines)]
pub fn parse_model_chunks_with_export_html<S: BuildHasher>(
    chunks: &[Value],
    cid_urls: &HashMap<String, String, S>,
    export_html: Option<&str>,
) -> CapturedDocument {
    let items = collect_model_items(chunks);
    // Full document text: every inserted string (`is` / `iss`) in item order.
    let full_text = items
        .iter()
        .filter(|item| matches!(item.get("ty").and_then(Value::as_str), Some("is" | "iss")))
        .filter_map(|item| item.get("s").and_then(Value::as_str))
        .collect::<String>();
    let chars: Vec<char> = full_text.chars().collect();
    let utf16_position_map = build_utf16_position_map(&full_text);
    let style_maps = build_model_style_maps(&items, chars.len(), &utf16_position_map);

    // Element id -> character index, taken from `te` / `ste` items' `spi`.
    // NOTE(review): the `saturating_sub(1)` suggests the mapped position is
    // one-based — confirm against `build_utf16_position_map`.
    let mut positions = HashMap::new();
    for item in &items {
        if matches!(item.get("ty").and_then(Value::as_str), Some("te" | "ste")) {
            if let (Some(id), Some(pos)) = (
                item.get("id").and_then(Value::as_str),
                item.get("spi").and_then(Value::as_u64),
            ) {
                if let Ok(pos) = usize::try_from(pos) {
                    positions.insert(
                        id.to_string(),
                        utf16_position_to_char_position(&utf16_position_map, pos).saturating_sub(1),
                    );
                }
            }
        }
    }

    // Build image nodes from `ae` / `ase` items whose id has a known position.
    // `images_by_pos` is used for inline placement, `images` is the flat list.
    let mut images_by_pos: HashMap<usize, ContentNode> = HashMap::new();
    let mut images = Vec::new();
    for item in &items {
        let ty = item.get("ty").and_then(Value::as_str);
        if !matches!(ty, Some("ae" | "ase")) {
            continue;
        }
        let Some(id) = item.get("id").and_then(Value::as_str) else {
            continue;
        };
        let Some(pos) = positions.get(id).copied() else {
            continue;
        };
        let cid = item
            .pointer("/epm/ee_eo/i_cid")
            .and_then(Value::as_str)
            .map(ToString::to_string);
        let node = ContentNode::Image {
            // The URL stays `None` when the cid is absent or not in `cid_urls`.
            url: cid.as_ref().and_then(|cid| cid_urls.get(cid).cloned()),
            cid,
            // Fall back to a generic alt text when the model carries none.
            alt: item
                .pointer("/epm/ee_eo/eo_ad")
                .and_then(Value::as_str)
                .unwrap_or_else(|| {
                    if ty == Some("ase") {
                        "suggested image"
                    } else {
                        "image"
                    }
                })
                .to_string(),
            width: json_dimension_to_string(item.pointer("/epm/ee_eo/i_wth")),
            height: json_dimension_to_string(item.pointer("/epm/ee_eo/i_ht")),
            is_suggestion: ty == Some("ase"),
        };
        images_by_pos.insert(pos, node.clone());
        images.push(node);
    }

    // State for the character walk: the paragraph being built outside tables,
    // and the table / row / cell being built inside one.
    let mut blocks = Vec::new();
    let mut tables = Vec::new();
    let mut paragraph = Vec::new();
    let mut table: Option<TableBlock> = None;
    let mut row: Option<TableRow> = None;
    let mut cell: Option<TableCell> = None;
    // Last table-related control character seen; reset by ordinary content.
    let mut previous_table_control: Option<u32> = None;
    // Set when the newline directly after a cell boundary must be swallowed.
    let mut skip_next_table_newline = false;

    for (idx, ch) in chars.iter().copied().enumerate() {
        match ch as u32 {
            // 0x10: start of a table; the pending paragraph ends here.
            0x10 => {
                flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
                table = Some(TableBlock::default());
                previous_table_control = Some(0x10);
                skip_next_table_newline = false;
            }
            // 0x11: end of the table; emit it.
            0x11 => {
                flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
                previous_table_control = None;
                skip_next_table_newline = false;
            }
            // 0x12: start of a new row; close the previous one first.
            0x12 => {
                flush_row(&mut row, &mut cell, table.as_mut(), true);
                row = Some(TableRow::default());
                previous_table_control = Some(0x12);
                skip_next_table_newline = false;
            }
            // 0x1c: cell boundary.
            0x1c => {
                // A newline just opened a fresh, still-empty cell: do not
                // open another one for this boundary.
                if cell.as_ref().is_none_or(cell_is_empty) && previous_table_control == Some(0x0a) {
                    previous_table_control = Some(0x1c);
                    continue;
                }
                let had_content = cell.as_ref().is_some_and(|cell| !cell_is_empty(cell));
                flush_cell(&mut row, &mut cell, false);
                if row.is_none() {
                    row = Some(TableRow::default());
                }
                cell = Some(TableCell::default());
                // A newline right after a non-empty cell's boundary would
                // otherwise produce an extra empty cell.
                if had_content && chars.get(idx + 1).is_some_and(|ch| *ch as u32 == 0x0a) {
                    skip_next_table_newline = true;
                }
                previous_table_control = Some(0x1c);
            }
            // 0x0a: newline. Ends a cell inside a table, a paragraph outside.
            0x0a => {
                if table.is_some() {
                    if skip_next_table_newline {
                        skip_next_table_newline = false;
                        previous_table_control = Some(0x0a);
                        continue;
                    }
                    flush_cell(&mut row, &mut cell, false);
                    // Make sure the next content has a row and a cell to land in.
                    if row.is_none() {
                        row = Some(TableRow::default());
                    }
                    cell = Some(TableCell::default());
                    previous_table_control = Some(0x0a);
                } else {
                    flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
                }
            }
            // 0x0b: kept as a literal line break inside the current paragraph/cell.
            0x0b => {
                append_to_current(
                    &mut paragraph,
                    &mut row,
                    &mut cell,
                    table.is_some(),
                    "\n",
                    TextStyle::default(),
                );
                previous_table_control = None;
                skip_next_table_newline = false;
            }
            // Ordinary character, possibly with an image anchored at this index.
            _ => {
                if let Some(image) = images_by_pos.get(&idx).cloned() {
                    push_to_current(&mut paragraph, &mut row, &mut cell, table.is_some(), image);
                    previous_table_control = None;
                    skip_next_table_newline = false;
                    // A `*` at an image position is dropped; any other
                    // character is still appended as text below.
                    // NOTE(review): presumably `*` is the model's placeholder
                    // for the embedded object — confirm.
                    if ch == '*' {
                        continue;
                    }
                }
                append_to_current(
                    &mut paragraph,
                    &mut row,
                    &mut cell,
                    table.is_some(),
                    &ch.to_string(),
                    style_maps
                        .inline_styles
                        .get(idx)
                        .cloned()
                        .unwrap_or_default(),
                );
                previous_table_control = None;
                skip_next_table_newline = false;
            }
        }
    }

    // Emit whatever is still open when the text ends.
    if table.is_some() {
        flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
    }
    flush_paragraph(&mut paragraph, &mut blocks, Some(chars.len()), &style_maps);

    let mut capture = CapturedDocument {
        text: blocks_to_text(&blocks),
        blocks,
        tables,
        images,
    };
    if let Some(export_html) = export_html {
        // Hints may change list/quote flags; refresh the plain text afterwards.
        apply_export_semantic_hints(&mut capture.blocks, export_html);
        capture.text = blocks_to_text(&capture.blocks);
    }
    capture
}
2493
2494fn collect_model_items(chunks: &[Value]) -> Vec<Value> {
2495 let mut items = Vec::new();
2496 for chunk in chunks {
2497 if let Some(array) = chunk.as_array() {
2498 items.extend(array.iter().cloned());
2499 } else if let Some(array) = chunk.get("chunk").and_then(Value::as_array) {
2500 items.extend(array.iter().cloned());
2501 } else if chunk.get("ty").and_then(Value::as_str).is_some() {
2502 items.push(chunk.clone());
2503 }
2504 }
2505 items
2506}
2507
2508fn flush_paragraph(
2509 paragraph: &mut Vec<ContentNode>,
2510 blocks: &mut Vec<CapturedBlock>,
2511 end_pos: Option<usize>,
2512 style_maps: &ModelStyleMaps,
2513) {
2514 if !content_to_text(paragraph).trim().is_empty()
2515 || paragraph
2516 .iter()
2517 .any(|node| matches!(node, ContentNode::Image { .. }))
2518 {
2519 let meta =
2520 paragraph_meta_for_end_position(style_maps, end_pos, content_to_text(paragraph).trim());
2521 blocks.push(CapturedBlock::Paragraph {
2522 content: std::mem::take(paragraph),
2523 style: meta.style,
2524 list: meta.list,
2525 quote: meta.quote,
2526 horizontal_rule: meta.horizontal_rule,
2527 });
2528 } else {
2529 paragraph.clear();
2530 }
2531}
2532
2533fn paragraph_meta_for_end_position(
2534 style_maps: &ModelStyleMaps,
2535 end_pos: Option<usize>,
2536 text: &str,
2537) -> ParagraphMeta {
2538 let Some(end_pos) = end_pos else {
2539 return ParagraphMeta::default();
2540 };
2541 let paragraph_style = style_maps.paragraph_by_end.get(&end_pos);
2542 let mut meta = ParagraphMeta {
2543 style: paragraph_style.and_then(|style| style.style.clone()),
2544 ..ParagraphMeta::default()
2545 };
2546
2547 if let Some(list) = style_maps.list_by_end.get(&end_pos) {
2548 let mut list = list.clone();
2549 list.ordered = infer_ordered_list(&list, text);
2550 meta.list = Some(list);
2551 } else if paragraph_style.is_some_and(|style| {
2552 style.indent_start > 0.0
2553 && (style.indent_start - style.indent_first_line).abs() < f64::EPSILON
2554 }) {
2555 meta.quote = true;
2556 }
2557
2558 meta.horizontal_rule = (style_maps.horizontal_rules.contains(&end_pos)
2559 || end_pos
2560 .checked_sub(1)
2561 .is_some_and(|pos| style_maps.horizontal_rules.contains(&pos)))
2562 && text.trim().chars().all(|ch| ch == '-');
2563 meta
2564}
2565
2566const fn infer_ordered_list(_list: &ListMeta, _text: &str) -> bool {
2567 false
2568}
2569
2570fn apply_export_semantic_hints(blocks: &mut [CapturedBlock], export_html: &str) {
2571 let hints = extract_export_semantic_hints(export_html);
2572 let mut cursor = 0usize;
2573 for block in blocks {
2574 let CapturedBlock::Paragraph {
2575 content,
2576 list,
2577 quote,
2578 ..
2579 } = block
2580 else {
2581 continue;
2582 };
2583 let text = normalize_semantic_text(&content_to_text(content));
2584 if text.is_empty() {
2585 continue;
2586 }
2587 let Some((index, hint)) = find_next_semantic_hint(&hints, &text, cursor, list.is_some())
2588 else {
2589 continue;
2590 };
2591 cursor = index + 1;
2592 if let Some(list) = list.as_mut() {
2593 if let Some(ordered) = hint.list_ordered {
2594 list.ordered = ordered;
2595 }
2596 } else {
2597 *quote = hint.quote;
2598 }
2599 }
2600}
2601
2602fn find_next_semantic_hint<'a>(
2603 hints: &'a [ExportSemanticHint],
2604 text: &str,
2605 cursor: usize,
2606 needs_list_hint: bool,
2607) -> Option<(usize, &'a ExportSemanticHint)> {
2608 hints.iter().enumerate().skip(cursor).find(|(_, hint)| {
2609 hint.text == text
2610 && if needs_list_hint {
2611 hint.list_ordered.is_some()
2612 } else {
2613 hint.list_ordered.is_none()
2614 }
2615 })
2616}
2617
2618fn extract_export_semantic_hints(export_html: &str) -> Vec<ExportSemanticHint> {
2619 let preprocessed = preprocess_google_docs_export_html(export_html).html;
2620 let document = Html::parse_document(&preprocessed);
2621 let selector =
2622 Selector::parse("body h1,body h2,body h3,body h4,body h5,body h6,body p,body li")
2623 .expect("valid semantic hint selector");
2624 document
2625 .select(&selector)
2626 .filter_map(|element| {
2627 let tag = element.value().name();
2628 let text = export_element_semantic_text(&element);
2629 if text.is_empty() {
2630 return None;
2631 }
2632 let list_ordered = if tag == "li" {
2633 nearest_list_is_ordered(&element)
2634 } else {
2635 None
2636 };
2637 Some(ExportSemanticHint {
2638 text,
2639 list_ordered,
2640 quote: tag != "li" && has_ancestor_tag(&element, "blockquote"),
2641 })
2642 })
2643 .collect()
2644}
2645
2646fn export_element_semantic_text(element: &ElementRef<'_>) -> String {
2647 let raw_text = if element.value().name() == "li" {
2648 list_item_own_text(element)
2649 } else {
2650 element.text().collect()
2651 };
2652 normalize_semantic_text(&raw_text)
2653}
2654
2655fn list_item_own_text(element: &ElementRef<'_>) -> String {
2656 let mut text = String::new();
2657 let mut stack: Vec<_> = element.children().collect();
2658 stack.reverse();
2659
2660 while let Some(node) = stack.pop() {
2661 match node.value() {
2662 Node::Text(value) => text.push_str(value),
2663 Node::Element(child) if matches!(child.name(), "ol" | "ul") => {}
2664 Node::Element(_) => {
2665 let mut children: Vec<_> = node.children().collect();
2666 children.reverse();
2667 stack.extend(children);
2668 }
2669 _ => {}
2670 }
2671 }
2672
2673 text
2674}
2675
2676fn nearest_list_is_ordered(element: &ElementRef<'_>) -> Option<bool> {
2677 element
2678 .ancestors()
2679 .filter_map(ElementRef::wrap)
2680 .find_map(|ancestor| match ancestor.value().name() {
2681 "ol" => Some(true),
2682 "ul" => Some(false),
2683 _ => None,
2684 })
2685}
2686
2687fn has_ancestor_tag(element: &ElementRef<'_>, tag: &str) -> bool {
2688 element
2689 .ancestors()
2690 .filter_map(ElementRef::wrap)
2691 .any(|ancestor| ancestor.value().name() == tag)
2692}
2693
/// Normalizes text for comparison: non-breaking spaces become spaces, runs of
/// whitespace collapse to a single space, and both ends are trimmed.
fn normalize_semantic_text(text: &str) -> String {
    let spaced = text.replace('\u{a0}', " ");
    let mut normalized = String::with_capacity(spaced.len());
    for word in spaced.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
2700
2701fn cell_is_empty(cell: &TableCell) -> bool {
2702 cell.content.iter().all(|node| match node {
2703 ContentNode::Text { text, .. } => text.trim().is_empty(),
2704 ContentNode::Image { .. } => false,
2705 })
2706}
2707
2708fn row_is_empty(row: &TableRow) -> bool {
2709 row.cells.is_empty() || row.cells.iter().all(cell_is_empty)
2710}
2711
2712fn flush_cell(row: &mut Option<TableRow>, cell: &mut Option<TableCell>, drop_empty: bool) {
2713 if let (Some(row), Some(cell)) = (row.as_mut(), cell.take()) {
2714 if drop_empty && cell_is_empty(&cell) {
2715 return;
2716 }
2717 row.cells.push(cell);
2718 }
2719}
2720
2721fn flush_row(
2722 row: &mut Option<TableRow>,
2723 cell: &mut Option<TableCell>,
2724 table: Option<&mut TableBlock>,
2725 drop_empty_trailing_cell: bool,
2726) {
2727 flush_cell(row, cell, drop_empty_trailing_cell);
2728 if let (Some(table), Some(row)) = (table, row.take()) {
2729 table.rows.push(row);
2730 }
2731}
2732
2733fn flush_table(
2734 table: &mut Option<TableBlock>,
2735 row: &mut Option<TableRow>,
2736 cell: &mut Option<TableCell>,
2737 tables: &mut Vec<TableBlock>,
2738 blocks: &mut Vec<CapturedBlock>,
2739) {
2740 flush_row(row, cell, table.as_mut(), true);
2741 if let Some(mut table) = table.take() {
2742 while table.rows.last().is_some_and(row_is_empty) {
2745 table.rows.pop();
2746 }
2747 tables.push(table.clone());
2748 blocks.push(CapturedBlock::Table(table));
2749 }
2750}
2751
2752fn push_to_current(
2753 paragraph: &mut Vec<ContentNode>,
2754 row: &mut Option<TableRow>,
2755 cell: &mut Option<TableCell>,
2756 in_table: bool,
2757 node: ContentNode,
2758) {
2759 if in_table {
2760 if row.is_none() {
2761 *row = Some(TableRow::default());
2762 }
2763 if cell.is_none() {
2764 *cell = Some(TableCell::default());
2765 }
2766 if let Some(cell) = cell.as_mut() {
2767 cell.content.push(node);
2768 }
2769 } else {
2770 paragraph.push(node);
2771 }
2772}
2773
2774fn append_to_current(
2775 paragraph: &mut Vec<ContentNode>,
2776 row: &mut Option<TableRow>,
2777 cell: &mut Option<TableCell>,
2778 in_table: bool,
2779 text: &str,
2780 style: TextStyle,
2781) {
2782 if in_table {
2783 if row.is_none() {
2784 *row = Some(TableRow::default());
2785 }
2786 if cell.is_none() {
2787 *cell = Some(TableCell::default());
2788 }
2789 if let Some(cell) = cell.as_mut() {
2790 append_styled_text(&mut cell.content, text, style);
2791 }
2792 } else {
2793 append_styled_text(paragraph, text, style);
2794 }
2795}
2796
2797fn append_text(content: &mut Vec<ContentNode>, text: &str) {
2798 append_styled_text(content, text, TextStyle::default());
2799}
2800
2801fn append_styled_text(content: &mut Vec<ContentNode>, text: &str, style: TextStyle) {
2802 if text.is_empty() {
2803 return;
2804 }
2805 if let Some(ContentNode::Text {
2806 text: last,
2807 bold,
2808 italic,
2809 strike,
2810 link,
2811 }) = content.last_mut()
2812 {
2813 let last_style = TextStyle {
2814 bold: *bold,
2815 italic: *italic,
2816 strike: *strike,
2817 link: link.clone(),
2818 };
2819 if last_style == style {
2820 last.push_str(text);
2821 return;
2822 }
2823 }
2824 content.push(ContentNode::Text {
2825 text: text.to_string(),
2826 bold: style.bold,
2827 italic: style.italic,
2828 strike: style.strike,
2829 link: style.link,
2830 });
2831}
2832
2833#[must_use]
2835pub fn render_captured_document(capture: &CapturedDocument, format: &str) -> String {
2836 match format.to_lowercase().as_str() {
2837 "html" => render_blocks_html(&capture.blocks),
2838 "txt" | "text" => blocks_to_text(&capture.blocks),
2839 _ => render_blocks_markdown(&capture.blocks),
2840 }
2841}
2842
/// One block already rendered to Markdown, plus the metadata
/// `render_blocks_markdown` uses to pick the separator between neighbours.
struct RenderedBlock {
    /// Markdown text of the block.
    markdown: String,
    /// Id of the list the paragraph belongs to; `None` for non-list
    /// paragraphs and for tables.
    list_id: Option<String>,
    /// Whether the paragraph is a block quote (always `false` for tables).
    quote: bool,
}
2850
2851fn render_blocks_markdown(blocks: &[CapturedBlock]) -> String {
2852 let mut counters: HashMap<(String, usize), usize> = HashMap::new();
2857 let mut rendered: Vec<RenderedBlock> = Vec::new();
2858
2859 for block in blocks {
2860 match block {
2861 CapturedBlock::Paragraph {
2862 content,
2863 style,
2864 list,
2865 quote,
2866 horizontal_rule,
2867 } => {
2868 let text = render_content_markdown(content).trim().to_string();
2869 if text.is_empty() {
2870 continue;
2871 }
2872 let ordered_index = list.as_ref().and_then(|list_meta| {
2873 if !list_meta.ordered {
2874 return None;
2875 }
2876 let key = (list_meta.id.clone(), list_meta.level);
2880 counters.retain(|(id, level), _| {
2881 !(id == &list_meta.id && *level > list_meta.level)
2882 });
2883 let next = counters.entry(key).or_insert(0);
2884 *next += 1;
2885 Some(*next)
2886 });
2887 let markdown = render_paragraph_markdown(
2888 &text,
2889 style.as_deref(),
2890 list.as_ref(),
2891 *quote,
2892 *horizontal_rule,
2893 ordered_index,
2894 );
2895 rendered.push(RenderedBlock {
2896 markdown,
2897 list_id: list.as_ref().map(|l| l.id.clone()),
2898 quote: *quote,
2899 });
2900 }
2901 CapturedBlock::Table(table) => {
2902 rendered.push(RenderedBlock {
2903 markdown: render_table_markdown(table),
2904 list_id: None,
2905 quote: false,
2906 });
2907 }
2908 }
2909 }
2910
2911 let mut out = String::new();
2915 for (idx, block) in rendered.iter().enumerate() {
2916 if idx == 0 {
2917 out.push_str(&block.markdown);
2918 continue;
2919 }
2920 let prev = &rendered[idx - 1];
2921 if block.list_id.is_some() && prev.list_id.is_some() {
2922 out.push('\n');
2923 } else if block.quote && prev.quote {
2924 out.push_str("\n>\n");
2925 } else {
2926 out.push_str("\n\n");
2927 }
2928 out.push_str(&block.markdown);
2929 }
2930 if !out.is_empty() && !out.ends_with('\n') {
2931 out.push('\n');
2932 }
2933 out
2934}
2935
2936fn render_paragraph_markdown(
2937 text: &str,
2938 style: Option<&str>,
2939 list: Option<&ListMeta>,
2940 quote: bool,
2941 horizontal_rule: bool,
2942 ordered_index: Option<usize>,
2943) -> String {
2944 if horizontal_rule {
2945 return "---".to_string();
2946 }
2947 match style {
2948 Some("TITLE") => format!("# {text}"),
2949 Some("SUBTITLE") => format!("## {text}"),
2950 Some(style) if style.starts_with("HEADING_") => {
2951 let level = style
2952 .trim_start_matches("HEADING_")
2953 .parse::<usize>()
2954 .unwrap_or(1);
2955 format!("{} {text}", "#".repeat(level.clamp(1, 6)))
2956 }
2957 _ => list.map_or_else(
2958 || {
2959 if quote {
2960 text.lines()
2961 .map(|line| {
2962 if line.is_empty() {
2963 ">".to_string()
2964 } else {
2965 format!("> {line}")
2966 }
2967 })
2968 .collect::<Vec<_>>()
2969 .join("\n")
2970 } else {
2971 text.to_string()
2972 }
2973 },
2974 |list| {
2975 let indent = " ".repeat(list.level);
2976 let marker = if list.ordered {
2977 format!("{}.", ordered_index.unwrap_or(1))
2978 } else {
2979 "-".to_string()
2980 };
2981 format!("{indent}{marker} {text}")
2982 },
2983 ),
2984 }
2985}
2986
2987fn render_table_markdown(table: &TableBlock) -> String {
2988 if table.rows.is_empty() {
2989 return String::new();
2990 }
2991 let width = table
2992 .rows
2993 .iter()
2994 .map(|row| row.cells.len())
2995 .max()
2996 .unwrap_or(1);
2997 let rows = table
2998 .rows
2999 .iter()
3000 .map(|row| {
3001 (0..width)
3002 .map(|idx| {
3003 row.cells.get(idx).map_or_else(String::new, |cell| {
3004 escape_markdown_table_cell(&render_content_markdown(&cell.content))
3005 })
3006 })
3007 .collect::<Vec<_>>()
3008 })
3009 .collect::<Vec<_>>();
3010 let separator = vec!["---".to_string(); width];
3011 std::iter::once(&rows[0])
3012 .chain(std::iter::once(&separator))
3013 .chain(rows.iter().skip(1))
3014 .map(|row| format!("| {} |", row.join(" | ")))
3015 .collect::<Vec<_>>()
3016 .join("\n")
3017}
3018
3019fn render_content_markdown(content: &[ContentNode]) -> String {
3020 let mut rendered = String::new();
3021 let mut idx = 0usize;
3022 while idx < content.len() {
3023 match &content[idx] {
3024 ContentNode::Text {
3025 text,
3026 bold,
3027 italic,
3028 strike,
3029 link,
3030 } => {
3031 let link_target = link.as_deref();
3032 let mut runs = vec![(text.as_str(), *bold, *italic, *strike)];
3033 idx += 1;
3034 while let Some(ContentNode::Text {
3035 text,
3036 bold,
3037 italic,
3038 strike,
3039 link: next_link,
3040 }) = content.get(idx)
3041 {
3042 if next_link.as_deref() != link_target {
3043 break;
3044 }
3045 runs.push((text.as_str(), *bold, *italic, *strike));
3046 idx += 1;
3047 }
3048 let label = render_text_runs_markdown(&runs);
3049 if let Some(link_target) = link_target {
3050 let _ = write!(rendered, "[{label}]({link_target})");
3051 } else {
3052 rendered.push_str(&label);
3053 }
3054 }
3055 ContentNode::Image {
3056 url: Some(url),
3057 alt,
3058 ..
3059 } => {
3060 let _ = write!(rendered, "");
3061 idx += 1;
3062 }
3063 ContentNode::Image { .. } => idx += 1,
3064 }
3065 }
3066 rendered
3067}
3068
/// Which inline Markdown emphasis markers are currently open while rendering
/// text runs. The default value has no marker open.
#[derive(Clone, Copy, Default)]
struct MarkdownMarkerState {
    /// `**` is open.
    bold: bool,
    /// `*` is open.
    italic: bool,
    /// `~~` is open.
    strike: bool,
}
3075
3076fn render_text_runs_markdown(runs: &[(&str, bool, bool, bool)]) -> String {
3077 let inactive = MarkdownMarkerState::default();
3078 let mut active = inactive;
3079 let mut output = String::new();
3080 for (text, bold, italic, strike) in runs {
3081 let next = MarkdownMarkerState {
3082 bold: *bold,
3083 italic: *italic,
3084 strike: *strike,
3085 };
3086 let mut start = 0usize;
3087 for (offset, ch) in text.char_indices() {
3088 if ch != '\n' {
3089 continue;
3090 }
3091 if offset > start {
3092 output.push_str(&markdown_marker_transition(active, next));
3093 output.push_str(&text[start..offset]);
3094 active = next;
3095 }
3096 output.push_str(&markdown_marker_transition(active, inactive));
3097 output.push('\n');
3098 active = inactive;
3099 start = offset + ch.len_utf8();
3100 }
3101 if start < text.len() {
3102 output.push_str(&markdown_marker_transition(active, next));
3103 output.push_str(&text[start..]);
3104 active = next;
3105 }
3106 }
3107 output.push_str(&markdown_marker_transition(active, inactive));
3108 output
3109}
3110
3111fn markdown_marker_transition(active: MarkdownMarkerState, next: MarkdownMarkerState) -> String {
3112 let mut markers = String::new();
3113 if active.strike && !next.strike {
3114 markers.push_str("~~");
3115 }
3116 if active.italic && !next.italic {
3117 markers.push('*');
3118 }
3119 if active.bold && !next.bold {
3120 markers.push_str("**");
3121 }
3122 if !active.bold && next.bold {
3123 markers.push_str("**");
3124 }
3125 if !active.italic && next.italic {
3126 markers.push('*');
3127 }
3128 if !active.strike && next.strike {
3129 markers.push_str("~~");
3130 }
3131 markers
3132}
3133
3134fn render_blocks_html(blocks: &[CapturedBlock]) -> String {
3135 format!(
3136 "<!doctype html><html><body>{}</body></html>",
3137 blocks
3138 .iter()
3139 .map(|block| match block {
3140 CapturedBlock::Paragraph {
3141 content,
3142 style,
3143 list,
3144 quote,
3145 horizontal_rule,
3146 } => {
3147 if *horizontal_rule {
3148 "<hr>".to_string()
3149 } else if let Some(list) = list {
3150 let tag = if list.ordered { "ol" } else { "ul" };
3151 format!("<{tag}><li>{}</li></{tag}>", render_content_html(content))
3152 } else if *quote {
3153 format!("<blockquote>{}</blockquote>", render_content_html(content))
3154 } else {
3155 let tag = paragraph_tag(style.as_deref());
3156 format!("<{tag}>{}</{tag}>", render_content_html(content))
3157 }
3158 }
3159 CapturedBlock::Table(table) => render_table_html(table),
3160 })
3161 .collect::<String>()
3162 )
3163}
3164
3165fn render_table_html(table: &TableBlock) -> String {
3166 let mut html = String::from("<table>");
3167 for row in &table.rows {
3168 html.push_str("<tr>");
3169 for cell in &row.cells {
3170 html.push_str("<td>");
3171 html.push_str(&render_content_html(&cell.content));
3172 html.push_str("</td>");
3173 }
3174 html.push_str("</tr>");
3175 }
3176 html.push_str("</table>");
3177 html
3178}
3179
3180fn render_content_html(content: &[ContentNode]) -> String {
3181 content
3182 .iter()
3183 .map(|node| match node {
3184 ContentNode::Text {
3185 text,
3186 bold,
3187 italic,
3188 strike,
3189 link,
3190 } => render_marked_html(text, *bold, *italic, *strike, link.as_deref()),
3191 ContentNode::Image {
3192 url: Some(url),
3193 alt,
3194 width,
3195 height,
3196 ..
3197 } => render_image_html(url, alt, width.as_deref(), height.as_deref()),
3198 ContentNode::Image { .. } => String::new(),
3199 })
3200 .collect()
3201}
3202
3203fn render_image_html(url: &str, alt: &str, width: Option<&str>, height: Option<&str>) -> String {
3204 let mut html = format!(
3205 "<img src=\"{}\" alt=\"{}\"",
3206 escape_html(url),
3207 escape_html(alt)
3208 );
3209 if let Some(width) = width.filter(|value| !value.is_empty()) {
3210 let _ = write!(html, " width=\"{}\"", escape_html(width));
3211 }
3212 if let Some(height) = height.filter(|value| !value.is_empty()) {
3213 let _ = write!(html, " height=\"{}\"", escape_html(height));
3214 }
3215 html.push('>');
3216 html
3217}
3218
3219fn render_marked_html(
3220 text: &str,
3221 bold: bool,
3222 italic: bool,
3223 strike: bool,
3224 link: Option<&str>,
3225) -> String {
3226 text.split('\n')
3227 .map(|segment| render_marked_html_segment(segment, bold, italic, strike, link))
3228 .collect::<Vec<_>>()
3229 .join("<br>")
3230}
3231
3232fn render_marked_html_segment(
3233 text: &str,
3234 bold: bool,
3235 italic: bool,
3236 strike: bool,
3237 link: Option<&str>,
3238) -> String {
3239 if text.is_empty() {
3240 return String::new();
3241 }
3242 let mut output = escape_html(text);
3243 if bold {
3244 output = format!("<strong>{output}</strong>");
3245 }
3246 if italic {
3247 output = format!("<em>{output}</em>");
3248 }
3249 if strike {
3250 output = format!("<s>{output}</s>");
3251 }
3252 if let Some(link) = link {
3253 output = format!("<a href=\"{}\">{output}</a>", escape_html(link));
3254 }
3255 output
3256}
3257
/// Maps a named paragraph style to the HTML tag used to render it.
///
/// Titles and headings map to `h1`..`h6`; everything else, including no
/// style, maps to `p`.
fn paragraph_tag(style: Option<&str>) -> &'static str {
    const STYLE_TAGS: [(&str, &str); 8] = [
        ("TITLE", "h1"),
        ("HEADING_1", "h1"),
        ("SUBTITLE", "h2"),
        ("HEADING_2", "h2"),
        ("HEADING_3", "h3"),
        ("HEADING_4", "h4"),
        ("HEADING_5", "h5"),
        ("HEADING_6", "h6"),
    ];

    for (name, tag) in STYLE_TAGS {
        if style == Some(name) {
            return tag;
        }
    }
    "p"
}
3269
3270fn blocks_to_text(blocks: &[CapturedBlock]) -> String {
3271 blocks
3272 .iter()
3273 .map(|block| match block {
3274 CapturedBlock::Paragraph { content, .. } => content_to_text(content),
3275 CapturedBlock::Table(table) => table
3276 .rows
3277 .iter()
3278 .map(|row| {
3279 row.cells
3280 .iter()
3281 .map(|cell| content_to_text(&cell.content))
3282 .collect::<Vec<_>>()
3283 .join("\t")
3284 })
3285 .collect::<Vec<_>>()
3286 .join("\n"),
3287 })
3288 .filter(|text| !text.is_empty())
3289 .collect::<Vec<_>>()
3290 .join("\n")
3291}
3292
3293fn content_to_text(content: &[ContentNode]) -> String {
3294 content
3295 .iter()
3296 .map(|node| match node {
3297 ContentNode::Text { text, .. } => text.clone(),
3298 ContentNode::Image {
3299 url: Some(_), alt, ..
3300 } => format!("[{alt}]"),
3301 ContentNode::Image { .. } => String::new(),
3302 })
3303 .collect()
3304}
3305
/// Escapes the five HTML-significant characters (`&`, `<`, `>`, `"`, `'`) so
/// `value` can be embedded safely in element content or in a quoted attribute.
///
/// Done in a single pass, so an ampersand produced by one replacement is
/// never escaped a second time.
fn escape_html(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
3314
/// Makes cell content safe for a Markdown pipe table: `|` is backslash-escaped
/// and line breaks become `<br>`.
fn escape_markdown_table_cell(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '|' => escaped.push_str("\\|"),
            '\n' => escaped.push_str("<br>"),
            other => escaped.push(other),
        }
    }
    escaped
}
3318
/// Extracts the token from an `Authorization: Bearer <token>` header value.
///
/// Surrounding whitespace is ignored and the scheme is matched
/// case-insensitively (HTTP authentication scheme names are
/// case-insensitive), so `Bearer`, `bearer` and `BEARER` are all accepted.
/// Returns `None` when the scheme is not `Bearer` or the token is empty.
#[must_use]
pub fn extract_bearer_token(auth_header: &str) -> Option<&str> {
    const SCHEME_PREFIX: &str = "bearer ";

    let trimmed = auth_header.trim();
    // `get` returns `None` for short input and for a cut inside a multi-byte
    // character, so the slice below cannot panic.
    let scheme = trimmed.get(..SCHEME_PREFIX.len())?;
    if !scheme.eq_ignore_ascii_case(SCHEME_PREFIX) {
        return None;
    }
    let token = trimmed[SCHEME_PREFIX.len()..].trim();
    (!token.is_empty()).then_some(token)
}
3331
/// An image extracted from a document, ready to be stored in an archive.
#[derive(Debug, Clone)]
pub struct ExtractedImage {
    /// File name of the image (e.g. `image-01.png`); `create_archive_zip`
    /// stores it under `images/`.
    pub filename: String,
    /// Raw image bytes.
    pub data: Vec<u8>,
    /// MIME type of the image, e.g. `image/png`.
    pub mime_type: String,
}
3342
/// A Google Docs document prepared for archiving: HTML and Markdown with image
/// references rewritten to local `images/...` paths, plus the image files.
#[derive(Debug, Clone)]
pub struct GDocsArchiveResult {
    /// HTML rendition with localized image references.
    pub html: String,
    /// Markdown rendition with localized image references.
    pub markdown: String,
    /// Images referenced by the renditions.
    pub images: Vec<ExtractedImage>,
    /// Identifier of the source document.
    pub document_id: String,
    /// Export URL reported by the fetch/render step that produced the content.
    pub export_url: String,
}
3357
3358pub async fn localize_rendered_remote_images_for_archive(
3370 rendered: &GDocsRenderedResult,
3371) -> crate::Result<GDocsArchiveResult> {
3372 let client = reqwest::Client::builder().build().map_err(|error| {
3373 WebCaptureError::FetchError(format!("Failed to create image download client: {error}"))
3374 })?;
3375 let mut seen = HashMap::new();
3376 let mut images = Vec::new();
3377 let mut next_index = 1usize;
3378
3379 for image in &rendered.remote_images {
3380 if seen.contains_key(&image.url) {
3381 continue;
3382 }
3383 let filename = remote_image_filename(&image.url, next_index);
3384 next_index += 1;
3385 seen.insert(image.url.clone(), filename.clone());
3386
3387 match client
3388 .get(&image.url)
3389 .header("User-Agent", GDOCS_USER_AGENT)
3390 .header("Accept", "image/*,*/*;q=0.8")
3391 .send()
3392 .await
3393 {
3394 Ok(response) if response.status().is_success() => {
3395 let mime_type = response
3396 .headers()
3397 .get(reqwest::header::CONTENT_TYPE)
3398 .and_then(|value| value.to_str().ok())
3399 .map_or_else(|| mime_type_for_filename(&filename), ToString::to_string);
3400 let data = response.bytes().await.map_err(|error| {
3401 WebCaptureError::FetchError(format!(
3402 "Failed to read Google Docs image {}: {error}",
3403 image.url
3404 ))
3405 })?;
3406 debug!(
3407 url = %image.url,
3408 filename = %filename,
3409 bytes = data.len(),
3410 mime_type = %mime_type,
3411 "downloaded Google Docs browser-model archive image"
3412 );
3413 images.push(ExtractedImage {
3414 filename,
3415 data: data.to_vec(),
3416 mime_type,
3417 });
3418 }
3419 Ok(response) => {
3420 warn!(
3421 url = %image.url,
3422 status = response.status().as_u16(),
3423 "failed to download Google Docs browser-model archive image"
3424 );
3425 }
3426 Err(error) => {
3427 warn!(
3428 url = %image.url,
3429 error = %error,
3430 "failed to download Google Docs browser-model archive image"
3431 );
3432 }
3433 }
3434 }
3435
3436 let mut markdown = rendered.markdown.clone();
3437 let mut html = rendered.html.clone();
3438 for (url, filename) in seen {
3439 let local_path = format!("images/{filename}");
3440 markdown = markdown.replace(&url, &local_path);
3441 html = html.replace(&url, &local_path);
3442 }
3443
3444 Ok(GDocsArchiveResult {
3445 html,
3446 markdown,
3447 images,
3448 document_id: rendered.document_id.clone(),
3449 export_url: rendered.export_url.clone(),
3450 })
3451}
3452
3453fn remote_image_filename(url: &str, index: usize) -> String {
3454 let ext = crate::localize_images::get_extension_from_url(url);
3455 format!("image-{index:02}{ext}")
3456}
3457
/// Guesses an image MIME type from the text after the last `.` in `filename`
/// (case-insensitive). Unknown or missing extensions default to `image/png`.
///
/// A name without any dot is treated as its own extension, so `"jpg"` maps to
/// `image/jpeg`.
fn mime_type_for_filename(filename: &str) -> String {
    let extension = filename.rsplit('.').next().unwrap_or("png").to_lowercase();
    let mime = if matches!(extension.as_str(), "jpg" | "jpeg") {
        "image/jpeg"
    } else if extension == "gif" {
        "image/gif"
    } else if extension == "webp" {
        "image/webp"
    } else if extension == "svg" {
        "image/svg+xml"
    } else {
        "image/png"
    };
    String::from(mime)
}
3474
3475fn base64_image_pattern() -> &'static Regex {
3476 static PATTERN: OnceLock<Regex> = OnceLock::new();
3477 PATTERN.get_or_init(|| {
3478 Regex::new(
3479 r#"(<img\s[^>]*src=")data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([^"]+)(")"#,
3480 )
3481 .unwrap()
3482 })
3483}
3484
3485#[must_use]
3498pub fn extract_base64_images(html: &str) -> (String, Vec<ExtractedImage>) {
3499 let mut images = Vec::new();
3500 let mut idx = 1u32;
3501
3502 let updated_html = base64_image_pattern()
3503 .replace_all(html, |caps: ®ex::Captures<'_>| {
3504 let prefix = &caps[1];
3505 let mime_ext = &caps[2];
3506 let base64_data = &caps[3];
3507 let suffix = &caps[4];
3508
3509 let ext = match mime_ext {
3510 "jpeg" => "jpg",
3511 "svg+xml" => "svg",
3512 other => other,
3513 };
3514
3515 let filename = format!("image-{idx:02}.{ext}");
3516 let mime_type = format!("image/{mime_ext}");
3517
3518 if let Ok(data) = base64::engine::general_purpose::STANDARD.decode(base64_data) {
3519 debug!("Extracted image: {} ({} bytes)", filename, data.len());
3520 images.push(ExtractedImage {
3521 filename: filename.clone(),
3522 data,
3523 mime_type,
3524 });
3525 }
3526
3527 idx += 1;
3528 format!("{prefix}images/{filename}{suffix}")
3529 })
3530 .into_owned();
3531
3532 (updated_html, images)
3533}
3534
3535pub async fn fetch_google_doc_as_archive(
3554 url: &str,
3555 api_token: Option<&str>,
3556) -> crate::Result<GDocsArchiveResult> {
3557 let result = fetch_google_doc(url, "html", api_token).await?;
3558
3559 let preprocess = preprocess_google_docs_export_html(&result.content);
3560 debug!(
3561 document_id = %result.document_id,
3562 hoisted = preprocess.hoisted,
3563 unwrapped_links = preprocess.unwrapped_links,
3564 "google-docs-export pre-processor rewrote archive markup"
3565 );
3566
3567 let (local_html, images) = extract_base64_images(&preprocess.html);
3568
3569 let markdown = normalize_google_docs_export_markdown(
3570 &crate::markdown::convert_html_to_markdown(&local_html, None)?,
3571 );
3572
3573 debug!(
3574 "Archive prepared: {} images extracted, {} bytes HTML, {} bytes Markdown",
3575 images.len(),
3576 local_html.len(),
3577 markdown.len()
3578 );
3579
3580 Ok(GDocsArchiveResult {
3581 html: local_html,
3582 markdown,
3583 images,
3584 document_id: result.document_id,
3585 export_url: result.export_url,
3586 })
3587}
3588
3589pub fn create_archive_zip(
3600 archive: &GDocsArchiveResult,
3601 pretty_html: bool,
3602) -> crate::Result<Vec<u8>> {
3603 let mut buf = std::io::Cursor::new(Vec::new());
3604
3605 {
3606 let mut zip = zip::ZipWriter::new(&mut buf);
3607 let options = zip::write::SimpleFileOptions::default()
3608 .compression_method(zip::CompressionMethod::Deflated);
3609
3610 zip.start_file("document.md", options)
3611 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3612 zip.write_all(archive.markdown.as_bytes())?;
3613
3614 let html_output = if pretty_html {
3615 crate::html::pretty_print_html(&archive.html)
3616 } else {
3617 archive.html.clone()
3618 };
3619 zip.start_file("document.html", options)
3620 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3621 zip.write_all(html_output.as_bytes())?;
3622
3623 for img in &archive.images {
3624 zip.start_file(format!("images/{}", img.filename), options)
3625 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3626 zip.write_all(&img.data)?;
3627 }
3628
3629 zip.finish()
3630 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3631 }
3632
3633 Ok(buf.into_inner())
3634}
3635
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Two models holding one chunk each but different text must report the
    /// same `chunks` count and different `payload_bytes`.
    #[test]
    fn browser_model_fingerprint_includes_payload_size() {
        // Build a single-chunk model whose only inserted string is `text`.
        let model_with_text = |text: &str| {
            browser_model_data_from_value(&json!({
                "chunks": [{ "chunk": [{ "ty": "is", "s": text }] }],
                "cidUrlMap": {}
            }))
        };

        let small = model_with_text("first");
        let larger = model_with_text("first and later text");
        let (small_fp, larger_fp) = (small.fingerprint(), larger.fingerprint());

        assert_eq!(small_fp.chunks, larger_fp.chunks);
        assert_ne!(small_fp.payload_bytes, larger_fp.payload_bytes);
    }

    /// Feeds a fixed timeline of fingerprints into `BrowserModelQuiescence`
    /// and checks the value returned by `observe` at every step: `None` for
    /// the first four observations and 1550ms for the last one.
    #[test]
    fn browser_model_quiescence_resets_when_chunks_change() {
        let start = Instant::now();
        let stability_window = Duration::from_millis(1500);
        let one_chunk = BrowserModelFingerprint {
            chunks: 1,
            payload_bytes: 100,
        };
        let two_chunks = BrowserModelFingerprint {
            chunks: 2,
            payload_bytes: 200,
        };

        // (observed fingerprint, milliseconds after `start`, expected result)
        let timeline = [
            (one_chunk, 0, None),
            (one_chunk, 250, None),
            (two_chunks, 500, None),
            (two_chunks, 750, None),
            (two_chunks, 2300, Some(Duration::from_millis(1550))),
        ];

        let mut quiescence = BrowserModelQuiescence::default();
        for (fingerprint, offset_ms, expected) in timeline {
            let observed_at = start + Duration::from_millis(offset_ms);
            assert_eq!(
                quiescence.observe(fingerprint, observed_at, stability_window),
                expected
            );
        }
    }
}