1use async_tungstenite::tokio::{connect_async, ConnectStream};
32use async_tungstenite::tungstenite::Message;
33use async_tungstenite::WebSocketStream;
34use base64::Engine;
35use futures::{SinkExt, StreamExt};
36use regex::Regex;
37use scraper::{node::Node, ElementRef, Html, Selector};
38use serde_json::Value;
39use std::collections::HashMap;
40use std::fmt::Write as _;
41use std::hash::BuildHasher;
42use std::io::Write;
43use std::process::Stdio;
44use std::sync::OnceLock;
45use std::time::{Duration, Instant};
46use tokio::io::{AsyncBufReadExt, BufReader};
47use tokio::process::{Child, Command};
48use tracing::{debug, info, warn};
49
50use crate::WebCaptureError;
51
/// Base URL for per-document endpoints; `build_export_url` and `build_edit_url`
/// append `/{document_id}/export?...` and `/{document_id}/edit` to it.
const GDOCS_EXPORT_BASE: &str = "https://docs.google.com/document/d";
/// Base URL that `build_docs_api_url` appends `/{document_id}` to.
const GDOCS_API_BASE: &str = "https://docs.googleapis.com/v1/documents";
/// Desktop Chrome user-agent string.
/// NOTE(review): presumably sent with Google Docs requests; its use is outside this section.
const GDOCS_USER_AGENT: &str =
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
// The four durations below are timing settings whose consumers are not visible
// in this section. NOTE(review): names suggest they bound the browser-model
// capture loop (max wait, required stability window, poll interval) and the
// browser launch; confirm against the code that reads them.
const GDOCS_EDITOR_MODEL_MAX_WAIT_DEFAULT: Duration = Duration::from_secs(30);
const GDOCS_EDITOR_MODEL_STABILITY_DEFAULT: Duration = Duration::from_millis(1500);
const GDOCS_EDITOR_MODEL_POLL_INTERVAL: Duration = Duration::from_millis(250);
const GDOCS_BROWSER_LAUNCH_TIMEOUT: Duration = Duration::from_secs(20);

/// WebSocket stream type produced by `async_tungstenite`'s tokio connector.
/// NOTE(review): the alias name suggests it carries a Chrome DevTools Protocol session.
type CdpWebSocket = WebSocketStream<ConnectStream>;
62
// JavaScript source that records every value assigned to `window.DOCS_modelChunk`.
//
// The script:
// - initialises `window.__captured_chunks` as an empty array;
// - defines `captureChunk`, which ignores falsy values, recurses into arrays, and
//   stores a JSON round-tripped copy of each value (or the value itself when the
//   round trip throws);
// - defines `wrapChunkArray`, which replaces an array's `push` so later pushes are
//   captured too, and marks the array so it is only wrapped once;
// - installs a non-configurable accessor on `window.DOCS_modelChunk` whose setter
//   captures and wraps the assigned value and whose getter returns the latest one.
//
// NOTE(review): presumably injected before page scripts run; the injection site
// is outside this section.
const GDOCS_MODEL_CAPTURE_INIT_SCRIPT: &str = r"
window.__captured_chunks = [];
const captureChunk = (value) => {
 if (!value) {
 return;
 }
 if (Array.isArray(value)) {
 for (const item of value) {
 captureChunk(item);
 }
 return;
 }
 try {
 window.__captured_chunks.push(JSON.parse(JSON.stringify(value)));
 } catch {
 window.__captured_chunks.push(value);
 }
};
const wrapChunkArray = (value) => {
 if (!Array.isArray(value) || value.__webCaptureDocsModelWrapped) {
 return value;
 }
 const originalPush = value.push;
 Object.defineProperty(value, '__webCaptureDocsModelWrapped', {
 value: true,
 enumerable: false,
 });
 Object.defineProperty(value, 'push', {
 value(...items) {
 for (const item of items) {
 captureChunk(item);
 }
 return originalPush.apply(this, items);
 },
 writable: true,
 configurable: true,
 });
 for (const item of value) {
 captureChunk(item);
 }
 return value;
};
Object.defineProperty(window, 'DOCS_modelChunk', {
 set(value) {
 captureChunk(value);
 window.__DOCS_modelChunk_latest = wrapChunkArray(value);
 },
 get() {
 return window.__DOCS_modelChunk_latest;
 },
 configurable: false,
});
";
116
// JavaScript function source that returns `{ chunks, cidUrlMap }` from the page.
//
// - `chunks` is a copy of `window.__captured_chunks`; when that is empty and
//   `window.DOCS_modelChunk` is set, the latter is used as the only entry.
// - `cidUrlMap` is built by scanning every `<script>` whose text mentions
//   `docs-images-rt` for `"<id>": "https://docs.google.com/docs-images-rt/..."`
//   pairs (ids of 20+ URL-safe characters), unescaping `\u003d`, `\u0026` and
//   `\/` in the URL.
const GDOCS_MODEL_EXTRACT_SCRIPT: &str = r#"() => {
 const chunks = [...(window.__captured_chunks || [])];
 if (
 window.DOCS_modelChunk &&
 chunks.length === 0 &&
 !chunks.includes(window.DOCS_modelChunk)
 ) {
 chunks.push(window.DOCS_modelChunk);
 }
 const cidUrlMap = {};
 const scripts = document.querySelectorAll('script');
 for (const script of scripts) {
 const text = script.textContent || '';
 if (!text.includes('docs-images-rt')) {
 continue;
 }
 const regex =
 /"([A-Za-z0-9_-]{20,})"\s*:\s*"(https:\/\/docs\.google\.com\/docs-images-rt\/[^"]+)"/g;
 let match;
 while ((match = regex.exec(text)) !== null) {
 cidUrlMap[match[1]] = match[2]
 .replace(/\\u003d/g, '=')
 .replace(/\\u0026/g, '&')
 .replace(/\\\//g, '/');
 }
 }
 return { chunks, cidUrlMap };
}"#;
145
146fn gdocs_url_pattern() -> &'static Regex {
147 static PATTERN: OnceLock<Regex> = OnceLock::new();
148 PATTERN.get_or_init(|| Regex::new(r"docs\.google\.com/document/d/([a-zA-Z0-9_-]+)").unwrap())
149}
150
/// Outcome of fetching a Google Doc through the export endpoint.
#[derive(Debug, Clone)]
pub struct GDocsResult {
    /// Response body (entity-decoded for `txt`/`md`), or the converted Markdown.
    pub content: String,
    /// Format label: the format requested by the caller, or `"markdown"` after conversion.
    pub format: String,
    /// Document id extracted from the input URL.
    pub document_id: String,
    /// Export URL that was requested.
    pub export_url: String,
}
163
/// Capture strategy chosen by `select_capture_method`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GDocsCaptureMethod {
    /// Selected for the `"browser"` capture mode.
    BrowserModel,
    /// Selected for the `"api"` capture mode when no API token is supplied.
    PublicExport,
    /// Selected for the `"api"` capture mode when an API token is supplied.
    DocsApi,
}
174
/// A Google Doc rendered into several output representations.
///
/// NOTE(review): this type is not constructed in the visible section; field
/// descriptions below are inferred from names only and should be confirmed.
#[derive(Debug, Clone)]
pub struct GDocsRenderedResult {
    /// Presumably the Markdown rendering.
    pub markdown: String,
    /// Presumably the HTML rendering.
    pub html: String,
    /// Presumably the plain-text rendering.
    pub text: String,
    /// Presumably the Google Docs document id.
    pub document_id: String,
    /// Presumably the export URL associated with the document.
    pub export_url: String,
    /// Presumably the remote images referenced by the rendered document.
    pub remote_images: Vec<RemoteImage>,
}
191
/// An image reference made of a URL and its alternative text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteImage {
    /// Image URL.
    pub url: String,
    /// Alternative text for the image.
    pub alt: String,
}
200
/// Snapshot of model data read from the browser.
///
/// NOTE(review): the producer of this struct is outside the visible section;
/// only `chunks` and `chunk_payload_bytes` are read here (by `fingerprint`).
#[derive(Debug, Clone)]
struct BrowserModelData {
    /// Captured model chunks as JSON values.
    chunks: Vec<Value>,
    /// String-to-string map; presumably image content id -> image URL (confirm).
    cid_urls: HashMap<String, String>,
    /// Byte-size figure for the chunk payload; part of the fingerprint.
    chunk_payload_bytes: usize,
    /// Presumably the number of polls performed before this snapshot (confirm).
    poll_count: usize,
    /// Presumably how long the snapshot had been unchanged (confirm).
    stable_for: Duration,
}
209
/// Cheap summary of a `BrowserModelData` snapshot, compared between observations
/// by `BrowserModelQuiescence::observe` to detect change.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct BrowserModelFingerprint {
    /// Number of captured chunks.
    chunks: usize,
    /// The snapshot's `chunk_payload_bytes` value.
    payload_bytes: usize,
}
215
/// Tracks how long successive fingerprints have stayed identical.
#[derive(Debug, Default)]
struct BrowserModelQuiescence {
    /// Fingerprint seen by the most recent `observe` call that changed state.
    last_fingerprint: Option<BrowserModelFingerprint>,
    /// Instant of the first repeated observation of `last_fingerprint`;
    /// `None` while the fingerprint is new, changing, or empty.
    stable_since: Option<Instant>,
}
221
222impl BrowserModelData {
223 const fn fingerprint(&self) -> BrowserModelFingerprint {
224 BrowserModelFingerprint {
225 chunks: self.chunks.len(),
226 payload_bytes: self.chunk_payload_bytes,
227 }
228 }
229}
230
231impl BrowserModelQuiescence {
232 fn observe(
233 &mut self,
234 fingerprint: BrowserModelFingerprint,
235 now: Instant,
236 stability_window: Duration,
237 ) -> Option<Duration> {
238 if fingerprint.chunks == 0 {
239 self.last_fingerprint = Some(fingerprint);
240 self.stable_since = None;
241 return None;
242 }
243
244 if self.last_fingerprint == Some(fingerprint) {
245 let stable_since = *self.stable_since.get_or_insert(now);
246 let stable_for = now.saturating_duration_since(stable_since);
247 if stable_for >= stability_window {
248 return Some(stable_for);
249 }
250 } else {
251 self.last_fingerprint = Some(fingerprint);
252 self.stable_since = None;
253 }
254
255 None
256 }
257
258 fn stable_for(&self, now: Instant) -> Duration {
259 self.stable_since.map_or(Duration::ZERO, |stable_since| {
260 now.saturating_duration_since(stable_since)
261 })
262 }
263}
264
/// Structured representation of a captured document.
///
/// NOTE(review): not populated in the visible section; field meanings are
/// inferred from names and types only.
#[derive(Debug, Clone, Default)]
pub struct CapturedDocument {
    /// Block-level content (paragraphs and tables).
    pub blocks: Vec<CapturedBlock>,
    /// Table blocks; presumably the tables that also appear in `blocks` (confirm).
    pub tables: Vec<TableBlock>,
    /// Content nodes; presumably the `ContentNode::Image` entries of the document (confirm).
    pub images: Vec<ContentNode>,
    /// Presumably the document's plain text (confirm).
    pub text: String,
}
277
/// A block-level element of a `CapturedDocument`.
#[derive(Debug, Clone)]
pub enum CapturedBlock {
    /// A paragraph with inline content and optional block-level metadata.
    Paragraph {
        /// Inline nodes making up the paragraph.
        content: Vec<ContentNode>,
        /// Optional style name; presumably a named paragraph style such as a heading (confirm).
        style: Option<String>,
        /// List membership, when the paragraph is a list item.
        list: Option<ListMeta>,
        /// Whether the paragraph is flagged as a quote.
        quote: bool,
        /// Whether the paragraph is flagged as a horizontal rule.
        horizontal_rule: bool,
    },
    /// A table.
    Table(TableBlock),
}
297
/// A table as an ordered list of rows.
#[derive(Debug, Clone, Default)]
pub struct TableBlock {
    /// Rows of the table.
    pub rows: Vec<TableRow>,
}

/// A table row as an ordered list of cells.
#[derive(Debug, Clone, Default)]
pub struct TableRow {
    /// Cells of the row.
    pub cells: Vec<TableCell>,
}

/// A table cell holding inline content.
#[derive(Debug, Clone, Default)]
pub struct TableCell {
    /// Inline nodes inside the cell.
    pub content: Vec<ContentNode>,
}
318
/// An inline element: either a styled text run or an image.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentNode {
    /// A run of text with inline formatting flags.
    Text {
        /// The text of the run.
        text: String,
        /// Bold flag.
        bold: bool,
        /// Italic flag.
        italic: bool,
        /// Strikethrough flag.
        strike: bool,
        /// Link target, when the run is a hyperlink.
        link: Option<String>,
    },
    /// An inline image.
    Image {
        /// Optional content id; presumably a key into a cid -> URL map (confirm).
        cid: Option<String>,
        /// Optional image URL.
        url: Option<String>,
        /// Alternative text.
        alt: String,
        /// Optional width, kept as a string.
        width: Option<String>,
        /// Optional height, kept as a string.
        height: Option<String>,
        /// Presumably marks images that belong to a suggested edit (confirm).
        is_suggestion: bool,
    },
}
351
/// Inline formatting flags plus an optional link target; mirrors the style
/// fields of `ContentNode::Text`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct TextStyle {
    bold: bool,
    italic: bool,
    strike: bool,
    link: Option<String>,
}
359
/// Block-level paragraph metadata; mirrors the non-content fields of
/// `CapturedBlock::Paragraph`.
#[derive(Debug, Clone, Default)]
struct ParagraphMeta {
    style: Option<String>,
    list: Option<ListMeta>,
    quote: bool,
    horizontal_rule: bool,
}
367
/// List membership information for a paragraph.
#[derive(Debug, Clone)]
pub struct ListMeta {
    /// Identifier of the list the item belongs to.
    pub id: String,
    /// Nesting level; presumably zero-based (confirm).
    pub level: usize,
    /// `true` for an ordered (numbered) list.
    pub ordered: bool,
}
377
/// Paragraph style name and indentation values.
/// NOTE(review): units of the indent fields are not visible here; confirm.
#[derive(Debug, Clone)]
struct ParagraphStyle {
    style: Option<String>,
    indent_start: f64,
    indent_first_line: f64,
}
384
/// A text fragment paired with list/quote hints.
/// NOTE(review): presumably derived from the public export to enrich another
/// capture path; the producer and consumer are outside this section.
#[derive(Debug, Clone)]
struct ExportSemanticHint {
    text: String,
    /// `Some(true)`/`Some(false)` for ordered/unordered list items, `None` otherwise (assumed).
    list_ordered: Option<bool>,
    quote: bool,
}
391
/// Style lookup tables keyed by position.
/// NOTE(review): the `*_by_end` names suggest the keys are end offsets of
/// paragraphs in the model text; confirm against the code that fills them.
#[derive(Debug, Clone, Default)]
struct ModelStyleMaps {
    /// Inline styles; presumably one entry per character position (confirm).
    inline_styles: Vec<TextStyle>,
    paragraph_by_end: HashMap<usize, ParagraphStyle>,
    list_by_end: HashMap<usize, ListMeta>,
    /// Positions flagged as horizontal rules.
    horizontal_rules: std::collections::HashSet<usize>,
}
399
400#[must_use]
402pub fn is_google_docs_url(url: &str) -> bool {
403 gdocs_url_pattern().is_match(url)
404}
405
406#[must_use]
410pub fn extract_document_id(url: &str) -> Option<String> {
411 gdocs_url_pattern()
412 .captures(url)
413 .and_then(|caps| caps.get(1))
414 .map(|m| m.as_str().to_string())
415}
416
417#[must_use]
424pub fn build_export_url(document_id: &str, format: &str) -> String {
425 let export_format = match format {
426 "html" | "txt" | "md" | "pdf" | "docx" | "epub" | "zip" => format,
427 _ => "html",
428 };
429 format!("{GDOCS_EXPORT_BASE}/{document_id}/export?format={export_format}")
430}
431
432#[must_use]
434pub fn build_edit_url(document_id: &str) -> String {
435 format!("{GDOCS_EXPORT_BASE}/{document_id}/edit")
436}
437
438#[must_use]
440pub fn build_docs_api_url(document_id: &str) -> String {
441 format!("{GDOCS_API_BASE}/{document_id}")
442}
443
444pub fn select_capture_method(
450 capture: &str,
451 api_token: Option<&str>,
452) -> crate::Result<GDocsCaptureMethod> {
453 match capture.to_lowercase().as_str() {
454 "browser" => Ok(GDocsCaptureMethod::BrowserModel),
455 "api" if api_token.is_some() => Ok(GDocsCaptureMethod::DocsApi),
456 "api" => Ok(GDocsCaptureMethod::PublicExport),
457 other => Err(WebCaptureError::InvalidUrl(format!(
458 "Unsupported Google Docs capture method \"{other}\". Use \"browser\" or \"api\"."
459 ))),
460 }
461}
462
463pub async fn fetch_google_doc(
478 url: &str,
479 format: &str,
480 api_token: Option<&str>,
481) -> crate::Result<GDocsResult> {
482 let document_id = extract_document_id(url).ok_or_else(|| {
483 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
484 })?;
485
486 let export_url = build_export_url(&document_id, format);
487 debug!(
488 document_id = %document_id,
489 format = %format,
490 export_url = %export_url,
491 has_api_token = api_token.is_some(),
492 "fetching Google Doc via public export"
493 );
494
495 let mut request = reqwest::Client::new()
496 .get(&export_url)
497 .header(
498 "User-Agent",
499 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
500 )
501 .header("Accept-Charset", "utf-8")
502 .header("Accept-Language", "en-US,en;q=0.9");
503
504 if let Some(token) = api_token {
505 request = request.header("Authorization", format!("Bearer {token}"));
506 }
507
508 let response = request
509 .send()
510 .await
511 .map_err(|e| WebCaptureError::FetchError(format!("Failed to fetch Google Doc: {e}")))?;
512 debug!(
513 document_id = %document_id,
514 status = response.status().as_u16(),
515 success = response.status().is_success(),
516 content_type = response
517 .headers()
518 .get(reqwest::header::CONTENT_TYPE)
519 .and_then(|value| value.to_str().ok())
520 .unwrap_or(""),
521 "received Google Docs public export response"
522 );
523
524 if !response.status().is_success() {
525 return Err(WebCaptureError::FetchError(format!(
526 "Failed to fetch Google Doc ({} {}): {}",
527 response.status().as_u16(),
528 response.status().canonical_reason().unwrap_or("Unknown"),
529 export_url
530 )));
531 }
532
533 let raw_content = response.text().await.map_err(|e| {
534 WebCaptureError::FetchError(format!("Failed to read Google Doc response: {e}"))
535 })?;
536 debug!(
537 document_id = %document_id,
538 bytes = raw_content.len(),
539 "read Google Docs public export body"
540 );
541
542 let content = match format {
545 "txt" | "md" => crate::html::decode_html_entities(&raw_content),
546 _ => raw_content,
547 };
548
549 Ok(GDocsResult {
550 content,
551 format: format.to_string(),
552 document_id,
553 export_url,
554 })
555}
556
557pub async fn fetch_google_doc_as_markdown(
571 url: &str,
572 api_token: Option<&str>,
573) -> crate::Result<GDocsResult> {
574 let result = fetch_google_doc(url, "html", api_token).await?;
575
576 let preprocess = preprocess_google_docs_export_html(&result.content);
577 debug!(
578 document_id = %result.document_id,
579 hoisted = preprocess.hoisted,
580 unwrapped_links = preprocess.unwrapped_links,
581 "google-docs-export pre-processor rewrote markup"
582 );
583 let markdown = normalize_google_docs_export_markdown(
584 &crate::markdown::convert_html_to_markdown(&preprocess.html, Some(&result.export_url))?,
585 );
586 debug!(
587 document_id = %result.document_id,
588 bytes = markdown.len(),
589 "rendered Google Docs public export markdown"
590 );
591
592 Ok(GDocsResult {
593 content: markdown,
594 format: "markdown".to_string(),
595 document_id: result.document_id,
596 export_url: result.export_url,
597 })
598}
599
/// Output of `preprocess_google_docs_export_html`.
#[derive(Debug, Clone)]
pub struct GDocsExportPreprocessResult {
    /// The rewritten HTML.
    pub html: String,
    /// Number of styled `<span>` elements replaced by semantic tags.
    pub hoisted: usize,
    /// Number of Google redirect links rewritten to their target URL.
    pub unwrapped_links: usize,
}
613
614#[must_use]
622pub fn preprocess_google_docs_export_html(html: &str) -> GDocsExportPreprocessResult {
623 let mut hoisted: usize = 0;
624 let mut unwrapped_links: usize = 0;
625 let class_styles = extract_css_class_styles(html);
626
627 let mut out = hoist_inline_style_spans(html, &mut hoisted);
628 out = hoist_class_style_spans(&out, &class_styles, &mut hoisted);
629 out = split_strong_at_block_boundaries(&out);
630 out = split_paragraphs_at_bold_boundaries(&out);
631 out = remove_empty_strong(&out);
632 out = coalesce_adjacent_strong(&out);
633 out = convert_class_indented_blockquotes(&out, &class_styles);
634 out = nest_google_docs_lists(&out, &class_styles);
635 out = strip_google_docs_heading_noise(&out);
636 out = strip_heading_inline_formatting(&out);
637 out = unwrap_google_redirect_links(&out, &mut unwrapped_links);
638 out = out.replace(" ", " ");
639 out = out.replace('\u{00A0}', " ");
640
641 GDocsExportPreprocessResult {
642 html: out,
643 hoisted,
644 unwrapped_links,
645 }
646}
647
648#[must_use]
650pub fn normalize_google_docs_export_markdown(markdown: &str) -> String {
651 let markdown = unescape_public_export_punctuation(markdown);
652 let markdown = convert_setext_headings(&markdown);
653 let markdown = normalize_atx_headings(&markdown);
654 let markdown = normalize_bullet_markers(&markdown);
655 let markdown = normalize_list_spacing(&markdown);
656 let markdown = normalize_blockquote_spacing(&markdown);
657 let markdown = normalize_markdown_tables(&markdown);
658 crate::markdown::clean_markdown(&markdown)
659}
660
661fn hoist_inline_style_spans(html: &str, hoisted: &mut usize) -> String {
662 let span_re = Regex::new(r#"(?is)<span\s+([^>]*style="([^"]*)"[^>]*)>(.*?)</span>"#)
663 .expect("valid regex");
664 span_re
665 .replace_all(html, |caps: ®ex::Captures<'_>| {
666 let style = caps.get(2).map_or("", |m| m.as_str());
667 let inner = caps.get(3).map_or("", |m| m.as_str());
668 semantic_wrapped_html(inner, style).map_or_else(
669 || caps[0].to_string(),
670 |wrapped| {
671 *hoisted += 1;
672 wrapped
673 },
674 )
675 })
676 .into_owned()
677}
678
679fn hoist_class_style_spans(
680 html: &str,
681 class_styles: &HashMap<String, String>,
682 hoisted: &mut usize,
683) -> String {
684 let class_span_re = Regex::new(r#"(?is)<span\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</span>"#)
685 .expect("valid regex");
686 class_span_re
687 .replace_all(html, |caps: ®ex::Captures<'_>| {
688 let class_attr = caps.get(2).map_or("", |m| m.as_str());
689 let inner = caps.get(3).map_or("", |m| m.as_str());
690 let style = combined_class_style(class_styles, class_attr);
691 semantic_wrapped_html(inner, &style).map_or_else(
692 || caps[0].to_string(),
693 |wrapped| {
694 *hoisted += 1;
695 wrapped
696 },
697 )
698 })
699 .into_owned()
700}
701
702fn split_strong_at_block_boundaries(html: &str) -> String {
707 let strong_re = Regex::new(r"(?is)<strong>(.*?)</strong>").expect("valid regex");
708 let boundary_re = Regex::new(r"(?is)<br\s*/?>|<img\b[^>]*/?>").expect("valid regex");
709
710 let mut current = html.to_string();
711 loop {
712 let mut changed = false;
713 let next = strong_re
714 .replace_all(¤t, |caps: ®ex::Captures<'_>| {
715 let inner = caps.get(1).map_or("", |m| m.as_str());
716 if !boundary_re.is_match(inner) {
717 return caps[0].to_string();
718 }
719 changed = true;
720 let mut result = String::new();
721 let mut last_end = 0usize;
722 for boundary in boundary_re.find_iter(inner) {
723 let chunk = &inner[last_end..boundary.start()];
724 if chunk.trim().is_empty() {
725 result.push_str(chunk);
726 } else {
727 result.push_str("<strong>");
728 result.push_str(chunk);
729 result.push_str("</strong>");
730 }
731 result.push_str(boundary.as_str());
732 last_end = boundary.end();
733 }
734 let tail = &inner[last_end..];
735 if tail.trim().is_empty() {
736 result.push_str(tail);
737 } else {
738 result.push_str("<strong>");
739 result.push_str(tail);
740 result.push_str("</strong>");
741 }
742 result
743 })
744 .into_owned();
745 current = next;
746 if !changed {
747 break;
748 }
749 }
750 current
751}
752
753fn split_paragraphs_at_bold_boundaries(html: &str) -> String {
758 let p_re = Regex::new(r"(?is)<p\b([^>]*)>(.*?)</p>").expect("valid regex");
759 let br_re = Regex::new(r"(?is)<br\s*/?>").expect("valid regex");
760 let img_re = Regex::new(r"(?is)<img\b[^>]*/?>").expect("valid regex");
761 let strong_or_img_re = Regex::new(r"(?is)<strong>|<img\b").expect("valid regex");
762
763 p_re.replace_all(html, |caps: ®ex::Captures<'_>| {
764 let attrs = caps.get(1).map_or("", |m| m.as_str());
765 let inner = caps.get(2).map_or("", |m| m.as_str());
766 if !br_re.is_match(inner) {
767 return caps[0].to_string();
768 }
769 if !strong_or_img_re.is_match(inner) {
770 return caps[0].to_string();
771 }
772
773 let mut segments: Vec<String> = Vec::new();
774 let mut current = String::new();
775 let mut idx = 0usize;
776 while idx < inner.len() {
777 if let Some(br) = br_re.find_at(inner, idx) {
778 if br.start() == idx {
779 flush_paragraph_segment(&mut segments, &mut current);
780 idx = br.end();
781 continue;
782 }
783 }
784 if let Some(img) = img_re.find_at(inner, idx) {
785 if img.start() == idx {
786 flush_paragraph_segment(&mut segments, &mut current);
787 segments.push(img.as_str().to_string());
788 idx = img.end();
789 continue;
790 }
791 }
792 let next_br = br_re.find_at(inner, idx).map(|m| m.start());
794 let next_img = img_re.find_at(inner, idx).map(|m| m.start());
795 let next = match (next_br, next_img) {
796 (Some(a), Some(b)) => a.min(b),
797 (Some(a), None) | (None, Some(a)) => a,
798 (None, None) => inner.len(),
799 };
800 current.push_str(&inner[idx..next]);
801 idx = next;
802 }
803 flush_paragraph_segment(&mut segments, &mut current);
804
805 if segments.len() <= 1 {
806 return caps[0].to_string();
807 }
808 let mut out = String::new();
809 for segment in &segments {
810 let _ = write!(out, "<p{attrs}>{segment}</p>");
811 }
812 out
813 })
814 .into_owned()
815}
816
/// Moves the pending segment into `segments` unless it is whitespace-only,
/// leaving `current` empty in either case.
///
/// The segment is stored untrimmed.
fn flush_paragraph_segment(segments: &mut Vec<String>, current: &mut String) {
    let pending = std::mem::take(current);
    if pending.trim().is_empty() {
        return;
    }
    segments.push(pending);
}
824
825fn remove_empty_strong(html: &str) -> String {
828 let empty_re = Regex::new(r"(?is)<strong>\s*</strong>").expect("valid regex");
829 empty_re.replace_all(html, "").into_owned()
830}
831
832fn coalesce_adjacent_strong(html: &str) -> String {
836 let adjacent_re = Regex::new(r"(?is)</strong>(\s*)<strong>").expect("valid regex");
837 let mut current = html.to_string();
838 loop {
839 let next = adjacent_re.replace_all(¤t, "$1").into_owned();
840 if next == current {
841 return next;
842 }
843 current = next;
844 }
845}
846
847fn convert_class_indented_blockquotes(
848 html: &str,
849 class_styles: &HashMap<String, String>,
850) -> String {
851 let class_paragraph_re =
852 Regex::new(r#"(?is)<p\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</p>"#).expect("valid regex");
853 class_paragraph_re
854 .replace_all(html, |caps: ®ex::Captures<'_>| {
855 let class_attr = caps.get(2).map_or("", |m| m.as_str());
856 let inner = caps.get(3).map_or("", |m| m.as_str());
857 let style = combined_class_style(class_styles, class_attr);
858 if is_blockquote_style(&style) {
859 format!("<blockquote><p>{inner}</p></blockquote>")
860 } else {
861 caps[0].to_string()
862 }
863 })
864 .into_owned()
865}
866
/// One `<ul>`/`<ol>` element located in the export HTML by `nest_google_docs_lists`.
#[derive(Debug, Clone)]
struct ExportListBlock {
    /// Byte offset of the element's start in the scanned HTML.
    start: usize,
    /// Byte offset just past the element's end in the scanned HTML.
    end: usize,
    /// Lower-cased tag name (`ul` or `ol`).
    tag: String,
    /// Markup between the opening and closing tags.
    inner: String,
    /// Value of the `start` attribute; only collected for `ol` elements.
    start_attr: Option<String>,
}
875
/// One `<li>` extracted from an `ExportListBlock` by `render_nested_list_group`.
#[derive(Debug, Clone)]
struct ExportListItem {
    /// Tag of the list the item came from (`ul` or `ol`).
    tag: String,
    /// Nesting level computed by `google_docs_list_item_level`.
    level: usize,
    /// Markup between `<li ...>` and `</li>`.
    inner: String,
}
882
883fn nest_google_docs_lists(html: &str, class_styles: &HashMap<String, String>) -> String {
884 let list_re = Regex::new(r"(?is)<(ul|ol)\b([^>]*)>(.*?)</(ul|ol)>").expect("valid regex");
885 let start_attr_re = Regex::new(r#"(?i)\bstart\s*=\s*"([^"]*)""#).expect("valid regex");
886 let blocks: Vec<ExportListBlock> = list_re
887 .captures_iter(html)
888 .filter_map(|caps| {
889 let open_tag = caps.get(1)?.as_str().to_ascii_lowercase();
890 let close_tag = caps.get(4)?.as_str().to_ascii_lowercase();
891 if open_tag != close_tag {
892 return None;
893 }
894 let whole = caps.get(0)?;
895 let attrs = caps.get(2).map_or("", |m| m.as_str());
896 let start_attr = if open_tag == "ol" {
897 start_attr_re
898 .captures(attrs)
899 .and_then(|c| c.get(1).map(|m| m.as_str().to_string()))
900 } else {
901 None
902 };
903 Some(ExportListBlock {
904 start: whole.start(),
905 end: whole.end(),
906 tag: open_tag,
907 inner: caps.get(3).map_or("", |m| m.as_str()).to_string(),
908 start_attr,
909 })
910 })
911 .collect();
912
913 if blocks.len() < 2 {
914 return html.to_string();
915 }
916
917 let mut groups: Vec<Vec<ExportListBlock>> = Vec::new();
918 let mut current: Vec<ExportListBlock> = Vec::new();
919 for block in blocks {
920 if let Some(previous) = current.last() {
921 if !html[previous.end..block.start].trim().is_empty() {
922 if current.len() > 1 {
923 groups.push(std::mem::take(&mut current));
924 } else {
925 current.clear();
926 }
927 }
928 }
929 current.push(block);
930 }
931 if current.len() > 1 {
932 groups.push(current);
933 }
934
935 if groups.is_empty() {
936 return html.to_string();
937 }
938
939 let mut out = html.to_string();
940 for group in groups.iter().rev() {
941 let rendered = render_nested_list_group(group, class_styles);
942 let start = group.first().expect("non-empty group").start;
943 let end = group.last().expect("non-empty group").end;
944 out.replace_range(start..end, &rendered);
945 }
946 out
947}
948
/// Renders a group of adjacent flat lists as one nested list structure.
///
/// All `<li>` elements of the group's blocks are flattened into items tagged
/// with their source list type and a nesting level. The items are then
/// re-emitted in order: lists are closed while the current depth exceeds the
/// item's level, opened until the depth reaches it, and re-opened at the same
/// level when the list tag changes. An item's `<li>` stays open until the next
/// item at the same level or until its list closes, so deeper lists end up
/// inside it. The `start` attribute of the group's first block is applied only
/// to the first level-0 list that is opened.
///
/// When the group contains no `<li>` elements the blocks are re-serialised
/// as-is (tag and inner markup, without attributes).
#[allow(clippy::too_many_lines)]
fn render_nested_list_group(
    group: &[ExportListBlock],
    class_styles: &HashMap<String, String>,
) -> String {
    let item_re = Regex::new(r"(?is)<li\b([^>]*)>(.*?)</li>").expect("valid regex");
    // Flatten every <li> of every block, remembering its source tag and level.
    let items: Vec<ExportListItem> = group
        .iter()
        .flat_map(|block| {
            item_re.captures_iter(&block.inner).map(|caps| {
                let attrs = caps.get(1).map_or("", |m| m.as_str());
                let inner = caps.get(2).map_or("", |m| m.as_str()).to_string();
                ExportListItem {
                    tag: block.tag.clone(),
                    level: google_docs_list_item_level(attrs, class_styles),
                    inner,
                }
            })
        })
        .collect();

    if items.is_empty() {
        let mut unchanged = String::new();
        for block in group {
            write!(unchanged, "<{}>{}</{}>", block.tag, block.inner, block.tag)
                .expect("write to String");
        }
        return unchanged;
    }

    let top_level_start = group.first().and_then(|block| block.start_attr.clone());

    let mut html = String::new();
    // Depth of the innermost open list; `None` while no list is open.
    let mut current_level: Option<usize> = None;
    // Per level: the tag of the open list, and whether an <li> is still open.
    let mut open_tags: Vec<Option<String>> = Vec::new();
    let mut item_open: Vec<bool> = Vec::new();
    let mut top_level_opened = false;

    for item in items {
        let level = item.level;
        // Close lists that are deeper than this item.
        while current_level.is_some_and(|current| current > level) {
            let current = current_level.expect("checked as Some");
            close_rendered_list(&mut html, &mut open_tags, &mut item_open, current);
            current_level = current.checked_sub(1);
        }

        // Open lists, one level at a time, until this item's level is reached.
        while current_level.is_none_or(|current| current < level) {
            let next_level = current_level.map_or(0, |current| current + 1);
            let start_attr = if next_level == 0 && !top_level_opened {
                top_level_opened = true;
                top_level_start.as_deref()
            } else {
                None
            };
            open_rendered_list(
                &mut html,
                &mut open_tags,
                &mut item_open,
                next_level,
                &item.tag,
                start_attr,
            );
            current_level = Some(next_level);
        }

        ensure_list_stack(&mut open_tags, &mut item_open, level);
        if open_tags[level]
            .as_deref()
            .is_some_and(|tag| tag != item.tag)
        {
            // Same level but a different list type: swap the list.
            close_rendered_list(&mut html, &mut open_tags, &mut item_open, level);
            let start_attr = if level == 0 && !top_level_opened {
                top_level_opened = true;
                top_level_start.as_deref()
            } else {
                None
            };
            open_rendered_list(
                &mut html,
                &mut open_tags,
                &mut item_open,
                level,
                &item.tag,
                start_attr,
            );
        } else if open_tags[level].is_none() {
            // No list is open at this level (its slot was cleared): open one.
            let start_attr = if level == 0 && !top_level_opened {
                top_level_opened = true;
                top_level_start.as_deref()
            } else {
                None
            };
            open_rendered_list(
                &mut html,
                &mut open_tags,
                &mut item_open,
                level,
                &item.tag,
                start_attr,
            );
        }

        // Close the previous sibling, then leave this <li> open so nested
        // lists can be emitted inside it.
        close_rendered_item(&mut html, &mut item_open, level);
        html.push_str("<li>");
        html.push_str(&item.inner);
        item_open[level] = true;

        // Forget any state recorded for deeper levels.
        for deeper in (level + 1)..item_open.len() {
            item_open[deeper] = false;
            open_tags[deeper] = None;
        }
    }

    // Close everything that is still open, innermost first.
    while let Some(current) = current_level {
        close_rendered_list(&mut html, &mut open_tags, &mut item_open, current);
        current_level = current.checked_sub(1);
    }

    html
}
1069
/// Grows both per-level stacks until index `level` exists in `open_tags`.
///
/// New slots are `None` / `false`. The number of slots added is decided by the
/// length of `open_tags` alone; the same number is appended to `item_open`.
fn ensure_list_stack(open_tags: &mut Vec<Option<String>>, item_open: &mut Vec<bool>, level: usize) {
    for _ in open_tags.len()..=level {
        open_tags.push(None);
        item_open.push(false);
    }
}
1076
1077fn open_rendered_list(
1078 html: &mut String,
1079 open_tags: &mut Vec<Option<String>>,
1080 item_open: &mut Vec<bool>,
1081 level: usize,
1082 tag: &str,
1083 start_attr: Option<&str>,
1084) {
1085 ensure_list_stack(open_tags, item_open, level);
1086 html.push('<');
1087 html.push_str(tag);
1088 if let Some(start) = start_attr {
1089 if tag == "ol" && !start.is_empty() {
1090 write!(html, r#" start="{start}""#).expect("write to String");
1091 }
1092 }
1093 html.push('>');
1094 open_tags[level] = Some(tag.to_string());
1095 item_open[level] = false;
1096}
1097
/// Emits `</li>` when an item is open at `level`, and clears the flag.
///
/// Does nothing when `level` is out of range or no item is open there.
fn close_rendered_item(html: &mut String, item_open: &mut [bool], level: usize) {
    let was_open = item_open
        .get_mut(level)
        .is_some_and(|flag| std::mem::replace(flag, false));
    if was_open {
        html.push_str("</li>");
    }
}
1104
1105fn close_rendered_list(
1106 html: &mut String,
1107 open_tags: &mut [Option<String>],
1108 item_open: &mut [bool],
1109 level: usize,
1110) {
1111 close_rendered_item(html, item_open, level);
1112 if let Some(tag) = open_tags.get_mut(level).and_then(Option::take) {
1113 html.push_str("</");
1114 html.push_str(&tag);
1115 html.push('>');
1116 }
1117}
1118
1119fn google_docs_list_item_level(attrs: &str, class_styles: &HashMap<String, String>) -> usize {
1120 let style = combined_attr_style(class_styles, attrs);
1121 let margin_left = css_point_value(&style, "margin-left");
1122 if margin_left <= 0.0 {
1123 return 0;
1124 }
1125 [54.0, 90.0, 126.0, 162.0, 198.0, 234.0, 270.0, 306.0]
1126 .iter()
1127 .take_while(|boundary| margin_left >= **boundary)
1128 .count()
1129}
1130
1131fn combined_attr_style(class_styles: &HashMap<String, String>, attrs: &str) -> String {
1132 let mut styles = String::new();
1133 if let Some(style) = attr_value(attrs, "style") {
1134 styles.push_str(&style);
1135 }
1136 if let Some(class_attr) = attr_value(attrs, "class") {
1137 styles.push_str(&combined_class_style(class_styles, &class_attr));
1138 }
1139 styles
1140}
1141
1142fn attr_value(attrs: &str, name: &str) -> Option<String> {
1143 let attr_re = Regex::new(&format!(
1144 r#"(?is)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)')"#,
1145 regex::escape(name)
1146 ))
1147 .expect("valid regex");
1148 attr_re.captures(attrs).and_then(|caps| {
1149 caps.get(1)
1150 .or_else(|| caps.get(2))
1151 .map(|value| value.as_str().to_string())
1152 })
1153}
1154
1155fn strip_google_docs_heading_noise(html: &str) -> String {
1156 let empty_anchor_re = Regex::new(r#"(?is)<a\s+id="[^"]*"\s*>\s*</a>"#).expect("valid regex");
1157 let numbering_re =
1158 Regex::new(r"(?is)<span\b[^>]*>\s*\d+(?:\.\d+)*\.?\s*</span>").expect("valid regex");
1159 let mut out = empty_anchor_re.replace_all(html, "").into_owned();
1160 for level in 1..=6 {
1161 let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
1162 .expect("valid regex");
1163 out = heading_re
1164 .replace_all(&out, |caps: ®ex::Captures<'_>| {
1165 let open = &caps[1];
1166 let inner = &caps[2];
1167 let close = &caps[3];
1168 let mut cleaned = empty_anchor_re.replace_all(inner, "").into_owned();
1169 cleaned = numbering_re.replace_all(&cleaned, "").into_owned();
1170 format!("{open}{cleaned}{close}")
1171 })
1172 .into_owned();
1173 }
1174 out
1175}
1176
1177fn strip_heading_inline_formatting(html: &str) -> String {
1178 let inline_marker_re = Regex::new(r"(?is)</?(?:strong|em|del)>").expect("valid regex");
1179 let mut out = html.to_string();
1180 for level in 1..=6 {
1181 let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
1182 .expect("valid regex");
1183 out = heading_re
1184 .replace_all(&out, |caps: ®ex::Captures<'_>| {
1185 let open = &caps[1];
1186 let inner = &caps[2];
1187 let close = &caps[3];
1188 let cleaned = inline_marker_re.replace_all(inner, "");
1189 format!("{open}{cleaned}{close}")
1190 })
1191 .into_owned();
1192 }
1193 out
1194}
1195
1196fn unwrap_google_redirect_links(html: &str, unwrapped_links: &mut usize) -> String {
1197 let redirect_re =
1198 Regex::new(r#"(?i)href="https?://(?:www\.)?google\.com/url\?q=([^&"]+)[^"]*""#)
1199 .expect("valid regex");
1200 redirect_re
1201 .replace_all(html, |caps: ®ex::Captures<'_>| {
1202 let encoded = caps.get(1).map_or("", |m| m.as_str());
1203 let decoded = percent_decode_utf8_lossy(encoded);
1204 *unwrapped_links += 1;
1205 format!(r#"href="{decoded}""#)
1206 })
1207 .into_owned()
1208}
1209
1210fn extract_css_class_styles(html: &str) -> HashMap<String, String> {
1211 let mut class_styles: HashMap<String, String> = HashMap::new();
1212 let style_re = Regex::new(r"(?is)<style\b[^>]*>(.*?)</style>").expect("valid regex");
1213 let class_re = Regex::new(r"\.([A-Za-z0-9_-]+)\s*\{([^{}]*)\}").expect("valid regex");
1214 for style_caps in style_re.captures_iter(html) {
1215 let css = style_caps.get(1).map_or("", |m| m.as_str());
1216 for class_caps in class_re.captures_iter(css) {
1217 let class_name = class_caps.get(1).map_or("", |m| m.as_str());
1218 let style = class_caps.get(2).map_or("", |m| m.as_str());
1219 class_styles
1220 .entry(class_name.to_string())
1221 .and_modify(|existing| {
1222 existing.push(';');
1223 existing.push_str(style);
1224 })
1225 .or_insert_with(|| style.to_string());
1226 }
1227 }
1228 class_styles
1229}
1230
/// Resolves a `class` attribute value to the concatenated CSS of its classes.
///
/// Each known class contributes `;` followed by its declarations, in the
/// order the classes appear; unknown classes are skipped.
fn combined_class_style(class_styles: &HashMap<String, String>, class_attr: &str) -> String {
    let mut combined = String::new();
    for class_name in class_attr.split_whitespace() {
        if let Some(style) = class_styles.get(class_name) {
            combined.push(';');
            combined.push_str(style);
        }
    }
    combined
}
1241
1242fn semantic_wrapped_html(inner: &str, style: &str) -> Option<String> {
1243 let bold = css_has_bold(style);
1244 let italic = css_has_italic(style);
1245 let strike = css_has_strike(style);
1246 if !bold && !italic && !strike {
1247 return None;
1248 }
1249 let mut wrapped = inner.to_string();
1250 if strike {
1251 wrapped = format!("<del>{wrapped}</del>");
1252 }
1253 if italic {
1254 wrapped = format!("<em>{wrapped}</em>");
1255 }
1256 if bold {
1257 wrapped = format!("<strong>{wrapped}</strong>");
1258 }
1259 Some(wrapped)
1260}
1261
1262fn css_has_bold(style: &str) -> bool {
1263 Regex::new(r"(?i)font-weight\s*:\s*(?:bold|[6-9]\d{2})")
1264 .expect("valid regex")
1265 .is_match(style)
1266}
1267
1268fn css_has_italic(style: &str) -> bool {
1269 Regex::new(r"(?i)font-style\s*:\s*italic")
1270 .expect("valid regex")
1271 .is_match(style)
1272}
1273
1274fn css_has_strike(style: &str) -> bool {
1275 Regex::new(r"(?i)text-decoration[^;]*\bline-through\b")
1276 .expect("valid regex")
1277 .is_match(style)
1278}
1279
1280fn is_blockquote_style(style: &str) -> bool {
1281 let margin_left = css_point_value(style, "margin-left");
1282 let margin_right = css_point_value(style, "margin-right");
1283 margin_left > 0.0 && margin_right > 0.0 && (margin_left - margin_right).abs() < 0.1
1284}
1285
1286fn css_point_value(style: &str, property: &str) -> f64 {
1287 let re = Regex::new(&format!(
1288 r"(?i){}\s*:\s*(-?\d+(?:\.\d+)?)pt",
1289 regex::escape(property)
1290 ))
1291 .expect("valid regex");
1292 re.captures(style)
1293 .and_then(|caps| caps.get(1))
1294 .and_then(|value| value.as_str().parse::<f64>().ok())
1295 .unwrap_or(0.0)
1296}
1297
/// Decodes `%XX` escapes in `input` and returns the result as a string.
///
/// A `%` that is not followed by two hexadecimal digits is copied through
/// unchanged, and `+` is not treated as a space. Decoded bytes that do not
/// form valid UTF-8 are replaced with U+FFFD.
fn percent_decode_utf8_lossy(input: &str) -> String {
    let hex_value = |byte: u8| (byte as char).to_digit(16);

    let raw = input.as_bytes();
    let mut out = Vec::with_capacity(raw.len());
    let mut pos = 0;
    while let Some(&current) = raw.get(pos) {
        if current == b'%' {
            // Only an escape when both following bytes exist and are hex digits.
            if let Some(&[high, low]) = raw.get(pos + 1..pos + 3) {
                if let (Some(high), Some(low)) = (hex_value(high), hex_value(low)) {
                    if let Ok(byte) = u8::try_from(high * 16 + low) {
                        out.push(byte);
                        pos += 3;
                        continue;
                    }
                }
            }
        }
        out.push(current);
        pos += 1;
    }
    String::from_utf8_lossy(&out).into_owned()
}
1321
/// Removes the backslash from the Markdown escapes `\.`, `\!`, `\(`, `\)`,
/// `\[` and `\]`, leaving the bare punctuation character.
///
/// The replacements are applied one after another in the order listed.
fn unescape_public_export_punctuation(markdown: &str) -> String {
    const UNESCAPES: [(&str, &str); 6] = [
        ("\\.", "."),
        ("\\!", "!"),
        ("\\(", "("),
        ("\\)", ")"),
        ("\\[", "["),
        ("\\]", "]"),
    ];
    let mut unescaped = markdown.to_string();
    for (escaped, plain) in UNESCAPES.iter() {
        unescaped = unescaped.replace(escaped, plain);
    }
    unescaped
}
1331
/// Rewrites setext headings (text underlined with `=====` or `-----`) as ATX
/// headings (`# text` / `## text`).
///
/// A line is a heading only when it contains text and the following line is a
/// setext underline (see [`is_setext_underline`]). An underline-looking line
/// after a blank line is left untouched: it is a horizontal rule, and
/// converting it would emit an empty heading and drop the rule.
///
/// Lines are re-joined with `\n`, so a trailing newline in the input is not
/// preserved.
fn convert_setext_headings(markdown: &str) -> String {
    let lines: Vec<&str> = markdown.lines().collect();
    let mut out = Vec::with_capacity(lines.len());
    let mut index = 0;
    while index < lines.len() {
        let title = lines[index].trim();
        let heading_prefix = match lines.get(index + 1).map(|line| line.trim()) {
            // No title text means the "underline" cannot belong to a heading.
            Some(_) if title.is_empty() => None,
            Some(underline) if is_setext_underline(underline, '=') => Some("#"),
            Some(underline) if is_setext_underline(underline, '-') => Some("##"),
            _ => None,
        };
        match heading_prefix {
            Some(prefix) => {
                out.push(format!("{prefix} {title}"));
                // Consume the title and its underline.
                index += 2;
            }
            None => {
                out.push(lines[index].to_string());
                index += 1;
            }
        }
    }
    out.join("\n")
}

/// Reports whether `line` is a setext underline for `marker`: at least five
/// bytes long and made up solely of the marker character.
fn is_setext_underline(line: &str, marker: char) -> bool {
    line.len() >= 5 && line.chars().all(|ch| ch == marker)
}
1359
1360fn normalize_atx_headings(markdown: &str) -> String {
1361 let heading_re = Regex::new(r"^(#{1,6})\s+(.+?)\s*$").expect("valid regex");
1362 let closing_re = closing_atx_heading_re();
1363 markdown
1364 .lines()
1365 .map(|line| {
1366 let Some(caps) = heading_re.captures(line) else {
1367 return line.to_string();
1368 };
1369 let hashes = caps.get(1).map_or("", |m| m.as_str());
1370 let mut text = caps.get(2).map_or("", |m| m.as_str()).trim().to_string();
1371 text = closing_re.replace(&text, "").trim().to_string();
1372 text = strip_wrapping_markdown_emphasis(&text);
1373 format!("{hashes} {text}")
1374 })
1375 .collect::<Vec<_>>()
1376 .join("\n")
1377}
1378
/// Removes one layer of `***`, `**` or `*` that both starts and ends `text`.
///
/// The input is trimmed first. Markers are tried longest first, and one is
/// stripped only when the text is longer than the two markers combined. The
/// remaining inner text is trimmed again. Text without a wrapping marker is
/// returned trimmed.
fn strip_wrapping_markdown_emphasis(text: &str) -> String {
    let candidate = text.trim();
    ["***", "**", "*"]
        .iter()
        .copied()
        .find(|marker| {
            candidate.len() > 2 * marker.len()
                && candidate.starts_with(*marker)
                && candidate.ends_with(*marker)
        })
        .and_then(|marker| candidate.strip_prefix(marker)?.strip_suffix(marker))
        .map_or(candidate, str::trim)
        .to_string()
}
1393
1394fn normalize_bullet_markers(markdown: &str) -> String {
1395 let bullet_re = asterisk_bullet_re();
1396 markdown
1397 .lines()
1398 .map(|line| bullet_re.replace(line, "$1- ").into_owned())
1399 .collect::<Vec<_>>()
1400 .join("\n")
1401}
1402
1403fn normalize_list_spacing(markdown: &str) -> String {
1404 let lines: Vec<&str> = markdown.lines().collect();
1405 let mut out = Vec::with_capacity(lines.len());
1406
1407 for (index, line) in lines.iter().enumerate() {
1408 if line.trim().is_empty()
1409 && previous_non_empty_line(&lines, index).is_some_and(is_markdown_list_item)
1410 && next_non_empty_line(&lines, index).is_some_and(is_markdown_list_item)
1411 {
1412 continue;
1413 }
1414 out.push((*line).to_string());
1415 }
1416
1417 out.join("\n")
1418}
1419
/// Returns the closest line before `index` that is not blank (empty or
/// whitespace only), or `None` when there is no such line.
///
/// Panics if `index` is greater than `lines.len()`.
fn previous_non_empty_line<'a>(lines: &'a [&str], index: usize) -> Option<&'a str> {
    let before = &lines[..index];
    before
        .iter()
        .copied()
        .rfind(|line| !line.trim().is_empty())
}
1427
/// Returns the first line after `index` that is not blank (empty or
/// whitespace only), or `None` when there is no such line.
///
/// Panics if `index + 1` is greater than `lines.len()`.
fn next_non_empty_line<'a>(lines: &'a [&str], index: usize) -> Option<&'a str> {
    let after = &lines[index + 1..];
    for line in after {
        if !line.trim().is_empty() {
            return Some(*line);
        }
    }
    None
}
1434
1435fn is_markdown_list_item(line: &str) -> bool {
1436 markdown_list_item_re().is_match(line)
1437}
1438
/// Normalises blank lines in and around `> ` block quotes.
///
/// * Inside a quote, any run of blank lines and bare `>` lines collapses into
///   a single `>` line, emitted only if more quoted text follows.
/// * Bare `>` lines outside a quote are dropped.
/// * The first non-quote line after a quote is preceded by exactly one blank
///   line.
///
/// Every emitted line ends with `\n`, so non-empty output always ends with a
/// newline.
fn normalize_blockquote_spacing(markdown: &str) -> String {
    let mut out = String::with_capacity(markdown.len());
    let mut gap_pending = false;
    let mut inside_quote = false;

    for line in markdown.lines() {
        let trimmed = line.trim();
        match trimmed {
            // Blank line inside a quote: remember it, decide later.
            "" if inside_quote => gap_pending = true,
            // Bare marker: a gap inside a quote, noise outside one.
            ">" => gap_pending |= inside_quote,
            _ if line.starts_with("> ") => {
                if std::mem::take(&mut gap_pending) {
                    out.push_str(">\n");
                }
                out.push_str(line);
                out.push('\n');
                inside_quote = true;
            }
            _ => {
                // Leaving a quote: separate it from the following text.
                if inside_quote && !trimmed.is_empty() {
                    out.push('\n');
                }
                gap_pending = false;
                inside_quote = false;
                out.push_str(line);
                out.push('\n');
            }
        }
    }

    out
}
1479
1480fn normalize_markdown_tables(markdown: &str) -> String {
1481 let lines: Vec<&str> = markdown.lines().collect();
1482 let mut out = Vec::with_capacity(lines.len());
1483 let mut index = 0;
1484
1485 while index < lines.len() {
1486 if !is_markdown_table_line(lines[index]) {
1487 out.push(lines[index].to_string());
1488 index += 1;
1489 continue;
1490 }
1491
1492 let start = index;
1493 while index < lines.len() && is_markdown_table_line(lines[index]) {
1494 index += 1;
1495 }
1496 let block = &lines[start..index];
1497 if block.len() >= 2 && is_markdown_separator_line(block[1]) {
1498 out.extend(normalize_markdown_table_block(block));
1499 } else {
1500 out.extend(block.iter().map(|line| (*line).to_string()));
1501 }
1502 }
1503
1504 out.join("\n")
1505}
1506
/// Reports whether `line`, ignoring surrounding whitespace, both starts and
/// ends with `|` and contains at least two `|` characters.
fn is_markdown_table_line(line: &str) -> bool {
    let row = line.trim();
    if !row.starts_with('|') || !row.ends_with('|') {
        return false;
    }
    row.chars().filter(|&ch| ch == '|').count() >= 2
}
1511
1512fn is_markdown_separator_line(line: &str) -> bool {
1513 split_markdown_table_cells(line)
1514 .iter()
1515 .all(|cell| markdown_table_separator_cell_re().is_match(cell))
1516}
1517
1518fn normalize_markdown_table_block(lines: &[&str]) -> Vec<String> {
1519 lines
1520 .iter()
1521 .enumerate()
1522 .map(|(index, line)| {
1523 let cells = split_markdown_table_cells(line);
1524 if index == 1 {
1525 let separators = vec!["---".to_string(); cells.len()];
1526 render_markdown_table_row(&separators)
1527 } else {
1528 render_markdown_table_row(&cells)
1529 }
1530 })
1531 .collect()
1532}
1533
/// Splits a pipe-table row into its trimmed cell texts.
///
/// Exactly one leading and one trailing `|` delimiter is removed before
/// splitting. This keeps empty cells at the edges of compactly written rows
/// such as `|| a |`; stripping every edge pipe would silently drop them and
/// shift the remaining columns.
fn split_markdown_table_cells(line: &str) -> Vec<String> {
    let row = line.trim();
    let row = row.strip_prefix('|').unwrap_or(row);
    let row = row.strip_suffix('|').unwrap_or(row);
    row.split('|')
        .map(|cell| cell.trim().to_string())
        .collect()
}
1541
/// Renders `cells` as one pipe-table row: cells separated by ` | ` and the
/// whole row wrapped in `| ` and ` |`.
fn render_markdown_table_row(cells: &[String]) -> String {
    let mut row = String::from("| ");
    row.push_str(&cells.join(" | "));
    row.push_str(" |");
    row
}
1545
1546fn closing_atx_heading_re() -> &'static Regex {
1547 static RE: OnceLock<Regex> = OnceLock::new();
1548 RE.get_or_init(|| Regex::new(r"\s+#{1,6}$").expect("valid regex"))
1549}
1550
1551fn asterisk_bullet_re() -> &'static Regex {
1552 static RE: OnceLock<Regex> = OnceLock::new();
1553 RE.get_or_init(|| Regex::new(r"^(\s*)\* ").expect("valid regex"))
1554}
1555
1556fn markdown_list_item_re() -> &'static Regex {
1557 static RE: OnceLock<Regex> = OnceLock::new();
1558 RE.get_or_init(|| Regex::new(r"^\s*(?:[-+*]|\d+\.)\s+").expect("valid regex"))
1559}
1560
1561fn markdown_table_separator_cell_re() -> &'static Regex {
1562 static RE: OnceLock<Regex> = OnceLock::new();
1563 RE.get_or_init(|| Regex::new(r"^:?-{3,}:?$").expect("valid regex"))
1564}
1565
1566pub async fn fetch_google_doc_from_docs_api(
1572 url: &str,
1573 api_token: &str,
1574) -> crate::Result<GDocsRenderedResult> {
1575 let document_id = extract_document_id(url).ok_or_else(|| {
1576 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
1577 })?;
1578 let api_url = build_docs_api_url(&document_id);
1579 debug!(
1580 document_id = %document_id,
1581 api_url = %api_url,
1582 "fetching Google Doc via Docs API"
1583 );
1584
1585 let response = reqwest::Client::new()
1586 .get(&api_url)
1587 .header("Authorization", format!("Bearer {api_token}"))
1588 .header("Accept", "application/json")
1589 .send()
1590 .await
1591 .map_err(|e| {
1592 WebCaptureError::FetchError(format!("Failed to fetch Google Doc via Docs API: {e}"))
1593 })?;
1594 debug!(
1595 document_id = %document_id,
1596 status = response.status().as_u16(),
1597 success = response.status().is_success(),
1598 content_type = response
1599 .headers()
1600 .get(reqwest::header::CONTENT_TYPE)
1601 .and_then(|value| value.to_str().ok())
1602 .unwrap_or(""),
1603 "received Google Docs API response"
1604 );
1605
1606 if !response.status().is_success() {
1607 return Err(WebCaptureError::FetchError(format!(
1608 "Failed to fetch Google Doc via Docs API ({} {}): {}",
1609 response.status().as_u16(),
1610 response.status().canonical_reason().unwrap_or("Unknown"),
1611 api_url
1612 )));
1613 }
1614
1615 let body = response.text().await.map_err(|e| {
1616 WebCaptureError::FetchError(format!("Failed to read Google Docs API response: {e}"))
1617 })?;
1618 let document = serde_json::from_str::<Value>(&body).map_err(|e| {
1619 WebCaptureError::ParseError(format!("Failed to parse Google Docs API response: {e}"))
1620 })?;
1621 let rendered = render_docs_api_document(&document);
1622 debug!(
1623 document_id = %document_id,
1624 title = document.get("title").and_then(|value| value.as_str()).unwrap_or(""),
1625 markdown_bytes = rendered.markdown.len(),
1626 html_bytes = rendered.html.len(),
1627 text_bytes = rendered.text.len(),
1628 "rendered Google Docs API document"
1629 );
1630
1631 Ok(GDocsRenderedResult {
1632 markdown: rendered.markdown,
1633 html: rendered.html,
1634 text: rendered.text,
1635 document_id,
1636 export_url: api_url,
1637 remote_images: Vec::new(),
1638 })
1639}
1640
1641pub async fn fetch_google_doc_from_model(
1647 url: &str,
1648 api_token: Option<&str>,
1649) -> crate::Result<GDocsRenderedResult> {
1650 if api_token.is_some() {
1651 return Err(WebCaptureError::BrowserError(
1652 "Rust browser-model Google Docs capture cannot inject API tokens; use --capture api for authenticated Docs API capture".to_string(),
1653 ));
1654 }
1655 let document_id = extract_document_id(url).ok_or_else(|| {
1656 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
1657 })?;
1658 let edit_url = build_edit_url(&document_id);
1659 debug!(
1660 document_id = %document_id,
1661 edit_url = %edit_url,
1662 "capturing Google Doc editor model with a real browser"
1663 );
1664 let model_data = fetch_google_doc_editor_model_with_cdp(&edit_url, &document_id).await?;
1665 let BrowserModelData {
1666 chunks,
1667 cid_urls,
1668 chunk_payload_bytes,
1669 poll_count,
1670 stable_for,
1671 } = model_data;
1672 debug!(
1673 document_id = %document_id,
1674 chunks = chunks.len(),
1675 cid_urls = cid_urls.len(),
1676 chunk_payload_bytes,
1677 poll_count,
1678 stable_for_ms = stable_for.as_millis(),
1679 "extracted Google Docs editor model chunks through CDP"
1680 );
1681 if chunks.is_empty() {
1682 return Err(WebCaptureError::ParseError(
1683 "Google Docs editor page did not expose DOCS_modelChunk data".to_string(),
1684 ));
1685 }
1686
1687 let export_html = match fetch_google_doc(url, "html", None).await {
1688 Ok(result) => Some(result.content),
1689 Err(error) => {
1690 warn!(
1691 document_id = %document_id,
1692 error = %error,
1693 "failed to fetch Google Docs export HTML for browser-model semantic hints"
1694 );
1695 None
1696 }
1697 };
1698 let capture = parse_model_chunks_with_export_html(&chunks, &cid_urls, export_html.as_deref());
1699 let remote_images = remote_images_from_capture(&capture);
1700 info!(
1701 document_id = %document_id,
1702 chunks = chunks.len(),
1703 cid_urls = cid_urls.len(),
1704 chunk_payload_bytes,
1705 poll_count,
1706 stable_for_ms = stable_for.as_millis(),
1707 blocks = capture.blocks.len(),
1708 tables = capture.tables.len(),
1709 images = capture.images.len(),
1710 text_bytes = capture.text.len(),
1711 "parsed Google Docs editor model"
1712 );
1713
1714 Ok(GDocsRenderedResult {
1715 markdown: render_captured_document(&capture, "markdown"),
1716 html: render_captured_document(&capture, "html"),
1717 text: render_captured_document(&capture, "txt"),
1718 document_id,
1719 export_url: edit_url,
1720 remote_images,
1721 })
1722}
1723
1724async fn fetch_google_doc_editor_model_with_cdp(
1725 edit_url: &str,
1726 document_id: &str,
1727) -> crate::Result<BrowserModelData> {
1728 let chrome = crate::browser::find_chrome_executable().ok_or_else(|| {
1729 WebCaptureError::BrowserError(
1730 "Chrome/Chromium executable was not found. Set WEB_CAPTURE_CHROME, CHROME_PATH, or GOOGLE_CHROME_BIN.".to_string(),
1731 )
1732 })?;
1733 let user_data_dir = crate::browser::temporary_user_data_dir();
1734 std::fs::create_dir_all(&user_data_dir)?;
1735
1736 debug!(
1737 document_id = %document_id,
1738 chrome = %chrome.display(),
1739 user_data_dir = %user_data_dir.display(),
1740 edit_url = %edit_url,
1741 "launching headless Chrome CDP session for Google Docs model capture"
1742 );
1743
1744 let mut child = launch_cdp_chrome(&chrome, &user_data_dir)?;
1745 let capture_result = async {
1746 let ws_url = wait_for_devtools_ws_url(&mut child).await?;
1747 let (mut ws, _) = connect_async(&ws_url).await.map_err(|error| {
1748 WebCaptureError::BrowserError(format!(
1749 "Failed to connect to Chrome DevTools websocket: {error}"
1750 ))
1751 })?;
1752 let mut next_id = 0u64;
1753 let session_id = navigate_google_docs_cdp_page(&mut ws, &mut next_id, edit_url).await?;
1754 wait_for_google_docs_model_chunks(&mut ws, &mut next_id, &session_id, document_id).await
1755 }
1756 .await;
1757
1758 if let Err(error) = child.kill().await {
1759 debug!(
1760 document_id = %document_id,
1761 error = %error,
1762 "failed to kill Chrome CDP browser process"
1763 );
1764 }
1765 let _ = child.wait().await;
1766 let _ = std::fs::remove_dir_all(&user_data_dir);
1767
1768 capture_result
1769}
1770
1771async fn navigate_google_docs_cdp_page(
1772 ws: &mut CdpWebSocket,
1773 next_id: &mut u64,
1774 edit_url: &str,
1775) -> crate::Result<String> {
1776 let target = cdp_send(
1777 ws,
1778 next_id,
1779 None,
1780 "Target.createTarget",
1781 serde_json::json!({ "url": "about:blank" }),
1782 )
1783 .await?;
1784 let target_id = target
1785 .get("targetId")
1786 .and_then(Value::as_str)
1787 .ok_or_else(|| {
1788 WebCaptureError::BrowserError(
1789 "Chrome DevTools Target.createTarget did not return targetId".to_string(),
1790 )
1791 })?
1792 .to_string();
1793 let attached = cdp_send(
1794 ws,
1795 next_id,
1796 None,
1797 "Target.attachToTarget",
1798 serde_json::json!({ "targetId": target_id, "flatten": true }),
1799 )
1800 .await?;
1801 let session_id = attached
1802 .get("sessionId")
1803 .and_then(Value::as_str)
1804 .ok_or_else(|| {
1805 WebCaptureError::BrowserError(
1806 "Chrome DevTools Target.attachToTarget did not return sessionId".to_string(),
1807 )
1808 })?
1809 .to_string();
1810
1811 cdp_send(
1812 ws,
1813 next_id,
1814 Some(&session_id),
1815 "Page.enable",
1816 serde_json::json!({}),
1817 )
1818 .await?;
1819 cdp_send(
1820 ws,
1821 next_id,
1822 Some(&session_id),
1823 "Runtime.enable",
1824 serde_json::json!({}),
1825 )
1826 .await?;
1827 cdp_send(
1828 ws,
1829 next_id,
1830 Some(&session_id),
1831 "Page.addScriptToEvaluateOnNewDocument",
1832 serde_json::json!({ "source": GDOCS_MODEL_CAPTURE_INIT_SCRIPT }),
1833 )
1834 .await?;
1835 cdp_send(
1836 ws,
1837 next_id,
1838 Some(&session_id),
1839 "Page.navigate",
1840 serde_json::json!({ "url": edit_url }),
1841 )
1842 .await?;
1843
1844 Ok(session_id)
1845}
1846
/// Polls the page until the captured model chunks stop changing, then returns
/// them.
///
/// Each iteration evaluates `GDOCS_MODEL_EXTRACT_SCRIPT` through
/// `Runtime.evaluate`, converts the returned value into `BrowserModelData`
/// and feeds its fingerprint to a `BrowserModelQuiescence` tracker. The loop
/// ends as soon as the tracker reports a stable duration; the exact stability
/// criteria live in `BrowserModelQuiescence`, which is not visible here.
///
/// The maximum wait and the stability window come from
/// `gdocs_editor_model_max_wait` and `gdocs_editor_model_stability_window`;
/// polls are spaced by `GDOCS_EDITOR_MODEL_POLL_INTERVAL`.
///
/// # Errors
///
/// Returns `WebCaptureError::BrowserError` when the extraction script throws
/// or when the maximum wait elapses first (the message carries the figures
/// from the last poll), and propagates any `cdp_send` failure.
async fn wait_for_google_docs_model_chunks(
    ws: &mut CdpWebSocket,
    next_id: &mut u64,
    session_id: &str,
    document_id: &str,
) -> crate::Result<BrowserModelData> {
    let started = Instant::now();
    let max_wait = gdocs_editor_model_max_wait();
    let stability_window = gdocs_editor_model_stability_window();
    let mut quiescence = BrowserModelQuiescence::default();
    // Figures from the most recent poll, kept only for the timeout message.
    let mut last_chunks = 0usize;
    let mut last_cid_urls = 0usize;
    let mut last_payload_bytes = 0usize;
    let mut last_stable_for = Duration::ZERO;
    let mut poll_count = 0usize;

    while started.elapsed() < max_wait {
        let result = cdp_send(
            ws,
            next_id,
            Some(session_id),
            "Runtime.evaluate",
            serde_json::json!({
                "expression": format!("({GDOCS_MODEL_EXTRACT_SCRIPT})()"),
                "returnByValue": true,
                "awaitPromise": true
            }),
        )
        .await?;
        // A script exception is reported inside a successful CDP response.
        if let Some(exception) = result.get("exceptionDetails") {
            return Err(WebCaptureError::BrowserError(format!(
                "Google Docs model extraction script failed: {exception}"
            )));
        }
        let value = result
            .pointer("/result/value")
            .cloned()
            .unwrap_or(Value::Null);
        let model_data = browser_model_data_from_value(&value);
        poll_count += 1;
        let fingerprint = model_data.fingerprint();
        last_chunks = model_data.chunks.len();
        last_cid_urls = model_data.cid_urls.len();
        last_payload_bytes = model_data.chunk_payload_bytes;
        let now = Instant::now();
        if let Some(stable_for) = quiescence.observe(fingerprint, now, stability_window) {
            // Fill in the bookkeeping fields that the value conversion leaves at zero.
            let mut model_data = model_data;
            model_data.poll_count = poll_count;
            model_data.stable_for = stable_for;
            debug!(
                document_id = %document_id,
                chunks = model_data.chunks.len(),
                cid_urls = model_data.cid_urls.len(),
                chunk_payload_bytes = model_data.chunk_payload_bytes,
                poll_count,
                stable_for_ms = stable_for.as_millis(),
                elapsed_ms = started.elapsed().as_millis(),
                "captured quiesced Google Docs model chunks through CDP Runtime.evaluate"
            );
            return Ok(model_data);
        }
        last_stable_for = quiescence.stable_for(now);
        tokio::time::sleep(GDOCS_EDITOR_MODEL_POLL_INTERVAL).await;
    }

    Err(WebCaptureError::BrowserError(format!(
        "Timed out waiting for Google Docs DOCS_modelChunk stream to quiesce for document {document_id} after {} ms (last chunks={last_chunks}, payload_bytes={last_payload_bytes}, cid_urls={last_cid_urls}, poll_count={poll_count}, stable_for_ms={})",
        max_wait.as_millis(),
        last_stable_for.as_millis()
    )))
}
1918
1919fn launch_cdp_chrome(
1920 chrome: &std::path::Path,
1921 user_data_dir: &std::path::Path,
1922) -> crate::Result<Child> {
1923 let mut command = Command::new(chrome);
1924 command
1925 .args([
1926 "--headless=new",
1927 "--disable-gpu",
1928 "--disable-extensions",
1929 "--disable-dev-shm-usage",
1930 "--disable-background-networking",
1931 "--disable-component-update",
1932 "--disable-default-apps",
1933 "--disable-sync",
1934 "--metrics-recording-only",
1935 "--no-default-browser-check",
1936 "--no-first-run",
1937 "--no-sandbox",
1938 "--remote-debugging-port=0",
1939 "--window-size=1280,800",
1940 ])
1941 .arg(format!("--user-data-dir={}", user_data_dir.display()))
1942 .arg(format!("--user-agent={GDOCS_USER_AGENT}"))
1943 .stderr(Stdio::piped())
1944 .stdout(Stdio::null())
1945 .kill_on_drop(true);
1946
1947 command.spawn().map_err(|error| {
1948 WebCaptureError::BrowserError(format!("Failed to launch Chrome CDP browser: {error}"))
1949 })
1950}
1951
1952async fn wait_for_devtools_ws_url(child: &mut Child) -> crate::Result<String> {
1953 let stderr = child.stderr.take().ok_or_else(|| {
1954 WebCaptureError::BrowserError("Chrome CDP process did not expose stderr".to_string())
1955 })?;
1956 let mut lines = BufReader::new(stderr).lines();
1957 let started = Instant::now();
1958
1959 while started.elapsed() < GDOCS_BROWSER_LAUNCH_TIMEOUT {
1960 let line = tokio::time::timeout(Duration::from_millis(250), lines.next_line()).await;
1961 match line {
1962 Ok(Ok(Some(line))) => {
1963 if let Some((_, ws_url)) = line.split_once("DevTools listening on ") {
1964 return Ok(ws_url.trim().to_string());
1965 }
1966 }
1967 Ok(Ok(None)) => {
1968 break;
1969 }
1970 Ok(Err(error)) => {
1971 return Err(WebCaptureError::BrowserError(format!(
1972 "Failed to read Chrome CDP stderr: {error}"
1973 )));
1974 }
1975 Err(_) => {}
1976 }
1977 }
1978
1979 Err(WebCaptureError::BrowserError(format!(
1980 "Timed out waiting for Chrome DevTools websocket URL after {} ms",
1981 GDOCS_BROWSER_LAUNCH_TIMEOUT.as_millis()
1982 )))
1983}
1984
1985async fn cdp_send(
1986 ws: &mut CdpWebSocket,
1987 next_id: &mut u64,
1988 session_id: Option<&str>,
1989 method: &str,
1990 params: Value,
1991) -> crate::Result<Value> {
1992 *next_id += 1;
1993 let id = *next_id;
1994 let mut message = serde_json::json!({
1995 "id": id,
1996 "method": method,
1997 "params": params
1998 });
1999 if let Some(session_id) = session_id {
2000 message["sessionId"] = Value::String(session_id.to_string());
2001 }
2002
2003 ws.send(Message::Text(message.to_string()))
2004 .await
2005 .map_err(|error| {
2006 WebCaptureError::BrowserError(format!(
2007 "Failed to send Chrome DevTools command {method}: {error}"
2008 ))
2009 })?;
2010
2011 while let Some(message) = ws.next().await {
2012 let message = message.map_err(|error| {
2013 WebCaptureError::BrowserError(format!(
2014 "Failed to read Chrome DevTools response for {method}: {error}"
2015 ))
2016 })?;
2017 if !message.is_text() {
2018 continue;
2019 }
2020 let text = message.to_text().map_err(|error| {
2021 WebCaptureError::BrowserError(format!(
2022 "Chrome DevTools response for {method} was not text: {error}"
2023 ))
2024 })?;
2025 let value = serde_json::from_str::<Value>(text).map_err(|error| {
2026 WebCaptureError::ParseError(format!(
2027 "Failed to parse Chrome DevTools response for {method}: {error}; response={text}"
2028 ))
2029 })?;
2030 if value.get("id").and_then(Value::as_u64) != Some(id) {
2031 continue;
2032 }
2033 if let Some(error) = value.get("error") {
2034 return Err(WebCaptureError::BrowserError(format!(
2035 "Chrome DevTools command {method} failed: {error}"
2036 )));
2037 }
2038 return Ok(value.get("result").cloned().unwrap_or(Value::Null));
2039 }
2040
2041 Err(WebCaptureError::BrowserError(format!(
2042 "Chrome DevTools websocket closed before response for {method}"
2043 )))
2044}
2045
2046fn browser_model_data_from_value(value: &Value) -> BrowserModelData {
2047 let chunks = value
2048 .get("chunks")
2049 .and_then(Value::as_array)
2050 .cloned()
2051 .unwrap_or_default();
2052 let chunk_payload_bytes = model_chunk_payload_bytes(&chunks);
2053 let cid_urls = value
2054 .get("cidUrlMap")
2055 .and_then(Value::as_object)
2056 .map(|map| {
2057 map.iter()
2058 .filter_map(|(key, value)| value.as_str().map(|url| (key.clone(), url.to_string())))
2059 .collect::<HashMap<_, _>>()
2060 })
2061 .unwrap_or_default();
2062 BrowserModelData {
2063 chunks,
2064 cid_urls,
2065 chunk_payload_bytes,
2066 poll_count: 0,
2067 stable_for: Duration::ZERO,
2068 }
2069}
2070
2071fn model_chunk_payload_bytes(chunks: &[Value]) -> usize {
2072 chunks
2073 .iter()
2074 .map(|chunk| serde_json::to_vec(chunk).map_or(0, |encoded| encoded.len()))
2075 .sum()
2076}
2077
2078fn gdocs_editor_model_max_wait() -> Duration {
2079 duration_from_env_ms(
2080 "WEB_CAPTURE_GDOCS_MAX_WAIT_MS",
2081 GDOCS_EDITOR_MODEL_MAX_WAIT_DEFAULT,
2082 )
2083}
2084
2085fn gdocs_editor_model_stability_window() -> Duration {
2086 duration_from_env_ms(
2087 "WEB_CAPTURE_GDOCS_STABILITY_MS",
2088 GDOCS_EDITOR_MODEL_STABILITY_DEFAULT,
2089 )
2090}
2091
2092fn duration_from_env_ms(name: &str, default: Duration) -> Duration {
2093 std::env::var(name).map_or(default, |value| match value.trim().parse::<u64>() {
2094 Ok(ms) => Duration::from_millis(ms),
2095 Err(error) => {
2096 warn!(
2097 name,
2098 value,
2099 error = %error,
2100 default_ms = default.as_millis(),
2101 "ignoring invalid Google Docs model wait environment variable"
2102 );
2103 default
2104 }
2105 })
2106}
2107
2108fn remote_images_from_capture(capture: &CapturedDocument) -> Vec<RemoteImage> {
2109 capture
2110 .images
2111 .iter()
2112 .filter_map(|node| match node {
2113 ContentNode::Image {
2114 url: Some(url),
2115 alt,
2116 ..
2117 } => Some(RemoteImage {
2118 url: url.clone(),
2119 alt: alt.clone(),
2120 }),
2121 ContentNode::Image { .. } | ContentNode::Text { .. } => None,
2122 })
2123 .collect()
2124}
2125
2126#[must_use]
2128pub fn render_docs_api_document(document: &Value) -> GDocsRenderedOutput {
2129 let blocks = structural_elements_to_blocks(
2130 document
2131 .pointer("/body/content")
2132 .and_then(Value::as_array)
2133 .map_or(&[] as &[Value], Vec::as_slice),
2134 document.pointer("/inlineObjects").unwrap_or(&Value::Null),
2135 );
2136 GDocsRenderedOutput {
2137 markdown: render_blocks_markdown(&blocks),
2138 html: render_blocks_html(&blocks),
2139 text: blocks_to_text(&blocks),
2140 }
2141}
2142
/// The three renderings of a Docs API document, as produced by
/// `render_docs_api_document`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GDocsRenderedOutput {
    /// Markdown rendering of the document blocks.
    pub markdown: String,
    /// HTML rendering of the document blocks.
    pub html: String,
    /// Plain-text rendering of the document blocks.
    pub text: String,
}
2153
2154fn structural_elements_to_blocks(elements: &[Value], inline_objects: &Value) -> Vec<CapturedBlock> {
2155 let mut blocks = Vec::new();
2156 for element in elements {
2157 if let Some(paragraph) = element.get("paragraph") {
2158 let content = paragraph_to_content(paragraph, inline_objects);
2159 if !content_to_text(&content).trim().is_empty()
2160 || content
2161 .iter()
2162 .any(|node| matches!(node, ContentNode::Image { .. }))
2163 {
2164 blocks.push(CapturedBlock::Paragraph {
2165 style: paragraph
2166 .pointer("/paragraphStyle/namedStyleType")
2167 .and_then(Value::as_str)
2168 .map(ToString::to_string),
2169 list: None,
2170 quote: false,
2171 horizontal_rule: false,
2172 content,
2173 });
2174 }
2175 } else if let Some(table) = element.get("table") {
2176 blocks.push(CapturedBlock::Table(table_to_block(table, inline_objects)));
2177 }
2178 }
2179 blocks
2180}
2181
2182fn table_to_block(table: &Value, inline_objects: &Value) -> TableBlock {
2183 let rows = table
2184 .get("tableRows")
2185 .and_then(Value::as_array)
2186 .map_or(&[] as &[Value], Vec::as_slice)
2187 .iter()
2188 .map(|row| TableRow {
2189 cells: row
2190 .get("tableCells")
2191 .and_then(Value::as_array)
2192 .map_or(&[] as &[Value], Vec::as_slice)
2193 .iter()
2194 .map(|cell| TableCell {
2195 content: structural_elements_to_inline_content(
2196 cell.get("content")
2197 .and_then(Value::as_array)
2198 .map_or(&[] as &[Value], Vec::as_slice),
2199 inline_objects,
2200 ),
2201 })
2202 .collect(),
2203 })
2204 .collect();
2205 TableBlock { rows }
2206}
2207
2208fn structural_elements_to_inline_content(
2209 elements: &[Value],
2210 inline_objects: &Value,
2211) -> Vec<ContentNode> {
2212 let mut content = Vec::new();
2213 for element in elements {
2214 if let Some(paragraph) = element.get("paragraph") {
2215 let paragraph_content = paragraph_to_content(paragraph, inline_objects);
2216 if !content.is_empty() && !paragraph_content.is_empty() {
2217 append_text(&mut content, "\n");
2218 }
2219 content.extend(paragraph_content);
2220 } else if let Some(table) = element.get("table") {
2221 append_text(
2222 &mut content,
2223 &render_blocks_markdown(&[CapturedBlock::Table(table_to_block(
2224 table,
2225 inline_objects,
2226 ))]),
2227 );
2228 }
2229 }
2230 content
2231}
2232
2233fn paragraph_to_content(paragraph: &Value, inline_objects: &Value) -> Vec<ContentNode> {
2234 let mut content = Vec::new();
2235 for element in paragraph
2236 .get("elements")
2237 .and_then(Value::as_array)
2238 .map_or(&[] as &[Value], Vec::as_slice)
2239 {
2240 if let Some(text) = element
2241 .pointer("/textRun/content")
2242 .and_then(Value::as_str)
2243 .map(|text| text.strip_suffix('\n').unwrap_or(text))
2244 {
2245 append_text(&mut content, text);
2246 } else if let Some(inline_id) = element
2247 .pointer("/inlineObjectElement/inlineObjectId")
2248 .and_then(Value::as_str)
2249 {
2250 if let Some(image) = inline_object_to_image(inline_id, inline_objects) {
2251 content.push(image);
2252 }
2253 }
2254 }
2255 content
2256}
2257
2258fn inline_object_to_image(inline_id: &str, inline_objects: &Value) -> Option<ContentNode> {
2259 let embedded = inline_objects
2260 .get(inline_id)?
2261 .pointer("/inlineObjectProperties/embeddedObject")?;
2262 let url = embedded
2263 .pointer("/imageProperties/contentUri")
2264 .or_else(|| embedded.pointer("/imageProperties/sourceUri"))
2265 .and_then(Value::as_str)?;
2266 let alt = embedded
2267 .get("title")
2268 .or_else(|| embedded.get("description"))
2269 .and_then(Value::as_str)
2270 .unwrap_or("image");
2271 Some(ContentNode::Image {
2272 cid: None,
2273 url: Some(url.to_string()),
2274 alt: alt.to_string(),
2275 width: json_dimension_to_string(embedded.pointer("/size/width/magnitude")),
2276 height: json_dimension_to_string(embedded.pointer("/size/height/magnitude")),
2277 is_suggestion: false,
2278 })
2279}
2280
2281fn json_dimension_to_string(value: Option<&Value>) -> Option<String> {
2282 match value? {
2283 Value::Number(number) => Some(number.to_string()),
2284 Value::String(text) if !text.is_empty() => Some(text.clone()),
2285 _ => None,
2286 }
2287}
2288
/// Builds the style lookup tables for a captured editor model.
///
/// Only items whose `ty` is `"as"` are considered. Each must carry `si`,
/// `ei` and a style type `st`; items missing any of them are skipped. `si`
/// and `ei` are translated through `utf16_position_to_char_position`, and the
/// item is skipped when either translates to 0.
///
/// Handling per style type:
/// * `text` and `link` patch `inline_styles` over the translated range.
/// * `paragraph`, `list` and `horizontal_rule` are recorded under the
///   translated end position.
/// * Any other style type is ignored.
///
/// `inline_styles` is pre-filled with `text_len` default styles.
fn build_model_style_maps(
    items: &[Value],
    text_len: usize,
    utf16_position_map: &[usize],
) -> ModelStyleMaps {
    let mut maps = ModelStyleMaps {
        inline_styles: vec![TextStyle::default(); text_len],
        ..ModelStyleMaps::default()
    };

    for item in items {
        if item.get("ty").and_then(Value::as_str) != Some("as") {
            continue;
        }
        let (Some(start), Some(end), Some(style_type)) = (
            item.get("si").and_then(Value::as_u64),
            item.get("ei").and_then(Value::as_u64),
            item.get("st").and_then(Value::as_str),
        ) else {
            continue;
        };
        let (Ok(start), Ok(end)) = (usize::try_from(start), usize::try_from(end)) else {
            continue;
        };

        // NOTE(review): the map name suggests UTF-16 offsets are converted to
        // character positions here — confirm against the helper.
        let start = utf16_position_to_char_position(utf16_position_map, start);
        let end = utf16_position_to_char_position(utf16_position_map, end);
        // NOTE(review): 0 is presumably the helper's "no position" result — confirm.
        if start == 0 || end == 0 {
            continue;
        }

        match style_type {
            "text" => {
                let style = text_style(item);
                apply_inline_style(&mut maps.inline_styles, start, end, &style);
            }
            "link" => {
                // A link patch carries only the URL; the other fields stay default.
                let style = TextStyle {
                    link: item
                        .pointer("/sm/lnks_link/ulnk_url")
                        .and_then(Value::as_str)
                        .map(ToString::to_string),
                    ..TextStyle::default()
                };
                apply_inline_style(&mut maps.inline_styles, start, end, &style);
            }
            "paragraph" => {
                maps.paragraph_by_end
                    .insert(end, paragraph_style_from_model(item));
            }
            "list" => {
                maps.list_by_end.insert(
                    end,
                    ListMeta {
                        id: item
                            .pointer("/sm/ls_id")
                            .and_then(Value::as_str)
                            .unwrap_or("")
                            .to_string(),
                        level: item
                            .pointer("/sm/ls_nest")
                            .and_then(Value::as_u64)
                            .and_then(|value| usize::try_from(value).ok())
                            .unwrap_or(0),
                        // Always recorded as unordered here.
                        ordered: false,
                    },
                );
            }
            "horizontal_rule" => {
                maps.horizontal_rules.insert(end);
            }
            _ => {}
        }
    }

    maps
}
2366
2367fn apply_inline_style(styles: &mut [TextStyle], start: usize, end: usize, patch: &TextStyle) {
2368 let from = start.saturating_sub(1);
2369 let to = end.min(styles.len());
2370 if from >= to {
2371 return;
2372 }
2373 for style in &mut styles[from..to] {
2374 if patch.bold {
2375 style.bold = true;
2376 }
2377 if patch.italic {
2378 style.italic = true;
2379 }
2380 if patch.strike {
2381 style.strike = true;
2382 }
2383 if patch.link.is_some() {
2384 style.link.clone_from(&patch.link);
2385 }
2386 }
2387}
2388
2389fn text_style(item: &Value) -> TextStyle {
2390 TextStyle {
2391 bold: item.pointer("/sm/ts_bd").and_then(Value::as_bool) == Some(true)
2392 && item.pointer("/sm/ts_bd_i").and_then(Value::as_bool) != Some(true),
2393 italic: item.pointer("/sm/ts_it").and_then(Value::as_bool) == Some(true)
2394 && item.pointer("/sm/ts_it_i").and_then(Value::as_bool) != Some(true),
2395 strike: item.pointer("/sm/ts_st").and_then(Value::as_bool) == Some(true)
2396 && item.pointer("/sm/ts_st_i").and_then(Value::as_bool) != Some(true),
2397 link: None,
2398 }
2399}
2400
2401fn paragraph_style_from_model(item: &Value) -> ParagraphStyle {
2402 let heading = item.pointer("/sm/ps_hd").and_then(Value::as_u64);
2403 ParagraphStyle {
2404 style: heading.map(|level| format!("HEADING_{level}")),
2405 indent_start: item
2406 .pointer("/sm/ps_il")
2407 .and_then(Value::as_f64)
2408 .unwrap_or(0.0),
2409 indent_first_line: item
2410 .pointer("/sm/ps_ifl")
2411 .and_then(Value::as_f64)
2412 .unwrap_or(0.0),
2413 }
2414}
2415
/// Builds a table that maps 1-based UTF-16 code unit positions to 1-based
/// character positions of `text`.
///
/// Index `0` always holds `0`. A character that needs two UTF-16 code units
/// occupies two consecutive entries carrying the same character position.
fn build_utf16_position_map(text: &str) -> Vec<usize> {
    let unit_count = text.encode_utf16().count();
    let mut positions = Vec::with_capacity(unit_count + 1);
    positions.push(0);
    for (index, character) in text.chars().enumerate() {
        let char_position = index + 1;
        positions.extend(std::iter::repeat(char_position).take(character.len_utf16()));
    }
    positions
}
2430
/// Looks up the character position recorded for a UTF-16 `position`.
///
/// When the position is out of range or maps to `0`, the last non-zero entry
/// of `map` is returned instead; `0` is returned only when the map holds no
/// non-zero entry at all.
fn utf16_position_to_char_position(map: &[usize], position: usize) -> usize {
    match map.get(position) {
        Some(&mapped) if mapped > 0 => mapped,
        _ => map
            .iter()
            .rev()
            .copied()
            .find(|&candidate| candidate > 0)
            .unwrap_or(0),
    }
}
2438
2439#[must_use]
2441pub fn parse_model_chunks<S: BuildHasher>(
2442 chunks: &[Value],
2443 cid_urls: &HashMap<String, String, S>,
2444) -> CapturedDocument {
2445 parse_model_chunks_with_export_html(chunks, cid_urls, None)
2446}
2447
/// Parses captured model chunks into a `CapturedDocument`.
///
/// The document text is the concatenation of the `s` strings of all `is`/`iss`
/// items. Style items are resolved through `build_model_style_maps`, images are
/// taken from `ae`/`ase` items and placed at the positions announced by
/// `te`/`ste` items, and the text is then walked character by character to
/// build paragraphs and tables.
///
/// * `chunks` - raw model chunks; flattened with `collect_model_items`.
/// * `cid_urls` - lookup from image content id (`i_cid`) to a URL.
/// * `export_html` - when present, the resulting blocks are refined with
///   `apply_export_semantic_hints` and the plain text is recomputed.
#[must_use]
#[allow(clippy::too_many_lines)]
pub fn parse_model_chunks_with_export_html<S: BuildHasher>(
    chunks: &[Value],
    cid_urls: &HashMap<String, String, S>,
    export_html: Option<&str>,
) -> CapturedDocument {
    let items = collect_model_items(chunks);
    // Concatenate every inserted string ("is" / "iss" items) into the full text.
    let full_text = items
        .iter()
        .filter(|item| matches!(item.get("ty").and_then(Value::as_str), Some("is" | "iss")))
        .filter_map(|item| item.get("s").and_then(Value::as_str))
        .collect::<String>();
    let chars: Vec<char> = full_text.chars().collect();
    // Model positions are translated from UTF-16 units to character positions.
    let utf16_position_map = build_utf16_position_map(&full_text);
    let style_maps = build_model_style_maps(&items, chars.len(), &utf16_position_map);

    // Entity id -> 0-based character index, taken from "te" / "ste" items.
    let mut positions = HashMap::new();
    for item in &items {
        if matches!(item.get("ty").and_then(Value::as_str), Some("te" | "ste")) {
            if let (Some(id), Some(pos)) = (
                item.get("id").and_then(Value::as_str),
                item.get("spi").and_then(Value::as_u64),
            ) {
                if let Ok(pos) = usize::try_from(pos) {
                    positions.insert(
                        id.to_string(),
                        utf16_position_to_char_position(&utf16_position_map, pos).saturating_sub(1),
                    );
                }
            }
        }
    }

    // Build image nodes from "ae" / "ase" items that have a known position.
    let mut images_by_pos: HashMap<usize, ContentNode> = HashMap::new();
    let mut images = Vec::new();
    for item in &items {
        let ty = item.get("ty").and_then(Value::as_str);
        if !matches!(ty, Some("ae" | "ase")) {
            continue;
        }
        let Some(id) = item.get("id").and_then(Value::as_str) else {
            continue;
        };
        let Some(pos) = positions.get(id).copied() else {
            continue;
        };
        let cid = item
            .pointer("/epm/ee_eo/i_cid")
            .and_then(Value::as_str)
            .map(ToString::to_string);
        let node = ContentNode::Image {
            url: cid.as_ref().and_then(|cid| cid_urls.get(cid).cloned()),
            cid,
            // Fall back to a generic alt text when the item carries none.
            alt: item
                .pointer("/epm/ee_eo/eo_ad")
                .and_then(Value::as_str)
                .unwrap_or_else(|| {
                    if ty == Some("ase") {
                        "suggested image"
                    } else {
                        "image"
                    }
                })
                .to_string(),
            width: json_dimension_to_string(item.pointer("/epm/ee_eo/i_wth")),
            height: json_dimension_to_string(item.pointer("/epm/ee_eo/i_ht")),
            is_suggestion: ty == Some("ase"),
        };
        images_by_pos.insert(pos, node.clone());
        images.push(node);
    }

    let mut blocks = Vec::new();
    let mut tables = Vec::new();
    let mut paragraph = Vec::new();
    let mut table: Option<TableBlock> = None;
    let mut row: Option<TableRow> = None;
    let mut cell: Option<TableCell> = None;
    // Last table-related control character seen (reset by regular content).
    let mut previous_table_control: Option<u32> = None;
    // Set when the newline directly after a cell separator must be ignored.
    let mut skip_next_table_newline = false;

    for (idx, ch) in chars.iter().copied().enumerate() {
        match ch as u32 {
            // 0x10: a table starts; the pending paragraph is closed first.
            0x10 => {
                flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
                table = Some(TableBlock::default());
                previous_table_control = Some(0x10);
                skip_next_table_newline = false;
            }
            // 0x11: the current table ends.
            0x11 => {
                flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
                previous_table_control = None;
                skip_next_table_newline = false;
            }
            // 0x12: a new row starts; the previous row is committed.
            0x12 => {
                flush_row(&mut row, &mut cell, table.as_mut(), true);
                row = Some(TableRow::default());
                previous_table_control = Some(0x12);
                skip_next_table_newline = false;
            }
            // 0x1c: cell separator.
            0x1c => {
                // A separator right after a newline with no cell content does
                // not open another cell.
                if cell.as_ref().is_none_or(cell_is_empty) && previous_table_control == Some(0x0a) {
                    previous_table_control = Some(0x1c);
                    continue;
                }
                let had_content = cell.as_ref().is_some_and(|cell| !cell_is_empty(cell));
                flush_cell(&mut row, &mut cell, false);
                if row.is_none() {
                    row = Some(TableRow::default());
                }
                cell = Some(TableCell::default());
                // A newline directly following a non-empty cell is swallowed
                // instead of opening yet another cell.
                if had_content && chars.get(idx + 1).is_some_and(|ch| *ch as u32 == 0x0a) {
                    skip_next_table_newline = true;
                }
                previous_table_control = Some(0x1c);
            }
            // 0x0a: newline; ends a cell inside a table, a paragraph outside.
            0x0a => {
                if table.is_some() {
                    if skip_next_table_newline {
                        skip_next_table_newline = false;
                        previous_table_control = Some(0x0a);
                        continue;
                    }
                    flush_cell(&mut row, &mut cell, false);
                    if row.is_none() {
                        row = Some(TableRow::default());
                    }
                    cell = Some(TableCell::default());
                    previous_table_control = Some(0x0a);
                } else {
                    flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
                }
            }
            // 0x0b: kept as a literal "\n" inside the current paragraph or cell.
            0x0b => {
                append_to_current(
                    &mut paragraph,
                    &mut row,
                    &mut cell,
                    table.is_some(),
                    "\n",
                    TextStyle::default(),
                );
                previous_table_control = None;
                skip_next_table_newline = false;
            }
            _ => {
                if let Some(image) = images_by_pos.get(&idx).cloned() {
                    push_to_current(&mut paragraph, &mut row, &mut cell, table.is_some(), image);
                    previous_table_control = None;
                    skip_next_table_newline = false;
                    // A '*' at an image position is dropped rather than
                    // emitted as text.
                    if ch == '*' {
                        continue;
                    }
                }
                append_to_current(
                    &mut paragraph,
                    &mut row,
                    &mut cell,
                    table.is_some(),
                    &ch.to_string(),
                    style_maps
                        .inline_styles
                        .get(idx)
                        .cloned()
                        .unwrap_or_default(),
                );
                previous_table_control = None;
                skip_next_table_newline = false;
            }
        }
    }

    // Close whatever is still open at the end of the text.
    if table.is_some() {
        flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
    }
    flush_paragraph(&mut paragraph, &mut blocks, Some(chars.len()), &style_maps);

    let mut capture = CapturedDocument {
        text: blocks_to_text(&blocks),
        blocks,
        tables,
        images,
    };
    // Optional refinement of list/quote semantics from the exported HTML.
    if let Some(export_html) = export_html {
        apply_export_semantic_hints(&mut capture.blocks, export_html);
        capture.text = blocks_to_text(&capture.blocks);
    }
    capture
}
2642
2643fn collect_model_items(chunks: &[Value]) -> Vec<Value> {
2644 let mut items = Vec::new();
2645 for chunk in chunks {
2646 if let Some(array) = chunk.as_array() {
2647 items.extend(array.iter().cloned());
2648 } else if let Some(array) = chunk.get("chunk").and_then(Value::as_array) {
2649 items.extend(array.iter().cloned());
2650 } else if chunk.get("ty").and_then(Value::as_str).is_some() {
2651 items.push(chunk.clone());
2652 }
2653 }
2654 items
2655}
2656
2657fn flush_paragraph(
2658 paragraph: &mut Vec<ContentNode>,
2659 blocks: &mut Vec<CapturedBlock>,
2660 end_pos: Option<usize>,
2661 style_maps: &ModelStyleMaps,
2662) {
2663 if !content_to_text(paragraph).trim().is_empty()
2664 || paragraph
2665 .iter()
2666 .any(|node| matches!(node, ContentNode::Image { .. }))
2667 {
2668 let meta =
2669 paragraph_meta_for_end_position(style_maps, end_pos, content_to_text(paragraph).trim());
2670 blocks.push(CapturedBlock::Paragraph {
2671 content: std::mem::take(paragraph),
2672 style: meta.style,
2673 list: meta.list,
2674 quote: meta.quote,
2675 horizontal_rule: meta.horizontal_rule,
2676 });
2677 } else {
2678 paragraph.clear();
2679 }
2680}
2681
2682fn paragraph_meta_for_end_position(
2683 style_maps: &ModelStyleMaps,
2684 end_pos: Option<usize>,
2685 text: &str,
2686) -> ParagraphMeta {
2687 let Some(end_pos) = end_pos else {
2688 return ParagraphMeta::default();
2689 };
2690 let paragraph_style = style_maps.paragraph_by_end.get(&end_pos);
2691 let mut meta = ParagraphMeta {
2692 style: paragraph_style.and_then(|style| style.style.clone()),
2693 ..ParagraphMeta::default()
2694 };
2695
2696 if let Some(list) = style_maps.list_by_end.get(&end_pos) {
2697 let mut list = list.clone();
2698 list.ordered = infer_ordered_list(&list, text);
2699 meta.list = Some(list);
2700 } else if paragraph_style.is_some_and(|style| {
2701 style.indent_start > 0.0
2702 && (style.indent_start - style.indent_first_line).abs() < f64::EPSILON
2703 }) {
2704 meta.quote = true;
2705 }
2706
2707 meta.horizontal_rule = (style_maps.horizontal_rules.contains(&end_pos)
2708 || end_pos
2709 .checked_sub(1)
2710 .is_some_and(|pos| style_maps.horizontal_rules.contains(&pos)))
2711 && text.trim().chars().all(|ch| ch == '-');
2712 meta
2713}
2714
/// Decides whether a list paragraph is ordered based on the model alone.
///
/// Currently always returns `false`: neither the list metadata nor the
/// paragraph text is inspected.
const fn infer_ordered_list(_list: &ListMeta, _text: &str) -> bool {
    false
}
2718
2719fn apply_export_semantic_hints(blocks: &mut [CapturedBlock], export_html: &str) {
2720 let hints = extract_export_semantic_hints(export_html);
2721 let mut cursor = 0usize;
2722 for block in blocks {
2723 let CapturedBlock::Paragraph {
2724 content,
2725 list,
2726 quote,
2727 ..
2728 } = block
2729 else {
2730 continue;
2731 };
2732 let text = normalize_semantic_text(&content_to_text(content));
2733 if text.is_empty() {
2734 continue;
2735 }
2736 let Some((index, hint)) = find_next_semantic_hint(&hints, &text, cursor, list.is_some())
2737 else {
2738 continue;
2739 };
2740 cursor = index + 1;
2741 if let Some(list) = list.as_mut() {
2742 if let Some(ordered) = hint.list_ordered {
2743 list.ordered = ordered;
2744 }
2745 } else {
2746 *quote = hint.quote;
2747 }
2748 }
2749}
2750
2751fn find_next_semantic_hint<'a>(
2752 hints: &'a [ExportSemanticHint],
2753 text: &str,
2754 cursor: usize,
2755 needs_list_hint: bool,
2756) -> Option<(usize, &'a ExportSemanticHint)> {
2757 hints.iter().enumerate().skip(cursor).find(|(_, hint)| {
2758 hint.text == text
2759 && if needs_list_hint {
2760 hint.list_ordered.is_some()
2761 } else {
2762 hint.list_ordered.is_none()
2763 }
2764 })
2765}
2766
2767fn extract_export_semantic_hints(export_html: &str) -> Vec<ExportSemanticHint> {
2768 let preprocessed = preprocess_google_docs_export_html(export_html).html;
2769 let document = Html::parse_document(&preprocessed);
2770 let selector =
2771 Selector::parse("body h1,body h2,body h3,body h4,body h5,body h6,body p,body li")
2772 .expect("valid semantic hint selector");
2773 document
2774 .select(&selector)
2775 .filter_map(|element| {
2776 let tag = element.value().name();
2777 let text = export_element_semantic_text(&element);
2778 if text.is_empty() {
2779 return None;
2780 }
2781 let list_ordered = if tag == "li" {
2782 nearest_list_is_ordered(&element)
2783 } else {
2784 None
2785 };
2786 Some(ExportSemanticHint {
2787 text,
2788 list_ordered,
2789 quote: tag != "li" && has_ancestor_tag(&element, "blockquote"),
2790 })
2791 })
2792 .collect()
2793}
2794
2795fn export_element_semantic_text(element: &ElementRef<'_>) -> String {
2796 let raw_text = if element.value().name() == "li" {
2797 list_item_own_text(element)
2798 } else {
2799 element.text().collect()
2800 };
2801 normalize_semantic_text(&raw_text)
2802}
2803
2804fn list_item_own_text(element: &ElementRef<'_>) -> String {
2805 let mut text = String::new();
2806 let mut stack: Vec<_> = element.children().collect();
2807 stack.reverse();
2808
2809 while let Some(node) = stack.pop() {
2810 match node.value() {
2811 Node::Text(value) => text.push_str(value),
2812 Node::Element(child) if matches!(child.name(), "ol" | "ul") => {}
2813 Node::Element(_) => {
2814 let mut children: Vec<_> = node.children().collect();
2815 children.reverse();
2816 stack.extend(children);
2817 }
2818 _ => {}
2819 }
2820 }
2821
2822 text
2823}
2824
2825fn nearest_list_is_ordered(element: &ElementRef<'_>) -> Option<bool> {
2826 element
2827 .ancestors()
2828 .filter_map(ElementRef::wrap)
2829 .find_map(|ancestor| match ancestor.value().name() {
2830 "ol" => Some(true),
2831 "ul" => Some(false),
2832 _ => None,
2833 })
2834}
2835
2836fn has_ancestor_tag(element: &ElementRef<'_>, tag: &str) -> bool {
2837 element
2838 .ancestors()
2839 .filter_map(ElementRef::wrap)
2840 .any(|ancestor| ancestor.value().name() == tag)
2841}
2842
/// Normalizes text for comparison: non-breaking spaces count as whitespace,
/// runs of whitespace collapse to a single space, and leading/trailing
/// whitespace is removed.
fn normalize_semantic_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    let words = text
        .split(|ch: char| ch == '\u{a0}' || ch.is_whitespace())
        .filter(|word| !word.is_empty());
    for word in words {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
2849
2850fn cell_is_empty(cell: &TableCell) -> bool {
2851 cell.content.iter().all(|node| match node {
2852 ContentNode::Text { text, .. } => text.trim().is_empty(),
2853 ContentNode::Image { .. } => false,
2854 })
2855}
2856
2857fn row_is_empty(row: &TableRow) -> bool {
2858 row.cells.is_empty() || row.cells.iter().all(cell_is_empty)
2859}
2860
2861fn flush_cell(row: &mut Option<TableRow>, cell: &mut Option<TableCell>, drop_empty: bool) {
2862 if let (Some(row), Some(cell)) = (row.as_mut(), cell.take()) {
2863 if drop_empty && cell_is_empty(&cell) {
2864 return;
2865 }
2866 row.cells.push(cell);
2867 }
2868}
2869
2870fn flush_row(
2871 row: &mut Option<TableRow>,
2872 cell: &mut Option<TableCell>,
2873 table: Option<&mut TableBlock>,
2874 drop_empty_trailing_cell: bool,
2875) {
2876 flush_cell(row, cell, drop_empty_trailing_cell);
2877 if let (Some(table), Some(row)) = (table, row.take()) {
2878 table.rows.push(row);
2879 }
2880}
2881
2882fn flush_table(
2883 table: &mut Option<TableBlock>,
2884 row: &mut Option<TableRow>,
2885 cell: &mut Option<TableCell>,
2886 tables: &mut Vec<TableBlock>,
2887 blocks: &mut Vec<CapturedBlock>,
2888) {
2889 flush_row(row, cell, table.as_mut(), true);
2890 if let Some(mut table) = table.take() {
2891 while table.rows.last().is_some_and(row_is_empty) {
2894 table.rows.pop();
2895 }
2896 tables.push(table.clone());
2897 blocks.push(CapturedBlock::Table(table));
2898 }
2899}
2900
2901fn push_to_current(
2902 paragraph: &mut Vec<ContentNode>,
2903 row: &mut Option<TableRow>,
2904 cell: &mut Option<TableCell>,
2905 in_table: bool,
2906 node: ContentNode,
2907) {
2908 if in_table {
2909 if row.is_none() {
2910 *row = Some(TableRow::default());
2911 }
2912 if cell.is_none() {
2913 *cell = Some(TableCell::default());
2914 }
2915 if let Some(cell) = cell.as_mut() {
2916 cell.content.push(node);
2917 }
2918 } else {
2919 paragraph.push(node);
2920 }
2921}
2922
2923fn append_to_current(
2924 paragraph: &mut Vec<ContentNode>,
2925 row: &mut Option<TableRow>,
2926 cell: &mut Option<TableCell>,
2927 in_table: bool,
2928 text: &str,
2929 style: TextStyle,
2930) {
2931 if in_table {
2932 if row.is_none() {
2933 *row = Some(TableRow::default());
2934 }
2935 if cell.is_none() {
2936 *cell = Some(TableCell::default());
2937 }
2938 if let Some(cell) = cell.as_mut() {
2939 append_styled_text(&mut cell.content, text, style);
2940 }
2941 } else {
2942 append_styled_text(paragraph, text, style);
2943 }
2944}
2945
2946fn append_text(content: &mut Vec<ContentNode>, text: &str) {
2947 append_styled_text(content, text, TextStyle::default());
2948}
2949
2950fn append_styled_text(content: &mut Vec<ContentNode>, text: &str, style: TextStyle) {
2951 if text.is_empty() {
2952 return;
2953 }
2954 if let Some(ContentNode::Text {
2955 text: last,
2956 bold,
2957 italic,
2958 strike,
2959 link,
2960 }) = content.last_mut()
2961 {
2962 let last_style = TextStyle {
2963 bold: *bold,
2964 italic: *italic,
2965 strike: *strike,
2966 link: link.clone(),
2967 };
2968 if last_style == style {
2969 last.push_str(text);
2970 return;
2971 }
2972 }
2973 content.push(ContentNode::Text {
2974 text: text.to_string(),
2975 bold: style.bold,
2976 italic: style.italic,
2977 strike: style.strike,
2978 link: style.link,
2979 });
2980}
2981
2982#[must_use]
2984pub fn render_captured_document(capture: &CapturedDocument, format: &str) -> String {
2985 match format.to_lowercase().as_str() {
2986 "html" => render_blocks_html(&capture.blocks),
2987 "txt" | "text" => blocks_to_text(&capture.blocks),
2988 _ => render_blocks_markdown(&capture.blocks),
2989 }
2990}
2991
/// A block already rendered to Markdown, together with the facts needed to
/// choose the separator between it and its neighbour.
struct RenderedBlock {
    /// Markdown text of the block.
    markdown: String,
    /// Id of the list the block belongs to, if it is a list item.
    list_id: Option<String>,
    /// Whether the block was flagged as a quote.
    quote: bool,
}
2999
3000fn render_blocks_markdown(blocks: &[CapturedBlock]) -> String {
3001 let mut counters: HashMap<(String, usize), usize> = HashMap::new();
3006 let mut rendered: Vec<RenderedBlock> = Vec::new();
3007
3008 for block in blocks {
3009 match block {
3010 CapturedBlock::Paragraph {
3011 content,
3012 style,
3013 list,
3014 quote,
3015 horizontal_rule,
3016 } => {
3017 let text = render_content_markdown(content).trim().to_string();
3018 if text.is_empty() {
3019 continue;
3020 }
3021 let ordered_index = list.as_ref().and_then(|list_meta| {
3022 if !list_meta.ordered {
3023 return None;
3024 }
3025 let key = (list_meta.id.clone(), list_meta.level);
3029 counters.retain(|(id, level), _| {
3030 !(id == &list_meta.id && *level > list_meta.level)
3031 });
3032 let next = counters.entry(key).or_insert(0);
3033 *next += 1;
3034 Some(*next)
3035 });
3036 let markdown = render_paragraph_markdown(
3037 &text,
3038 style.as_deref(),
3039 list.as_ref(),
3040 *quote,
3041 *horizontal_rule,
3042 ordered_index,
3043 );
3044 rendered.push(RenderedBlock {
3045 markdown,
3046 list_id: list.as_ref().map(|l| l.id.clone()),
3047 quote: *quote,
3048 });
3049 }
3050 CapturedBlock::Table(table) => {
3051 rendered.push(RenderedBlock {
3052 markdown: render_table_markdown(table),
3053 list_id: None,
3054 quote: false,
3055 });
3056 }
3057 }
3058 }
3059
3060 let mut out = String::new();
3064 for (idx, block) in rendered.iter().enumerate() {
3065 if idx == 0 {
3066 out.push_str(&block.markdown);
3067 continue;
3068 }
3069 let prev = &rendered[idx - 1];
3070 if block.list_id.is_some() && prev.list_id.is_some() {
3071 out.push('\n');
3072 } else if block.quote && prev.quote {
3073 out.push_str("\n>\n");
3074 } else {
3075 out.push_str("\n\n");
3076 }
3077 out.push_str(&block.markdown);
3078 }
3079 if !out.is_empty() && !out.ends_with('\n') {
3080 out.push('\n');
3081 }
3082 out
3083}
3084
3085fn render_paragraph_markdown(
3086 text: &str,
3087 style: Option<&str>,
3088 list: Option<&ListMeta>,
3089 quote: bool,
3090 horizontal_rule: bool,
3091 ordered_index: Option<usize>,
3092) -> String {
3093 if horizontal_rule {
3094 return "---".to_string();
3095 }
3096 match style {
3097 Some("TITLE") => format!("# {text}"),
3098 Some("SUBTITLE") => format!("## {text}"),
3099 Some(style) if style.starts_with("HEADING_") => {
3100 let level = style
3101 .trim_start_matches("HEADING_")
3102 .parse::<usize>()
3103 .unwrap_or(1);
3104 format!("{} {text}", "#".repeat(level.clamp(1, 6)))
3105 }
3106 _ => list.map_or_else(
3107 || {
3108 if quote {
3109 text.lines()
3110 .map(|line| {
3111 if line.is_empty() {
3112 ">".to_string()
3113 } else {
3114 format!("> {line}")
3115 }
3116 })
3117 .collect::<Vec<_>>()
3118 .join("\n")
3119 } else {
3120 text.to_string()
3121 }
3122 },
3123 |list| {
3124 let indent = " ".repeat(list.level);
3125 let marker = if list.ordered {
3126 format!("{}.", ordered_index.unwrap_or(1))
3127 } else {
3128 "-".to_string()
3129 };
3130 format!("{indent}{marker} {text}")
3131 },
3132 ),
3133 }
3134}
3135
3136fn render_table_markdown(table: &TableBlock) -> String {
3137 if table.rows.is_empty() {
3138 return String::new();
3139 }
3140 let width = table
3141 .rows
3142 .iter()
3143 .map(|row| row.cells.len())
3144 .max()
3145 .unwrap_or(1);
3146 let rows = table
3147 .rows
3148 .iter()
3149 .map(|row| {
3150 (0..width)
3151 .map(|idx| {
3152 row.cells.get(idx).map_or_else(String::new, |cell| {
3153 escape_markdown_table_cell(&render_content_markdown(&cell.content))
3154 })
3155 })
3156 .collect::<Vec<_>>()
3157 })
3158 .collect::<Vec<_>>();
3159 let separator = vec!["---".to_string(); width];
3160 std::iter::once(&rows[0])
3161 .chain(std::iter::once(&separator))
3162 .chain(rows.iter().skip(1))
3163 .map(|row| format!("| {} |", row.join(" | ")))
3164 .collect::<Vec<_>>()
3165 .join("\n")
3166}
3167
3168fn render_content_markdown(content: &[ContentNode]) -> String {
3169 let mut rendered = String::new();
3170 let mut idx = 0usize;
3171 while idx < content.len() {
3172 match &content[idx] {
3173 ContentNode::Text {
3174 text,
3175 bold,
3176 italic,
3177 strike,
3178 link,
3179 } => {
3180 let link_target = link.as_deref();
3181 let mut runs = vec![(text.as_str(), *bold, *italic, *strike)];
3182 idx += 1;
3183 while let Some(ContentNode::Text {
3184 text,
3185 bold,
3186 italic,
3187 strike,
3188 link: next_link,
3189 }) = content.get(idx)
3190 {
3191 if next_link.as_deref() != link_target {
3192 break;
3193 }
3194 runs.push((text.as_str(), *bold, *italic, *strike));
3195 idx += 1;
3196 }
3197 let label = render_text_runs_markdown(&runs);
3198 if let Some(link_target) = link_target {
3199 let _ = write!(rendered, "[{label}]({link_target})");
3200 } else {
3201 rendered.push_str(&label);
3202 }
3203 }
3204 ContentNode::Image {
3205 url: Some(url),
3206 alt,
3207 ..
3208 } => {
3209 let _ = write!(rendered, "");
3210 idx += 1;
3211 }
3212 ContentNode::Image { .. } => idx += 1,
3213 }
3214 }
3215 rendered
3216}
3217
/// The set of Markdown emphasis markers that are open at a point in the
/// rendered output. The default value has every marker closed.
#[derive(Clone, Copy, Default)]
struct MarkdownMarkerState {
    /// `**` is open.
    bold: bool,
    /// `*` is open.
    italic: bool,
    /// `~~` is open.
    strike: bool,
}
3224
3225fn render_text_runs_markdown(runs: &[(&str, bool, bool, bool)]) -> String {
3226 let inactive = MarkdownMarkerState::default();
3227 let mut active = inactive;
3228 let mut output = String::new();
3229 for (text, bold, italic, strike) in runs {
3230 let next = MarkdownMarkerState {
3231 bold: *bold,
3232 italic: *italic,
3233 strike: *strike,
3234 };
3235 let mut start = 0usize;
3236 for (offset, ch) in text.char_indices() {
3237 if ch != '\n' {
3238 continue;
3239 }
3240 if offset > start {
3241 output.push_str(&markdown_marker_transition(active, next));
3242 output.push_str(&text[start..offset]);
3243 active = next;
3244 }
3245 output.push_str(&markdown_marker_transition(active, inactive));
3246 output.push('\n');
3247 active = inactive;
3248 start = offset + ch.len_utf8();
3249 }
3250 if start < text.len() {
3251 output.push_str(&markdown_marker_transition(active, next));
3252 output.push_str(&text[start..]);
3253 active = next;
3254 }
3255 }
3256 output.push_str(&markdown_marker_transition(active, inactive));
3257 output
3258}
3259
3260fn markdown_marker_transition(active: MarkdownMarkerState, next: MarkdownMarkerState) -> String {
3261 let mut markers = String::new();
3262 if active.strike && !next.strike {
3263 markers.push_str("~~");
3264 }
3265 if active.italic && !next.italic {
3266 markers.push('*');
3267 }
3268 if active.bold && !next.bold {
3269 markers.push_str("**");
3270 }
3271 if !active.bold && next.bold {
3272 markers.push_str("**");
3273 }
3274 if !active.italic && next.italic {
3275 markers.push('*');
3276 }
3277 if !active.strike && next.strike {
3278 markers.push_str("~~");
3279 }
3280 markers
3281}
3282
3283fn render_blocks_html(blocks: &[CapturedBlock]) -> String {
3284 format!(
3285 "<!doctype html><html><body>{}</body></html>",
3286 blocks
3287 .iter()
3288 .map(|block| match block {
3289 CapturedBlock::Paragraph {
3290 content,
3291 style,
3292 list,
3293 quote,
3294 horizontal_rule,
3295 } => {
3296 if *horizontal_rule {
3297 "<hr>".to_string()
3298 } else if let Some(list) = list {
3299 let tag = if list.ordered { "ol" } else { "ul" };
3300 format!("<{tag}><li>{}</li></{tag}>", render_content_html(content))
3301 } else if *quote {
3302 format!("<blockquote>{}</blockquote>", render_content_html(content))
3303 } else {
3304 let tag = paragraph_tag(style.as_deref());
3305 format!("<{tag}>{}</{tag}>", render_content_html(content))
3306 }
3307 }
3308 CapturedBlock::Table(table) => render_table_html(table),
3309 })
3310 .collect::<String>()
3311 )
3312}
3313
3314fn render_table_html(table: &TableBlock) -> String {
3315 let mut html = String::from("<table>");
3316 for row in &table.rows {
3317 html.push_str("<tr>");
3318 for cell in &row.cells {
3319 html.push_str("<td>");
3320 html.push_str(&render_content_html(&cell.content));
3321 html.push_str("</td>");
3322 }
3323 html.push_str("</tr>");
3324 }
3325 html.push_str("</table>");
3326 html
3327}
3328
3329fn render_content_html(content: &[ContentNode]) -> String {
3330 content
3331 .iter()
3332 .map(|node| match node {
3333 ContentNode::Text {
3334 text,
3335 bold,
3336 italic,
3337 strike,
3338 link,
3339 } => render_marked_html(text, *bold, *italic, *strike, link.as_deref()),
3340 ContentNode::Image {
3341 url: Some(url),
3342 alt,
3343 width,
3344 height,
3345 ..
3346 } => render_image_html(url, alt, width.as_deref(), height.as_deref()),
3347 ContentNode::Image { .. } => String::new(),
3348 })
3349 .collect()
3350}
3351
3352fn render_image_html(url: &str, alt: &str, width: Option<&str>, height: Option<&str>) -> String {
3353 let mut html = format!(
3354 "<img src=\"{}\" alt=\"{}\"",
3355 escape_html(url),
3356 escape_html(alt)
3357 );
3358 if let Some(width) = width.filter(|value| !value.is_empty()) {
3359 let _ = write!(html, " width=\"{}\"", escape_html(width));
3360 }
3361 if let Some(height) = height.filter(|value| !value.is_empty()) {
3362 let _ = write!(html, " height=\"{}\"", escape_html(height));
3363 }
3364 html.push('>');
3365 html
3366}
3367
3368fn render_marked_html(
3369 text: &str,
3370 bold: bool,
3371 italic: bool,
3372 strike: bool,
3373 link: Option<&str>,
3374) -> String {
3375 text.split('\n')
3376 .map(|segment| render_marked_html_segment(segment, bold, italic, strike, link))
3377 .collect::<Vec<_>>()
3378 .join("<br>")
3379}
3380
3381fn render_marked_html_segment(
3382 text: &str,
3383 bold: bool,
3384 italic: bool,
3385 strike: bool,
3386 link: Option<&str>,
3387) -> String {
3388 if text.is_empty() {
3389 return String::new();
3390 }
3391 let mut output = escape_html(text);
3392 if bold {
3393 output = format!("<strong>{output}</strong>");
3394 }
3395 if italic {
3396 output = format!("<em>{output}</em>");
3397 }
3398 if strike {
3399 output = format!("<s>{output}</s>");
3400 }
3401 if let Some(link) = link {
3402 output = format!("<a href=\"{}\">{output}</a>", escape_html(link));
3403 }
3404 output
3405}
3406
/// Maps a paragraph style name to the HTML tag used to render it.
///
/// `TITLE` shares `h1` with `HEADING_1` and `SUBTITLE` shares `h2` with
/// `HEADING_2`; unknown or missing styles render as `p`.
fn paragraph_tag(style: Option<&str>) -> &'static str {
    let Some(name) = style else {
        return "p";
    };
    match name {
        "TITLE" | "HEADING_1" => "h1",
        "SUBTITLE" | "HEADING_2" => "h2",
        "HEADING_3" => "h3",
        "HEADING_4" => "h4",
        "HEADING_5" => "h5",
        "HEADING_6" => "h6",
        _ => "p",
    }
}
3418
3419fn blocks_to_text(blocks: &[CapturedBlock]) -> String {
3420 blocks
3421 .iter()
3422 .map(|block| match block {
3423 CapturedBlock::Paragraph { content, .. } => content_to_text(content),
3424 CapturedBlock::Table(table) => table
3425 .rows
3426 .iter()
3427 .map(|row| {
3428 row.cells
3429 .iter()
3430 .map(|cell| content_to_text(&cell.content))
3431 .collect::<Vec<_>>()
3432 .join("\t")
3433 })
3434 .collect::<Vec<_>>()
3435 .join("\n"),
3436 })
3437 .filter(|text| !text.is_empty())
3438 .collect::<Vec<_>>()
3439 .join("\n")
3440}
3441
3442fn content_to_text(content: &[ContentNode]) -> String {
3443 content
3444 .iter()
3445 .map(|node| match node {
3446 ContentNode::Text { text, .. } => text.clone(),
3447 ContentNode::Image {
3448 url: Some(_), alt, ..
3449 } => format!("[{alt}]"),
3450 ContentNode::Image { .. } => String::new(),
3451 })
3452 .collect()
3453}
3454
/// Escapes text for safe inclusion in HTML element content and in
/// double- or single-quoted attribute values.
///
/// `&`, `<`, `>`, `"` and `'` are replaced by `&amp;`, `&lt;`, `&gt;`,
/// `&quot;` and `&#39;`; every other character is copied unchanged. The input
/// is processed in a single pass, so already emitted entities are never
/// escaped a second time.
fn escape_html(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
3463
/// Makes text safe for a Markdown table cell: `|` is backslash-escaped and
/// newlines become `<br>`.
fn escape_markdown_table_cell(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '|' => escaped.push_str("\\|"),
            '\n' => escaped.push_str("<br>"),
            other => escaped.push(other),
        }
    }
    escaped
}
3467
/// Extracts the token from an `Authorization` header value that uses the
/// `Bearer` scheme.
///
/// Surrounding whitespace is ignored and the scheme name is matched
/// case-insensitively (`Bearer`, `bearer`, `BEARER`, ...), as HTTP
/// authentication scheme names are case-insensitive. The scheme must be
/// followed by a space. Returns `None` when the scheme does not match or the
/// token is empty.
#[must_use]
pub fn extract_bearer_token(auth_header: &str) -> Option<&str> {
    const SCHEME_PREFIX: &str = "bearer ";

    let trimmed = auth_header.trim();
    // `get` returns `None` when the header is too short or when the cut would
    // fall inside a multi-byte character, so slicing can never panic.
    let scheme = trimmed.get(..SCHEME_PREFIX.len())?;
    if !scheme.eq_ignore_ascii_case(SCHEME_PREFIX) {
        return None;
    }
    let token = trimmed.get(SCHEME_PREFIX.len()..)?.trim();
    (!token.is_empty()).then_some(token)
}
3480
/// An image extracted from a document, ready to be stored alongside it.
#[derive(Debug, Clone)]
pub struct ExtractedImage {
    /// File name under which the image is stored.
    pub filename: String,
    /// Raw image bytes.
    pub data: Vec<u8>,
    /// MIME type of the image.
    pub mime_type: String,
}
3491
/// Result of preparing a Google Docs document for archiving.
#[derive(Debug, Clone)]
pub struct GDocsArchiveResult {
    /// HTML rendition of the document.
    pub html: String,
    /// Markdown rendition of the document.
    pub markdown: String,
    /// Images that accompany the renditions.
    pub images: Vec<ExtractedImage>,
    /// Identifier of the source document.
    pub document_id: String,
    /// Export URL associated with the source document.
    pub export_url: String,
}
3506
3507pub async fn localize_rendered_remote_images_for_archive(
3519 rendered: &GDocsRenderedResult,
3520) -> crate::Result<GDocsArchiveResult> {
3521 let client = reqwest::Client::builder().build().map_err(|error| {
3522 WebCaptureError::FetchError(format!("Failed to create image download client: {error}"))
3523 })?;
3524 let mut seen = HashMap::new();
3525 let mut images = Vec::new();
3526 let mut next_index = 1usize;
3527
3528 for image in &rendered.remote_images {
3529 if seen.contains_key(&image.url) {
3530 continue;
3531 }
3532 let filename = remote_image_filename(&image.url, next_index);
3533 next_index += 1;
3534 seen.insert(image.url.clone(), filename.clone());
3535
3536 match client
3537 .get(&image.url)
3538 .header("User-Agent", GDOCS_USER_AGENT)
3539 .header("Accept", "image/*,*/*;q=0.8")
3540 .send()
3541 .await
3542 {
3543 Ok(response) if response.status().is_success() => {
3544 let mime_type = response
3545 .headers()
3546 .get(reqwest::header::CONTENT_TYPE)
3547 .and_then(|value| value.to_str().ok())
3548 .map_or_else(|| mime_type_for_filename(&filename), ToString::to_string);
3549 let data = response.bytes().await.map_err(|error| {
3550 WebCaptureError::FetchError(format!(
3551 "Failed to read Google Docs image {}: {error}",
3552 image.url
3553 ))
3554 })?;
3555 debug!(
3556 url = %image.url,
3557 filename = %filename,
3558 bytes = data.len(),
3559 mime_type = %mime_type,
3560 "downloaded Google Docs browser-model archive image"
3561 );
3562 images.push(ExtractedImage {
3563 filename,
3564 data: data.to_vec(),
3565 mime_type,
3566 });
3567 }
3568 Ok(response) => {
3569 warn!(
3570 url = %image.url,
3571 status = response.status().as_u16(),
3572 "failed to download Google Docs browser-model archive image"
3573 );
3574 }
3575 Err(error) => {
3576 warn!(
3577 url = %image.url,
3578 error = %error,
3579 "failed to download Google Docs browser-model archive image"
3580 );
3581 }
3582 }
3583 }
3584
3585 let mut markdown = rendered.markdown.clone();
3586 let mut html = rendered.html.clone();
3587 for (url, filename) in seen {
3588 let local_path = format!("images/{filename}");
3589 markdown = markdown.replace(&url, &local_path);
3590 html = html.replace(&url, &local_path);
3591 }
3592
3593 Ok(GDocsArchiveResult {
3594 html,
3595 markdown,
3596 images,
3597 document_id: rendered.document_id.clone(),
3598 export_url: rendered.export_url.clone(),
3599 })
3600}
3601
3602fn remote_image_filename(url: &str, index: usize) -> String {
3603 let ext = crate::localize_images::get_extension_from_url(url);
3604 format!("image-{index:02}{ext}")
3605}
3606
/// Guesses an image MIME type from the extension of `filename`.
///
/// The extension is the text after the final `.`, compared case-insensitively;
/// a name containing no `.` is compared as a whole. Anything unrecognised
/// (including an empty extension) maps to `image/png`.
fn mime_type_for_filename(filename: &str) -> String {
    // Text after the last dot, or the entire name when there is no dot.
    let tail = filename
        .rfind('.')
        .map_or(filename, |dot| &filename[dot + 1..]);
    let lowered = tail.to_lowercase();

    let mime = match lowered.as_str() {
        "jpg" | "jpeg" => "image/jpeg",
        "gif" => "image/gif",
        "webp" => "image/webp",
        "svg" => "image/svg+xml",
        _ => "image/png",
    };
    String::from(mime)
}
3623
3624fn base64_image_pattern() -> &'static Regex {
3625 static PATTERN: OnceLock<Regex> = OnceLock::new();
3626 PATTERN.get_or_init(|| {
3627 Regex::new(
3628 r#"(<img\s[^>]*src=")data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([^"]+)(")"#,
3629 )
3630 .unwrap()
3631 })
3632}
3633
3634#[must_use]
3647pub fn extract_base64_images(html: &str) -> (String, Vec<ExtractedImage>) {
3648 let mut images = Vec::new();
3649 let mut idx = 1u32;
3650
3651 let updated_html = base64_image_pattern()
3652 .replace_all(html, |caps: ®ex::Captures<'_>| {
3653 let prefix = &caps[1];
3654 let mime_ext = &caps[2];
3655 let base64_data = &caps[3];
3656 let suffix = &caps[4];
3657
3658 let ext = match mime_ext {
3659 "jpeg" => "jpg",
3660 "svg+xml" => "svg",
3661 other => other,
3662 };
3663
3664 let filename = format!("image-{idx:02}.{ext}");
3665 let mime_type = format!("image/{mime_ext}");
3666
3667 if let Ok(data) = base64::engine::general_purpose::STANDARD.decode(base64_data) {
3668 debug!("Extracted image: {} ({} bytes)", filename, data.len());
3669 images.push(ExtractedImage {
3670 filename: filename.clone(),
3671 data,
3672 mime_type,
3673 });
3674 }
3675
3676 idx += 1;
3677 format!("{prefix}images/{filename}{suffix}")
3678 })
3679 .into_owned();
3680
3681 (updated_html, images)
3682}
3683
3684pub async fn fetch_google_doc_as_archive(
3703 url: &str,
3704 api_token: Option<&str>,
3705) -> crate::Result<GDocsArchiveResult> {
3706 let result = fetch_google_doc(url, "html", api_token).await?;
3707
3708 let preprocess = preprocess_google_docs_export_html(&result.content);
3709 debug!(
3710 document_id = %result.document_id,
3711 hoisted = preprocess.hoisted,
3712 unwrapped_links = preprocess.unwrapped_links,
3713 "google-docs-export pre-processor rewrote archive markup"
3714 );
3715
3716 let (local_html, images) = extract_base64_images(&preprocess.html);
3717
3718 let markdown = normalize_google_docs_export_markdown(
3719 &crate::markdown::convert_html_to_markdown(&local_html, None)?,
3720 );
3721
3722 debug!(
3723 "Archive prepared: {} images extracted, {} bytes HTML, {} bytes Markdown",
3724 images.len(),
3725 local_html.len(),
3726 markdown.len()
3727 );
3728
3729 Ok(GDocsArchiveResult {
3730 html: local_html,
3731 markdown,
3732 images,
3733 document_id: result.document_id,
3734 export_url: result.export_url,
3735 })
3736}
3737
3738pub fn create_archive_zip(
3749 archive: &GDocsArchiveResult,
3750 pretty_html: bool,
3751) -> crate::Result<Vec<u8>> {
3752 let mut buf = std::io::Cursor::new(Vec::new());
3753
3754 {
3755 let mut zip = zip::ZipWriter::new(&mut buf);
3756 let options = zip::write::SimpleFileOptions::default()
3757 .compression_method(zip::CompressionMethod::Deflated);
3758
3759 zip.start_file("document.md", options)
3760 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3761 zip.write_all(archive.markdown.as_bytes())?;
3762
3763 let html_output = if pretty_html {
3764 crate::html::pretty_print_html(&archive.html)
3765 } else {
3766 archive.html.clone()
3767 };
3768 zip.start_file("document.html", options)
3769 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3770 zip.write_all(html_output.as_bytes())?;
3771
3772 for img in &archive.images {
3773 zip.start_file(format!("images/{}", img.filename), options)
3774 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3775 zip.write_all(&img.data)?;
3776 }
3777
3778 zip.finish()
3779 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3780 }
3781
3782 Ok(buf.into_inner())
3783}
3784
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Two models with one chunk each but different text must agree on the
    /// chunk count and differ in payload size.
    #[test]
    fn browser_model_fingerprint_includes_payload_size() {
        let model_with_text = |text: &str| {
            browser_model_data_from_value(&json!({
                "chunks": [{ "chunk": [{ "ty": "is", "s": text }] }],
                "cidUrlMap": {}
            }))
        };
        let small = model_with_text("first");
        let larger = model_with_text("first and later text");

        let small_print = small.fingerprint();
        let larger_print = larger.fingerprint();

        assert_eq!(small_print.chunks, larger_print.chunks);
        assert_ne!(small_print.payload_bytes, larger_print.payload_bytes);
    }

    /// Feeds a sequence of observations in which the fingerprint changes
    /// part-way through and checks what `observe` reports at each step.
    #[test]
    fn browser_model_quiescence_resets_when_chunks_change() {
        let start = Instant::now();
        let stability_window = Duration::from_millis(1500);
        let one_chunk = BrowserModelFingerprint {
            chunks: 1,
            payload_bytes: 100,
        };
        let two_chunks = BrowserModelFingerprint {
            chunks: 2,
            payload_bytes: 200,
        };

        // (observed fingerprint, milliseconds after `start`, expected result)
        let steps = [
            (one_chunk, 0, None),
            (one_chunk, 250, None),
            (two_chunks, 500, None),
            (two_chunks, 750, None),
            (two_chunks, 2300, Some(Duration::from_millis(1550))),
        ];

        let mut quiescence = BrowserModelQuiescence::default();
        for &(fingerprint, offset_ms, expected) in &steps {
            let observed_at = start + Duration::from_millis(offset_ms);
            assert_eq!(
                quiescence.observe(fingerprint, observed_at, stability_window),
                expected
            );
        }
    }
}