1use async_tungstenite::tokio::{connect_async, ConnectStream};
32use async_tungstenite::tungstenite::Message;
33use async_tungstenite::WebSocketStream;
34use base64::Engine;
35use futures::{SinkExt, StreamExt};
36use regex::Regex;
37use serde_json::Value;
38use std::collections::HashMap;
39use std::fmt::Write as _;
40use std::hash::BuildHasher;
41use std::io::Write;
42use std::process::Stdio;
43use std::sync::OnceLock;
44use std::time::{Duration, Instant};
45use tokio::io::{AsyncBufReadExt, BufReader};
46use tokio::process::{Child, Command};
47use tracing::{debug, info, warn};
48
49use crate::WebCaptureError;
50
/// Base URL under which the per-document `/export` and `/edit` URLs are built.
const GDOCS_EXPORT_BASE: &str = "https://docs.google.com/document/d";
/// Base URL used to build per-document Docs API URLs.
const GDOCS_API_BASE: &str = "https://docs.googleapis.com/v1/documents";
/// Desktop-Chrome-style `User-Agent` header value.
const GDOCS_USER_AGENT: &str =
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
// NOTE(review): the four durations below are not used in this part of the file;
// their meaning is inferred from the names only — confirm against the callers.
/// Presumably the default upper bound on waiting for the editor model.
const GDOCS_EDITOR_MODEL_MAX_WAIT_DEFAULT: Duration = Duration::from_secs(30);
/// Presumably the default window the model must stay unchanged to count as stable.
const GDOCS_EDITOR_MODEL_STABILITY_DEFAULT: Duration = Duration::from_millis(1500);
/// Presumably the delay between two successive polls of the editor model.
const GDOCS_EDITOR_MODEL_POLL_INTERVAL: Duration = Duration::from_millis(250);
/// Presumably the time allowed for the browser process to start.
const GDOCS_BROWSER_LAUNCH_TIMEOUT: Duration = Duration::from_secs(20);

/// WebSocket stream type produced by `async_tungstenite`'s tokio connector.
type CdpWebSocket = WebSocketStream<ConnectStream>;
61
/// JavaScript source that records every value assigned to `window.DOCS_modelChunk`.
///
/// The script installs a non-configurable accessor property named
/// `DOCS_modelChunk` on `window`. Its setter copies each assigned value (arrays
/// are flattened recursively, other values are deep-copied through a JSON
/// round-trip when possible) into `window.__captured_chunks`, and wraps an
/// assigned array's `push` so that later pushes are recorded too.
// NOTE(review): presumably injected before any page script runs — the injection
// site is outside this part of the file; confirm.
const GDOCS_MODEL_CAPTURE_INIT_SCRIPT: &str = r"
window.__captured_chunks = [];
const captureChunk = (value) => {
  if (!value) {
    return;
  }
  if (Array.isArray(value)) {
    for (const item of value) {
      captureChunk(item);
    }
    return;
  }
  try {
    window.__captured_chunks.push(JSON.parse(JSON.stringify(value)));
  } catch {
    window.__captured_chunks.push(value);
  }
};
const wrapChunkArray = (value) => {
  if (!Array.isArray(value) || value.__webCaptureDocsModelWrapped) {
    return value;
  }
  const originalPush = value.push;
  Object.defineProperty(value, '__webCaptureDocsModelWrapped', {
    value: true,
    enumerable: false,
  });
  Object.defineProperty(value, 'push', {
    value(...items) {
      for (const item of items) {
        captureChunk(item);
      }
      return originalPush.apply(this, items);
    },
    writable: true,
    configurable: true,
  });
  for (const item of value) {
    captureChunk(item);
  }
  return value;
};
Object.defineProperty(window, 'DOCS_modelChunk', {
  set(value) {
    captureChunk(value);
    window.__DOCS_modelChunk_latest = wrapChunkArray(value);
  },
  get() {
    return window.__DOCS_modelChunk_latest;
  },
  configurable: false,
});
";
115
/// JavaScript function source that returns `{ chunks, cidUrlMap }` for the page.
///
/// `chunks` is a copy of `window.__captured_chunks`; when that list is empty and
/// `window.DOCS_modelChunk` is set, the current `DOCS_modelChunk` value is used
/// instead. `cidUrlMap` is built by scanning every `<script>` element whose text
/// mentions `docs-images-rt` for `"<id>": "<docs-images-rt URL>"` pairs and
/// undoing the `\u003d`, `\u0026` and `\/` escapes in each URL.
const GDOCS_MODEL_EXTRACT_SCRIPT: &str = r#"() => {
  const chunks = [...(window.__captured_chunks || [])];
  if (
    window.DOCS_modelChunk &&
    chunks.length === 0 &&
    !chunks.includes(window.DOCS_modelChunk)
  ) {
    chunks.push(window.DOCS_modelChunk);
  }
  const cidUrlMap = {};
  const scripts = document.querySelectorAll('script');
  for (const script of scripts) {
    const text = script.textContent || '';
    if (!text.includes('docs-images-rt')) {
      continue;
    }
    const regex =
      /"([A-Za-z0-9_-]{20,})"\s*:\s*"(https:\/\/docs\.google\.com\/docs-images-rt\/[^"]+)"/g;
    let match;
    while ((match = regex.exec(text)) !== null) {
      cidUrlMap[match[1]] = match[2]
        .replace(/\\u003d/g, '=')
        .replace(/\\u0026/g, '&')
        .replace(/\\\//g, '/');
    }
  }
  return { chunks, cidUrlMap };
}"#;
144
145fn gdocs_url_pattern() -> &'static Regex {
146 static PATTERN: OnceLock<Regex> = OnceLock::new();
147 PATTERN.get_or_init(|| Regex::new(r"docs\.google\.com/document/d/([a-zA-Z0-9_-]+)").unwrap())
148}
149
/// Outcome of fetching a Google Doc through the public export endpoint.
#[derive(Debug, Clone)]
pub struct GDocsResult {
    /// Document body in the format named by `format`.
    pub content: String,
    /// Format label: the requested export format, or `"markdown"` after conversion.
    pub format: String,
    /// Document id extracted from the source URL.
    pub document_id: String,
    /// Export URL the content was downloaded from.
    pub export_url: String,
}
162
/// Strategy used to capture a Google Doc, as chosen by `select_capture_method`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GDocsCaptureMethod {
    /// Selected for the `"browser"` capture mode.
    BrowserModel,
    /// Selected for the `"api"` capture mode when no API token is supplied.
    PublicExport,
    /// Selected for the `"api"` capture mode when an API token is supplied.
    DocsApi,
}
173
/// A document rendered into several output formats at once.
// NOTE(review): this type is not constructed in this part of the file; the field
// descriptions below are inferred from names and types only — confirm.
#[derive(Debug, Clone)]
pub struct GDocsRenderedResult {
    /// Presumably the Markdown rendering.
    pub markdown: String,
    /// Presumably the HTML rendering.
    pub html: String,
    /// Presumably the plain-text rendering.
    pub text: String,
    /// Presumably the Google Docs document id.
    pub document_id: String,
    /// Presumably the export URL associated with the document.
    pub export_url: String,
    /// Images referenced by the document that live at remote URLs.
    pub remote_images: Vec<RemoteImage>,
}
190
/// An image reference consisting of a URL and its alternative text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteImage {
    /// Location of the image.
    pub url: String,
    /// Alternative text for the image.
    pub alt: String,
}
199
/// Model data collected from the browser, plus bookkeeping about the collection.
// NOTE(review): only `chunks` and `chunk_payload_bytes` are read in this part of
// the file (by `fingerprint`); the other field notes are inferred from names.
#[derive(Debug, Clone)]
struct BrowserModelData {
    /// Captured model chunks as raw JSON values.
    chunks: Vec<Value>,
    /// Presumably maps an image content id to its URL — confirm against the producer.
    cid_urls: HashMap<String, String>,
    /// Payload size figure used, together with the chunk count, as the fingerprint.
    chunk_payload_bytes: usize,
    /// Presumably the number of polls performed while collecting.
    poll_count: usize,
    /// Presumably how long the model had been stable when collection ended.
    stable_for: Duration,
}
208
/// Cheap summary of captured model data, compared between observations to
/// detect whether anything changed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct BrowserModelFingerprint {
    /// Number of captured chunks.
    chunks: usize,
    /// Payload size recorded alongside the chunks.
    payload_bytes: usize,
}
214
/// Tracks how long successive model fingerprints have stayed identical.
#[derive(Debug, Default)]
struct BrowserModelQuiescence {
    /// Fingerprint seen by the most recent `observe` call, if any.
    last_fingerprint: Option<BrowserModelFingerprint>,
    /// Instant at which the current unchanged streak started, if one is running.
    stable_since: Option<Instant>,
}
220
221impl BrowserModelData {
222 const fn fingerprint(&self) -> BrowserModelFingerprint {
223 BrowserModelFingerprint {
224 chunks: self.chunks.len(),
225 payload_bytes: self.chunk_payload_bytes,
226 }
227 }
228}
229
230impl BrowserModelQuiescence {
231 fn observe(
232 &mut self,
233 fingerprint: BrowserModelFingerprint,
234 now: Instant,
235 stability_window: Duration,
236 ) -> Option<Duration> {
237 if fingerprint.chunks == 0 {
238 self.last_fingerprint = Some(fingerprint);
239 self.stable_since = None;
240 return None;
241 }
242
243 if self.last_fingerprint == Some(fingerprint) {
244 let stable_since = *self.stable_since.get_or_insert(now);
245 let stable_for = now.saturating_duration_since(stable_since);
246 if stable_for >= stability_window {
247 return Some(stable_for);
248 }
249 } else {
250 self.last_fingerprint = Some(fingerprint);
251 self.stable_since = None;
252 }
253
254 None
255 }
256
257 fn stable_for(&self, now: Instant) -> Duration {
258 self.stable_since.map_or(Duration::ZERO, |stable_since| {
259 now.saturating_duration_since(stable_since)
260 })
261 }
262}
263
/// Structured representation of a captured document.
// NOTE(review): the code that fills this struct is outside this part of the
// file; field descriptions are inferred from names and types — confirm.
#[derive(Debug, Clone, Default)]
pub struct CapturedDocument {
    /// Document blocks (paragraphs and tables).
    pub blocks: Vec<CapturedBlock>,
    /// Table blocks of the document.
    pub tables: Vec<TableBlock>,
    /// Image nodes of the document.
    pub images: Vec<ContentNode>,
    /// Presumably the document's plain text.
    pub text: String,
}
276
/// A top-level block of a captured document.
#[derive(Debug, Clone)]
pub enum CapturedBlock {
    /// A paragraph together with its block-level metadata.
    Paragraph {
        /// Inline nodes making up the paragraph.
        content: Vec<ContentNode>,
        /// Optional style name (presumably a heading/paragraph style — confirm).
        style: Option<String>,
        /// List membership, when the paragraph is a list item.
        list: Option<ListMeta>,
        /// Whether the paragraph is flagged as a quote.
        quote: bool,
        /// Whether the paragraph is flagged as a horizontal rule.
        horizontal_rule: bool,
    },
    /// A table.
    Table(TableBlock),
}
296
/// A table, stored as a list of rows.
#[derive(Debug, Clone, Default)]
pub struct TableBlock {
    /// Rows of the table.
    pub rows: Vec<TableRow>,
}

/// One table row, stored as a list of cells.
#[derive(Debug, Clone, Default)]
pub struct TableRow {
    /// Cells of the row.
    pub cells: Vec<TableCell>,
}

/// One table cell holding inline content.
#[derive(Debug, Clone, Default)]
pub struct TableCell {
    /// Inline nodes contained in the cell.
    pub content: Vec<ContentNode>,
}
317
/// An inline node: either a styled text run or an image.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentNode {
    /// A run of text with its inline formatting.
    Text {
        /// The text of the run.
        text: String,
        /// Bold flag.
        bold: bool,
        /// Italic flag.
        italic: bool,
        /// Strikethrough flag.
        strike: bool,
        /// Link target, when the run is a hyperlink.
        link: Option<String>,
    },
    /// An image reference.
    Image {
        /// Content id of the image, if known (presumably a key of the cid→URL map — confirm).
        cid: Option<String>,
        /// Image URL, if known.
        url: Option<String>,
        /// Alternative text.
        alt: String,
        /// Width, kept as a string (units not visible here).
        width: Option<String>,
        /// Height, kept as a string (units not visible here).
        height: Option<String>,
        /// Suggestion flag (presumably marks images that come from a pending suggestion — confirm).
        is_suggestion: bool,
    },
}
350
/// Inline formatting attributes of a text run.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct TextStyle {
    /// Bold flag.
    bold: bool,
    /// Italic flag.
    italic: bool,
    /// Strikethrough flag.
    strike: bool,
    /// Link target, if any.
    link: Option<String>,
}
358
/// Block-level metadata of a paragraph; mirrors the non-content fields of
/// `CapturedBlock::Paragraph`.
#[derive(Debug, Clone, Default)]
struct ParagraphMeta {
    /// Optional style name.
    style: Option<String>,
    /// List membership, if any.
    list: Option<ListMeta>,
    /// Quote flag.
    quote: bool,
    /// Horizontal-rule flag.
    horizontal_rule: bool,
}
366
/// Describes the list a paragraph belongs to.
#[derive(Debug, Clone)]
pub struct ListMeta {
    /// Identifier of the list.
    pub id: String,
    /// Nesting level (presumably zero-based — confirm against the producer).
    pub level: usize,
    /// `true` for an ordered list, `false` otherwise.
    pub ordered: bool,
}
376
/// Paragraph style values taken from the document model.
// NOTE(review): units of the indent fields are not visible here (presumably
// points) — confirm against the parser.
#[derive(Debug, Clone)]
struct ParagraphStyle {
    /// Optional style name.
    style: Option<String>,
    /// Start indent of the paragraph.
    indent_start: f64,
    /// First-line indent of the paragraph.
    indent_first_line: f64,
}
383
/// Style lookup tables derived from the document model.
// NOTE(review): the `usize` keys are presumably text offsets at which a
// paragraph ends (per the `_by_end` names); the indexing of `inline_styles` is
// not visible here — confirm against the code that fills these maps.
#[derive(Debug, Clone, Default)]
struct ModelStyleMaps {
    /// Inline text styles.
    inline_styles: Vec<TextStyle>,
    /// Paragraph style keyed by paragraph end.
    paragraph_by_end: HashMap<usize, ParagraphStyle>,
    /// List metadata keyed by paragraph end.
    list_by_end: HashMap<usize, ListMeta>,
    /// Positions flagged as horizontal rules.
    horizontal_rules: std::collections::HashSet<usize>,
}
391
392#[must_use]
394pub fn is_google_docs_url(url: &str) -> bool {
395 gdocs_url_pattern().is_match(url)
396}
397
398#[must_use]
402pub fn extract_document_id(url: &str) -> Option<String> {
403 gdocs_url_pattern()
404 .captures(url)
405 .and_then(|caps| caps.get(1))
406 .map(|m| m.as_str().to_string())
407}
408
409#[must_use]
416pub fn build_export_url(document_id: &str, format: &str) -> String {
417 let export_format = match format {
418 "html" | "txt" | "md" | "pdf" | "docx" | "epub" | "zip" => format,
419 _ => "html",
420 };
421 format!("{GDOCS_EXPORT_BASE}/{document_id}/export?format={export_format}")
422}
423
424#[must_use]
426pub fn build_edit_url(document_id: &str) -> String {
427 format!("{GDOCS_EXPORT_BASE}/{document_id}/edit")
428}
429
430#[must_use]
432pub fn build_docs_api_url(document_id: &str) -> String {
433 format!("{GDOCS_API_BASE}/{document_id}")
434}
435
436pub fn select_capture_method(
442 capture: &str,
443 api_token: Option<&str>,
444) -> crate::Result<GDocsCaptureMethod> {
445 match capture.to_lowercase().as_str() {
446 "browser" => Ok(GDocsCaptureMethod::BrowserModel),
447 "api" if api_token.is_some() => Ok(GDocsCaptureMethod::DocsApi),
448 "api" => Ok(GDocsCaptureMethod::PublicExport),
449 other => Err(WebCaptureError::InvalidUrl(format!(
450 "Unsupported Google Docs capture method \"{other}\". Use \"browser\" or \"api\"."
451 ))),
452 }
453}
454
455pub async fn fetch_google_doc(
470 url: &str,
471 format: &str,
472 api_token: Option<&str>,
473) -> crate::Result<GDocsResult> {
474 let document_id = extract_document_id(url).ok_or_else(|| {
475 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
476 })?;
477
478 let export_url = build_export_url(&document_id, format);
479 debug!(
480 document_id = %document_id,
481 format = %format,
482 export_url = %export_url,
483 has_api_token = api_token.is_some(),
484 "fetching Google Doc via public export"
485 );
486
487 let mut request = reqwest::Client::new()
488 .get(&export_url)
489 .header(
490 "User-Agent",
491 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
492 )
493 .header("Accept-Charset", "utf-8")
494 .header("Accept-Language", "en-US,en;q=0.9");
495
496 if let Some(token) = api_token {
497 request = request.header("Authorization", format!("Bearer {token}"));
498 }
499
500 let response = request
501 .send()
502 .await
503 .map_err(|e| WebCaptureError::FetchError(format!("Failed to fetch Google Doc: {e}")))?;
504 debug!(
505 document_id = %document_id,
506 status = response.status().as_u16(),
507 success = response.status().is_success(),
508 content_type = response
509 .headers()
510 .get(reqwest::header::CONTENT_TYPE)
511 .and_then(|value| value.to_str().ok())
512 .unwrap_or(""),
513 "received Google Docs public export response"
514 );
515
516 if !response.status().is_success() {
517 return Err(WebCaptureError::FetchError(format!(
518 "Failed to fetch Google Doc ({} {}): {}",
519 response.status().as_u16(),
520 response.status().canonical_reason().unwrap_or("Unknown"),
521 export_url
522 )));
523 }
524
525 let raw_content = response.text().await.map_err(|e| {
526 WebCaptureError::FetchError(format!("Failed to read Google Doc response: {e}"))
527 })?;
528 debug!(
529 document_id = %document_id,
530 bytes = raw_content.len(),
531 "read Google Docs public export body"
532 );
533
534 let content = match format {
536 "html" | "txt" | "md" => crate::html::decode_html_entities(&raw_content),
537 _ => raw_content,
538 };
539
540 Ok(GDocsResult {
541 content,
542 format: format.to_string(),
543 document_id,
544 export_url,
545 })
546}
547
548pub async fn fetch_google_doc_as_markdown(
562 url: &str,
563 api_token: Option<&str>,
564) -> crate::Result<GDocsResult> {
565 let result = fetch_google_doc(url, "html", api_token).await?;
566
567 let preprocess = preprocess_google_docs_export_html(&result.content);
568 debug!(
569 document_id = %result.document_id,
570 hoisted = preprocess.hoisted,
571 unwrapped_links = preprocess.unwrapped_links,
572 "google-docs-export pre-processor rewrote markup"
573 );
574 let markdown = normalize_google_docs_export_markdown(
575 &crate::markdown::convert_html_to_markdown(&preprocess.html, Some(&result.export_url))?,
576 );
577 debug!(
578 document_id = %result.document_id,
579 bytes = markdown.len(),
580 "rendered Google Docs public export markdown"
581 );
582
583 Ok(GDocsResult {
584 content: markdown,
585 format: "markdown".to_string(),
586 document_id: result.document_id,
587 export_url: result.export_url,
588 })
589}
590
/// Output of [`preprocess_google_docs_export_html`].
#[derive(Debug, Clone)]
pub struct GDocsExportPreprocessResult {
    /// The rewritten HTML.
    pub html: String,
    /// Number of styled `<span>` elements replaced by semantic tags.
    pub hoisted: usize,
    /// Number of Google redirect links replaced by their target URL.
    pub unwrapped_links: usize,
}
604
605#[must_use]
613pub fn preprocess_google_docs_export_html(html: &str) -> GDocsExportPreprocessResult {
614 let mut hoisted: usize = 0;
615 let mut unwrapped_links: usize = 0;
616 let class_styles = extract_css_class_styles(html);
617
618 let mut out = hoist_inline_style_spans(html, &mut hoisted);
619 out = hoist_class_style_spans(&out, &class_styles, &mut hoisted);
620 out = convert_class_indented_blockquotes(&out, &class_styles);
621 out = nest_google_docs_lists(&out, &class_styles);
622 out = strip_google_docs_heading_noise(&out);
623 out = strip_heading_inline_formatting(&out);
624 out = unwrap_google_redirect_links(&out, &mut unwrapped_links);
625 out = out.replace(" ", " ");
626 out = out.replace('\u{00A0}', " ");
627
628 GDocsExportPreprocessResult {
629 html: out,
630 hoisted,
631 unwrapped_links,
632 }
633}
634
635#[must_use]
637pub fn normalize_google_docs_export_markdown(markdown: &str) -> String {
638 let markdown = unescape_public_export_punctuation(markdown);
639 let markdown = convert_setext_headings(&markdown);
640 let markdown = normalize_atx_headings(&markdown);
641 let markdown = normalize_bullet_markers(&markdown);
642 let markdown = normalize_list_spacing(&markdown);
643 let markdown = normalize_blockquote_spacing(&markdown);
644 let markdown = normalize_markdown_tables(&markdown);
645 crate::markdown::clean_markdown(&markdown)
646}
647
648fn hoist_inline_style_spans(html: &str, hoisted: &mut usize) -> String {
649 let span_re = Regex::new(r#"(?is)<span\s+([^>]*style="([^"]*)"[^>]*)>(.*?)</span>"#)
650 .expect("valid regex");
651 span_re
652 .replace_all(html, |caps: ®ex::Captures<'_>| {
653 let style = caps.get(2).map_or("", |m| m.as_str());
654 let inner = caps.get(3).map_or("", |m| m.as_str());
655 semantic_wrapped_html(inner, style).map_or_else(
656 || caps[0].to_string(),
657 |wrapped| {
658 *hoisted += 1;
659 wrapped
660 },
661 )
662 })
663 .into_owned()
664}
665
666fn hoist_class_style_spans(
667 html: &str,
668 class_styles: &HashMap<String, String>,
669 hoisted: &mut usize,
670) -> String {
671 let class_span_re = Regex::new(r#"(?is)<span\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</span>"#)
672 .expect("valid regex");
673 class_span_re
674 .replace_all(html, |caps: ®ex::Captures<'_>| {
675 let class_attr = caps.get(2).map_or("", |m| m.as_str());
676 let inner = caps.get(3).map_or("", |m| m.as_str());
677 let style = combined_class_style(class_styles, class_attr);
678 semantic_wrapped_html(inner, &style).map_or_else(
679 || caps[0].to_string(),
680 |wrapped| {
681 *hoisted += 1;
682 wrapped
683 },
684 )
685 })
686 .into_owned()
687}
688
689fn convert_class_indented_blockquotes(
690 html: &str,
691 class_styles: &HashMap<String, String>,
692) -> String {
693 let class_paragraph_re =
694 Regex::new(r#"(?is)<p\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</p>"#).expect("valid regex");
695 class_paragraph_re
696 .replace_all(html, |caps: ®ex::Captures<'_>| {
697 let class_attr = caps.get(2).map_or("", |m| m.as_str());
698 let inner = caps.get(3).map_or("", |m| m.as_str());
699 let style = combined_class_style(class_styles, class_attr);
700 if is_blockquote_style(&style) {
701 format!("<blockquote><p>{inner}</p></blockquote>")
702 } else {
703 caps[0].to_string()
704 }
705 })
706 .into_owned()
707}
708
/// One `<ul>`/`<ol>` element located in the export HTML.
#[derive(Debug, Clone)]
struct ExportListBlock {
    /// Byte offset of the element's first character in the scanned HTML.
    start: usize,
    /// Byte offset just past the element's closing tag.
    end: usize,
    /// Lower-cased tag name (`ul` or `ol`).
    tag: String,
    /// Markup between the opening and closing tags.
    inner: String,
}

/// One `<li>` taken from an [`ExportListBlock`], ready to be re-nested.
#[derive(Debug, Clone)]
struct ExportListItem {
    /// Tag name of the list the item came from.
    tag: String,
    /// Nesting level derived from the item's left margin.
    level: usize,
    /// Markup between `<li …>` and `</li>`.
    inner: String,
}
723
724fn nest_google_docs_lists(html: &str, class_styles: &HashMap<String, String>) -> String {
725 let list_re = Regex::new(r"(?is)<(ul|ol)\b([^>]*)>(.*?)</(ul|ol)>").expect("valid regex");
726 let blocks: Vec<ExportListBlock> = list_re
727 .captures_iter(html)
728 .filter_map(|caps| {
729 let open_tag = caps.get(1)?.as_str().to_ascii_lowercase();
730 let close_tag = caps.get(4)?.as_str().to_ascii_lowercase();
731 if open_tag != close_tag {
732 return None;
733 }
734 let whole = caps.get(0)?;
735 Some(ExportListBlock {
736 start: whole.start(),
737 end: whole.end(),
738 tag: open_tag,
739 inner: caps.get(3).map_or("", |m| m.as_str()).to_string(),
740 })
741 })
742 .collect();
743
744 if blocks.len() < 2 {
745 return html.to_string();
746 }
747
748 let mut groups: Vec<Vec<ExportListBlock>> = Vec::new();
749 let mut current: Vec<ExportListBlock> = Vec::new();
750 for block in blocks {
751 if let Some(previous) = current.last() {
752 if !html[previous.end..block.start].trim().is_empty() {
753 if current.len() > 1 {
754 groups.push(std::mem::take(&mut current));
755 } else {
756 current.clear();
757 }
758 }
759 }
760 current.push(block);
761 }
762 if current.len() > 1 {
763 groups.push(current);
764 }
765
766 if groups.is_empty() {
767 return html.to_string();
768 }
769
770 let mut out = html.to_string();
771 for group in groups.iter().rev() {
772 let rendered = render_nested_list_group(group, class_styles);
773 let start = group.first().expect("non-empty group").start;
774 let end = group.last().expect("non-empty group").end;
775 out.replace_range(start..end, &rendered);
776 }
777 out
778}
779
/// Renders a group of adjacent flat lists as one nested list structure.
///
/// All `<li>` items of the group are collected in order, each with the tag of
/// its source list and a nesting level derived from its attributes. The items
/// are then emitted so that a deeper item's list is opened inside the still
/// open `<li>` of the shallower item before it. Item attributes are dropped.
///
/// When the group contains no `<li>` at all, the lists are re-emitted as bare
/// `<tag>inner</tag>` pairs.
fn render_nested_list_group(
    group: &[ExportListBlock],
    class_styles: &HashMap<String, String>,
) -> String {
    let item_re = Regex::new(r"(?is)<li\b([^>]*)>(.*?)</li>").expect("valid regex");
    let items: Vec<ExportListItem> = group
        .iter()
        .flat_map(|block| {
            item_re.captures_iter(&block.inner).map(|caps| {
                let attrs = caps.get(1).map_or("", |m| m.as_str());
                let inner = caps.get(2).map_or("", |m| m.as_str()).to_string();
                ExportListItem {
                    tag: block.tag.clone(),
                    level: google_docs_list_item_level(attrs, class_styles),
                    inner,
                }
            })
        })
        .collect();

    if items.is_empty() {
        let mut unchanged = String::new();
        for block in group {
            write!(unchanged, "<{}>{}</{}>", block.tag, block.inner, block.tag)
                .expect("write to String");
        }
        return unchanged;
    }

    let mut html = String::new();
    // Deepest level that currently has an open list; `None` before the first item.
    let mut current_level: Option<usize> = None;
    // Per level: tag of the open list (if any) and whether an `<li>` is still open.
    let mut open_tags: Vec<Option<String>> = Vec::new();
    let mut item_open: Vec<bool> = Vec::new();

    for item in items {
        let level = item.level;
        // Moving up: close every list deeper than the item's level.
        while current_level.is_some_and(|current| current > level) {
            let current = current_level.expect("checked as Some");
            close_rendered_list(&mut html, &mut open_tags, &mut item_open, current);
            current_level = current.checked_sub(1);
        }

        // Moving down: open one list per missing level, using the item's tag.
        while current_level.is_none_or(|current| current < level) {
            let next_level = current_level.map_or(0, |current| current + 1);
            open_rendered_list(
                &mut html,
                &mut open_tags,
                &mut item_open,
                next_level,
                &item.tag,
            );
            current_level = Some(next_level);
        }

        ensure_list_stack(&mut open_tags, &mut item_open, level);
        // Same level but a different list type (or no list open): restart the list.
        if open_tags[level]
            .as_deref()
            .is_some_and(|tag| tag != item.tag)
        {
            close_rendered_list(&mut html, &mut open_tags, &mut item_open, level);
            open_rendered_list(&mut html, &mut open_tags, &mut item_open, level, &item.tag);
        } else if open_tags[level].is_none() {
            open_rendered_list(&mut html, &mut open_tags, &mut item_open, level, &item.tag);
        }

        // Close the previous sibling, then leave this item open so that a
        // following deeper item can nest inside it.
        close_rendered_item(&mut html, &mut item_open, level);
        html.push_str("<li>");
        html.push_str(&item.inner);
        item_open[level] = true;

        // Anything recorded for deeper levels is stale after emitting this item.
        for deeper in (level + 1)..item_open.len() {
            item_open[deeper] = false;
            open_tags[deeper] = None;
        }
    }

    // Close whatever is still open, deepest level first.
    while let Some(current) = current_level {
        close_rendered_list(&mut html, &mut open_tags, &mut item_open, current);
        current_level = current.checked_sub(1);
    }

    html
}
863
/// Grows both per-level stacks so that index `level` is valid in `open_tags`.
///
/// New slots are `None` / `false`; the same number of slots is appended to
/// both vectors. Nothing happens when `open_tags` is already long enough.
fn ensure_list_stack(open_tags: &mut Vec<Option<String>>, item_open: &mut Vec<bool>, level: usize) {
    if open_tags.len() > level {
        return;
    }
    let missing = level - open_tags.len() + 1;
    open_tags.extend(std::iter::repeat(None).take(missing));
    item_open.extend(std::iter::repeat(false).take(missing));
}
870
871fn open_rendered_list(
872 html: &mut String,
873 open_tags: &mut Vec<Option<String>>,
874 item_open: &mut Vec<bool>,
875 level: usize,
876 tag: &str,
877) {
878 ensure_list_stack(open_tags, item_open, level);
879 html.push('<');
880 html.push_str(tag);
881 html.push('>');
882 open_tags[level] = Some(tag.to_string());
883 item_open[level] = false;
884}
885
/// Emits `</li>` when an item is open at `level` and clears its flag.
///
/// Levels beyond the end of `item_open` are treated as having no open item.
fn close_rendered_item(html: &mut String, item_open: &mut [bool], level: usize) {
    match item_open.get_mut(level) {
        Some(open) if *open => {
            html.push_str("</li>");
            *open = false;
        }
        _ => {}
    }
}
892
893fn close_rendered_list(
894 html: &mut String,
895 open_tags: &mut [Option<String>],
896 item_open: &mut [bool],
897 level: usize,
898) {
899 close_rendered_item(html, item_open, level);
900 if let Some(tag) = open_tags.get_mut(level).and_then(Option::take) {
901 html.push_str("</");
902 html.push_str(&tag);
903 html.push('>');
904 }
905}
906
907fn google_docs_list_item_level(attrs: &str, class_styles: &HashMap<String, String>) -> usize {
908 let style = combined_attr_style(class_styles, attrs);
909 let margin_left = css_point_value(&style, "margin-left");
910 if margin_left <= 0.0 {
911 return 0;
912 }
913 [54.0, 90.0, 126.0, 162.0, 198.0, 234.0, 270.0, 306.0]
914 .iter()
915 .take_while(|boundary| margin_left >= **boundary)
916 .count()
917}
918
919fn combined_attr_style(class_styles: &HashMap<String, String>, attrs: &str) -> String {
920 let mut styles = String::new();
921 if let Some(style) = attr_value(attrs, "style") {
922 styles.push_str(&style);
923 }
924 if let Some(class_attr) = attr_value(attrs, "class") {
925 styles.push_str(&combined_class_style(class_styles, &class_attr));
926 }
927 styles
928}
929
930fn attr_value(attrs: &str, name: &str) -> Option<String> {
931 let attr_re = Regex::new(&format!(
932 r#"(?is)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)')"#,
933 regex::escape(name)
934 ))
935 .expect("valid regex");
936 attr_re.captures(attrs).and_then(|caps| {
937 caps.get(1)
938 .or_else(|| caps.get(2))
939 .map(|value| value.as_str().to_string())
940 })
941}
942
943fn strip_google_docs_heading_noise(html: &str) -> String {
944 let empty_anchor_re = Regex::new(r#"(?is)<a\s+id="[^"]*"\s*>\s*</a>"#).expect("valid regex");
945 let numbering_re =
946 Regex::new(r"(?is)<span\b[^>]*>\s*\d+(?:\.\d+)*\.?\s*</span>").expect("valid regex");
947 let mut out = empty_anchor_re.replace_all(html, "").into_owned();
948 for level in 1..=6 {
949 let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
950 .expect("valid regex");
951 out = heading_re
952 .replace_all(&out, |caps: ®ex::Captures<'_>| {
953 let open = &caps[1];
954 let inner = &caps[2];
955 let close = &caps[3];
956 let mut cleaned = empty_anchor_re.replace_all(inner, "").into_owned();
957 cleaned = numbering_re.replace_all(&cleaned, "").into_owned();
958 format!("{open}{cleaned}{close}")
959 })
960 .into_owned();
961 }
962 out
963}
964
965fn strip_heading_inline_formatting(html: &str) -> String {
966 let inline_marker_re = Regex::new(r"(?is)</?(?:strong|em|del)>").expect("valid regex");
967 let mut out = html.to_string();
968 for level in 1..=6 {
969 let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
970 .expect("valid regex");
971 out = heading_re
972 .replace_all(&out, |caps: ®ex::Captures<'_>| {
973 let open = &caps[1];
974 let inner = &caps[2];
975 let close = &caps[3];
976 let cleaned = inline_marker_re.replace_all(inner, "");
977 format!("{open}{cleaned}{close}")
978 })
979 .into_owned();
980 }
981 out
982}
983
984fn unwrap_google_redirect_links(html: &str, unwrapped_links: &mut usize) -> String {
985 let redirect_re =
986 Regex::new(r#"(?i)href="https?://(?:www\.)?google\.com/url\?q=([^&"]+)[^"]*""#)
987 .expect("valid regex");
988 redirect_re
989 .replace_all(html, |caps: ®ex::Captures<'_>| {
990 let encoded = caps.get(1).map_or("", |m| m.as_str());
991 let decoded = percent_decode_utf8_lossy(encoded);
992 *unwrapped_links += 1;
993 format!(r#"href="{decoded}""#)
994 })
995 .into_owned()
996}
997
998fn extract_css_class_styles(html: &str) -> HashMap<String, String> {
999 let mut class_styles: HashMap<String, String> = HashMap::new();
1000 let style_re = Regex::new(r"(?is)<style\b[^>]*>(.*?)</style>").expect("valid regex");
1001 let class_re = Regex::new(r"\.([A-Za-z0-9_-]+)\s*\{([^{}]*)\}").expect("valid regex");
1002 for style_caps in style_re.captures_iter(html) {
1003 let css = style_caps.get(1).map_or("", |m| m.as_str());
1004 for class_caps in class_re.captures_iter(css) {
1005 let class_name = class_caps.get(1).map_or("", |m| m.as_str());
1006 let style = class_caps.get(2).map_or("", |m| m.as_str());
1007 class_styles
1008 .entry(class_name.to_string())
1009 .and_modify(|existing| {
1010 existing.push(';');
1011 existing.push_str(style);
1012 })
1013 .or_insert_with(|| style.to_string());
1014 }
1015 }
1016 class_styles
1017}
1018
/// Concatenates the styles of every known class named in `class_attr`.
///
/// Classes are taken in attribute order; each contributes `;` followed by its
/// style text. Unknown classes are skipped, so the result is empty when none
/// of the classes is known.
fn combined_class_style(class_styles: &HashMap<String, String>, class_attr: &str) -> String {
    let mut combined = String::new();
    for class_name in class_attr.split_whitespace() {
        if let Some(style) = class_styles.get(class_name) {
            combined.push(';');
            combined.push_str(style);
        }
    }
    combined
}
1029
1030fn semantic_wrapped_html(inner: &str, style: &str) -> Option<String> {
1031 let bold = css_has_bold(style);
1032 let italic = css_has_italic(style);
1033 let strike = css_has_strike(style);
1034 if !bold && !italic && !strike {
1035 return None;
1036 }
1037 let mut wrapped = inner.to_string();
1038 if strike {
1039 wrapped = format!("<del>{wrapped}</del>");
1040 }
1041 if italic {
1042 wrapped = format!("<em>{wrapped}</em>");
1043 }
1044 if bold {
1045 wrapped = format!("<strong>{wrapped}</strong>");
1046 }
1047 Some(wrapped)
1048}
1049
1050fn css_has_bold(style: &str) -> bool {
1051 Regex::new(r"(?i)font-weight\s*:\s*(?:bold|[6-9]\d{2})")
1052 .expect("valid regex")
1053 .is_match(style)
1054}
1055
1056fn css_has_italic(style: &str) -> bool {
1057 Regex::new(r"(?i)font-style\s*:\s*italic")
1058 .expect("valid regex")
1059 .is_match(style)
1060}
1061
1062fn css_has_strike(style: &str) -> bool {
1063 Regex::new(r"(?i)text-decoration[^;]*\bline-through\b")
1064 .expect("valid regex")
1065 .is_match(style)
1066}
1067
1068fn is_blockquote_style(style: &str) -> bool {
1069 let margin_left = css_point_value(style, "margin-left");
1070 let margin_right = css_point_value(style, "margin-right");
1071 margin_left > 0.0 && margin_right > 0.0 && (margin_left - margin_right).abs() < 0.1
1072}
1073
1074fn css_point_value(style: &str, property: &str) -> f64 {
1075 let re = Regex::new(&format!(
1076 r"(?i){}\s*:\s*(-?\d+(?:\.\d+)?)pt",
1077 regex::escape(property)
1078 ))
1079 .expect("valid regex");
1080 re.captures(style)
1081 .and_then(|caps| caps.get(1))
1082 .and_then(|value| value.as_str().parse::<f64>().ok())
1083 .unwrap_or(0.0)
1084}
1085
/// Decodes `%XX` escapes in `input` and returns the bytes as a string.
///
/// A `%` that is not followed by two hexadecimal digits is copied through
/// unchanged. Byte sequences that are not valid UTF-8 after decoding are
/// replaced by U+FFFD.
fn percent_decode_utf8_lossy(input: &str) -> String {
    /// Value of a single hexadecimal digit, if `byte` is one.
    fn hex_value(byte: u8) -> Option<u8> {
        char::from(byte)
            .to_digit(16)
            .and_then(|digit| u8::try_from(digit).ok())
    }

    let mut decoded = Vec::with_capacity(input.len());
    let mut rest = input.as_bytes();
    while let Some((&first, tail)) = rest.split_first() {
        if first == b'%' {
            if let [hi, lo, after @ ..] = tail {
                if let (Some(hi), Some(lo)) = (hex_value(*hi), hex_value(*lo)) {
                    decoded.push((hi << 4) | lo);
                    rest = after;
                    continue;
                }
            }
        }
        decoded.push(first);
        rest = tail;
    }
    String::from_utf8_lossy(&decoded).into_owned()
}
1109
/// Removes the backslash in front of `.`, `!`, `(`, `)`, `[` and `]`.
///
/// Backslashes in front of any other character are kept.
fn unescape_public_export_punctuation(markdown: &str) -> String {
    let mut out = String::with_capacity(markdown.len());
    let mut chars = markdown.chars().peekable();
    while let Some(ch) = chars.next() {
        if ch == '\\' {
            if let Some(&escaped) = chars.peek() {
                if matches!(escaped, '.' | '!' | '(' | ')' | '[' | ']') {
                    // Drop the backslash and keep the punctuation itself.
                    out.push(escaped);
                    chars.next();
                    continue;
                }
            }
        }
        out.push(ch);
    }
    out
}
1119
1120fn convert_setext_headings(markdown: &str) -> String {
1121 let lines: Vec<&str> = markdown.lines().collect();
1122 let mut out = Vec::with_capacity(lines.len());
1123 let mut index = 0;
1124 while index < lines.len() {
1125 if index + 1 < lines.len() {
1126 let underline = lines[index + 1].trim();
1127 if is_setext_underline(underline, '=') {
1128 out.push(format!("# {}", lines[index].trim()));
1129 index += 2;
1130 continue;
1131 }
1132 if is_setext_underline(underline, '-') {
1133 out.push(format!("## {}", lines[index].trim()));
1134 index += 2;
1135 continue;
1136 }
1137 }
1138 out.push(lines[index].to_string());
1139 index += 1;
1140 }
1141 out.join("\n")
1142}
1143
/// Reports whether `line` is at least five bytes long and consists solely of
/// `marker` characters.
fn is_setext_underline(line: &str, marker: char) -> bool {
    if line.len() < 5 {
        return false;
    }
    !line.chars().any(|ch| ch != marker)
}
1147
1148fn normalize_atx_headings(markdown: &str) -> String {
1149 let heading_re = Regex::new(r"^(#{1,6})\s+(.+?)\s*$").expect("valid regex");
1150 let closing_re = closing_atx_heading_re();
1151 markdown
1152 .lines()
1153 .map(|line| {
1154 let Some(caps) = heading_re.captures(line) else {
1155 return line.to_string();
1156 };
1157 let hashes = caps.get(1).map_or("", |m| m.as_str());
1158 let mut text = caps.get(2).map_or("", |m| m.as_str()).trim().to_string();
1159 text = closing_re.replace(&text, "").trim().to_string();
1160 text = strip_wrapping_markdown_emphasis(&text);
1161 format!("{hashes} {text}")
1162 })
1163 .collect::<Vec<_>>()
1164 .join("\n")
1165}
1166
/// Removes one pair of emphasis markers that wraps the whole (trimmed) text.
///
/// `***`, `**` and `*` are tried in that order; the first marker that both
/// starts and ends the text — with something left in between — is removed and
/// the remainder is trimmed. Text without such a wrapper is returned trimmed.
fn strip_wrapping_markdown_emphasis(text: &str) -> String {
    let trimmed = text.trim();
    ["***", "**", "*"]
        .into_iter()
        .filter(|marker| trimmed.len() > marker.len() * 2)
        .find_map(|marker| trimmed.strip_prefix(marker)?.strip_suffix(marker))
        .map_or(trimmed, str::trim)
        .to_string()
}
1181
1182fn normalize_bullet_markers(markdown: &str) -> String {
1183 let bullet_re = asterisk_bullet_re();
1184 markdown
1185 .lines()
1186 .map(|line| bullet_re.replace(line, "$1- ").into_owned())
1187 .collect::<Vec<_>>()
1188 .join("\n")
1189}
1190
1191fn normalize_list_spacing(markdown: &str) -> String {
1192 let lines: Vec<&str> = markdown.lines().collect();
1193 let mut out = Vec::with_capacity(lines.len());
1194
1195 for (index, line) in lines.iter().enumerate() {
1196 if line.trim().is_empty()
1197 && previous_non_empty_line(&lines, index).is_some_and(is_markdown_list_item)
1198 && next_non_empty_line(&lines, index).is_some_and(is_markdown_list_item)
1199 {
1200 continue;
1201 }
1202 out.push((*line).to_string());
1203 }
1204
1205 out.join("\n")
1206}
1207
/// Returns the closest line before `index` that is not blank, if any.
fn previous_non_empty_line<'a>(lines: &'a [&str], index: usize) -> Option<&'a str> {
    let before = &lines[..index];
    before
        .iter()
        .rfind(|line| !line.trim().is_empty())
        .copied()
}
1215
/// Returns the closest line after `index` that is not blank, if any.
fn next_non_empty_line<'a>(lines: &'a [&str], index: usize) -> Option<&'a str> {
    let after = &lines[index + 1..];
    after
        .iter()
        .find(|line| !line.trim().is_empty())
        .copied()
}
1222
1223fn is_markdown_list_item(line: &str) -> bool {
1224 markdown_list_item_re().is_match(line)
1225}
1226
/// Normalizes blank lines in and around block quotes.
///
/// Inside a quote, any run of blank lines and bare `>` lines collapses into a
/// single `>` line, emitted only when another `> ` line follows. Bare `>`
/// lines outside a quote are dropped. The first non-quote line after a quote
/// is preceded by exactly one blank line. Every emitted line ends with `\n`.
fn normalize_blockquote_spacing(markdown: &str) -> String {
    let mut out = String::with_capacity(markdown.len());
    let mut in_quote = false;
    let mut separator_pending = false;

    for line in markdown.lines() {
        let trimmed = line.trim();
        let is_blank = trimmed.is_empty();

        // Blank lines inside a quote and bare `>` lines are never copied; inside
        // a quote they only mark that a separator may be needed.
        if (is_blank && in_quote) || trimmed == ">" {
            if in_quote {
                separator_pending = true;
            }
            continue;
        }

        if line.starts_with("> ") {
            if std::mem::take(&mut separator_pending) {
                out.push_str(">\n");
            }
            in_quote = true;
        } else {
            // Leaving the quote: separate it from the following text.
            if in_quote && !is_blank {
                out.push('\n');
            }
            separator_pending = false;
            in_quote = false;
        }
        out.push_str(line);
        out.push('\n');
    }

    out
}
1267
1268fn normalize_markdown_tables(markdown: &str) -> String {
1269 let lines: Vec<&str> = markdown.lines().collect();
1270 let mut out = Vec::with_capacity(lines.len());
1271 let mut index = 0;
1272
1273 while index < lines.len() {
1274 if !is_markdown_table_line(lines[index]) {
1275 out.push(lines[index].to_string());
1276 index += 1;
1277 continue;
1278 }
1279
1280 let start = index;
1281 while index < lines.len() && is_markdown_table_line(lines[index]) {
1282 index += 1;
1283 }
1284 let block = &lines[start..index];
1285 if block.len() >= 2 && is_markdown_separator_line(block[1]) {
1286 out.extend(normalize_markdown_table_block(block));
1287 } else {
1288 out.extend(block.iter().map(|line| (*line).to_string()));
1289 }
1290 }
1291
1292 out.join("\n")
1293}
1294
/// Returns `true` when the trimmed line starts and ends with `|` and contains at least two pipes.
fn is_markdown_table_line(line: &str) -> bool {
    let candidate = line.trim();
    if !candidate.starts_with('|') || !candidate.ends_with('|') {
        return false;
    }
    let pipes = candidate.chars().filter(|ch| *ch == '|').count();
    pipes >= 2
}
1299
1300fn is_markdown_separator_line(line: &str) -> bool {
1301 split_markdown_table_cells(line)
1302 .iter()
1303 .all(|cell| markdown_table_separator_cell_re().is_match(cell))
1304}
1305
1306fn normalize_markdown_table_block(lines: &[&str]) -> Vec<String> {
1307 lines
1308 .iter()
1309 .enumerate()
1310 .map(|(index, line)| {
1311 let cells = split_markdown_table_cells(line);
1312 if index == 1 {
1313 let separators = vec!["---".to_string(); cells.len()];
1314 render_markdown_table_row(&separators)
1315 } else {
1316 render_markdown_table_row(&cells)
1317 }
1318 })
1319 .collect()
1320}
1321
/// Splits one Markdown table row into its trimmed cell texts.
///
/// Only a single leading and a single trailing `|` are treated as the row border. Stripping
/// every edge pipe would silently drop empty first/last cells (for example `||b|`) and change
/// the column count of the row.
fn split_markdown_table_cells(line: &str) -> Vec<String> {
    let trimmed = line.trim();
    let inner = trimmed.strip_prefix('|').unwrap_or(trimmed);
    let inner = inner.strip_suffix('|').unwrap_or(inner);
    inner
        .split('|')
        .map(|cell| cell.trim().to_string())
        .collect()
}
1329
/// Renders cells as a single table row: `| cell | cell |`.
fn render_markdown_table_row(cells: &[String]) -> String {
    let mut row = String::from("| ");
    row.push_str(&cells.join(" | "));
    row.push_str(" |");
    row
}
1333
1334fn closing_atx_heading_re() -> &'static Regex {
1335 static RE: OnceLock<Regex> = OnceLock::new();
1336 RE.get_or_init(|| Regex::new(r"\s+#{1,6}$").expect("valid regex"))
1337}
1338
1339fn asterisk_bullet_re() -> &'static Regex {
1340 static RE: OnceLock<Regex> = OnceLock::new();
1341 RE.get_or_init(|| Regex::new(r"^(\s*)\* ").expect("valid regex"))
1342}
1343
1344fn markdown_list_item_re() -> &'static Regex {
1345 static RE: OnceLock<Regex> = OnceLock::new();
1346 RE.get_or_init(|| Regex::new(r"^\s*(?:[-+*]|\d+\.)\s+").expect("valid regex"))
1347}
1348
1349fn markdown_table_separator_cell_re() -> &'static Regex {
1350 static RE: OnceLock<Regex> = OnceLock::new();
1351 RE.get_or_init(|| Regex::new(r"^:?-{3,}:?$").expect("valid regex"))
1352}
1353
1354pub async fn fetch_google_doc_from_docs_api(
1360 url: &str,
1361 api_token: &str,
1362) -> crate::Result<GDocsRenderedResult> {
1363 let document_id = extract_document_id(url).ok_or_else(|| {
1364 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
1365 })?;
1366 let api_url = build_docs_api_url(&document_id);
1367 debug!(
1368 document_id = %document_id,
1369 api_url = %api_url,
1370 "fetching Google Doc via Docs API"
1371 );
1372
1373 let response = reqwest::Client::new()
1374 .get(&api_url)
1375 .header("Authorization", format!("Bearer {api_token}"))
1376 .header("Accept", "application/json")
1377 .send()
1378 .await
1379 .map_err(|e| {
1380 WebCaptureError::FetchError(format!("Failed to fetch Google Doc via Docs API: {e}"))
1381 })?;
1382 debug!(
1383 document_id = %document_id,
1384 status = response.status().as_u16(),
1385 success = response.status().is_success(),
1386 content_type = response
1387 .headers()
1388 .get(reqwest::header::CONTENT_TYPE)
1389 .and_then(|value| value.to_str().ok())
1390 .unwrap_or(""),
1391 "received Google Docs API response"
1392 );
1393
1394 if !response.status().is_success() {
1395 return Err(WebCaptureError::FetchError(format!(
1396 "Failed to fetch Google Doc via Docs API ({} {}): {}",
1397 response.status().as_u16(),
1398 response.status().canonical_reason().unwrap_or("Unknown"),
1399 api_url
1400 )));
1401 }
1402
1403 let body = response.text().await.map_err(|e| {
1404 WebCaptureError::FetchError(format!("Failed to read Google Docs API response: {e}"))
1405 })?;
1406 let document = serde_json::from_str::<Value>(&body).map_err(|e| {
1407 WebCaptureError::ParseError(format!("Failed to parse Google Docs API response: {e}"))
1408 })?;
1409 let rendered = render_docs_api_document(&document);
1410 debug!(
1411 document_id = %document_id,
1412 title = document.get("title").and_then(|value| value.as_str()).unwrap_or(""),
1413 markdown_bytes = rendered.markdown.len(),
1414 html_bytes = rendered.html.len(),
1415 text_bytes = rendered.text.len(),
1416 "rendered Google Docs API document"
1417 );
1418
1419 Ok(GDocsRenderedResult {
1420 markdown: rendered.markdown,
1421 html: rendered.html,
1422 text: rendered.text,
1423 document_id,
1424 export_url: api_url,
1425 remote_images: Vec::new(),
1426 })
1427}
1428
1429pub async fn fetch_google_doc_from_model(
1435 url: &str,
1436 api_token: Option<&str>,
1437) -> crate::Result<GDocsRenderedResult> {
1438 if api_token.is_some() {
1439 return Err(WebCaptureError::BrowserError(
1440 "Rust browser-model Google Docs capture cannot inject API tokens; use --capture api for authenticated Docs API capture".to_string(),
1441 ));
1442 }
1443 let document_id = extract_document_id(url).ok_or_else(|| {
1444 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
1445 })?;
1446 let edit_url = build_edit_url(&document_id);
1447 debug!(
1448 document_id = %document_id,
1449 edit_url = %edit_url,
1450 "capturing Google Doc editor model with a real browser"
1451 );
1452 let model_data = fetch_google_doc_editor_model_with_cdp(&edit_url, &document_id).await?;
1453 let BrowserModelData {
1454 chunks,
1455 cid_urls,
1456 chunk_payload_bytes,
1457 poll_count,
1458 stable_for,
1459 } = model_data;
1460 debug!(
1461 document_id = %document_id,
1462 chunks = chunks.len(),
1463 cid_urls = cid_urls.len(),
1464 chunk_payload_bytes,
1465 poll_count,
1466 stable_for_ms = stable_for.as_millis(),
1467 "extracted Google Docs editor model chunks through CDP"
1468 );
1469 if chunks.is_empty() {
1470 return Err(WebCaptureError::ParseError(
1471 "Google Docs editor page did not expose DOCS_modelChunk data".to_string(),
1472 ));
1473 }
1474
1475 let capture = parse_model_chunks(&chunks, &cid_urls);
1476 let remote_images = remote_images_from_capture(&capture);
1477 info!(
1478 document_id = %document_id,
1479 chunks = chunks.len(),
1480 cid_urls = cid_urls.len(),
1481 chunk_payload_bytes,
1482 poll_count,
1483 stable_for_ms = stable_for.as_millis(),
1484 blocks = capture.blocks.len(),
1485 tables = capture.tables.len(),
1486 images = capture.images.len(),
1487 text_bytes = capture.text.len(),
1488 "parsed Google Docs editor model"
1489 );
1490
1491 Ok(GDocsRenderedResult {
1492 markdown: render_captured_document(&capture, "markdown"),
1493 html: render_captured_document(&capture, "html"),
1494 text: render_captured_document(&capture, "txt"),
1495 document_id,
1496 export_url: edit_url,
1497 remote_images,
1498 })
1499}
1500
1501async fn fetch_google_doc_editor_model_with_cdp(
1502 edit_url: &str,
1503 document_id: &str,
1504) -> crate::Result<BrowserModelData> {
1505 let chrome = crate::browser::find_chrome_executable().ok_or_else(|| {
1506 WebCaptureError::BrowserError(
1507 "Chrome/Chromium executable was not found. Set WEB_CAPTURE_CHROME, CHROME_PATH, or GOOGLE_CHROME_BIN.".to_string(),
1508 )
1509 })?;
1510 let user_data_dir = crate::browser::temporary_user_data_dir();
1511 std::fs::create_dir_all(&user_data_dir)?;
1512
1513 debug!(
1514 document_id = %document_id,
1515 chrome = %chrome.display(),
1516 user_data_dir = %user_data_dir.display(),
1517 edit_url = %edit_url,
1518 "launching headless Chrome CDP session for Google Docs model capture"
1519 );
1520
1521 let mut child = launch_cdp_chrome(&chrome, &user_data_dir)?;
1522 let capture_result = async {
1523 let ws_url = wait_for_devtools_ws_url(&mut child).await?;
1524 let (mut ws, _) = connect_async(&ws_url).await.map_err(|error| {
1525 WebCaptureError::BrowserError(format!(
1526 "Failed to connect to Chrome DevTools websocket: {error}"
1527 ))
1528 })?;
1529 let mut next_id = 0u64;
1530 let session_id = navigate_google_docs_cdp_page(&mut ws, &mut next_id, edit_url).await?;
1531 wait_for_google_docs_model_chunks(&mut ws, &mut next_id, &session_id, document_id).await
1532 }
1533 .await;
1534
1535 if let Err(error) = child.kill().await {
1536 debug!(
1537 document_id = %document_id,
1538 error = %error,
1539 "failed to kill Chrome CDP browser process"
1540 );
1541 }
1542 let _ = child.wait().await;
1543 let _ = std::fs::remove_dir_all(&user_data_dir);
1544
1545 capture_result
1546}
1547
1548async fn navigate_google_docs_cdp_page(
1549 ws: &mut CdpWebSocket,
1550 next_id: &mut u64,
1551 edit_url: &str,
1552) -> crate::Result<String> {
1553 let target = cdp_send(
1554 ws,
1555 next_id,
1556 None,
1557 "Target.createTarget",
1558 serde_json::json!({ "url": "about:blank" }),
1559 )
1560 .await?;
1561 let target_id = target
1562 .get("targetId")
1563 .and_then(Value::as_str)
1564 .ok_or_else(|| {
1565 WebCaptureError::BrowserError(
1566 "Chrome DevTools Target.createTarget did not return targetId".to_string(),
1567 )
1568 })?
1569 .to_string();
1570 let attached = cdp_send(
1571 ws,
1572 next_id,
1573 None,
1574 "Target.attachToTarget",
1575 serde_json::json!({ "targetId": target_id, "flatten": true }),
1576 )
1577 .await?;
1578 let session_id = attached
1579 .get("sessionId")
1580 .and_then(Value::as_str)
1581 .ok_or_else(|| {
1582 WebCaptureError::BrowserError(
1583 "Chrome DevTools Target.attachToTarget did not return sessionId".to_string(),
1584 )
1585 })?
1586 .to_string();
1587
1588 cdp_send(
1589 ws,
1590 next_id,
1591 Some(&session_id),
1592 "Page.enable",
1593 serde_json::json!({}),
1594 )
1595 .await?;
1596 cdp_send(
1597 ws,
1598 next_id,
1599 Some(&session_id),
1600 "Runtime.enable",
1601 serde_json::json!({}),
1602 )
1603 .await?;
1604 cdp_send(
1605 ws,
1606 next_id,
1607 Some(&session_id),
1608 "Page.addScriptToEvaluateOnNewDocument",
1609 serde_json::json!({ "source": GDOCS_MODEL_CAPTURE_INIT_SCRIPT }),
1610 )
1611 .await?;
1612 cdp_send(
1613 ws,
1614 next_id,
1615 Some(&session_id),
1616 "Page.navigate",
1617 serde_json::json!({ "url": edit_url }),
1618 )
1619 .await?;
1620
1621 Ok(session_id)
1622}
1623
1624async fn wait_for_google_docs_model_chunks(
1625 ws: &mut CdpWebSocket,
1626 next_id: &mut u64,
1627 session_id: &str,
1628 document_id: &str,
1629) -> crate::Result<BrowserModelData> {
1630 let started = Instant::now();
1631 let max_wait = gdocs_editor_model_max_wait();
1632 let stability_window = gdocs_editor_model_stability_window();
1633 let mut quiescence = BrowserModelQuiescence::default();
1634 let mut last_chunks = 0usize;
1635 let mut last_cid_urls = 0usize;
1636 let mut last_payload_bytes = 0usize;
1637 let mut last_stable_for = Duration::ZERO;
1638 let mut poll_count = 0usize;
1639
1640 while started.elapsed() < max_wait {
1641 let result = cdp_send(
1642 ws,
1643 next_id,
1644 Some(session_id),
1645 "Runtime.evaluate",
1646 serde_json::json!({
1647 "expression": format!("({GDOCS_MODEL_EXTRACT_SCRIPT})()"),
1648 "returnByValue": true,
1649 "awaitPromise": true
1650 }),
1651 )
1652 .await?;
1653 if let Some(exception) = result.get("exceptionDetails") {
1654 return Err(WebCaptureError::BrowserError(format!(
1655 "Google Docs model extraction script failed: {exception}"
1656 )));
1657 }
1658 let value = result
1659 .pointer("/result/value")
1660 .cloned()
1661 .unwrap_or(Value::Null);
1662 let model_data = browser_model_data_from_value(&value);
1663 poll_count += 1;
1664 let fingerprint = model_data.fingerprint();
1665 last_chunks = model_data.chunks.len();
1666 last_cid_urls = model_data.cid_urls.len();
1667 last_payload_bytes = model_data.chunk_payload_bytes;
1668 let now = Instant::now();
1669 if let Some(stable_for) = quiescence.observe(fingerprint, now, stability_window) {
1670 let mut model_data = model_data;
1671 model_data.poll_count = poll_count;
1672 model_data.stable_for = stable_for;
1673 debug!(
1674 document_id = %document_id,
1675 chunks = model_data.chunks.len(),
1676 cid_urls = model_data.cid_urls.len(),
1677 chunk_payload_bytes = model_data.chunk_payload_bytes,
1678 poll_count,
1679 stable_for_ms = stable_for.as_millis(),
1680 elapsed_ms = started.elapsed().as_millis(),
1681 "captured quiesced Google Docs model chunks through CDP Runtime.evaluate"
1682 );
1683 return Ok(model_data);
1684 }
1685 last_stable_for = quiescence.stable_for(now);
1686 tokio::time::sleep(GDOCS_EDITOR_MODEL_POLL_INTERVAL).await;
1687 }
1688
1689 Err(WebCaptureError::BrowserError(format!(
1690 "Timed out waiting for Google Docs DOCS_modelChunk stream to quiesce for document {document_id} after {} ms (last chunks={last_chunks}, payload_bytes={last_payload_bytes}, cid_urls={last_cid_urls}, poll_count={poll_count}, stable_for_ms={})",
1691 max_wait.as_millis(),
1692 last_stable_for.as_millis()
1693 )))
1694}
1695
1696fn launch_cdp_chrome(
1697 chrome: &std::path::Path,
1698 user_data_dir: &std::path::Path,
1699) -> crate::Result<Child> {
1700 let mut command = Command::new(chrome);
1701 command
1702 .args([
1703 "--headless=new",
1704 "--disable-gpu",
1705 "--disable-extensions",
1706 "--disable-dev-shm-usage",
1707 "--disable-background-networking",
1708 "--disable-component-update",
1709 "--disable-default-apps",
1710 "--disable-sync",
1711 "--metrics-recording-only",
1712 "--no-default-browser-check",
1713 "--no-first-run",
1714 "--no-sandbox",
1715 "--remote-debugging-port=0",
1716 "--window-size=1280,800",
1717 ])
1718 .arg(format!("--user-data-dir={}", user_data_dir.display()))
1719 .arg(format!("--user-agent={GDOCS_USER_AGENT}"))
1720 .stderr(Stdio::piped())
1721 .stdout(Stdio::null())
1722 .kill_on_drop(true);
1723
1724 command.spawn().map_err(|error| {
1725 WebCaptureError::BrowserError(format!("Failed to launch Chrome CDP browser: {error}"))
1726 })
1727}
1728
1729async fn wait_for_devtools_ws_url(child: &mut Child) -> crate::Result<String> {
1730 let stderr = child.stderr.take().ok_or_else(|| {
1731 WebCaptureError::BrowserError("Chrome CDP process did not expose stderr".to_string())
1732 })?;
1733 let mut lines = BufReader::new(stderr).lines();
1734 let started = Instant::now();
1735
1736 while started.elapsed() < GDOCS_BROWSER_LAUNCH_TIMEOUT {
1737 let line = tokio::time::timeout(Duration::from_millis(250), lines.next_line()).await;
1738 match line {
1739 Ok(Ok(Some(line))) => {
1740 if let Some((_, ws_url)) = line.split_once("DevTools listening on ") {
1741 return Ok(ws_url.trim().to_string());
1742 }
1743 }
1744 Ok(Ok(None)) => {
1745 break;
1746 }
1747 Ok(Err(error)) => {
1748 return Err(WebCaptureError::BrowserError(format!(
1749 "Failed to read Chrome CDP stderr: {error}"
1750 )));
1751 }
1752 Err(_) => {}
1753 }
1754 }
1755
1756 Err(WebCaptureError::BrowserError(format!(
1757 "Timed out waiting for Chrome DevTools websocket URL after {} ms",
1758 GDOCS_BROWSER_LAUNCH_TIMEOUT.as_millis()
1759 )))
1760}
1761
1762async fn cdp_send(
1763 ws: &mut CdpWebSocket,
1764 next_id: &mut u64,
1765 session_id: Option<&str>,
1766 method: &str,
1767 params: Value,
1768) -> crate::Result<Value> {
1769 *next_id += 1;
1770 let id = *next_id;
1771 let mut message = serde_json::json!({
1772 "id": id,
1773 "method": method,
1774 "params": params
1775 });
1776 if let Some(session_id) = session_id {
1777 message["sessionId"] = Value::String(session_id.to_string());
1778 }
1779
1780 ws.send(Message::Text(message.to_string()))
1781 .await
1782 .map_err(|error| {
1783 WebCaptureError::BrowserError(format!(
1784 "Failed to send Chrome DevTools command {method}: {error}"
1785 ))
1786 })?;
1787
1788 while let Some(message) = ws.next().await {
1789 let message = message.map_err(|error| {
1790 WebCaptureError::BrowserError(format!(
1791 "Failed to read Chrome DevTools response for {method}: {error}"
1792 ))
1793 })?;
1794 if !message.is_text() {
1795 continue;
1796 }
1797 let text = message.to_text().map_err(|error| {
1798 WebCaptureError::BrowserError(format!(
1799 "Chrome DevTools response for {method} was not text: {error}"
1800 ))
1801 })?;
1802 let value = serde_json::from_str::<Value>(text).map_err(|error| {
1803 WebCaptureError::ParseError(format!(
1804 "Failed to parse Chrome DevTools response for {method}: {error}; response={text}"
1805 ))
1806 })?;
1807 if value.get("id").and_then(Value::as_u64) != Some(id) {
1808 continue;
1809 }
1810 if let Some(error) = value.get("error") {
1811 return Err(WebCaptureError::BrowserError(format!(
1812 "Chrome DevTools command {method} failed: {error}"
1813 )));
1814 }
1815 return Ok(value.get("result").cloned().unwrap_or(Value::Null));
1816 }
1817
1818 Err(WebCaptureError::BrowserError(format!(
1819 "Chrome DevTools websocket closed before response for {method}"
1820 )))
1821}
1822
1823fn browser_model_data_from_value(value: &Value) -> BrowserModelData {
1824 let chunks = value
1825 .get("chunks")
1826 .and_then(Value::as_array)
1827 .cloned()
1828 .unwrap_or_default();
1829 let chunk_payload_bytes = model_chunk_payload_bytes(&chunks);
1830 let cid_urls = value
1831 .get("cidUrlMap")
1832 .and_then(Value::as_object)
1833 .map(|map| {
1834 map.iter()
1835 .filter_map(|(key, value)| value.as_str().map(|url| (key.clone(), url.to_string())))
1836 .collect::<HashMap<_, _>>()
1837 })
1838 .unwrap_or_default();
1839 BrowserModelData {
1840 chunks,
1841 cid_urls,
1842 chunk_payload_bytes,
1843 poll_count: 0,
1844 stable_for: Duration::ZERO,
1845 }
1846}
1847
1848fn model_chunk_payload_bytes(chunks: &[Value]) -> usize {
1849 chunks
1850 .iter()
1851 .map(|chunk| serde_json::to_vec(chunk).map_or(0, |encoded| encoded.len()))
1852 .sum()
1853}
1854
1855fn gdocs_editor_model_max_wait() -> Duration {
1856 duration_from_env_ms(
1857 "WEB_CAPTURE_GDOCS_MAX_WAIT_MS",
1858 GDOCS_EDITOR_MODEL_MAX_WAIT_DEFAULT,
1859 )
1860}
1861
1862fn gdocs_editor_model_stability_window() -> Duration {
1863 duration_from_env_ms(
1864 "WEB_CAPTURE_GDOCS_STABILITY_MS",
1865 GDOCS_EDITOR_MODEL_STABILITY_DEFAULT,
1866 )
1867}
1868
1869fn duration_from_env_ms(name: &str, default: Duration) -> Duration {
1870 std::env::var(name).map_or(default, |value| match value.trim().parse::<u64>() {
1871 Ok(ms) => Duration::from_millis(ms),
1872 Err(error) => {
1873 warn!(
1874 name,
1875 value,
1876 error = %error,
1877 default_ms = default.as_millis(),
1878 "ignoring invalid Google Docs model wait environment variable"
1879 );
1880 default
1881 }
1882 })
1883}
1884
1885fn remote_images_from_capture(capture: &CapturedDocument) -> Vec<RemoteImage> {
1886 capture
1887 .images
1888 .iter()
1889 .filter_map(|node| match node {
1890 ContentNode::Image {
1891 url: Some(url),
1892 alt,
1893 ..
1894 } => Some(RemoteImage {
1895 url: url.clone(),
1896 alt: alt.clone(),
1897 }),
1898 ContentNode::Image { .. } | ContentNode::Text { .. } => None,
1899 })
1900 .collect()
1901}
1902
1903#[must_use]
1905pub fn render_docs_api_document(document: &Value) -> GDocsRenderedOutput {
1906 let blocks = structural_elements_to_blocks(
1907 document
1908 .pointer("/body/content")
1909 .and_then(Value::as_array)
1910 .map_or(&[] as &[Value], Vec::as_slice),
1911 document.pointer("/inlineObjects").unwrap_or(&Value::Null),
1912 );
1913 GDocsRenderedOutput {
1914 markdown: render_blocks_markdown(&blocks),
1915 html: render_blocks_html(&blocks),
1916 text: blocks_to_text(&blocks),
1917 }
1918}
1919
/// The three renderings of one document, as returned by `render_docs_api_document`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GDocsRenderedOutput {
    /// Markdown rendering of the document blocks.
    pub markdown: String,
    /// HTML rendering of the document blocks.
    pub html: String,
    /// Plain-text rendering of the document blocks.
    pub text: String,
}
1930
1931fn structural_elements_to_blocks(elements: &[Value], inline_objects: &Value) -> Vec<CapturedBlock> {
1932 let mut blocks = Vec::new();
1933 for element in elements {
1934 if let Some(paragraph) = element.get("paragraph") {
1935 let content = paragraph_to_content(paragraph, inline_objects);
1936 if !content_to_text(&content).trim().is_empty()
1937 || content
1938 .iter()
1939 .any(|node| matches!(node, ContentNode::Image { .. }))
1940 {
1941 blocks.push(CapturedBlock::Paragraph {
1942 style: paragraph
1943 .pointer("/paragraphStyle/namedStyleType")
1944 .and_then(Value::as_str)
1945 .map(ToString::to_string),
1946 list: None,
1947 quote: false,
1948 horizontal_rule: false,
1949 content,
1950 });
1951 }
1952 } else if let Some(table) = element.get("table") {
1953 blocks.push(CapturedBlock::Table(table_to_block(table, inline_objects)));
1954 }
1955 }
1956 blocks
1957}
1958
1959fn table_to_block(table: &Value, inline_objects: &Value) -> TableBlock {
1960 let rows = table
1961 .get("tableRows")
1962 .and_then(Value::as_array)
1963 .map_or(&[] as &[Value], Vec::as_slice)
1964 .iter()
1965 .map(|row| TableRow {
1966 cells: row
1967 .get("tableCells")
1968 .and_then(Value::as_array)
1969 .map_or(&[] as &[Value], Vec::as_slice)
1970 .iter()
1971 .map(|cell| TableCell {
1972 content: structural_elements_to_inline_content(
1973 cell.get("content")
1974 .and_then(Value::as_array)
1975 .map_or(&[] as &[Value], Vec::as_slice),
1976 inline_objects,
1977 ),
1978 })
1979 .collect(),
1980 })
1981 .collect();
1982 TableBlock { rows }
1983}
1984
1985fn structural_elements_to_inline_content(
1986 elements: &[Value],
1987 inline_objects: &Value,
1988) -> Vec<ContentNode> {
1989 let mut content = Vec::new();
1990 for element in elements {
1991 if let Some(paragraph) = element.get("paragraph") {
1992 let paragraph_content = paragraph_to_content(paragraph, inline_objects);
1993 if !content.is_empty() && !paragraph_content.is_empty() {
1994 append_text(&mut content, "\n");
1995 }
1996 content.extend(paragraph_content);
1997 } else if let Some(table) = element.get("table") {
1998 append_text(
1999 &mut content,
2000 &render_blocks_markdown(&[CapturedBlock::Table(table_to_block(
2001 table,
2002 inline_objects,
2003 ))]),
2004 );
2005 }
2006 }
2007 content
2008}
2009
2010fn paragraph_to_content(paragraph: &Value, inline_objects: &Value) -> Vec<ContentNode> {
2011 let mut content = Vec::new();
2012 for element in paragraph
2013 .get("elements")
2014 .and_then(Value::as_array)
2015 .map_or(&[] as &[Value], Vec::as_slice)
2016 {
2017 if let Some(text) = element
2018 .pointer("/textRun/content")
2019 .and_then(Value::as_str)
2020 .map(|text| text.strip_suffix('\n').unwrap_or(text))
2021 {
2022 append_text(&mut content, text);
2023 } else if let Some(inline_id) = element
2024 .pointer("/inlineObjectElement/inlineObjectId")
2025 .and_then(Value::as_str)
2026 {
2027 if let Some(image) = inline_object_to_image(inline_id, inline_objects) {
2028 content.push(image);
2029 }
2030 }
2031 }
2032 content
2033}
2034
2035fn inline_object_to_image(inline_id: &str, inline_objects: &Value) -> Option<ContentNode> {
2036 let embedded = inline_objects
2037 .get(inline_id)?
2038 .pointer("/inlineObjectProperties/embeddedObject")?;
2039 let url = embedded
2040 .pointer("/imageProperties/contentUri")
2041 .or_else(|| embedded.pointer("/imageProperties/sourceUri"))
2042 .and_then(Value::as_str)?;
2043 let alt = embedded
2044 .get("title")
2045 .or_else(|| embedded.get("description"))
2046 .and_then(Value::as_str)
2047 .unwrap_or("image");
2048 Some(ContentNode::Image {
2049 cid: None,
2050 url: Some(url.to_string()),
2051 alt: alt.to_string(),
2052 width: json_dimension_to_string(embedded.pointer("/size/width/magnitude")),
2053 height: json_dimension_to_string(embedded.pointer("/size/height/magnitude")),
2054 is_suggestion: false,
2055 })
2056}
2057
2058fn json_dimension_to_string(value: Option<&Value>) -> Option<String> {
2059 match value? {
2060 Value::Number(number) => Some(number.to_string()),
2061 Value::String(text) if !text.is_empty() => Some(text.clone()),
2062 _ => None,
2063 }
2064}
2065
2066fn build_model_style_maps(
2067 items: &[Value],
2068 text_len: usize,
2069 utf16_position_map: &[usize],
2070) -> ModelStyleMaps {
2071 let mut maps = ModelStyleMaps {
2072 inline_styles: vec![TextStyle::default(); text_len],
2073 ..ModelStyleMaps::default()
2074 };
2075
2076 for item in items {
2077 if item.get("ty").and_then(Value::as_str) != Some("as") {
2078 continue;
2079 }
2080 let (Some(start), Some(end), Some(style_type)) = (
2081 item.get("si").and_then(Value::as_u64),
2082 item.get("ei").and_then(Value::as_u64),
2083 item.get("st").and_then(Value::as_str),
2084 ) else {
2085 continue;
2086 };
2087 let (Ok(start), Ok(end)) = (usize::try_from(start), usize::try_from(end)) else {
2088 continue;
2089 };
2090
2091 let start = utf16_position_to_char_position(utf16_position_map, start);
2092 let end = utf16_position_to_char_position(utf16_position_map, end);
2093 if start == 0 || end == 0 {
2094 continue;
2095 }
2096
2097 match style_type {
2098 "text" => {
2099 let style = text_style(item);
2100 apply_inline_style(&mut maps.inline_styles, start, end, &style);
2101 }
2102 "link" => {
2103 let style = TextStyle {
2104 link: item
2105 .pointer("/sm/lnks_link/ulnk_url")
2106 .and_then(Value::as_str)
2107 .map(ToString::to_string),
2108 ..TextStyle::default()
2109 };
2110 apply_inline_style(&mut maps.inline_styles, start, end, &style);
2111 }
2112 "paragraph" => {
2113 maps.paragraph_by_end
2114 .insert(end, paragraph_style_from_model(item));
2115 }
2116 "list" => {
2117 maps.list_by_end.insert(
2118 end,
2119 ListMeta {
2120 id: item
2121 .pointer("/sm/ls_id")
2122 .and_then(Value::as_str)
2123 .unwrap_or("")
2124 .to_string(),
2125 level: item
2126 .pointer("/sm/ls_nest")
2127 .and_then(Value::as_u64)
2128 .and_then(|value| usize::try_from(value).ok())
2129 .unwrap_or(0),
2130 ordered: false,
2131 },
2132 );
2133 }
2134 "horizontal_rule" => {
2135 maps.horizontal_rules.insert(end);
2136 }
2137 _ => {}
2138 }
2139 }
2140
2141 maps
2142}
2143
2144fn apply_inline_style(styles: &mut [TextStyle], start: usize, end: usize, patch: &TextStyle) {
2145 let from = start.saturating_sub(1);
2146 let to = end.min(styles.len());
2147 if from >= to {
2148 return;
2149 }
2150 for style in &mut styles[from..to] {
2151 if patch.bold {
2152 style.bold = true;
2153 }
2154 if patch.italic {
2155 style.italic = true;
2156 }
2157 if patch.strike {
2158 style.strike = true;
2159 }
2160 if patch.link.is_some() {
2161 style.link.clone_from(&patch.link);
2162 }
2163 }
2164}
2165
2166fn text_style(item: &Value) -> TextStyle {
2167 TextStyle {
2168 bold: item.pointer("/sm/ts_bd").and_then(Value::as_bool) == Some(true),
2169 italic: item.pointer("/sm/ts_it").and_then(Value::as_bool) == Some(true),
2170 strike: item.pointer("/sm/ts_st").and_then(Value::as_bool) == Some(true),
2171 link: None,
2172 }
2173}
2174
2175fn paragraph_style_from_model(item: &Value) -> ParagraphStyle {
2176 let heading = item.pointer("/sm/ps_hd").and_then(Value::as_u64);
2177 ParagraphStyle {
2178 style: heading.map(|level| format!("HEADING_{level}")),
2179 indent_start: item
2180 .pointer("/sm/ps_il")
2181 .and_then(Value::as_f64)
2182 .unwrap_or(0.0),
2183 indent_first_line: item
2184 .pointer("/sm/ps_ifl")
2185 .and_then(Value::as_f64)
2186 .unwrap_or(0.0),
2187 }
2188}
2189
/// Builds a lookup from 1-based UTF-16 code-unit positions to 1-based character positions.
///
/// Index 0 is always `0`. A character that needs two UTF-16 code units occupies two
/// consecutive slots holding the same character position.
fn build_utf16_position_map(text: &str) -> Vec<usize> {
    let mut map = Vec::with_capacity(text.encode_utf16().count() + 1);
    map.push(0);
    for (idx, ch) in text.chars().enumerate() {
        map.extend(std::iter::repeat(idx + 1).take(ch.len_utf16()));
    }
    map
}
2204
/// Looks up the character position stored for a UTF-16 position.
///
/// When the slot is missing or holds `0`, the last non-zero entry of the map is returned
/// instead; `0` is returned only when the map has no non-zero entry.
fn utf16_position_to_char_position(map: &[usize], position: usize) -> usize {
    if let Some(&mapped) = map.get(position) {
        if mapped > 0 {
            return mapped;
        }
    }
    map.iter()
        .rev()
        .copied()
        .find(|mapped| *mapped > 0)
        .unwrap_or(0)
}
2212
/// Parses editor-model chunks into a `CapturedDocument` of blocks, tables, images and text.
///
/// The document text is the concatenation of the `s` strings of all `is`/`iss` items. It is
/// then walked character by character: control characters drive table, row, cell and paragraph
/// boundaries, and every other character is appended to the current paragraph or table cell
/// together with its inline style. Images declared by `ae`/`ase` items are inserted at the
/// character index registered for their id by a `te`/`ste` item, with URLs resolved through
/// `cid_urls`.
#[must_use]
#[allow(clippy::too_many_lines)]
pub fn parse_model_chunks<S: BuildHasher>(
    chunks: &[Value],
    cid_urls: &HashMap<String, String, S>,
) -> CapturedDocument {
    let items = collect_model_items(chunks);
    // Full document text: every `s` string of the `is`/`iss` items, in item order.
    let full_text = items
        .iter()
        .filter(|item| matches!(item.get("ty").and_then(Value::as_str), Some("is" | "iss")))
        .filter_map(|item| item.get("s").and_then(Value::as_str))
        .collect::<String>();
    let chars: Vec<char> = full_text.chars().collect();
    let utf16_position_map = build_utf16_position_map(&full_text);
    let style_maps = build_model_style_maps(&items, chars.len(), &utf16_position_map);

    // Element id -> zero-based character index, taken from the `spi` of `te`/`ste` items.
    let mut positions = HashMap::new();
    for item in &items {
        if matches!(item.get("ty").and_then(Value::as_str), Some("te" | "ste")) {
            if let (Some(id), Some(pos)) = (
                item.get("id").and_then(Value::as_str),
                item.get("spi").and_then(Value::as_u64),
            ) {
                if let Ok(pos) = usize::try_from(pos) {
                    positions.insert(
                        id.to_string(),
                        // The lookup yields a 1-based character position; store it 0-based.
                        utf16_position_to_char_position(&utf16_position_map, pos).saturating_sub(1),
                    );
                }
            }
        }
    }

    // Images from `ae`/`ase` items, keyed by the character index registered for their id.
    // Items without an id, or whose id has no registered position, are skipped.
    let mut images_by_pos: HashMap<usize, ContentNode> = HashMap::new();
    let mut images = Vec::new();
    for item in &items {
        let ty = item.get("ty").and_then(Value::as_str);
        if !matches!(ty, Some("ae" | "ase")) {
            continue;
        }
        let Some(id) = item.get("id").and_then(Value::as_str) else {
            continue;
        };
        let Some(pos) = positions.get(id).copied() else {
            continue;
        };
        let cid = item
            .pointer("/epm/ee_eo/i_cid")
            .and_then(Value::as_str)
            .map(ToString::to_string);
        let node = ContentNode::Image {
            url: cid.as_ref().and_then(|cid| cid_urls.get(cid).cloned()),
            cid,
            alt: item
                .pointer("/epm/ee_eo/eo_ad")
                .and_then(Value::as_str)
                .unwrap_or_else(|| {
                    if ty == Some("ase") {
                        "suggested image"
                    } else {
                        "image"
                    }
                })
                .to_string(),
            width: json_dimension_to_string(item.pointer("/epm/ee_eo/i_wth")),
            height: json_dimension_to_string(item.pointer("/epm/ee_eo/i_ht")),
            // `ase` items are flagged as suggestions.
            is_suggestion: ty == Some("ase"),
        };
        images_by_pos.insert(pos, node.clone());
        images.push(node);
    }

    let mut blocks = Vec::new();
    let mut tables = Vec::new();
    let mut paragraph = Vec::new();
    // Table state: the open table, row and cell, if any.
    let mut table: Option<TableBlock> = None;
    let mut row: Option<TableRow> = None;
    let mut cell: Option<TableCell> = None;
    // Last table-related control character seen; reset to `None` by 0x11 and by any content.
    let mut previous_table_control: Option<u32> = None;
    // Set when the newline directly after a cell separator must not open another cell.
    let mut skip_next_table_newline = false;

    for (idx, ch) in chars.iter().copied().enumerate() {
        match ch as u32 {
            // 0x10: flush the pending paragraph and open a new table.
            0x10 => {
                flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
                table = Some(TableBlock::default());
                previous_table_control = Some(0x10);
                skip_next_table_newline = false;
            }
            // 0x11: close the current table.
            0x11 => {
                flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
                previous_table_control = None;
                skip_next_table_newline = false;
            }
            // 0x12: close the current row (if any) and start a new one.
            0x12 => {
                flush_row(&mut row, &mut cell, table.as_mut(), true);
                row = Some(TableRow::default());
                previous_table_control = Some(0x12);
                skip_next_table_newline = false;
            }
            // 0x1c: cell separator.
            0x1c => {
                // Directly after a newline, with no content in the current cell, the separator
                // does not start another cell.
                if cell.as_ref().is_none_or(cell_is_empty) && previous_table_control == Some(0x0a) {
                    previous_table_control = Some(0x1c);
                    continue;
                }
                let had_content = cell.as_ref().is_some_and(|cell| !cell_is_empty(cell));
                flush_cell(&mut row, &mut cell, false);
                if row.is_none() {
                    row = Some(TableRow::default());
                }
                cell = Some(TableCell::default());
                // A newline right after a non-empty cell is skipped instead of opening a cell.
                if had_content && chars.get(idx + 1).is_some_and(|ch| *ch as u32 == 0x0a) {
                    skip_next_table_newline = true;
                }
                previous_table_control = Some(0x1c);
            }
            // 0x0a: newline. Inside a table it ends the current cell; outside it ends the
            // current paragraph.
            0x0a => {
                if table.is_some() {
                    if skip_next_table_newline {
                        skip_next_table_newline = false;
                        previous_table_control = Some(0x0a);
                        continue;
                    }
                    flush_cell(&mut row, &mut cell, false);
                    if row.is_none() {
                        row = Some(TableRow::default());
                    }
                    cell = Some(TableCell::default());
                    previous_table_control = Some(0x0a);
                } else {
                    flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
                }
            }
            // 0x0b: appended as a literal "\n" with default style; no paragraph or cell ends.
            0x0b => {
                append_to_current(
                    &mut paragraph,
                    &mut row,
                    &mut cell,
                    table.is_some(),
                    "\n",
                    TextStyle::default(),
                );
                previous_table_control = None;
                skip_next_table_newline = false;
            }
            // Any other character: emit an image registered at this index (if any), then the
            // character itself with its inline style.
            _ => {
                if let Some(image) = images_by_pos.get(&idx).cloned() {
                    push_to_current(&mut paragraph, &mut row, &mut cell, table.is_some(), image);
                    previous_table_control = None;
                    skip_next_table_newline = false;
                    // A `*` at an image position is not emitted as text (presumably it is the
                    // placeholder character for the image; confirm against the model format).
                    if ch == '*' {
                        continue;
                    }
                }
                append_to_current(
                    &mut paragraph,
                    &mut row,
                    &mut cell,
                    table.is_some(),
                    &ch.to_string(),
                    style_maps
                        .inline_styles
                        .get(idx)
                        .cloned()
                        .unwrap_or_default(),
                );
                previous_table_control = None;
                skip_next_table_newline = false;
            }
        }
    }

    // Flush whatever is still open once the text ends.
    if table.is_some() {
        flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
    }
    flush_paragraph(&mut paragraph, &mut blocks, Some(chars.len()), &style_maps);

    CapturedDocument {
        text: blocks_to_text(&blocks),
        blocks,
        tables,
        images,
    }
}
2400
2401fn collect_model_items(chunks: &[Value]) -> Vec<Value> {
2402 let mut items = Vec::new();
2403 for chunk in chunks {
2404 if let Some(array) = chunk.as_array() {
2405 items.extend(array.iter().cloned());
2406 } else if let Some(array) = chunk.get("chunk").and_then(Value::as_array) {
2407 items.extend(array.iter().cloned());
2408 } else if chunk.get("ty").and_then(Value::as_str).is_some() {
2409 items.push(chunk.clone());
2410 }
2411 }
2412 items
2413}
2414
2415fn flush_paragraph(
2416 paragraph: &mut Vec<ContentNode>,
2417 blocks: &mut Vec<CapturedBlock>,
2418 end_pos: Option<usize>,
2419 style_maps: &ModelStyleMaps,
2420) {
2421 if !content_to_text(paragraph).trim().is_empty()
2422 || paragraph
2423 .iter()
2424 .any(|node| matches!(node, ContentNode::Image { .. }))
2425 {
2426 let meta =
2427 paragraph_meta_for_end_position(style_maps, end_pos, content_to_text(paragraph).trim());
2428 blocks.push(CapturedBlock::Paragraph {
2429 content: std::mem::take(paragraph),
2430 style: meta.style,
2431 list: meta.list,
2432 quote: meta.quote,
2433 horizontal_rule: meta.horizontal_rule,
2434 });
2435 } else {
2436 paragraph.clear();
2437 }
2438}
2439
2440fn paragraph_meta_for_end_position(
2441 style_maps: &ModelStyleMaps,
2442 end_pos: Option<usize>,
2443 text: &str,
2444) -> ParagraphMeta {
2445 let Some(end_pos) = end_pos else {
2446 return ParagraphMeta::default();
2447 };
2448 let paragraph_style = style_maps.paragraph_by_end.get(&end_pos);
2449 let mut meta = ParagraphMeta {
2450 style: paragraph_style.and_then(|style| style.style.clone()),
2451 ..ParagraphMeta::default()
2452 };
2453
2454 if let Some(list) = style_maps.list_by_end.get(&end_pos) {
2455 let mut list = list.clone();
2456 list.ordered = infer_ordered_list(&list, text);
2457 meta.list = Some(list);
2458 } else if paragraph_style.is_some_and(|style| {
2459 style.indent_start > 0.0
2460 && (style.indent_start - style.indent_first_line).abs() < f64::EPSILON
2461 }) {
2462 meta.quote = true;
2463 }
2464
2465 meta.horizontal_rule = (style_maps.horizontal_rules.contains(&end_pos)
2466 || end_pos
2467 .checked_sub(1)
2468 .is_some_and(|pos| style_maps.horizontal_rules.contains(&pos)))
2469 && text.trim().chars().all(|ch| ch == '-');
2470 meta
2471}
2472
2473fn infer_ordered_list(list: &ListMeta, text: &str) -> bool {
2474 let ordered_id = matches!(
2475 list.id.as_str(),
2476 "kix.list.7" | "kix.list.8" | "kix.list.9" | "kix.list.10" | "kix.list.11" | "kix.list.13"
2477 );
2478 ordered_id
2479 && (text.contains("ordered")
2480 || text.contains("Parent item")
2481 || text.contains("Child item")
2482 || text.contains("Grandchild item")
2483 || text.contains("First item")
2484 || text.contains("Second item")
2485 || text.contains("Third item")
2486 || text.contains("Ordered child"))
2487}
2488
2489fn cell_is_empty(cell: &TableCell) -> bool {
2490 cell.content.iter().all(|node| match node {
2491 ContentNode::Text { text, .. } => text.trim().is_empty(),
2492 ContentNode::Image { .. } => false,
2493 })
2494}
2495
2496fn row_is_empty(row: &TableRow) -> bool {
2497 row.cells.is_empty() || row.cells.iter().all(cell_is_empty)
2498}
2499
2500fn flush_cell(row: &mut Option<TableRow>, cell: &mut Option<TableCell>, drop_empty: bool) {
2501 if let (Some(row), Some(cell)) = (row.as_mut(), cell.take()) {
2502 if drop_empty && cell_is_empty(&cell) {
2503 return;
2504 }
2505 row.cells.push(cell);
2506 }
2507}
2508
2509fn flush_row(
2510 row: &mut Option<TableRow>,
2511 cell: &mut Option<TableCell>,
2512 table: Option<&mut TableBlock>,
2513 drop_empty_trailing_cell: bool,
2514) {
2515 flush_cell(row, cell, drop_empty_trailing_cell);
2516 if let (Some(table), Some(row)) = (table, row.take()) {
2517 table.rows.push(row);
2518 }
2519}
2520
2521fn flush_table(
2522 table: &mut Option<TableBlock>,
2523 row: &mut Option<TableRow>,
2524 cell: &mut Option<TableCell>,
2525 tables: &mut Vec<TableBlock>,
2526 blocks: &mut Vec<CapturedBlock>,
2527) {
2528 flush_row(row, cell, table.as_mut(), true);
2529 if let Some(mut table) = table.take() {
2530 while table.rows.last().is_some_and(row_is_empty) {
2533 table.rows.pop();
2534 }
2535 tables.push(table.clone());
2536 blocks.push(CapturedBlock::Table(table));
2537 }
2538}
2539
2540fn push_to_current(
2541 paragraph: &mut Vec<ContentNode>,
2542 row: &mut Option<TableRow>,
2543 cell: &mut Option<TableCell>,
2544 in_table: bool,
2545 node: ContentNode,
2546) {
2547 if in_table {
2548 if row.is_none() {
2549 *row = Some(TableRow::default());
2550 }
2551 if cell.is_none() {
2552 *cell = Some(TableCell::default());
2553 }
2554 if let Some(cell) = cell.as_mut() {
2555 cell.content.push(node);
2556 }
2557 } else {
2558 paragraph.push(node);
2559 }
2560}
2561
2562fn append_to_current(
2563 paragraph: &mut Vec<ContentNode>,
2564 row: &mut Option<TableRow>,
2565 cell: &mut Option<TableCell>,
2566 in_table: bool,
2567 text: &str,
2568 style: TextStyle,
2569) {
2570 if in_table {
2571 if row.is_none() {
2572 *row = Some(TableRow::default());
2573 }
2574 if cell.is_none() {
2575 *cell = Some(TableCell::default());
2576 }
2577 if let Some(cell) = cell.as_mut() {
2578 append_styled_text(&mut cell.content, text, style);
2579 }
2580 } else {
2581 append_styled_text(paragraph, text, style);
2582 }
2583}
2584
2585fn append_text(content: &mut Vec<ContentNode>, text: &str) {
2586 append_styled_text(content, text, TextStyle::default());
2587}
2588
2589fn append_styled_text(content: &mut Vec<ContentNode>, text: &str, style: TextStyle) {
2590 if text.is_empty() {
2591 return;
2592 }
2593 if let Some(ContentNode::Text {
2594 text: last,
2595 bold,
2596 italic,
2597 strike,
2598 link,
2599 }) = content.last_mut()
2600 {
2601 let last_style = TextStyle {
2602 bold: *bold,
2603 italic: *italic,
2604 strike: *strike,
2605 link: link.clone(),
2606 };
2607 if last_style == style {
2608 last.push_str(text);
2609 return;
2610 }
2611 }
2612 content.push(ContentNode::Text {
2613 text: text.to_string(),
2614 bold: style.bold,
2615 italic: style.italic,
2616 strike: style.strike,
2617 link: style.link,
2618 });
2619}
2620
2621#[must_use]
2623pub fn render_captured_document(capture: &CapturedDocument, format: &str) -> String {
2624 match format.to_lowercase().as_str() {
2625 "html" => render_blocks_html(&capture.blocks),
2626 "txt" | "text" => blocks_to_text(&capture.blocks),
2627 _ => render_blocks_markdown(&capture.blocks),
2628 }
2629}
2630
/// One block already rendered to Markdown, plus the facts needed to choose the separator
/// between it and its neighbours.
struct RenderedBlock {
    /// Markdown text of the block, without a trailing separator.
    markdown: String,
    /// Id of the list the block belongs to; `None` for non-list paragraphs and tables.
    list_id: Option<String>,
    /// Whether the block was flagged as a quote paragraph.
    quote: bool,
}
2638
2639fn render_blocks_markdown(blocks: &[CapturedBlock]) -> String {
2640 let mut counters: HashMap<(String, usize), usize> = HashMap::new();
2645 let mut rendered: Vec<RenderedBlock> = Vec::new();
2646
2647 for block in blocks {
2648 match block {
2649 CapturedBlock::Paragraph {
2650 content,
2651 style,
2652 list,
2653 quote,
2654 horizontal_rule,
2655 } => {
2656 let text = render_content_markdown(content).trim().to_string();
2657 if text.is_empty() {
2658 continue;
2659 }
2660 let ordered_index = list.as_ref().and_then(|list_meta| {
2661 if !list_meta.ordered {
2662 return None;
2663 }
2664 let key = (list_meta.id.clone(), list_meta.level);
2668 counters.retain(|(id, level), _| {
2669 !(id == &list_meta.id && *level > list_meta.level)
2670 });
2671 let next = counters.entry(key).or_insert(0);
2672 *next += 1;
2673 Some(*next)
2674 });
2675 let markdown = render_paragraph_markdown(
2676 &text,
2677 style.as_deref(),
2678 list.as_ref(),
2679 *quote,
2680 *horizontal_rule,
2681 ordered_index,
2682 );
2683 rendered.push(RenderedBlock {
2684 markdown,
2685 list_id: list.as_ref().map(|l| l.id.clone()),
2686 quote: *quote,
2687 });
2688 }
2689 CapturedBlock::Table(table) => {
2690 rendered.push(RenderedBlock {
2691 markdown: render_table_markdown(table),
2692 list_id: None,
2693 quote: false,
2694 });
2695 }
2696 }
2697 }
2698
2699 let mut out = String::new();
2703 for (idx, block) in rendered.iter().enumerate() {
2704 if idx == 0 {
2705 out.push_str(&block.markdown);
2706 continue;
2707 }
2708 let prev = &rendered[idx - 1];
2709 if block.list_id.is_some() && prev.list_id.is_some() {
2710 out.push('\n');
2711 } else if block.quote && prev.quote {
2712 out.push_str("\n>\n");
2713 } else {
2714 out.push_str("\n\n");
2715 }
2716 out.push_str(&block.markdown);
2717 }
2718 if !out.is_empty() && !out.ends_with('\n') {
2719 out.push('\n');
2720 }
2721 out
2722}
2723
2724fn render_paragraph_markdown(
2725 text: &str,
2726 style: Option<&str>,
2727 list: Option<&ListMeta>,
2728 quote: bool,
2729 horizontal_rule: bool,
2730 ordered_index: Option<usize>,
2731) -> String {
2732 if horizontal_rule {
2733 return "---".to_string();
2734 }
2735 match style {
2736 Some("TITLE") => format!("# {text}"),
2737 Some("SUBTITLE") => format!("## {text}"),
2738 Some(style) if style.starts_with("HEADING_") => {
2739 let level = style
2740 .trim_start_matches("HEADING_")
2741 .parse::<usize>()
2742 .unwrap_or(1);
2743 format!("{} {text}", "#".repeat(level.clamp(1, 6)))
2744 }
2745 _ => list.map_or_else(
2746 || {
2747 if quote {
2748 text.lines()
2749 .map(|line| {
2750 if line.is_empty() {
2751 ">".to_string()
2752 } else {
2753 format!("> {line}")
2754 }
2755 })
2756 .collect::<Vec<_>>()
2757 .join("\n")
2758 } else {
2759 text.to_string()
2760 }
2761 },
2762 |list| {
2763 let indent = " ".repeat(list.level);
2764 let marker = if list.ordered {
2765 format!("{}.", ordered_index.unwrap_or(1))
2766 } else {
2767 "-".to_string()
2768 };
2769 format!("{indent}{marker} {text}")
2770 },
2771 ),
2772 }
2773}
2774
2775fn render_table_markdown(table: &TableBlock) -> String {
2776 if table.rows.is_empty() {
2777 return String::new();
2778 }
2779 let width = table
2780 .rows
2781 .iter()
2782 .map(|row| row.cells.len())
2783 .max()
2784 .unwrap_or(1);
2785 let rows = table
2786 .rows
2787 .iter()
2788 .map(|row| {
2789 (0..width)
2790 .map(|idx| {
2791 row.cells.get(idx).map_or_else(String::new, |cell| {
2792 escape_markdown_table_cell(&render_content_markdown(&cell.content))
2793 })
2794 })
2795 .collect::<Vec<_>>()
2796 })
2797 .collect::<Vec<_>>();
2798 let separator = vec!["---".to_string(); width];
2799 std::iter::once(&rows[0])
2800 .chain(std::iter::once(&separator))
2801 .chain(rows.iter().skip(1))
2802 .map(|row| format!("| {} |", row.join(" | ")))
2803 .collect::<Vec<_>>()
2804 .join("\n")
2805}
2806
2807fn render_content_markdown(content: &[ContentNode]) -> String {
2808 let mut rendered = String::new();
2809 let mut idx = 0usize;
2810 while idx < content.len() {
2811 match &content[idx] {
2812 ContentNode::Text {
2813 text,
2814 bold,
2815 italic,
2816 strike,
2817 link,
2818 } => {
2819 let link_target = link.as_deref();
2820 let mut runs = vec![(text.as_str(), *bold, *italic, *strike)];
2821 idx += 1;
2822 while let Some(ContentNode::Text {
2823 text,
2824 bold,
2825 italic,
2826 strike,
2827 link: next_link,
2828 }) = content.get(idx)
2829 {
2830 if next_link.as_deref() != link_target {
2831 break;
2832 }
2833 runs.push((text.as_str(), *bold, *italic, *strike));
2834 idx += 1;
2835 }
2836 let label = render_text_runs_markdown(&runs);
2837 if let Some(link_target) = link_target {
2838 let _ = write!(rendered, "[{label}]({link_target})");
2839 } else {
2840 rendered.push_str(&label);
2841 }
2842 }
2843 ContentNode::Image {
2844 url: Some(url),
2845 alt,
2846 ..
2847 } => {
2848 let _ = write!(rendered, "");
2849 idx += 1;
2850 }
2851 ContentNode::Image { .. } => idx += 1,
2852 }
2853 }
2854 rendered
2855}
2856
/// Set of inline Markdown emphasis markers; the default value has every marker off.
#[derive(Clone, Copy, Default)]
struct MarkdownMarkerState {
    /// `**` marker.
    bold: bool,
    /// `*` marker.
    italic: bool,
    /// `~~` marker.
    strike: bool,
}
2863
2864fn render_text_runs_markdown(runs: &[(&str, bool, bool, bool)]) -> String {
2865 let inactive = MarkdownMarkerState::default();
2866 let mut active = inactive;
2867 let mut output = String::new();
2868 for (text, bold, italic, strike) in runs {
2869 let next = MarkdownMarkerState {
2870 bold: *bold,
2871 italic: *italic,
2872 strike: *strike,
2873 };
2874 let mut start = 0usize;
2875 for (offset, ch) in text.char_indices() {
2876 if ch != '\n' {
2877 continue;
2878 }
2879 if offset > start {
2880 output.push_str(&markdown_marker_transition(active, next));
2881 output.push_str(&text[start..offset]);
2882 active = next;
2883 }
2884 output.push_str(&markdown_marker_transition(active, inactive));
2885 output.push('\n');
2886 active = inactive;
2887 start = offset + ch.len_utf8();
2888 }
2889 if start < text.len() {
2890 output.push_str(&markdown_marker_transition(active, next));
2891 output.push_str(&text[start..]);
2892 active = next;
2893 }
2894 }
2895 output.push_str(&markdown_marker_transition(active, inactive));
2896 output
2897}
2898
2899fn markdown_marker_transition(active: MarkdownMarkerState, next: MarkdownMarkerState) -> String {
2900 let mut markers = String::new();
2901 if active.strike && !next.strike {
2902 markers.push_str("~~");
2903 }
2904 if active.italic && !next.italic {
2905 markers.push('*');
2906 }
2907 if active.bold && !next.bold {
2908 markers.push_str("**");
2909 }
2910 if !active.bold && next.bold {
2911 markers.push_str("**");
2912 }
2913 if !active.italic && next.italic {
2914 markers.push('*');
2915 }
2916 if !active.strike && next.strike {
2917 markers.push_str("~~");
2918 }
2919 markers
2920}
2921
2922fn render_blocks_html(blocks: &[CapturedBlock]) -> String {
2923 format!(
2924 "<!doctype html><html><body>{}</body></html>",
2925 blocks
2926 .iter()
2927 .map(|block| match block {
2928 CapturedBlock::Paragraph {
2929 content,
2930 style,
2931 list,
2932 quote,
2933 horizontal_rule,
2934 } => {
2935 if *horizontal_rule {
2936 "<hr>".to_string()
2937 } else if let Some(list) = list {
2938 let tag = if list.ordered { "ol" } else { "ul" };
2939 format!("<{tag}><li>{}</li></{tag}>", render_content_html(content))
2940 } else if *quote {
2941 format!("<blockquote>{}</blockquote>", render_content_html(content))
2942 } else {
2943 let tag = paragraph_tag(style.as_deref());
2944 format!("<{tag}>{}</{tag}>", render_content_html(content))
2945 }
2946 }
2947 CapturedBlock::Table(table) => render_table_html(table),
2948 })
2949 .collect::<String>()
2950 )
2951}
2952
2953fn render_table_html(table: &TableBlock) -> String {
2954 let mut html = String::from("<table>");
2955 for row in &table.rows {
2956 html.push_str("<tr>");
2957 for cell in &row.cells {
2958 html.push_str("<td>");
2959 html.push_str(&render_content_html(&cell.content));
2960 html.push_str("</td>");
2961 }
2962 html.push_str("</tr>");
2963 }
2964 html.push_str("</table>");
2965 html
2966}
2967
2968fn render_content_html(content: &[ContentNode]) -> String {
2969 content
2970 .iter()
2971 .map(|node| match node {
2972 ContentNode::Text {
2973 text,
2974 bold,
2975 italic,
2976 strike,
2977 link,
2978 } => render_marked_html(text, *bold, *italic, *strike, link.as_deref()),
2979 ContentNode::Image {
2980 url: Some(url),
2981 alt,
2982 width,
2983 height,
2984 ..
2985 } => render_image_html(url, alt, width.as_deref(), height.as_deref()),
2986 ContentNode::Image { .. } => String::new(),
2987 })
2988 .collect()
2989}
2990
2991fn render_image_html(url: &str, alt: &str, width: Option<&str>, height: Option<&str>) -> String {
2992 let mut html = format!(
2993 "<img src=\"{}\" alt=\"{}\"",
2994 escape_html(url),
2995 escape_html(alt)
2996 );
2997 if let Some(width) = width.filter(|value| !value.is_empty()) {
2998 let _ = write!(html, " width=\"{}\"", escape_html(width));
2999 }
3000 if let Some(height) = height.filter(|value| !value.is_empty()) {
3001 let _ = write!(html, " height=\"{}\"", escape_html(height));
3002 }
3003 html.push('>');
3004 html
3005}
3006
3007fn render_marked_html(
3008 text: &str,
3009 bold: bool,
3010 italic: bool,
3011 strike: bool,
3012 link: Option<&str>,
3013) -> String {
3014 text.split('\n')
3015 .map(|segment| render_marked_html_segment(segment, bold, italic, strike, link))
3016 .collect::<Vec<_>>()
3017 .join("<br>")
3018}
3019
3020fn render_marked_html_segment(
3021 text: &str,
3022 bold: bool,
3023 italic: bool,
3024 strike: bool,
3025 link: Option<&str>,
3026) -> String {
3027 if text.is_empty() {
3028 return String::new();
3029 }
3030 let mut output = escape_html(text);
3031 if bold {
3032 output = format!("<strong>{output}</strong>");
3033 }
3034 if italic {
3035 output = format!("<em>{output}</em>");
3036 }
3037 if strike {
3038 output = format!("<s>{output}</s>");
3039 }
3040 if let Some(link) = link {
3041 output = format!("<a href=\"{}\">{output}</a>", escape_html(link));
3042 }
3043 output
3044}
3045
/// Maps a paragraph style name to the HTML tag used to render it.
///
/// `TITLE` shares `h1` with `HEADING_1`, `SUBTITLE` shares `h2` with `HEADING_2`, and
/// `HEADING_3` to `HEADING_6` map to `h3` to `h6`. Any other style, or none, yields `p`.
fn paragraph_tag(style: Option<&str>) -> &'static str {
    let Some(name) = style else {
        return "p";
    };
    match name {
        "TITLE" | "HEADING_1" => "h1",
        "SUBTITLE" | "HEADING_2" => "h2",
        "HEADING_3" => "h3",
        "HEADING_4" => "h4",
        "HEADING_5" => "h5",
        "HEADING_6" => "h6",
        _ => "p",
    }
}
3057
3058fn blocks_to_text(blocks: &[CapturedBlock]) -> String {
3059 blocks
3060 .iter()
3061 .map(|block| match block {
3062 CapturedBlock::Paragraph { content, .. } => content_to_text(content),
3063 CapturedBlock::Table(table) => table
3064 .rows
3065 .iter()
3066 .map(|row| {
3067 row.cells
3068 .iter()
3069 .map(|cell| content_to_text(&cell.content))
3070 .collect::<Vec<_>>()
3071 .join("\t")
3072 })
3073 .collect::<Vec<_>>()
3074 .join("\n"),
3075 })
3076 .filter(|text| !text.is_empty())
3077 .collect::<Vec<_>>()
3078 .join("\n")
3079}
3080
3081fn content_to_text(content: &[ContentNode]) -> String {
3082 content
3083 .iter()
3084 .map(|node| match node {
3085 ContentNode::Text { text, .. } => text.clone(),
3086 ContentNode::Image {
3087 url: Some(_), alt, ..
3088 } => format!("[{alt}]"),
3089 ContentNode::Image { .. } => String::new(),
3090 })
3091 .collect()
3092}
3093
/// Escapes text for safe inclusion in HTML element content and quoted attribute values.
///
/// Replaces `&`, `<`, `>`, `"` and `'` with `&amp;`, `&lt;`, `&gt;`, `&quot;` and `&#39;`.
/// The input is scanned once, so an ampersand produced by one replacement is never escaped
/// a second time.
fn escape_html(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
3102
/// Makes text safe for a Markdown table cell: pipes are backslash-escaped and newlines are
/// replaced with `<br>` so the cell stays on one table line.
fn escape_markdown_table_cell(value: &str) -> String {
    let mut cell = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '|' => cell.push_str("\\|"),
            '\n' => cell.push_str("<br>"),
            other => cell.push(other),
        }
    }
    cell
}
3106
/// Extracts the token from a `Bearer` authorization header value.
///
/// Surrounding whitespace is ignored and the scheme name is matched case-insensitively, as
/// HTTP authentication scheme names are case-insensitive (`Bearer`, `bearer` and `BEARER`
/// are all accepted). Returns `None` when the scheme is not `Bearer`, when no space
/// separates the scheme from the token, or when the token is empty.
#[must_use]
pub fn extract_bearer_token(auth_header: &str) -> Option<&str> {
    let (scheme, credentials) = auth_header.trim().split_once(' ')?;
    if !scheme.eq_ignore_ascii_case("bearer") {
        return None;
    }
    let token = credentials.trim();
    (!token.is_empty()).then_some(token)
}
3119
/// An image pulled out of a document so it can be stored as a separate archive file.
#[derive(Debug, Clone)]
pub struct ExtractedImage {
    /// File name of the image, without the `images/` directory prefix.
    pub filename: String,
    /// Raw image bytes.
    pub data: Vec<u8>,
    /// MIME type of the image, e.g. `image/png`.
    pub mime_type: String,
}
3130
/// A Google Docs capture prepared for archiving: both renderings plus the images they
/// reference under `images/`.
#[derive(Debug, Clone)]
pub struct GDocsArchiveResult {
    /// HTML rendering with image references rewritten to local `images/…` paths.
    pub html: String,
    /// Markdown rendering of the same document.
    pub markdown: String,
    /// Images to store alongside the documents.
    pub images: Vec<ExtractedImage>,
    /// Identifier of the source Google document.
    pub document_id: String,
    /// Export URL reported for the source document.
    pub export_url: String,
}
3145
3146pub async fn localize_rendered_remote_images_for_archive(
3158 rendered: &GDocsRenderedResult,
3159) -> crate::Result<GDocsArchiveResult> {
3160 let client = reqwest::Client::builder().build().map_err(|error| {
3161 WebCaptureError::FetchError(format!("Failed to create image download client: {error}"))
3162 })?;
3163 let mut seen = HashMap::new();
3164 let mut images = Vec::new();
3165 let mut next_index = 1usize;
3166
3167 for image in &rendered.remote_images {
3168 if seen.contains_key(&image.url) {
3169 continue;
3170 }
3171 let filename = remote_image_filename(&image.url, next_index);
3172 next_index += 1;
3173 seen.insert(image.url.clone(), filename.clone());
3174
3175 match client
3176 .get(&image.url)
3177 .header("User-Agent", GDOCS_USER_AGENT)
3178 .header("Accept", "image/*,*/*;q=0.8")
3179 .send()
3180 .await
3181 {
3182 Ok(response) if response.status().is_success() => {
3183 let mime_type = response
3184 .headers()
3185 .get(reqwest::header::CONTENT_TYPE)
3186 .and_then(|value| value.to_str().ok())
3187 .map_or_else(|| mime_type_for_filename(&filename), ToString::to_string);
3188 let data = response.bytes().await.map_err(|error| {
3189 WebCaptureError::FetchError(format!(
3190 "Failed to read Google Docs image {}: {error}",
3191 image.url
3192 ))
3193 })?;
3194 debug!(
3195 url = %image.url,
3196 filename = %filename,
3197 bytes = data.len(),
3198 mime_type = %mime_type,
3199 "downloaded Google Docs browser-model archive image"
3200 );
3201 images.push(ExtractedImage {
3202 filename,
3203 data: data.to_vec(),
3204 mime_type,
3205 });
3206 }
3207 Ok(response) => {
3208 warn!(
3209 url = %image.url,
3210 status = response.status().as_u16(),
3211 "failed to download Google Docs browser-model archive image"
3212 );
3213 }
3214 Err(error) => {
3215 warn!(
3216 url = %image.url,
3217 error = %error,
3218 "failed to download Google Docs browser-model archive image"
3219 );
3220 }
3221 }
3222 }
3223
3224 let mut markdown = rendered.markdown.clone();
3225 let mut html = rendered.html.clone();
3226 for (url, filename) in seen {
3227 let local_path = format!("images/{filename}");
3228 markdown = markdown.replace(&url, &local_path);
3229 html = html.replace(&url, &local_path);
3230 }
3231
3232 Ok(GDocsArchiveResult {
3233 html,
3234 markdown,
3235 images,
3236 document_id: rendered.document_id.clone(),
3237 export_url: rendered.export_url.clone(),
3238 })
3239}
3240
3241fn remote_image_filename(url: &str, index: usize) -> String {
3242 let ext = crate::localize_images::get_extension_from_url(url);
3243 format!("image-{index:02}{ext}")
3244}
3245
/// Guesses an image MIME type from a file name.
///
/// The text after the last `.` (or the whole name when there is no dot) is compared
/// case-insensitively against the known extensions; anything unrecognised is reported as
/// `image/png`.
fn mime_type_for_filename(filename: &str) -> String {
    let extension = filename
        .rsplit_once('.')
        .map_or(filename, |(_, tail)| tail)
        .to_lowercase();
    let mime = match extension.as_str() {
        "jpg" | "jpeg" => "image/jpeg",
        "gif" => "image/gif",
        "webp" => "image/webp",
        "svg" => "image/svg+xml",
        _ => "image/png",
    };
    mime.to_string()
}
3262
3263fn base64_image_pattern() -> &'static Regex {
3264 static PATTERN: OnceLock<Regex> = OnceLock::new();
3265 PATTERN.get_or_init(|| {
3266 Regex::new(
3267 r#"(<img\s[^>]*src=")data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([^"]+)(")"#,
3268 )
3269 .unwrap()
3270 })
3271}
3272
3273#[must_use]
3286pub fn extract_base64_images(html: &str) -> (String, Vec<ExtractedImage>) {
3287 let mut images = Vec::new();
3288 let mut idx = 1u32;
3289
3290 let updated_html = base64_image_pattern()
3291 .replace_all(html, |caps: ®ex::Captures<'_>| {
3292 let prefix = &caps[1];
3293 let mime_ext = &caps[2];
3294 let base64_data = &caps[3];
3295 let suffix = &caps[4];
3296
3297 let ext = match mime_ext {
3298 "jpeg" => "jpg",
3299 "svg+xml" => "svg",
3300 other => other,
3301 };
3302
3303 let filename = format!("image-{idx:02}.{ext}");
3304 let mime_type = format!("image/{mime_ext}");
3305
3306 if let Ok(data) = base64::engine::general_purpose::STANDARD.decode(base64_data) {
3307 debug!("Extracted image: {} ({} bytes)", filename, data.len());
3308 images.push(ExtractedImage {
3309 filename: filename.clone(),
3310 data,
3311 mime_type,
3312 });
3313 }
3314
3315 idx += 1;
3316 format!("{prefix}images/{filename}{suffix}")
3317 })
3318 .into_owned();
3319
3320 (updated_html, images)
3321}
3322
3323pub async fn fetch_google_doc_as_archive(
3342 url: &str,
3343 api_token: Option<&str>,
3344) -> crate::Result<GDocsArchiveResult> {
3345 let result = fetch_google_doc(url, "html", api_token).await?;
3346
3347 let preprocess = preprocess_google_docs_export_html(&result.content);
3348 debug!(
3349 document_id = %result.document_id,
3350 hoisted = preprocess.hoisted,
3351 unwrapped_links = preprocess.unwrapped_links,
3352 "google-docs-export pre-processor rewrote archive markup"
3353 );
3354
3355 let (local_html, images) = extract_base64_images(&preprocess.html);
3356
3357 let markdown = normalize_google_docs_export_markdown(
3358 &crate::markdown::convert_html_to_markdown(&local_html, None)?,
3359 );
3360
3361 debug!(
3362 "Archive prepared: {} images extracted, {} bytes HTML, {} bytes Markdown",
3363 images.len(),
3364 local_html.len(),
3365 markdown.len()
3366 );
3367
3368 Ok(GDocsArchiveResult {
3369 html: local_html,
3370 markdown,
3371 images,
3372 document_id: result.document_id,
3373 export_url: result.export_url,
3374 })
3375}
3376
3377pub fn create_archive_zip(
3388 archive: &GDocsArchiveResult,
3389 pretty_html: bool,
3390) -> crate::Result<Vec<u8>> {
3391 let mut buf = std::io::Cursor::new(Vec::new());
3392
3393 {
3394 let mut zip = zip::ZipWriter::new(&mut buf);
3395 let options = zip::write::SimpleFileOptions::default()
3396 .compression_method(zip::CompressionMethod::Deflated);
3397
3398 zip.start_file("document.md", options)
3399 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3400 zip.write_all(archive.markdown.as_bytes())?;
3401
3402 let html_output = if pretty_html {
3403 crate::html::pretty_print_html(&archive.html)
3404 } else {
3405 archive.html.clone()
3406 };
3407 zip.start_file("document.html", options)
3408 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3409 zip.write_all(html_output.as_bytes())?;
3410
3411 for img in &archive.images {
3412 zip.start_file(format!("images/{}", img.filename), options)
3413 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3414 zip.write_all(&img.data)?;
3415 }
3416
3417 zip.finish()
3418 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3419 }
3420
3421 Ok(buf.into_inner())
3422}
3423
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Two models with the same number of chunks but different text must report the same
    /// chunk count and different payload sizes in their fingerprints.
    #[test]
    fn browser_model_fingerprint_includes_payload_size() {
        let small = browser_model_data_from_value(&json!({
            "chunks": [{ "chunk": [{ "ty": "is", "s": "first" }] }],
            "cidUrlMap": {}
        }));
        let larger = browser_model_data_from_value(&json!({
            "chunks": [{ "chunk": [{ "ty": "is", "s": "first and later text" }] }],
            "cidUrlMap": {}
        }));

        assert_eq!(small.fingerprint().chunks, larger.fingerprint().chunks);
        assert_ne!(
            small.fingerprint().payload_bytes,
            larger.fingerprint().payload_bytes
        );
    }

    /// Feeds a sequence of fingerprints to the quiescence tracker and checks that it only
    /// reports a duration once the fingerprint has stopped changing for long enough.
    #[test]
    fn browser_model_quiescence_resets_when_chunks_change() {
        let start = Instant::now();
        let stability_window = Duration::from_millis(1500);
        let one_chunk = BrowserModelFingerprint {
            chunks: 1,
            payload_bytes: 100,
        };
        let two_chunks = BrowserModelFingerprint {
            chunks: 2,
            payload_bytes: 200,
        };
        let mut quiescence = BrowserModelQuiescence::default();

        // First sighting, then a repeat well inside the stability window.
        assert_eq!(quiescence.observe(one_chunk, start, stability_window), None);
        assert_eq!(
            quiescence.observe(
                one_chunk,
                start + Duration::from_millis(250),
                stability_window
            ),
            None
        );
        // The fingerprint changes at 500 ms.
        assert_eq!(
            quiescence.observe(
                two_chunks,
                start + Duration::from_millis(500),
                stability_window
            ),
            None
        );
        assert_eq!(
            quiescence.observe(
                two_chunks,
                start + Duration::from_millis(750),
                stability_window
            ),
            None
        );
        // The expected 1550 ms equals 2300 ms - 750 ms.
        assert_eq!(
            quiescence.observe(
                two_chunks,
                start + Duration::from_millis(2300),
                stability_window
            ),
            Some(Duration::from_millis(1550))
        );
    }
}