web_capture/
gdocs.rs

1//! Google Docs capture module.
2//!
3//! Supports API-based capture of Google Docs documents via the export URL pattern:
4//! `https://docs.google.com/document/d/{DOCUMENT_ID}/export?format={FORMAT}`
5//!
6//! # Supported Export Formats
7//!
8//! - `html` — HTML document (images as base64 data URIs)
9//! - `txt` — Plain text
10//! - `md` — Markdown (native Google Docs export)
11//! - `pdf` — PDF document
12//! - `docx` — Microsoft Word document
13//! - `epub` — EPUB ebook format
14//!
15//! # Example
16//!
17//! ```rust,no_run
18//! use web_capture::gdocs;
19//!
20//! #[tokio::main]
21//! async fn main() -> anyhow::Result<()> {
22//!     let url = "https://docs.google.com/document/d/abc123/edit";
23//!     if gdocs::is_google_docs_url(url) {
24//!         let result = gdocs::fetch_google_doc(url, "html", None).await?;
25//!         println!("Content length: {}", result.content.len());
26//!     }
27//!     Ok(())
28//! }
29//! ```
30
31use async_tungstenite::tokio::{connect_async, ConnectStream};
32use async_tungstenite::tungstenite::Message;
33use async_tungstenite::WebSocketStream;
34use base64::Engine;
35use futures::{SinkExt, StreamExt};
36use regex::Regex;
37use serde_json::Value;
38use std::collections::HashMap;
39use std::fmt::Write as _;
40use std::hash::BuildHasher;
41use std::io::Write;
42use std::process::Stdio;
43use std::sync::OnceLock;
44use std::time::{Duration, Instant};
45use tokio::io::{AsyncBufReadExt, BufReader};
46use tokio::process::{Child, Command};
47use tracing::{debug, info, warn};
48
49use crate::WebCaptureError;
50
/// Base URL for per-document web endpoints; the export and editor URL builders
/// append `/{document_id}/...` to it.
const GDOCS_EXPORT_BASE: &str = "https://docs.google.com/document/d";
/// Base URL of the Google Docs REST API `documents` resource.
const GDOCS_API_BASE: &str = "https://docs.googleapis.com/v1/documents";
/// Desktop-Chrome-style `User-Agent` string for Google Docs requests.
const GDOCS_USER_AGENT: &str =
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
// NOTE(review): presumably the upper bound for waiting until the editor model
// data shows up in the page; the usage is outside this excerpt — confirm.
const GDOCS_EDITOR_MODEL_WAIT: Duration = Duration::from_secs(30);
// NOTE(review): presumably the upper bound for launching the browser process;
// the usage is outside this excerpt — confirm.
const GDOCS_BROWSER_LAUNCH_TIMEOUT: Duration = Duration::from_secs(20);

/// WebSocket stream type over the tokio `ConnectStream`.
// NOTE(review): "Cdp" presumably stands for Chrome DevTools Protocol — confirm.
type CdpWebSocket = WebSocketStream<ConnectStream>;
59
/// JavaScript that records every value assigned to `window.DOCS_modelChunk`.
///
/// The script initializes `window.__captured_chunks` and installs a
/// non-configurable accessor property on `window.DOCS_modelChunk`:
///
/// - the setter stores a JSON round-tripped copy of each assigned value
///   (arrays are flattened item by item; the raw value is stored when the
///   JSON round trip throws) and keeps the latest value readable via the getter;
/// - assigned arrays get their `push` method wrapped, so items appended later
///   are captured as well.
// NOTE(review): presumably injected before the page's own scripts run so that no
// assignment is missed — the injection site is outside this excerpt; confirm.
const GDOCS_MODEL_CAPTURE_INIT_SCRIPT: &str = r"
window.__captured_chunks = [];
const captureChunk = (value) => {
  if (!value) {
    return;
  }
  if (Array.isArray(value)) {
    for (const item of value) {
      captureChunk(item);
    }
    return;
  }
  try {
    window.__captured_chunks.push(JSON.parse(JSON.stringify(value)));
  } catch {
    window.__captured_chunks.push(value);
  }
};
const wrapChunkArray = (value) => {
  if (!Array.isArray(value) || value.__webCaptureDocsModelWrapped) {
    return value;
  }
  const originalPush = value.push;
  Object.defineProperty(value, '__webCaptureDocsModelWrapped', {
    value: true,
    enumerable: false,
  });
  Object.defineProperty(value, 'push', {
    value(...items) {
      for (const item of items) {
        captureChunk(item);
      }
      return originalPush.apply(this, items);
    },
    writable: true,
    configurable: true,
  });
  for (const item of value) {
    captureChunk(item);
  }
  return value;
};
Object.defineProperty(window, 'DOCS_modelChunk', {
  set(value) {
    captureChunk(value);
    window.__DOCS_modelChunk_latest = wrapChunkArray(value);
  },
  get() {
    return window.__DOCS_modelChunk_latest;
  },
  configurable: false,
});
";
113
/// JavaScript function that returns `{ chunks, cidUrlMap }` from the page.
///
/// - `chunks` is a copy of `window.__captured_chunks`; when nothing was
///   captured, the current `window.DOCS_modelChunk` value (if any) is used
///   as the single entry.
/// - `cidUrlMap` maps identifiers (20+ characters of `[A-Za-z0-9_-]`) to
///   `https://docs.google.com/docs-images-rt/...` URLs found in `<script>`
///   elements whose text mentions `docs-images-rt`. The escaped sequences
///   `\u003d`, `\u0026` and `\/` in those URLs are unescaped.
const GDOCS_MODEL_EXTRACT_SCRIPT: &str = r#"() => {
  const chunks = [...(window.__captured_chunks || [])];
  if (
    window.DOCS_modelChunk &&
    chunks.length === 0 &&
    !chunks.includes(window.DOCS_modelChunk)
  ) {
    chunks.push(window.DOCS_modelChunk);
  }
  const cidUrlMap = {};
  const scripts = document.querySelectorAll('script');
  for (const script of scripts) {
    const text = script.textContent || '';
    if (!text.includes('docs-images-rt')) {
      continue;
    }
    const regex =
      /"([A-Za-z0-9_-]{20,})"\s*:\s*"(https:\/\/docs\.google\.com\/docs-images-rt\/[^"]+)"/g;
    let match;
    while ((match = regex.exec(text)) !== null) {
      cidUrlMap[match[1]] = match[2]
        .replace(/\\u003d/g, '=')
        .replace(/\\u0026/g, '&')
        .replace(/\\\//g, '/');
    }
  }
  return { chunks, cidUrlMap };
}"#;
142
143fn gdocs_url_pattern() -> &'static Regex {
144    static PATTERN: OnceLock<Regex> = OnceLock::new();
145    PATTERN.get_or_init(|| Regex::new(r"docs\.google\.com/document/d/([a-zA-Z0-9_-]+)").unwrap())
146}
147
/// Result of fetching a Google Docs document.
///
/// Returned by [`fetch_google_doc`] and [`fetch_google_doc_as_markdown`].
#[derive(Debug, Clone)]
pub struct GDocsResult {
    /// The document content: the export response body, or the rendered
    /// Markdown when produced by [`fetch_google_doc_as_markdown`].
    pub content: String,
    /// The format label: the format string passed to [`fetch_google_doc`], or
    /// `"markdown"` when produced by [`fetch_google_doc_as_markdown`].
    pub format: String,
    /// The extracted document ID.
    pub document_id: String,
    /// The export URL that was fetched.
    pub export_url: String,
}
160
/// Google Docs capture backend selected from the CLI `--capture` flag.
///
/// [`select_capture_method`] maps `browser` to [`Self::BrowserModel`], `api`
/// with a token to [`Self::DocsApi`], and `api` without a token to
/// [`Self::PublicExport`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GDocsCaptureMethod {
    /// Load `/edit` and extract `DOCS_modelChunk`.
    BrowserModel,
    /// Use the public `/export?format=...` endpoint.
    PublicExport,
    /// Use the authenticated `docs.googleapis.com` REST API.
    DocsApi,
}
171
/// Rendered Google Docs content from either Docs API or editor model data.
///
/// Carries the same document in three projections (Markdown, HTML, plain text)
/// together with the identifiers of the capture it came from.
#[derive(Debug, Clone)]
pub struct GDocsRenderedResult {
    /// Markdown output.
    pub markdown: String,
    /// HTML output.
    pub html: String,
    /// Plain text output.
    pub text: String,
    /// The extracted document ID.
    pub document_id: String,
    /// Source URL used for capture.
    pub export_url: String,
    /// Remote images exposed by the editor model, used for archive localization.
    pub remote_images: Vec<RemoteImage>,
}
188
/// Remote image reference extracted from browser-model capture.
///
/// Collected in [`GDocsRenderedResult::remote_images`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteImage {
    /// Original image URL.
    pub url: String,
    /// Image alt text.
    pub alt: String,
}
197
/// Raw editor-model data pulled out of the browser page.
// NOTE(review): the field names mirror the `{ chunks, cidUrlMap }` object returned by
// `GDOCS_MODEL_EXTRACT_SCRIPT`; the code that fills this struct is outside this
// excerpt — confirm.
#[derive(Debug, Clone)]
struct BrowserModelData {
    /// Captured model chunks as untyped JSON values.
    chunks: Vec<Value>,
    /// Image URLs keyed by string identifier (presumably the image content ID — confirm).
    cid_urls: HashMap<String, String>,
}
203
/// Parsed Google Docs model/document capture.
///
/// `Default` yields an empty document (no blocks, tables, images, or text).
#[derive(Debug, Clone, Default)]
pub struct CapturedDocument {
    /// Ordered document blocks.
    pub blocks: Vec<CapturedBlock>,
    /// Tables extracted from `blocks` for compatibility with tests and callers.
    pub tables: Vec<TableBlock>,
    /// Images extracted from model positions.
    pub images: Vec<ContentNode>,
    /// Plain text projection.
    pub text: String,
}
216
/// Captured block: either a paragraph-like block or a table.
#[derive(Debug, Clone)]
pub enum CapturedBlock {
    /// Paragraph-like block.
    Paragraph {
        /// Paragraph content.
        content: Vec<ContentNode>,
        /// Optional Google Docs named style.
        style: Option<String>,
        /// Optional list metadata.
        list: Option<ListMeta>,
        /// Whether paragraph is a blockquote.
        quote: bool,
        /// Whether paragraph is a horizontal rule.
        horizontal_rule: bool,
    },
    /// Table block.
    Table(TableBlock),
}
236
/// Captured table, stored as an ordered list of rows.
#[derive(Debug, Clone, Default)]
pub struct TableBlock {
    /// Table rows.
    pub rows: Vec<TableRow>,
}
243
/// Captured table row, stored as an ordered list of cells.
#[derive(Debug, Clone, Default)]
pub struct TableRow {
    /// Row cells.
    pub cells: Vec<TableCell>,
}
250
/// Captured table cell holding inline content nodes.
#[derive(Debug, Clone, Default)]
pub struct TableCell {
    /// Cell content.
    pub content: Vec<ContentNode>,
}
257
/// Captured inline content node: a styled text run or an image placeholder.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentNode {
    /// Text run.
    Text {
        /// Text content.
        text: String,
        /// Bold text style.
        bold: bool,
        /// Italic text style.
        italic: bool,
        /// Strikethrough text style.
        strike: bool,
        /// Optional hyperlink target.
        link: Option<String>,
    },
    /// Image placeholder.
    Image {
        /// Content ID from Google Docs model data.
        cid: Option<String>,
        /// Resolved image URL.
        url: Option<String>,
        /// Alt text.
        alt: String,
        /// Whether this image came from a suggested edit.
        is_suggestion: bool,
    },
}
286
/// Inline formatting of a text run; the fields match the styling fields of
/// [`ContentNode::Text`]. `Default` is unstyled text without a link.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct TextStyle {
    /// Bold text style.
    bold: bool,
    /// Italic text style.
    italic: bool,
    /// Strikethrough text style.
    strike: bool,
    /// Optional hyperlink target.
    link: Option<String>,
}
294
/// Paragraph-level metadata; the fields match the non-content fields of
/// [`CapturedBlock::Paragraph`].
#[derive(Debug, Clone, Default)]
struct ParagraphMeta {
    /// Optional Google Docs named style.
    style: Option<String>,
    /// Optional list metadata.
    list: Option<ListMeta>,
    /// Whether the paragraph is a blockquote.
    quote: bool,
    /// Whether the paragraph is a horizontal rule.
    horizontal_rule: bool,
}
302
/// List membership metadata attached to a paragraph
/// (see the `list` field of [`CapturedBlock::Paragraph`]).
#[derive(Debug, Clone)]
pub struct ListMeta {
    /// Google Docs list identifier.
    pub id: String,
    /// Nesting level, zero-based.
    pub level: usize,
    /// Whether Markdown should render this list item with an ordered marker.
    pub ordered: bool,
}
312
/// Paragraph style record: an optional named style plus indentation values.
// NOTE(review): the unit of the indent values is not visible in this excerpt
// (presumably points, like the export CSS) — confirm against the model parser.
#[derive(Debug, Clone)]
struct ParagraphStyle {
    /// Optional named style.
    style: Option<String>,
    /// Start indent of the paragraph.
    indent_start: f64,
    /// Indent of the first line.
    indent_first_line: f64,
}
319
/// Style lookup tables collected from editor model data.
// NOTE(review): the `usize` keys/entries are presumably character offsets into the
// model text (the `_by_end` maps keyed by a range's end offset) — the code that
// fills these maps is outside this excerpt; confirm.
#[derive(Debug, Clone, Default)]
struct ModelStyleMaps {
    /// Inline text styles (presumably indexed by text position — confirm).
    inline_styles: Vec<TextStyle>,
    /// Paragraph styles keyed by a `usize` position.
    paragraph_by_end: HashMap<usize, ParagraphStyle>,
    /// List metadata keyed by a `usize` position.
    list_by_end: HashMap<usize, ListMeta>,
    /// Positions that carry a horizontal rule.
    horizontal_rules: std::collections::HashSet<usize>,
}
327
328/// Check if a URL is a Google Docs document URL.
329#[must_use]
330pub fn is_google_docs_url(url: &str) -> bool {
331    gdocs_url_pattern().is_match(url)
332}
333
334/// Extract the document ID from a Google Docs URL.
335///
336/// Returns `None` if the URL is not a valid Google Docs URL.
337#[must_use]
338pub fn extract_document_id(url: &str) -> Option<String> {
339    gdocs_url_pattern()
340        .captures(url)
341        .and_then(|caps| caps.get(1))
342        .map(|m| m.as_str().to_string())
343}
344
345/// Build a Google Docs export URL.
346///
347/// # Arguments
348///
349/// * `document_id` - The Google Docs document ID
350/// * `format` - Export format (html, txt, md, pdf, docx, epub)
351#[must_use]
352pub fn build_export_url(document_id: &str, format: &str) -> String {
353    let export_format = match format {
354        "html" | "txt" | "md" | "pdf" | "docx" | "epub" | "zip" => format,
355        _ => "html",
356    };
357    format!("{GDOCS_EXPORT_BASE}/{document_id}/export?format={export_format}")
358}
359
360/// Build a Google Docs editor URL.
361#[must_use]
362pub fn build_edit_url(document_id: &str) -> String {
363    format!("{GDOCS_EXPORT_BASE}/{document_id}/edit")
364}
365
366/// Build a Google Docs REST API URL.
367#[must_use]
368pub fn build_docs_api_url(document_id: &str) -> String {
369    format!("{GDOCS_API_BASE}/{document_id}")
370}
371
372/// Select a Google Docs capture backend from the CLI `--capture` value.
373///
374/// # Errors
375///
376/// Returns an error when `capture` is neither `browser` nor `api`.
377pub fn select_capture_method(
378    capture: &str,
379    api_token: Option<&str>,
380) -> crate::Result<GDocsCaptureMethod> {
381    match capture.to_lowercase().as_str() {
382        "browser" => Ok(GDocsCaptureMethod::BrowserModel),
383        "api" if api_token.is_some() => Ok(GDocsCaptureMethod::DocsApi),
384        "api" => Ok(GDocsCaptureMethod::PublicExport),
385        other => Err(WebCaptureError::InvalidUrl(format!(
386            "Unsupported Google Docs capture method \"{other}\". Use \"browser\" or \"api\"."
387        ))),
388    }
389}
390
391/// Fetch a Google Docs document via the export URL.
392///
393/// For public documents, pass `None` for `api_token`.
394/// For private documents, pass a Bearer token string.
395///
396/// # Arguments
397///
398/// * `url` - Google Docs URL (edit URL or any URL containing the document ID)
399/// * `format` - Export format (html, txt, md, pdf, docx, epub)
400/// * `api_token` - Optional API token for private documents
401///
402/// # Errors
403///
404/// Returns an error if the URL is not a valid Google Docs URL, or if the fetch fails.
405pub async fn fetch_google_doc(
406    url: &str,
407    format: &str,
408    api_token: Option<&str>,
409) -> crate::Result<GDocsResult> {
410    let document_id = extract_document_id(url).ok_or_else(|| {
411        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
412    })?;
413
414    let export_url = build_export_url(&document_id, format);
415    debug!(
416        document_id = %document_id,
417        format = %format,
418        export_url = %export_url,
419        has_api_token = api_token.is_some(),
420        "fetching Google Doc via public export"
421    );
422
423    let mut request = reqwest::Client::new()
424        .get(&export_url)
425        .header(
426            "User-Agent",
427            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
428        )
429        .header("Accept-Charset", "utf-8")
430        .header("Accept-Language", "en-US,en;q=0.9");
431
432    if let Some(token) = api_token {
433        request = request.header("Authorization", format!("Bearer {token}"));
434    }
435
436    let response = request
437        .send()
438        .await
439        .map_err(|e| WebCaptureError::FetchError(format!("Failed to fetch Google Doc: {e}")))?;
440    debug!(
441        document_id = %document_id,
442        status = response.status().as_u16(),
443        success = response.status().is_success(),
444        content_type = response
445            .headers()
446            .get(reqwest::header::CONTENT_TYPE)
447            .and_then(|value| value.to_str().ok())
448            .unwrap_or(""),
449        "received Google Docs public export response"
450    );
451
452    if !response.status().is_success() {
453        return Err(WebCaptureError::FetchError(format!(
454            "Failed to fetch Google Doc ({} {}): {}",
455            response.status().as_u16(),
456            response.status().canonical_reason().unwrap_or("Unknown"),
457            export_url
458        )));
459    }
460
461    let raw_content = response.text().await.map_err(|e| {
462        WebCaptureError::FetchError(format!("Failed to read Google Doc response: {e}"))
463    })?;
464    debug!(
465        document_id = %document_id,
466        bytes = raw_content.len(),
467        "read Google Docs public export body"
468    );
469
470    // Decode HTML entities to unicode for text-based formats
471    let content = match format {
472        "html" | "txt" | "md" => crate::html::decode_html_entities(&raw_content),
473        _ => raw_content,
474    };
475
476    Ok(GDocsResult {
477        content,
478        format: format.to_string(),
479        document_id,
480        export_url,
481    })
482}
483
484/// Fetch a Google Docs document and convert to Markdown.
485///
486/// Fetches the document as HTML, then converts to Markdown using the
487/// existing HTML-to-Markdown pipeline.
488///
489/// # Arguments
490///
491/// * `url` - Google Docs URL
492/// * `api_token` - Optional API token for private documents
493///
494/// # Errors
495///
496/// Returns an error if the fetch or conversion fails.
497pub async fn fetch_google_doc_as_markdown(
498    url: &str,
499    api_token: Option<&str>,
500) -> crate::Result<GDocsResult> {
501    let result = fetch_google_doc(url, "html", api_token).await?;
502
503    let preprocess = preprocess_google_docs_export_html(&result.content);
504    debug!(
505        document_id = %result.document_id,
506        hoisted = preprocess.hoisted,
507        unwrapped_links = preprocess.unwrapped_links,
508        "google-docs-export pre-processor rewrote markup"
509    );
510    let markdown = normalize_google_docs_export_markdown(
511        &crate::markdown::convert_html_to_markdown(&preprocess.html, Some(&result.export_url))?,
512    );
513    debug!(
514        document_id = %result.document_id,
515        bytes = markdown.len(),
516        "rendered Google Docs public export markdown"
517    );
518
519    Ok(GDocsResult {
520        content: markdown,
521        format: "markdown".to_string(),
522        document_id: result.document_id,
523        export_url: result.export_url,
524    })
525}
526
/// Result of running the Google Docs export HTML pre-processor.
///
/// Exposes the rewritten HTML alongside counters that are useful for debug
/// logging (`gdocs.export.style-hoist`). See issue #92 R6.
///
/// Returned by [`preprocess_google_docs_export_html`].
#[derive(Debug, Clone)]
pub struct GDocsExportPreprocessResult {
    /// Rewritten HTML.
    pub html: String,
    /// Number of inline-style spans turned into `<strong>`/`<em>`/`<del>`.
    pub hoisted: usize,
    /// Number of `google.com/url?q=` redirect wrappers unwrapped.
    pub unwrapped_links: usize,
}
540
541/// Pre-process Google Docs export HTML so the generic `html2md` pipeline
542/// preserves inline formatting, heading numbering, and link targets.
543///
544/// Google Drive serves bold/italic/strikethrough as inline style spans and
545/// wraps every link through a `google.com/url?q=` redirect, both of which
546/// the generic converter would otherwise discard. This function rewrites
547/// those constructs into semantic HTML before conversion.
548#[must_use]
549pub fn preprocess_google_docs_export_html(html: &str) -> GDocsExportPreprocessResult {
550    let mut hoisted: usize = 0;
551    let mut unwrapped_links: usize = 0;
552    let class_styles = extract_css_class_styles(html);
553
554    let mut out = hoist_inline_style_spans(html, &mut hoisted);
555    out = hoist_class_style_spans(&out, &class_styles, &mut hoisted);
556    out = convert_class_indented_blockquotes(&out, &class_styles);
557    out = nest_google_docs_lists(&out, &class_styles);
558    out = strip_google_docs_heading_noise(&out);
559    out = strip_heading_inline_formatting(&out);
560    out = unwrap_google_redirect_links(&out, &mut unwrapped_links);
561    out = out.replace("&nbsp;", " ");
562    out = out.replace('\u{00A0}', " ");
563
564    GDocsExportPreprocessResult {
565        html: out,
566        hoisted,
567        unwrapped_links,
568    }
569}
570
571/// Normalize Markdown emitted from Google Docs public-export HTML converters.
572#[must_use]
573pub fn normalize_google_docs_export_markdown(markdown: &str) -> String {
574    let markdown = unescape_public_export_punctuation(markdown);
575    let markdown = convert_setext_headings(&markdown);
576    let markdown = normalize_atx_headings(&markdown);
577    let markdown = normalize_bullet_markers(&markdown);
578    let markdown = normalize_list_spacing(&markdown);
579    let markdown = normalize_blockquote_spacing(&markdown);
580    let markdown = normalize_markdown_tables(&markdown);
581    crate::markdown::clean_markdown(&markdown)
582}
583
584fn hoist_inline_style_spans(html: &str, hoisted: &mut usize) -> String {
585    let span_re = Regex::new(r#"(?is)<span\s+([^>]*style="([^"]*)"[^>]*)>(.*?)</span>"#)
586        .expect("valid regex");
587    span_re
588        .replace_all(html, |caps: &regex::Captures<'_>| {
589            let style = caps.get(2).map_or("", |m| m.as_str());
590            let inner = caps.get(3).map_or("", |m| m.as_str());
591            semantic_wrapped_html(inner, style).map_or_else(
592                || caps[0].to_string(),
593                |wrapped| {
594                    *hoisted += 1;
595                    wrapped
596                },
597            )
598        })
599        .into_owned()
600}
601
602fn hoist_class_style_spans(
603    html: &str,
604    class_styles: &HashMap<String, String>,
605    hoisted: &mut usize,
606) -> String {
607    let class_span_re = Regex::new(r#"(?is)<span\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</span>"#)
608        .expect("valid regex");
609    class_span_re
610        .replace_all(html, |caps: &regex::Captures<'_>| {
611            let class_attr = caps.get(2).map_or("", |m| m.as_str());
612            let inner = caps.get(3).map_or("", |m| m.as_str());
613            let style = combined_class_style(class_styles, class_attr);
614            semantic_wrapped_html(inner, &style).map_or_else(
615                || caps[0].to_string(),
616                |wrapped| {
617                    *hoisted += 1;
618                    wrapped
619                },
620            )
621        })
622        .into_owned()
623}
624
625fn convert_class_indented_blockquotes(
626    html: &str,
627    class_styles: &HashMap<String, String>,
628) -> String {
629    let class_paragraph_re =
630        Regex::new(r#"(?is)<p\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</p>"#).expect("valid regex");
631    class_paragraph_re
632        .replace_all(html, |caps: &regex::Captures<'_>| {
633            let class_attr = caps.get(2).map_or("", |m| m.as_str());
634            let inner = caps.get(3).map_or("", |m| m.as_str());
635            let style = combined_class_style(class_styles, class_attr);
636            if is_blockquote_style(&style) {
637                format!("<blockquote><p>{inner}</p></blockquote>")
638            } else {
639                caps[0].to_string()
640            }
641        })
642        .into_owned()
643}
644
/// One `<ul>`/`<ol>` element located in export HTML by `nest_google_docs_lists`.
#[derive(Debug, Clone)]
struct ExportListBlock {
    /// Byte offset in the source HTML where the whole element match starts.
    start: usize,
    /// Byte offset in the source HTML just past the end of the element match.
    end: usize,
    /// Lower-cased list tag name (`ul` or `ol`).
    tag: String,
    /// Markup between the opening and closing list tags.
    inner: String,
}
652
/// One `<li>` element gathered by `render_nested_list_group`.
#[derive(Debug, Clone)]
struct ExportListItem {
    /// Tag name of the list block the item came from.
    tag: String,
    /// Nesting level computed by `google_docs_list_item_level`.
    level: usize,
    /// Markup between the `<li>` and `</li>` tags.
    inner: String,
}
659
660fn nest_google_docs_lists(html: &str, class_styles: &HashMap<String, String>) -> String {
661    let list_re = Regex::new(r"(?is)<(ul|ol)\b([^>]*)>(.*?)</(ul|ol)>").expect("valid regex");
662    let blocks: Vec<ExportListBlock> = list_re
663        .captures_iter(html)
664        .filter_map(|caps| {
665            let open_tag = caps.get(1)?.as_str().to_ascii_lowercase();
666            let close_tag = caps.get(4)?.as_str().to_ascii_lowercase();
667            if open_tag != close_tag {
668                return None;
669            }
670            let whole = caps.get(0)?;
671            Some(ExportListBlock {
672                start: whole.start(),
673                end: whole.end(),
674                tag: open_tag,
675                inner: caps.get(3).map_or("", |m| m.as_str()).to_string(),
676            })
677        })
678        .collect();
679
680    if blocks.len() < 2 {
681        return html.to_string();
682    }
683
684    let mut groups: Vec<Vec<ExportListBlock>> = Vec::new();
685    let mut current: Vec<ExportListBlock> = Vec::new();
686    for block in blocks {
687        if let Some(previous) = current.last() {
688            if !html[previous.end..block.start].trim().is_empty() {
689                if current.len() > 1 {
690                    groups.push(std::mem::take(&mut current));
691                } else {
692                    current.clear();
693                }
694            }
695        }
696        current.push(block);
697    }
698    if current.len() > 1 {
699        groups.push(current);
700    }
701
702    if groups.is_empty() {
703        return html.to_string();
704    }
705
706    let mut out = html.to_string();
707    for group in groups.iter().rev() {
708        let rendered = render_nested_list_group(group, class_styles);
709        let start = group.first().expect("non-empty group").start;
710        let end = group.last().expect("non-empty group").end;
711        out.replace_range(start..end, &rendered);
712    }
713    out
714}
715
716fn render_nested_list_group(
717    group: &[ExportListBlock],
718    class_styles: &HashMap<String, String>,
719) -> String {
720    let item_re = Regex::new(r"(?is)<li\b([^>]*)>(.*?)</li>").expect("valid regex");
721    let items: Vec<ExportListItem> = group
722        .iter()
723        .flat_map(|block| {
724            item_re.captures_iter(&block.inner).map(|caps| {
725                let attrs = caps.get(1).map_or("", |m| m.as_str());
726                let inner = caps.get(2).map_or("", |m| m.as_str()).to_string();
727                ExportListItem {
728                    tag: block.tag.clone(),
729                    level: google_docs_list_item_level(attrs, class_styles),
730                    inner,
731                }
732            })
733        })
734        .collect();
735
736    if items.is_empty() {
737        let mut unchanged = String::new();
738        for block in group {
739            write!(unchanged, "<{}>{}</{}>", block.tag, block.inner, block.tag)
740                .expect("write to String");
741        }
742        return unchanged;
743    }
744
745    let mut html = String::new();
746    let mut current_level: Option<usize> = None;
747    let mut open_tags: Vec<Option<String>> = Vec::new();
748    let mut item_open: Vec<bool> = Vec::new();
749
750    for item in items {
751        let level = item.level;
752        while current_level.is_some_and(|current| current > level) {
753            let current = current_level.expect("checked as Some");
754            close_rendered_list(&mut html, &mut open_tags, &mut item_open, current);
755            current_level = current.checked_sub(1);
756        }
757
758        while current_level.is_none_or(|current| current < level) {
759            let next_level = current_level.map_or(0, |current| current + 1);
760            open_rendered_list(
761                &mut html,
762                &mut open_tags,
763                &mut item_open,
764                next_level,
765                &item.tag,
766            );
767            current_level = Some(next_level);
768        }
769
770        ensure_list_stack(&mut open_tags, &mut item_open, level);
771        if open_tags[level]
772            .as_deref()
773            .is_some_and(|tag| tag != item.tag)
774        {
775            close_rendered_list(&mut html, &mut open_tags, &mut item_open, level);
776            open_rendered_list(&mut html, &mut open_tags, &mut item_open, level, &item.tag);
777        } else if open_tags[level].is_none() {
778            open_rendered_list(&mut html, &mut open_tags, &mut item_open, level, &item.tag);
779        }
780
781        close_rendered_item(&mut html, &mut item_open, level);
782        html.push_str("<li>");
783        html.push_str(&item.inner);
784        item_open[level] = true;
785
786        for deeper in (level + 1)..item_open.len() {
787            item_open[deeper] = false;
788            open_tags[deeper] = None;
789        }
790    }
791
792    while let Some(current) = current_level {
793        close_rendered_list(&mut html, &mut open_tags, &mut item_open, current);
794        current_level = current.checked_sub(1);
795    }
796
797    html
798}
799
/// Grow both per-level stacks until index `level` exists in `open_tags`.
///
/// The same number of entries (`None` / `false`) is appended to each vector;
/// nothing happens when `open_tags` already covers `level`.
fn ensure_list_stack(open_tags: &mut Vec<Option<String>>, item_open: &mut Vec<bool>, level: usize) {
    if open_tags.len() > level {
        return;
    }
    let missing = level - open_tags.len() + 1;
    open_tags.resize(open_tags.len() + missing, None);
    item_open.resize(item_open.len() + missing, false);
}
806
807fn open_rendered_list(
808    html: &mut String,
809    open_tags: &mut Vec<Option<String>>,
810    item_open: &mut Vec<bool>,
811    level: usize,
812    tag: &str,
813) {
814    ensure_list_stack(open_tags, item_open, level);
815    html.push('<');
816    html.push_str(tag);
817    html.push('>');
818    open_tags[level] = Some(tag.to_string());
819    item_open[level] = false;
820}
821
/// Emit `</li>` for `level` when an item is open there, and mark it closed.
///
/// Levels beyond the end of `item_open` are ignored.
fn close_rendered_item(html: &mut String, item_open: &mut [bool], level: usize) {
    if let Some(is_open) = item_open.get_mut(level) {
        if *is_open {
            html.push_str("</li>");
            *is_open = false;
        }
    }
}
828
829fn close_rendered_list(
830    html: &mut String,
831    open_tags: &mut [Option<String>],
832    item_open: &mut [bool],
833    level: usize,
834) {
835    close_rendered_item(html, item_open, level);
836    if let Some(tag) = open_tags.get_mut(level).and_then(Option::take) {
837        html.push_str("</");
838        html.push_str(&tag);
839        html.push('>');
840    }
841}
842
843fn google_docs_list_item_level(attrs: &str, class_styles: &HashMap<String, String>) -> usize {
844    let style = combined_attr_style(class_styles, attrs);
845    let margin_left = css_point_value(&style, "margin-left");
846    if margin_left <= 0.0 {
847        return 0;
848    }
849    [54.0, 90.0, 126.0, 162.0, 198.0, 234.0, 270.0, 306.0]
850        .iter()
851        .take_while(|boundary| margin_left >= **boundary)
852        .count()
853}
854
855fn combined_attr_style(class_styles: &HashMap<String, String>, attrs: &str) -> String {
856    let mut styles = String::new();
857    if let Some(style) = attr_value(attrs, "style") {
858        styles.push_str(&style);
859    }
860    if let Some(class_attr) = attr_value(attrs, "class") {
861        styles.push_str(&combined_class_style(class_styles, &class_attr));
862    }
863    styles
864}
865
866fn attr_value(attrs: &str, name: &str) -> Option<String> {
867    let attr_re = Regex::new(&format!(
868        r#"(?is)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)')"#,
869        regex::escape(name)
870    ))
871    .expect("valid regex");
872    attr_re.captures(attrs).and_then(|caps| {
873        caps.get(1)
874            .or_else(|| caps.get(2))
875            .map(|value| value.as_str().to_string())
876    })
877}
878
879fn strip_google_docs_heading_noise(html: &str) -> String {
880    let empty_anchor_re = Regex::new(r#"(?is)<a\s+id="[^"]*"\s*>\s*</a>"#).expect("valid regex");
881    let numbering_re =
882        Regex::new(r"(?is)<span\b[^>]*>\s*\d+(?:\.\d+)*\.?\s*</span>").expect("valid regex");
883    let mut out = empty_anchor_re.replace_all(html, "").into_owned();
884    for level in 1..=6 {
885        let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
886            .expect("valid regex");
887        out = heading_re
888            .replace_all(&out, |caps: &regex::Captures<'_>| {
889                let open = &caps[1];
890                let inner = &caps[2];
891                let close = &caps[3];
892                let mut cleaned = empty_anchor_re.replace_all(inner, "").into_owned();
893                cleaned = numbering_re.replace_all(&cleaned, "").into_owned();
894                format!("{open}{cleaned}{close}")
895            })
896            .into_owned();
897    }
898    out
899}
900
901fn strip_heading_inline_formatting(html: &str) -> String {
902    let inline_marker_re = Regex::new(r"(?is)</?(?:strong|em|del)>").expect("valid regex");
903    let mut out = html.to_string();
904    for level in 1..=6 {
905        let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
906            .expect("valid regex");
907        out = heading_re
908            .replace_all(&out, |caps: &regex::Captures<'_>| {
909                let open = &caps[1];
910                let inner = &caps[2];
911                let close = &caps[3];
912                let cleaned = inline_marker_re.replace_all(inner, "");
913                format!("{open}{cleaned}{close}")
914            })
915            .into_owned();
916    }
917    out
918}
919
920fn unwrap_google_redirect_links(html: &str, unwrapped_links: &mut usize) -> String {
921    let redirect_re =
922        Regex::new(r#"(?i)href="https?://(?:www\.)?google\.com/url\?q=([^&"]+)[^"]*""#)
923            .expect("valid regex");
924    redirect_re
925        .replace_all(html, |caps: &regex::Captures<'_>| {
926            let encoded = caps.get(1).map_or("", |m| m.as_str());
927            let decoded = percent_decode_utf8_lossy(encoded);
928            *unwrapped_links += 1;
929            format!(r#"href="{decoded}""#)
930        })
931        .into_owned()
932}
933
934fn extract_css_class_styles(html: &str) -> HashMap<String, String> {
935    let mut class_styles: HashMap<String, String> = HashMap::new();
936    let style_re = Regex::new(r"(?is)<style\b[^>]*>(.*?)</style>").expect("valid regex");
937    let class_re = Regex::new(r"\.([A-Za-z0-9_-]+)\s*\{([^{}]*)\}").expect("valid regex");
938    for style_caps in style_re.captures_iter(html) {
939        let css = style_caps.get(1).map_or("", |m| m.as_str());
940        for class_caps in class_re.captures_iter(css) {
941            let class_name = class_caps.get(1).map_or("", |m| m.as_str());
942            let style = class_caps.get(2).map_or("", |m| m.as_str());
943            class_styles
944                .entry(class_name.to_string())
945                .and_modify(|existing| {
946                    existing.push(';');
947                    existing.push_str(style);
948                })
949                .or_insert_with(|| style.to_string());
950        }
951    }
952    class_styles
953}
954
/// Concatenate the stored declarations of every class named in `class_attr`
/// (whitespace separated), in attribute order.
///
/// Each contribution is preceded by `;`, so a non-empty result starts with a
/// semicolon. Classes that are not present in `class_styles` are skipped.
fn combined_class_style(class_styles: &HashMap<String, String>, class_attr: &str) -> String {
    let mut combined = String::new();
    for name in class_attr.split_whitespace() {
        if let Some(declarations) = class_styles.get(name) {
            combined.push(';');
            combined.push_str(declarations);
        }
    }
    combined
}
965
966fn semantic_wrapped_html(inner: &str, style: &str) -> Option<String> {
967    let bold = css_has_bold(style);
968    let italic = css_has_italic(style);
969    let strike = css_has_strike(style);
970    if !bold && !italic && !strike {
971        return None;
972    }
973    let mut wrapped = inner.to_string();
974    if strike {
975        wrapped = format!("<del>{wrapped}</del>");
976    }
977    if italic {
978        wrapped = format!("<em>{wrapped}</em>");
979    }
980    if bold {
981        wrapped = format!("<strong>{wrapped}</strong>");
982    }
983    Some(wrapped)
984}
985
986fn css_has_bold(style: &str) -> bool {
987    Regex::new(r"(?i)font-weight\s*:\s*(?:bold|[6-9]\d{2})")
988        .expect("valid regex")
989        .is_match(style)
990}
991
992fn css_has_italic(style: &str) -> bool {
993    Regex::new(r"(?i)font-style\s*:\s*italic")
994        .expect("valid regex")
995        .is_match(style)
996}
997
998fn css_has_strike(style: &str) -> bool {
999    Regex::new(r"(?i)text-decoration[^;]*\bline-through\b")
1000        .expect("valid regex")
1001        .is_match(style)
1002}
1003
1004fn is_blockquote_style(style: &str) -> bool {
1005    let margin_left = css_point_value(style, "margin-left");
1006    let margin_right = css_point_value(style, "margin-right");
1007    margin_left > 0.0 && margin_right > 0.0 && (margin_left - margin_right).abs() < 0.1
1008}
1009
1010fn css_point_value(style: &str, property: &str) -> f64 {
1011    let re = Regex::new(&format!(
1012        r"(?i){}\s*:\s*(-?\d+(?:\.\d+)?)pt",
1013        regex::escape(property)
1014    ))
1015    .expect("valid regex");
1016    re.captures(style)
1017        .and_then(|caps| caps.get(1))
1018        .and_then(|value| value.as_str().parse::<f64>().ok())
1019        .unwrap_or(0.0)
1020}
1021
/// Decode `%XX` escapes in `input` and interpret the resulting bytes as UTF-8,
/// substituting U+FFFD for invalid sequences.
///
/// A `%` that is not followed by two hexadecimal digits is copied through
/// unchanged.
fn percent_decode_utf8_lossy(input: &str) -> String {
    /// Value of a single ASCII hexadecimal digit, if `byte` is one.
    fn hex_value(byte: u8) -> Option<u8> {
        match byte {
            b'0'..=b'9' => Some(byte - b'0'),
            b'a'..=b'f' => Some(byte - b'a' + 10),
            b'A'..=b'F' => Some(byte - b'A' + 10),
            _ => None,
        }
    }

    let raw = input.as_bytes();
    let mut out = Vec::with_capacity(raw.len());
    let mut pos = 0;
    while let Some(&current) = raw.get(pos) {
        if current == b'%' {
            if let (Some(&hi), Some(&lo)) = (raw.get(pos + 1), raw.get(pos + 2)) {
                if let (Some(hi), Some(lo)) = (hex_value(hi), hex_value(lo)) {
                    out.push((hi << 4) | lo);
                    pos += 3;
                    continue;
                }
            }
        }
        out.push(current);
        pos += 1;
    }
    String::from_utf8_lossy(&out).into_owned()
}
1045
/// Drop the backslash from escaped `.`, `!`, `(`, `)`, `[` and `]` in
/// `markdown`. Other backslash escapes are left alone.
fn unescape_public_export_punctuation(markdown: &str) -> String {
    // Applied one after another, in this order.
    const ESCAPES: [(&str, &str); 6] = [
        ("\\.", "."),
        ("\\!", "!"),
        ("\\(", "("),
        ("\\)", ")"),
        ("\\[", "["),
        ("\\]", "]"),
    ];
    ESCAPES
        .iter()
        .fold(markdown.to_string(), |text, &(escaped, plain)| {
            text.replace(escaped, plain)
        })
}
1055
1056fn convert_setext_headings(markdown: &str) -> String {
1057    let lines: Vec<&str> = markdown.lines().collect();
1058    let mut out = Vec::with_capacity(lines.len());
1059    let mut index = 0;
1060    while index < lines.len() {
1061        if index + 1 < lines.len() {
1062            let underline = lines[index + 1].trim();
1063            if is_setext_underline(underline, '=') {
1064                out.push(format!("# {}", lines[index].trim()));
1065                index += 2;
1066                continue;
1067            }
1068            if is_setext_underline(underline, '-') {
1069                out.push(format!("## {}", lines[index].trim()));
1070                index += 2;
1071                continue;
1072            }
1073        }
1074        out.push(lines[index].to_string());
1075        index += 1;
1076    }
1077    out.join("\n")
1078}
1079
/// Returns `true` when `line` is at least five bytes long and consists solely
/// of `marker` characters.
fn is_setext_underline(line: &str, marker: char) -> bool {
    if line.len() < 5 {
        return false;
    }
    !line.chars().any(|ch| ch != marker)
}
1083
1084fn normalize_atx_headings(markdown: &str) -> String {
1085    let heading_re = Regex::new(r"^(#{1,6})\s+(.+?)\s*$").expect("valid regex");
1086    let closing_re = closing_atx_heading_re();
1087    markdown
1088        .lines()
1089        .map(|line| {
1090            let Some(caps) = heading_re.captures(line) else {
1091                return line.to_string();
1092            };
1093            let hashes = caps.get(1).map_or("", |m| m.as_str());
1094            let mut text = caps.get(2).map_or("", |m| m.as_str()).trim().to_string();
1095            text = closing_re.replace(&text, "").trim().to_string();
1096            text = strip_wrapping_markdown_emphasis(&text);
1097            format!("{hashes} {text}")
1098        })
1099        .collect::<Vec<_>>()
1100        .join("\n")
1101}
1102
/// Trim `text` and, if it both starts and ends with the same emphasis marker
/// (`***`, `**` or `*`, tried in that order) with content in between, return
/// the trimmed content without that marker. Otherwise return the trimmed text.
fn strip_wrapping_markdown_emphasis(text: &str) -> String {
    let candidate = text.trim();
    let unwrapped = ["***", "**", "*"].iter().find_map(|marker| {
        // Require at least one byte between the opening and closing marker.
        if candidate.len() <= marker.len() * 2 {
            return None;
        }
        candidate.strip_prefix(*marker)?.strip_suffix(*marker)
    });
    unwrapped.map_or(candidate, str::trim).to_string()
}
1117
1118fn normalize_bullet_markers(markdown: &str) -> String {
1119    let bullet_re = asterisk_bullet_re();
1120    markdown
1121        .lines()
1122        .map(|line| bullet_re.replace(line, "$1- ").into_owned())
1123        .collect::<Vec<_>>()
1124        .join("\n")
1125}
1126
1127fn normalize_list_spacing(markdown: &str) -> String {
1128    let lines: Vec<&str> = markdown.lines().collect();
1129    let mut out = Vec::with_capacity(lines.len());
1130
1131    for (index, line) in lines.iter().enumerate() {
1132        if line.trim().is_empty()
1133            && previous_non_empty_line(&lines, index).is_some_and(is_markdown_list_item)
1134            && next_non_empty_line(&lines, index).is_some_and(is_markdown_list_item)
1135        {
1136            continue;
1137        }
1138        out.push((*line).to_string());
1139    }
1140
1141    out.join("\n")
1142}
1143
/// Nearest line before `index` that contains something other than whitespace,
/// or `None` if there is no such line.
fn previous_non_empty_line<'a>(lines: &'a [&str], index: usize) -> Option<&'a str> {
    let before = &lines[..index];
    before
        .iter()
        .rfind(|candidate| !candidate.trim().is_empty())
        .copied()
}
1151
/// Nearest line after `index` that contains something other than whitespace,
/// or `None` if there is no such line.
fn next_non_empty_line<'a>(lines: &'a [&str], index: usize) -> Option<&'a str> {
    for candidate in &lines[index + 1..] {
        if !candidate.trim().is_empty() {
            return Some(*candidate);
        }
    }
    None
}
1158
1159fn is_markdown_list_item(line: &str) -> bool {
1160    markdown_list_item_re().is_match(line)
1161}
1162
/// Tidy the spacing of `> ` block quotes.
///
/// * Blank lines and bare `>` lines inside a quote are collapsed: if more
///   quoted text follows, a single `>` separator line is emitted before it.
/// * Bare `>` lines outside a quote are dropped.
/// * The first non-quote line after a quote is preceded by one empty line.
///
/// Every emitted line is terminated with `\n`.
fn normalize_blockquote_spacing(markdown: &str) -> String {
    let mut rendered = String::with_capacity(markdown.len());
    let mut inside_quote = false;
    let mut gap_pending = false;

    for line in markdown.lines() {
        let stripped = line.trim();
        let blank_inside_quote = stripped.is_empty() && inside_quote;

        if blank_inside_quote || stripped == ">" {
            // Swallow the line; remember the gap only while inside a quote.
            gap_pending = gap_pending || inside_quote;
        } else if line.starts_with("> ") {
            // `take` reads and clears the pending flag in one step.
            if std::mem::take(&mut gap_pending) {
                rendered.push_str(">\n");
            }
            rendered.push_str(line);
            rendered.push('\n');
            inside_quote = true;
        } else {
            // Leaving a quote: separate the following text with an empty line.
            if inside_quote && !stripped.is_empty() {
                rendered.push('\n');
            }
            gap_pending = false;
            inside_quote = false;
            rendered.push_str(line);
            rendered.push('\n');
        }
    }

    rendered
}
1203
1204fn normalize_markdown_tables(markdown: &str) -> String {
1205    let lines: Vec<&str> = markdown.lines().collect();
1206    let mut out = Vec::with_capacity(lines.len());
1207    let mut index = 0;
1208
1209    while index < lines.len() {
1210        if !is_markdown_table_line(lines[index]) {
1211            out.push(lines[index].to_string());
1212            index += 1;
1213            continue;
1214        }
1215
1216        let start = index;
1217        while index < lines.len() && is_markdown_table_line(lines[index]) {
1218            index += 1;
1219        }
1220        let block = &lines[start..index];
1221        if block.len() >= 2 && is_markdown_separator_line(block[1]) {
1222            out.extend(normalize_markdown_table_block(block));
1223        } else {
1224            out.extend(block.iter().map(|line| (*line).to_string()));
1225        }
1226    }
1227
1228    out.join("\n")
1229}
1230
/// Returns `true` when the trimmed line starts and ends with `|` and contains
/// at least two `|` characters.
fn is_markdown_table_line(line: &str) -> bool {
    let row = line.trim();
    if !row.starts_with('|') || !row.ends_with('|') {
        return false;
    }
    row.bytes().filter(|byte| *byte == b'|').count() >= 2
}
1235
1236fn is_markdown_separator_line(line: &str) -> bool {
1237    split_markdown_table_cells(line)
1238        .iter()
1239        .all(|cell| markdown_table_separator_cell_re().is_match(cell))
1240}
1241
1242fn normalize_markdown_table_block(lines: &[&str]) -> Vec<String> {
1243    lines
1244        .iter()
1245        .enumerate()
1246        .map(|(index, line)| {
1247            let cells = split_markdown_table_cells(line);
1248            if index == 1 {
1249                let separators = vec!["---".to_string(); cells.len()];
1250                render_markdown_table_row(&separators)
1251            } else {
1252                render_markdown_table_row(&cells)
1253            }
1254        })
1255        .collect()
1256}
1257
/// Split a pipe-table row into trimmed cell strings.
///
/// Exactly one leading and one trailing `|` (the row delimiters) are removed
/// before splitting. Stripping *every* edge pipe would silently drop empty
/// cells at the start or end of a row (e.g. `||a|b|`) and change the row's
/// column count.
fn split_markdown_table_cells(line: &str) -> Vec<String> {
    let row = line.trim();
    let row = row.strip_prefix('|').unwrap_or(row);
    let row = row.strip_suffix('|').unwrap_or(row);
    row.split('|').map(|cell| cell.trim().to_string()).collect()
}
1265
/// Render cells as a single pipe-table row: `| a | b |`.
fn render_markdown_table_row(cells: &[String]) -> String {
    let mut row = String::from("| ");
    row.push_str(&cells.join(" | "));
    row.push_str(" |");
    row
}
1269
1270fn closing_atx_heading_re() -> &'static Regex {
1271    static RE: OnceLock<Regex> = OnceLock::new();
1272    RE.get_or_init(|| Regex::new(r"\s+#{1,6}$").expect("valid regex"))
1273}
1274
1275fn asterisk_bullet_re() -> &'static Regex {
1276    static RE: OnceLock<Regex> = OnceLock::new();
1277    RE.get_or_init(|| Regex::new(r"^(\s*)\* ").expect("valid regex"))
1278}
1279
1280fn markdown_list_item_re() -> &'static Regex {
1281    static RE: OnceLock<Regex> = OnceLock::new();
1282    RE.get_or_init(|| Regex::new(r"^\s*(?:[-+*]|\d+\.)\s+").expect("valid regex"))
1283}
1284
1285fn markdown_table_separator_cell_re() -> &'static Regex {
1286    static RE: OnceLock<Regex> = OnceLock::new();
1287    RE.get_or_init(|| Regex::new(r"^:?-{3,}:?$").expect("valid regex"))
1288}
1289
1290/// Fetch and render a Google Docs document via the authenticated REST API.
1291///
1292/// # Errors
1293///
1294/// Returns an error when the URL is invalid, no token is provided, or the API request fails.
1295pub async fn fetch_google_doc_from_docs_api(
1296    url: &str,
1297    api_token: &str,
1298) -> crate::Result<GDocsRenderedResult> {
1299    let document_id = extract_document_id(url).ok_or_else(|| {
1300        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
1301    })?;
1302    let api_url = build_docs_api_url(&document_id);
1303    debug!(
1304        document_id = %document_id,
1305        api_url = %api_url,
1306        "fetching Google Doc via Docs API"
1307    );
1308
1309    let response = reqwest::Client::new()
1310        .get(&api_url)
1311        .header("Authorization", format!("Bearer {api_token}"))
1312        .header("Accept", "application/json")
1313        .send()
1314        .await
1315        .map_err(|e| {
1316            WebCaptureError::FetchError(format!("Failed to fetch Google Doc via Docs API: {e}"))
1317        })?;
1318    debug!(
1319        document_id = %document_id,
1320        status = response.status().as_u16(),
1321        success = response.status().is_success(),
1322        content_type = response
1323            .headers()
1324            .get(reqwest::header::CONTENT_TYPE)
1325            .and_then(|value| value.to_str().ok())
1326            .unwrap_or(""),
1327        "received Google Docs API response"
1328    );
1329
1330    if !response.status().is_success() {
1331        return Err(WebCaptureError::FetchError(format!(
1332            "Failed to fetch Google Doc via Docs API ({} {}): {}",
1333            response.status().as_u16(),
1334            response.status().canonical_reason().unwrap_or("Unknown"),
1335            api_url
1336        )));
1337    }
1338
1339    let body = response.text().await.map_err(|e| {
1340        WebCaptureError::FetchError(format!("Failed to read Google Docs API response: {e}"))
1341    })?;
1342    let document = serde_json::from_str::<Value>(&body).map_err(|e| {
1343        WebCaptureError::ParseError(format!("Failed to parse Google Docs API response: {e}"))
1344    })?;
1345    let rendered = render_docs_api_document(&document);
1346    debug!(
1347        document_id = %document_id,
1348        title = document.get("title").and_then(|value| value.as_str()).unwrap_or(""),
1349        markdown_bytes = rendered.markdown.len(),
1350        html_bytes = rendered.html.len(),
1351        text_bytes = rendered.text.len(),
1352        "rendered Google Docs API document"
1353    );
1354
1355    Ok(GDocsRenderedResult {
1356        markdown: rendered.markdown,
1357        html: rendered.html,
1358        text: rendered.text,
1359        document_id,
1360        export_url: api_url,
1361        remote_images: Vec::new(),
1362    })
1363}
1364
1365/// Fetch and render the model data embedded in the Google Docs `/edit` route.
1366///
1367/// # Errors
1368///
1369/// Returns an error when the URL is invalid, the fetch fails, or no model chunks are present.
1370pub async fn fetch_google_doc_from_model(
1371    url: &str,
1372    api_token: Option<&str>,
1373) -> crate::Result<GDocsRenderedResult> {
1374    if api_token.is_some() {
1375        return Err(WebCaptureError::BrowserError(
1376            "Rust browser-model Google Docs capture cannot inject API tokens; use --capture api for authenticated Docs API capture".to_string(),
1377        ));
1378    }
1379    let document_id = extract_document_id(url).ok_or_else(|| {
1380        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
1381    })?;
1382    let edit_url = build_edit_url(&document_id);
1383    debug!(
1384        document_id = %document_id,
1385        edit_url = %edit_url,
1386        "capturing Google Doc editor model with a real browser"
1387    );
1388    let model_data = fetch_google_doc_editor_model_with_cdp(&edit_url, &document_id).await?;
1389    let chunks = model_data.chunks;
1390    debug!(
1391        document_id = %document_id,
1392        chunks = chunks.len(),
1393        cid_urls = model_data.cid_urls.len(),
1394        "extracted Google Docs editor model chunks through CDP"
1395    );
1396    if chunks.is_empty() {
1397        return Err(WebCaptureError::ParseError(
1398            "Google Docs editor page did not expose DOCS_modelChunk data".to_string(),
1399        ));
1400    }
1401
1402    let capture = parse_model_chunks(&chunks, &model_data.cid_urls);
1403    let remote_images = remote_images_from_capture(&capture);
1404    info!(
1405        document_id = %document_id,
1406        chunks = chunks.len(),
1407        cid_urls = model_data.cid_urls.len(),
1408        blocks = capture.blocks.len(),
1409        tables = capture.tables.len(),
1410        images = capture.images.len(),
1411        text_bytes = capture.text.len(),
1412        "parsed Google Docs editor model"
1413    );
1414
1415    Ok(GDocsRenderedResult {
1416        markdown: render_captured_document(&capture, "markdown"),
1417        html: render_captured_document(&capture, "html"),
1418        text: render_captured_document(&capture, "txt"),
1419        document_id,
1420        export_url: edit_url,
1421        remote_images,
1422    })
1423}
1424
1425async fn fetch_google_doc_editor_model_with_cdp(
1426    edit_url: &str,
1427    document_id: &str,
1428) -> crate::Result<BrowserModelData> {
1429    let chrome = crate::browser::find_chrome_executable().ok_or_else(|| {
1430        WebCaptureError::BrowserError(
1431            "Chrome/Chromium executable was not found. Set WEB_CAPTURE_CHROME, CHROME_PATH, or GOOGLE_CHROME_BIN.".to_string(),
1432        )
1433    })?;
1434    let user_data_dir = crate::browser::temporary_user_data_dir();
1435    std::fs::create_dir_all(&user_data_dir)?;
1436
1437    debug!(
1438        document_id = %document_id,
1439        chrome = %chrome.display(),
1440        user_data_dir = %user_data_dir.display(),
1441        edit_url = %edit_url,
1442        "launching headless Chrome CDP session for Google Docs model capture"
1443    );
1444
1445    let mut child = launch_cdp_chrome(&chrome, &user_data_dir)?;
1446    let capture_result = async {
1447        let ws_url = wait_for_devtools_ws_url(&mut child).await?;
1448        let (mut ws, _) = connect_async(&ws_url).await.map_err(|error| {
1449            WebCaptureError::BrowserError(format!(
1450                "Failed to connect to Chrome DevTools websocket: {error}"
1451            ))
1452        })?;
1453        let mut next_id = 0u64;
1454        let session_id = navigate_google_docs_cdp_page(&mut ws, &mut next_id, edit_url).await?;
1455        wait_for_google_docs_model_chunks(&mut ws, &mut next_id, &session_id, document_id).await
1456    }
1457    .await;
1458
1459    if let Err(error) = child.kill().await {
1460        debug!(
1461            document_id = %document_id,
1462            error = %error,
1463            "failed to kill Chrome CDP browser process"
1464        );
1465    }
1466    let _ = child.wait().await;
1467    let _ = std::fs::remove_dir_all(&user_data_dir);
1468
1469    capture_result
1470}
1471
1472async fn navigate_google_docs_cdp_page(
1473    ws: &mut CdpWebSocket,
1474    next_id: &mut u64,
1475    edit_url: &str,
1476) -> crate::Result<String> {
1477    let target = cdp_send(
1478        ws,
1479        next_id,
1480        None,
1481        "Target.createTarget",
1482        serde_json::json!({ "url": "about:blank" }),
1483    )
1484    .await?;
1485    let target_id = target
1486        .get("targetId")
1487        .and_then(Value::as_str)
1488        .ok_or_else(|| {
1489            WebCaptureError::BrowserError(
1490                "Chrome DevTools Target.createTarget did not return targetId".to_string(),
1491            )
1492        })?
1493        .to_string();
1494    let attached = cdp_send(
1495        ws,
1496        next_id,
1497        None,
1498        "Target.attachToTarget",
1499        serde_json::json!({ "targetId": target_id, "flatten": true }),
1500    )
1501    .await?;
1502    let session_id = attached
1503        .get("sessionId")
1504        .and_then(Value::as_str)
1505        .ok_or_else(|| {
1506            WebCaptureError::BrowserError(
1507                "Chrome DevTools Target.attachToTarget did not return sessionId".to_string(),
1508            )
1509        })?
1510        .to_string();
1511
1512    cdp_send(
1513        ws,
1514        next_id,
1515        Some(&session_id),
1516        "Page.enable",
1517        serde_json::json!({}),
1518    )
1519    .await?;
1520    cdp_send(
1521        ws,
1522        next_id,
1523        Some(&session_id),
1524        "Runtime.enable",
1525        serde_json::json!({}),
1526    )
1527    .await?;
1528    cdp_send(
1529        ws,
1530        next_id,
1531        Some(&session_id),
1532        "Page.addScriptToEvaluateOnNewDocument",
1533        serde_json::json!({ "source": GDOCS_MODEL_CAPTURE_INIT_SCRIPT }),
1534    )
1535    .await?;
1536    cdp_send(
1537        ws,
1538        next_id,
1539        Some(&session_id),
1540        "Page.navigate",
1541        serde_json::json!({ "url": edit_url }),
1542    )
1543    .await?;
1544
1545    Ok(session_id)
1546}
1547
1548async fn wait_for_google_docs_model_chunks(
1549    ws: &mut CdpWebSocket,
1550    next_id: &mut u64,
1551    session_id: &str,
1552    document_id: &str,
1553) -> crate::Result<BrowserModelData> {
1554    let started = Instant::now();
1555    let mut last_chunks = 0usize;
1556    let mut last_cid_urls = 0usize;
1557
1558    while started.elapsed() < GDOCS_EDITOR_MODEL_WAIT {
1559        let result = cdp_send(
1560            ws,
1561            next_id,
1562            Some(session_id),
1563            "Runtime.evaluate",
1564            serde_json::json!({
1565                "expression": format!("({GDOCS_MODEL_EXTRACT_SCRIPT})()"),
1566                "returnByValue": true,
1567                "awaitPromise": true
1568            }),
1569        )
1570        .await?;
1571        if let Some(exception) = result.get("exceptionDetails") {
1572            return Err(WebCaptureError::BrowserError(format!(
1573                "Google Docs model extraction script failed: {exception}"
1574            )));
1575        }
1576        let value = result
1577            .pointer("/result/value")
1578            .cloned()
1579            .unwrap_or(Value::Null);
1580        let model_data = browser_model_data_from_value(&value);
1581        last_chunks = model_data.chunks.len();
1582        last_cid_urls = model_data.cid_urls.len();
1583        if !model_data.chunks.is_empty() {
1584            debug!(
1585                document_id = %document_id,
1586                chunks = model_data.chunks.len(),
1587                cid_urls = model_data.cid_urls.len(),
1588                elapsed_ms = started.elapsed().as_millis(),
1589                "captured Google Docs model chunks through CDP Runtime.evaluate"
1590            );
1591            return Ok(model_data);
1592        }
1593        tokio::time::sleep(Duration::from_millis(250)).await;
1594    }
1595
1596    Err(WebCaptureError::BrowserError(format!(
1597        "Timed out waiting for Google Docs DOCS_modelChunk data for document {document_id} after {} ms (last chunks={last_chunks}, cid_urls={last_cid_urls})",
1598        GDOCS_EDITOR_MODEL_WAIT.as_millis()
1599    )))
1600}
1601
1602fn launch_cdp_chrome(
1603    chrome: &std::path::Path,
1604    user_data_dir: &std::path::Path,
1605) -> crate::Result<Child> {
1606    let mut command = Command::new(chrome);
1607    command
1608        .args([
1609            "--headless=new",
1610            "--disable-gpu",
1611            "--disable-extensions",
1612            "--disable-dev-shm-usage",
1613            "--disable-background-networking",
1614            "--disable-component-update",
1615            "--disable-default-apps",
1616            "--disable-sync",
1617            "--metrics-recording-only",
1618            "--no-default-browser-check",
1619            "--no-first-run",
1620            "--no-sandbox",
1621            "--remote-debugging-port=0",
1622            "--window-size=1280,800",
1623        ])
1624        .arg(format!("--user-data-dir={}", user_data_dir.display()))
1625        .arg(format!("--user-agent={GDOCS_USER_AGENT}"))
1626        .stderr(Stdio::piped())
1627        .stdout(Stdio::null())
1628        .kill_on_drop(true);
1629
1630    command.spawn().map_err(|error| {
1631        WebCaptureError::BrowserError(format!("Failed to launch Chrome CDP browser: {error}"))
1632    })
1633}
1634
1635async fn wait_for_devtools_ws_url(child: &mut Child) -> crate::Result<String> {
1636    let stderr = child.stderr.take().ok_or_else(|| {
1637        WebCaptureError::BrowserError("Chrome CDP process did not expose stderr".to_string())
1638    })?;
1639    let mut lines = BufReader::new(stderr).lines();
1640    let started = Instant::now();
1641
1642    while started.elapsed() < GDOCS_BROWSER_LAUNCH_TIMEOUT {
1643        let line = tokio::time::timeout(Duration::from_millis(250), lines.next_line()).await;
1644        match line {
1645            Ok(Ok(Some(line))) => {
1646                if let Some((_, ws_url)) = line.split_once("DevTools listening on ") {
1647                    return Ok(ws_url.trim().to_string());
1648                }
1649            }
1650            Ok(Ok(None)) => {
1651                break;
1652            }
1653            Ok(Err(error)) => {
1654                return Err(WebCaptureError::BrowserError(format!(
1655                    "Failed to read Chrome CDP stderr: {error}"
1656                )));
1657            }
1658            Err(_) => {}
1659        }
1660    }
1661
1662    Err(WebCaptureError::BrowserError(format!(
1663        "Timed out waiting for Chrome DevTools websocket URL after {} ms",
1664        GDOCS_BROWSER_LAUNCH_TIMEOUT.as_millis()
1665    )))
1666}
1667
1668async fn cdp_send(
1669    ws: &mut CdpWebSocket,
1670    next_id: &mut u64,
1671    session_id: Option<&str>,
1672    method: &str,
1673    params: Value,
1674) -> crate::Result<Value> {
1675    *next_id += 1;
1676    let id = *next_id;
1677    let mut message = serde_json::json!({
1678        "id": id,
1679        "method": method,
1680        "params": params
1681    });
1682    if let Some(session_id) = session_id {
1683        message["sessionId"] = Value::String(session_id.to_string());
1684    }
1685
1686    ws.send(Message::Text(message.to_string()))
1687        .await
1688        .map_err(|error| {
1689            WebCaptureError::BrowserError(format!(
1690                "Failed to send Chrome DevTools command {method}: {error}"
1691            ))
1692        })?;
1693
1694    while let Some(message) = ws.next().await {
1695        let message = message.map_err(|error| {
1696            WebCaptureError::BrowserError(format!(
1697                "Failed to read Chrome DevTools response for {method}: {error}"
1698            ))
1699        })?;
1700        if !message.is_text() {
1701            continue;
1702        }
1703        let text = message.to_text().map_err(|error| {
1704            WebCaptureError::BrowserError(format!(
1705                "Chrome DevTools response for {method} was not text: {error}"
1706            ))
1707        })?;
1708        let value = serde_json::from_str::<Value>(text).map_err(|error| {
1709            WebCaptureError::ParseError(format!(
1710                "Failed to parse Chrome DevTools response for {method}: {error}; response={text}"
1711            ))
1712        })?;
1713        if value.get("id").and_then(Value::as_u64) != Some(id) {
1714            continue;
1715        }
1716        if let Some(error) = value.get("error") {
1717            return Err(WebCaptureError::BrowserError(format!(
1718                "Chrome DevTools command {method} failed: {error}"
1719            )));
1720        }
1721        return Ok(value.get("result").cloned().unwrap_or(Value::Null));
1722    }
1723
1724    Err(WebCaptureError::BrowserError(format!(
1725        "Chrome DevTools websocket closed before response for {method}"
1726    )))
1727}
1728
1729fn browser_model_data_from_value(value: &Value) -> BrowserModelData {
1730    let chunks = value
1731        .get("chunks")
1732        .and_then(Value::as_array)
1733        .cloned()
1734        .unwrap_or_default();
1735    let cid_urls = value
1736        .get("cidUrlMap")
1737        .and_then(Value::as_object)
1738        .map(|map| {
1739            map.iter()
1740                .filter_map(|(key, value)| value.as_str().map(|url| (key.clone(), url.to_string())))
1741                .collect::<HashMap<_, _>>()
1742        })
1743        .unwrap_or_default();
1744    BrowserModelData { chunks, cid_urls }
1745}
1746
1747fn remote_images_from_capture(capture: &CapturedDocument) -> Vec<RemoteImage> {
1748    capture
1749        .images
1750        .iter()
1751        .filter_map(|node| match node {
1752            ContentNode::Image {
1753                url: Some(url),
1754                alt,
1755                ..
1756            } => Some(RemoteImage {
1757                url: url.clone(),
1758                alt: alt.clone(),
1759            }),
1760            ContentNode::Image { .. } | ContentNode::Text { .. } => None,
1761        })
1762        .collect()
1763}
1764
1765/// Render a Google Docs REST API document value.
1766#[must_use]
1767pub fn render_docs_api_document(document: &Value) -> GDocsRenderedOutput {
1768    let blocks = structural_elements_to_blocks(
1769        document
1770            .pointer("/body/content")
1771            .and_then(Value::as_array)
1772            .map_or(&[] as &[Value], Vec::as_slice),
1773        document.pointer("/inlineObjects").unwrap_or(&Value::Null),
1774    );
1775    GDocsRenderedOutput {
1776        markdown: render_blocks_markdown(&blocks),
1777        html: render_blocks_html(&blocks),
1778        text: blocks_to_text(&blocks),
1779    }
1780}
1781
/// One document rendered in three formats, as produced by
/// [`render_docs_api_document`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct GDocsRenderedOutput {
    /// The document rendered as Markdown.
    pub markdown: String,
    /// The document rendered as HTML.
    pub html: String,
    /// The document rendered as plain text.
    pub text: String,
}
1792
1793fn structural_elements_to_blocks(elements: &[Value], inline_objects: &Value) -> Vec<CapturedBlock> {
1794    let mut blocks = Vec::new();
1795    for element in elements {
1796        if let Some(paragraph) = element.get("paragraph") {
1797            let content = paragraph_to_content(paragraph, inline_objects);
1798            if !content_to_text(&content).trim().is_empty()
1799                || content
1800                    .iter()
1801                    .any(|node| matches!(node, ContentNode::Image { .. }))
1802            {
1803                blocks.push(CapturedBlock::Paragraph {
1804                    style: paragraph
1805                        .pointer("/paragraphStyle/namedStyleType")
1806                        .and_then(Value::as_str)
1807                        .map(ToString::to_string),
1808                    list: None,
1809                    quote: false,
1810                    horizontal_rule: false,
1811                    content,
1812                });
1813            }
1814        } else if let Some(table) = element.get("table") {
1815            blocks.push(CapturedBlock::Table(table_to_block(table, inline_objects)));
1816        }
1817    }
1818    blocks
1819}
1820
1821fn table_to_block(table: &Value, inline_objects: &Value) -> TableBlock {
1822    let rows = table
1823        .get("tableRows")
1824        .and_then(Value::as_array)
1825        .map_or(&[] as &[Value], Vec::as_slice)
1826        .iter()
1827        .map(|row| TableRow {
1828            cells: row
1829                .get("tableCells")
1830                .and_then(Value::as_array)
1831                .map_or(&[] as &[Value], Vec::as_slice)
1832                .iter()
1833                .map(|cell| TableCell {
1834                    content: structural_elements_to_inline_content(
1835                        cell.get("content")
1836                            .and_then(Value::as_array)
1837                            .map_or(&[] as &[Value], Vec::as_slice),
1838                        inline_objects,
1839                    ),
1840                })
1841                .collect(),
1842        })
1843        .collect();
1844    TableBlock { rows }
1845}
1846
1847fn structural_elements_to_inline_content(
1848    elements: &[Value],
1849    inline_objects: &Value,
1850) -> Vec<ContentNode> {
1851    let mut content = Vec::new();
1852    for element in elements {
1853        if let Some(paragraph) = element.get("paragraph") {
1854            let paragraph_content = paragraph_to_content(paragraph, inline_objects);
1855            if !content.is_empty() && !paragraph_content.is_empty() {
1856                append_text(&mut content, "\n");
1857            }
1858            content.extend(paragraph_content);
1859        } else if let Some(table) = element.get("table") {
1860            append_text(
1861                &mut content,
1862                &render_blocks_markdown(&[CapturedBlock::Table(table_to_block(
1863                    table,
1864                    inline_objects,
1865                ))]),
1866            );
1867        }
1868    }
1869    content
1870}
1871
1872fn paragraph_to_content(paragraph: &Value, inline_objects: &Value) -> Vec<ContentNode> {
1873    let mut content = Vec::new();
1874    for element in paragraph
1875        .get("elements")
1876        .and_then(Value::as_array)
1877        .map_or(&[] as &[Value], Vec::as_slice)
1878    {
1879        if let Some(text) = element
1880            .pointer("/textRun/content")
1881            .and_then(Value::as_str)
1882            .map(|text| text.strip_suffix('\n').unwrap_or(text))
1883        {
1884            append_text(&mut content, text);
1885        } else if let Some(inline_id) = element
1886            .pointer("/inlineObjectElement/inlineObjectId")
1887            .and_then(Value::as_str)
1888        {
1889            if let Some(image) = inline_object_to_image(inline_id, inline_objects) {
1890                content.push(image);
1891            }
1892        }
1893    }
1894    content
1895}
1896
1897fn inline_object_to_image(inline_id: &str, inline_objects: &Value) -> Option<ContentNode> {
1898    let embedded = inline_objects
1899        .get(inline_id)?
1900        .pointer("/inlineObjectProperties/embeddedObject")?;
1901    let url = embedded
1902        .pointer("/imageProperties/contentUri")
1903        .or_else(|| embedded.pointer("/imageProperties/sourceUri"))
1904        .and_then(Value::as_str)?;
1905    let alt = embedded
1906        .get("title")
1907        .or_else(|| embedded.get("description"))
1908        .and_then(Value::as_str)
1909        .unwrap_or("image");
1910    Some(ContentNode::Image {
1911        cid: None,
1912        url: Some(url.to_string()),
1913        alt: alt.to_string(),
1914        is_suggestion: false,
1915    })
1916}
1917
1918fn build_model_style_maps(
1919    items: &[Value],
1920    text_len: usize,
1921    utf16_position_map: &[usize],
1922) -> ModelStyleMaps {
1923    let mut maps = ModelStyleMaps {
1924        inline_styles: vec![TextStyle::default(); text_len],
1925        ..ModelStyleMaps::default()
1926    };
1927
1928    for item in items {
1929        if item.get("ty").and_then(Value::as_str) != Some("as") {
1930            continue;
1931        }
1932        let (Some(start), Some(end), Some(style_type)) = (
1933            item.get("si").and_then(Value::as_u64),
1934            item.get("ei").and_then(Value::as_u64),
1935            item.get("st").and_then(Value::as_str),
1936        ) else {
1937            continue;
1938        };
1939        let (Ok(start), Ok(end)) = (usize::try_from(start), usize::try_from(end)) else {
1940            continue;
1941        };
1942
1943        let start = utf16_position_to_char_position(utf16_position_map, start);
1944        let end = utf16_position_to_char_position(utf16_position_map, end);
1945        if start == 0 || end == 0 {
1946            continue;
1947        }
1948
1949        match style_type {
1950            "text" => {
1951                let style = text_style(item);
1952                apply_inline_style(&mut maps.inline_styles, start, end, &style);
1953            }
1954            "link" => {
1955                let style = TextStyle {
1956                    link: item
1957                        .pointer("/sm/lnks_link/ulnk_url")
1958                        .and_then(Value::as_str)
1959                        .map(ToString::to_string),
1960                    ..TextStyle::default()
1961                };
1962                apply_inline_style(&mut maps.inline_styles, start, end, &style);
1963            }
1964            "paragraph" => {
1965                maps.paragraph_by_end
1966                    .insert(end, paragraph_style_from_model(item));
1967            }
1968            "list" => {
1969                maps.list_by_end.insert(
1970                    end,
1971                    ListMeta {
1972                        id: item
1973                            .pointer("/sm/ls_id")
1974                            .and_then(Value::as_str)
1975                            .unwrap_or("")
1976                            .to_string(),
1977                        level: item
1978                            .pointer("/sm/ls_nest")
1979                            .and_then(Value::as_u64)
1980                            .and_then(|value| usize::try_from(value).ok())
1981                            .unwrap_or(0),
1982                        ordered: false,
1983                    },
1984                );
1985            }
1986            "horizontal_rule" => {
1987                maps.horizontal_rules.insert(end);
1988            }
1989            _ => {}
1990        }
1991    }
1992
1993    maps
1994}
1995
1996fn apply_inline_style(styles: &mut [TextStyle], start: usize, end: usize, patch: &TextStyle) {
1997    let from = start.saturating_sub(1);
1998    let to = end.min(styles.len());
1999    if from >= to {
2000        return;
2001    }
2002    for style in &mut styles[from..to] {
2003        if patch.bold {
2004            style.bold = true;
2005        }
2006        if patch.italic {
2007            style.italic = true;
2008        }
2009        if patch.strike {
2010            style.strike = true;
2011        }
2012        if patch.link.is_some() {
2013            style.link.clone_from(&patch.link);
2014        }
2015    }
2016}
2017
2018fn text_style(item: &Value) -> TextStyle {
2019    TextStyle {
2020        bold: item.pointer("/sm/ts_bd").and_then(Value::as_bool) == Some(true),
2021        italic: item.pointer("/sm/ts_it").and_then(Value::as_bool) == Some(true),
2022        strike: item.pointer("/sm/ts_st").and_then(Value::as_bool) == Some(true),
2023        link: None,
2024    }
2025}
2026
2027fn paragraph_style_from_model(item: &Value) -> ParagraphStyle {
2028    let heading = item.pointer("/sm/ps_hd").and_then(Value::as_u64);
2029    ParagraphStyle {
2030        style: heading.map(|level| format!("HEADING_{level}")),
2031        indent_start: item
2032            .pointer("/sm/ps_il")
2033            .and_then(Value::as_f64)
2034            .unwrap_or(0.0),
2035        indent_first_line: item
2036            .pointer("/sm/ps_ifl")
2037            .and_then(Value::as_f64)
2038            .unwrap_or(0.0),
2039    }
2040}
2041
/// Build a lookup from 1-based UTF-16 code-unit position to 1-based character
/// position.
///
/// Index `0` always holds `0`. A character that needs two UTF-16 code units
/// occupies two consecutive entries with the same character position.
fn build_utf16_position_map(text: &str) -> Vec<usize> {
    // A UTF-8 byte count is an upper bound on the UTF-16 unit count.
    let mut map = Vec::with_capacity(text.len() + 1);
    map.push(0);
    for (char_pos, ch) in (1usize..).zip(text.chars()) {
        let filled = map.len() + ch.len_utf16();
        map.resize(filled, char_pos);
    }
    map
}
2056
/// Translate a UTF-16 position into a character position using `map`.
///
/// When `position` is out of range or maps to `0`, the last non-zero entry of
/// `map` is returned instead; `0` is returned only when no such entry exists.
fn utf16_position_to_char_position(map: &[usize], position: usize) -> usize {
    match map.get(position) {
        Some(&char_pos) if char_pos > 0 => char_pos,
        _ => map
            .iter()
            .rev()
            .copied()
            .find(|&char_pos| char_pos > 0)
            .unwrap_or(0),
    }
}
2064
/// Parse captured `DOCS_modelChunk` values.
///
/// Flattens the chunks into model items, concatenates the text carried by
/// `is`/`iss` items, and walks that text one character at a time to rebuild
/// paragraphs and tables. Control characters in the text drive the walk:
/// `0x10` opens a table, `0x11` closes it, `0x12` starts a row, `0x1c` starts
/// a cell, `0x0a` ends a paragraph (or a cell when inside a table), and
/// `0x0b` is kept as a line break inside the current paragraph or cell.
///
/// `cid_urls` maps an image content id (`i_cid`) to a URL; images whose id is
/// not in the map are kept with `url: None`.
///
/// Returns the reconstructed blocks together with the tables and images that
/// were found and the plain text derived from the blocks.
#[must_use]
#[allow(clippy::too_many_lines)]
pub fn parse_model_chunks<S: BuildHasher>(
    chunks: &[Value],
    cid_urls: &HashMap<String, String, S>,
) -> CapturedDocument {
    let items = collect_model_items(chunks);
    // The document text is the concatenation of every `s` string found on
    // `is` / `iss` items, in item order.
    let full_text = items
        .iter()
        .filter(|item| matches!(item.get("ty").and_then(Value::as_str), Some("is" | "iss")))
        .filter_map(|item| item.get("s").and_then(Value::as_str))
        .collect::<String>();
    let chars: Vec<char> = full_text.chars().collect();
    // Model positions are converted through this map before being used as
    // indices into `chars`.
    let utf16_position_map = build_utf16_position_map(&full_text);
    let style_maps = build_model_style_maps(&items, chars.len(), &utf16_position_map);

    // Map each `te` / `ste` item id to the zero-based character index of its
    // `spi` position.
    let mut positions = HashMap::new();
    for item in &items {
        if matches!(item.get("ty").and_then(Value::as_str), Some("te" | "ste")) {
            if let (Some(id), Some(pos)) = (
                item.get("id").and_then(Value::as_str),
                item.get("spi").and_then(Value::as_u64),
            ) {
                if let Ok(pos) = usize::try_from(pos) {
                    positions.insert(
                        id.to_string(),
                        utf16_position_to_char_position(&utf16_position_map, pos).saturating_sub(1),
                    );
                }
            }
        }
    }

    // Build image nodes from `ae` / `ase` items. Items whose id has no known
    // position are skipped; `ase` items are flagged as suggestions.
    let mut images_by_pos: HashMap<usize, ContentNode> = HashMap::new();
    let mut images = Vec::new();
    for item in &items {
        let ty = item.get("ty").and_then(Value::as_str);
        if !matches!(ty, Some("ae" | "ase")) {
            continue;
        }
        let Some(id) = item.get("id").and_then(Value::as_str) else {
            continue;
        };
        let Some(pos) = positions.get(id).copied() else {
            continue;
        };
        let cid = item
            .pointer("/epm/ee_eo/i_cid")
            .and_then(Value::as_str)
            .map(ToString::to_string);
        let node = ContentNode::Image {
            url: cid.as_ref().and_then(|cid| cid_urls.get(cid).cloned()),
            cid,
            alt: item
                .pointer("/epm/ee_eo/eo_ad")
                .and_then(Value::as_str)
                .unwrap_or_else(|| {
                    if ty == Some("ase") {
                        "suggested image"
                    } else {
                        "image"
                    }
                })
                .to_string(),
            is_suggestion: ty == Some("ase"),
        };
        // A later image at the same position replaces the earlier one in the
        // positional lookup, but every image is kept in `images`.
        images_by_pos.insert(pos, node.clone());
        images.push(node);
    }

    // Walker state: the paragraph being accumulated outside tables, and the
    // currently open table / row / cell inside one.
    let mut blocks = Vec::new();
    let mut tables = Vec::new();
    let mut paragraph = Vec::new();
    let mut table: Option<TableBlock> = None;
    let mut row: Option<TableRow> = None;
    let mut cell: Option<TableCell> = None;
    // Last table-related control character seen; reset to `None` by regular
    // content.
    let mut previous_table_control: Option<u32> = None;
    // Set when the newline right after a cell marker must not open a new cell.
    let mut skip_next_table_newline = false;

    for (idx, ch) in chars.iter().copied().enumerate() {
        match ch as u32 {
            // Table start: finish the pending paragraph and open a table.
            0x10 => {
                flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
                table = Some(TableBlock::default());
                previous_table_control = Some(0x10);
                skip_next_table_newline = false;
            }
            // Table end: emit the open table.
            0x11 => {
                flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
                previous_table_control = None;
                skip_next_table_newline = false;
            }
            // Row start: close the previous row (if any) and open a new one.
            0x12 => {
                flush_row(&mut row, &mut cell, table.as_mut(), true);
                row = Some(TableRow::default());
                previous_table_control = Some(0x12);
                skip_next_table_newline = false;
            }
            // Cell start.
            0x1c => {
                // A cell marker directly after a table newline, with no cell
                // content in between, is ignored so it does not add an extra
                // empty cell.
                if cell.as_ref().is_none_or(cell_is_empty) && previous_table_control == Some(0x0a) {
                    previous_table_control = Some(0x1c);
                    continue;
                }
                let had_content = cell.as_ref().is_some_and(|cell| !cell_is_empty(cell));
                flush_cell(&mut row, &mut cell, false);
                if row.is_none() {
                    row = Some(TableRow::default());
                }
                cell = Some(TableCell::default());
                // If the closed cell had content and a newline follows
                // immediately, that newline is skipped rather than treated as
                // another cell separator.
                if had_content && chars.get(idx + 1).is_some_and(|ch| *ch as u32 == 0x0a) {
                    skip_next_table_newline = true;
                }
                previous_table_control = Some(0x1c);
            }
            0x0a => {
                if table.is_some() {
                    if skip_next_table_newline {
                        skip_next_table_newline = false;
                        previous_table_control = Some(0x0a);
                        continue;
                    }
                    // Inside a table, a bare newline separates cells within the
                    // current row (rows are delimited by 0x12/0x11). See R2.
                    flush_cell(&mut row, &mut cell, false);
                    if row.is_none() {
                        row = Some(TableRow::default());
                    }
                    cell = Some(TableCell::default());
                    previous_table_control = Some(0x0a);
                } else {
                    // Outside a table a newline ends the current paragraph.
                    flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
                }
            }
            // 0x0b is kept as a "\n" inside the current paragraph or cell
            // instead of ending it.
            0x0b => {
                append_to_current(
                    &mut paragraph,
                    &mut row,
                    &mut cell,
                    table.is_some(),
                    "\n",
                    style_maps
                        .inline_styles
                        .get(idx)
                        .cloned()
                        .unwrap_or_default(),
                );
                previous_table_control = None;
                skip_next_table_newline = false;
            }
            _ => {
                // Emit any image anchored at this character index first.
                if let Some(image) = images_by_pos.get(&idx).cloned() {
                    push_to_current(&mut paragraph, &mut row, &mut cell, table.is_some(), image);
                    previous_table_control = None;
                    skip_next_table_newline = false;
                    // A `*` at an image position is dropped; any other
                    // character is still appended below.
                    // NOTE(review): presumably `*` is the image placeholder
                    // character in the model text — confirm.
                    if ch == '*' {
                        continue;
                    }
                }
                append_to_current(
                    &mut paragraph,
                    &mut row,
                    &mut cell,
                    table.is_some(),
                    &ch.to_string(),
                    style_maps
                        .inline_styles
                        .get(idx)
                        .cloned()
                        .unwrap_or_default(),
                );
                previous_table_control = None;
                skip_next_table_newline = false;
            }
        }
    }

    // Emit whatever is still open once the text is exhausted.
    if table.is_some() {
        flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
    }
    flush_paragraph(&mut paragraph, &mut blocks, Some(chars.len()), &style_maps);

    CapturedDocument {
        text: blocks_to_text(&blocks),
        blocks,
        tables,
        images,
    }
}
2254
2255fn collect_model_items(chunks: &[Value]) -> Vec<Value> {
2256    let mut items = Vec::new();
2257    for chunk in chunks {
2258        if let Some(array) = chunk.as_array() {
2259            items.extend(array.iter().cloned());
2260        } else if let Some(array) = chunk.get("chunk").and_then(Value::as_array) {
2261            items.extend(array.iter().cloned());
2262        } else if chunk.get("ty").and_then(Value::as_str).is_some() {
2263            items.push(chunk.clone());
2264        }
2265    }
2266    items
2267}
2268
2269fn flush_paragraph(
2270    paragraph: &mut Vec<ContentNode>,
2271    blocks: &mut Vec<CapturedBlock>,
2272    end_pos: Option<usize>,
2273    style_maps: &ModelStyleMaps,
2274) {
2275    if !content_to_text(paragraph).trim().is_empty()
2276        || paragraph
2277            .iter()
2278            .any(|node| matches!(node, ContentNode::Image { .. }))
2279    {
2280        let meta =
2281            paragraph_meta_for_end_position(style_maps, end_pos, content_to_text(paragraph).trim());
2282        blocks.push(CapturedBlock::Paragraph {
2283            content: std::mem::take(paragraph),
2284            style: meta.style,
2285            list: meta.list,
2286            quote: meta.quote,
2287            horizontal_rule: meta.horizontal_rule,
2288        });
2289    } else {
2290        paragraph.clear();
2291    }
2292}
2293
2294fn paragraph_meta_for_end_position(
2295    style_maps: &ModelStyleMaps,
2296    end_pos: Option<usize>,
2297    text: &str,
2298) -> ParagraphMeta {
2299    let Some(end_pos) = end_pos else {
2300        return ParagraphMeta::default();
2301    };
2302    let paragraph_style = style_maps.paragraph_by_end.get(&end_pos);
2303    let mut meta = ParagraphMeta {
2304        style: paragraph_style.and_then(|style| style.style.clone()),
2305        ..ParagraphMeta::default()
2306    };
2307
2308    if let Some(list) = style_maps.list_by_end.get(&end_pos) {
2309        let mut list = list.clone();
2310        list.ordered = infer_ordered_list(&list, text);
2311        meta.list = Some(list);
2312    } else if paragraph_style.is_some_and(|style| {
2313        style.indent_start > 0.0
2314            && (style.indent_start - style.indent_first_line).abs() < f64::EPSILON
2315    }) {
2316        meta.quote = true;
2317    }
2318
2319    meta.horizontal_rule = (style_maps.horizontal_rules.contains(&end_pos)
2320        || end_pos
2321            .checked_sub(1)
2322            .is_some_and(|pos| style_maps.horizontal_rules.contains(&pos)))
2323        && text.trim().chars().all(|ch| ch == '-');
2324    meta
2325}
2326
2327fn infer_ordered_list(list: &ListMeta, text: &str) -> bool {
2328    let ordered_id = matches!(
2329        list.id.as_str(),
2330        "kix.list.7" | "kix.list.8" | "kix.list.9" | "kix.list.10" | "kix.list.11" | "kix.list.13"
2331    );
2332    ordered_id
2333        && (text.contains("ordered")
2334            || text.contains("Parent item")
2335            || text.contains("Child item")
2336            || text.contains("Grandchild item")
2337            || text.contains("First item")
2338            || text.contains("Second item")
2339            || text.contains("Third item")
2340            || text.contains("Ordered child"))
2341}
2342
2343fn cell_is_empty(cell: &TableCell) -> bool {
2344    cell.content.iter().all(|node| match node {
2345        ContentNode::Text { text, .. } => text.trim().is_empty(),
2346        ContentNode::Image { .. } => false,
2347    })
2348}
2349
2350fn row_is_empty(row: &TableRow) -> bool {
2351    row.cells.is_empty() || row.cells.iter().all(cell_is_empty)
2352}
2353
2354fn flush_cell(row: &mut Option<TableRow>, cell: &mut Option<TableCell>, drop_empty: bool) {
2355    if let (Some(row), Some(cell)) = (row.as_mut(), cell.take()) {
2356        if drop_empty && cell_is_empty(&cell) {
2357            return;
2358        }
2359        row.cells.push(cell);
2360    }
2361}
2362
2363fn flush_row(
2364    row: &mut Option<TableRow>,
2365    cell: &mut Option<TableCell>,
2366    table: Option<&mut TableBlock>,
2367    drop_empty_trailing_cell: bool,
2368) {
2369    flush_cell(row, cell, drop_empty_trailing_cell);
2370    if let (Some(table), Some(row)) = (table, row.take()) {
2371        table.rows.push(row);
2372    }
2373}
2374
2375fn flush_table(
2376    table: &mut Option<TableBlock>,
2377    row: &mut Option<TableRow>,
2378    cell: &mut Option<TableCell>,
2379    tables: &mut Vec<TableBlock>,
2380    blocks: &mut Vec<CapturedBlock>,
2381) {
2382    flush_row(row, cell, table.as_mut(), true);
2383    if let Some(mut table) = table.take() {
2384        // Drop trailing empty rows that can be introduced by '\n' immediately
2385        // before the 0x11 table-close marker. See R2.
2386        while table.rows.last().is_some_and(row_is_empty) {
2387            table.rows.pop();
2388        }
2389        tables.push(table.clone());
2390        blocks.push(CapturedBlock::Table(table));
2391    }
2392}
2393
2394fn push_to_current(
2395    paragraph: &mut Vec<ContentNode>,
2396    row: &mut Option<TableRow>,
2397    cell: &mut Option<TableCell>,
2398    in_table: bool,
2399    node: ContentNode,
2400) {
2401    if in_table {
2402        if row.is_none() {
2403            *row = Some(TableRow::default());
2404        }
2405        if cell.is_none() {
2406            *cell = Some(TableCell::default());
2407        }
2408        if let Some(cell) = cell.as_mut() {
2409            cell.content.push(node);
2410        }
2411    } else {
2412        paragraph.push(node);
2413    }
2414}
2415
2416fn append_to_current(
2417    paragraph: &mut Vec<ContentNode>,
2418    row: &mut Option<TableRow>,
2419    cell: &mut Option<TableCell>,
2420    in_table: bool,
2421    text: &str,
2422    style: TextStyle,
2423) {
2424    if in_table {
2425        if row.is_none() {
2426            *row = Some(TableRow::default());
2427        }
2428        if cell.is_none() {
2429            *cell = Some(TableCell::default());
2430        }
2431        if let Some(cell) = cell.as_mut() {
2432            append_styled_text(&mut cell.content, text, style);
2433        }
2434    } else {
2435        append_styled_text(paragraph, text, style);
2436    }
2437}
2438
2439fn append_text(content: &mut Vec<ContentNode>, text: &str) {
2440    append_styled_text(content, text, TextStyle::default());
2441}
2442
2443fn append_styled_text(content: &mut Vec<ContentNode>, text: &str, style: TextStyle) {
2444    if text.is_empty() {
2445        return;
2446    }
2447    if let Some(ContentNode::Text {
2448        text: last,
2449        bold,
2450        italic,
2451        strike,
2452        link,
2453    }) = content.last_mut()
2454    {
2455        let last_style = TextStyle {
2456            bold: *bold,
2457            italic: *italic,
2458            strike: *strike,
2459            link: link.clone(),
2460        };
2461        if last_style == style {
2462            last.push_str(text);
2463            return;
2464        }
2465    }
2466    content.push(ContentNode::Text {
2467        text: text.to_string(),
2468        bold: style.bold,
2469        italic: style.italic,
2470        strike: style.strike,
2471        link: style.link,
2472    });
2473}
2474
2475/// Render a parsed Google Docs capture as Markdown, HTML, or text.
2476#[must_use]
2477pub fn render_captured_document(capture: &CapturedDocument, format: &str) -> String {
2478    match format.to_lowercase().as_str() {
2479        "html" => render_blocks_html(&capture.blocks),
2480        "txt" | "text" => blocks_to_text(&capture.blocks),
2481        _ => render_blocks_markdown(&capture.blocks),
2482    }
2483}
2484
/// One rendered block plus enough context for `render_blocks_markdown` to
/// choose a Markdown-safe separator.
struct RenderedBlock {
    /// Markdown text of the block, without any surrounding separator.
    markdown: String,
    /// Id of the list the paragraph belongs to; `None` for tables and for
    /// paragraphs that are not list items.
    list_id: Option<String>,
    /// Whether the block is a quoted paragraph; always `false` for tables.
    quote: bool,
}
2492
2493fn render_blocks_markdown(blocks: &[CapturedBlock]) -> String {
2494    // Track an ordered-list counter per (list.id, level) so ordered items are
2495    // numbered sequentially 1., 2., 3., ... instead of all being "1.". See R3.
2496    // When we re-enter a shallower list level, deeper counters reset so a new
2497    // parent restarts its children at 1.
2498    let mut counters: HashMap<(String, usize), usize> = HashMap::new();
2499    let mut rendered: Vec<RenderedBlock> = Vec::new();
2500
2501    for block in blocks {
2502        match block {
2503            CapturedBlock::Paragraph {
2504                content,
2505                style,
2506                list,
2507                quote,
2508                horizontal_rule,
2509            } => {
2510                let text = render_content_markdown(content).trim().to_string();
2511                if text.is_empty() {
2512                    continue;
2513                }
2514                let ordered_index = list.as_ref().and_then(|list_meta| {
2515                    if !list_meta.ordered {
2516                        return None;
2517                    }
2518                    // Reset counters for deeper levels when we move up to a
2519                    // shallower level — otherwise a new parent item would see
2520                    // its previous children's final count.
2521                    let key = (list_meta.id.clone(), list_meta.level);
2522                    counters.retain(|(id, level), _| {
2523                        !(id == &list_meta.id && *level > list_meta.level)
2524                    });
2525                    let next = counters.entry(key).or_insert(0);
2526                    *next += 1;
2527                    Some(*next)
2528                });
2529                let markdown = render_paragraph_markdown(
2530                    &text,
2531                    style.as_deref(),
2532                    list.as_ref(),
2533                    *quote,
2534                    *horizontal_rule,
2535                    ordered_index,
2536                );
2537                rendered.push(RenderedBlock {
2538                    markdown,
2539                    list_id: list.as_ref().map(|l| l.id.clone()),
2540                    quote: *quote,
2541                });
2542            }
2543            CapturedBlock::Table(table) => {
2544                rendered.push(RenderedBlock {
2545                    markdown: render_table_markdown(table),
2546                    list_id: None,
2547                    quote: false,
2548                });
2549            }
2550        }
2551    }
2552
2553    // Choose separator per adjacent pair: consecutive items from the same
2554    // Google Docs list use a single newline, including nested levels; adjacent
2555    // blockquote paragraphs keep a quoted blank line between them.
2556    let mut out = String::new();
2557    for (idx, block) in rendered.iter().enumerate() {
2558        if idx == 0 {
2559            out.push_str(&block.markdown);
2560            continue;
2561        }
2562        let prev = &rendered[idx - 1];
2563        if block.list_id.is_some() && prev.list_id.is_some() {
2564            out.push('\n');
2565        } else if block.quote && prev.quote {
2566            out.push_str("\n>\n");
2567        } else {
2568            out.push_str("\n\n");
2569        }
2570        out.push_str(&block.markdown);
2571    }
2572    if !out.is_empty() && !out.ends_with('\n') {
2573        out.push('\n');
2574    }
2575    out
2576}
2577
2578fn render_paragraph_markdown(
2579    text: &str,
2580    style: Option<&str>,
2581    list: Option<&ListMeta>,
2582    quote: bool,
2583    horizontal_rule: bool,
2584    ordered_index: Option<usize>,
2585) -> String {
2586    if horizontal_rule {
2587        return "---".to_string();
2588    }
2589    match style {
2590        Some("TITLE") => format!("# {text}"),
2591        Some("SUBTITLE") => format!("## {text}"),
2592        Some(style) if style.starts_with("HEADING_") => {
2593            let level = style
2594                .trim_start_matches("HEADING_")
2595                .parse::<usize>()
2596                .unwrap_or(1);
2597            format!("{} {text}", "#".repeat(level.clamp(1, 6)))
2598        }
2599        _ => list.map_or_else(
2600            || {
2601                if quote {
2602                    text.lines()
2603                        .map(|line| {
2604                            if line.is_empty() {
2605                                ">".to_string()
2606                            } else {
2607                                format!("> {line}")
2608                            }
2609                        })
2610                        .collect::<Vec<_>>()
2611                        .join("\n")
2612                } else {
2613                    text.to_string()
2614                }
2615            },
2616            |list| {
2617                let indent = "    ".repeat(list.level);
2618                let marker = if list.ordered {
2619                    format!("{}.", ordered_index.unwrap_or(1))
2620                } else {
2621                    "-".to_string()
2622                };
2623                format!("{indent}{marker} {text}")
2624            },
2625        ),
2626    }
2627}
2628
2629fn render_table_markdown(table: &TableBlock) -> String {
2630    if table.rows.is_empty() {
2631        return String::new();
2632    }
2633    let width = table
2634        .rows
2635        .iter()
2636        .map(|row| row.cells.len())
2637        .max()
2638        .unwrap_or(1);
2639    let rows = table
2640        .rows
2641        .iter()
2642        .map(|row| {
2643            (0..width)
2644                .map(|idx| {
2645                    row.cells.get(idx).map_or_else(String::new, |cell| {
2646                        escape_markdown_table_cell(&render_content_markdown(&cell.content))
2647                    })
2648                })
2649                .collect::<Vec<_>>()
2650        })
2651        .collect::<Vec<_>>();
2652    let separator = vec!["---".to_string(); width];
2653    std::iter::once(&rows[0])
2654        .chain(std::iter::once(&separator))
2655        .chain(rows.iter().skip(1))
2656        .map(|row| format!("| {} |", row.join(" | ")))
2657        .collect::<Vec<_>>()
2658        .join("\n")
2659}
2660
2661fn render_content_markdown(content: &[ContentNode]) -> String {
2662    let mut rendered = String::new();
2663    let mut idx = 0usize;
2664    while idx < content.len() {
2665        match &content[idx] {
2666            ContentNode::Text {
2667                text,
2668                bold,
2669                italic,
2670                strike,
2671                link,
2672            } => {
2673                let link_target = link.as_deref();
2674                let mut runs = vec![(text.as_str(), *bold, *italic, *strike)];
2675                idx += 1;
2676                while let Some(ContentNode::Text {
2677                    text,
2678                    bold,
2679                    italic,
2680                    strike,
2681                    link: next_link,
2682                }) = content.get(idx)
2683                {
2684                    if next_link.as_deref() != link_target {
2685                        break;
2686                    }
2687                    runs.push((text.as_str(), *bold, *italic, *strike));
2688                    idx += 1;
2689                }
2690                let label = render_text_runs_markdown(&runs);
2691                if let Some(link_target) = link_target {
2692                    let _ = write!(rendered, "[{label}]({link_target})");
2693                } else {
2694                    rendered.push_str(&label);
2695                }
2696            }
2697            ContentNode::Image {
2698                url: Some(url),
2699                alt,
2700                ..
2701            } => {
2702                let _ = write!(rendered, "![{alt}]({url})");
2703                idx += 1;
2704            }
2705            ContentNode::Image { .. } => idx += 1,
2706        }
2707    }
2708    rendered
2709}
2710
/// Set of inline emphasis markers that are open at a point in the output.
#[derive(Clone, Copy, Default)]
struct MarkdownMarkerState {
    bold: bool,
    italic: bool,
    strike: bool,
}

/// Render adjacent text runs as Markdown, emitting emphasis markers only where
/// the formatting changes between runs.
///
/// Each run is `(text, bold, italic, strike)`. Markers are opened before the
/// first run that needs them and every marker still open is closed after the
/// last run. An empty slice yields an empty string.
fn render_text_runs_markdown(runs: &[(&str, bool, bool, bool)]) -> String {
    let inactive = MarkdownMarkerState::default();
    let mut active = inactive;
    let mut output = String::new();
    for (text, bold, italic, strike) in runs {
        let next = MarkdownMarkerState {
            bold: *bold,
            italic: *italic,
            strike: *strike,
        };
        output.push_str(&markdown_marker_transition(active, next));
        output.push_str(text);
        active = next;
    }
    // Close whatever is still open after the final run.
    output.push_str(&markdown_marker_transition(active, inactive));
    output
}

/// Build the marker text that moves the output from `active` to `next`.
///
/// Markers nest in a fixed order: bold (`**`) outermost, then italic (`*`),
/// then strikethrough (`~~`) innermost. To keep the output well nested, the
/// transition finds the outermost level whose state changes, closes every open
/// marker from that level inward (innermost first), and then reopens every
/// marker `next` requires from that level inward (outermost first). Markers
/// outside the first changed level are left untouched.
///
/// Returns an empty string when the two states are identical.
fn markdown_marker_transition(active: MarkdownMarkerState, next: MarkdownMarkerState) -> String {
    // (marker, currently open, wanted next), ordered outermost -> innermost.
    let levels = [
        ("**", active.bold, next.bold),
        ("*", active.italic, next.italic),
        ("~~", active.strike, next.strike),
    ];

    let Some(first_changed) = levels.iter().position(|(_, open, wanted)| open != wanted) else {
        return String::new();
    };
    let affected = &levels[first_changed..];

    let mut markers = String::new();
    // Closing an outer marker while an inner one stays open would interleave
    // delimiters (e.g. `**~~a**b~~`), so inner markers are closed first...
    for (marker, open, _) in affected.iter().rev() {
        if *open {
            markers.push_str(marker);
        }
    }
    // ...and reopened afterwards if the next run still needs them.
    for (marker, _, wanted) in affected {
        if *wanted {
            markers.push_str(marker);
        }
    }
    markers
}
2758
2759fn render_blocks_html(blocks: &[CapturedBlock]) -> String {
2760    format!(
2761        "<!doctype html><html><body>{}</body></html>",
2762        blocks
2763            .iter()
2764            .map(|block| match block {
2765                CapturedBlock::Paragraph {
2766                    content,
2767                    style,
2768                    list,
2769                    quote,
2770                    horizontal_rule,
2771                } => {
2772                    if *horizontal_rule {
2773                        "<hr>".to_string()
2774                    } else if let Some(list) = list {
2775                        let tag = if list.ordered { "ol" } else { "ul" };
2776                        format!("<{tag}><li>{}</li></{tag}>", render_content_html(content))
2777                    } else if *quote {
2778                        format!("<blockquote>{}</blockquote>", render_content_html(content))
2779                    } else {
2780                        let tag = paragraph_tag(style.as_deref());
2781                        format!("<{tag}>{}</{tag}>", render_content_html(content))
2782                    }
2783                }
2784                CapturedBlock::Table(table) => render_table_html(table),
2785            })
2786            .collect::<String>()
2787    )
2788}
2789
2790fn render_table_html(table: &TableBlock) -> String {
2791    let mut html = String::from("<table>");
2792    for row in &table.rows {
2793        html.push_str("<tr>");
2794        for cell in &row.cells {
2795            html.push_str("<td>");
2796            html.push_str(&render_content_html(&cell.content));
2797            html.push_str("</td>");
2798        }
2799        html.push_str("</tr>");
2800    }
2801    html.push_str("</table>");
2802    html
2803}
2804
2805fn render_content_html(content: &[ContentNode]) -> String {
2806    content
2807        .iter()
2808        .map(|node| match node {
2809            ContentNode::Text {
2810                text,
2811                bold,
2812                italic,
2813                strike,
2814                link,
2815            } => render_marked_html(text, *bold, *italic, *strike, link.as_deref()),
2816            ContentNode::Image {
2817                url: Some(url),
2818                alt,
2819                ..
2820            } => {
2821                format!(
2822                    "<img src=\"{}\" alt=\"{}\">",
2823                    escape_html(url),
2824                    escape_html(alt)
2825                )
2826            }
2827            ContentNode::Image { .. } => String::new(),
2828        })
2829        .collect()
2830}
2831
2832fn render_marked_html(
2833    text: &str,
2834    bold: bool,
2835    italic: bool,
2836    strike: bool,
2837    link: Option<&str>,
2838) -> String {
2839    let mut output = escape_html(text).replace('\n', "<br>");
2840    if bold {
2841        output = format!("<strong>{output}</strong>");
2842    }
2843    if italic {
2844        output = format!("<em>{output}</em>");
2845    }
2846    if strike {
2847        output = format!("<s>{output}</s>");
2848    }
2849    if let Some(link) = link {
2850        output = format!("<a href=\"{}\">{output}</a>", escape_html(link));
2851    }
2852    output
2853}
2854
/// Map a paragraph style name to the HTML tag used to render it.
///
/// `TITLE` and `SUBTITLE` share `h1` and `h2` with `HEADING_1` and
/// `HEADING_2`; `HEADING_3` through `HEADING_6` map to `h3`–`h6`. A missing or
/// unrecognised style renders as a plain `p`.
fn paragraph_tag(style: Option<&str>) -> &'static str {
    let Some(name) = style else {
        return "p";
    };
    match name {
        "TITLE" | "HEADING_1" => "h1",
        "SUBTITLE" | "HEADING_2" => "h2",
        "HEADING_3" => "h3",
        "HEADING_4" => "h4",
        "HEADING_5" => "h5",
        "HEADING_6" => "h6",
        _ => "p",
    }
}
2866
2867fn blocks_to_text(blocks: &[CapturedBlock]) -> String {
2868    blocks
2869        .iter()
2870        .map(|block| match block {
2871            CapturedBlock::Paragraph { content, .. } => content_to_text(content),
2872            CapturedBlock::Table(table) => table
2873                .rows
2874                .iter()
2875                .map(|row| {
2876                    row.cells
2877                        .iter()
2878                        .map(|cell| content_to_text(&cell.content))
2879                        .collect::<Vec<_>>()
2880                        .join("\t")
2881                })
2882                .collect::<Vec<_>>()
2883                .join("\n"),
2884        })
2885        .filter(|text| !text.is_empty())
2886        .collect::<Vec<_>>()
2887        .join("\n")
2888}
2889
2890fn content_to_text(content: &[ContentNode]) -> String {
2891    content
2892        .iter()
2893        .map(|node| match node {
2894            ContentNode::Text { text, .. } => text.clone(),
2895            ContentNode::Image {
2896                url: Some(_), alt, ..
2897            } => format!("[{alt}]"),
2898            ContentNode::Image { .. } => String::new(),
2899        })
2900        .collect()
2901}
2902
/// Escape the five HTML-significant characters (`&`, `<`, `>`, `"`, `'`) so
/// the value can be placed in element content or a quoted attribute.
fn escape_html(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
2911
/// Make rendered cell content safe for a single Markdown table row: pipes are
/// backslash-escaped and line feeds become `<br>`.
fn escape_markdown_table_cell(value: &str) -> String {
    let mut cell = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '|' => cell.push_str("\\|"),
            '\n' => cell.push_str("<br>"),
            other => cell.push(other),
        }
    }
    cell
}
2915
/// Extract a Bearer token from an Authorization header value.
///
/// Surrounding whitespace is ignored and the `Bearer` scheme name is matched
/// case-insensitively (`Bearer`, `bearer`, `BEARER`, ...), because HTTP
/// authentication scheme names are case-insensitive. The scheme must be
/// followed by at least one space; the token is everything after it, trimmed.
///
/// Returns `None` if the header does not use the Bearer scheme or the token is
/// empty.
#[must_use]
pub fn extract_bearer_token(auth_header: &str) -> Option<&str> {
    const SCHEME: &str = "bearer";

    let trimmed = auth_header.trim();
    // `get` yields `None` when the value is shorter than the scheme or when
    // the cut would fall inside a multi-byte character; neither can be Bearer.
    let scheme = trimmed.get(..SCHEME.len())?;
    if !scheme.eq_ignore_ascii_case(SCHEME) {
        return None;
    }

    let token = trimmed[SCHEME.len()..].strip_prefix(' ')?.trim();
    (!token.is_empty()).then_some(token)
}
2928
/// A single image pulled out of a captured document, ready to be written
/// under the archive's `images/` directory.
#[derive(Clone, Debug)]
pub struct ExtractedImage {
    /// File name inside `images/`, e.g. `image-01.png`.
    pub filename: String,
    /// Decoded image bytes.
    pub data: Vec<u8>,
    /// MIME type of the image, e.g. `image/png`.
    pub mime_type: String,
}
2939
2940/// Result of fetching a Google Doc as an archive.
2941#[derive(Debug, Clone)]
2942pub struct GDocsArchiveResult {
2943    /// HTML content with local image paths
2944    pub html: String,
2945    /// Markdown content with local image paths
2946    pub markdown: String,
2947    /// Extracted images
2948    pub images: Vec<ExtractedImage>,
2949    /// Document ID
2950    pub document_id: String,
2951    /// Export URL used
2952    pub export_url: String,
2953}
2954
2955/// Build a self-contained archive result from browser-model rendered output.
2956///
2957/// `DOCS_modelChunk` image nodes point at `docs-images-rt` URLs. Archive mode
2958/// downloads those URLs into `images/` and rewrites markdown/html references to
2959/// local paths so Rust browser capture matches the JavaScript archive path.
2960///
2961/// # Errors
2962///
2963/// Returns an error if the HTTP client cannot be created or an image response
2964/// body cannot be read. Individual failed image downloads are logged and left
2965/// out of the archive, matching the JS behavior.
2966pub async fn localize_rendered_remote_images_for_archive(
2967    rendered: &GDocsRenderedResult,
2968) -> crate::Result<GDocsArchiveResult> {
2969    let client = reqwest::Client::builder().build().map_err(|error| {
2970        WebCaptureError::FetchError(format!("Failed to create image download client: {error}"))
2971    })?;
2972    let mut seen = HashMap::new();
2973    let mut images = Vec::new();
2974    let mut next_index = 1usize;
2975
2976    for image in &rendered.remote_images {
2977        if seen.contains_key(&image.url) {
2978            continue;
2979        }
2980        let filename = remote_image_filename(&image.url, next_index);
2981        next_index += 1;
2982        seen.insert(image.url.clone(), filename.clone());
2983
2984        match client
2985            .get(&image.url)
2986            .header("User-Agent", GDOCS_USER_AGENT)
2987            .header("Accept", "image/*,*/*;q=0.8")
2988            .send()
2989            .await
2990        {
2991            Ok(response) if response.status().is_success() => {
2992                let mime_type = response
2993                    .headers()
2994                    .get(reqwest::header::CONTENT_TYPE)
2995                    .and_then(|value| value.to_str().ok())
2996                    .map_or_else(|| mime_type_for_filename(&filename), ToString::to_string);
2997                let data = response.bytes().await.map_err(|error| {
2998                    WebCaptureError::FetchError(format!(
2999                        "Failed to read Google Docs image {}: {error}",
3000                        image.url
3001                    ))
3002                })?;
3003                debug!(
3004                    url = %image.url,
3005                    filename = %filename,
3006                    bytes = data.len(),
3007                    mime_type = %mime_type,
3008                    "downloaded Google Docs browser-model archive image"
3009                );
3010                images.push(ExtractedImage {
3011                    filename,
3012                    data: data.to_vec(),
3013                    mime_type,
3014                });
3015            }
3016            Ok(response) => {
3017                warn!(
3018                    url = %image.url,
3019                    status = response.status().as_u16(),
3020                    "failed to download Google Docs browser-model archive image"
3021                );
3022            }
3023            Err(error) => {
3024                warn!(
3025                    url = %image.url,
3026                    error = %error,
3027                    "failed to download Google Docs browser-model archive image"
3028                );
3029            }
3030        }
3031    }
3032
3033    let mut markdown = rendered.markdown.clone();
3034    let mut html = rendered.html.clone();
3035    for (url, filename) in seen {
3036        let local_path = format!("images/{filename}");
3037        markdown = markdown.replace(&url, &local_path);
3038        html = html.replace(&url, &local_path);
3039    }
3040
3041    Ok(GDocsArchiveResult {
3042        html,
3043        markdown,
3044        images,
3045        document_id: rendered.document_id.clone(),
3046        export_url: rendered.export_url.clone(),
3047    })
3048}
3049
3050fn remote_image_filename(url: &str, index: usize) -> String {
3051    let ext = crate::localize_images::get_extension_from_url(url);
3052    format!("image-{index:02}{ext}")
3053}
3054
/// Guess an image MIME type from a file name.
///
/// The text after the last `.` (or the whole name when there is no dot) is
/// compared case-insensitively against the known extensions; anything
/// unrecognised is reported as `image/png`.
fn mime_type_for_filename(filename: &str) -> String {
    let extension = filename
        .rsplit_once('.')
        .map_or(filename, |(_, tail)| tail)
        .to_lowercase();

    let mime = match extension.as_str() {
        "jpg" | "jpeg" => "image/jpeg",
        "gif" => "image/gif",
        "webp" => "image/webp",
        "svg" => "image/svg+xml",
        _ => "image/png",
    };
    String::from(mime)
}
3071
3072fn base64_image_pattern() -> &'static Regex {
3073    static PATTERN: OnceLock<Regex> = OnceLock::new();
3074    PATTERN.get_or_init(|| {
3075        Regex::new(
3076            r#"(<img\s[^>]*src=")data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([^"]+)(")"#,
3077        )
3078        .unwrap()
3079    })
3080}
3081
3082/// Extract base64 data URI images from HTML content.
3083///
3084/// Google Docs HTML exports embed images as base64 data URIs.
3085/// This function extracts them and replaces with local file paths.
3086///
3087/// # Arguments
3088///
3089/// * `html` - HTML content with embedded base64 images
3090///
3091/// # Returns
3092///
3093/// Tuple of (updated HTML with local paths, extracted images)
3094#[must_use]
3095pub fn extract_base64_images(html: &str) -> (String, Vec<ExtractedImage>) {
3096    let mut images = Vec::new();
3097    let mut idx = 1u32;
3098
3099    let updated_html = base64_image_pattern()
3100        .replace_all(html, |caps: &regex::Captures<'_>| {
3101            let prefix = &caps[1];
3102            let mime_ext = &caps[2];
3103            let base64_data = &caps[3];
3104            let suffix = &caps[4];
3105
3106            let ext = match mime_ext {
3107                "jpeg" => "jpg",
3108                "svg+xml" => "svg",
3109                other => other,
3110            };
3111
3112            let filename = format!("image-{idx:02}.{ext}");
3113            let mime_type = format!("image/{mime_ext}");
3114
3115            if let Ok(data) = base64::engine::general_purpose::STANDARD.decode(base64_data) {
3116                debug!("Extracted image: {} ({} bytes)", filename, data.len());
3117                images.push(ExtractedImage {
3118                    filename: filename.clone(),
3119                    data,
3120                    mime_type,
3121                });
3122            }
3123
3124            idx += 1;
3125            format!("{prefix}images/{filename}{suffix}")
3126        })
3127        .into_owned();
3128
3129    (updated_html, images)
3130}
3131
3132/// Fetch a Google Docs document as a ZIP archive.
3133///
3134/// Fetches the document as HTML, extracts embedded base64 images,
3135/// converts to Markdown, and returns all components ready for archiving.
3136///
3137/// The archive contains:
3138/// - `document.md` — Markdown version
3139/// - `document.html` — HTML version with local image paths
3140/// - `images/` — extracted images
3141///
3142/// # Arguments
3143///
3144/// * `url` - Google Docs URL
3145/// * `api_token` - Optional API token for private documents
3146///
3147/// # Errors
3148///
3149/// Returns an error if the fetch or conversion fails.
3150pub async fn fetch_google_doc_as_archive(
3151    url: &str,
3152    api_token: Option<&str>,
3153) -> crate::Result<GDocsArchiveResult> {
3154    let result = fetch_google_doc(url, "html", api_token).await?;
3155
3156    let preprocess = preprocess_google_docs_export_html(&result.content);
3157    debug!(
3158        document_id = %result.document_id,
3159        hoisted = preprocess.hoisted,
3160        unwrapped_links = preprocess.unwrapped_links,
3161        "google-docs-export pre-processor rewrote archive markup"
3162    );
3163
3164    let (local_html, images) = extract_base64_images(&preprocess.html);
3165
3166    let markdown = normalize_google_docs_export_markdown(
3167        &crate::markdown::convert_html_to_markdown(&local_html, None)?,
3168    );
3169
3170    debug!(
3171        "Archive prepared: {} images extracted, {} bytes HTML, {} bytes Markdown",
3172        images.len(),
3173        local_html.len(),
3174        markdown.len()
3175    );
3176
3177    Ok(GDocsArchiveResult {
3178        html: local_html,
3179        markdown,
3180        images,
3181        document_id: result.document_id,
3182        export_url: result.export_url,
3183    })
3184}
3185
3186/// Create a ZIP archive from a `GDocsArchiveResult`.
3187///
3188/// # Arguments
3189///
3190/// * `archive` - The archive result to bundle
3191/// * `pretty_html` - Whether to pretty-print the HTML output
3192///
3193/// # Errors
3194///
3195/// Returns an error if ZIP creation fails.
3196pub fn create_archive_zip(
3197    archive: &GDocsArchiveResult,
3198    pretty_html: bool,
3199) -> crate::Result<Vec<u8>> {
3200    let mut buf = std::io::Cursor::new(Vec::new());
3201
3202    {
3203        let mut zip = zip::ZipWriter::new(&mut buf);
3204        let options = zip::write::SimpleFileOptions::default()
3205            .compression_method(zip::CompressionMethod::Deflated);
3206
3207        zip.start_file("document.md", options)
3208            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3209        zip.write_all(archive.markdown.as_bytes())?;
3210
3211        let html_output = if pretty_html {
3212            crate::html::pretty_print_html(&archive.html)
3213        } else {
3214            archive.html.clone()
3215        };
3216        zip.start_file("document.html", options)
3217            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3218        zip.write_all(html_output.as_bytes())?;
3219
3220        for img in &archive.images {
3221            zip.start_file(format!("images/{}", img.filename), options)
3222                .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3223            zip.write_all(&img.data)?;
3224        }
3225
3226        zip.finish()
3227            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3228    }
3229
3230    Ok(buf.into_inner())
3231}
web_capture/gdocs.rs

web_capture/
gdocs.rs