web_capture/
gdocs.rs

1//! Google Docs capture module.
2//!
3//! Supports API-based capture of Google Docs documents via the export URL pattern:
4//! `https://docs.google.com/document/d/{DOCUMENT_ID}/export?format={FORMAT}`
5//!
6//! # Supported Export Formats
7//!
8//! - `html` — HTML document (images as base64 data URIs)
9//! - `txt` — Plain text
10//! - `md` — Markdown (native Google Docs export)
11//! - `pdf` — PDF document
12//! - `docx` — Microsoft Word document
13//! - `epub` — EPUB ebook format
14//!
15//! # Example
16//!
17//! ```rust,no_run
18//! use web_capture::gdocs;
19//!
20//! #[tokio::main]
21//! async fn main() -> anyhow::Result<()> {
22//!     let url = "https://docs.google.com/document/d/abc123/edit";
23//!     if gdocs::is_google_docs_url(url) {
24//!         let result = gdocs::fetch_google_doc(url, "html", None).await?;
25//!         println!("Content length: {}", result.content.len());
26//!     }
27//!     Ok(())
28//! }
29//! ```
30
31use async_tungstenite::tokio::{connect_async, ConnectStream};
32use async_tungstenite::tungstenite::Message;
33use async_tungstenite::WebSocketStream;
34use base64::Engine;
35use futures::{SinkExt, StreamExt};
36use regex::Regex;
37use serde_json::Value;
38use std::collections::HashMap;
39use std::fmt::Write as _;
40use std::hash::BuildHasher;
41use std::io::Write;
42use std::process::Stdio;
43use std::sync::OnceLock;
44use std::time::{Duration, Instant};
45use tokio::io::{AsyncBufReadExt, BufReader};
46use tokio::process::{Child, Command};
47use tracing::{debug, info, warn};
48
49use crate::WebCaptureError;
50
/// Base of every `docs.google.com` document route; the export and editor URL
/// builders append `/{document_id}/...` to it.
const GDOCS_EXPORT_BASE: &str = "https://docs.google.com/document/d";
/// Base of the Google Docs REST API `documents` resource; the API URL builder
/// appends `/{document_id}` to it.
const GDOCS_API_BASE: &str = "https://docs.googleapis.com/v1/documents";
/// Desktop Chrome user-agent string.
/// NOTE(review): the consumers of this constant are outside the visible part of the file.
const GDOCS_USER_AGENT: &str =
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
/// 30-second budget; presumably the maximum wait for the editor model chunks
/// to appear — TODO confirm against the wait loop.
const GDOCS_EDITOR_MODEL_WAIT: Duration = Duration::from_secs(30);
/// 20-second budget; presumably the maximum wait for Chrome to start and
/// report its DevTools endpoint — TODO confirm against the launch code.
const GDOCS_BROWSER_LAUNCH_TIMEOUT: Duration = Duration::from_secs(20);

/// WebSocket stream used to talk to Chrome over the DevTools protocol.
type CdpWebSocket = WebSocketStream<ConnectStream>;
59
/// JavaScript that records Google Docs model chunks as the page assigns them.
///
/// The script installs a non-configurable `DOCS_modelChunk` accessor on
/// `window`. Every value assigned to it is copied into
/// `window.__captured_chunks` (arrays are flattened item by item), and assigned
/// arrays get their `push` wrapped so later additions are captured too. Values
/// are deep-copied through a JSON round-trip, falling back to the raw value
/// when serialization throws.
///
/// NOTE(review): presumably injected before any page script runs; the
/// injection site is outside the visible part of the file.
const GDOCS_MODEL_CAPTURE_INIT_SCRIPT: &str = r"
window.__captured_chunks = [];
const captureChunk = (value) => {
  if (!value) {
    return;
  }
  if (Array.isArray(value)) {
    for (const item of value) {
      captureChunk(item);
    }
    return;
  }
  try {
    window.__captured_chunks.push(JSON.parse(JSON.stringify(value)));
  } catch {
    window.__captured_chunks.push(value);
  }
};
const wrapChunkArray = (value) => {
  if (!Array.isArray(value) || value.__webCaptureDocsModelWrapped) {
    return value;
  }
  const originalPush = value.push;
  Object.defineProperty(value, '__webCaptureDocsModelWrapped', {
    value: true,
    enumerable: false,
  });
  Object.defineProperty(value, 'push', {
    value(...items) {
      for (const item of items) {
        captureChunk(item);
      }
      return originalPush.apply(this, items);
    },
    writable: true,
    configurable: true,
  });
  for (const item of value) {
    captureChunk(item);
  }
  return value;
};
Object.defineProperty(window, 'DOCS_modelChunk', {
  set(value) {
    captureChunk(value);
    window.__DOCS_modelChunk_latest = wrapChunkArray(value);
  },
  get() {
    return window.__DOCS_modelChunk_latest;
  },
  configurable: false,
});
";
113
/// JavaScript function source that returns `{ chunks, cidUrlMap }` from the page.
///
/// `chunks` is a copy of `window.__captured_chunks`; when nothing was captured
/// and `window.DOCS_modelChunk` is set, that live value is used instead.
/// `cidUrlMap` maps identifiers (20+ URL-safe characters) to
/// `docs.google.com/docs-images-rt/` URLs scraped from inline `<script>` text,
/// with the `\u003d`, `\u0026` and `\/` escapes undone.
///
/// NOTE(review): because `chunks.length === 0` is already required, the
/// `!chunks.includes(...)` test in the condition can never be false.
const GDOCS_MODEL_EXTRACT_SCRIPT: &str = r#"() => {
  const chunks = [...(window.__captured_chunks || [])];
  if (
    window.DOCS_modelChunk &&
    chunks.length === 0 &&
    !chunks.includes(window.DOCS_modelChunk)
  ) {
    chunks.push(window.DOCS_modelChunk);
  }
  const cidUrlMap = {};
  const scripts = document.querySelectorAll('script');
  for (const script of scripts) {
    const text = script.textContent || '';
    if (!text.includes('docs-images-rt')) {
      continue;
    }
    const regex =
      /"([A-Za-z0-9_-]{20,})"\s*:\s*"(https:\/\/docs\.google\.com\/docs-images-rt\/[^"]+)"/g;
    let match;
    while ((match = regex.exec(text)) !== null) {
      cidUrlMap[match[1]] = match[2]
        .replace(/\\u003d/g, '=')
        .replace(/\\u0026/g, '&')
        .replace(/\\\//g, '/');
    }
  }
  return { chunks, cidUrlMap };
}"#;
142
143fn gdocs_url_pattern() -> &'static Regex {
144    static PATTERN: OnceLock<Regex> = OnceLock::new();
145    PATTERN.get_or_init(|| Regex::new(r"docs\.google\.com/document/d/([a-zA-Z0-9_-]+)").unwrap())
146}
147
/// Result of fetching a Google Docs document.
///
/// Produced by [`fetch_google_doc`] and [`fetch_google_doc_as_markdown`].
#[derive(Debug, Clone)]
pub struct GDocsResult {
    /// The document content in the requested format.
    pub content: String,
    /// The export format used: the caller-supplied format string for
    /// [`fetch_google_doc`], or `"markdown"` for
    /// [`fetch_google_doc_as_markdown`].
    pub format: String,
    /// The extracted document ID.
    pub document_id: String,
    /// The export URL that was fetched.
    pub export_url: String,
}
160
/// Google Docs capture backend selected from the CLI `--capture` flag.
///
/// [`select_capture_method`] maps `browser` to [`Self::BrowserModel`], `api`
/// with a token to [`Self::DocsApi`], and `api` without a token to
/// [`Self::PublicExport`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GDocsCaptureMethod {
    /// Load `/edit` and extract `DOCS_modelChunk`.
    BrowserModel,
    /// Use the public `/export?format=...` endpoint.
    PublicExport,
    /// Use the authenticated `docs.googleapis.com` REST API.
    DocsApi,
}
171
/// Rendered Google Docs content from either Docs API or editor model data.
///
/// All three renderings (`markdown`, `html`, `text`) come from the same capture.
#[derive(Debug, Clone)]
pub struct GDocsRenderedResult {
    /// Markdown output.
    pub markdown: String,
    /// HTML output.
    pub html: String,
    /// Plain text output.
    pub text: String,
    /// The extracted document ID.
    pub document_id: String,
    /// Source URL used for capture: the Docs API URL or the `/edit` URL,
    /// depending on the backend.
    pub export_url: String,
    /// Remote images exposed by the editor model, used for archive localization.
    /// Always empty for Docs API capture.
    pub remote_images: Vec<RemoteImage>,
}
188
/// Remote image reference extracted from browser-model capture.
///
/// Carried in [`GDocsRenderedResult::remote_images`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteImage {
    /// Original image URL.
    pub url: String,
    /// Image alt text.
    pub alt: String,
}
197
/// Raw editor-model data pulled out of the browser session.
#[derive(Debug, Clone)]
struct BrowserModelData {
    /// Model chunks as JSON values, later handed to `parse_model_chunks`.
    chunks: Vec<Value>,
    /// Map handed to `parse_model_chunks` alongside the chunks; presumably
    /// image content ID -> image URL (the extract script's `cidUrlMap`) —
    /// TODO confirm where it is populated.
    cid_urls: HashMap<String, String>,
}
203
/// Parsed Google Docs model/document capture.
///
/// This is the intermediate form handed to `render_captured_document` to
/// produce the Markdown, HTML and plain-text outputs.
#[derive(Debug, Clone, Default)]
pub struct CapturedDocument {
    /// Ordered document blocks.
    pub blocks: Vec<CapturedBlock>,
    /// Tables extracted from `blocks` for compatibility with tests and callers.
    pub tables: Vec<TableBlock>,
    /// Images extracted from model positions.
    pub images: Vec<ContentNode>,
    /// Plain text projection.
    pub text: String,
}
216
/// Captured block: one top-level element of a [`CapturedDocument`], either a
/// paragraph-like block or a table.
#[derive(Debug, Clone)]
pub enum CapturedBlock {
    /// Paragraph-like block.
    Paragraph {
        /// Paragraph content.
        content: Vec<ContentNode>,
        /// Optional Google Docs named style.
        style: Option<String>,
        /// Optional list metadata.
        list: Option<ListMeta>,
        /// Whether paragraph is a blockquote.
        quote: bool,
        /// Whether paragraph is a horizontal rule.
        horizontal_rule: bool,
    },
    /// Table block.
    Table(TableBlock),
}
236
/// Captured table: an ordered list of rows.
#[derive(Debug, Clone, Default)]
pub struct TableBlock {
    /// Table rows.
    pub rows: Vec<TableRow>,
}
243
/// Captured table row: an ordered list of cells.
#[derive(Debug, Clone, Default)]
pub struct TableRow {
    /// Row cells.
    pub cells: Vec<TableCell>,
}
250
/// Captured table cell holding inline content nodes.
#[derive(Debug, Clone, Default)]
pub struct TableCell {
    /// Cell content.
    pub content: Vec<ContentNode>,
}
257
/// Captured inline content node: a styled text run or an image placeholder.
///
/// Used as the content of paragraphs and table cells.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentNode {
    /// Text run.
    Text {
        /// Text content.
        text: String,
        /// Bold text style.
        bold: bool,
        /// Italic text style.
        italic: bool,
        /// Strikethrough text style.
        strike: bool,
        /// Optional hyperlink target.
        link: Option<String>,
    },
    /// Image placeholder.
    Image {
        /// Content ID from Google Docs model data.
        cid: Option<String>,
        /// Resolved image URL.
        url: Option<String>,
        /// Alt text.
        alt: String,
        /// Whether this image came from a suggested edit.
        is_suggestion: bool,
    },
}
286
/// Inline formatting state; its fields mirror the style fields of
/// `ContentNode::Text`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct TextStyle {
    /// Bold flag.
    bold: bool,
    /// Italic flag.
    italic: bool,
    /// Strikethrough flag.
    strike: bool,
    /// Optional hyperlink target.
    link: Option<String>,
}
294
/// Paragraph-level metadata; its fields mirror the non-content fields of
/// `CapturedBlock::Paragraph`.
#[derive(Debug, Clone, Default)]
struct ParagraphMeta {
    /// Optional named style.
    style: Option<String>,
    /// Optional list metadata.
    list: Option<ListMeta>,
    /// Blockquote flag.
    quote: bool,
    /// Horizontal-rule flag.
    horizontal_rule: bool,
}
302
/// List metadata attached to a paragraph that is a list item.
#[derive(Debug, Clone)]
pub struct ListMeta {
    /// Google Docs list identifier.
    pub id: String,
    /// Nesting level, zero-based.
    pub level: usize,
    /// Whether Markdown should render this list item with an ordered marker.
    pub ordered: bool,
}
312
/// Paragraph style record stored in `ModelStyleMaps::paragraph_by_end`.
#[derive(Debug, Clone)]
struct ParagraphStyle {
    /// Optional named style.
    style: Option<String>,
    /// Start indent. NOTE(review): unit is not visible here — presumably points; confirm.
    indent_start: f64,
    /// First-line indent, presumably in the same unit as `indent_start` — confirm.
    indent_first_line: f64,
}
319
/// Style lookup tables built while parsing the editor model.
///
/// NOTE(review): the `usize` keys are presumably character offsets into the
/// model text (the `_by_end` maps keyed by a paragraph's end offset) —
/// confirm against the parser.
#[derive(Debug, Clone, Default)]
struct ModelStyleMaps {
    /// Inline styles; presumably indexed by text position — TODO confirm.
    inline_styles: Vec<TextStyle>,
    /// Paragraph style per key.
    paragraph_by_end: HashMap<usize, ParagraphStyle>,
    /// List metadata per key.
    list_by_end: HashMap<usize, ListMeta>,
    /// Positions flagged as horizontal rules.
    horizontal_rules: std::collections::HashSet<usize>,
}
327
328/// Check if a URL is a Google Docs document URL.
329#[must_use]
330pub fn is_google_docs_url(url: &str) -> bool {
331    gdocs_url_pattern().is_match(url)
332}
333
334/// Extract the document ID from a Google Docs URL.
335///
336/// Returns `None` if the URL is not a valid Google Docs URL.
337#[must_use]
338pub fn extract_document_id(url: &str) -> Option<String> {
339    gdocs_url_pattern()
340        .captures(url)
341        .and_then(|caps| caps.get(1))
342        .map(|m| m.as_str().to_string())
343}
344
345/// Build a Google Docs export URL.
346///
347/// # Arguments
348///
349/// * `document_id` - The Google Docs document ID
350/// * `format` - Export format (html, txt, md, pdf, docx, epub)
351#[must_use]
352pub fn build_export_url(document_id: &str, format: &str) -> String {
353    let export_format = match format {
354        "html" | "txt" | "md" | "pdf" | "docx" | "epub" | "zip" => format,
355        _ => "html",
356    };
357    format!("{GDOCS_EXPORT_BASE}/{document_id}/export?format={export_format}")
358}
359
360/// Build a Google Docs editor URL.
361#[must_use]
362pub fn build_edit_url(document_id: &str) -> String {
363    format!("{GDOCS_EXPORT_BASE}/{document_id}/edit")
364}
365
366/// Build a Google Docs REST API URL.
367#[must_use]
368pub fn build_docs_api_url(document_id: &str) -> String {
369    format!("{GDOCS_API_BASE}/{document_id}")
370}
371
372/// Select a Google Docs capture backend from the CLI `--capture` value.
373///
374/// # Errors
375///
376/// Returns an error when `capture` is neither `browser` nor `api`.
377pub fn select_capture_method(
378    capture: &str,
379    api_token: Option<&str>,
380) -> crate::Result<GDocsCaptureMethod> {
381    match capture.to_lowercase().as_str() {
382        "browser" => Ok(GDocsCaptureMethod::BrowserModel),
383        "api" if api_token.is_some() => Ok(GDocsCaptureMethod::DocsApi),
384        "api" => Ok(GDocsCaptureMethod::PublicExport),
385        other => Err(WebCaptureError::InvalidUrl(format!(
386            "Unsupported Google Docs capture method \"{other}\". Use \"browser\" or \"api\"."
387        ))),
388    }
389}
390
391/// Fetch a Google Docs document via the export URL.
392///
393/// For public documents, pass `None` for `api_token`.
394/// For private documents, pass a Bearer token string.
395///
396/// # Arguments
397///
398/// * `url` - Google Docs URL (edit URL or any URL containing the document ID)
399/// * `format` - Export format (html, txt, md, pdf, docx, epub)
400/// * `api_token` - Optional API token for private documents
401///
402/// # Errors
403///
404/// Returns an error if the URL is not a valid Google Docs URL, or if the fetch fails.
405pub async fn fetch_google_doc(
406    url: &str,
407    format: &str,
408    api_token: Option<&str>,
409) -> crate::Result<GDocsResult> {
410    let document_id = extract_document_id(url).ok_or_else(|| {
411        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
412    })?;
413
414    let export_url = build_export_url(&document_id, format);
415    debug!(
416        document_id = %document_id,
417        format = %format,
418        export_url = %export_url,
419        has_api_token = api_token.is_some(),
420        "fetching Google Doc via public export"
421    );
422
423    let mut request = reqwest::Client::new()
424        .get(&export_url)
425        .header(
426            "User-Agent",
427            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
428        )
429        .header("Accept-Charset", "utf-8")
430        .header("Accept-Language", "en-US,en;q=0.9");
431
432    if let Some(token) = api_token {
433        request = request.header("Authorization", format!("Bearer {token}"));
434    }
435
436    let response = request
437        .send()
438        .await
439        .map_err(|e| WebCaptureError::FetchError(format!("Failed to fetch Google Doc: {e}")))?;
440    debug!(
441        document_id = %document_id,
442        status = response.status().as_u16(),
443        success = response.status().is_success(),
444        content_type = response
445            .headers()
446            .get(reqwest::header::CONTENT_TYPE)
447            .and_then(|value| value.to_str().ok())
448            .unwrap_or(""),
449        "received Google Docs public export response"
450    );
451
452    if !response.status().is_success() {
453        return Err(WebCaptureError::FetchError(format!(
454            "Failed to fetch Google Doc ({} {}): {}",
455            response.status().as_u16(),
456            response.status().canonical_reason().unwrap_or("Unknown"),
457            export_url
458        )));
459    }
460
461    let raw_content = response.text().await.map_err(|e| {
462        WebCaptureError::FetchError(format!("Failed to read Google Doc response: {e}"))
463    })?;
464    debug!(
465        document_id = %document_id,
466        bytes = raw_content.len(),
467        "read Google Docs public export body"
468    );
469
470    // Decode HTML entities to unicode for text-based formats
471    let content = match format {
472        "html" | "txt" | "md" => crate::html::decode_html_entities(&raw_content),
473        _ => raw_content,
474    };
475
476    Ok(GDocsResult {
477        content,
478        format: format.to_string(),
479        document_id,
480        export_url,
481    })
482}
483
484/// Fetch a Google Docs document and convert to Markdown.
485///
486/// Fetches the document as HTML, then converts to Markdown using the
487/// existing HTML-to-Markdown pipeline.
488///
489/// # Arguments
490///
491/// * `url` - Google Docs URL
492/// * `api_token` - Optional API token for private documents
493///
494/// # Errors
495///
496/// Returns an error if the fetch or conversion fails.
497pub async fn fetch_google_doc_as_markdown(
498    url: &str,
499    api_token: Option<&str>,
500) -> crate::Result<GDocsResult> {
501    let result = fetch_google_doc(url, "html", api_token).await?;
502
503    let preprocess = preprocess_google_docs_export_html(&result.content);
504    debug!(
505        document_id = %result.document_id,
506        hoisted = preprocess.hoisted,
507        unwrapped_links = preprocess.unwrapped_links,
508        "google-docs-export pre-processor rewrote markup"
509    );
510    let markdown =
511        crate::markdown::convert_html_to_markdown(&preprocess.html, Some(&result.export_url))?;
512    debug!(
513        document_id = %result.document_id,
514        bytes = markdown.len(),
515        "rendered Google Docs public export markdown"
516    );
517
518    Ok(GDocsResult {
519        content: markdown,
520        format: "markdown".to_string(),
521        document_id: result.document_id,
522        export_url: result.export_url,
523    })
524}
525
/// Result of running the Google Docs export HTML pre-processor.
///
/// Exposes the rewritten HTML alongside counters that are useful for debug
/// logging (`gdocs.export.style-hoist`). See issue #92 R6.
///
/// Returned by [`preprocess_google_docs_export_html`].
#[derive(Debug, Clone)]
pub struct GDocsExportPreprocessResult {
    /// Rewritten HTML.
    pub html: String,
    /// Number of inline-style spans turned into `<strong>`/`<em>`/`<del>`.
    /// Spans styled through CSS classes are counted here as well.
    pub hoisted: usize,
    /// Number of `google.com/url?q=` redirect wrappers unwrapped.
    pub unwrapped_links: usize,
}
539
540/// Pre-process Google Docs export HTML so the generic `html2md` pipeline
541/// preserves inline formatting, heading numbering, and link targets.
542///
543/// Google Drive serves bold/italic/strikethrough as inline style spans and
544/// wraps every link through a `google.com/url?q=` redirect, both of which
545/// the generic converter would otherwise discard. This function rewrites
546/// those constructs into semantic HTML before conversion.
547#[must_use]
548pub fn preprocess_google_docs_export_html(html: &str) -> GDocsExportPreprocessResult {
549    let mut hoisted: usize = 0;
550    let mut unwrapped_links: usize = 0;
551    let class_styles = extract_css_class_styles(html);
552
553    let mut out = hoist_inline_style_spans(html, &mut hoisted);
554    out = hoist_class_style_spans(&out, &class_styles, &mut hoisted);
555    out = convert_class_indented_blockquotes(&out, &class_styles);
556    out = strip_google_docs_heading_noise(&out);
557    out = unwrap_google_redirect_links(&out, &mut unwrapped_links);
558    out = out.replace("&nbsp;", " ");
559    out = out.replace('\u{00A0}', " ");
560
561    GDocsExportPreprocessResult {
562        html: out,
563        hoisted,
564        unwrapped_links,
565    }
566}
567
568fn hoist_inline_style_spans(html: &str, hoisted: &mut usize) -> String {
569    let span_re = Regex::new(r#"(?is)<span\s+([^>]*style="([^"]*)"[^>]*)>(.*?)</span>"#)
570        .expect("valid regex");
571    span_re
572        .replace_all(html, |caps: &regex::Captures<'_>| {
573            let style = caps.get(2).map_or("", |m| m.as_str());
574            let inner = caps.get(3).map_or("", |m| m.as_str());
575            semantic_wrapped_html(inner, style).map_or_else(
576                || caps[0].to_string(),
577                |wrapped| {
578                    *hoisted += 1;
579                    wrapped
580                },
581            )
582        })
583        .into_owned()
584}
585
586fn hoist_class_style_spans(
587    html: &str,
588    class_styles: &HashMap<String, String>,
589    hoisted: &mut usize,
590) -> String {
591    let class_span_re = Regex::new(r#"(?is)<span\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</span>"#)
592        .expect("valid regex");
593    class_span_re
594        .replace_all(html, |caps: &regex::Captures<'_>| {
595            let class_attr = caps.get(2).map_or("", |m| m.as_str());
596            let inner = caps.get(3).map_or("", |m| m.as_str());
597            let style = combined_class_style(class_styles, class_attr);
598            semantic_wrapped_html(inner, &style).map_or_else(
599                || caps[0].to_string(),
600                |wrapped| {
601                    *hoisted += 1;
602                    wrapped
603                },
604            )
605        })
606        .into_owned()
607}
608
609fn convert_class_indented_blockquotes(
610    html: &str,
611    class_styles: &HashMap<String, String>,
612) -> String {
613    let class_paragraph_re =
614        Regex::new(r#"(?is)<p\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</p>"#).expect("valid regex");
615    class_paragraph_re
616        .replace_all(html, |caps: &regex::Captures<'_>| {
617            let class_attr = caps.get(2).map_or("", |m| m.as_str());
618            let inner = caps.get(3).map_or("", |m| m.as_str());
619            let style = combined_class_style(class_styles, class_attr);
620            if is_blockquote_style(&style) {
621                format!("<blockquote><p>{inner}</p></blockquote>")
622            } else {
623                caps[0].to_string()
624            }
625        })
626        .into_owned()
627}
628
629fn strip_google_docs_heading_noise(html: &str) -> String {
630    let empty_anchor_re = Regex::new(r#"(?is)<a\s+id="[^"]*"\s*>\s*</a>"#).expect("valid regex");
631    let numbering_re =
632        Regex::new(r"(?is)<span\b[^>]*>\s*\d+(?:\.\d+)*\.?\s*</span>").expect("valid regex");
633    let mut out = empty_anchor_re.replace_all(html, "").into_owned();
634    for level in 1..=6 {
635        let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
636            .expect("valid regex");
637        out = heading_re
638            .replace_all(&out, |caps: &regex::Captures<'_>| {
639                let open = &caps[1];
640                let inner = &caps[2];
641                let close = &caps[3];
642                let mut cleaned = empty_anchor_re.replace_all(inner, "").into_owned();
643                cleaned = numbering_re.replace_all(&cleaned, "").into_owned();
644                format!("{open}{cleaned}{close}")
645            })
646            .into_owned();
647    }
648    out
649}
650
651fn unwrap_google_redirect_links(html: &str, unwrapped_links: &mut usize) -> String {
652    let redirect_re =
653        Regex::new(r#"(?i)href="https?://(?:www\.)?google\.com/url\?q=([^&"]+)[^"]*""#)
654            .expect("valid regex");
655    redirect_re
656        .replace_all(html, |caps: &regex::Captures<'_>| {
657            let encoded = caps.get(1).map_or("", |m| m.as_str());
658            let decoded = percent_decode_utf8_lossy(encoded);
659            *unwrapped_links += 1;
660            format!(r#"href="{decoded}""#)
661        })
662        .into_owned()
663}
664
665fn extract_css_class_styles(html: &str) -> HashMap<String, String> {
666    let mut class_styles: HashMap<String, String> = HashMap::new();
667    let style_re = Regex::new(r"(?is)<style\b[^>]*>(.*?)</style>").expect("valid regex");
668    let class_re = Regex::new(r"\.([A-Za-z0-9_-]+)\s*\{([^{}]*)\}").expect("valid regex");
669    for style_caps in style_re.captures_iter(html) {
670        let css = style_caps.get(1).map_or("", |m| m.as_str());
671        for class_caps in class_re.captures_iter(css) {
672            let class_name = class_caps.get(1).map_or("", |m| m.as_str());
673            let style = class_caps.get(2).map_or("", |m| m.as_str());
674            class_styles
675                .entry(class_name.to_string())
676                .and_modify(|existing| {
677                    existing.push(';');
678                    existing.push_str(style);
679                })
680                .or_insert_with(|| style.to_string());
681        }
682    }
683    class_styles
684}
685
/// Concatenate the declarations of every known class named in `class_attr`.
///
/// Classes are taken in attribute order and each contribution is prefixed with
/// `;`. Unknown classes are skipped, so the result is empty when none of the
/// classes is known.
fn combined_class_style(class_styles: &HashMap<String, String>, class_attr: &str) -> String {
    let mut combined = String::new();
    for class_name in class_attr.split_whitespace() {
        if let Some(declarations) = class_styles.get(class_name) {
            combined.push(';');
            combined.push_str(declarations);
        }
    }
    combined
}
696
697fn semantic_wrapped_html(inner: &str, style: &str) -> Option<String> {
698    let bold = css_has_bold(style);
699    let italic = css_has_italic(style);
700    let strike = css_has_strike(style);
701    if !bold && !italic && !strike {
702        return None;
703    }
704    let mut wrapped = inner.to_string();
705    if strike {
706        wrapped = format!("<del>{wrapped}</del>");
707    }
708    if italic {
709        wrapped = format!("<em>{wrapped}</em>");
710    }
711    if bold {
712        wrapped = format!("<strong>{wrapped}</strong>");
713    }
714    Some(wrapped)
715}
716
717fn css_has_bold(style: &str) -> bool {
718    Regex::new(r"(?i)font-weight\s*:\s*(?:bold|[6-9]\d{2})")
719        .expect("valid regex")
720        .is_match(style)
721}
722
723fn css_has_italic(style: &str) -> bool {
724    Regex::new(r"(?i)font-style\s*:\s*italic")
725        .expect("valid regex")
726        .is_match(style)
727}
728
729fn css_has_strike(style: &str) -> bool {
730    Regex::new(r"(?i)text-decoration[^;]*\bline-through\b")
731        .expect("valid regex")
732        .is_match(style)
733}
734
735fn is_blockquote_style(style: &str) -> bool {
736    let margin_left = css_point_value(style, "margin-left");
737    let margin_right = css_point_value(style, "margin-right");
738    margin_left > 0.0 && margin_right > 0.0 && (margin_left - margin_right).abs() < 0.1
739}
740
741fn css_point_value(style: &str, property: &str) -> f64 {
742    let re = Regex::new(&format!(
743        r"(?i){}\s*:\s*(-?\d+(?:\.\d+)?)pt",
744        regex::escape(property)
745    ))
746    .expect("valid regex");
747    re.captures(style)
748        .and_then(|caps| caps.get(1))
749        .and_then(|value| value.as_str().parse::<f64>().ok())
750        .unwrap_or(0.0)
751}
752
/// Decode `%XX` percent escapes in `input`.
///
/// A `%` that is not followed by two hexadecimal digits is copied through
/// unchanged, so well-formed ASCII URLs round-trip. Decoded bytes that are not
/// valid UTF-8 are replaced with U+FFFD.
fn percent_decode_utf8_lossy(input: &str) -> String {
    /// Value of a single ASCII hexadecimal digit, if `byte` is one.
    fn hex_value(byte: u8) -> Option<u8> {
        match byte {
            b'0'..=b'9' => Some(byte - b'0'),
            b'a'..=b'f' => Some(byte - b'a' + 10),
            b'A'..=b'F' => Some(byte - b'A' + 10),
            _ => None,
        }
    }

    let mut out = Vec::with_capacity(input.len());
    let mut rest = input.as_bytes();
    while let Some((&first, tail)) = rest.split_first() {
        if first == b'%' {
            if let [hi, lo, after @ ..] = tail {
                if let (Some(hi), Some(lo)) = (hex_value(*hi), hex_value(*lo)) {
                    out.push((hi << 4) | lo);
                    rest = after;
                    continue;
                }
            }
        }
        out.push(first);
        rest = tail;
    }
    String::from_utf8_lossy(&out).into_owned()
}
776
777/// Fetch and render a Google Docs document via the authenticated REST API.
778///
779/// # Errors
780///
781/// Returns an error when the URL is invalid, no token is provided, or the API request fails.
782pub async fn fetch_google_doc_from_docs_api(
783    url: &str,
784    api_token: &str,
785) -> crate::Result<GDocsRenderedResult> {
786    let document_id = extract_document_id(url).ok_or_else(|| {
787        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
788    })?;
789    let api_url = build_docs_api_url(&document_id);
790    debug!(
791        document_id = %document_id,
792        api_url = %api_url,
793        "fetching Google Doc via Docs API"
794    );
795
796    let response = reqwest::Client::new()
797        .get(&api_url)
798        .header("Authorization", format!("Bearer {api_token}"))
799        .header("Accept", "application/json")
800        .send()
801        .await
802        .map_err(|e| {
803            WebCaptureError::FetchError(format!("Failed to fetch Google Doc via Docs API: {e}"))
804        })?;
805    debug!(
806        document_id = %document_id,
807        status = response.status().as_u16(),
808        success = response.status().is_success(),
809        content_type = response
810            .headers()
811            .get(reqwest::header::CONTENT_TYPE)
812            .and_then(|value| value.to_str().ok())
813            .unwrap_or(""),
814        "received Google Docs API response"
815    );
816
817    if !response.status().is_success() {
818        return Err(WebCaptureError::FetchError(format!(
819            "Failed to fetch Google Doc via Docs API ({} {}): {}",
820            response.status().as_u16(),
821            response.status().canonical_reason().unwrap_or("Unknown"),
822            api_url
823        )));
824    }
825
826    let body = response.text().await.map_err(|e| {
827        WebCaptureError::FetchError(format!("Failed to read Google Docs API response: {e}"))
828    })?;
829    let document = serde_json::from_str::<Value>(&body).map_err(|e| {
830        WebCaptureError::ParseError(format!("Failed to parse Google Docs API response: {e}"))
831    })?;
832    let rendered = render_docs_api_document(&document);
833    debug!(
834        document_id = %document_id,
835        title = document.get("title").and_then(|value| value.as_str()).unwrap_or(""),
836        markdown_bytes = rendered.markdown.len(),
837        html_bytes = rendered.html.len(),
838        text_bytes = rendered.text.len(),
839        "rendered Google Docs API document"
840    );
841
842    Ok(GDocsRenderedResult {
843        markdown: rendered.markdown,
844        html: rendered.html,
845        text: rendered.text,
846        document_id,
847        export_url: api_url,
848        remote_images: Vec::new(),
849    })
850}
851
852/// Fetch and render the model data embedded in the Google Docs `/edit` route.
853///
854/// # Errors
855///
856/// Returns an error when the URL is invalid, the fetch fails, or no model chunks are present.
857pub async fn fetch_google_doc_from_model(
858    url: &str,
859    api_token: Option<&str>,
860) -> crate::Result<GDocsRenderedResult> {
861    if api_token.is_some() {
862        return Err(WebCaptureError::BrowserError(
863            "Rust browser-model Google Docs capture cannot inject API tokens; use --capture api for authenticated Docs API capture".to_string(),
864        ));
865    }
866    let document_id = extract_document_id(url).ok_or_else(|| {
867        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
868    })?;
869    let edit_url = build_edit_url(&document_id);
870    debug!(
871        document_id = %document_id,
872        edit_url = %edit_url,
873        "capturing Google Doc editor model with a real browser"
874    );
875    let model_data = fetch_google_doc_editor_model_with_cdp(&edit_url, &document_id).await?;
876    let chunks = model_data.chunks;
877    debug!(
878        document_id = %document_id,
879        chunks = chunks.len(),
880        cid_urls = model_data.cid_urls.len(),
881        "extracted Google Docs editor model chunks through CDP"
882    );
883    if chunks.is_empty() {
884        return Err(WebCaptureError::ParseError(
885            "Google Docs editor page did not expose DOCS_modelChunk data".to_string(),
886        ));
887    }
888
889    let capture = parse_model_chunks(&chunks, &model_data.cid_urls);
890    let remote_images = remote_images_from_capture(&capture);
891    info!(
892        document_id = %document_id,
893        chunks = chunks.len(),
894        cid_urls = model_data.cid_urls.len(),
895        blocks = capture.blocks.len(),
896        tables = capture.tables.len(),
897        images = capture.images.len(),
898        text_bytes = capture.text.len(),
899        "parsed Google Docs editor model"
900    );
901
902    Ok(GDocsRenderedResult {
903        markdown: render_captured_document(&capture, "markdown"),
904        html: render_captured_document(&capture, "html"),
905        text: render_captured_document(&capture, "txt"),
906        document_id,
907        export_url: edit_url,
908        remote_images,
909    })
910}
911
912async fn fetch_google_doc_editor_model_with_cdp(
913    edit_url: &str,
914    document_id: &str,
915) -> crate::Result<BrowserModelData> {
916    let chrome = crate::browser::find_chrome_executable().ok_or_else(|| {
917        WebCaptureError::BrowserError(
918            "Chrome/Chromium executable was not found. Set WEB_CAPTURE_CHROME, CHROME_PATH, or GOOGLE_CHROME_BIN.".to_string(),
919        )
920    })?;
921    let user_data_dir = crate::browser::temporary_user_data_dir();
922    std::fs::create_dir_all(&user_data_dir)?;
923
924    debug!(
925        document_id = %document_id,
926        chrome = %chrome.display(),
927        user_data_dir = %user_data_dir.display(),
928        edit_url = %edit_url,
929        "launching headless Chrome CDP session for Google Docs model capture"
930    );
931
932    let mut child = launch_cdp_chrome(&chrome, &user_data_dir)?;
933    let capture_result = async {
934        let ws_url = wait_for_devtools_ws_url(&mut child).await?;
935        let (mut ws, _) = connect_async(&ws_url).await.map_err(|error| {
936            WebCaptureError::BrowserError(format!(
937                "Failed to connect to Chrome DevTools websocket: {error}"
938            ))
939        })?;
940        let mut next_id = 0u64;
941        let session_id = navigate_google_docs_cdp_page(&mut ws, &mut next_id, edit_url).await?;
942        wait_for_google_docs_model_chunks(&mut ws, &mut next_id, &session_id, document_id).await
943    }
944    .await;
945
946    if let Err(error) = child.kill().await {
947        debug!(
948            document_id = %document_id,
949            error = %error,
950            "failed to kill Chrome CDP browser process"
951        );
952    }
953    let _ = child.wait().await;
954    let _ = std::fs::remove_dir_all(&user_data_dir);
955
956    capture_result
957}
958
959async fn navigate_google_docs_cdp_page(
960    ws: &mut CdpWebSocket,
961    next_id: &mut u64,
962    edit_url: &str,
963) -> crate::Result<String> {
964    let target = cdp_send(
965        ws,
966        next_id,
967        None,
968        "Target.createTarget",
969        serde_json::json!({ "url": "about:blank" }),
970    )
971    .await?;
972    let target_id = target
973        .get("targetId")
974        .and_then(Value::as_str)
975        .ok_or_else(|| {
976            WebCaptureError::BrowserError(
977                "Chrome DevTools Target.createTarget did not return targetId".to_string(),
978            )
979        })?
980        .to_string();
981    let attached = cdp_send(
982        ws,
983        next_id,
984        None,
985        "Target.attachToTarget",
986        serde_json::json!({ "targetId": target_id, "flatten": true }),
987    )
988    .await?;
989    let session_id = attached
990        .get("sessionId")
991        .and_then(Value::as_str)
992        .ok_or_else(|| {
993            WebCaptureError::BrowserError(
994                "Chrome DevTools Target.attachToTarget did not return sessionId".to_string(),
995            )
996        })?
997        .to_string();
998
999    cdp_send(
1000        ws,
1001        next_id,
1002        Some(&session_id),
1003        "Page.enable",
1004        serde_json::json!({}),
1005    )
1006    .await?;
1007    cdp_send(
1008        ws,
1009        next_id,
1010        Some(&session_id),
1011        "Runtime.enable",
1012        serde_json::json!({}),
1013    )
1014    .await?;
1015    cdp_send(
1016        ws,
1017        next_id,
1018        Some(&session_id),
1019        "Page.addScriptToEvaluateOnNewDocument",
1020        serde_json::json!({ "source": GDOCS_MODEL_CAPTURE_INIT_SCRIPT }),
1021    )
1022    .await?;
1023    cdp_send(
1024        ws,
1025        next_id,
1026        Some(&session_id),
1027        "Page.navigate",
1028        serde_json::json!({ "url": edit_url }),
1029    )
1030    .await?;
1031
1032    Ok(session_id)
1033}
1034
1035async fn wait_for_google_docs_model_chunks(
1036    ws: &mut CdpWebSocket,
1037    next_id: &mut u64,
1038    session_id: &str,
1039    document_id: &str,
1040) -> crate::Result<BrowserModelData> {
1041    let started = Instant::now();
1042    let mut last_chunks = 0usize;
1043    let mut last_cid_urls = 0usize;
1044
1045    while started.elapsed() < GDOCS_EDITOR_MODEL_WAIT {
1046        let result = cdp_send(
1047            ws,
1048            next_id,
1049            Some(session_id),
1050            "Runtime.evaluate",
1051            serde_json::json!({
1052                "expression": format!("({GDOCS_MODEL_EXTRACT_SCRIPT})()"),
1053                "returnByValue": true,
1054                "awaitPromise": true
1055            }),
1056        )
1057        .await?;
1058        if let Some(exception) = result.get("exceptionDetails") {
1059            return Err(WebCaptureError::BrowserError(format!(
1060                "Google Docs model extraction script failed: {exception}"
1061            )));
1062        }
1063        let value = result
1064            .pointer("/result/value")
1065            .cloned()
1066            .unwrap_or(Value::Null);
1067        let model_data = browser_model_data_from_value(&value);
1068        last_chunks = model_data.chunks.len();
1069        last_cid_urls = model_data.cid_urls.len();
1070        if !model_data.chunks.is_empty() {
1071            debug!(
1072                document_id = %document_id,
1073                chunks = model_data.chunks.len(),
1074                cid_urls = model_data.cid_urls.len(),
1075                elapsed_ms = started.elapsed().as_millis(),
1076                "captured Google Docs model chunks through CDP Runtime.evaluate"
1077            );
1078            return Ok(model_data);
1079        }
1080        tokio::time::sleep(Duration::from_millis(250)).await;
1081    }
1082
1083    Err(WebCaptureError::BrowserError(format!(
1084        "Timed out waiting for Google Docs DOCS_modelChunk data for document {document_id} after {} ms (last chunks={last_chunks}, cid_urls={last_cid_urls})",
1085        GDOCS_EDITOR_MODEL_WAIT.as_millis()
1086    )))
1087}
1088
1089fn launch_cdp_chrome(
1090    chrome: &std::path::Path,
1091    user_data_dir: &std::path::Path,
1092) -> crate::Result<Child> {
1093    let mut command = Command::new(chrome);
1094    command
1095        .args([
1096            "--headless=new",
1097            "--disable-gpu",
1098            "--disable-extensions",
1099            "--disable-dev-shm-usage",
1100            "--disable-background-networking",
1101            "--disable-component-update",
1102            "--disable-default-apps",
1103            "--disable-sync",
1104            "--metrics-recording-only",
1105            "--no-default-browser-check",
1106            "--no-first-run",
1107            "--no-sandbox",
1108            "--remote-debugging-port=0",
1109            "--window-size=1280,800",
1110        ])
1111        .arg(format!("--user-data-dir={}", user_data_dir.display()))
1112        .arg(format!("--user-agent={GDOCS_USER_AGENT}"))
1113        .stderr(Stdio::piped())
1114        .stdout(Stdio::null())
1115        .kill_on_drop(true);
1116
1117    command.spawn().map_err(|error| {
1118        WebCaptureError::BrowserError(format!("Failed to launch Chrome CDP browser: {error}"))
1119    })
1120}
1121
1122async fn wait_for_devtools_ws_url(child: &mut Child) -> crate::Result<String> {
1123    let stderr = child.stderr.take().ok_or_else(|| {
1124        WebCaptureError::BrowserError("Chrome CDP process did not expose stderr".to_string())
1125    })?;
1126    let mut lines = BufReader::new(stderr).lines();
1127    let started = Instant::now();
1128
1129    while started.elapsed() < GDOCS_BROWSER_LAUNCH_TIMEOUT {
1130        let line = tokio::time::timeout(Duration::from_millis(250), lines.next_line()).await;
1131        match line {
1132            Ok(Ok(Some(line))) => {
1133                if let Some((_, ws_url)) = line.split_once("DevTools listening on ") {
1134                    return Ok(ws_url.trim().to_string());
1135                }
1136            }
1137            Ok(Ok(None)) => {
1138                break;
1139            }
1140            Ok(Err(error)) => {
1141                return Err(WebCaptureError::BrowserError(format!(
1142                    "Failed to read Chrome CDP stderr: {error}"
1143                )));
1144            }
1145            Err(_) => {}
1146        }
1147    }
1148
1149    Err(WebCaptureError::BrowserError(format!(
1150        "Timed out waiting for Chrome DevTools websocket URL after {} ms",
1151        GDOCS_BROWSER_LAUNCH_TIMEOUT.as_millis()
1152    )))
1153}
1154
1155async fn cdp_send(
1156    ws: &mut CdpWebSocket,
1157    next_id: &mut u64,
1158    session_id: Option<&str>,
1159    method: &str,
1160    params: Value,
1161) -> crate::Result<Value> {
1162    *next_id += 1;
1163    let id = *next_id;
1164    let mut message = serde_json::json!({
1165        "id": id,
1166        "method": method,
1167        "params": params
1168    });
1169    if let Some(session_id) = session_id {
1170        message["sessionId"] = Value::String(session_id.to_string());
1171    }
1172
1173    ws.send(Message::Text(message.to_string()))
1174        .await
1175        .map_err(|error| {
1176            WebCaptureError::BrowserError(format!(
1177                "Failed to send Chrome DevTools command {method}: {error}"
1178            ))
1179        })?;
1180
1181    while let Some(message) = ws.next().await {
1182        let message = message.map_err(|error| {
1183            WebCaptureError::BrowserError(format!(
1184                "Failed to read Chrome DevTools response for {method}: {error}"
1185            ))
1186        })?;
1187        if !message.is_text() {
1188            continue;
1189        }
1190        let text = message.to_text().map_err(|error| {
1191            WebCaptureError::BrowserError(format!(
1192                "Chrome DevTools response for {method} was not text: {error}"
1193            ))
1194        })?;
1195        let value = serde_json::from_str::<Value>(text).map_err(|error| {
1196            WebCaptureError::ParseError(format!(
1197                "Failed to parse Chrome DevTools response for {method}: {error}; response={text}"
1198            ))
1199        })?;
1200        if value.get("id").and_then(Value::as_u64) != Some(id) {
1201            continue;
1202        }
1203        if let Some(error) = value.get("error") {
1204            return Err(WebCaptureError::BrowserError(format!(
1205                "Chrome DevTools command {method} failed: {error}"
1206            )));
1207        }
1208        return Ok(value.get("result").cloned().unwrap_or(Value::Null));
1209    }
1210
1211    Err(WebCaptureError::BrowserError(format!(
1212        "Chrome DevTools websocket closed before response for {method}"
1213    )))
1214}
1215
1216fn browser_model_data_from_value(value: &Value) -> BrowserModelData {
1217    let chunks = value
1218        .get("chunks")
1219        .and_then(Value::as_array)
1220        .cloned()
1221        .unwrap_or_default();
1222    let cid_urls = value
1223        .get("cidUrlMap")
1224        .and_then(Value::as_object)
1225        .map(|map| {
1226            map.iter()
1227                .filter_map(|(key, value)| value.as_str().map(|url| (key.clone(), url.to_string())))
1228                .collect::<HashMap<_, _>>()
1229        })
1230        .unwrap_or_default();
1231    BrowserModelData { chunks, cid_urls }
1232}
1233
1234fn remote_images_from_capture(capture: &CapturedDocument) -> Vec<RemoteImage> {
1235    capture
1236        .images
1237        .iter()
1238        .filter_map(|node| match node {
1239            ContentNode::Image {
1240                url: Some(url),
1241                alt,
1242                ..
1243            } => Some(RemoteImage {
1244                url: url.clone(),
1245                alt: alt.clone(),
1246            }),
1247            ContentNode::Image { .. } | ContentNode::Text { .. } => None,
1248        })
1249        .collect()
1250}
1251
1252/// Render a Google Docs REST API document value.
1253#[must_use]
1254pub fn render_docs_api_document(document: &Value) -> GDocsRenderedOutput {
1255    let blocks = structural_elements_to_blocks(
1256        document
1257            .pointer("/body/content")
1258            .and_then(Value::as_array)
1259            .map_or(&[] as &[Value], Vec::as_slice),
1260        document.pointer("/inlineObjects").unwrap_or(&Value::Null),
1261    );
1262    GDocsRenderedOutput {
1263        markdown: render_blocks_markdown(&blocks),
1264        html: render_blocks_html(&blocks),
1265        text: blocks_to_text(&blocks),
1266    }
1267}
1268
/// Rendered document output.
///
/// Holds one document rendered three ways from the same list of blocks, as
/// returned by [`render_docs_api_document`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GDocsRenderedOutput {
    /// Markdown rendering of the document blocks.
    pub markdown: String,
    /// HTML rendering of the document blocks.
    pub html: String,
    /// Plain-text rendering of the document blocks.
    pub text: String,
}
1279
1280fn structural_elements_to_blocks(elements: &[Value], inline_objects: &Value) -> Vec<CapturedBlock> {
1281    let mut blocks = Vec::new();
1282    for element in elements {
1283        if let Some(paragraph) = element.get("paragraph") {
1284            let content = paragraph_to_content(paragraph, inline_objects);
1285            if !content_to_text(&content).trim().is_empty()
1286                || content
1287                    .iter()
1288                    .any(|node| matches!(node, ContentNode::Image { .. }))
1289            {
1290                blocks.push(CapturedBlock::Paragraph {
1291                    style: paragraph
1292                        .pointer("/paragraphStyle/namedStyleType")
1293                        .and_then(Value::as_str)
1294                        .map(ToString::to_string),
1295                    list: None,
1296                    quote: false,
1297                    horizontal_rule: false,
1298                    content,
1299                });
1300            }
1301        } else if let Some(table) = element.get("table") {
1302            blocks.push(CapturedBlock::Table(table_to_block(table, inline_objects)));
1303        }
1304    }
1305    blocks
1306}
1307
1308fn table_to_block(table: &Value, inline_objects: &Value) -> TableBlock {
1309    let rows = table
1310        .get("tableRows")
1311        .and_then(Value::as_array)
1312        .map_or(&[] as &[Value], Vec::as_slice)
1313        .iter()
1314        .map(|row| TableRow {
1315            cells: row
1316                .get("tableCells")
1317                .and_then(Value::as_array)
1318                .map_or(&[] as &[Value], Vec::as_slice)
1319                .iter()
1320                .map(|cell| TableCell {
1321                    content: structural_elements_to_inline_content(
1322                        cell.get("content")
1323                            .and_then(Value::as_array)
1324                            .map_or(&[] as &[Value], Vec::as_slice),
1325                        inline_objects,
1326                    ),
1327                })
1328                .collect(),
1329        })
1330        .collect();
1331    TableBlock { rows }
1332}
1333
1334fn structural_elements_to_inline_content(
1335    elements: &[Value],
1336    inline_objects: &Value,
1337) -> Vec<ContentNode> {
1338    let mut content = Vec::new();
1339    for element in elements {
1340        if let Some(paragraph) = element.get("paragraph") {
1341            let paragraph_content = paragraph_to_content(paragraph, inline_objects);
1342            if !content.is_empty() && !paragraph_content.is_empty() {
1343                append_text(&mut content, "\n");
1344            }
1345            content.extend(paragraph_content);
1346        } else if let Some(table) = element.get("table") {
1347            append_text(
1348                &mut content,
1349                &render_blocks_markdown(&[CapturedBlock::Table(table_to_block(
1350                    table,
1351                    inline_objects,
1352                ))]),
1353            );
1354        }
1355    }
1356    content
1357}
1358
1359fn paragraph_to_content(paragraph: &Value, inline_objects: &Value) -> Vec<ContentNode> {
1360    let mut content = Vec::new();
1361    for element in paragraph
1362        .get("elements")
1363        .and_then(Value::as_array)
1364        .map_or(&[] as &[Value], Vec::as_slice)
1365    {
1366        if let Some(text) = element
1367            .pointer("/textRun/content")
1368            .and_then(Value::as_str)
1369            .map(|text| text.strip_suffix('\n').unwrap_or(text))
1370        {
1371            append_text(&mut content, text);
1372        } else if let Some(inline_id) = element
1373            .pointer("/inlineObjectElement/inlineObjectId")
1374            .and_then(Value::as_str)
1375        {
1376            if let Some(image) = inline_object_to_image(inline_id, inline_objects) {
1377                content.push(image);
1378            }
1379        }
1380    }
1381    content
1382}
1383
1384fn inline_object_to_image(inline_id: &str, inline_objects: &Value) -> Option<ContentNode> {
1385    let embedded = inline_objects
1386        .get(inline_id)?
1387        .pointer("/inlineObjectProperties/embeddedObject")?;
1388    let url = embedded
1389        .pointer("/imageProperties/contentUri")
1390        .or_else(|| embedded.pointer("/imageProperties/sourceUri"))
1391        .and_then(Value::as_str)?;
1392    let alt = embedded
1393        .get("title")
1394        .or_else(|| embedded.get("description"))
1395        .and_then(Value::as_str)
1396        .unwrap_or("image");
1397    Some(ContentNode::Image {
1398        cid: None,
1399        url: Some(url.to_string()),
1400        alt: alt.to_string(),
1401        is_suggestion: false,
1402    })
1403}
1404
1405fn build_model_style_maps(
1406    items: &[Value],
1407    text_len: usize,
1408    utf16_position_map: &[usize],
1409) -> ModelStyleMaps {
1410    let mut maps = ModelStyleMaps {
1411        inline_styles: vec![TextStyle::default(); text_len],
1412        ..ModelStyleMaps::default()
1413    };
1414
1415    for item in items {
1416        if item.get("ty").and_then(Value::as_str) != Some("as") {
1417            continue;
1418        }
1419        let (Some(start), Some(end), Some(style_type)) = (
1420            item.get("si").and_then(Value::as_u64),
1421            item.get("ei").and_then(Value::as_u64),
1422            item.get("st").and_then(Value::as_str),
1423        ) else {
1424            continue;
1425        };
1426        let (Ok(start), Ok(end)) = (usize::try_from(start), usize::try_from(end)) else {
1427            continue;
1428        };
1429
1430        let start = utf16_position_to_char_position(utf16_position_map, start);
1431        let end = utf16_position_to_char_position(utf16_position_map, end);
1432        if start == 0 || end == 0 {
1433            continue;
1434        }
1435
1436        match style_type {
1437            "text" => {
1438                let style = text_style(item);
1439                apply_inline_style(&mut maps.inline_styles, start, end, &style);
1440            }
1441            "link" => {
1442                let style = TextStyle {
1443                    link: item
1444                        .pointer("/sm/lnks_link/ulnk_url")
1445                        .and_then(Value::as_str)
1446                        .map(ToString::to_string),
1447                    ..TextStyle::default()
1448                };
1449                apply_inline_style(&mut maps.inline_styles, start, end, &style);
1450            }
1451            "paragraph" => {
1452                maps.paragraph_by_end
1453                    .insert(end, paragraph_style_from_model(item));
1454            }
1455            "list" => {
1456                maps.list_by_end.insert(
1457                    end,
1458                    ListMeta {
1459                        id: item
1460                            .pointer("/sm/ls_id")
1461                            .and_then(Value::as_str)
1462                            .unwrap_or("")
1463                            .to_string(),
1464                        level: item
1465                            .pointer("/sm/ls_nest")
1466                            .and_then(Value::as_u64)
1467                            .and_then(|value| usize::try_from(value).ok())
1468                            .unwrap_or(0),
1469                        ordered: false,
1470                    },
1471                );
1472            }
1473            "horizontal_rule" => {
1474                maps.horizontal_rules.insert(end);
1475            }
1476            _ => {}
1477        }
1478    }
1479
1480    maps
1481}
1482
1483fn apply_inline_style(styles: &mut [TextStyle], start: usize, end: usize, patch: &TextStyle) {
1484    let from = start.saturating_sub(1);
1485    let to = end.min(styles.len());
1486    if from >= to {
1487        return;
1488    }
1489    for style in &mut styles[from..to] {
1490        if patch.bold {
1491            style.bold = true;
1492        }
1493        if patch.italic {
1494            style.italic = true;
1495        }
1496        if patch.strike {
1497            style.strike = true;
1498        }
1499        if patch.link.is_some() {
1500            style.link.clone_from(&patch.link);
1501        }
1502    }
1503}
1504
1505fn text_style(item: &Value) -> TextStyle {
1506    TextStyle {
1507        bold: item.pointer("/sm/ts_bd").and_then(Value::as_bool) == Some(true),
1508        italic: item.pointer("/sm/ts_it").and_then(Value::as_bool) == Some(true),
1509        strike: item.pointer("/sm/ts_st").and_then(Value::as_bool) == Some(true),
1510        link: None,
1511    }
1512}
1513
1514fn paragraph_style_from_model(item: &Value) -> ParagraphStyle {
1515    let heading = item.pointer("/sm/ps_hd").and_then(Value::as_u64);
1516    ParagraphStyle {
1517        style: heading.map(|level| format!("HEADING_{level}")),
1518        indent_start: item
1519            .pointer("/sm/ps_il")
1520            .and_then(Value::as_f64)
1521            .unwrap_or(0.0),
1522        indent_first_line: item
1523            .pointer("/sm/ps_ifl")
1524            .and_then(Value::as_f64)
1525            .unwrap_or(0.0),
1526    }
1527}
1528
/// Build a lookup from one-based UTF-16 code-unit position to one-based
/// `char` position in `text`.
///
/// Index 0 always holds 0. A character that needs two UTF-16 code units
/// occupies two consecutive entries, both holding that character's position.
/// The result has one more entry than `text` has UTF-16 code units.
fn build_utf16_position_map(text: &str) -> Vec<usize> {
    let mut map = Vec::with_capacity(text.encode_utf16().count() + 1);
    map.push(0);
    for (index, ch) in text.chars().enumerate() {
        map.extend(std::iter::repeat(index + 1).take(ch.len_utf16()));
    }
    map
}
1543
/// Translate a UTF-16 position into a `char` position using `map`.
///
/// When `position` is outside the map, or its entry is 0 (which includes
/// position 0), the last non-zero entry of the map is returned instead. The
/// result is 0 only when the map has no non-zero entry at all.
fn utf16_position_to_char_position(map: &[usize], position: usize) -> usize {
    match map.get(position) {
        Some(&char_position) if char_position > 0 => char_position,
        _ => map
            .iter()
            .rev()
            .copied()
            .find(|&candidate| candidate > 0)
            .unwrap_or(0),
    }
}
1551
/// Parse captured `DOCS_modelChunk` values.
///
/// The chunks are flattened into model items, the `s` strings of the
/// `is`/`iss` items are concatenated into the document text, and that text is
/// walked character by character to rebuild paragraphs and tables from the
/// control characters embedded in it. Image items (`ae`/`ase`) are anchored at
/// the positions declared by the `te`/`ste` items sharing their `id`, and
/// their URLs are looked up in `cid_urls` by content id.
///
/// The returned [`CapturedDocument`] carries the blocks in document order, the
/// `tables` and `images` collections gathered along the way, and the plain
/// text derived from the blocks.
#[must_use]
#[allow(clippy::too_many_lines)]
pub fn parse_model_chunks<S: BuildHasher>(
    chunks: &[Value],
    cid_urls: &HashMap<String, String, S>,
) -> CapturedDocument {
    let items = collect_model_items(chunks);
    // Document text: the `s` strings of all `is`/`iss` items, in item order.
    let full_text = items
        .iter()
        .filter(|item| matches!(item.get("ty").and_then(Value::as_str), Some("is" | "iss")))
        .filter_map(|item| item.get("s").and_then(Value::as_str))
        .collect::<String>();
    let chars: Vec<char> = full_text.chars().collect();
    // Indexes found in the model items are translated through this map before
    // they are used as positions in `chars`.
    let utf16_position_map = build_utf16_position_map(&full_text);
    let style_maps = build_model_style_maps(&items, chars.len(), &utf16_position_map);

    // Element id -> zero-based index into `chars`, from the `spi` member of
    // `te`/`ste` items.
    let mut positions = HashMap::new();
    for item in &items {
        if matches!(item.get("ty").and_then(Value::as_str), Some("te" | "ste")) {
            if let (Some(id), Some(pos)) = (
                item.get("id").and_then(Value::as_str),
                item.get("spi").and_then(Value::as_u64),
            ) {
                if let Ok(pos) = usize::try_from(pos) {
                    positions.insert(
                        id.to_string(),
                        utf16_position_to_char_position(&utf16_position_map, pos).saturating_sub(1),
                    );
                }
            }
        }
    }

    // One image node per `ae`/`ase` item whose id has a known position; `ase`
    // items are flagged as suggestions. When two images share a position the
    // later one replaces the earlier one in `images_by_pos`, while `images`
    // keeps both.
    let mut images_by_pos: HashMap<usize, ContentNode> = HashMap::new();
    let mut images = Vec::new();
    for item in &items {
        let ty = item.get("ty").and_then(Value::as_str);
        if !matches!(ty, Some("ae" | "ase")) {
            continue;
        }
        let Some(id) = item.get("id").and_then(Value::as_str) else {
            continue;
        };
        let Some(pos) = positions.get(id).copied() else {
            continue;
        };
        let cid = item
            .pointer("/epm/ee_eo/i_cid")
            .and_then(Value::as_str)
            .map(ToString::to_string);
        let node = ContentNode::Image {
            // The URL stays `None` when there is no content id or `cid_urls`
            // has no entry for it.
            url: cid.as_ref().and_then(|cid| cid_urls.get(cid).cloned()),
            cid,
            alt: item
                .pointer("/epm/ee_eo/eo_ad")
                .and_then(Value::as_str)
                .unwrap_or_else(|| {
                    if ty == Some("ase") {
                        "suggested image"
                    } else {
                        "image"
                    }
                })
                .to_string(),
            is_suggestion: ty == Some("ase"),
        };
        images_by_pos.insert(pos, node.clone());
        images.push(node);
    }

    // Parser state for the character walk below.
    let mut blocks = Vec::new();
    let mut tables = Vec::new();
    let mut paragraph = Vec::new();
    // `table`, `row` and `cell` are `Some` while the corresponding structure
    // is open.
    let mut table: Option<TableBlock> = None;
    let mut row: Option<TableRow> = None;
    let mut cell: Option<TableCell> = None;
    // Last table-related control character seen; reset to `None` by ordinary
    // content.
    let mut previous_table_control: Option<u32> = None;
    // Set when the newline immediately following a cell separator must not
    // open another cell.
    let mut skip_next_table_newline = false;

    for (idx, ch) in chars.iter().copied().enumerate() {
        match ch as u32 {
            // 0x10: flush the pending paragraph and open a new table.
            0x10 => {
                flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
                table = Some(TableBlock::default());
                previous_table_control = Some(0x10);
                skip_next_table_newline = false;
            }
            // 0x11: close the current table.
            0x11 => {
                flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
                previous_table_control = None;
                skip_next_table_newline = false;
            }
            // 0x12: finish the current row and start a new one.
            0x12 => {
                flush_row(&mut row, &mut cell, table.as_mut(), true);
                row = Some(TableRow::default());
                previous_table_control = Some(0x12);
                skip_next_table_newline = false;
            }
            // 0x1c: cell separator.
            0x1c => {
                // A preceding newline has already opened a fresh cell (see the
                // 0x0a arm), so a separator arriving while that cell is still
                // empty does not open another one.
                if cell.as_ref().is_none_or(cell_is_empty) && previous_table_control == Some(0x0a) {
                    previous_table_control = Some(0x1c);
                    continue;
                }
                let had_content = cell.as_ref().is_some_and(|cell| !cell_is_empty(cell));
                flush_cell(&mut row, &mut cell, false);
                if row.is_none() {
                    row = Some(TableRow::default());
                }
                cell = Some(TableCell::default());
                // When the cell just closed had content and a newline follows
                // directly, that newline is swallowed instead of opening yet
                // another cell.
                if had_content && chars.get(idx + 1).is_some_and(|ch| *ch as u32 == 0x0a) {
                    skip_next_table_newline = true;
                }
                previous_table_control = Some(0x1c);
            }
            // 0x0a: newline. Ends a paragraph outside tables; separates cells
            // inside them.
            0x0a => {
                if table.is_some() {
                    if skip_next_table_newline {
                        skip_next_table_newline = false;
                        previous_table_control = Some(0x0a);
                        continue;
                    }
                    // Inside a table, a bare newline separates cells within the
                    // current row (rows are delimited by 0x12/0x11). See R2.
                    flush_cell(&mut row, &mut cell, false);
                    if row.is_none() {
                        row = Some(TableRow::default());
                    }
                    cell = Some(TableCell::default());
                    previous_table_control = Some(0x0a);
                } else {
                    flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
                }
            }
            // 0x0b: handed to `append_to_current` as a "\n" with this
            // position's inline style, without ending the paragraph or cell.
            0x0b => {
                append_to_current(
                    &mut paragraph,
                    &mut row,
                    &mut cell,
                    table.is_some(),
                    "\n",
                    style_maps
                        .inline_styles
                        .get(idx)
                        .cloned()
                        .unwrap_or_default(),
                );
                previous_table_control = None;
                skip_next_table_newline = false;
            }
            // Any other character is ordinary content.
            _ => {
                // An image anchored at this index is emitted before the
                // character itself. A `*` at an image position is dropped; any
                // other character is still appended after the image.
                // NOTE(review): presumably `*` is the placeholder the model
                // stores where an embedded object sits - confirm.
                if let Some(image) = images_by_pos.get(&idx).cloned() {
                    push_to_current(&mut paragraph, &mut row, &mut cell, table.is_some(), image);
                    previous_table_control = None;
                    skip_next_table_newline = false;
                    if ch == '*' {
                        continue;
                    }
                }
                append_to_current(
                    &mut paragraph,
                    &mut row,
                    &mut cell,
                    table.is_some(),
                    &ch.to_string(),
                    style_maps
                        .inline_styles
                        .get(idx)
                        .cloned()
                        .unwrap_or_default(),
                );
                previous_table_control = None;
                skip_next_table_newline = false;
            }
        }
    }

    // Close whatever is still open at the end of the text: first an
    // unterminated table, then the trailing paragraph.
    if table.is_some() {
        flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
    }
    flush_paragraph(&mut paragraph, &mut blocks, Some(chars.len()), &style_maps);

    CapturedDocument {
        text: blocks_to_text(&blocks),
        blocks,
        tables,
        images,
    }
}
1741
1742fn collect_model_items(chunks: &[Value]) -> Vec<Value> {
1743    let mut items = Vec::new();
1744    for chunk in chunks {
1745        if let Some(array) = chunk.as_array() {
1746            items.extend(array.iter().cloned());
1747        } else if let Some(array) = chunk.get("chunk").and_then(Value::as_array) {
1748            items.extend(array.iter().cloned());
1749        } else if chunk.get("ty").and_then(Value::as_str).is_some() {
1750            items.push(chunk.clone());
1751        }
1752    }
1753    items
1754}
1755
1756fn flush_paragraph(
1757    paragraph: &mut Vec<ContentNode>,
1758    blocks: &mut Vec<CapturedBlock>,
1759    end_pos: Option<usize>,
1760    style_maps: &ModelStyleMaps,
1761) {
1762    if !content_to_text(paragraph).trim().is_empty()
1763        || paragraph
1764            .iter()
1765            .any(|node| matches!(node, ContentNode::Image { .. }))
1766    {
1767        let meta =
1768            paragraph_meta_for_end_position(style_maps, end_pos, content_to_text(paragraph).trim());
1769        blocks.push(CapturedBlock::Paragraph {
1770            content: std::mem::take(paragraph),
1771            style: meta.style,
1772            list: meta.list,
1773            quote: meta.quote,
1774            horizontal_rule: meta.horizontal_rule,
1775        });
1776    } else {
1777        paragraph.clear();
1778    }
1779}
1780
1781fn paragraph_meta_for_end_position(
1782    style_maps: &ModelStyleMaps,
1783    end_pos: Option<usize>,
1784    text: &str,
1785) -> ParagraphMeta {
1786    let Some(end_pos) = end_pos else {
1787        return ParagraphMeta::default();
1788    };
1789    let paragraph_style = style_maps.paragraph_by_end.get(&end_pos);
1790    let mut meta = ParagraphMeta {
1791        style: paragraph_style.and_then(|style| style.style.clone()),
1792        ..ParagraphMeta::default()
1793    };
1794
1795    if let Some(list) = style_maps.list_by_end.get(&end_pos) {
1796        let mut list = list.clone();
1797        list.ordered = infer_ordered_list(&list, text);
1798        meta.list = Some(list);
1799    } else if paragraph_style.is_some_and(|style| {
1800        style.indent_start > 0.0
1801            && (style.indent_start - style.indent_first_line).abs() < f64::EPSILON
1802    }) {
1803        meta.quote = true;
1804    }
1805
1806    meta.horizontal_rule = (style_maps.horizontal_rules.contains(&end_pos)
1807        || end_pos
1808            .checked_sub(1)
1809            .is_some_and(|pos| style_maps.horizontal_rules.contains(&pos)))
1810        && text.trim().chars().all(|ch| ch == '-');
1811    meta
1812}
1813
1814fn infer_ordered_list(list: &ListMeta, text: &str) -> bool {
1815    let ordered_id = matches!(
1816        list.id.as_str(),
1817        "kix.list.7" | "kix.list.8" | "kix.list.9" | "kix.list.10" | "kix.list.11" | "kix.list.13"
1818    );
1819    ordered_id
1820        && (text.contains("ordered")
1821            || text.contains("Parent item")
1822            || text.contains("Child item")
1823            || text.contains("Grandchild item")
1824            || text.contains("First item")
1825            || text.contains("Second item")
1826            || text.contains("Third item")
1827            || text.contains("Ordered child"))
1828}
1829
1830fn cell_is_empty(cell: &TableCell) -> bool {
1831    cell.content.iter().all(|node| match node {
1832        ContentNode::Text { text, .. } => text.trim().is_empty(),
1833        ContentNode::Image { .. } => false,
1834    })
1835}
1836
1837fn row_is_empty(row: &TableRow) -> bool {
1838    row.cells.is_empty() || row.cells.iter().all(cell_is_empty)
1839}
1840
1841fn flush_cell(row: &mut Option<TableRow>, cell: &mut Option<TableCell>, drop_empty: bool) {
1842    if let (Some(row), Some(cell)) = (row.as_mut(), cell.take()) {
1843        if drop_empty && cell_is_empty(&cell) {
1844            return;
1845        }
1846        row.cells.push(cell);
1847    }
1848}
1849
1850fn flush_row(
1851    row: &mut Option<TableRow>,
1852    cell: &mut Option<TableCell>,
1853    table: Option<&mut TableBlock>,
1854    drop_empty_trailing_cell: bool,
1855) {
1856    flush_cell(row, cell, drop_empty_trailing_cell);
1857    if let (Some(table), Some(row)) = (table, row.take()) {
1858        table.rows.push(row);
1859    }
1860}
1861
1862fn flush_table(
1863    table: &mut Option<TableBlock>,
1864    row: &mut Option<TableRow>,
1865    cell: &mut Option<TableCell>,
1866    tables: &mut Vec<TableBlock>,
1867    blocks: &mut Vec<CapturedBlock>,
1868) {
1869    flush_row(row, cell, table.as_mut(), true);
1870    if let Some(mut table) = table.take() {
1871        // Drop trailing empty rows that can be introduced by '\n' immediately
1872        // before the 0x11 table-close marker. See R2.
1873        while table.rows.last().is_some_and(row_is_empty) {
1874            table.rows.pop();
1875        }
1876        tables.push(table.clone());
1877        blocks.push(CapturedBlock::Table(table));
1878    }
1879}
1880
1881fn push_to_current(
1882    paragraph: &mut Vec<ContentNode>,
1883    row: &mut Option<TableRow>,
1884    cell: &mut Option<TableCell>,
1885    in_table: bool,
1886    node: ContentNode,
1887) {
1888    if in_table {
1889        if row.is_none() {
1890            *row = Some(TableRow::default());
1891        }
1892        if cell.is_none() {
1893            *cell = Some(TableCell::default());
1894        }
1895        if let Some(cell) = cell.as_mut() {
1896            cell.content.push(node);
1897        }
1898    } else {
1899        paragraph.push(node);
1900    }
1901}
1902
1903fn append_to_current(
1904    paragraph: &mut Vec<ContentNode>,
1905    row: &mut Option<TableRow>,
1906    cell: &mut Option<TableCell>,
1907    in_table: bool,
1908    text: &str,
1909    style: TextStyle,
1910) {
1911    if in_table {
1912        if row.is_none() {
1913            *row = Some(TableRow::default());
1914        }
1915        if cell.is_none() {
1916            *cell = Some(TableCell::default());
1917        }
1918        if let Some(cell) = cell.as_mut() {
1919            append_styled_text(&mut cell.content, text, style);
1920        }
1921    } else {
1922        append_styled_text(paragraph, text, style);
1923    }
1924}
1925
1926fn append_text(content: &mut Vec<ContentNode>, text: &str) {
1927    append_styled_text(content, text, TextStyle::default());
1928}
1929
1930fn append_styled_text(content: &mut Vec<ContentNode>, text: &str, style: TextStyle) {
1931    if text.is_empty() {
1932        return;
1933    }
1934    if let Some(ContentNode::Text {
1935        text: last,
1936        bold,
1937        italic,
1938        strike,
1939        link,
1940    }) = content.last_mut()
1941    {
1942        let last_style = TextStyle {
1943            bold: *bold,
1944            italic: *italic,
1945            strike: *strike,
1946            link: link.clone(),
1947        };
1948        if last_style == style {
1949            last.push_str(text);
1950            return;
1951        }
1952    }
1953    content.push(ContentNode::Text {
1954        text: text.to_string(),
1955        bold: style.bold,
1956        italic: style.italic,
1957        strike: style.strike,
1958        link: style.link,
1959    });
1960}
1961
1962/// Render a parsed Google Docs capture as Markdown, HTML, or text.
1963#[must_use]
1964pub fn render_captured_document(capture: &CapturedDocument, format: &str) -> String {
1965    match format.to_lowercase().as_str() {
1966        "html" => render_blocks_html(&capture.blocks),
1967        "txt" | "text" => blocks_to_text(&capture.blocks),
1968        _ => render_blocks_markdown(&capture.blocks),
1969    }
1970}
1971
1972/// One rendered block plus enough context for `render_blocks_markdown` to
1973/// choose a Markdown-safe separator.
1974struct RenderedBlock {
1975    markdown: String,
1976    list_id: Option<String>,
1977    quote: bool,
1978}
1979
1980fn render_blocks_markdown(blocks: &[CapturedBlock]) -> String {
1981    // Track an ordered-list counter per (list.id, level) so ordered items are
1982    // numbered sequentially 1., 2., 3., ... instead of all being "1.". See R3.
1983    // When we re-enter a shallower list level, deeper counters reset so a new
1984    // parent restarts its children at 1.
1985    let mut counters: HashMap<(String, usize), usize> = HashMap::new();
1986    let mut rendered: Vec<RenderedBlock> = Vec::new();
1987
1988    for block in blocks {
1989        match block {
1990            CapturedBlock::Paragraph {
1991                content,
1992                style,
1993                list,
1994                quote,
1995                horizontal_rule,
1996            } => {
1997                let text = render_content_markdown(content).trim().to_string();
1998                if text.is_empty() {
1999                    continue;
2000                }
2001                let ordered_index = list.as_ref().and_then(|list_meta| {
2002                    if !list_meta.ordered {
2003                        return None;
2004                    }
2005                    // Reset counters for deeper levels when we move up to a
2006                    // shallower level — otherwise a new parent item would see
2007                    // its previous children's final count.
2008                    let key = (list_meta.id.clone(), list_meta.level);
2009                    counters.retain(|(id, level), _| {
2010                        !(id == &list_meta.id && *level > list_meta.level)
2011                    });
2012                    let next = counters.entry(key).or_insert(0);
2013                    *next += 1;
2014                    Some(*next)
2015                });
2016                let markdown = render_paragraph_markdown(
2017                    &text,
2018                    style.as_deref(),
2019                    list.as_ref(),
2020                    *quote,
2021                    *horizontal_rule,
2022                    ordered_index,
2023                );
2024                rendered.push(RenderedBlock {
2025                    markdown,
2026                    list_id: list.as_ref().map(|l| l.id.clone()),
2027                    quote: *quote,
2028                });
2029            }
2030            CapturedBlock::Table(table) => {
2031                rendered.push(RenderedBlock {
2032                    markdown: render_table_markdown(table),
2033                    list_id: None,
2034                    quote: false,
2035                });
2036            }
2037        }
2038    }
2039
2040    // Choose separator per adjacent pair: consecutive items from the same
2041    // Google Docs list use a single newline, including nested levels; adjacent
2042    // blockquote paragraphs keep a quoted blank line between them.
2043    let mut out = String::new();
2044    for (idx, block) in rendered.iter().enumerate() {
2045        if idx == 0 {
2046            out.push_str(&block.markdown);
2047            continue;
2048        }
2049        let prev = &rendered[idx - 1];
2050        if block.list_id.is_some() && prev.list_id.is_some() {
2051            out.push('\n');
2052        } else if block.quote && prev.quote {
2053            out.push_str("\n>\n");
2054        } else {
2055            out.push_str("\n\n");
2056        }
2057        out.push_str(&block.markdown);
2058    }
2059    if !out.is_empty() && !out.ends_with('\n') {
2060        out.push('\n');
2061    }
2062    out
2063}
2064
2065fn render_paragraph_markdown(
2066    text: &str,
2067    style: Option<&str>,
2068    list: Option<&ListMeta>,
2069    quote: bool,
2070    horizontal_rule: bool,
2071    ordered_index: Option<usize>,
2072) -> String {
2073    if horizontal_rule {
2074        return "---".to_string();
2075    }
2076    match style {
2077        Some("TITLE") => format!("# {text}"),
2078        Some("SUBTITLE") => format!("## {text}"),
2079        Some(style) if style.starts_with("HEADING_") => {
2080            let level = style
2081                .trim_start_matches("HEADING_")
2082                .parse::<usize>()
2083                .unwrap_or(1);
2084            format!("{} {text}", "#".repeat(level.clamp(1, 6)))
2085        }
2086        _ => list.map_or_else(
2087            || {
2088                if quote {
2089                    text.lines()
2090                        .map(|line| {
2091                            if line.is_empty() {
2092                                ">".to_string()
2093                            } else {
2094                                format!("> {line}")
2095                            }
2096                        })
2097                        .collect::<Vec<_>>()
2098                        .join("\n")
2099                } else {
2100                    text.to_string()
2101                }
2102            },
2103            |list| {
2104                let indent = "    ".repeat(list.level);
2105                let marker = if list.ordered {
2106                    format!("{}.", ordered_index.unwrap_or(1))
2107                } else {
2108                    "-".to_string()
2109                };
2110                format!("{indent}{marker} {text}")
2111            },
2112        ),
2113    }
2114}
2115
2116fn render_table_markdown(table: &TableBlock) -> String {
2117    if table.rows.is_empty() {
2118        return String::new();
2119    }
2120    let width = table
2121        .rows
2122        .iter()
2123        .map(|row| row.cells.len())
2124        .max()
2125        .unwrap_or(1);
2126    let rows = table
2127        .rows
2128        .iter()
2129        .map(|row| {
2130            (0..width)
2131                .map(|idx| {
2132                    row.cells.get(idx).map_or_else(String::new, |cell| {
2133                        escape_markdown_table_cell(&render_content_markdown(&cell.content))
2134                    })
2135                })
2136                .collect::<Vec<_>>()
2137        })
2138        .collect::<Vec<_>>();
2139    let separator = vec!["---".to_string(); width];
2140    std::iter::once(&rows[0])
2141        .chain(std::iter::once(&separator))
2142        .chain(rows.iter().skip(1))
2143        .map(|row| format!("| {} |", row.join(" | ")))
2144        .collect::<Vec<_>>()
2145        .join("\n")
2146}
2147
2148fn render_content_markdown(content: &[ContentNode]) -> String {
2149    let mut rendered = String::new();
2150    let mut idx = 0usize;
2151    while idx < content.len() {
2152        match &content[idx] {
2153            ContentNode::Text {
2154                text,
2155                bold,
2156                italic,
2157                strike,
2158                link,
2159            } => {
2160                let link_target = link.as_deref();
2161                let mut runs = vec![(text.as_str(), *bold, *italic, *strike)];
2162                idx += 1;
2163                while let Some(ContentNode::Text {
2164                    text,
2165                    bold,
2166                    italic,
2167                    strike,
2168                    link: next_link,
2169                }) = content.get(idx)
2170                {
2171                    if next_link.as_deref() != link_target {
2172                        break;
2173                    }
2174                    runs.push((text.as_str(), *bold, *italic, *strike));
2175                    idx += 1;
2176                }
2177                let label = render_text_runs_markdown(&runs);
2178                if let Some(link_target) = link_target {
2179                    let _ = write!(rendered, "[{label}]({link_target})");
2180                } else {
2181                    rendered.push_str(&label);
2182                }
2183            }
2184            ContentNode::Image {
2185                url: Some(url),
2186                alt,
2187                ..
2188            } => {
2189                let _ = write!(rendered, "![{alt}]({url})");
2190                idx += 1;
2191            }
2192            ContentNode::Image { .. } => idx += 1,
2193        }
2194    }
2195    rendered
2196}
2197
/// Which inline Markdown emphasis markers are currently open.
#[derive(Clone, Copy, Default)]
struct MarkdownMarkerState {
    /// `**` is open.
    bold: bool,
    /// `*` is open.
    italic: bool,
    /// `~~` is open.
    strike: bool,
}

/// Render `(text, bold, italic, strike)` runs as one Markdown string.
///
/// Markers are only emitted where a flag changes between neighbouring runs, so
/// adjacent runs sharing a style share one pair of markers. Every marker still
/// open after the last run is closed at the end.
fn render_text_runs_markdown(runs: &[(&str, bool, bool, bool)]) -> String {
    let mut current = MarkdownMarkerState::default();
    let mut output = String::new();
    for &(text, bold, italic, strike) in runs {
        let wanted = MarkdownMarkerState {
            bold,
            italic,
            strike,
        };
        output += &markdown_marker_transition(current, wanted);
        output += text;
        current = wanted;
    }
    output += &markdown_marker_transition(current, MarkdownMarkerState::default());
    output
}

/// Build the marker text needed to move from the `active` state to `next`.
///
/// Closing markers come first, in the order strike, italic, bold; opening
/// markers follow in the reverse order bold, italic, strike. Flags that do not
/// change emit nothing.
fn markdown_marker_transition(active: MarkdownMarkerState, next: MarkdownMarkerState) -> String {
    // (currently open, open afterwards, marker), listed in closing order.
    let flags = [
        (active.strike, next.strike, "~~"),
        (active.italic, next.italic, "*"),
        (active.bold, next.bold, "**"),
    ];

    let mut markers = String::new();
    for &(was_open, stays_open, marker) in &flags {
        if was_open && !stays_open {
            markers.push_str(marker);
        }
    }
    for &(was_open, will_open, marker) in flags.iter().rev() {
        if !was_open && will_open {
            markers.push_str(marker);
        }
    }
    markers
}
2245
2246fn render_blocks_html(blocks: &[CapturedBlock]) -> String {
2247    format!(
2248        "<!doctype html><html><body>{}</body></html>",
2249        blocks
2250            .iter()
2251            .map(|block| match block {
2252                CapturedBlock::Paragraph {
2253                    content,
2254                    style,
2255                    list,
2256                    quote,
2257                    horizontal_rule,
2258                } => {
2259                    if *horizontal_rule {
2260                        "<hr>".to_string()
2261                    } else if let Some(list) = list {
2262                        let tag = if list.ordered { "ol" } else { "ul" };
2263                        format!("<{tag}><li>{}</li></{tag}>", render_content_html(content))
2264                    } else if *quote {
2265                        format!("<blockquote>{}</blockquote>", render_content_html(content))
2266                    } else {
2267                        let tag = paragraph_tag(style.as_deref());
2268                        format!("<{tag}>{}</{tag}>", render_content_html(content))
2269                    }
2270                }
2271                CapturedBlock::Table(table) => render_table_html(table),
2272            })
2273            .collect::<String>()
2274    )
2275}
2276
2277fn render_table_html(table: &TableBlock) -> String {
2278    let mut html = String::from("<table>");
2279    for row in &table.rows {
2280        html.push_str("<tr>");
2281        for cell in &row.cells {
2282            html.push_str("<td>");
2283            html.push_str(&render_content_html(&cell.content));
2284            html.push_str("</td>");
2285        }
2286        html.push_str("</tr>");
2287    }
2288    html.push_str("</table>");
2289    html
2290}
2291
2292fn render_content_html(content: &[ContentNode]) -> String {
2293    content
2294        .iter()
2295        .map(|node| match node {
2296            ContentNode::Text {
2297                text,
2298                bold,
2299                italic,
2300                strike,
2301                link,
2302            } => render_marked_html(text, *bold, *italic, *strike, link.as_deref()),
2303            ContentNode::Image {
2304                url: Some(url),
2305                alt,
2306                ..
2307            } => {
2308                format!(
2309                    "<img src=\"{}\" alt=\"{}\">",
2310                    escape_html(url),
2311                    escape_html(alt)
2312                )
2313            }
2314            ContentNode::Image { .. } => String::new(),
2315        })
2316        .collect()
2317}
2318
2319fn render_marked_html(
2320    text: &str,
2321    bold: bool,
2322    italic: bool,
2323    strike: bool,
2324    link: Option<&str>,
2325) -> String {
2326    let mut output = escape_html(text).replace('\n', "<br>");
2327    if bold {
2328        output = format!("<strong>{output}</strong>");
2329    }
2330    if italic {
2331        output = format!("<em>{output}</em>");
2332    }
2333    if strike {
2334        output = format!("<s>{output}</s>");
2335    }
2336    if let Some(link) = link {
2337        output = format!("<a href=\"{}\">{output}</a>", escape_html(link));
2338    }
2339    output
2340}
2341
/// Map a paragraph style name to the HTML tag used to render it.
///
/// `TITLE` and `HEADING_1` map to `h1`, `SUBTITLE` and `HEADING_2` to `h2`,
/// `HEADING_3`..`HEADING_6` to `h3`..`h6`. Any other style, or no style, maps
/// to `p`.
fn paragraph_tag(style: Option<&str>) -> &'static str {
    const STYLE_TAGS: [(&str, &str); 8] = [
        ("TITLE", "h1"),
        ("HEADING_1", "h1"),
        ("SUBTITLE", "h2"),
        ("HEADING_2", "h2"),
        ("HEADING_3", "h3"),
        ("HEADING_4", "h4"),
        ("HEADING_5", "h5"),
        ("HEADING_6", "h6"),
    ];

    let Some(name) = style else {
        return "p";
    };
    for &(known, tag) in &STYLE_TAGS {
        if known == name {
            return tag;
        }
    }
    "p"
}
2353
2354fn blocks_to_text(blocks: &[CapturedBlock]) -> String {
2355    blocks
2356        .iter()
2357        .map(|block| match block {
2358            CapturedBlock::Paragraph { content, .. } => content_to_text(content),
2359            CapturedBlock::Table(table) => table
2360                .rows
2361                .iter()
2362                .map(|row| {
2363                    row.cells
2364                        .iter()
2365                        .map(|cell| content_to_text(&cell.content))
2366                        .collect::<Vec<_>>()
2367                        .join("\t")
2368                })
2369                .collect::<Vec<_>>()
2370                .join("\n"),
2371        })
2372        .filter(|text| !text.is_empty())
2373        .collect::<Vec<_>>()
2374        .join("\n")
2375}
2376
2377fn content_to_text(content: &[ContentNode]) -> String {
2378    content
2379        .iter()
2380        .map(|node| match node {
2381            ContentNode::Text { text, .. } => text.clone(),
2382            ContentNode::Image {
2383                url: Some(_), alt, ..
2384            } => format!("[{alt}]"),
2385            ContentNode::Image { .. } => String::new(),
2386        })
2387        .collect()
2388}
2389
/// Escape the five HTML-significant characters `&`, `<`, `>`, `"`, and `'`.
///
/// Existing entities are not recognised: an input `&amp;` becomes
/// `&amp;amp;`.
fn escape_html(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
2398
/// Make rendered cell content safe to place inside a Markdown table cell.
///
/// Pipes are backslash-escaped and newlines become `<br>`, so the cell cannot
/// end early or spill onto another table line.
fn escape_markdown_table_cell(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '|' => escaped.push_str("\\|"),
            '\n' => escaped.push_str("<br>"),
            other => escaped.push(other),
        }
    }
    escaped
}
2402
/// Extract a Bearer token from an Authorization header value.
///
/// The header is trimmed, the `Bearer` scheme is matched case-insensitively
/// (HTTP authentication scheme names are case-insensitive) and must be
/// followed by a space, and the remaining text is trimmed and returned.
///
/// Returns `None` if the header does not use the Bearer scheme or the token is
/// empty.
#[must_use]
pub fn extract_bearer_token(auth_header: &str) -> Option<&str> {
    const SCHEME: &str = "bearer ";

    let trimmed = auth_header.trim();
    // `get` yields `None` when the header is too short or the cut would fall
    // inside a multi-byte character; neither can be a Bearer header.
    let scheme = trimmed.get(..SCHEME.len())?;
    if !scheme.eq_ignore_ascii_case(SCHEME) {
        return None;
    }

    let token = trimmed[SCHEME.len()..].trim();
    (!token.is_empty()).then_some(token)
}
2415
/// An image pulled out of a captured document so it can be stored as its own
/// file inside an archive.
#[derive(Debug, Clone)]
pub struct ExtractedImage {
    /// File name without any directory part (e.g., "image-01.png").
    pub filename: String,
    /// Decoded image bytes.
    pub data: Vec<u8>,
    /// MIME type of `data` (e.g., "image/png").
    pub mime_type: String,
}
2426
2427/// Result of fetching a Google Doc as an archive.
2428#[derive(Debug, Clone)]
2429pub struct GDocsArchiveResult {
2430    /// HTML content with local image paths
2431    pub html: String,
2432    /// Markdown content with local image paths
2433    pub markdown: String,
2434    /// Extracted images
2435    pub images: Vec<ExtractedImage>,
2436    /// Document ID
2437    pub document_id: String,
2438    /// Export URL used
2439    pub export_url: String,
2440}
2441
2442/// Build a self-contained archive result from browser-model rendered output.
2443///
2444/// `DOCS_modelChunk` image nodes point at `docs-images-rt` URLs. Archive mode
2445/// downloads those URLs into `images/` and rewrites markdown/html references to
2446/// local paths so Rust browser capture matches the JavaScript archive path.
2447///
2448/// # Errors
2449///
2450/// Returns an error if the HTTP client cannot be created or an image response
2451/// body cannot be read. Individual failed image downloads are logged and left
2452/// out of the archive, matching the JS behavior.
2453pub async fn localize_rendered_remote_images_for_archive(
2454    rendered: &GDocsRenderedResult,
2455) -> crate::Result<GDocsArchiveResult> {
2456    let client = reqwest::Client::builder().build().map_err(|error| {
2457        WebCaptureError::FetchError(format!("Failed to create image download client: {error}"))
2458    })?;
2459    let mut seen = HashMap::new();
2460    let mut images = Vec::new();
2461    let mut next_index = 1usize;
2462
2463    for image in &rendered.remote_images {
2464        if seen.contains_key(&image.url) {
2465            continue;
2466        }
2467        let filename = remote_image_filename(&image.url, next_index);
2468        next_index += 1;
2469        seen.insert(image.url.clone(), filename.clone());
2470
2471        match client
2472            .get(&image.url)
2473            .header("User-Agent", GDOCS_USER_AGENT)
2474            .header("Accept", "image/*,*/*;q=0.8")
2475            .send()
2476            .await
2477        {
2478            Ok(response) if response.status().is_success() => {
2479                let mime_type = response
2480                    .headers()
2481                    .get(reqwest::header::CONTENT_TYPE)
2482                    .and_then(|value| value.to_str().ok())
2483                    .map_or_else(|| mime_type_for_filename(&filename), ToString::to_string);
2484                let data = response.bytes().await.map_err(|error| {
2485                    WebCaptureError::FetchError(format!(
2486                        "Failed to read Google Docs image {}: {error}",
2487                        image.url
2488                    ))
2489                })?;
2490                debug!(
2491                    url = %image.url,
2492                    filename = %filename,
2493                    bytes = data.len(),
2494                    mime_type = %mime_type,
2495                    "downloaded Google Docs browser-model archive image"
2496                );
2497                images.push(ExtractedImage {
2498                    filename,
2499                    data: data.to_vec(),
2500                    mime_type,
2501                });
2502            }
2503            Ok(response) => {
2504                warn!(
2505                    url = %image.url,
2506                    status = response.status().as_u16(),
2507                    "failed to download Google Docs browser-model archive image"
2508                );
2509            }
2510            Err(error) => {
2511                warn!(
2512                    url = %image.url,
2513                    error = %error,
2514                    "failed to download Google Docs browser-model archive image"
2515                );
2516            }
2517        }
2518    }
2519
2520    let mut markdown = rendered.markdown.clone();
2521    let mut html = rendered.html.clone();
2522    for (url, filename) in seen {
2523        let local_path = format!("images/{filename}");
2524        markdown = markdown.replace(&url, &local_path);
2525        html = html.replace(&url, &local_path);
2526    }
2527
2528    Ok(GDocsArchiveResult {
2529        html,
2530        markdown,
2531        images,
2532        document_id: rendered.document_id.clone(),
2533        export_url: rendered.export_url.clone(),
2534    })
2535}
2536
2537fn remote_image_filename(url: &str, index: usize) -> String {
2538    let ext = crate::localize_images::get_extension_from_url(url);
2539    format!("image-{index:02}{ext}")
2540}
2541
/// Guess an image MIME type from a file name's extension.
///
/// The extension is the text after the last `.`, compared case-insensitively;
/// a name without a dot is treated as being the extension itself. Unknown
/// extensions fall back to `image/png`.
fn mime_type_for_filename(filename: &str) -> String {
    const KNOWN_TYPES: [(&str, &str); 5] = [
        ("jpg", "image/jpeg"),
        ("jpeg", "image/jpeg"),
        ("gif", "image/gif"),
        ("webp", "image/webp"),
        ("svg", "image/svg+xml"),
    ];

    let extension = filename
        .rsplit_once('.')
        .map_or(filename, |(_, ext)| ext)
        .to_lowercase();

    for &(known, mime) in &KNOWN_TYPES {
        if extension.as_str() == known {
            return mime.to_string();
        }
    }
    "image/png".to_string()
}
2558
2559fn base64_image_pattern() -> &'static Regex {
2560    static PATTERN: OnceLock<Regex> = OnceLock::new();
2561    PATTERN.get_or_init(|| {
2562        Regex::new(
2563            r#"(<img\s[^>]*src=")data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([^"]+)(")"#,
2564        )
2565        .unwrap()
2566    })
2567}
2568
2569/// Extract base64 data URI images from HTML content.
2570///
2571/// Google Docs HTML exports embed images as base64 data URIs.
2572/// This function extracts them and replaces with local file paths.
2573///
2574/// # Arguments
2575///
2576/// * `html` - HTML content with embedded base64 images
2577///
2578/// # Returns
2579///
2580/// Tuple of (updated HTML with local paths, extracted images)
2581#[must_use]
2582pub fn extract_base64_images(html: &str) -> (String, Vec<ExtractedImage>) {
2583    let mut images = Vec::new();
2584    let mut idx = 1u32;
2585
2586    let updated_html = base64_image_pattern()
2587        .replace_all(html, |caps: &regex::Captures<'_>| {
2588            let prefix = &caps[1];
2589            let mime_ext = &caps[2];
2590            let base64_data = &caps[3];
2591            let suffix = &caps[4];
2592
2593            let ext = match mime_ext {
2594                "jpeg" => "jpg",
2595                "svg+xml" => "svg",
2596                other => other,
2597            };
2598
2599            let filename = format!("image-{idx:02}.{ext}");
2600            let mime_type = format!("image/{mime_ext}");
2601
2602            if let Ok(data) = base64::engine::general_purpose::STANDARD.decode(base64_data) {
2603                debug!("Extracted image: {} ({} bytes)", filename, data.len());
2604                images.push(ExtractedImage {
2605                    filename: filename.clone(),
2606                    data,
2607                    mime_type,
2608                });
2609            }
2610
2611            idx += 1;
2612            format!("{prefix}images/{filename}{suffix}")
2613        })
2614        .into_owned();
2615
2616    (updated_html, images)
2617}
2618
2619/// Fetch a Google Docs document as a ZIP archive.
2620///
2621/// Fetches the document as HTML, extracts embedded base64 images,
2622/// converts to Markdown, and returns all components ready for archiving.
2623///
2624/// The archive contains:
2625/// - `document.md` — Markdown version
2626/// - `document.html` — HTML version with local image paths
2627/// - `images/` — extracted images
2628///
2629/// # Arguments
2630///
2631/// * `url` - Google Docs URL
2632/// * `api_token` - Optional API token for private documents
2633///
2634/// # Errors
2635///
2636/// Returns an error if the fetch or conversion fails.
2637pub async fn fetch_google_doc_as_archive(
2638    url: &str,
2639    api_token: Option<&str>,
2640) -> crate::Result<GDocsArchiveResult> {
2641    let result = fetch_google_doc(url, "html", api_token).await?;
2642
2643    let preprocess = preprocess_google_docs_export_html(&result.content);
2644    debug!(
2645        document_id = %result.document_id,
2646        hoisted = preprocess.hoisted,
2647        unwrapped_links = preprocess.unwrapped_links,
2648        "google-docs-export pre-processor rewrote archive markup"
2649    );
2650
2651    let (local_html, images) = extract_base64_images(&preprocess.html);
2652
2653    let markdown = crate::markdown::convert_html_to_markdown(&local_html, None)?;
2654
2655    debug!(
2656        "Archive prepared: {} images extracted, {} bytes HTML, {} bytes Markdown",
2657        images.len(),
2658        local_html.len(),
2659        markdown.len()
2660    );
2661
2662    Ok(GDocsArchiveResult {
2663        html: local_html,
2664        markdown,
2665        images,
2666        document_id: result.document_id,
2667        export_url: result.export_url,
2668    })
2669}
2670
2671/// Create a ZIP archive from a `GDocsArchiveResult`.
2672///
2673/// # Arguments
2674///
2675/// * `archive` - The archive result to bundle
2676/// * `pretty_html` - Whether to pretty-print the HTML output
2677///
2678/// # Errors
2679///
2680/// Returns an error if ZIP creation fails.
2681pub fn create_archive_zip(
2682    archive: &GDocsArchiveResult,
2683    pretty_html: bool,
2684) -> crate::Result<Vec<u8>> {
2685    let mut buf = std::io::Cursor::new(Vec::new());
2686
2687    {
2688        let mut zip = zip::ZipWriter::new(&mut buf);
2689        let options = zip::write::SimpleFileOptions::default()
2690            .compression_method(zip::CompressionMethod::Deflated);
2691
2692        zip.start_file("document.md", options)
2693            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
2694        zip.write_all(archive.markdown.as_bytes())?;
2695
2696        let html_output = if pretty_html {
2697            crate::html::pretty_print_html(&archive.html)
2698        } else {
2699            archive.html.clone()
2700        };
2701        zip.start_file("document.html", options)
2702            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
2703        zip.write_all(html_output.as_bytes())?;
2704
2705        for img in &archive.images {
2706            zip.start_file(format!("images/{}", img.filename), options)
2707                .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
2708            zip.write_all(&img.data)?;
2709        }
2710
2711        zip.finish()
2712            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
2713    }
2714
2715    Ok(buf.into_inner())
2716}
web_capture/gdocs.rs

web_capture/
gdocs.rs