Skip to main content

web_capture/
gdocs.rs

1//! Google Docs capture module.
2//!
3//! Supports API-based capture of Google Docs documents via the export URL pattern:
4//! `https://docs.google.com/document/d/{DOCUMENT_ID}/export?format={FORMAT}`
5//!
6//! # Supported Export Formats
7//!
8//! - `html` — HTML document (images as base64 data URIs)
9//! - `txt` — Plain text
10//! - `md` — Markdown (native Google Docs export)
11//! - `pdf` — PDF document
12//! - `docx` — Microsoft Word document
13//! - `epub` — EPUB ebook format
14//!
15//! # Example
16//!
17//! ```rust,no_run
18//! use web_capture::gdocs;
19//!
20//! #[tokio::main]
21//! async fn main() -> anyhow::Result<()> {
22//!     let url = "https://docs.google.com/document/d/abc123/edit";
23//!     if gdocs::is_google_docs_url(url) {
24//!         let result = gdocs::fetch_google_doc(url, "html", None).await?;
25//!         println!("Content length: {}", result.content.len());
26//!     }
27//!     Ok(())
28//! }
29//! ```
30
31use async_tungstenite::tokio::{connect_async, ConnectStream};
32use async_tungstenite::tungstenite::Message;
33use async_tungstenite::WebSocketStream;
34use base64::Engine;
35use futures::{SinkExt, StreamExt};
36use regex::Regex;
37use serde_json::Value;
38use std::collections::HashMap;
39use std::fmt::Write as _;
40use std::hash::BuildHasher;
41use std::io::Write;
42use std::process::Stdio;
43use std::sync::OnceLock;
44use std::time::{Duration, Instant};
45use tokio::io::{AsyncBufReadExt, BufReader};
46use tokio::process::{Child, Command};
47use tracing::{debug, info, warn};
48
49use crate::WebCaptureError;
50
/// Base URL shared by the document `/export` and `/edit` endpoints.
const GDOCS_EXPORT_BASE: &str = "https://docs.google.com/document/d";
/// Base URL of the Google Docs REST API `documents` resource.
const GDOCS_API_BASE: &str = "https://docs.googleapis.com/v1/documents";
/// Desktop Chrome (Windows) User-Agent string; the same value is sent by
/// `fetch_google_doc` with its export request.
const GDOCS_USER_AGENT: &str =
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
// NOTE(review): the four durations below are consumed outside this excerpt;
// the descriptions are inferred from the names only — confirm at the use sites.
/// Default upper bound on waiting for the editor model (30 s).
const GDOCS_EDITOR_MODEL_MAX_WAIT_DEFAULT: Duration = Duration::from_secs(30);
/// Default stability window for the editor model (1.5 s).
const GDOCS_EDITOR_MODEL_STABILITY_DEFAULT: Duration = Duration::from_millis(1500);
/// Interval between editor-model polls (250 ms).
const GDOCS_EDITOR_MODEL_POLL_INTERVAL: Duration = Duration::from_millis(250);
/// Upper bound on browser launch time (20 s).
const GDOCS_BROWSER_LAUNCH_TIMEOUT: Duration = Duration::from_secs(20);

/// WebSocket stream type produced by `async_tungstenite`'s tokio `connect_async`.
type CdpWebSocket = WebSocketStream<ConnectStream>;
61
// JavaScript that records every value assigned to (or pushed onto)
// `window.DOCS_modelChunk` into `window.__captured_chunks`.
//
// * `captureChunk` ignores falsy values, recurses into arrays, and stores a
//   JSON round-trip copy of each value (falling back to the raw value when
//   the round trip throws).
// * `wrapChunkArray` replaces an array's `push` so later pushes are captured
//   too, and marks the array with `__webCaptureDocsModelWrapped` so it is
//   only wrapped once.
// * `DOCS_modelChunk` is defined as a non-configurable accessor on `window`;
//   the latest assigned value is kept in `window.__DOCS_modelChunk_latest`.
//
// NOTE(review): presumably injected before the page's own scripts run so the
// accessor exists ahead of the first assignment — confirm at the call site.
const GDOCS_MODEL_CAPTURE_INIT_SCRIPT: &str = r"
window.__captured_chunks = [];
const captureChunk = (value) => {
  if (!value) {
    return;
  }
  if (Array.isArray(value)) {
    for (const item of value) {
      captureChunk(item);
    }
    return;
  }
  try {
    window.__captured_chunks.push(JSON.parse(JSON.stringify(value)));
  } catch {
    window.__captured_chunks.push(value);
  }
};
const wrapChunkArray = (value) => {
  if (!Array.isArray(value) || value.__webCaptureDocsModelWrapped) {
    return value;
  }
  const originalPush = value.push;
  Object.defineProperty(value, '__webCaptureDocsModelWrapped', {
    value: true,
    enumerable: false,
  });
  Object.defineProperty(value, 'push', {
    value(...items) {
      for (const item of items) {
        captureChunk(item);
      }
      return originalPush.apply(this, items);
    },
    writable: true,
    configurable: true,
  });
  for (const item of value) {
    captureChunk(item);
  }
  return value;
};
Object.defineProperty(window, 'DOCS_modelChunk', {
  set(value) {
    captureChunk(value);
    window.__DOCS_modelChunk_latest = wrapChunkArray(value);
  },
  get() {
    return window.__DOCS_modelChunk_latest;
  },
  configurable: false,
});
";
115
// JavaScript function expression that returns `{ chunks, cidUrlMap }`.
//
// * `chunks` is a copy of `window.__captured_chunks`; when that is empty and
//   `window.DOCS_modelChunk` is set, the current `DOCS_modelChunk` value is
//   used as the only chunk.
// * `cidUrlMap` maps keys of 20+ URL-safe characters to
//   `https://docs.google.com/docs-images-rt/...` URLs found in `<script>`
//   text, with `\u003d`, `\u0026`, and `\/` escapes decoded.
//
// NOTE(review): `!chunks.includes(window.DOCS_modelChunk)` is always true when
// `chunks.length === 0`, so that condition is redundant as written.
const GDOCS_MODEL_EXTRACT_SCRIPT: &str = r#"() => {
  const chunks = [...(window.__captured_chunks || [])];
  if (
    window.DOCS_modelChunk &&
    chunks.length === 0 &&
    !chunks.includes(window.DOCS_modelChunk)
  ) {
    chunks.push(window.DOCS_modelChunk);
  }
  const cidUrlMap = {};
  const scripts = document.querySelectorAll('script');
  for (const script of scripts) {
    const text = script.textContent || '';
    if (!text.includes('docs-images-rt')) {
      continue;
    }
    const regex =
      /"([A-Za-z0-9_-]{20,})"\s*:\s*"(https:\/\/docs\.google\.com\/docs-images-rt\/[^"]+)"/g;
    let match;
    while ((match = regex.exec(text)) !== null) {
      cidUrlMap[match[1]] = match[2]
        .replace(/\\u003d/g, '=')
        .replace(/\\u0026/g, '&')
        .replace(/\\\//g, '/');
    }
  }
  return { chunks, cidUrlMap };
}"#;
144
145fn gdocs_url_pattern() -> &'static Regex {
146    static PATTERN: OnceLock<Regex> = OnceLock::new();
147    PATTERN.get_or_init(|| Regex::new(r"docs\.google\.com/document/d/([a-zA-Z0-9_-]+)").unwrap())
148}
149
/// Result of fetching a Google Docs document through the export endpoint.
///
/// Returned by `fetch_google_doc` and `fetch_google_doc_as_markdown`.
#[derive(Debug, Clone)]
pub struct GDocsResult {
    /// The document content in the requested format (Markdown when produced
    /// by `fetch_google_doc_as_markdown`).
    pub content: String,
    /// The format label: the caller-supplied format string, or `"markdown"`
    /// for the Markdown conversion path.
    pub format: String,
    /// The document ID extracted from the input URL.
    pub document_id: String,
    /// The export URL that was fetched.
    pub export_url: String,
}
162
/// Google Docs capture backend selected from the CLI `--capture` flag.
///
/// See `select_capture_method` for the mapping from flag value to variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GDocsCaptureMethod {
    /// Load `/edit` and extract `DOCS_modelChunk` (selected by `browser`).
    BrowserModel,
    /// Use the public `/export?format=...` endpoint (selected by `api`
    /// without a token).
    PublicExport,
    /// Use the authenticated `docs.googleapis.com` REST API (selected by
    /// `api` with a token).
    DocsApi,
}
173
/// Rendered Google Docs content from either Docs API or editor model data.
///
/// Carries the same document in three renderings plus capture metadata.
#[derive(Debug, Clone)]
pub struct GDocsRenderedResult {
    /// Markdown rendering.
    pub markdown: String,
    /// HTML rendering.
    pub html: String,
    /// Plain-text rendering.
    pub text: String,
    /// The extracted document ID.
    pub document_id: String,
    /// Source URL used for capture.
    pub export_url: String,
    /// Remote images exposed by the editor model, used for archive localization.
    pub remote_images: Vec<RemoteImage>,
}
190
/// Remote image reference extracted from browser-model capture.
///
/// Two references compare equal when both `url` and `alt` match.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteImage {
    /// Original (remote) image URL.
    pub url: String,
    /// Image alt text.
    pub alt: String,
}
199
/// Snapshot of editor-model data read from the browser.
///
/// NOTE(review): the code that fills this struct is outside this excerpt; the
/// field notes below are inferred from names and types — confirm there.
#[derive(Debug, Clone)]
struct BrowserModelData {
    /// Captured model chunks as raw JSON values.
    chunks: Vec<Value>,
    /// Presumably the `cidUrlMap` (content ID -> image URL) from the extract script.
    cid_urls: HashMap<String, String>,
    /// Byte-size measure of the chunk payload; feeds `fingerprint()`.
    chunk_payload_bytes: usize,
    /// Presumably the number of polls performed before this snapshot.
    poll_count: usize,
    /// Presumably how long the model had been unchanged at capture time.
    stable_for: Duration,
}
208
/// Cheap change detector for editor-model snapshots: two snapshots are
/// treated as identical when both the chunk count and payload size match.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct BrowserModelFingerprint {
    /// Number of captured chunks.
    chunks: usize,
    /// Payload size in bytes.
    payload_bytes: usize,
}
214
/// Tracks how long consecutive model fingerprints have stayed unchanged.
#[derive(Debug, Default)]
struct BrowserModelQuiescence {
    /// Fingerprint seen by the most recent observation, if any.
    last_fingerprint: Option<BrowserModelFingerprint>,
    /// Instant of the first observation that repeated `last_fingerprint`;
    /// `None` until a fingerprint repeats.
    stable_since: Option<Instant>,
}
220
221impl BrowserModelData {
222    const fn fingerprint(&self) -> BrowserModelFingerprint {
223        BrowserModelFingerprint {
224            chunks: self.chunks.len(),
225            payload_bytes: self.chunk_payload_bytes,
226        }
227    }
228}
229
230impl BrowserModelQuiescence {
231    fn observe(
232        &mut self,
233        fingerprint: BrowserModelFingerprint,
234        now: Instant,
235        stability_window: Duration,
236    ) -> Option<Duration> {
237        if fingerprint.chunks == 0 {
238            self.last_fingerprint = Some(fingerprint);
239            self.stable_since = None;
240            return None;
241        }
242
243        if self.last_fingerprint == Some(fingerprint) {
244            let stable_since = *self.stable_since.get_or_insert(now);
245            let stable_for = now.saturating_duration_since(stable_since);
246            if stable_for >= stability_window {
247                return Some(stable_for);
248            }
249        } else {
250            self.last_fingerprint = Some(fingerprint);
251            self.stable_since = None;
252        }
253
254        None
255    }
256
257    fn stable_for(&self, now: Instant) -> Duration {
258        self.stable_since.map_or(Duration::ZERO, |stable_since| {
259            now.saturating_duration_since(stable_since)
260        })
261    }
262}
263
/// Parsed Google Docs model/document capture.
///
/// `Default` yields an empty document.
#[derive(Debug, Clone, Default)]
pub struct CapturedDocument {
    /// Ordered document blocks (paragraphs and tables).
    pub blocks: Vec<CapturedBlock>,
    /// Tables extracted from `blocks` for compatibility with tests and callers.
    pub tables: Vec<TableBlock>,
    /// Images extracted from model positions.
    pub images: Vec<ContentNode>,
    /// Plain text projection.
    pub text: String,
}
276
/// A block-level element of a captured document: a paragraph or a table.
#[derive(Debug, Clone)]
pub enum CapturedBlock {
    /// Paragraph-like block.
    Paragraph {
        /// Inline content of the paragraph.
        content: Vec<ContentNode>,
        /// Optional Google Docs named style.
        style: Option<String>,
        /// Optional list metadata; `None` when the paragraph is not a list item.
        list: Option<ListMeta>,
        /// Whether paragraph is a blockquote.
        quote: bool,
        /// Whether paragraph is a horizontal rule.
        horizontal_rule: bool,
    },
    /// Table block.
    Table(TableBlock),
}
296
/// Captured table: an ordered list of rows.
#[derive(Debug, Clone, Default)]
pub struct TableBlock {
    /// Table rows, in order.
    pub rows: Vec<TableRow>,
}
303
/// Captured table row: an ordered list of cells.
#[derive(Debug, Clone, Default)]
pub struct TableRow {
    /// Row cells, in order.
    pub cells: Vec<TableCell>,
}
310
/// Captured table cell holding inline content.
#[derive(Debug, Clone, Default)]
pub struct TableCell {
    /// Inline content of the cell.
    pub content: Vec<ContentNode>,
}
317
/// Captured inline content node: a styled text run or an image.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentNode {
    /// Text run with uniform styling.
    Text {
        /// Text content.
        text: String,
        /// Bold text style.
        bold: bool,
        /// Italic text style.
        italic: bool,
        /// Strikethrough text style.
        strike: bool,
        /// Optional hyperlink target.
        link: Option<String>,
    },
    /// Image placeholder.
    Image {
        /// Content ID from Google Docs model data.
        cid: Option<String>,
        /// Resolved image URL, when known.
        url: Option<String>,
        /// Alt text.
        alt: String,
        /// Editor-model image width, when available (kept as a string).
        width: Option<String>,
        /// Editor-model image height, when available (kept as a string).
        height: Option<String>,
        /// Whether this image came from a suggested edit.
        is_suggestion: bool,
    },
}
350
/// Inline text styling; mirrors the style fields of `ContentNode::Text`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct TextStyle {
    /// Bold.
    bold: bool,
    /// Italic.
    italic: bool,
    /// Strikethrough.
    strike: bool,
    /// Optional hyperlink target.
    link: Option<String>,
}
358
/// Paragraph-level metadata; mirrors the non-content fields of
/// `CapturedBlock::Paragraph`.
#[derive(Debug, Clone, Default)]
struct ParagraphMeta {
    /// Optional Google Docs named style.
    style: Option<String>,
    /// Optional list metadata.
    list: Option<ListMeta>,
    /// Whether the paragraph is a blockquote.
    quote: bool,
    /// Whether the paragraph is a horizontal rule.
    horizontal_rule: bool,
}
366
/// List membership metadata attached to a paragraph that is a list item.
#[derive(Debug, Clone)]
pub struct ListMeta {
    /// Google Docs list identifier.
    pub id: String,
    /// Nesting level, zero-based.
    pub level: usize,
    /// Whether Markdown should render this list item with an ordered marker.
    pub ordered: bool,
}
376
/// Paragraph style record taken from the editor model.
#[derive(Debug, Clone)]
struct ParagraphStyle {
    /// Optional named style.
    style: Option<String>,
    /// Start indent. NOTE(review): unit not visible here (presumably points) — confirm.
    indent_start: f64,
    /// First-line indent, same unit as `indent_start`.
    indent_first_line: f64,
}
383
/// Style lookup tables built from editor-model data.
///
/// NOTE(review): the builder is outside this excerpt; the `usize` keys are
/// presumably model text offsets (paragraph end positions) — confirm.
#[derive(Debug, Clone, Default)]
struct ModelStyleMaps {
    /// Inline text styles, indexed by position.
    inline_styles: Vec<TextStyle>,
    /// Paragraph style keyed by paragraph end position.
    paragraph_by_end: HashMap<usize, ParagraphStyle>,
    /// List metadata keyed by paragraph end position.
    list_by_end: HashMap<usize, ListMeta>,
    /// Positions that are horizontal rules.
    horizontal_rules: std::collections::HashSet<usize>,
}
391
392/// Check if a URL is a Google Docs document URL.
393#[must_use]
394pub fn is_google_docs_url(url: &str) -> bool {
395    gdocs_url_pattern().is_match(url)
396}
397
398/// Extract the document ID from a Google Docs URL.
399///
400/// Returns `None` if the URL is not a valid Google Docs URL.
401#[must_use]
402pub fn extract_document_id(url: &str) -> Option<String> {
403    gdocs_url_pattern()
404        .captures(url)
405        .and_then(|caps| caps.get(1))
406        .map(|m| m.as_str().to_string())
407}
408
409/// Build a Google Docs export URL.
410///
411/// # Arguments
412///
413/// * `document_id` - The Google Docs document ID
414/// * `format` - Export format (html, txt, md, pdf, docx, epub)
415#[must_use]
416pub fn build_export_url(document_id: &str, format: &str) -> String {
417    let export_format = match format {
418        "html" | "txt" | "md" | "pdf" | "docx" | "epub" | "zip" => format,
419        _ => "html",
420    };
421    format!("{GDOCS_EXPORT_BASE}/{document_id}/export?format={export_format}")
422}
423
424/// Build a Google Docs editor URL.
425#[must_use]
426pub fn build_edit_url(document_id: &str) -> String {
427    format!("{GDOCS_EXPORT_BASE}/{document_id}/edit")
428}
429
430/// Build a Google Docs REST API URL.
431#[must_use]
432pub fn build_docs_api_url(document_id: &str) -> String {
433    format!("{GDOCS_API_BASE}/{document_id}")
434}
435
436/// Select a Google Docs capture backend from the CLI `--capture` value.
437///
438/// # Errors
439///
440/// Returns an error when `capture` is neither `browser` nor `api`.
441pub fn select_capture_method(
442    capture: &str,
443    api_token: Option<&str>,
444) -> crate::Result<GDocsCaptureMethod> {
445    match capture.to_lowercase().as_str() {
446        "browser" => Ok(GDocsCaptureMethod::BrowserModel),
447        "api" if api_token.is_some() => Ok(GDocsCaptureMethod::DocsApi),
448        "api" => Ok(GDocsCaptureMethod::PublicExport),
449        other => Err(WebCaptureError::InvalidUrl(format!(
450            "Unsupported Google Docs capture method \"{other}\". Use \"browser\" or \"api\"."
451        ))),
452    }
453}
454
455/// Fetch a Google Docs document via the export URL.
456///
457/// For public documents, pass `None` for `api_token`.
458/// For private documents, pass a Bearer token string.
459///
460/// # Arguments
461///
462/// * `url` - Google Docs URL (edit URL or any URL containing the document ID)
463/// * `format` - Export format (html, txt, md, pdf, docx, epub)
464/// * `api_token` - Optional API token for private documents
465///
466/// # Errors
467///
468/// Returns an error if the URL is not a valid Google Docs URL, or if the fetch fails.
469pub async fn fetch_google_doc(
470    url: &str,
471    format: &str,
472    api_token: Option<&str>,
473) -> crate::Result<GDocsResult> {
474    let document_id = extract_document_id(url).ok_or_else(|| {
475        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
476    })?;
477
478    let export_url = build_export_url(&document_id, format);
479    debug!(
480        document_id = %document_id,
481        format = %format,
482        export_url = %export_url,
483        has_api_token = api_token.is_some(),
484        "fetching Google Doc via public export"
485    );
486
487    let mut request = reqwest::Client::new()
488        .get(&export_url)
489        .header(
490            "User-Agent",
491            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
492        )
493        .header("Accept-Charset", "utf-8")
494        .header("Accept-Language", "en-US,en;q=0.9");
495
496    if let Some(token) = api_token {
497        request = request.header("Authorization", format!("Bearer {token}"));
498    }
499
500    let response = request
501        .send()
502        .await
503        .map_err(|e| WebCaptureError::FetchError(format!("Failed to fetch Google Doc: {e}")))?;
504    debug!(
505        document_id = %document_id,
506        status = response.status().as_u16(),
507        success = response.status().is_success(),
508        content_type = response
509            .headers()
510            .get(reqwest::header::CONTENT_TYPE)
511            .and_then(|value| value.to_str().ok())
512            .unwrap_or(""),
513        "received Google Docs public export response"
514    );
515
516    if !response.status().is_success() {
517        return Err(WebCaptureError::FetchError(format!(
518            "Failed to fetch Google Doc ({} {}): {}",
519            response.status().as_u16(),
520            response.status().canonical_reason().unwrap_or("Unknown"),
521            export_url
522        )));
523    }
524
525    let raw_content = response.text().await.map_err(|e| {
526        WebCaptureError::FetchError(format!("Failed to read Google Doc response: {e}"))
527    })?;
528    debug!(
529        document_id = %document_id,
530        bytes = raw_content.len(),
531        "read Google Docs public export body"
532    );
533
534    // Decode HTML entities to unicode for text-based formats
535    let content = match format {
536        "html" | "txt" | "md" => crate::html::decode_html_entities(&raw_content),
537        _ => raw_content,
538    };
539
540    Ok(GDocsResult {
541        content,
542        format: format.to_string(),
543        document_id,
544        export_url,
545    })
546}
547
548/// Fetch a Google Docs document and convert to Markdown.
549///
550/// Fetches the document as HTML, then converts to Markdown using the
551/// existing HTML-to-Markdown pipeline.
552///
553/// # Arguments
554///
555/// * `url` - Google Docs URL
556/// * `api_token` - Optional API token for private documents
557///
558/// # Errors
559///
560/// Returns an error if the fetch or conversion fails.
561pub async fn fetch_google_doc_as_markdown(
562    url: &str,
563    api_token: Option<&str>,
564) -> crate::Result<GDocsResult> {
565    let result = fetch_google_doc(url, "html", api_token).await?;
566
567    let preprocess = preprocess_google_docs_export_html(&result.content);
568    debug!(
569        document_id = %result.document_id,
570        hoisted = preprocess.hoisted,
571        unwrapped_links = preprocess.unwrapped_links,
572        "google-docs-export pre-processor rewrote markup"
573    );
574    let markdown = normalize_google_docs_export_markdown(
575        &crate::markdown::convert_html_to_markdown(&preprocess.html, Some(&result.export_url))?,
576    );
577    debug!(
578        document_id = %result.document_id,
579        bytes = markdown.len(),
580        "rendered Google Docs public export markdown"
581    );
582
583    Ok(GDocsResult {
584        content: markdown,
585        format: "markdown".to_string(),
586        document_id: result.document_id,
587        export_url: result.export_url,
588    })
589}
590
/// Result of running the Google Docs export HTML pre-processor.
///
/// Exposes the rewritten HTML alongside counters that are useful for debug
/// logging (`gdocs.export.style-hoist`). See issue #92 R6.
#[derive(Debug, Clone)]
pub struct GDocsExportPreprocessResult {
    /// Rewritten HTML, ready for HTML-to-Markdown conversion.
    pub html: String,
    /// Number of styled spans (inline `style` or CSS class) turned into
    /// `<strong>`/`<em>`/`<del>`.
    pub hoisted: usize,
    /// Number of `google.com/url?q=` redirect wrappers unwrapped.
    pub unwrapped_links: usize,
}
604
605/// Pre-process Google Docs export HTML so the generic `html2md` pipeline
606/// preserves inline formatting, heading numbering, and link targets.
607///
608/// Google Drive serves bold/italic/strikethrough as inline style spans and
609/// wraps every link through a `google.com/url?q=` redirect, both of which
610/// the generic converter would otherwise discard. This function rewrites
611/// those constructs into semantic HTML before conversion.
612#[must_use]
613pub fn preprocess_google_docs_export_html(html: &str) -> GDocsExportPreprocessResult {
614    let mut hoisted: usize = 0;
615    let mut unwrapped_links: usize = 0;
616    let class_styles = extract_css_class_styles(html);
617
618    let mut out = hoist_inline_style_spans(html, &mut hoisted);
619    out = hoist_class_style_spans(&out, &class_styles, &mut hoisted);
620    out = convert_class_indented_blockquotes(&out, &class_styles);
621    out = nest_google_docs_lists(&out, &class_styles);
622    out = strip_google_docs_heading_noise(&out);
623    out = strip_heading_inline_formatting(&out);
624    out = unwrap_google_redirect_links(&out, &mut unwrapped_links);
625    out = out.replace("&nbsp;", " ");
626    out = out.replace('\u{00A0}', " ");
627
628    GDocsExportPreprocessResult {
629        html: out,
630        hoisted,
631        unwrapped_links,
632    }
633}
634
635/// Normalize Markdown emitted from Google Docs public-export HTML converters.
636#[must_use]
637pub fn normalize_google_docs_export_markdown(markdown: &str) -> String {
638    let markdown = unescape_public_export_punctuation(markdown);
639    let markdown = convert_setext_headings(&markdown);
640    let markdown = normalize_atx_headings(&markdown);
641    let markdown = normalize_bullet_markers(&markdown);
642    let markdown = normalize_list_spacing(&markdown);
643    let markdown = normalize_blockquote_spacing(&markdown);
644    let markdown = normalize_markdown_tables(&markdown);
645    crate::markdown::clean_markdown(&markdown)
646}
647
648fn hoist_inline_style_spans(html: &str, hoisted: &mut usize) -> String {
649    let span_re = Regex::new(r#"(?is)<span\s+([^>]*style="([^"]*)"[^>]*)>(.*?)</span>"#)
650        .expect("valid regex");
651    span_re
652        .replace_all(html, |caps: &regex::Captures<'_>| {
653            let style = caps.get(2).map_or("", |m| m.as_str());
654            let inner = caps.get(3).map_or("", |m| m.as_str());
655            semantic_wrapped_html(inner, style).map_or_else(
656                || caps[0].to_string(),
657                |wrapped| {
658                    *hoisted += 1;
659                    wrapped
660                },
661            )
662        })
663        .into_owned()
664}
665
666fn hoist_class_style_spans(
667    html: &str,
668    class_styles: &HashMap<String, String>,
669    hoisted: &mut usize,
670) -> String {
671    let class_span_re = Regex::new(r#"(?is)<span\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</span>"#)
672        .expect("valid regex");
673    class_span_re
674        .replace_all(html, |caps: &regex::Captures<'_>| {
675            let class_attr = caps.get(2).map_or("", |m| m.as_str());
676            let inner = caps.get(3).map_or("", |m| m.as_str());
677            let style = combined_class_style(class_styles, class_attr);
678            semantic_wrapped_html(inner, &style).map_or_else(
679                || caps[0].to_string(),
680                |wrapped| {
681                    *hoisted += 1;
682                    wrapped
683                },
684            )
685        })
686        .into_owned()
687}
688
689fn convert_class_indented_blockquotes(
690    html: &str,
691    class_styles: &HashMap<String, String>,
692) -> String {
693    let class_paragraph_re =
694        Regex::new(r#"(?is)<p\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</p>"#).expect("valid regex");
695    class_paragraph_re
696        .replace_all(html, |caps: &regex::Captures<'_>| {
697            let class_attr = caps.get(2).map_or("", |m| m.as_str());
698            let inner = caps.get(3).map_or("", |m| m.as_str());
699            let style = combined_class_style(class_styles, class_attr);
700            if is_blockquote_style(&style) {
701                format!("<blockquote><p>{inner}</p></blockquote>")
702            } else {
703                caps[0].to_string()
704            }
705        })
706        .into_owned()
707}
708
/// One `<ul>`/`<ol>` element located in export HTML.
#[derive(Debug, Clone)]
struct ExportListBlock {
    /// Byte offset of the element's start in the scanned HTML.
    start: usize,
    /// Byte offset just past the element's closing tag.
    end: usize,
    /// Lower-cased tag name (`ul` or `ol`).
    tag: String,
    /// Markup between the opening and closing tags.
    inner: String,
}
716
/// One `<li>` element flattened out of a group of adjacent list blocks.
#[derive(Debug, Clone)]
struct ExportListItem {
    /// Tag of the list block the item came from (`ul` or `ol`).
    tag: String,
    /// Zero-based nesting level computed by `google_docs_list_item_level`.
    level: usize,
    /// Markup between `<li ...>` and `</li>`.
    inner: String,
}
723
724fn nest_google_docs_lists(html: &str, class_styles: &HashMap<String, String>) -> String {
725    let list_re = Regex::new(r"(?is)<(ul|ol)\b([^>]*)>(.*?)</(ul|ol)>").expect("valid regex");
726    let blocks: Vec<ExportListBlock> = list_re
727        .captures_iter(html)
728        .filter_map(|caps| {
729            let open_tag = caps.get(1)?.as_str().to_ascii_lowercase();
730            let close_tag = caps.get(4)?.as_str().to_ascii_lowercase();
731            if open_tag != close_tag {
732                return None;
733            }
734            let whole = caps.get(0)?;
735            Some(ExportListBlock {
736                start: whole.start(),
737                end: whole.end(),
738                tag: open_tag,
739                inner: caps.get(3).map_or("", |m| m.as_str()).to_string(),
740            })
741        })
742        .collect();
743
744    if blocks.len() < 2 {
745        return html.to_string();
746    }
747
748    let mut groups: Vec<Vec<ExportListBlock>> = Vec::new();
749    let mut current: Vec<ExportListBlock> = Vec::new();
750    for block in blocks {
751        if let Some(previous) = current.last() {
752            if !html[previous.end..block.start].trim().is_empty() {
753                if current.len() > 1 {
754                    groups.push(std::mem::take(&mut current));
755                } else {
756                    current.clear();
757                }
758            }
759        }
760        current.push(block);
761    }
762    if current.len() > 1 {
763        groups.push(current);
764    }
765
766    if groups.is_empty() {
767        return html.to_string();
768    }
769
770    let mut out = html.to_string();
771    for group in groups.iter().rev() {
772        let rendered = render_nested_list_group(group, class_styles);
773        let start = group.first().expect("non-empty group").start;
774        let end = group.last().expect("non-empty group").end;
775        out.replace_range(start..end, &rendered);
776    }
777    out
778}
779
780fn render_nested_list_group(
781    group: &[ExportListBlock],
782    class_styles: &HashMap<String, String>,
783) -> String {
784    let item_re = Regex::new(r"(?is)<li\b([^>]*)>(.*?)</li>").expect("valid regex");
785    let items: Vec<ExportListItem> = group
786        .iter()
787        .flat_map(|block| {
788            item_re.captures_iter(&block.inner).map(|caps| {
789                let attrs = caps.get(1).map_or("", |m| m.as_str());
790                let inner = caps.get(2).map_or("", |m| m.as_str()).to_string();
791                ExportListItem {
792                    tag: block.tag.clone(),
793                    level: google_docs_list_item_level(attrs, class_styles),
794                    inner,
795                }
796            })
797        })
798        .collect();
799
800    if items.is_empty() {
801        let mut unchanged = String::new();
802        for block in group {
803            write!(unchanged, "<{}>{}</{}>", block.tag, block.inner, block.tag)
804                .expect("write to String");
805        }
806        return unchanged;
807    }
808
809    let mut html = String::new();
810    let mut current_level: Option<usize> = None;
811    let mut open_tags: Vec<Option<String>> = Vec::new();
812    let mut item_open: Vec<bool> = Vec::new();
813
814    for item in items {
815        let level = item.level;
816        while current_level.is_some_and(|current| current > level) {
817            let current = current_level.expect("checked as Some");
818            close_rendered_list(&mut html, &mut open_tags, &mut item_open, current);
819            current_level = current.checked_sub(1);
820        }
821
822        while current_level.is_none_or(|current| current < level) {
823            let next_level = current_level.map_or(0, |current| current + 1);
824            open_rendered_list(
825                &mut html,
826                &mut open_tags,
827                &mut item_open,
828                next_level,
829                &item.tag,
830            );
831            current_level = Some(next_level);
832        }
833
834        ensure_list_stack(&mut open_tags, &mut item_open, level);
835        if open_tags[level]
836            .as_deref()
837            .is_some_and(|tag| tag != item.tag)
838        {
839            close_rendered_list(&mut html, &mut open_tags, &mut item_open, level);
840            open_rendered_list(&mut html, &mut open_tags, &mut item_open, level, &item.tag);
841        } else if open_tags[level].is_none() {
842            open_rendered_list(&mut html, &mut open_tags, &mut item_open, level, &item.tag);
843        }
844
845        close_rendered_item(&mut html, &mut item_open, level);
846        html.push_str("<li>");
847        html.push_str(&item.inner);
848        item_open[level] = true;
849
850        for deeper in (level + 1)..item_open.len() {
851            item_open[deeper] = false;
852            open_tags[deeper] = None;
853        }
854    }
855
856    while let Some(current) = current_level {
857        close_rendered_list(&mut html, &mut open_tags, &mut item_open, current);
858        current_level = current.checked_sub(1);
859    }
860
861    html
862}
863
/// Grow both per-level stacks until `open_tags` has an entry for `level`.
///
/// Both stacks receive the same number of new entries (`None` / `false`);
/// nothing happens when `open_tags` already covers `level`.
fn ensure_list_stack(open_tags: &mut Vec<Option<String>>, item_open: &mut Vec<bool>, level: usize) {
    if open_tags.len() > level {
        return;
    }
    let missing = level - open_tags.len() + 1;
    open_tags.resize(open_tags.len() + missing, None);
    item_open.resize(item_open.len() + missing, false);
}
870
871fn open_rendered_list(
872    html: &mut String,
873    open_tags: &mut Vec<Option<String>>,
874    item_open: &mut Vec<bool>,
875    level: usize,
876    tag: &str,
877) {
878    ensure_list_stack(open_tags, item_open, level);
879    html.push('<');
880    html.push_str(tag);
881    html.push('>');
882    open_tags[level] = Some(tag.to_string());
883    item_open[level] = false;
884}
885
/// Emit `</li>` for `level` when an item is open there, and mark it closed.
///
/// A `level` outside the stack, or a level with no open item, is a no-op.
fn close_rendered_item(html: &mut String, item_open: &mut [bool], level: usize) {
    if let Some(open) = item_open.get_mut(level) {
        if *open {
            html.push_str("</li>");
            *open = false;
        }
    }
}
892
893fn close_rendered_list(
894    html: &mut String,
895    open_tags: &mut [Option<String>],
896    item_open: &mut [bool],
897    level: usize,
898) {
899    close_rendered_item(html, item_open, level);
900    if let Some(tag) = open_tags.get_mut(level).and_then(Option::take) {
901        html.push_str("</");
902        html.push_str(&tag);
903        html.push('>');
904    }
905}
906
907fn google_docs_list_item_level(attrs: &str, class_styles: &HashMap<String, String>) -> usize {
908    let style = combined_attr_style(class_styles, attrs);
909    let margin_left = css_point_value(&style, "margin-left");
910    if margin_left <= 0.0 {
911        return 0;
912    }
913    [54.0, 90.0, 126.0, 162.0, 198.0, 234.0, 270.0, 306.0]
914        .iter()
915        .take_while(|boundary| margin_left >= **boundary)
916        .count()
917}
918
919fn combined_attr_style(class_styles: &HashMap<String, String>, attrs: &str) -> String {
920    let mut styles = String::new();
921    if let Some(style) = attr_value(attrs, "style") {
922        styles.push_str(&style);
923    }
924    if let Some(class_attr) = attr_value(attrs, "class") {
925        styles.push_str(&combined_class_style(class_styles, &class_attr));
926    }
927    styles
928}
929
930fn attr_value(attrs: &str, name: &str) -> Option<String> {
931    let attr_re = Regex::new(&format!(
932        r#"(?is)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)')"#,
933        regex::escape(name)
934    ))
935    .expect("valid regex");
936    attr_re.captures(attrs).and_then(|caps| {
937        caps.get(1)
938            .or_else(|| caps.get(2))
939            .map(|value| value.as_str().to_string())
940    })
941}
942
943fn strip_google_docs_heading_noise(html: &str) -> String {
944    let empty_anchor_re = Regex::new(r#"(?is)<a\s+id="[^"]*"\s*>\s*</a>"#).expect("valid regex");
945    let numbering_re =
946        Regex::new(r"(?is)<span\b[^>]*>\s*\d+(?:\.\d+)*\.?\s*</span>").expect("valid regex");
947    let mut out = empty_anchor_re.replace_all(html, "").into_owned();
948    for level in 1..=6 {
949        let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
950            .expect("valid regex");
951        out = heading_re
952            .replace_all(&out, |caps: &regex::Captures<'_>| {
953                let open = &caps[1];
954                let inner = &caps[2];
955                let close = &caps[3];
956                let mut cleaned = empty_anchor_re.replace_all(inner, "").into_owned();
957                cleaned = numbering_re.replace_all(&cleaned, "").into_owned();
958                format!("{open}{cleaned}{close}")
959            })
960            .into_owned();
961    }
962    out
963}
964
965fn strip_heading_inline_formatting(html: &str) -> String {
966    let inline_marker_re = Regex::new(r"(?is)</?(?:strong|em|del)>").expect("valid regex");
967    let mut out = html.to_string();
968    for level in 1..=6 {
969        let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
970            .expect("valid regex");
971        out = heading_re
972            .replace_all(&out, |caps: &regex::Captures<'_>| {
973                let open = &caps[1];
974                let inner = &caps[2];
975                let close = &caps[3];
976                let cleaned = inline_marker_re.replace_all(inner, "");
977                format!("{open}{cleaned}{close}")
978            })
979            .into_owned();
980    }
981    out
982}
983
984fn unwrap_google_redirect_links(html: &str, unwrapped_links: &mut usize) -> String {
985    let redirect_re =
986        Regex::new(r#"(?i)href="https?://(?:www\.)?google\.com/url\?q=([^&"]+)[^"]*""#)
987            .expect("valid regex");
988    redirect_re
989        .replace_all(html, |caps: &regex::Captures<'_>| {
990            let encoded = caps.get(1).map_or("", |m| m.as_str());
991            let decoded = percent_decode_utf8_lossy(encoded);
992            *unwrapped_links += 1;
993            format!(r#"href="{decoded}""#)
994        })
995        .into_owned()
996}
997
998fn extract_css_class_styles(html: &str) -> HashMap<String, String> {
999    let mut class_styles: HashMap<String, String> = HashMap::new();
1000    let style_re = Regex::new(r"(?is)<style\b[^>]*>(.*?)</style>").expect("valid regex");
1001    let class_re = Regex::new(r"\.([A-Za-z0-9_-]+)\s*\{([^{}]*)\}").expect("valid regex");
1002    for style_caps in style_re.captures_iter(html) {
1003        let css = style_caps.get(1).map_or("", |m| m.as_str());
1004        for class_caps in class_re.captures_iter(css) {
1005            let class_name = class_caps.get(1).map_or("", |m| m.as_str());
1006            let style = class_caps.get(2).map_or("", |m| m.as_str());
1007            class_styles
1008                .entry(class_name.to_string())
1009                .and_modify(|existing| {
1010                    existing.push(';');
1011                    existing.push_str(style);
1012                })
1013                .or_insert_with(|| style.to_string());
1014        }
1015    }
1016    class_styles
1017}
1018
/// Concatenates the stored declarations of every class named in `class_attr`.
///
/// Class names are separated by whitespace. Each known class contributes
/// `;` followed by its declarations, in the order the names appear; names
/// missing from `class_styles` are skipped.
fn combined_class_style(class_styles: &HashMap<String, String>, class_attr: &str) -> String {
    let mut combined = String::new();
    for class_name in class_attr.split_whitespace() {
        if let Some(declarations) = class_styles.get(class_name) {
            combined.push(';');
            combined.push_str(declarations);
        }
    }
    combined
}
1029
1030fn semantic_wrapped_html(inner: &str, style: &str) -> Option<String> {
1031    let bold = css_has_bold(style);
1032    let italic = css_has_italic(style);
1033    let strike = css_has_strike(style);
1034    if !bold && !italic && !strike {
1035        return None;
1036    }
1037    let mut wrapped = inner.to_string();
1038    if strike {
1039        wrapped = format!("<del>{wrapped}</del>");
1040    }
1041    if italic {
1042        wrapped = format!("<em>{wrapped}</em>");
1043    }
1044    if bold {
1045        wrapped = format!("<strong>{wrapped}</strong>");
1046    }
1047    Some(wrapped)
1048}
1049
1050fn css_has_bold(style: &str) -> bool {
1051    Regex::new(r"(?i)font-weight\s*:\s*(?:bold|[6-9]\d{2})")
1052        .expect("valid regex")
1053        .is_match(style)
1054}
1055
1056fn css_has_italic(style: &str) -> bool {
1057    Regex::new(r"(?i)font-style\s*:\s*italic")
1058        .expect("valid regex")
1059        .is_match(style)
1060}
1061
1062fn css_has_strike(style: &str) -> bool {
1063    Regex::new(r"(?i)text-decoration[^;]*\bline-through\b")
1064        .expect("valid regex")
1065        .is_match(style)
1066}
1067
1068fn is_blockquote_style(style: &str) -> bool {
1069    let margin_left = css_point_value(style, "margin-left");
1070    let margin_right = css_point_value(style, "margin-right");
1071    margin_left > 0.0 && margin_right > 0.0 && (margin_left - margin_right).abs() < 0.1
1072}
1073
1074fn css_point_value(style: &str, property: &str) -> f64 {
1075    let re = Regex::new(&format!(
1076        r"(?i){}\s*:\s*(-?\d+(?:\.\d+)?)pt",
1077        regex::escape(property)
1078    ))
1079    .expect("valid regex");
1080    re.captures(style)
1081        .and_then(|caps| caps.get(1))
1082        .and_then(|value| value.as_str().parse::<f64>().ok())
1083        .unwrap_or(0.0)
1084}
1085
/// Decode `%XX` percent escapes in `input`.
///
/// A `%` is decoded only when two hexadecimal digits follow it; any other
/// `%` is copied through unchanged, so well-formed ASCII URLs without
/// escapes round-trip as-is. The decoded bytes are interpreted as UTF-8 and
/// invalid sequences are replaced with U+FFFD.
fn percent_decode_utf8_lossy(input: &str) -> String {
    /// Numeric value of an ASCII hexadecimal digit.
    fn hex_value(byte: u8) -> Option<u8> {
        match byte {
            b'0'..=b'9' => Some(byte - b'0'),
            b'a'..=b'f' => Some(byte - b'a' + 10),
            b'A'..=b'F' => Some(byte - b'A' + 10),
            _ => None,
        }
    }

    let raw = input.as_bytes();
    let mut out = Vec::with_capacity(raw.len());
    let mut pos = 0;
    while let Some(&current) = raw.get(pos) {
        if current == b'%' {
            let hi = raw.get(pos + 1).copied().and_then(hex_value);
            let lo = raw.get(pos + 2).copied().and_then(hex_value);
            if let (Some(hi), Some(lo)) = (hi, lo) {
                out.push((hi << 4) | lo);
                pos += 3;
                continue;
            }
        }
        out.push(current);
        pos += 1;
    }
    String::from_utf8_lossy(&out).into_owned()
}
1109
/// Removes the backslash from the escaped punctuation `\.`, `\!`, `\(`,
/// `\)`, `\[` and `\]` in `markdown`.
///
/// The replacements run one after another in that order; other backslash
/// escapes are left as they are.
fn unescape_public_export_punctuation(markdown: &str) -> String {
    const UNESCAPES: [(&str, &str); 6] = [
        ("\\.", "."),
        ("\\!", "!"),
        ("\\(", "("),
        ("\\)", ")"),
        ("\\[", "["),
        ("\\]", "]"),
    ];

    let mut text = markdown.to_string();
    for (escaped, plain) in UNESCAPES {
        text = text.replace(escaped, plain);
    }
    text
}
1119
/// Rewrites setext-style headings as ATX headings.
///
/// A non-blank line followed by an underline of `=` becomes `# text`, and one
/// followed by an underline of `-` becomes `## text` (see
/// `is_setext_underline` for what counts as an underline). The heading text
/// is trimmed and the underline line is dropped. Lines are re-joined with
/// `\n`, so a trailing newline in `markdown` is not preserved.
///
/// A blank line is never treated as heading text: a `-----` line that follows
/// an empty line is a horizontal rule and is kept as-is instead of being
/// turned into an empty `## ` heading.
fn convert_setext_headings(markdown: &str) -> String {
    let lines: Vec<&str> = markdown.lines().collect();
    let mut out = Vec::with_capacity(lines.len());
    let mut index = 0;
    while index < lines.len() {
        let title = lines[index].trim();
        // Only look at the next line when the current one can be a heading.
        let prefix = lines
            .get(index + 1)
            .filter(|_| !title.is_empty())
            .map(|next| next.trim())
            .and_then(|underline| {
                if is_setext_underline(underline, '=') {
                    Some("#")
                } else if is_setext_underline(underline, '-') {
                    Some("##")
                } else {
                    None
                }
            });

        if let Some(prefix) = prefix {
            out.push(format!("{prefix} {title}"));
            index += 2;
        } else {
            out.push(lines[index].to_string());
            index += 1;
        }
    }
    out.join("\n")
}

/// Returns `true` when `line` consists solely of `marker` characters and is
/// at least five bytes long.
///
/// The length floor keeps short runs such as `---` from being read as a
/// heading underline.
fn is_setext_underline(line: &str, marker: char) -> bool {
    line.len() >= 5 && line.chars().all(|ch| ch == marker)
}
1147
1148fn normalize_atx_headings(markdown: &str) -> String {
1149    let heading_re = Regex::new(r"^(#{1,6})\s+(.+?)\s*$").expect("valid regex");
1150    let closing_re = closing_atx_heading_re();
1151    markdown
1152        .lines()
1153        .map(|line| {
1154            let Some(caps) = heading_re.captures(line) else {
1155                return line.to_string();
1156            };
1157            let hashes = caps.get(1).map_or("", |m| m.as_str());
1158            let mut text = caps.get(2).map_or("", |m| m.as_str()).trim().to_string();
1159            text = closing_re.replace(&text, "").trim().to_string();
1160            text = strip_wrapping_markdown_emphasis(&text);
1161            format!("{hashes} {text}")
1162        })
1163        .collect::<Vec<_>>()
1164        .join("\n")
1165}
1166
/// Removes one layer of `***`, `**` or `*` emphasis that wraps the whole of
/// `text`, returning the trimmed inner text.
///
/// Markers are tried longest first. A marker is only removed when the text
/// between the first and last occurrence does not contain that marker again;
/// otherwise the two ends belong to different spans (`**A** and **B**`) and
/// stripping them would leave unbalanced markers behind. For the single `*`
/// marker, nested `**` pairs in the interior are ignored by this check, so
/// `*a **b** c*` is still unwrapped.
///
/// When no marker qualifies, the trimmed input is returned unchanged.
fn strip_wrapping_markdown_emphasis(text: &str) -> String {
    let trimmed = text.trim();
    for marker in ["***", "**", "*"] {
        let Some(inner) = trimmed
            .strip_prefix(marker)
            .and_then(|rest| rest.strip_suffix(marker))
        else {
            continue;
        };
        if inner.is_empty() {
            continue;
        }
        // A further occurrence of the marker means the outer ones are not a pair.
        let marker_reappears = if marker == "*" {
            inner.replace("**", "").contains('*')
        } else {
            inner.contains(marker)
        };
        if marker_reappears {
            continue;
        }
        return inner.trim().to_string();
    }
    trimmed.to_string()
}
1181
1182fn normalize_bullet_markers(markdown: &str) -> String {
1183    let bullet_re = asterisk_bullet_re();
1184    markdown
1185        .lines()
1186        .map(|line| bullet_re.replace(line, "$1- ").into_owned())
1187        .collect::<Vec<_>>()
1188        .join("\n")
1189}
1190
1191fn normalize_list_spacing(markdown: &str) -> String {
1192    let lines: Vec<&str> = markdown.lines().collect();
1193    let mut out = Vec::with_capacity(lines.len());
1194
1195    for (index, line) in lines.iter().enumerate() {
1196        if line.trim().is_empty()
1197            && previous_non_empty_line(&lines, index).is_some_and(is_markdown_list_item)
1198            && next_non_empty_line(&lines, index).is_some_and(is_markdown_list_item)
1199        {
1200            continue;
1201        }
1202        out.push((*line).to_string());
1203    }
1204
1205    out.join("\n")
1206}
1207
/// Returns the closest line before `index` that is not blank after trimming,
/// or `None` when every earlier line is blank.
///
/// Panics if `index` is greater than `lines.len()`.
fn previous_non_empty_line<'a>(lines: &'a [&str], index: usize) -> Option<&'a str> {
    let earlier = &lines[..index];
    for candidate in earlier.iter().rev() {
        if !candidate.trim().is_empty() {
            return Some(*candidate);
        }
    }
    None
}
1215
/// Returns the closest line after `index` that is not blank after trimming,
/// or `None` when every later line is blank.
///
/// Panics if `index + 1` is greater than `lines.len()`.
fn next_non_empty_line<'a>(lines: &'a [&str], index: usize) -> Option<&'a str> {
    let later = &lines[index + 1..];
    for candidate in later {
        if !candidate.trim().is_empty() {
            return Some(*candidate);
        }
    }
    None
}
1222
1223fn is_markdown_list_item(line: &str) -> bool {
1224    markdown_list_item_re().is_match(line)
1225}
1226
/// Normalises blank lines in and around `> ` block quotes.
///
/// - Blank lines and bare `>` lines inside a quote are collapsed into a single
///   `>` line, emitted only if another `> ` line follows.
/// - A bare `>` line outside a quote is dropped.
/// - The first non-blank, non-quote line after a quote is preceded by exactly
///   one empty line.
///
/// Every emitted line is terminated with `\n`.
fn normalize_blockquote_spacing(markdown: &str) -> String {
    let mut result = String::with_capacity(markdown.len());
    let mut inside_quote = false;
    // Set when a gap was seen inside a quote but not yet written out.
    let mut gap_pending = false;

    for line in markdown.lines() {
        let stripped = line.trim();
        let is_blank = stripped.is_empty();

        // Quote-internal gaps are deferred until the quote is known to continue.
        if (is_blank && inside_quote) || stripped == ">" {
            if inside_quote {
                gap_pending = true;
            }
            continue;
        }

        if line.starts_with("> ") {
            if std::mem::take(&mut gap_pending) {
                result.push_str(">\n");
            }
            inside_quote = true;
        } else {
            if inside_quote && !is_blank {
                // Separate the finished quote from the following content.
                result.push('\n');
            }
            gap_pending = false;
            inside_quote = false;
        }
        result.push_str(line);
        result.push('\n');
    }

    result
}
1267
1268fn normalize_markdown_tables(markdown: &str) -> String {
1269    let lines: Vec<&str> = markdown.lines().collect();
1270    let mut out = Vec::with_capacity(lines.len());
1271    let mut index = 0;
1272
1273    while index < lines.len() {
1274        if !is_markdown_table_line(lines[index]) {
1275            out.push(lines[index].to_string());
1276            index += 1;
1277            continue;
1278        }
1279
1280        let start = index;
1281        while index < lines.len() && is_markdown_table_line(lines[index]) {
1282            index += 1;
1283        }
1284        let block = &lines[start..index];
1285        if block.len() >= 2 && is_markdown_separator_line(block[1]) {
1286            out.extend(normalize_markdown_table_block(block));
1287        } else {
1288            out.extend(block.iter().map(|line| (*line).to_string()));
1289        }
1290    }
1291
1292    out.join("\n")
1293}
1294
/// Returns `true` when the trimmed line starts and ends with `|` and
/// contains at least two `|` characters in total.
fn is_markdown_table_line(line: &str) -> bool {
    let row = line.trim();
    let pipe_count = row.chars().filter(|ch| *ch == '|').count();
    pipe_count >= 2 && row.starts_with('|') && row.ends_with('|')
}
1299
1300fn is_markdown_separator_line(line: &str) -> bool {
1301    split_markdown_table_cells(line)
1302        .iter()
1303        .all(|cell| markdown_table_separator_cell_re().is_match(cell))
1304}
1305
1306fn normalize_markdown_table_block(lines: &[&str]) -> Vec<String> {
1307    lines
1308        .iter()
1309        .enumerate()
1310        .map(|(index, line)| {
1311            let cells = split_markdown_table_cells(line);
1312            if index == 1 {
1313                let separators = vec!["---".to_string(); cells.len()];
1314                render_markdown_table_row(&separators)
1315            } else {
1316                render_markdown_table_row(&cells)
1317            }
1318        })
1319        .collect()
1320}
1321
/// Splits one Markdown table row into its trimmed cell texts.
///
/// The line is trimmed, then a single leading and a single trailing `|` are
/// removed and the rest is split on `|`. Only one delimiter is removed per
/// side so that empty edge cells written without spaces survive: `|| a |`
/// yields `["", "a"]` rather than just `["a"]`.
///
/// Escaped pipes (`\|`) are not treated specially.
fn split_markdown_table_cells(line: &str) -> Vec<String> {
    let row = line.trim();
    let row = row.strip_prefix('|').unwrap_or(row);
    let row = row.strip_suffix('|').unwrap_or(row);
    row.split('|').map(|cell| cell.trim().to_string()).collect()
}
1329
/// Renders cells as one Markdown table row: `| c1 | c2 | ... |`.
///
/// An empty slice renders as `|  |`.
fn render_markdown_table_row(cells: &[String]) -> String {
    let mut row = String::from("| ");
    for (position, cell) in cells.iter().enumerate() {
        if position > 0 {
            row.push_str(" | ");
        }
        row.push_str(cell);
    }
    row.push_str(" |");
    row
}
1333
1334fn closing_atx_heading_re() -> &'static Regex {
1335    static RE: OnceLock<Regex> = OnceLock::new();
1336    RE.get_or_init(|| Regex::new(r"\s+#{1,6}$").expect("valid regex"))
1337}
1338
1339fn asterisk_bullet_re() -> &'static Regex {
1340    static RE: OnceLock<Regex> = OnceLock::new();
1341    RE.get_or_init(|| Regex::new(r"^(\s*)\* ").expect("valid regex"))
1342}
1343
1344fn markdown_list_item_re() -> &'static Regex {
1345    static RE: OnceLock<Regex> = OnceLock::new();
1346    RE.get_or_init(|| Regex::new(r"^\s*(?:[-+*]|\d+\.)\s+").expect("valid regex"))
1347}
1348
1349fn markdown_table_separator_cell_re() -> &'static Regex {
1350    static RE: OnceLock<Regex> = OnceLock::new();
1351    RE.get_or_init(|| Regex::new(r"^:?-{3,}:?$").expect("valid regex"))
1352}
1353
1354/// Fetch and render a Google Docs document via the authenticated REST API.
1355///
1356/// # Errors
1357///
1358/// Returns an error when the URL is invalid, no token is provided, or the API request fails.
1359pub async fn fetch_google_doc_from_docs_api(
1360    url: &str,
1361    api_token: &str,
1362) -> crate::Result<GDocsRenderedResult> {
1363    let document_id = extract_document_id(url).ok_or_else(|| {
1364        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
1365    })?;
1366    let api_url = build_docs_api_url(&document_id);
1367    debug!(
1368        document_id = %document_id,
1369        api_url = %api_url,
1370        "fetching Google Doc via Docs API"
1371    );
1372
1373    let response = reqwest::Client::new()
1374        .get(&api_url)
1375        .header("Authorization", format!("Bearer {api_token}"))
1376        .header("Accept", "application/json")
1377        .send()
1378        .await
1379        .map_err(|e| {
1380            WebCaptureError::FetchError(format!("Failed to fetch Google Doc via Docs API: {e}"))
1381        })?;
1382    debug!(
1383        document_id = %document_id,
1384        status = response.status().as_u16(),
1385        success = response.status().is_success(),
1386        content_type = response
1387            .headers()
1388            .get(reqwest::header::CONTENT_TYPE)
1389            .and_then(|value| value.to_str().ok())
1390            .unwrap_or(""),
1391        "received Google Docs API response"
1392    );
1393
1394    if !response.status().is_success() {
1395        return Err(WebCaptureError::FetchError(format!(
1396            "Failed to fetch Google Doc via Docs API ({} {}): {}",
1397            response.status().as_u16(),
1398            response.status().canonical_reason().unwrap_or("Unknown"),
1399            api_url
1400        )));
1401    }
1402
1403    let body = response.text().await.map_err(|e| {
1404        WebCaptureError::FetchError(format!("Failed to read Google Docs API response: {e}"))
1405    })?;
1406    let document = serde_json::from_str::<Value>(&body).map_err(|e| {
1407        WebCaptureError::ParseError(format!("Failed to parse Google Docs API response: {e}"))
1408    })?;
1409    let rendered = render_docs_api_document(&document);
1410    debug!(
1411        document_id = %document_id,
1412        title = document.get("title").and_then(|value| value.as_str()).unwrap_or(""),
1413        markdown_bytes = rendered.markdown.len(),
1414        html_bytes = rendered.html.len(),
1415        text_bytes = rendered.text.len(),
1416        "rendered Google Docs API document"
1417    );
1418
1419    Ok(GDocsRenderedResult {
1420        markdown: rendered.markdown,
1421        html: rendered.html,
1422        text: rendered.text,
1423        document_id,
1424        export_url: api_url,
1425        remote_images: Vec::new(),
1426    })
1427}
1428
1429/// Fetch and render the model data embedded in the Google Docs `/edit` route.
1430///
1431/// # Errors
1432///
1433/// Returns an error when the URL is invalid, the fetch fails, or no model chunks are present.
1434pub async fn fetch_google_doc_from_model(
1435    url: &str,
1436    api_token: Option<&str>,
1437) -> crate::Result<GDocsRenderedResult> {
1438    if api_token.is_some() {
1439        return Err(WebCaptureError::BrowserError(
1440            "Rust browser-model Google Docs capture cannot inject API tokens; use --capture api for authenticated Docs API capture".to_string(),
1441        ));
1442    }
1443    let document_id = extract_document_id(url).ok_or_else(|| {
1444        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
1445    })?;
1446    let edit_url = build_edit_url(&document_id);
1447    debug!(
1448        document_id = %document_id,
1449        edit_url = %edit_url,
1450        "capturing Google Doc editor model with a real browser"
1451    );
1452    let model_data = fetch_google_doc_editor_model_with_cdp(&edit_url, &document_id).await?;
1453    let BrowserModelData {
1454        chunks,
1455        cid_urls,
1456        chunk_payload_bytes,
1457        poll_count,
1458        stable_for,
1459    } = model_data;
1460    debug!(
1461        document_id = %document_id,
1462        chunks = chunks.len(),
1463        cid_urls = cid_urls.len(),
1464        chunk_payload_bytes,
1465        poll_count,
1466        stable_for_ms = stable_for.as_millis(),
1467        "extracted Google Docs editor model chunks through CDP"
1468    );
1469    if chunks.is_empty() {
1470        return Err(WebCaptureError::ParseError(
1471            "Google Docs editor page did not expose DOCS_modelChunk data".to_string(),
1472        ));
1473    }
1474
1475    let capture = parse_model_chunks(&chunks, &cid_urls);
1476    let remote_images = remote_images_from_capture(&capture);
1477    info!(
1478        document_id = %document_id,
1479        chunks = chunks.len(),
1480        cid_urls = cid_urls.len(),
1481        chunk_payload_bytes,
1482        poll_count,
1483        stable_for_ms = stable_for.as_millis(),
1484        blocks = capture.blocks.len(),
1485        tables = capture.tables.len(),
1486        images = capture.images.len(),
1487        text_bytes = capture.text.len(),
1488        "parsed Google Docs editor model"
1489    );
1490
1491    Ok(GDocsRenderedResult {
1492        markdown: render_captured_document(&capture, "markdown"),
1493        html: render_captured_document(&capture, "html"),
1494        text: render_captured_document(&capture, "txt"),
1495        document_id,
1496        export_url: edit_url,
1497        remote_images,
1498    })
1499}
1500
1501async fn fetch_google_doc_editor_model_with_cdp(
1502    edit_url: &str,
1503    document_id: &str,
1504) -> crate::Result<BrowserModelData> {
1505    let chrome = crate::browser::find_chrome_executable().ok_or_else(|| {
1506        WebCaptureError::BrowserError(
1507            "Chrome/Chromium executable was not found. Set WEB_CAPTURE_CHROME, CHROME_PATH, or GOOGLE_CHROME_BIN.".to_string(),
1508        )
1509    })?;
1510    let user_data_dir = crate::browser::temporary_user_data_dir();
1511    std::fs::create_dir_all(&user_data_dir)?;
1512
1513    debug!(
1514        document_id = %document_id,
1515        chrome = %chrome.display(),
1516        user_data_dir = %user_data_dir.display(),
1517        edit_url = %edit_url,
1518        "launching headless Chrome CDP session for Google Docs model capture"
1519    );
1520
1521    let mut child = launch_cdp_chrome(&chrome, &user_data_dir)?;
1522    let capture_result = async {
1523        let ws_url = wait_for_devtools_ws_url(&mut child).await?;
1524        let (mut ws, _) = connect_async(&ws_url).await.map_err(|error| {
1525            WebCaptureError::BrowserError(format!(
1526                "Failed to connect to Chrome DevTools websocket: {error}"
1527            ))
1528        })?;
1529        let mut next_id = 0u64;
1530        let session_id = navigate_google_docs_cdp_page(&mut ws, &mut next_id, edit_url).await?;
1531        wait_for_google_docs_model_chunks(&mut ws, &mut next_id, &session_id, document_id).await
1532    }
1533    .await;
1534
1535    if let Err(error) = child.kill().await {
1536        debug!(
1537            document_id = %document_id,
1538            error = %error,
1539            "failed to kill Chrome CDP browser process"
1540        );
1541    }
1542    let _ = child.wait().await;
1543    let _ = std::fs::remove_dir_all(&user_data_dir);
1544
1545    capture_result
1546}
1547
1548async fn navigate_google_docs_cdp_page(
1549    ws: &mut CdpWebSocket,
1550    next_id: &mut u64,
1551    edit_url: &str,
1552) -> crate::Result<String> {
1553    let target = cdp_send(
1554        ws,
1555        next_id,
1556        None,
1557        "Target.createTarget",
1558        serde_json::json!({ "url": "about:blank" }),
1559    )
1560    .await?;
1561    let target_id = target
1562        .get("targetId")
1563        .and_then(Value::as_str)
1564        .ok_or_else(|| {
1565            WebCaptureError::BrowserError(
1566                "Chrome DevTools Target.createTarget did not return targetId".to_string(),
1567            )
1568        })?
1569        .to_string();
1570    let attached = cdp_send(
1571        ws,
1572        next_id,
1573        None,
1574        "Target.attachToTarget",
1575        serde_json::json!({ "targetId": target_id, "flatten": true }),
1576    )
1577    .await?;
1578    let session_id = attached
1579        .get("sessionId")
1580        .and_then(Value::as_str)
1581        .ok_or_else(|| {
1582            WebCaptureError::BrowserError(
1583                "Chrome DevTools Target.attachToTarget did not return sessionId".to_string(),
1584            )
1585        })?
1586        .to_string();
1587
1588    cdp_send(
1589        ws,
1590        next_id,
1591        Some(&session_id),
1592        "Page.enable",
1593        serde_json::json!({}),
1594    )
1595    .await?;
1596    cdp_send(
1597        ws,
1598        next_id,
1599        Some(&session_id),
1600        "Runtime.enable",
1601        serde_json::json!({}),
1602    )
1603    .await?;
1604    cdp_send(
1605        ws,
1606        next_id,
1607        Some(&session_id),
1608        "Page.addScriptToEvaluateOnNewDocument",
1609        serde_json::json!({ "source": GDOCS_MODEL_CAPTURE_INIT_SCRIPT }),
1610    )
1611    .await?;
1612    cdp_send(
1613        ws,
1614        next_id,
1615        Some(&session_id),
1616        "Page.navigate",
1617        serde_json::json!({ "url": edit_url }),
1618    )
1619    .await?;
1620
1621    Ok(session_id)
1622}
1623
1624async fn wait_for_google_docs_model_chunks(
1625    ws: &mut CdpWebSocket,
1626    next_id: &mut u64,
1627    session_id: &str,
1628    document_id: &str,
1629) -> crate::Result<BrowserModelData> {
1630    let started = Instant::now();
1631    let max_wait = gdocs_editor_model_max_wait();
1632    let stability_window = gdocs_editor_model_stability_window();
1633    let mut quiescence = BrowserModelQuiescence::default();
1634    let mut last_chunks = 0usize;
1635    let mut last_cid_urls = 0usize;
1636    let mut last_payload_bytes = 0usize;
1637    let mut last_stable_for = Duration::ZERO;
1638    let mut poll_count = 0usize;
1639
1640    while started.elapsed() < max_wait {
1641        let result = cdp_send(
1642            ws,
1643            next_id,
1644            Some(session_id),
1645            "Runtime.evaluate",
1646            serde_json::json!({
1647                "expression": format!("({GDOCS_MODEL_EXTRACT_SCRIPT})()"),
1648                "returnByValue": true,
1649                "awaitPromise": true
1650            }),
1651        )
1652        .await?;
1653        if let Some(exception) = result.get("exceptionDetails") {
1654            return Err(WebCaptureError::BrowserError(format!(
1655                "Google Docs model extraction script failed: {exception}"
1656            )));
1657        }
1658        let value = result
1659            .pointer("/result/value")
1660            .cloned()
1661            .unwrap_or(Value::Null);
1662        let model_data = browser_model_data_from_value(&value);
1663        poll_count += 1;
1664        let fingerprint = model_data.fingerprint();
1665        last_chunks = model_data.chunks.len();
1666        last_cid_urls = model_data.cid_urls.len();
1667        last_payload_bytes = model_data.chunk_payload_bytes;
1668        let now = Instant::now();
1669        if let Some(stable_for) = quiescence.observe(fingerprint, now, stability_window) {
1670            let mut model_data = model_data;
1671            model_data.poll_count = poll_count;
1672            model_data.stable_for = stable_for;
1673            debug!(
1674                document_id = %document_id,
1675                chunks = model_data.chunks.len(),
1676                cid_urls = model_data.cid_urls.len(),
1677                chunk_payload_bytes = model_data.chunk_payload_bytes,
1678                poll_count,
1679                stable_for_ms = stable_for.as_millis(),
1680                elapsed_ms = started.elapsed().as_millis(),
1681                "captured quiesced Google Docs model chunks through CDP Runtime.evaluate"
1682            );
1683            return Ok(model_data);
1684        }
1685        last_stable_for = quiescence.stable_for(now);
1686        tokio::time::sleep(GDOCS_EDITOR_MODEL_POLL_INTERVAL).await;
1687    }
1688
1689    Err(WebCaptureError::BrowserError(format!(
1690        "Timed out waiting for Google Docs DOCS_modelChunk stream to quiesce for document {document_id} after {} ms (last chunks={last_chunks}, payload_bytes={last_payload_bytes}, cid_urls={last_cid_urls}, poll_count={poll_count}, stable_for_ms={})",
1691        max_wait.as_millis(),
1692        last_stable_for.as_millis()
1693    )))
1694}
1695
1696fn launch_cdp_chrome(
1697    chrome: &std::path::Path,
1698    user_data_dir: &std::path::Path,
1699) -> crate::Result<Child> {
1700    let mut command = Command::new(chrome);
1701    command
1702        .args([
1703            "--headless=new",
1704            "--disable-gpu",
1705            "--disable-extensions",
1706            "--disable-dev-shm-usage",
1707            "--disable-background-networking",
1708            "--disable-component-update",
1709            "--disable-default-apps",
1710            "--disable-sync",
1711            "--metrics-recording-only",
1712            "--no-default-browser-check",
1713            "--no-first-run",
1714            "--no-sandbox",
1715            "--remote-debugging-port=0",
1716            "--window-size=1280,800",
1717        ])
1718        .arg(format!("--user-data-dir={}", user_data_dir.display()))
1719        .arg(format!("--user-agent={GDOCS_USER_AGENT}"))
1720        .stderr(Stdio::piped())
1721        .stdout(Stdio::null())
1722        .kill_on_drop(true);
1723
1724    command.spawn().map_err(|error| {
1725        WebCaptureError::BrowserError(format!("Failed to launch Chrome CDP browser: {error}"))
1726    })
1727}
1728
1729async fn wait_for_devtools_ws_url(child: &mut Child) -> crate::Result<String> {
1730    let stderr = child.stderr.take().ok_or_else(|| {
1731        WebCaptureError::BrowserError("Chrome CDP process did not expose stderr".to_string())
1732    })?;
1733    let mut lines = BufReader::new(stderr).lines();
1734    let started = Instant::now();
1735
1736    while started.elapsed() < GDOCS_BROWSER_LAUNCH_TIMEOUT {
1737        let line = tokio::time::timeout(Duration::from_millis(250), lines.next_line()).await;
1738        match line {
1739            Ok(Ok(Some(line))) => {
1740                if let Some((_, ws_url)) = line.split_once("DevTools listening on ") {
1741                    return Ok(ws_url.trim().to_string());
1742                }
1743            }
1744            Ok(Ok(None)) => {
1745                break;
1746            }
1747            Ok(Err(error)) => {
1748                return Err(WebCaptureError::BrowserError(format!(
1749                    "Failed to read Chrome CDP stderr: {error}"
1750                )));
1751            }
1752            Err(_) => {}
1753        }
1754    }
1755
1756    Err(WebCaptureError::BrowserError(format!(
1757        "Timed out waiting for Chrome DevTools websocket URL after {} ms",
1758        GDOCS_BROWSER_LAUNCH_TIMEOUT.as_millis()
1759    )))
1760}
1761
1762async fn cdp_send(
1763    ws: &mut CdpWebSocket,
1764    next_id: &mut u64,
1765    session_id: Option<&str>,
1766    method: &str,
1767    params: Value,
1768) -> crate::Result<Value> {
1769    *next_id += 1;
1770    let id = *next_id;
1771    let mut message = serde_json::json!({
1772        "id": id,
1773        "method": method,
1774        "params": params
1775    });
1776    if let Some(session_id) = session_id {
1777        message["sessionId"] = Value::String(session_id.to_string());
1778    }
1779
1780    ws.send(Message::Text(message.to_string()))
1781        .await
1782        .map_err(|error| {
1783            WebCaptureError::BrowserError(format!(
1784                "Failed to send Chrome DevTools command {method}: {error}"
1785            ))
1786        })?;
1787
1788    while let Some(message) = ws.next().await {
1789        let message = message.map_err(|error| {
1790            WebCaptureError::BrowserError(format!(
1791                "Failed to read Chrome DevTools response for {method}: {error}"
1792            ))
1793        })?;
1794        if !message.is_text() {
1795            continue;
1796        }
1797        let text = message.to_text().map_err(|error| {
1798            WebCaptureError::BrowserError(format!(
1799                "Chrome DevTools response for {method} was not text: {error}"
1800            ))
1801        })?;
1802        let value = serde_json::from_str::<Value>(text).map_err(|error| {
1803            WebCaptureError::ParseError(format!(
1804                "Failed to parse Chrome DevTools response for {method}: {error}; response={text}"
1805            ))
1806        })?;
1807        if value.get("id").and_then(Value::as_u64) != Some(id) {
1808            continue;
1809        }
1810        if let Some(error) = value.get("error") {
1811            return Err(WebCaptureError::BrowserError(format!(
1812                "Chrome DevTools command {method} failed: {error}"
1813            )));
1814        }
1815        return Ok(value.get("result").cloned().unwrap_or(Value::Null));
1816    }
1817
1818    Err(WebCaptureError::BrowserError(format!(
1819        "Chrome DevTools websocket closed before response for {method}"
1820    )))
1821}
1822
1823fn browser_model_data_from_value(value: &Value) -> BrowserModelData {
1824    let chunks = value
1825        .get("chunks")
1826        .and_then(Value::as_array)
1827        .cloned()
1828        .unwrap_or_default();
1829    let chunk_payload_bytes = model_chunk_payload_bytes(&chunks);
1830    let cid_urls = value
1831        .get("cidUrlMap")
1832        .and_then(Value::as_object)
1833        .map(|map| {
1834            map.iter()
1835                .filter_map(|(key, value)| value.as_str().map(|url| (key.clone(), url.to_string())))
1836                .collect::<HashMap<_, _>>()
1837        })
1838        .unwrap_or_default();
1839    BrowserModelData {
1840        chunks,
1841        cid_urls,
1842        chunk_payload_bytes,
1843        poll_count: 0,
1844        stable_for: Duration::ZERO,
1845    }
1846}
1847
1848fn model_chunk_payload_bytes(chunks: &[Value]) -> usize {
1849    chunks
1850        .iter()
1851        .map(|chunk| serde_json::to_vec(chunk).map_or(0, |encoded| encoded.len()))
1852        .sum()
1853}
1854
1855fn gdocs_editor_model_max_wait() -> Duration {
1856    duration_from_env_ms(
1857        "WEB_CAPTURE_GDOCS_MAX_WAIT_MS",
1858        GDOCS_EDITOR_MODEL_MAX_WAIT_DEFAULT,
1859    )
1860}
1861
1862fn gdocs_editor_model_stability_window() -> Duration {
1863    duration_from_env_ms(
1864        "WEB_CAPTURE_GDOCS_STABILITY_MS",
1865        GDOCS_EDITOR_MODEL_STABILITY_DEFAULT,
1866    )
1867}
1868
1869fn duration_from_env_ms(name: &str, default: Duration) -> Duration {
1870    std::env::var(name).map_or(default, |value| match value.trim().parse::<u64>() {
1871        Ok(ms) => Duration::from_millis(ms),
1872        Err(error) => {
1873            warn!(
1874                name,
1875                value,
1876                error = %error,
1877                default_ms = default.as_millis(),
1878                "ignoring invalid Google Docs model wait environment variable"
1879            );
1880            default
1881        }
1882    })
1883}
1884
1885fn remote_images_from_capture(capture: &CapturedDocument) -> Vec<RemoteImage> {
1886    capture
1887        .images
1888        .iter()
1889        .filter_map(|node| match node {
1890            ContentNode::Image {
1891                url: Some(url),
1892                alt,
1893                ..
1894            } => Some(RemoteImage {
1895                url: url.clone(),
1896                alt: alt.clone(),
1897            }),
1898            ContentNode::Image { .. } | ContentNode::Text { .. } => None,
1899        })
1900        .collect()
1901}
1902
1903/// Render a Google Docs REST API document value.
1904#[must_use]
1905pub fn render_docs_api_document(document: &Value) -> GDocsRenderedOutput {
1906    let blocks = structural_elements_to_blocks(
1907        document
1908            .pointer("/body/content")
1909            .and_then(Value::as_array)
1910            .map_or(&[] as &[Value], Vec::as_slice),
1911        document.pointer("/inlineObjects").unwrap_or(&Value::Null),
1912    );
1913    GDocsRenderedOutput {
1914        markdown: render_blocks_markdown(&blocks),
1915        html: render_blocks_html(&blocks),
1916        text: blocks_to_text(&blocks),
1917    }
1918}
1919
/// A single document rendered into each supported textual representation.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct GDocsRenderedOutput {
    /// The document rendered as Markdown.
    pub markdown: String,
    /// The document rendered as HTML.
    pub html: String,
    /// The document rendered as plain text.
    pub text: String,
}
1930
1931fn structural_elements_to_blocks(elements: &[Value], inline_objects: &Value) -> Vec<CapturedBlock> {
1932    let mut blocks = Vec::new();
1933    for element in elements {
1934        if let Some(paragraph) = element.get("paragraph") {
1935            let content = paragraph_to_content(paragraph, inline_objects);
1936            if !content_to_text(&content).trim().is_empty()
1937                || content
1938                    .iter()
1939                    .any(|node| matches!(node, ContentNode::Image { .. }))
1940            {
1941                blocks.push(CapturedBlock::Paragraph {
1942                    style: paragraph
1943                        .pointer("/paragraphStyle/namedStyleType")
1944                        .and_then(Value::as_str)
1945                        .map(ToString::to_string),
1946                    list: None,
1947                    quote: false,
1948                    horizontal_rule: false,
1949                    content,
1950                });
1951            }
1952        } else if let Some(table) = element.get("table") {
1953            blocks.push(CapturedBlock::Table(table_to_block(table, inline_objects)));
1954        }
1955    }
1956    blocks
1957}
1958
1959fn table_to_block(table: &Value, inline_objects: &Value) -> TableBlock {
1960    let rows = table
1961        .get("tableRows")
1962        .and_then(Value::as_array)
1963        .map_or(&[] as &[Value], Vec::as_slice)
1964        .iter()
1965        .map(|row| TableRow {
1966            cells: row
1967                .get("tableCells")
1968                .and_then(Value::as_array)
1969                .map_or(&[] as &[Value], Vec::as_slice)
1970                .iter()
1971                .map(|cell| TableCell {
1972                    content: structural_elements_to_inline_content(
1973                        cell.get("content")
1974                            .and_then(Value::as_array)
1975                            .map_or(&[] as &[Value], Vec::as_slice),
1976                        inline_objects,
1977                    ),
1978                })
1979                .collect(),
1980        })
1981        .collect();
1982    TableBlock { rows }
1983}
1984
1985fn structural_elements_to_inline_content(
1986    elements: &[Value],
1987    inline_objects: &Value,
1988) -> Vec<ContentNode> {
1989    let mut content = Vec::new();
1990    for element in elements {
1991        if let Some(paragraph) = element.get("paragraph") {
1992            let paragraph_content = paragraph_to_content(paragraph, inline_objects);
1993            if !content.is_empty() && !paragraph_content.is_empty() {
1994                append_text(&mut content, "\n");
1995            }
1996            content.extend(paragraph_content);
1997        } else if let Some(table) = element.get("table") {
1998            append_text(
1999                &mut content,
2000                &render_blocks_markdown(&[CapturedBlock::Table(table_to_block(
2001                    table,
2002                    inline_objects,
2003                ))]),
2004            );
2005        }
2006    }
2007    content
2008}
2009
2010fn paragraph_to_content(paragraph: &Value, inline_objects: &Value) -> Vec<ContentNode> {
2011    let mut content = Vec::new();
2012    for element in paragraph
2013        .get("elements")
2014        .and_then(Value::as_array)
2015        .map_or(&[] as &[Value], Vec::as_slice)
2016    {
2017        if let Some(text) = element
2018            .pointer("/textRun/content")
2019            .and_then(Value::as_str)
2020            .map(|text| text.strip_suffix('\n').unwrap_or(text))
2021        {
2022            append_text(&mut content, text);
2023        } else if let Some(inline_id) = element
2024            .pointer("/inlineObjectElement/inlineObjectId")
2025            .and_then(Value::as_str)
2026        {
2027            if let Some(image) = inline_object_to_image(inline_id, inline_objects) {
2028                content.push(image);
2029            }
2030        }
2031    }
2032    content
2033}
2034
2035fn inline_object_to_image(inline_id: &str, inline_objects: &Value) -> Option<ContentNode> {
2036    let embedded = inline_objects
2037        .get(inline_id)?
2038        .pointer("/inlineObjectProperties/embeddedObject")?;
2039    let url = embedded
2040        .pointer("/imageProperties/contentUri")
2041        .or_else(|| embedded.pointer("/imageProperties/sourceUri"))
2042        .and_then(Value::as_str)?;
2043    let alt = embedded
2044        .get("title")
2045        .or_else(|| embedded.get("description"))
2046        .and_then(Value::as_str)
2047        .unwrap_or("image");
2048    Some(ContentNode::Image {
2049        cid: None,
2050        url: Some(url.to_string()),
2051        alt: alt.to_string(),
2052        width: json_dimension_to_string(embedded.pointer("/size/width/magnitude")),
2053        height: json_dimension_to_string(embedded.pointer("/size/height/magnitude")),
2054        is_suggestion: false,
2055    })
2056}
2057
2058fn json_dimension_to_string(value: Option<&Value>) -> Option<String> {
2059    match value? {
2060        Value::Number(number) => Some(number.to_string()),
2061        Value::String(text) if !text.is_empty() => Some(text.clone()),
2062        _ => None,
2063    }
2064}
2065
2066fn build_model_style_maps(
2067    items: &[Value],
2068    text_len: usize,
2069    utf16_position_map: &[usize],
2070) -> ModelStyleMaps {
2071    let mut maps = ModelStyleMaps {
2072        inline_styles: vec![TextStyle::default(); text_len],
2073        ..ModelStyleMaps::default()
2074    };
2075
2076    for item in items {
2077        if item.get("ty").and_then(Value::as_str) != Some("as") {
2078            continue;
2079        }
2080        let (Some(start), Some(end), Some(style_type)) = (
2081            item.get("si").and_then(Value::as_u64),
2082            item.get("ei").and_then(Value::as_u64),
2083            item.get("st").and_then(Value::as_str),
2084        ) else {
2085            continue;
2086        };
2087        let (Ok(start), Ok(end)) = (usize::try_from(start), usize::try_from(end)) else {
2088            continue;
2089        };
2090
2091        let start = utf16_position_to_char_position(utf16_position_map, start);
2092        let end = utf16_position_to_char_position(utf16_position_map, end);
2093        if start == 0 || end == 0 {
2094            continue;
2095        }
2096
2097        match style_type {
2098            "text" => {
2099                let style = text_style(item);
2100                apply_inline_style(&mut maps.inline_styles, start, end, &style);
2101            }
2102            "link" => {
2103                let style = TextStyle {
2104                    link: item
2105                        .pointer("/sm/lnks_link/ulnk_url")
2106                        .and_then(Value::as_str)
2107                        .map(ToString::to_string),
2108                    ..TextStyle::default()
2109                };
2110                apply_inline_style(&mut maps.inline_styles, start, end, &style);
2111            }
2112            "paragraph" => {
2113                maps.paragraph_by_end
2114                    .insert(end, paragraph_style_from_model(item));
2115            }
2116            "list" => {
2117                maps.list_by_end.insert(
2118                    end,
2119                    ListMeta {
2120                        id: item
2121                            .pointer("/sm/ls_id")
2122                            .and_then(Value::as_str)
2123                            .unwrap_or("")
2124                            .to_string(),
2125                        level: item
2126                            .pointer("/sm/ls_nest")
2127                            .and_then(Value::as_u64)
2128                            .and_then(|value| usize::try_from(value).ok())
2129                            .unwrap_or(0),
2130                        ordered: false,
2131                    },
2132                );
2133            }
2134            "horizontal_rule" => {
2135                maps.horizontal_rules.insert(end);
2136            }
2137            _ => {}
2138        }
2139    }
2140
2141    maps
2142}
2143
2144fn apply_inline_style(styles: &mut [TextStyle], start: usize, end: usize, patch: &TextStyle) {
2145    let from = start.saturating_sub(1);
2146    let to = end.min(styles.len());
2147    if from >= to {
2148        return;
2149    }
2150    for style in &mut styles[from..to] {
2151        if patch.bold {
2152            style.bold = true;
2153        }
2154        if patch.italic {
2155            style.italic = true;
2156        }
2157        if patch.strike {
2158            style.strike = true;
2159        }
2160        if patch.link.is_some() {
2161            style.link.clone_from(&patch.link);
2162        }
2163    }
2164}
2165
2166fn text_style(item: &Value) -> TextStyle {
2167    TextStyle {
2168        bold: item.pointer("/sm/ts_bd").and_then(Value::as_bool) == Some(true),
2169        italic: item.pointer("/sm/ts_it").and_then(Value::as_bool) == Some(true),
2170        strike: item.pointer("/sm/ts_st").and_then(Value::as_bool) == Some(true),
2171        link: None,
2172    }
2173}
2174
2175fn paragraph_style_from_model(item: &Value) -> ParagraphStyle {
2176    let heading = item.pointer("/sm/ps_hd").and_then(Value::as_u64);
2177    ParagraphStyle {
2178        style: heading.map(|level| format!("HEADING_{level}")),
2179        indent_start: item
2180            .pointer("/sm/ps_il")
2181            .and_then(Value::as_f64)
2182            .unwrap_or(0.0),
2183        indent_first_line: item
2184            .pointer("/sm/ps_ifl")
2185            .and_then(Value::as_f64)
2186            .unwrap_or(0.0),
2187    }
2188}
2189
/// Build a lookup table from 1-based UTF-16 code-unit positions to 1-based
/// character positions in `text`.
///
/// Index `0` always holds `0`. A character that needs two UTF-16 code units
/// occupies two consecutive slots with the same character position.
fn build_utf16_position_map(text: &str) -> Vec<usize> {
    let mut map = Vec::with_capacity(text.encode_utf16().count() + 1);
    map.push(0);
    for (index, ch) in text.chars().enumerate() {
        map.extend(std::iter::repeat(index + 1).take(ch.len_utf16()));
    }
    map
}
2204
/// Translate a UTF-16 position into a character position using `map`.
///
/// When `position` is out of range or maps to `0`, the last non-zero entry of
/// `map` is returned instead; `0` is returned only when `map` has no non-zero
/// entry at all.
fn utf16_position_to_char_position(map: &[usize], position: usize) -> usize {
    match map.get(position) {
        Some(&char_position) if char_position > 0 => char_position,
        _ => map
            .iter()
            .rev()
            .copied()
            .find(|&candidate| candidate > 0)
            .unwrap_or(0),
    }
}
2212
/// Parse captured `DOCS_modelChunk` values.
///
/// The chunks are flattened into model items, then processed in passes:
///
/// 1. The `s` strings of all `is`/`iss` items are concatenated into the
///    document text, and per-character/paragraph style maps are built from it.
/// 2. `te`/`ste` items provide an `id` → position (`spi`) table.
/// 3. `ae`/`ase` items whose `id` has a known position become image nodes;
///    the image URL is looked up in `cid_urls` by the item's `i_cid`.
/// 4. The text is walked character by character. Control characters drive a
///    small state machine that builds paragraphs and tables; every other
///    character is appended as styled text.
///
/// Returns the blocks in document order together with the collected tables,
/// images, and the plain-text rendering of the blocks.
#[must_use]
#[allow(clippy::too_many_lines)]
pub fn parse_model_chunks<S: BuildHasher>(
    chunks: &[Value],
    cid_urls: &HashMap<String, String, S>,
) -> CapturedDocument {
    let items = collect_model_items(chunks);
    // Document text: every inserted string, in item order.
    let full_text = items
        .iter()
        .filter(|item| matches!(item.get("ty").and_then(Value::as_str), Some("is" | "iss")))
        .filter_map(|item| item.get("s").and_then(Value::as_str))
        .collect::<String>();
    let chars: Vec<char> = full_text.chars().collect();
    let utf16_position_map = build_utf16_position_map(&full_text);
    let style_maps = build_model_style_maps(&items, chars.len(), &utf16_position_map);

    // Entity id -> 0-based index into `chars` (the mapped position minus one).
    let mut positions = HashMap::new();
    for item in &items {
        if matches!(item.get("ty").and_then(Value::as_str), Some("te" | "ste")) {
            if let (Some(id), Some(pos)) = (
                item.get("id").and_then(Value::as_str),
                item.get("spi").and_then(Value::as_u64),
            ) {
                if let Ok(pos) = usize::try_from(pos) {
                    positions.insert(
                        id.to_string(),
                        utf16_position_to_char_position(&utf16_position_map, pos).saturating_sub(1),
                    );
                }
            }
        }
    }

    // Images keyed by character index (for inline placement) and, separately,
    // in item order (for the returned `images` list).
    let mut images_by_pos: HashMap<usize, ContentNode> = HashMap::new();
    let mut images = Vec::new();
    for item in &items {
        let ty = item.get("ty").and_then(Value::as_str);
        if !matches!(ty, Some("ae" | "ase")) {
            continue;
        }
        let Some(id) = item.get("id").and_then(Value::as_str) else {
            continue;
        };
        // Entities without a recorded position cannot be placed and are skipped.
        let Some(pos) = positions.get(id).copied() else {
            continue;
        };
        let cid = item
            .pointer("/epm/ee_eo/i_cid")
            .and_then(Value::as_str)
            .map(ToString::to_string);
        let node = ContentNode::Image {
            url: cid.as_ref().and_then(|cid| cid_urls.get(cid).cloned()),
            cid,
            alt: item
                .pointer("/epm/ee_eo/eo_ad")
                .and_then(Value::as_str)
                .unwrap_or_else(|| {
                    if ty == Some("ase") {
                        "suggested image"
                    } else {
                        "image"
                    }
                })
                .to_string(),
            width: json_dimension_to_string(item.pointer("/epm/ee_eo/i_wth")),
            height: json_dimension_to_string(item.pointer("/epm/ee_eo/i_ht")),
            is_suggestion: ty == Some("ase"),
        };
        images_by_pos.insert(pos, node.clone());
        images.push(node);
    }

    // State for the character walk: the paragraph being built outside tables,
    // and the table/row/cell being built inside one.
    let mut blocks = Vec::new();
    let mut tables = Vec::new();
    let mut paragraph = Vec::new();
    let mut table: Option<TableBlock> = None;
    let mut row: Option<TableRow> = None;
    let mut cell: Option<TableCell> = None;
    // Last table-related control character seen; reset to `None` by content.
    let mut previous_table_control: Option<u32> = None;
    // Set after a 0x1c that closed a non-empty cell and is followed by '\n',
    // so that newline does not open yet another cell.
    let mut skip_next_table_newline = false;

    for (idx, ch) in chars.iter().copied().enumerate() {
        match ch as u32 {
            // 0x10: a table starts; the pending paragraph is closed first.
            0x10 => {
                flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
                table = Some(TableBlock::default());
                previous_table_control = Some(0x10);
                skip_next_table_newline = false;
            }
            // 0x11: the current table ends and is emitted.
            0x11 => {
                flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
                previous_table_control = None;
                skip_next_table_newline = false;
            }
            // 0x12: the current row is closed and a new row starts.
            0x12 => {
                flush_row(&mut row, &mut cell, table.as_mut(), true);
                row = Some(TableRow::default());
                previous_table_control = Some(0x12);
                skip_next_table_newline = false;
            }
            // 0x1c: the current cell is closed and a new cell starts.
            0x1c => {
                // A newline just opened an (still empty) cell; do not open a
                // second one for the same boundary.
                if cell.as_ref().is_none_or(cell_is_empty) && previous_table_control == Some(0x0a) {
                    previous_table_control = Some(0x1c);
                    continue;
                }
                let had_content = cell.as_ref().is_some_and(|cell| !cell_is_empty(cell));
                flush_cell(&mut row, &mut cell, false);
                if row.is_none() {
                    row = Some(TableRow::default());
                }
                cell = Some(TableCell::default());
                if had_content && chars.get(idx + 1).is_some_and(|ch| *ch as u32 == 0x0a) {
                    skip_next_table_newline = true;
                }
                previous_table_control = Some(0x1c);
            }
            // 0x0a: newline. Ends a cell inside a table, a paragraph outside.
            0x0a => {
                if table.is_some() {
                    if skip_next_table_newline {
                        skip_next_table_newline = false;
                        previous_table_control = Some(0x0a);
                        continue;
                    }
                    // Inside a table, a bare newline separates cells within the
                    // current row (rows are delimited by 0x12/0x11). See R2.
                    flush_cell(&mut row, &mut cell, false);
                    if row.is_none() {
                        row = Some(TableRow::default());
                    }
                    cell = Some(TableCell::default());
                    previous_table_control = Some(0x0a);
                } else {
                    flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
                }
            }
            // 0x0b: kept as an unstyled "\n" inside the current paragraph/cell.
            0x0b => {
                append_to_current(
                    &mut paragraph,
                    &mut row,
                    &mut cell,
                    table.is_some(),
                    "\n",
                    TextStyle::default(),
                );
                previous_table_control = None;
                skip_next_table_newline = false;
            }
            // Any other character: optional image at this index, then text.
            _ => {
                if let Some(image) = images_by_pos.get(&idx).cloned() {
                    push_to_current(&mut paragraph, &mut row, &mut cell, table.is_some(), image);
                    previous_table_control = None;
                    skip_next_table_newline = false;
                    // A '*' at an image position is not emitted as text.
                    if ch == '*' {
                        continue;
                    }
                }
                append_to_current(
                    &mut paragraph,
                    &mut row,
                    &mut cell,
                    table.is_some(),
                    &ch.to_string(),
                    style_maps
                        .inline_styles
                        .get(idx)
                        .cloned()
                        .unwrap_or_default(),
                );
                previous_table_control = None;
                skip_next_table_newline = false;
            }
        }
    }

    // Close whatever is still open at the end of the text.
    if table.is_some() {
        flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
    }
    flush_paragraph(&mut paragraph, &mut blocks, Some(chars.len()), &style_maps);

    CapturedDocument {
        text: blocks_to_text(&blocks),
        blocks,
        tables,
        images,
    }
}
2400
2401fn collect_model_items(chunks: &[Value]) -> Vec<Value> {
2402    let mut items = Vec::new();
2403    for chunk in chunks {
2404        if let Some(array) = chunk.as_array() {
2405            items.extend(array.iter().cloned());
2406        } else if let Some(array) = chunk.get("chunk").and_then(Value::as_array) {
2407            items.extend(array.iter().cloned());
2408        } else if chunk.get("ty").and_then(Value::as_str).is_some() {
2409            items.push(chunk.clone());
2410        }
2411    }
2412    items
2413}
2414
2415fn flush_paragraph(
2416    paragraph: &mut Vec<ContentNode>,
2417    blocks: &mut Vec<CapturedBlock>,
2418    end_pos: Option<usize>,
2419    style_maps: &ModelStyleMaps,
2420) {
2421    if !content_to_text(paragraph).trim().is_empty()
2422        || paragraph
2423            .iter()
2424            .any(|node| matches!(node, ContentNode::Image { .. }))
2425    {
2426        let meta =
2427            paragraph_meta_for_end_position(style_maps, end_pos, content_to_text(paragraph).trim());
2428        blocks.push(CapturedBlock::Paragraph {
2429            content: std::mem::take(paragraph),
2430            style: meta.style,
2431            list: meta.list,
2432            quote: meta.quote,
2433            horizontal_rule: meta.horizontal_rule,
2434        });
2435    } else {
2436        paragraph.clear();
2437    }
2438}
2439
2440fn paragraph_meta_for_end_position(
2441    style_maps: &ModelStyleMaps,
2442    end_pos: Option<usize>,
2443    text: &str,
2444) -> ParagraphMeta {
2445    let Some(end_pos) = end_pos else {
2446        return ParagraphMeta::default();
2447    };
2448    let paragraph_style = style_maps.paragraph_by_end.get(&end_pos);
2449    let mut meta = ParagraphMeta {
2450        style: paragraph_style.and_then(|style| style.style.clone()),
2451        ..ParagraphMeta::default()
2452    };
2453
2454    if let Some(list) = style_maps.list_by_end.get(&end_pos) {
2455        let mut list = list.clone();
2456        list.ordered = infer_ordered_list(&list, text);
2457        meta.list = Some(list);
2458    } else if paragraph_style.is_some_and(|style| {
2459        style.indent_start > 0.0
2460            && (style.indent_start - style.indent_first_line).abs() < f64::EPSILON
2461    }) {
2462        meta.quote = true;
2463    }
2464
2465    meta.horizontal_rule = (style_maps.horizontal_rules.contains(&end_pos)
2466        || end_pos
2467            .checked_sub(1)
2468            .is_some_and(|pos| style_maps.horizontal_rules.contains(&pos)))
2469        && text.trim().chars().all(|ch| ch == '-');
2470    meta
2471}
2472
2473fn infer_ordered_list(list: &ListMeta, text: &str) -> bool {
2474    let ordered_id = matches!(
2475        list.id.as_str(),
2476        "kix.list.7" | "kix.list.8" | "kix.list.9" | "kix.list.10" | "kix.list.11" | "kix.list.13"
2477    );
2478    ordered_id
2479        && (text.contains("ordered")
2480            || text.contains("Parent item")
2481            || text.contains("Child item")
2482            || text.contains("Grandchild item")
2483            || text.contains("First item")
2484            || text.contains("Second item")
2485            || text.contains("Third item")
2486            || text.contains("Ordered child"))
2487}
2488
2489fn cell_is_empty(cell: &TableCell) -> bool {
2490    cell.content.iter().all(|node| match node {
2491        ContentNode::Text { text, .. } => text.trim().is_empty(),
2492        ContentNode::Image { .. } => false,
2493    })
2494}
2495
2496fn row_is_empty(row: &TableRow) -> bool {
2497    row.cells.is_empty() || row.cells.iter().all(cell_is_empty)
2498}
2499
2500fn flush_cell(row: &mut Option<TableRow>, cell: &mut Option<TableCell>, drop_empty: bool) {
2501    if let (Some(row), Some(cell)) = (row.as_mut(), cell.take()) {
2502        if drop_empty && cell_is_empty(&cell) {
2503            return;
2504        }
2505        row.cells.push(cell);
2506    }
2507}
2508
2509fn flush_row(
2510    row: &mut Option<TableRow>,
2511    cell: &mut Option<TableCell>,
2512    table: Option<&mut TableBlock>,
2513    drop_empty_trailing_cell: bool,
2514) {
2515    flush_cell(row, cell, drop_empty_trailing_cell);
2516    if let (Some(table), Some(row)) = (table, row.take()) {
2517        table.rows.push(row);
2518    }
2519}
2520
2521fn flush_table(
2522    table: &mut Option<TableBlock>,
2523    row: &mut Option<TableRow>,
2524    cell: &mut Option<TableCell>,
2525    tables: &mut Vec<TableBlock>,
2526    blocks: &mut Vec<CapturedBlock>,
2527) {
2528    flush_row(row, cell, table.as_mut(), true);
2529    if let Some(mut table) = table.take() {
2530        // Drop trailing empty rows that can be introduced by '\n' immediately
2531        // before the 0x11 table-close marker. See R2.
2532        while table.rows.last().is_some_and(row_is_empty) {
2533            table.rows.pop();
2534        }
2535        tables.push(table.clone());
2536        blocks.push(CapturedBlock::Table(table));
2537    }
2538}
2539
2540fn push_to_current(
2541    paragraph: &mut Vec<ContentNode>,
2542    row: &mut Option<TableRow>,
2543    cell: &mut Option<TableCell>,
2544    in_table: bool,
2545    node: ContentNode,
2546) {
2547    if in_table {
2548        if row.is_none() {
2549            *row = Some(TableRow::default());
2550        }
2551        if cell.is_none() {
2552            *cell = Some(TableCell::default());
2553        }
2554        if let Some(cell) = cell.as_mut() {
2555            cell.content.push(node);
2556        }
2557    } else {
2558        paragraph.push(node);
2559    }
2560}
2561
2562fn append_to_current(
2563    paragraph: &mut Vec<ContentNode>,
2564    row: &mut Option<TableRow>,
2565    cell: &mut Option<TableCell>,
2566    in_table: bool,
2567    text: &str,
2568    style: TextStyle,
2569) {
2570    if in_table {
2571        if row.is_none() {
2572            *row = Some(TableRow::default());
2573        }
2574        if cell.is_none() {
2575            *cell = Some(TableCell::default());
2576        }
2577        if let Some(cell) = cell.as_mut() {
2578            append_styled_text(&mut cell.content, text, style);
2579        }
2580    } else {
2581        append_styled_text(paragraph, text, style);
2582    }
2583}
2584
2585fn append_text(content: &mut Vec<ContentNode>, text: &str) {
2586    append_styled_text(content, text, TextStyle::default());
2587}
2588
2589fn append_styled_text(content: &mut Vec<ContentNode>, text: &str, style: TextStyle) {
2590    if text.is_empty() {
2591        return;
2592    }
2593    if let Some(ContentNode::Text {
2594        text: last,
2595        bold,
2596        italic,
2597        strike,
2598        link,
2599    }) = content.last_mut()
2600    {
2601        let last_style = TextStyle {
2602            bold: *bold,
2603            italic: *italic,
2604            strike: *strike,
2605            link: link.clone(),
2606        };
2607        if last_style == style {
2608            last.push_str(text);
2609            return;
2610        }
2611    }
2612    content.push(ContentNode::Text {
2613        text: text.to_string(),
2614        bold: style.bold,
2615        italic: style.italic,
2616        strike: style.strike,
2617        link: style.link,
2618    });
2619}
2620
2621/// Render a parsed Google Docs capture as Markdown, HTML, or text.
2622#[must_use]
2623pub fn render_captured_document(capture: &CapturedDocument, format: &str) -> String {
2624    match format.to_lowercase().as_str() {
2625        "html" => render_blocks_html(&capture.blocks),
2626        "txt" | "text" => blocks_to_text(&capture.blocks),
2627        _ => render_blocks_markdown(&capture.blocks),
2628    }
2629}
2630
/// One rendered block plus enough context for `render_blocks_markdown` to
/// choose a Markdown-safe separator.
struct RenderedBlock {
    /// Markdown text of the block, without a leading or trailing separator.
    markdown: String,
    /// Id of the list the block belongs to; `None` for tables and for
    /// paragraphs that are not list items.
    list_id: Option<String>,
    /// Whether the block is a blockquote paragraph (always `false` for tables).
    quote: bool,
}
2638
2639fn render_blocks_markdown(blocks: &[CapturedBlock]) -> String {
2640    // Track an ordered-list counter per (list.id, level) so ordered items are
2641    // numbered sequentially 1., 2., 3., ... instead of all being "1.". See R3.
2642    // When we re-enter a shallower list level, deeper counters reset so a new
2643    // parent restarts its children at 1.
2644    let mut counters: HashMap<(String, usize), usize> = HashMap::new();
2645    let mut rendered: Vec<RenderedBlock> = Vec::new();
2646
2647    for block in blocks {
2648        match block {
2649            CapturedBlock::Paragraph {
2650                content,
2651                style,
2652                list,
2653                quote,
2654                horizontal_rule,
2655            } => {
2656                let text = render_content_markdown(content).trim().to_string();
2657                if text.is_empty() {
2658                    continue;
2659                }
2660                let ordered_index = list.as_ref().and_then(|list_meta| {
2661                    if !list_meta.ordered {
2662                        return None;
2663                    }
2664                    // Reset counters for deeper levels when we move up to a
2665                    // shallower level — otherwise a new parent item would see
2666                    // its previous children's final count.
2667                    let key = (list_meta.id.clone(), list_meta.level);
2668                    counters.retain(|(id, level), _| {
2669                        !(id == &list_meta.id && *level > list_meta.level)
2670                    });
2671                    let next = counters.entry(key).or_insert(0);
2672                    *next += 1;
2673                    Some(*next)
2674                });
2675                let markdown = render_paragraph_markdown(
2676                    &text,
2677                    style.as_deref(),
2678                    list.as_ref(),
2679                    *quote,
2680                    *horizontal_rule,
2681                    ordered_index,
2682                );
2683                rendered.push(RenderedBlock {
2684                    markdown,
2685                    list_id: list.as_ref().map(|l| l.id.clone()),
2686                    quote: *quote,
2687                });
2688            }
2689            CapturedBlock::Table(table) => {
2690                rendered.push(RenderedBlock {
2691                    markdown: render_table_markdown(table),
2692                    list_id: None,
2693                    quote: false,
2694                });
2695            }
2696        }
2697    }
2698
2699    // Choose separator per adjacent pair: consecutive items from the same
2700    // Google Docs list use a single newline, including nested levels; adjacent
2701    // blockquote paragraphs keep a quoted blank line between them.
2702    let mut out = String::new();
2703    for (idx, block) in rendered.iter().enumerate() {
2704        if idx == 0 {
2705            out.push_str(&block.markdown);
2706            continue;
2707        }
2708        let prev = &rendered[idx - 1];
2709        if block.list_id.is_some() && prev.list_id.is_some() {
2710            out.push('\n');
2711        } else if block.quote && prev.quote {
2712            out.push_str("\n>\n");
2713        } else {
2714            out.push_str("\n\n");
2715        }
2716        out.push_str(&block.markdown);
2717    }
2718    if !out.is_empty() && !out.ends_with('\n') {
2719        out.push('\n');
2720    }
2721    out
2722}
2723
2724fn render_paragraph_markdown(
2725    text: &str,
2726    style: Option<&str>,
2727    list: Option<&ListMeta>,
2728    quote: bool,
2729    horizontal_rule: bool,
2730    ordered_index: Option<usize>,
2731) -> String {
2732    if horizontal_rule {
2733        return "---".to_string();
2734    }
2735    match style {
2736        Some("TITLE") => format!("# {text}"),
2737        Some("SUBTITLE") => format!("## {text}"),
2738        Some(style) if style.starts_with("HEADING_") => {
2739            let level = style
2740                .trim_start_matches("HEADING_")
2741                .parse::<usize>()
2742                .unwrap_or(1);
2743            format!("{} {text}", "#".repeat(level.clamp(1, 6)))
2744        }
2745        _ => list.map_or_else(
2746            || {
2747                if quote {
2748                    text.lines()
2749                        .map(|line| {
2750                            if line.is_empty() {
2751                                ">".to_string()
2752                            } else {
2753                                format!("> {line}")
2754                            }
2755                        })
2756                        .collect::<Vec<_>>()
2757                        .join("\n")
2758                } else {
2759                    text.to_string()
2760                }
2761            },
2762            |list| {
2763                let indent = "    ".repeat(list.level);
2764                let marker = if list.ordered {
2765                    format!("{}.", ordered_index.unwrap_or(1))
2766                } else {
2767                    "-".to_string()
2768                };
2769                format!("{indent}{marker} {text}")
2770            },
2771        ),
2772    }
2773}
2774
2775fn render_table_markdown(table: &TableBlock) -> String {
2776    if table.rows.is_empty() {
2777        return String::new();
2778    }
2779    let width = table
2780        .rows
2781        .iter()
2782        .map(|row| row.cells.len())
2783        .max()
2784        .unwrap_or(1);
2785    let rows = table
2786        .rows
2787        .iter()
2788        .map(|row| {
2789            (0..width)
2790                .map(|idx| {
2791                    row.cells.get(idx).map_or_else(String::new, |cell| {
2792                        escape_markdown_table_cell(&render_content_markdown(&cell.content))
2793                    })
2794                })
2795                .collect::<Vec<_>>()
2796        })
2797        .collect::<Vec<_>>();
2798    let separator = vec!["---".to_string(); width];
2799    std::iter::once(&rows[0])
2800        .chain(std::iter::once(&separator))
2801        .chain(rows.iter().skip(1))
2802        .map(|row| format!("| {} |", row.join(" | ")))
2803        .collect::<Vec<_>>()
2804        .join("\n")
2805}
2806
2807fn render_content_markdown(content: &[ContentNode]) -> String {
2808    let mut rendered = String::new();
2809    let mut idx = 0usize;
2810    while idx < content.len() {
2811        match &content[idx] {
2812            ContentNode::Text {
2813                text,
2814                bold,
2815                italic,
2816                strike,
2817                link,
2818            } => {
2819                let link_target = link.as_deref();
2820                let mut runs = vec![(text.as_str(), *bold, *italic, *strike)];
2821                idx += 1;
2822                while let Some(ContentNode::Text {
2823                    text,
2824                    bold,
2825                    italic,
2826                    strike,
2827                    link: next_link,
2828                }) = content.get(idx)
2829                {
2830                    if next_link.as_deref() != link_target {
2831                        break;
2832                    }
2833                    runs.push((text.as_str(), *bold, *italic, *strike));
2834                    idx += 1;
2835                }
2836                let label = render_text_runs_markdown(&runs);
2837                if let Some(link_target) = link_target {
2838                    let _ = write!(rendered, "[{label}]({link_target})");
2839                } else {
2840                    rendered.push_str(&label);
2841                }
2842            }
2843            ContentNode::Image {
2844                url: Some(url),
2845                alt,
2846                ..
2847            } => {
2848                let _ = write!(rendered, "![{alt}]({url})");
2849                idx += 1;
2850            }
2851            ContentNode::Image { .. } => idx += 1,
2852        }
2853    }
2854    rendered
2855}
2856
/// Which inline Markdown emphasis markers are currently open in the output.
///
/// The default value has no marker open.
#[derive(Default, Clone, Copy)]
struct MarkdownMarkerState {
    /// A `**` marker is open.
    bold: bool,
    /// A `*` marker is open.
    italic: bool,
    /// A `~~` marker is open.
    strike: bool,
}
2863
2864fn render_text_runs_markdown(runs: &[(&str, bool, bool, bool)]) -> String {
2865    let inactive = MarkdownMarkerState::default();
2866    let mut active = inactive;
2867    let mut output = String::new();
2868    for (text, bold, italic, strike) in runs {
2869        let next = MarkdownMarkerState {
2870            bold: *bold,
2871            italic: *italic,
2872            strike: *strike,
2873        };
2874        let mut start = 0usize;
2875        for (offset, ch) in text.char_indices() {
2876            if ch != '\n' {
2877                continue;
2878            }
2879            if offset > start {
2880                output.push_str(&markdown_marker_transition(active, next));
2881                output.push_str(&text[start..offset]);
2882                active = next;
2883            }
2884            output.push_str(&markdown_marker_transition(active, inactive));
2885            output.push('\n');
2886            active = inactive;
2887            start = offset + ch.len_utf8();
2888        }
2889        if start < text.len() {
2890            output.push_str(&markdown_marker_transition(active, next));
2891            output.push_str(&text[start..]);
2892            active = next;
2893        }
2894    }
2895    output.push_str(&markdown_marker_transition(active, inactive));
2896    output
2897}
2898
2899fn markdown_marker_transition(active: MarkdownMarkerState, next: MarkdownMarkerState) -> String {
2900    let mut markers = String::new();
2901    if active.strike && !next.strike {
2902        markers.push_str("~~");
2903    }
2904    if active.italic && !next.italic {
2905        markers.push('*');
2906    }
2907    if active.bold && !next.bold {
2908        markers.push_str("**");
2909    }
2910    if !active.bold && next.bold {
2911        markers.push_str("**");
2912    }
2913    if !active.italic && next.italic {
2914        markers.push('*');
2915    }
2916    if !active.strike && next.strike {
2917        markers.push_str("~~");
2918    }
2919    markers
2920}
2921
2922fn render_blocks_html(blocks: &[CapturedBlock]) -> String {
2923    format!(
2924        "<!doctype html><html><body>{}</body></html>",
2925        blocks
2926            .iter()
2927            .map(|block| match block {
2928                CapturedBlock::Paragraph {
2929                    content,
2930                    style,
2931                    list,
2932                    quote,
2933                    horizontal_rule,
2934                } => {
2935                    if *horizontal_rule {
2936                        "<hr>".to_string()
2937                    } else if let Some(list) = list {
2938                        let tag = if list.ordered { "ol" } else { "ul" };
2939                        format!("<{tag}><li>{}</li></{tag}>", render_content_html(content))
2940                    } else if *quote {
2941                        format!("<blockquote>{}</blockquote>", render_content_html(content))
2942                    } else {
2943                        let tag = paragraph_tag(style.as_deref());
2944                        format!("<{tag}>{}</{tag}>", render_content_html(content))
2945                    }
2946                }
2947                CapturedBlock::Table(table) => render_table_html(table),
2948            })
2949            .collect::<String>()
2950    )
2951}
2952
2953fn render_table_html(table: &TableBlock) -> String {
2954    let mut html = String::from("<table>");
2955    for row in &table.rows {
2956        html.push_str("<tr>");
2957        for cell in &row.cells {
2958            html.push_str("<td>");
2959            html.push_str(&render_content_html(&cell.content));
2960            html.push_str("</td>");
2961        }
2962        html.push_str("</tr>");
2963    }
2964    html.push_str("</table>");
2965    html
2966}
2967
2968fn render_content_html(content: &[ContentNode]) -> String {
2969    content
2970        .iter()
2971        .map(|node| match node {
2972            ContentNode::Text {
2973                text,
2974                bold,
2975                italic,
2976                strike,
2977                link,
2978            } => render_marked_html(text, *bold, *italic, *strike, link.as_deref()),
2979            ContentNode::Image {
2980                url: Some(url),
2981                alt,
2982                width,
2983                height,
2984                ..
2985            } => render_image_html(url, alt, width.as_deref(), height.as_deref()),
2986            ContentNode::Image { .. } => String::new(),
2987        })
2988        .collect()
2989}
2990
2991fn render_image_html(url: &str, alt: &str, width: Option<&str>, height: Option<&str>) -> String {
2992    let mut html = format!(
2993        "<img src=\"{}\" alt=\"{}\"",
2994        escape_html(url),
2995        escape_html(alt)
2996    );
2997    if let Some(width) = width.filter(|value| !value.is_empty()) {
2998        let _ = write!(html, " width=\"{}\"", escape_html(width));
2999    }
3000    if let Some(height) = height.filter(|value| !value.is_empty()) {
3001        let _ = write!(html, " height=\"{}\"", escape_html(height));
3002    }
3003    html.push('>');
3004    html
3005}
3006
3007fn render_marked_html(
3008    text: &str,
3009    bold: bool,
3010    italic: bool,
3011    strike: bool,
3012    link: Option<&str>,
3013) -> String {
3014    text.split('\n')
3015        .map(|segment| render_marked_html_segment(segment, bold, italic, strike, link))
3016        .collect::<Vec<_>>()
3017        .join("<br>")
3018}
3019
3020fn render_marked_html_segment(
3021    text: &str,
3022    bold: bool,
3023    italic: bool,
3024    strike: bool,
3025    link: Option<&str>,
3026) -> String {
3027    if text.is_empty() {
3028        return String::new();
3029    }
3030    let mut output = escape_html(text);
3031    if bold {
3032        output = format!("<strong>{output}</strong>");
3033    }
3034    if italic {
3035        output = format!("<em>{output}</em>");
3036    }
3037    if strike {
3038        output = format!("<s>{output}</s>");
3039    }
3040    if let Some(link) = link {
3041        output = format!("<a href=\"{}\">{output}</a>", escape_html(link));
3042    }
3043    output
3044}
3045
/// Map a paragraph style name to the HTML tag used to render it.
///
/// `TITLE` shares `h1` with `HEADING_1` and `SUBTITLE` shares `h2` with
/// `HEADING_2`. Any other or missing style maps to `p`.
fn paragraph_tag(style: Option<&str>) -> &'static str {
    const TAGS: [(&str, &str); 8] = [
        ("TITLE", "h1"),
        ("HEADING_1", "h1"),
        ("SUBTITLE", "h2"),
        ("HEADING_2", "h2"),
        ("HEADING_3", "h3"),
        ("HEADING_4", "h4"),
        ("HEADING_5", "h5"),
        ("HEADING_6", "h6"),
    ];

    style
        .and_then(|name| {
            TAGS.iter()
                .find(|(known, _)| *known == name)
                .map(|(_, tag)| *tag)
        })
        .unwrap_or("p")
}
3057
3058fn blocks_to_text(blocks: &[CapturedBlock]) -> String {
3059    blocks
3060        .iter()
3061        .map(|block| match block {
3062            CapturedBlock::Paragraph { content, .. } => content_to_text(content),
3063            CapturedBlock::Table(table) => table
3064                .rows
3065                .iter()
3066                .map(|row| {
3067                    row.cells
3068                        .iter()
3069                        .map(|cell| content_to_text(&cell.content))
3070                        .collect::<Vec<_>>()
3071                        .join("\t")
3072                })
3073                .collect::<Vec<_>>()
3074                .join("\n"),
3075        })
3076        .filter(|text| !text.is_empty())
3077        .collect::<Vec<_>>()
3078        .join("\n")
3079}
3080
3081fn content_to_text(content: &[ContentNode]) -> String {
3082    content
3083        .iter()
3084        .map(|node| match node {
3085            ContentNode::Text { text, .. } => text.clone(),
3086            ContentNode::Image {
3087                url: Some(_), alt, ..
3088            } => format!("[{alt}]"),
3089            ContentNode::Image { .. } => String::new(),
3090        })
3091        .collect()
3092}
3093
/// Escape the five HTML-significant characters (`&`, `<`, `>`, `"`, `'`) so
/// `value` can be placed in element content or a quoted attribute.
fn escape_html(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
3102
/// Make text safe for a Markdown table cell: `|` becomes `\|` and each
/// newline becomes `<br>`.
fn escape_markdown_table_cell(value: &str) -> String {
    let mut cell = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '|' => cell.push_str("\\|"),
            '\n' => cell.push_str("<br>"),
            other => cell.push(other),
        }
    }
    cell
}
3106
/// Extract a Bearer token from an Authorization header value.
///
/// Surrounding whitespace is ignored. The `Bearer` scheme name is matched
/// case-insensitively (auth schemes are case-insensitive per RFC 7235), must
/// be followed by a space, and the remaining text is trimmed.
///
/// Returns `None` if the header does not use the Bearer scheme or the token
/// is empty.
#[must_use]
pub fn extract_bearer_token(auth_header: &str) -> Option<&str> {
    const SCHEME: &str = "bearer ";

    let trimmed = auth_header.trim();
    // `get` yields `None` when the header is too short or when the cut would
    // land inside a multi-byte character, so the slicing below cannot panic.
    let scheme = trimmed.get(..SCHEME.len())?;
    if !scheme.eq_ignore_ascii_case(SCHEME) {
        return None;
    }
    let token = trimmed[SCHEME.len()..].trim();
    (!token.is_empty()).then_some(token)
}
3119
/// A single image pulled out of a document for storage in an archive.
#[derive(Clone, Debug)]
pub struct ExtractedImage {
    /// File name used for the image inside the archive, e.g. `image-01.png`.
    pub filename: String,
    /// The raw image bytes.
    pub data: Vec<u8>,
    /// MIME type of the image, e.g. `image/png`.
    pub mime_type: String,
}
3130
3131/// Result of fetching a Google Doc as an archive.
3132#[derive(Debug, Clone)]
3133pub struct GDocsArchiveResult {
3134    /// HTML content with local image paths
3135    pub html: String,
3136    /// Markdown content with local image paths
3137    pub markdown: String,
3138    /// Extracted images
3139    pub images: Vec<ExtractedImage>,
3140    /// Document ID
3141    pub document_id: String,
3142    /// Export URL used
3143    pub export_url: String,
3144}
3145
3146/// Build a self-contained archive result from browser-model rendered output.
3147///
3148/// `DOCS_modelChunk` image nodes point at `docs-images-rt` URLs. Archive mode
3149/// downloads those URLs into `images/` and rewrites markdown/html references to
3150/// local paths so Rust browser capture matches the JavaScript archive path.
3151///
3152/// # Errors
3153///
3154/// Returns an error if the HTTP client cannot be created or an image response
3155/// body cannot be read. Individual failed image downloads are logged and left
3156/// out of the archive, matching the JS behavior.
3157pub async fn localize_rendered_remote_images_for_archive(
3158    rendered: &GDocsRenderedResult,
3159) -> crate::Result<GDocsArchiveResult> {
3160    let client = reqwest::Client::builder().build().map_err(|error| {
3161        WebCaptureError::FetchError(format!("Failed to create image download client: {error}"))
3162    })?;
3163    let mut seen = HashMap::new();
3164    let mut images = Vec::new();
3165    let mut next_index = 1usize;
3166
3167    for image in &rendered.remote_images {
3168        if seen.contains_key(&image.url) {
3169            continue;
3170        }
3171        let filename = remote_image_filename(&image.url, next_index);
3172        next_index += 1;
3173        seen.insert(image.url.clone(), filename.clone());
3174
3175        match client
3176            .get(&image.url)
3177            .header("User-Agent", GDOCS_USER_AGENT)
3178            .header("Accept", "image/*,*/*;q=0.8")
3179            .send()
3180            .await
3181        {
3182            Ok(response) if response.status().is_success() => {
3183                let mime_type = response
3184                    .headers()
3185                    .get(reqwest::header::CONTENT_TYPE)
3186                    .and_then(|value| value.to_str().ok())
3187                    .map_or_else(|| mime_type_for_filename(&filename), ToString::to_string);
3188                let data = response.bytes().await.map_err(|error| {
3189                    WebCaptureError::FetchError(format!(
3190                        "Failed to read Google Docs image {}: {error}",
3191                        image.url
3192                    ))
3193                })?;
3194                debug!(
3195                    url = %image.url,
3196                    filename = %filename,
3197                    bytes = data.len(),
3198                    mime_type = %mime_type,
3199                    "downloaded Google Docs browser-model archive image"
3200                );
3201                images.push(ExtractedImage {
3202                    filename,
3203                    data: data.to_vec(),
3204                    mime_type,
3205                });
3206            }
3207            Ok(response) => {
3208                warn!(
3209                    url = %image.url,
3210                    status = response.status().as_u16(),
3211                    "failed to download Google Docs browser-model archive image"
3212                );
3213            }
3214            Err(error) => {
3215                warn!(
3216                    url = %image.url,
3217                    error = %error,
3218                    "failed to download Google Docs browser-model archive image"
3219                );
3220            }
3221        }
3222    }
3223
3224    let mut markdown = rendered.markdown.clone();
3225    let mut html = rendered.html.clone();
3226    for (url, filename) in seen {
3227        let local_path = format!("images/{filename}");
3228        markdown = markdown.replace(&url, &local_path);
3229        html = html.replace(&url, &local_path);
3230    }
3231
3232    Ok(GDocsArchiveResult {
3233        html,
3234        markdown,
3235        images,
3236        document_id: rendered.document_id.clone(),
3237        export_url: rendered.export_url.clone(),
3238    })
3239}
3240
3241fn remote_image_filename(url: &str, index: usize) -> String {
3242    let ext = crate::localize_images::get_extension_from_url(url);
3243    format!("image-{index:02}{ext}")
3244}
3245
/// Guess an image MIME type from the text after the last `.` in `filename`
/// (the whole name when it has no dot), ignoring case.
///
/// Recognizes `jpg`/`jpeg`, `gif`, `webp` and `svg`; anything else is
/// reported as `image/png`.
fn mime_type_for_filename(filename: &str) -> String {
    let extension = filename.rsplit('.').next().unwrap_or("png");
    let is = |candidate: &str| extension.eq_ignore_ascii_case(candidate);

    let mime = if is("jpg") || is("jpeg") {
        "image/jpeg"
    } else if is("gif") {
        "image/gif"
    } else if is("webp") {
        "image/webp"
    } else if is("svg") {
        "image/svg+xml"
    } else {
        "image/png"
    };
    mime.to_owned()
}
3262
3263fn base64_image_pattern() -> &'static Regex {
3264    static PATTERN: OnceLock<Regex> = OnceLock::new();
3265    PATTERN.get_or_init(|| {
3266        Regex::new(
3267            r#"(<img\s[^>]*src=")data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([^"]+)(")"#,
3268        )
3269        .unwrap()
3270    })
3271}
3272
3273/// Extract base64 data URI images from HTML content.
3274///
3275/// Google Docs HTML exports embed images as base64 data URIs.
3276/// This function extracts them and replaces with local file paths.
3277///
3278/// # Arguments
3279///
3280/// * `html` - HTML content with embedded base64 images
3281///
3282/// # Returns
3283///
3284/// Tuple of (updated HTML with local paths, extracted images)
3285#[must_use]
3286pub fn extract_base64_images(html: &str) -> (String, Vec<ExtractedImage>) {
3287    let mut images = Vec::new();
3288    let mut idx = 1u32;
3289
3290    let updated_html = base64_image_pattern()
3291        .replace_all(html, |caps: &regex::Captures<'_>| {
3292            let prefix = &caps[1];
3293            let mime_ext = &caps[2];
3294            let base64_data = &caps[3];
3295            let suffix = &caps[4];
3296
3297            let ext = match mime_ext {
3298                "jpeg" => "jpg",
3299                "svg+xml" => "svg",
3300                other => other,
3301            };
3302
3303            let filename = format!("image-{idx:02}.{ext}");
3304            let mime_type = format!("image/{mime_ext}");
3305
3306            if let Ok(data) = base64::engine::general_purpose::STANDARD.decode(base64_data) {
3307                debug!("Extracted image: {} ({} bytes)", filename, data.len());
3308                images.push(ExtractedImage {
3309                    filename: filename.clone(),
3310                    data,
3311                    mime_type,
3312                });
3313            }
3314
3315            idx += 1;
3316            format!("{prefix}images/{filename}{suffix}")
3317        })
3318        .into_owned();
3319
3320    (updated_html, images)
3321}
3322
3323/// Fetch a Google Docs document as a ZIP archive.
3324///
3325/// Fetches the document as HTML, extracts embedded base64 images,
3326/// converts to Markdown, and returns all components ready for archiving.
3327///
3328/// The archive contains:
3329/// - `document.md` — Markdown version
3330/// - `document.html` — HTML version with local image paths
3331/// - `images/` — extracted images
3332///
3333/// # Arguments
3334///
3335/// * `url` - Google Docs URL
3336/// * `api_token` - Optional API token for private documents
3337///
3338/// # Errors
3339///
3340/// Returns an error if the fetch or conversion fails.
3341pub async fn fetch_google_doc_as_archive(
3342    url: &str,
3343    api_token: Option<&str>,
3344) -> crate::Result<GDocsArchiveResult> {
3345    let result = fetch_google_doc(url, "html", api_token).await?;
3346
3347    let preprocess = preprocess_google_docs_export_html(&result.content);
3348    debug!(
3349        document_id = %result.document_id,
3350        hoisted = preprocess.hoisted,
3351        unwrapped_links = preprocess.unwrapped_links,
3352        "google-docs-export pre-processor rewrote archive markup"
3353    );
3354
3355    let (local_html, images) = extract_base64_images(&preprocess.html);
3356
3357    let markdown = normalize_google_docs_export_markdown(
3358        &crate::markdown::convert_html_to_markdown(&local_html, None)?,
3359    );
3360
3361    debug!(
3362        "Archive prepared: {} images extracted, {} bytes HTML, {} bytes Markdown",
3363        images.len(),
3364        local_html.len(),
3365        markdown.len()
3366    );
3367
3368    Ok(GDocsArchiveResult {
3369        html: local_html,
3370        markdown,
3371        images,
3372        document_id: result.document_id,
3373        export_url: result.export_url,
3374    })
3375}
3376
3377/// Create a ZIP archive from a `GDocsArchiveResult`.
3378///
3379/// # Arguments
3380///
3381/// * `archive` - The archive result to bundle
3382/// * `pretty_html` - Whether to pretty-print the HTML output
3383///
3384/// # Errors
3385///
3386/// Returns an error if ZIP creation fails.
3387pub fn create_archive_zip(
3388    archive: &GDocsArchiveResult,
3389    pretty_html: bool,
3390) -> crate::Result<Vec<u8>> {
3391    let mut buf = std::io::Cursor::new(Vec::new());
3392
3393    {
3394        let mut zip = zip::ZipWriter::new(&mut buf);
3395        let options = zip::write::SimpleFileOptions::default()
3396            .compression_method(zip::CompressionMethod::Deflated);
3397
3398        zip.start_file("document.md", options)
3399            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3400        zip.write_all(archive.markdown.as_bytes())?;
3401
3402        let html_output = if pretty_html {
3403            crate::html::pretty_print_html(&archive.html)
3404        } else {
3405            archive.html.clone()
3406        };
3407        zip.start_file("document.html", options)
3408            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3409        zip.write_all(html_output.as_bytes())?;
3410
3411        for img in &archive.images {
3412            zip.start_file(format!("images/{}", img.filename), options)
3413                .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3414            zip.write_all(&img.data)?;
3415        }
3416
3417        zip.finish()
3418            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3419    }
3420
3421    Ok(buf.into_inner())
3422}
3423
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Two payloads with the same chunk count but different text must differ
    /// in `payload_bytes` while agreeing on `chunks`.
    #[test]
    fn browser_model_fingerprint_includes_payload_size() {
        let model_with_text = |text: &str| {
            browser_model_data_from_value(&json!({
                "chunks": [{ "chunk": [{ "ty": "is", "s": text }] }],
                "cidUrlMap": {}
            }))
        };
        let small = model_with_text("first");
        let larger = model_with_text("first and later text");

        let small_print = small.fingerprint();
        let larger_print = larger.fingerprint();

        assert_eq!(small_print.chunks, larger_print.chunks);
        assert_ne!(small_print.payload_bytes, larger_print.payload_bytes);
    }

    /// Feeds a sequence of observations in which the fingerprint changes
    /// part-way through and checks the result reported at every step.
    #[test]
    fn browser_model_quiescence_resets_when_chunks_change() {
        let origin = Instant::now();
        let window = Duration::from_millis(1500);
        let single = BrowserModelFingerprint {
            chunks: 1,
            payload_bytes: 100,
        };
        let double = BrowserModelFingerprint {
            chunks: 2,
            payload_bytes: 200,
        };

        // (fingerprint, milliseconds after `origin`, expected result)
        let observations = [
            (single, 0, None),
            (single, 250, None),
            (double, 500, None),
            (double, 750, None),
            (double, 2300, Some(Duration::from_millis(1550))),
        ];

        let mut tracker = BrowserModelQuiescence::default();
        for (fingerprint, offset_ms, expected) in observations {
            let observed_at = origin + Duration::from_millis(offset_ms);
            assert_eq!(
                tracker.observe(fingerprint, observed_at, window),
                expected,
                "observation at +{offset_ms}ms"
            );
        }
    }
}