web_capture/
gdocs.rs

1//! Google Docs capture module.
2//!
3//! Supports API-based capture of Google Docs documents via the export URL pattern:
4//! `https://docs.google.com/document/d/{DOCUMENT_ID}/export?format={FORMAT}`
5//!
6//! # Supported Export Formats
7//!
8//! - `html` — HTML document (images as base64 data URIs)
9//! - `txt` — Plain text
10//! - `md` — Markdown (native Google Docs export)
11//! - `pdf` — PDF document
12//! - `docx` — Microsoft Word document
13//! - `epub` — EPUB ebook format
14//!
15//! # Example
16//!
17//! ```rust,no_run
18//! use web_capture::gdocs;
19//!
20//! #[tokio::main]
21//! async fn main() -> anyhow::Result<()> {
22//!     let url = "https://docs.google.com/document/d/abc123/edit";
23//!     if gdocs::is_google_docs_url(url) {
24//!         let result = gdocs::fetch_google_doc(url, "html", None).await?;
25//!         println!("Content length: {}", result.content.len());
26//!     }
27//!     Ok(())
28//! }
29//! ```
30
31use async_tungstenite::tokio::{connect_async, ConnectStream};
32use async_tungstenite::tungstenite::Message;
33use async_tungstenite::WebSocketStream;
34use base64::Engine;
35use futures::{SinkExt, StreamExt};
36use regex::Regex;
37use serde_json::Value;
38use std::collections::HashMap;
39use std::fmt::Write as _;
40use std::hash::BuildHasher;
41use std::io::Write;
42use std::process::Stdio;
43use std::sync::OnceLock;
44use std::time::{Duration, Instant};
45use tokio::io::{AsyncBufReadExt, BufReader};
46use tokio::process::{Child, Command};
47use tracing::{debug, info, warn};
48
49use crate::WebCaptureError;
50
/// Base URL shared by the public export endpoint and the editor URL; the
/// document ID and an `/export?...` or `/edit` suffix are appended to it.
const GDOCS_EXPORT_BASE: &str = "https://docs.google.com/document/d";
/// Base URL of the Google Docs REST API; the document ID is appended to it.
const GDOCS_API_BASE: &str = "https://docs.googleapis.com/v1/documents";
/// Desktop Chrome User-Agent string used for Google Docs requests.
const GDOCS_USER_AGENT: &str =
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
/// 30-second budget. NOTE(review): presumably the maximum wait for the editor
/// model to appear during browser capture — the use site is outside this view.
const GDOCS_EDITOR_MODEL_WAIT: Duration = Duration::from_secs(30);
/// 20-second budget. NOTE(review): presumably the maximum wait for the browser
/// process to start — the use site is outside this view.
const GDOCS_BROWSER_LAUNCH_TIMEOUT: Duration = Duration::from_secs(20);

/// WebSocket stream type produced by `async_tungstenite`'s tokio connector.
/// NOTE(review): the name suggests it carries Chrome DevTools Protocol
/// traffic — confirm against the code that opens the connection.
type CdpWebSocket = WebSocketStream<ConnectStream>;
59
/// JavaScript that records every value assigned to `window.DOCS_modelChunk`.
///
/// The script:
/// - creates `window.__captured_chunks`;
/// - defines `captureChunk`, which ignores falsy values, recurses into arrays,
///   and stores a JSON round-tripped copy of each value (falling back to the
///   value itself when the round trip throws);
/// - defines `wrapChunkArray`, which overrides `push` on an assigned array so
///   later pushes are captured too, and captures the items already present;
/// - installs a non-configurable accessor property `DOCS_modelChunk` on
///   `window` whose setter captures the value and whose getter returns the
///   most recently assigned (wrapped) value.
///
/// NOTE(review): presumably injected before the editor's own scripts run so
/// that no assignment is missed — the injection site is outside this view.
const GDOCS_MODEL_CAPTURE_INIT_SCRIPT: &str = r"
window.__captured_chunks = [];
const captureChunk = (value) => {
  if (!value) {
    return;
  }
  if (Array.isArray(value)) {
    for (const item of value) {
      captureChunk(item);
    }
    return;
  }
  try {
    window.__captured_chunks.push(JSON.parse(JSON.stringify(value)));
  } catch {
    window.__captured_chunks.push(value);
  }
};
const wrapChunkArray = (value) => {
  if (!Array.isArray(value) || value.__webCaptureDocsModelWrapped) {
    return value;
  }
  const originalPush = value.push;
  Object.defineProperty(value, '__webCaptureDocsModelWrapped', {
    value: true,
    enumerable: false,
  });
  Object.defineProperty(value, 'push', {
    value(...items) {
      for (const item of items) {
        captureChunk(item);
      }
      return originalPush.apply(this, items);
    },
    writable: true,
    configurable: true,
  });
  for (const item of value) {
    captureChunk(item);
  }
  return value;
};
Object.defineProperty(window, 'DOCS_modelChunk', {
  set(value) {
    captureChunk(value);
    window.__DOCS_modelChunk_latest = wrapChunkArray(value);
  },
  get() {
    return window.__DOCS_modelChunk_latest;
  },
  configurable: false,
});
";
113
/// JavaScript function expression that returns `{ chunks, cidUrlMap }`.
///
/// - `chunks` is a copy of `window.__captured_chunks`; when nothing was
///   captured but `window.DOCS_modelChunk` is set, that value is used as the
///   only entry.
/// - `cidUrlMap` is built by scanning every `<script>` element whose text
///   contains `docs-images-rt` for `"<id>": "https://docs.google.com/docs-images-rt/..."`
///   pairs (ids of at least 20 characters from `[A-Za-z0-9_-]`), un-escaping
///   `\u003d`, `\u0026` and `\/` in the URL.
const GDOCS_MODEL_EXTRACT_SCRIPT: &str = r#"() => {
  const chunks = [...(window.__captured_chunks || [])];
  if (
    window.DOCS_modelChunk &&
    chunks.length === 0 &&
    !chunks.includes(window.DOCS_modelChunk)
  ) {
    chunks.push(window.DOCS_modelChunk);
  }
  const cidUrlMap = {};
  const scripts = document.querySelectorAll('script');
  for (const script of scripts) {
    const text = script.textContent || '';
    if (!text.includes('docs-images-rt')) {
      continue;
    }
    const regex =
      /"([A-Za-z0-9_-]{20,})"\s*:\s*"(https:\/\/docs\.google\.com\/docs-images-rt\/[^"]+)"/g;
    let match;
    while ((match = regex.exec(text)) !== null) {
      cidUrlMap[match[1]] = match[2]
        .replace(/\\u003d/g, '=')
        .replace(/\\u0026/g, '&')
        .replace(/\\\//g, '/');
    }
  }
  return { chunks, cidUrlMap };
}"#;
142
143fn gdocs_url_pattern() -> &'static Regex {
144    static PATTERN: OnceLock<Regex> = OnceLock::new();
145    PATTERN.get_or_init(|| Regex::new(r"docs\.google\.com/document/d/([a-zA-Z0-9_-]+)").unwrap())
146}
147
/// Result of fetching a Google Docs document.
///
/// Returned by both [`fetch_google_doc`] and [`fetch_google_doc_as_markdown`].
#[derive(Debug, Clone)]
pub struct GDocsResult {
    /// The document content in the requested format.
    pub content: String,
    /// The export format used: the caller-supplied format string for
    /// [`fetch_google_doc`], or `"markdown"` for
    /// [`fetch_google_doc_as_markdown`].
    pub format: String,
    /// The document ID extracted from the input URL.
    pub document_id: String,
    /// The export URL that was fetched.
    pub export_url: String,
}
160
/// Google Docs capture backend selected from the CLI `--capture` flag.
///
/// See [`select_capture_method`] for how the flag value and the presence of
/// an API token map onto these variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GDocsCaptureMethod {
    /// Load `/edit` and extract `DOCS_modelChunk` (selected by `browser`).
    BrowserModel,
    /// Use the public `/export?format=...` endpoint (selected by `api`
    /// without a token).
    PublicExport,
    /// Use the authenticated `docs.googleapis.com` REST API (selected by
    /// `api` with a token).
    DocsApi,
}
171
/// Rendered Google Docs content from either Docs API or editor model data.
///
/// Carries the same document in three projections (Markdown, HTML, plain
/// text) together with its provenance.
#[derive(Debug, Clone)]
pub struct GDocsRenderedResult {
    /// Markdown output.
    pub markdown: String,
    /// HTML output.
    pub html: String,
    /// Plain text output.
    pub text: String,
    /// The extracted document ID.
    pub document_id: String,
    /// Source URL used for capture.
    pub export_url: String,
    /// Remote images exposed by the editor model, used for archive localization.
    pub remote_images: Vec<RemoteImage>,
}
188
/// Remote image reference extracted from browser-model capture.
///
/// Collected in [`GDocsRenderedResult::remote_images`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteImage {
    /// Original (remote) image URL.
    pub url: String,
    /// Image alt text.
    pub alt: String,
}
197
/// Raw data pulled out of the editor page for browser-model capture.
///
/// NOTE(review): presumably populated from the `{ chunks, cidUrlMap }` object
/// returned by `GDOCS_MODEL_EXTRACT_SCRIPT` — the conversion is outside this
/// view, confirm there.
#[derive(Debug, Clone)]
struct BrowserModelData {
    /// Captured model chunks as untyped JSON values.
    chunks: Vec<Value>,
    /// Map of string keys to URLs; presumably image content ID -> image URL.
    cid_urls: HashMap<String, String>,
}
203
/// Parsed Google Docs model/document capture.
///
/// `Default` yields an empty document (no blocks, tables, images or text).
#[derive(Debug, Clone, Default)]
pub struct CapturedDocument {
    /// Ordered document blocks.
    pub blocks: Vec<CapturedBlock>,
    /// Tables extracted from `blocks` for compatibility with tests and callers.
    pub tables: Vec<TableBlock>,
    /// Images extracted from model positions.
    pub images: Vec<ContentNode>,
    /// Plain text projection.
    pub text: String,
}
216
/// Captured block: one top-level element of a [`CapturedDocument`], either a
/// paragraph-like block or a table.
#[derive(Debug, Clone)]
pub enum CapturedBlock {
    /// Paragraph-like block.
    Paragraph {
        /// Paragraph content as inline nodes.
        content: Vec<ContentNode>,
        /// Optional Google Docs named style.
        style: Option<String>,
        /// Optional list metadata; `None` when the paragraph is not a list item.
        list: Option<ListMeta>,
        /// Whether paragraph is a blockquote.
        quote: bool,
        /// Whether paragraph is a horizontal rule.
        horizontal_rule: bool,
    },
    /// Table block.
    Table(TableBlock),
}
236
/// Captured table, stored as an ordered list of rows.
#[derive(Debug, Clone, Default)]
pub struct TableBlock {
    /// Table rows.
    pub rows: Vec<TableRow>,
}
243
/// Captured table row, stored as an ordered list of cells.
#[derive(Debug, Clone, Default)]
pub struct TableRow {
    /// Row cells.
    pub cells: Vec<TableCell>,
}
250
/// Captured table cell holding inline content.
#[derive(Debug, Clone, Default)]
pub struct TableCell {
    /// Cell content as inline nodes.
    pub content: Vec<ContentNode>,
}
257
/// Captured inline content node: either a styled text run or an image.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentNode {
    /// Text run.
    Text {
        /// Text content.
        text: String,
        /// Bold text style.
        bold: bool,
        /// Italic text style.
        italic: bool,
        /// Strikethrough text style.
        strike: bool,
        /// Optional hyperlink target.
        link: Option<String>,
    },
    /// Image placeholder.
    Image {
        /// Content ID from Google Docs model data.
        cid: Option<String>,
        /// Resolved image URL, when one is known.
        url: Option<String>,
        /// Alt text.
        alt: String,
        /// Editor-model image width, when available (kept as a string).
        width: Option<String>,
        /// Editor-model image height, when available (kept as a string).
        height: Option<String>,
        /// Whether this image came from a suggested edit.
        is_suggestion: bool,
    },
}
290
/// Inline text formatting; the fields mirror the style fields of
/// [`ContentNode::Text`]. `Default` is unstyled text without a link.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct TextStyle {
    /// Bold text style.
    bold: bool,
    /// Italic text style.
    italic: bool,
    /// Strikethrough text style.
    strike: bool,
    /// Optional hyperlink target.
    link: Option<String>,
}
298
/// Paragraph-level metadata; the fields mirror the non-content fields of
/// [`CapturedBlock::Paragraph`].
#[derive(Debug, Clone, Default)]
struct ParagraphMeta {
    /// Optional Google Docs named style.
    style: Option<String>,
    /// Optional list metadata.
    list: Option<ListMeta>,
    /// Whether the paragraph is a blockquote.
    quote: bool,
    /// Whether the paragraph is a horizontal rule.
    horizontal_rule: bool,
}
306
/// List membership metadata attached to a list-item paragraph
/// (see [`CapturedBlock::Paragraph`]).
#[derive(Debug, Clone)]
pub struct ListMeta {
    /// Google Docs list identifier.
    pub id: String,
    /// Nesting level, zero-based.
    pub level: usize,
    /// Whether Markdown should render this list item with an ordered marker.
    pub ordered: bool,
}
316
/// Paragraph style record stored in [`ModelStyleMaps::paragraph_by_end`].
///
/// NOTE(review): the unit of the indent values is not visible here —
/// presumably points, as elsewhere in this file; confirm at the parse site.
#[derive(Debug, Clone)]
struct ParagraphStyle {
    /// Optional named style.
    style: Option<String>,
    /// Start indent of the paragraph.
    indent_start: f64,
    /// First-line indent of the paragraph.
    indent_first_line: f64,
}
323
/// Style lookup tables built from the editor model.
///
/// NOTE(review): the `usize` keys and indices are presumably character
/// positions in the model text (the `*_by_end` names suggest paragraph end
/// offsets) — the code that fills and reads these maps is outside this view.
#[derive(Debug, Clone, Default)]
struct ModelStyleMaps {
    /// Inline text styles, indexed by position.
    inline_styles: Vec<TextStyle>,
    /// Paragraph styles keyed by a `usize` end position.
    paragraph_by_end: HashMap<usize, ParagraphStyle>,
    /// List metadata keyed by a `usize` end position.
    list_by_end: HashMap<usize, ListMeta>,
    /// Positions that hold a horizontal rule.
    horizontal_rules: std::collections::HashSet<usize>,
}
331
332/// Check if a URL is a Google Docs document URL.
333#[must_use]
334pub fn is_google_docs_url(url: &str) -> bool {
335    gdocs_url_pattern().is_match(url)
336}
337
338/// Extract the document ID from a Google Docs URL.
339///
340/// Returns `None` if the URL is not a valid Google Docs URL.
341#[must_use]
342pub fn extract_document_id(url: &str) -> Option<String> {
343    gdocs_url_pattern()
344        .captures(url)
345        .and_then(|caps| caps.get(1))
346        .map(|m| m.as_str().to_string())
347}
348
349/// Build a Google Docs export URL.
350///
351/// # Arguments
352///
353/// * `document_id` - The Google Docs document ID
354/// * `format` - Export format (html, txt, md, pdf, docx, epub)
355#[must_use]
356pub fn build_export_url(document_id: &str, format: &str) -> String {
357    let export_format = match format {
358        "html" | "txt" | "md" | "pdf" | "docx" | "epub" | "zip" => format,
359        _ => "html",
360    };
361    format!("{GDOCS_EXPORT_BASE}/{document_id}/export?format={export_format}")
362}
363
364/// Build a Google Docs editor URL.
365#[must_use]
366pub fn build_edit_url(document_id: &str) -> String {
367    format!("{GDOCS_EXPORT_BASE}/{document_id}/edit")
368}
369
370/// Build a Google Docs REST API URL.
371#[must_use]
372pub fn build_docs_api_url(document_id: &str) -> String {
373    format!("{GDOCS_API_BASE}/{document_id}")
374}
375
376/// Select a Google Docs capture backend from the CLI `--capture` value.
377///
378/// # Errors
379///
380/// Returns an error when `capture` is neither `browser` nor `api`.
381pub fn select_capture_method(
382    capture: &str,
383    api_token: Option<&str>,
384) -> crate::Result<GDocsCaptureMethod> {
385    match capture.to_lowercase().as_str() {
386        "browser" => Ok(GDocsCaptureMethod::BrowserModel),
387        "api" if api_token.is_some() => Ok(GDocsCaptureMethod::DocsApi),
388        "api" => Ok(GDocsCaptureMethod::PublicExport),
389        other => Err(WebCaptureError::InvalidUrl(format!(
390            "Unsupported Google Docs capture method \"{other}\". Use \"browser\" or \"api\"."
391        ))),
392    }
393}
394
395/// Fetch a Google Docs document via the export URL.
396///
397/// For public documents, pass `None` for `api_token`.
398/// For private documents, pass a Bearer token string.
399///
400/// # Arguments
401///
402/// * `url` - Google Docs URL (edit URL or any URL containing the document ID)
403/// * `format` - Export format (html, txt, md, pdf, docx, epub)
404/// * `api_token` - Optional API token for private documents
405///
406/// # Errors
407///
408/// Returns an error if the URL is not a valid Google Docs URL, or if the fetch fails.
409pub async fn fetch_google_doc(
410    url: &str,
411    format: &str,
412    api_token: Option<&str>,
413) -> crate::Result<GDocsResult> {
414    let document_id = extract_document_id(url).ok_or_else(|| {
415        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
416    })?;
417
418    let export_url = build_export_url(&document_id, format);
419    debug!(
420        document_id = %document_id,
421        format = %format,
422        export_url = %export_url,
423        has_api_token = api_token.is_some(),
424        "fetching Google Doc via public export"
425    );
426
427    let mut request = reqwest::Client::new()
428        .get(&export_url)
429        .header(
430            "User-Agent",
431            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
432        )
433        .header("Accept-Charset", "utf-8")
434        .header("Accept-Language", "en-US,en;q=0.9");
435
436    if let Some(token) = api_token {
437        request = request.header("Authorization", format!("Bearer {token}"));
438    }
439
440    let response = request
441        .send()
442        .await
443        .map_err(|e| WebCaptureError::FetchError(format!("Failed to fetch Google Doc: {e}")))?;
444    debug!(
445        document_id = %document_id,
446        status = response.status().as_u16(),
447        success = response.status().is_success(),
448        content_type = response
449            .headers()
450            .get(reqwest::header::CONTENT_TYPE)
451            .and_then(|value| value.to_str().ok())
452            .unwrap_or(""),
453        "received Google Docs public export response"
454    );
455
456    if !response.status().is_success() {
457        return Err(WebCaptureError::FetchError(format!(
458            "Failed to fetch Google Doc ({} {}): {}",
459            response.status().as_u16(),
460            response.status().canonical_reason().unwrap_or("Unknown"),
461            export_url
462        )));
463    }
464
465    let raw_content = response.text().await.map_err(|e| {
466        WebCaptureError::FetchError(format!("Failed to read Google Doc response: {e}"))
467    })?;
468    debug!(
469        document_id = %document_id,
470        bytes = raw_content.len(),
471        "read Google Docs public export body"
472    );
473
474    // Decode HTML entities to unicode for text-based formats
475    let content = match format {
476        "html" | "txt" | "md" => crate::html::decode_html_entities(&raw_content),
477        _ => raw_content,
478    };
479
480    Ok(GDocsResult {
481        content,
482        format: format.to_string(),
483        document_id,
484        export_url,
485    })
486}
487
488/// Fetch a Google Docs document and convert to Markdown.
489///
490/// Fetches the document as HTML, then converts to Markdown using the
491/// existing HTML-to-Markdown pipeline.
492///
493/// # Arguments
494///
495/// * `url` - Google Docs URL
496/// * `api_token` - Optional API token for private documents
497///
498/// # Errors
499///
500/// Returns an error if the fetch or conversion fails.
501pub async fn fetch_google_doc_as_markdown(
502    url: &str,
503    api_token: Option<&str>,
504) -> crate::Result<GDocsResult> {
505    let result = fetch_google_doc(url, "html", api_token).await?;
506
507    let preprocess = preprocess_google_docs_export_html(&result.content);
508    debug!(
509        document_id = %result.document_id,
510        hoisted = preprocess.hoisted,
511        unwrapped_links = preprocess.unwrapped_links,
512        "google-docs-export pre-processor rewrote markup"
513    );
514    let markdown = normalize_google_docs_export_markdown(
515        &crate::markdown::convert_html_to_markdown(&preprocess.html, Some(&result.export_url))?,
516    );
517    debug!(
518        document_id = %result.document_id,
519        bytes = markdown.len(),
520        "rendered Google Docs public export markdown"
521    );
522
523    Ok(GDocsResult {
524        content: markdown,
525        format: "markdown".to_string(),
526        document_id: result.document_id,
527        export_url: result.export_url,
528    })
529}
530
/// Result of running the Google Docs export HTML pre-processor
/// ([`preprocess_google_docs_export_html`]).
///
/// Exposes the rewritten HTML alongside counters that are useful for debug
/// logging (`gdocs.export.style-hoist`). See issue #92 R6.
#[derive(Debug, Clone)]
pub struct GDocsExportPreprocessResult {
    /// Rewritten HTML.
    pub html: String,
    /// Number of styled spans (inline `style` or CSS class) turned into
    /// `<strong>`/`<em>`/`<del>`.
    pub hoisted: usize,
    /// Number of `google.com/url?q=` redirect wrappers unwrapped.
    pub unwrapped_links: usize,
}
544
545/// Pre-process Google Docs export HTML so the generic `html2md` pipeline
546/// preserves inline formatting, heading numbering, and link targets.
547///
548/// Google Drive serves bold/italic/strikethrough as inline style spans and
549/// wraps every link through a `google.com/url?q=` redirect, both of which
550/// the generic converter would otherwise discard. This function rewrites
551/// those constructs into semantic HTML before conversion.
552#[must_use]
553pub fn preprocess_google_docs_export_html(html: &str) -> GDocsExportPreprocessResult {
554    let mut hoisted: usize = 0;
555    let mut unwrapped_links: usize = 0;
556    let class_styles = extract_css_class_styles(html);
557
558    let mut out = hoist_inline_style_spans(html, &mut hoisted);
559    out = hoist_class_style_spans(&out, &class_styles, &mut hoisted);
560    out = convert_class_indented_blockquotes(&out, &class_styles);
561    out = nest_google_docs_lists(&out, &class_styles);
562    out = strip_google_docs_heading_noise(&out);
563    out = strip_heading_inline_formatting(&out);
564    out = unwrap_google_redirect_links(&out, &mut unwrapped_links);
565    out = out.replace("&nbsp;", " ");
566    out = out.replace('\u{00A0}', " ");
567
568    GDocsExportPreprocessResult {
569        html: out,
570        hoisted,
571        unwrapped_links,
572    }
573}
574
575/// Normalize Markdown emitted from Google Docs public-export HTML converters.
576#[must_use]
577pub fn normalize_google_docs_export_markdown(markdown: &str) -> String {
578    let markdown = unescape_public_export_punctuation(markdown);
579    let markdown = convert_setext_headings(&markdown);
580    let markdown = normalize_atx_headings(&markdown);
581    let markdown = normalize_bullet_markers(&markdown);
582    let markdown = normalize_list_spacing(&markdown);
583    let markdown = normalize_blockquote_spacing(&markdown);
584    let markdown = normalize_markdown_tables(&markdown);
585    crate::markdown::clean_markdown(&markdown)
586}
587
588fn hoist_inline_style_spans(html: &str, hoisted: &mut usize) -> String {
589    let span_re = Regex::new(r#"(?is)<span\s+([^>]*style="([^"]*)"[^>]*)>(.*?)</span>"#)
590        .expect("valid regex");
591    span_re
592        .replace_all(html, |caps: &regex::Captures<'_>| {
593            let style = caps.get(2).map_or("", |m| m.as_str());
594            let inner = caps.get(3).map_or("", |m| m.as_str());
595            semantic_wrapped_html(inner, style).map_or_else(
596                || caps[0].to_string(),
597                |wrapped| {
598                    *hoisted += 1;
599                    wrapped
600                },
601            )
602        })
603        .into_owned()
604}
605
606fn hoist_class_style_spans(
607    html: &str,
608    class_styles: &HashMap<String, String>,
609    hoisted: &mut usize,
610) -> String {
611    let class_span_re = Regex::new(r#"(?is)<span\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</span>"#)
612        .expect("valid regex");
613    class_span_re
614        .replace_all(html, |caps: &regex::Captures<'_>| {
615            let class_attr = caps.get(2).map_or("", |m| m.as_str());
616            let inner = caps.get(3).map_or("", |m| m.as_str());
617            let style = combined_class_style(class_styles, class_attr);
618            semantic_wrapped_html(inner, &style).map_or_else(
619                || caps[0].to_string(),
620                |wrapped| {
621                    *hoisted += 1;
622                    wrapped
623                },
624            )
625        })
626        .into_owned()
627}
628
629fn convert_class_indented_blockquotes(
630    html: &str,
631    class_styles: &HashMap<String, String>,
632) -> String {
633    let class_paragraph_re =
634        Regex::new(r#"(?is)<p\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</p>"#).expect("valid regex");
635    class_paragraph_re
636        .replace_all(html, |caps: &regex::Captures<'_>| {
637            let class_attr = caps.get(2).map_or("", |m| m.as_str());
638            let inner = caps.get(3).map_or("", |m| m.as_str());
639            let style = combined_class_style(class_styles, class_attr);
640            if is_blockquote_style(&style) {
641                format!("<blockquote><p>{inner}</p></blockquote>")
642            } else {
643                caps[0].to_string()
644            }
645        })
646        .into_owned()
647}
648
/// One `<ul>`/`<ol>` element located in export HTML by
/// `nest_google_docs_lists`.
#[derive(Debug, Clone)]
struct ExportListBlock {
    /// Byte offset in the source HTML where the element starts.
    start: usize,
    /// Byte offset in the source HTML just past the element's closing tag.
    end: usize,
    /// Lower-cased tag name (`ul` or `ol`).
    tag: String,
    /// Markup between the opening and closing tags.
    inner: String,
}
656
/// One `<li>` extracted from an [`ExportListBlock`] by
/// `render_nested_list_group`.
#[derive(Debug, Clone)]
struct ExportListItem {
    /// Tag name of the list the item came from (`ul` or `ol`).
    tag: String,
    /// Zero-based nesting level as computed by `google_docs_list_item_level`.
    level: usize,
    /// Markup between `<li ...>` and `</li>`.
    inner: String,
}
663
664fn nest_google_docs_lists(html: &str, class_styles: &HashMap<String, String>) -> String {
665    let list_re = Regex::new(r"(?is)<(ul|ol)\b([^>]*)>(.*?)</(ul|ol)>").expect("valid regex");
666    let blocks: Vec<ExportListBlock> = list_re
667        .captures_iter(html)
668        .filter_map(|caps| {
669            let open_tag = caps.get(1)?.as_str().to_ascii_lowercase();
670            let close_tag = caps.get(4)?.as_str().to_ascii_lowercase();
671            if open_tag != close_tag {
672                return None;
673            }
674            let whole = caps.get(0)?;
675            Some(ExportListBlock {
676                start: whole.start(),
677                end: whole.end(),
678                tag: open_tag,
679                inner: caps.get(3).map_or("", |m| m.as_str()).to_string(),
680            })
681        })
682        .collect();
683
684    if blocks.len() < 2 {
685        return html.to_string();
686    }
687
688    let mut groups: Vec<Vec<ExportListBlock>> = Vec::new();
689    let mut current: Vec<ExportListBlock> = Vec::new();
690    for block in blocks {
691        if let Some(previous) = current.last() {
692            if !html[previous.end..block.start].trim().is_empty() {
693                if current.len() > 1 {
694                    groups.push(std::mem::take(&mut current));
695                } else {
696                    current.clear();
697                }
698            }
699        }
700        current.push(block);
701    }
702    if current.len() > 1 {
703        groups.push(current);
704    }
705
706    if groups.is_empty() {
707        return html.to_string();
708    }
709
710    let mut out = html.to_string();
711    for group in groups.iter().rev() {
712        let rendered = render_nested_list_group(group, class_styles);
713        let start = group.first().expect("non-empty group").start;
714        let end = group.last().expect("non-empty group").end;
715        out.replace_range(start..end, &rendered);
716    }
717    out
718}
719
/// Render a group of adjacent flat lists as one nested list.
///
/// Every `<li>` of every block is collected in document order, tagged with
/// the list type of its block and a nesting level derived from its
/// attributes. The items are then re-emitted with lists opened and closed as
/// the level rises and falls. `open_tags[level]` holds the tag of the list
/// currently open at that level, and `item_open[level]` records whether an
/// `<li>` at that level is still awaiting its `</li>`.
///
/// When the group contains no `<li>` at all, the blocks are re-emitted as
/// bare `<tag>inner</tag>` elements.
fn render_nested_list_group(
    group: &[ExportListBlock],
    class_styles: &HashMap<String, String>,
) -> String {
    let item_re = Regex::new(r"(?is)<li\b([^>]*)>(.*?)</li>").expect("valid regex");
    let items: Vec<ExportListItem> = group
        .iter()
        .flat_map(|block| {
            item_re.captures_iter(&block.inner).map(|caps| {
                let attrs = caps.get(1).map_or("", |m| m.as_str());
                let inner = caps.get(2).map_or("", |m| m.as_str()).to_string();
                ExportListItem {
                    tag: block.tag.clone(),
                    level: google_docs_list_item_level(attrs, class_styles),
                    inner,
                }
            })
        })
        .collect();

    // Nothing to nest: emit the blocks back without attributes.
    if items.is_empty() {
        let mut unchanged = String::new();
        for block in group {
            write!(unchanged, "<{}>{}</{}>", block.tag, block.inner, block.tag)
                .expect("write to String");
        }
        return unchanged;
    }

    let mut html = String::new();
    // Deepest level that currently has an open list; `None` before the first item.
    let mut current_level: Option<usize> = None;
    let mut open_tags: Vec<Option<String>> = Vec::new();
    let mut item_open: Vec<bool> = Vec::new();

    for item in items {
        let level = item.level;
        // Item is shallower than the current depth: close the deeper lists.
        while current_level.is_some_and(|current| current > level) {
            let current = current_level.expect("checked as Some");
            close_rendered_list(&mut html, &mut open_tags, &mut item_open, current);
            current_level = current.checked_sub(1);
        }

        // Item is deeper than the current depth: open one list per missing level.
        while current_level.is_none_or(|current| current < level) {
            let next_level = current_level.map_or(0, |current| current + 1);
            open_rendered_list(
                &mut html,
                &mut open_tags,
                &mut item_open,
                next_level,
                &item.tag,
            );
            current_level = Some(next_level);
        }

        ensure_list_stack(&mut open_tags, &mut item_open, level);
        // Same level but a different list type (ul vs ol): restart the list.
        if open_tags[level]
            .as_deref()
            .is_some_and(|tag| tag != item.tag)
        {
            close_rendered_list(&mut html, &mut open_tags, &mut item_open, level);
            open_rendered_list(&mut html, &mut open_tags, &mut item_open, level, &item.tag);
        } else if open_tags[level].is_none() {
            open_rendered_list(&mut html, &mut open_tags, &mut item_open, level, &item.tag);
        }

        // Close the previous sibling, then leave this item open so that a
        // deeper list can be nested inside it.
        close_rendered_item(&mut html, &mut item_open, level);
        html.push_str("<li>");
        html.push_str(&item.inner);
        item_open[level] = true;

        // Reset bookkeeping for every level below the current item.
        for deeper in (level + 1)..item_open.len() {
            item_open[deeper] = false;
            open_tags[deeper] = None;
        }
    }

    // Close whatever is still open, deepest level first.
    while let Some(current) = current_level {
        close_rendered_list(&mut html, &mut open_tags, &mut item_open, current);
        current_level = current.checked_sub(1);
    }

    html
}
803
/// Grow both per-level stacks so that index `level` exists in `open_tags`.
///
/// New slots are initialised to "no list open" / "no item open". The same
/// number of slots is appended to both vectors.
fn ensure_list_stack(open_tags: &mut Vec<Option<String>>, item_open: &mut Vec<bool>, level: usize) {
    let missing = (level + 1).saturating_sub(open_tags.len());
    open_tags.resize(open_tags.len() + missing, None);
    item_open.resize(item_open.len() + missing, false);
}
810
811fn open_rendered_list(
812    html: &mut String,
813    open_tags: &mut Vec<Option<String>>,
814    item_open: &mut Vec<bool>,
815    level: usize,
816    tag: &str,
817) {
818    ensure_list_stack(open_tags, item_open, level);
819    html.push('<');
820    html.push_str(tag);
821    html.push('>');
822    open_tags[level] = Some(tag.to_string());
823    item_open[level] = false;
824}
825
/// Emit `</li>` for the item open at `level`, if any, and mark it closed.
///
/// Levels beyond the end of `item_open` are treated as having no open item.
fn close_rendered_item(html: &mut String, item_open: &mut [bool], level: usize) {
    match item_open.get_mut(level) {
        Some(open) if *open => {
            html.push_str("</li>");
            *open = false;
        }
        _ => {}
    }
}
832
833fn close_rendered_list(
834    html: &mut String,
835    open_tags: &mut [Option<String>],
836    item_open: &mut [bool],
837    level: usize,
838) {
839    close_rendered_item(html, item_open, level);
840    if let Some(tag) = open_tags.get_mut(level).and_then(Option::take) {
841        html.push_str("</");
842        html.push_str(&tag);
843        html.push('>');
844    }
845}
846
847fn google_docs_list_item_level(attrs: &str, class_styles: &HashMap<String, String>) -> usize {
848    let style = combined_attr_style(class_styles, attrs);
849    let margin_left = css_point_value(&style, "margin-left");
850    if margin_left <= 0.0 {
851        return 0;
852    }
853    [54.0, 90.0, 126.0, 162.0, 198.0, 234.0, 270.0, 306.0]
854        .iter()
855        .take_while(|boundary| margin_left >= **boundary)
856        .count()
857}
858
859fn combined_attr_style(class_styles: &HashMap<String, String>, attrs: &str) -> String {
860    let mut styles = String::new();
861    if let Some(style) = attr_value(attrs, "style") {
862        styles.push_str(&style);
863    }
864    if let Some(class_attr) = attr_value(attrs, "class") {
865        styles.push_str(&combined_class_style(class_styles, &class_attr));
866    }
867    styles
868}
869
870fn attr_value(attrs: &str, name: &str) -> Option<String> {
871    let attr_re = Regex::new(&format!(
872        r#"(?is)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)')"#,
873        regex::escape(name)
874    ))
875    .expect("valid regex");
876    attr_re.captures(attrs).and_then(|caps| {
877        caps.get(1)
878            .or_else(|| caps.get(2))
879            .map(|value| value.as_str().to_string())
880    })
881}
882
883fn strip_google_docs_heading_noise(html: &str) -> String {
884    let empty_anchor_re = Regex::new(r#"(?is)<a\s+id="[^"]*"\s*>\s*</a>"#).expect("valid regex");
885    let numbering_re =
886        Regex::new(r"(?is)<span\b[^>]*>\s*\d+(?:\.\d+)*\.?\s*</span>").expect("valid regex");
887    let mut out = empty_anchor_re.replace_all(html, "").into_owned();
888    for level in 1..=6 {
889        let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
890            .expect("valid regex");
891        out = heading_re
892            .replace_all(&out, |caps: &regex::Captures<'_>| {
893                let open = &caps[1];
894                let inner = &caps[2];
895                let close = &caps[3];
896                let mut cleaned = empty_anchor_re.replace_all(inner, "").into_owned();
897                cleaned = numbering_re.replace_all(&cleaned, "").into_owned();
898                format!("{open}{cleaned}{close}")
899            })
900            .into_owned();
901    }
902    out
903}
904
905fn strip_heading_inline_formatting(html: &str) -> String {
906    let inline_marker_re = Regex::new(r"(?is)</?(?:strong|em|del)>").expect("valid regex");
907    let mut out = html.to_string();
908    for level in 1..=6 {
909        let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
910            .expect("valid regex");
911        out = heading_re
912            .replace_all(&out, |caps: &regex::Captures<'_>| {
913                let open = &caps[1];
914                let inner = &caps[2];
915                let close = &caps[3];
916                let cleaned = inline_marker_re.replace_all(inner, "");
917                format!("{open}{cleaned}{close}")
918            })
919            .into_owned();
920    }
921    out
922}
923
924fn unwrap_google_redirect_links(html: &str, unwrapped_links: &mut usize) -> String {
925    let redirect_re =
926        Regex::new(r#"(?i)href="https?://(?:www\.)?google\.com/url\?q=([^&"]+)[^"]*""#)
927            .expect("valid regex");
928    redirect_re
929        .replace_all(html, |caps: &regex::Captures<'_>| {
930            let encoded = caps.get(1).map_or("", |m| m.as_str());
931            let decoded = percent_decode_utf8_lossy(encoded);
932            *unwrapped_links += 1;
933            format!(r#"href="{decoded}""#)
934        })
935        .into_owned()
936}
937
938fn extract_css_class_styles(html: &str) -> HashMap<String, String> {
939    let mut class_styles: HashMap<String, String> = HashMap::new();
940    let style_re = Regex::new(r"(?is)<style\b[^>]*>(.*?)</style>").expect("valid regex");
941    let class_re = Regex::new(r"\.([A-Za-z0-9_-]+)\s*\{([^{}]*)\}").expect("valid regex");
942    for style_caps in style_re.captures_iter(html) {
943        let css = style_caps.get(1).map_or("", |m| m.as_str());
944        for class_caps in class_re.captures_iter(css) {
945            let class_name = class_caps.get(1).map_or("", |m| m.as_str());
946            let style = class_caps.get(2).map_or("", |m| m.as_str());
947            class_styles
948                .entry(class_name.to_string())
949                .and_modify(|existing| {
950                    existing.push(';');
951                    existing.push_str(style);
952                })
953                .or_insert_with(|| style.to_string());
954        }
955    }
956    class_styles
957}
958
/// Concatenates the known styles of every class named in `class_attr`.
///
/// Classes are looked up in `class_styles` in the order they appear in the
/// attribute; unknown classes are skipped. Each contributed style is preceded
/// by a `;`, so a non-empty result always starts with `;`.
fn combined_class_style(class_styles: &HashMap<String, String>, class_attr: &str) -> String {
    let mut combined = String::new();
    let known_styles = class_attr
        .split_whitespace()
        .filter_map(|name| class_styles.get(name));
    for style in known_styles {
        combined.push(';');
        combined.push_str(style);
    }
    combined
}
969
970fn semantic_wrapped_html(inner: &str, style: &str) -> Option<String> {
971    let bold = css_has_bold(style);
972    let italic = css_has_italic(style);
973    let strike = css_has_strike(style);
974    if !bold && !italic && !strike {
975        return None;
976    }
977    let mut wrapped = inner.to_string();
978    if strike {
979        wrapped = format!("<del>{wrapped}</del>");
980    }
981    if italic {
982        wrapped = format!("<em>{wrapped}</em>");
983    }
984    if bold {
985        wrapped = format!("<strong>{wrapped}</strong>");
986    }
987    Some(wrapped)
988}
989
990fn css_has_bold(style: &str) -> bool {
991    Regex::new(r"(?i)font-weight\s*:\s*(?:bold|[6-9]\d{2})")
992        .expect("valid regex")
993        .is_match(style)
994}
995
996fn css_has_italic(style: &str) -> bool {
997    Regex::new(r"(?i)font-style\s*:\s*italic")
998        .expect("valid regex")
999        .is_match(style)
1000}
1001
1002fn css_has_strike(style: &str) -> bool {
1003    Regex::new(r"(?i)text-decoration[^;]*\bline-through\b")
1004        .expect("valid regex")
1005        .is_match(style)
1006}
1007
1008fn is_blockquote_style(style: &str) -> bool {
1009    let margin_left = css_point_value(style, "margin-left");
1010    let margin_right = css_point_value(style, "margin-right");
1011    margin_left > 0.0 && margin_right > 0.0 && (margin_left - margin_right).abs() < 0.1
1012}
1013
1014fn css_point_value(style: &str, property: &str) -> f64 {
1015    let re = Regex::new(&format!(
1016        r"(?i){}\s*:\s*(-?\d+(?:\.\d+)?)pt",
1017        regex::escape(property)
1018    ))
1019    .expect("valid regex");
1020    re.captures(style)
1021        .and_then(|caps| caps.get(1))
1022        .and_then(|value| value.as_str().parse::<f64>().ok())
1023        .unwrap_or(0.0)
1024}
1025
/// Decodes `%XX` percent escapes in `input`.
///
/// A `%` that is not followed by two hexadecimal digits is copied through
/// verbatim. The decoded bytes are interpreted as UTF-8; invalid sequences are
/// replaced with U+FFFD.
fn percent_decode_utf8_lossy(input: &str) -> String {
    /// Value of an ASCII hexadecimal digit, or `None` for any other byte.
    fn hex_value(byte: u8) -> Option<u8> {
        char::from(byte)
            .to_digit(16)
            .and_then(|digit| u8::try_from(digit).ok())
    }

    let mut rest = input.as_bytes();
    let mut decoded = Vec::with_capacity(rest.len());
    while let Some((&first, tail)) = rest.split_first() {
        if let (b'%', [hi, lo, ..]) = (first, tail) {
            if let (Some(hi), Some(lo)) = (hex_value(*hi), hex_value(*lo)) {
                decoded.push((hi << 4) | lo);
                // Skip the two hex digits that were just consumed.
                rest = &tail[2..];
                continue;
            }
        }
        decoded.push(first);
        rest = tail;
    }
    String::from_utf8_lossy(&decoded).into_owned()
}
1049
/// Removes the backslash from the escapes `\.`, `\!`, `\(`, `\)`, `\[` and
/// `\]`; every other character, including other backslashes, is kept.
fn unescape_public_export_punctuation(markdown: &str) -> String {
    const ESCAPABLE: [char; 6] = ['.', '!', '(', ')', '[', ']'];

    let mut unescaped = String::with_capacity(markdown.len());
    let mut chars = markdown.chars().peekable();
    while let Some(ch) = chars.next() {
        // Drop only the backslash; the punctuation that follows is emitted by
        // the next iteration like any ordinary character.
        let escapes_punctuation =
            ch == '\\' && chars.peek().is_some_and(|next| ESCAPABLE.contains(next));
        if !escapes_punctuation {
            unescaped.push(ch);
        }
    }
    unescaped
}
1059
/// Converts setext-style headings (a title line underlined with `=` or `-`)
/// into ATX headings (`# Title` / `## Title`).
///
/// A line is only treated as a title when it contains non-whitespace text.
/// Without that check, a run of `-` or `=` after a blank line (for example a
/// `-----` thematic break) would be rewritten into an empty heading and the
/// blank line would be lost. Lines are re-joined with `\n`, so a trailing
/// newline in the input is not preserved.
fn convert_setext_headings(markdown: &str) -> String {
    let lines: Vec<&str> = markdown.lines().collect();
    let mut out = Vec::with_capacity(lines.len());
    let mut index = 0;
    while index < lines.len() {
        let title = lines[index].trim();
        let underline = lines.get(index + 1).map(|next| next.trim());
        let prefix = match underline {
            // An underline needs actual title text above it.
            _ if title.is_empty() => None,
            Some(underline) if is_setext_underline(underline, '=') => Some("#"),
            Some(underline) if is_setext_underline(underline, '-') => Some("##"),
            _ => None,
        };
        if let Some(prefix) = prefix {
            out.push(format!("{prefix} {title}"));
            // Consume both the title and its underline.
            index += 2;
        } else {
            out.push(lines[index].to_string());
            index += 1;
        }
    }
    out.join("\n")
}

/// Reports whether `line` is a setext underline: at least five characters,
/// all of them equal to `marker`.
fn is_setext_underline(line: &str, marker: char) -> bool {
    line.len() >= 5 && line.chars().all(|ch| ch == marker)
}
1087
1088fn normalize_atx_headings(markdown: &str) -> String {
1089    let heading_re = Regex::new(r"^(#{1,6})\s+(.+?)\s*$").expect("valid regex");
1090    let closing_re = closing_atx_heading_re();
1091    markdown
1092        .lines()
1093        .map(|line| {
1094            let Some(caps) = heading_re.captures(line) else {
1095                return line.to_string();
1096            };
1097            let hashes = caps.get(1).map_or("", |m| m.as_str());
1098            let mut text = caps.get(2).map_or("", |m| m.as_str()).trim().to_string();
1099            text = closing_re.replace(&text, "").trim().to_string();
1100            text = strip_wrapping_markdown_emphasis(&text);
1101            format!("{hashes} {text}")
1102        })
1103        .collect::<Vec<_>>()
1104        .join("\n")
1105}
1106
/// Strips one layer of `***`, `**` or `*` that wraps the whole of `text`.
///
/// The text is trimmed first. The longest marker that both starts and ends
/// the text, with at least one character in between, is removed and the
/// remainder trimmed again. When no marker applies, the trimmed text is
/// returned as is.
fn strip_wrapping_markdown_emphasis(text: &str) -> String {
    let candidate = text.trim();
    ["***", "**", "*"]
        .iter()
        .copied()
        .find_map(|marker| {
            candidate
                .strip_prefix(marker)?
                .strip_suffix(marker)
                // A marker pair with nothing between it is not emphasis.
                .filter(|inner| !inner.is_empty())
        })
        .map_or(candidate, str::trim)
        .to_string()
}
1121
1122fn normalize_bullet_markers(markdown: &str) -> String {
1123    let bullet_re = asterisk_bullet_re();
1124    markdown
1125        .lines()
1126        .map(|line| bullet_re.replace(line, "$1- ").into_owned())
1127        .collect::<Vec<_>>()
1128        .join("\n")
1129}
1130
1131fn normalize_list_spacing(markdown: &str) -> String {
1132    let lines: Vec<&str> = markdown.lines().collect();
1133    let mut out = Vec::with_capacity(lines.len());
1134
1135    for (index, line) in lines.iter().enumerate() {
1136        if line.trim().is_empty()
1137            && previous_non_empty_line(&lines, index).is_some_and(is_markdown_list_item)
1138            && next_non_empty_line(&lines, index).is_some_and(is_markdown_list_item)
1139        {
1140            continue;
1141        }
1142        out.push((*line).to_string());
1143    }
1144
1145    out.join("\n")
1146}
1147
/// Returns the closest line before `index` that is not blank (empty or
/// whitespace only), or `None` when every earlier line is blank.
fn previous_non_empty_line<'a>(lines: &'a [&str], index: usize) -> Option<&'a str> {
    let earlier = &lines[..index];
    earlier
        .iter()
        .rfind(|line| !line.trim().is_empty())
        .copied()
}
1155
/// Returns the closest line after `index` that is not blank (empty or
/// whitespace only), or `None` when every later line is blank.
fn next_non_empty_line<'a>(lines: &'a [&str], index: usize) -> Option<&'a str> {
    for candidate in &lines[index + 1..] {
        if !candidate.trim().is_empty() {
            return Some(*candidate);
        }
    }
    None
}
1162
1163fn is_markdown_list_item(line: &str) -> bool {
1164    markdown_list_item_re().is_match(line)
1165}
1166
/// Normalises blank lines in and around `> ` block quotes.
///
/// * Blank lines and bare `>` lines inside a quote are collapsed: if another
///   `> ` line follows, a single `>` line is emitted between them.
/// * Bare `>` lines outside a quote are dropped.
/// * The first non-quote line after a quote is preceded by exactly one blank
///   line.
///
/// Every emitted line is terminated with `\n`.
fn normalize_blockquote_spacing(markdown: &str) -> String {
    let mut normalized = String::with_capacity(markdown.len());
    let mut inside_quote = false;
    // Set when a gap was seen inside a quote and not yet written out.
    let mut gap_pending = false;

    for line in markdown.lines() {
        let trimmed = line.trim();
        let is_blank = trimmed.is_empty();

        // Bare markers and in-quote blank lines are never copied verbatim;
        // inside a quote they are remembered as a pending gap.
        if trimmed == ">" || (is_blank && inside_quote) {
            gap_pending |= inside_quote;
            continue;
        }

        if line.starts_with("> ") {
            if gap_pending {
                normalized.push_str(">\n");
                gap_pending = false;
            }
            inside_quote = true;
        } else {
            // Leaving the quote: separate it from the following text.
            if inside_quote && !is_blank {
                normalized.push('\n');
            }
            gap_pending = false;
            inside_quote = false;
        }
        normalized.push_str(line);
        normalized.push('\n');
    }

    normalized
}
1207
1208fn normalize_markdown_tables(markdown: &str) -> String {
1209    let lines: Vec<&str> = markdown.lines().collect();
1210    let mut out = Vec::with_capacity(lines.len());
1211    let mut index = 0;
1212
1213    while index < lines.len() {
1214        if !is_markdown_table_line(lines[index]) {
1215            out.push(lines[index].to_string());
1216            index += 1;
1217            continue;
1218        }
1219
1220        let start = index;
1221        while index < lines.len() && is_markdown_table_line(lines[index]) {
1222            index += 1;
1223        }
1224        let block = &lines[start..index];
1225        if block.len() >= 2 && is_markdown_separator_line(block[1]) {
1226            out.extend(normalize_markdown_table_block(block));
1227        } else {
1228            out.extend(block.iter().map(|line| (*line).to_string()));
1229        }
1230    }
1231
1232    out.join("\n")
1233}
1234
/// Reports whether `line`, ignoring surrounding whitespace, starts and ends
/// with `|` and contains at least two `|` characters.
fn is_markdown_table_line(line: &str) -> bool {
    let row = line.trim();
    if !(row.starts_with('|') && row.ends_with('|')) {
        return false;
    }
    // A lone "|" passes the check above but is not a table row.
    row.bytes().filter(|byte| *byte == b'|').count() >= 2
}
1239
1240fn is_markdown_separator_line(line: &str) -> bool {
1241    split_markdown_table_cells(line)
1242        .iter()
1243        .all(|cell| markdown_table_separator_cell_re().is_match(cell))
1244}
1245
1246fn normalize_markdown_table_block(lines: &[&str]) -> Vec<String> {
1247    lines
1248        .iter()
1249        .enumerate()
1250        .map(|(index, line)| {
1251            let cells = split_markdown_table_cells(line);
1252            if index == 1 {
1253                let separators = vec!["---".to_string(); cells.len()];
1254                render_markdown_table_row(&separators)
1255            } else {
1256                render_markdown_table_row(&cells)
1257            }
1258        })
1259        .collect()
1260}
1261
/// Splits a table row into its trimmed cell texts.
///
/// Surrounding whitespace and all leading and trailing `|` characters are
/// removed before the remainder is split on `|`.
fn split_markdown_table_cells(line: &str) -> Vec<String> {
    let inner = line.trim().trim_matches('|');
    let mut cells = Vec::new();
    for raw_cell in inner.split('|') {
        cells.push(raw_cell.trim().to_string());
    }
    cells
}
1269
/// Renders `cells` as one table row: `| cell | cell |`, with a single space of
/// padding around every cell.
fn render_markdown_table_row(cells: &[String]) -> String {
    let mut row = String::from("| ");
    for (position, cell) in cells.iter().enumerate() {
        if position > 0 {
            row.push_str(" | ");
        }
        row.push_str(cell);
    }
    row.push_str(" |");
    row
}
1273
1274fn closing_atx_heading_re() -> &'static Regex {
1275    static RE: OnceLock<Regex> = OnceLock::new();
1276    RE.get_or_init(|| Regex::new(r"\s+#{1,6}$").expect("valid regex"))
1277}
1278
1279fn asterisk_bullet_re() -> &'static Regex {
1280    static RE: OnceLock<Regex> = OnceLock::new();
1281    RE.get_or_init(|| Regex::new(r"^(\s*)\* ").expect("valid regex"))
1282}
1283
1284fn markdown_list_item_re() -> &'static Regex {
1285    static RE: OnceLock<Regex> = OnceLock::new();
1286    RE.get_or_init(|| Regex::new(r"^\s*(?:[-+*]|\d+\.)\s+").expect("valid regex"))
1287}
1288
1289fn markdown_table_separator_cell_re() -> &'static Regex {
1290    static RE: OnceLock<Regex> = OnceLock::new();
1291    RE.get_or_init(|| Regex::new(r"^:?-{3,}:?$").expect("valid regex"))
1292}
1293
1294/// Fetch and render a Google Docs document via the authenticated REST API.
1295///
1296/// # Errors
1297///
1298/// Returns an error when the URL is invalid, no token is provided, or the API request fails.
1299pub async fn fetch_google_doc_from_docs_api(
1300    url: &str,
1301    api_token: &str,
1302) -> crate::Result<GDocsRenderedResult> {
1303    let document_id = extract_document_id(url).ok_or_else(|| {
1304        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
1305    })?;
1306    let api_url = build_docs_api_url(&document_id);
1307    debug!(
1308        document_id = %document_id,
1309        api_url = %api_url,
1310        "fetching Google Doc via Docs API"
1311    );
1312
1313    let response = reqwest::Client::new()
1314        .get(&api_url)
1315        .header("Authorization", format!("Bearer {api_token}"))
1316        .header("Accept", "application/json")
1317        .send()
1318        .await
1319        .map_err(|e| {
1320            WebCaptureError::FetchError(format!("Failed to fetch Google Doc via Docs API: {e}"))
1321        })?;
1322    debug!(
1323        document_id = %document_id,
1324        status = response.status().as_u16(),
1325        success = response.status().is_success(),
1326        content_type = response
1327            .headers()
1328            .get(reqwest::header::CONTENT_TYPE)
1329            .and_then(|value| value.to_str().ok())
1330            .unwrap_or(""),
1331        "received Google Docs API response"
1332    );
1333
1334    if !response.status().is_success() {
1335        return Err(WebCaptureError::FetchError(format!(
1336            "Failed to fetch Google Doc via Docs API ({} {}): {}",
1337            response.status().as_u16(),
1338            response.status().canonical_reason().unwrap_or("Unknown"),
1339            api_url
1340        )));
1341    }
1342
1343    let body = response.text().await.map_err(|e| {
1344        WebCaptureError::FetchError(format!("Failed to read Google Docs API response: {e}"))
1345    })?;
1346    let document = serde_json::from_str::<Value>(&body).map_err(|e| {
1347        WebCaptureError::ParseError(format!("Failed to parse Google Docs API response: {e}"))
1348    })?;
1349    let rendered = render_docs_api_document(&document);
1350    debug!(
1351        document_id = %document_id,
1352        title = document.get("title").and_then(|value| value.as_str()).unwrap_or(""),
1353        markdown_bytes = rendered.markdown.len(),
1354        html_bytes = rendered.html.len(),
1355        text_bytes = rendered.text.len(),
1356        "rendered Google Docs API document"
1357    );
1358
1359    Ok(GDocsRenderedResult {
1360        markdown: rendered.markdown,
1361        html: rendered.html,
1362        text: rendered.text,
1363        document_id,
1364        export_url: api_url,
1365        remote_images: Vec::new(),
1366    })
1367}
1368
1369/// Fetch and render the model data embedded in the Google Docs `/edit` route.
1370///
1371/// # Errors
1372///
1373/// Returns an error when the URL is invalid, the fetch fails, or no model chunks are present.
1374pub async fn fetch_google_doc_from_model(
1375    url: &str,
1376    api_token: Option<&str>,
1377) -> crate::Result<GDocsRenderedResult> {
1378    if api_token.is_some() {
1379        return Err(WebCaptureError::BrowserError(
1380            "Rust browser-model Google Docs capture cannot inject API tokens; use --capture api for authenticated Docs API capture".to_string(),
1381        ));
1382    }
1383    let document_id = extract_document_id(url).ok_or_else(|| {
1384        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
1385    })?;
1386    let edit_url = build_edit_url(&document_id);
1387    debug!(
1388        document_id = %document_id,
1389        edit_url = %edit_url,
1390        "capturing Google Doc editor model with a real browser"
1391    );
1392    let model_data = fetch_google_doc_editor_model_with_cdp(&edit_url, &document_id).await?;
1393    let chunks = model_data.chunks;
1394    debug!(
1395        document_id = %document_id,
1396        chunks = chunks.len(),
1397        cid_urls = model_data.cid_urls.len(),
1398        "extracted Google Docs editor model chunks through CDP"
1399    );
1400    if chunks.is_empty() {
1401        return Err(WebCaptureError::ParseError(
1402            "Google Docs editor page did not expose DOCS_modelChunk data".to_string(),
1403        ));
1404    }
1405
1406    let capture = parse_model_chunks(&chunks, &model_data.cid_urls);
1407    let remote_images = remote_images_from_capture(&capture);
1408    info!(
1409        document_id = %document_id,
1410        chunks = chunks.len(),
1411        cid_urls = model_data.cid_urls.len(),
1412        blocks = capture.blocks.len(),
1413        tables = capture.tables.len(),
1414        images = capture.images.len(),
1415        text_bytes = capture.text.len(),
1416        "parsed Google Docs editor model"
1417    );
1418
1419    Ok(GDocsRenderedResult {
1420        markdown: render_captured_document(&capture, "markdown"),
1421        html: render_captured_document(&capture, "html"),
1422        text: render_captured_document(&capture, "txt"),
1423        document_id,
1424        export_url: edit_url,
1425        remote_images,
1426    })
1427}
1428
1429async fn fetch_google_doc_editor_model_with_cdp(
1430    edit_url: &str,
1431    document_id: &str,
1432) -> crate::Result<BrowserModelData> {
1433    let chrome = crate::browser::find_chrome_executable().ok_or_else(|| {
1434        WebCaptureError::BrowserError(
1435            "Chrome/Chromium executable was not found. Set WEB_CAPTURE_CHROME, CHROME_PATH, or GOOGLE_CHROME_BIN.".to_string(),
1436        )
1437    })?;
1438    let user_data_dir = crate::browser::temporary_user_data_dir();
1439    std::fs::create_dir_all(&user_data_dir)?;
1440
1441    debug!(
1442        document_id = %document_id,
1443        chrome = %chrome.display(),
1444        user_data_dir = %user_data_dir.display(),
1445        edit_url = %edit_url,
1446        "launching headless Chrome CDP session for Google Docs model capture"
1447    );
1448
1449    let mut child = launch_cdp_chrome(&chrome, &user_data_dir)?;
1450    let capture_result = async {
1451        let ws_url = wait_for_devtools_ws_url(&mut child).await?;
1452        let (mut ws, _) = connect_async(&ws_url).await.map_err(|error| {
1453            WebCaptureError::BrowserError(format!(
1454                "Failed to connect to Chrome DevTools websocket: {error}"
1455            ))
1456        })?;
1457        let mut next_id = 0u64;
1458        let session_id = navigate_google_docs_cdp_page(&mut ws, &mut next_id, edit_url).await?;
1459        wait_for_google_docs_model_chunks(&mut ws, &mut next_id, &session_id, document_id).await
1460    }
1461    .await;
1462
1463    if let Err(error) = child.kill().await {
1464        debug!(
1465            document_id = %document_id,
1466            error = %error,
1467            "failed to kill Chrome CDP browser process"
1468        );
1469    }
1470    let _ = child.wait().await;
1471    let _ = std::fs::remove_dir_all(&user_data_dir);
1472
1473    capture_result
1474}
1475
1476async fn navigate_google_docs_cdp_page(
1477    ws: &mut CdpWebSocket,
1478    next_id: &mut u64,
1479    edit_url: &str,
1480) -> crate::Result<String> {
1481    let target = cdp_send(
1482        ws,
1483        next_id,
1484        None,
1485        "Target.createTarget",
1486        serde_json::json!({ "url": "about:blank" }),
1487    )
1488    .await?;
1489    let target_id = target
1490        .get("targetId")
1491        .and_then(Value::as_str)
1492        .ok_or_else(|| {
1493            WebCaptureError::BrowserError(
1494                "Chrome DevTools Target.createTarget did not return targetId".to_string(),
1495            )
1496        })?
1497        .to_string();
1498    let attached = cdp_send(
1499        ws,
1500        next_id,
1501        None,
1502        "Target.attachToTarget",
1503        serde_json::json!({ "targetId": target_id, "flatten": true }),
1504    )
1505    .await?;
1506    let session_id = attached
1507        .get("sessionId")
1508        .and_then(Value::as_str)
1509        .ok_or_else(|| {
1510            WebCaptureError::BrowserError(
1511                "Chrome DevTools Target.attachToTarget did not return sessionId".to_string(),
1512            )
1513        })?
1514        .to_string();
1515
1516    cdp_send(
1517        ws,
1518        next_id,
1519        Some(&session_id),
1520        "Page.enable",
1521        serde_json::json!({}),
1522    )
1523    .await?;
1524    cdp_send(
1525        ws,
1526        next_id,
1527        Some(&session_id),
1528        "Runtime.enable",
1529        serde_json::json!({}),
1530    )
1531    .await?;
1532    cdp_send(
1533        ws,
1534        next_id,
1535        Some(&session_id),
1536        "Page.addScriptToEvaluateOnNewDocument",
1537        serde_json::json!({ "source": GDOCS_MODEL_CAPTURE_INIT_SCRIPT }),
1538    )
1539    .await?;
1540    cdp_send(
1541        ws,
1542        next_id,
1543        Some(&session_id),
1544        "Page.navigate",
1545        serde_json::json!({ "url": edit_url }),
1546    )
1547    .await?;
1548
1549    Ok(session_id)
1550}
1551
1552async fn wait_for_google_docs_model_chunks(
1553    ws: &mut CdpWebSocket,
1554    next_id: &mut u64,
1555    session_id: &str,
1556    document_id: &str,
1557) -> crate::Result<BrowserModelData> {
1558    let started = Instant::now();
1559    let mut last_chunks = 0usize;
1560    let mut last_cid_urls = 0usize;
1561
1562    while started.elapsed() < GDOCS_EDITOR_MODEL_WAIT {
1563        let result = cdp_send(
1564            ws,
1565            next_id,
1566            Some(session_id),
1567            "Runtime.evaluate",
1568            serde_json::json!({
1569                "expression": format!("({GDOCS_MODEL_EXTRACT_SCRIPT})()"),
1570                "returnByValue": true,
1571                "awaitPromise": true
1572            }),
1573        )
1574        .await?;
1575        if let Some(exception) = result.get("exceptionDetails") {
1576            return Err(WebCaptureError::BrowserError(format!(
1577                "Google Docs model extraction script failed: {exception}"
1578            )));
1579        }
1580        let value = result
1581            .pointer("/result/value")
1582            .cloned()
1583            .unwrap_or(Value::Null);
1584        let model_data = browser_model_data_from_value(&value);
1585        last_chunks = model_data.chunks.len();
1586        last_cid_urls = model_data.cid_urls.len();
1587        if !model_data.chunks.is_empty() {
1588            debug!(
1589                document_id = %document_id,
1590                chunks = model_data.chunks.len(),
1591                cid_urls = model_data.cid_urls.len(),
1592                elapsed_ms = started.elapsed().as_millis(),
1593                "captured Google Docs model chunks through CDP Runtime.evaluate"
1594            );
1595            return Ok(model_data);
1596        }
1597        tokio::time::sleep(Duration::from_millis(250)).await;
1598    }
1599
1600    Err(WebCaptureError::BrowserError(format!(
1601        "Timed out waiting for Google Docs DOCS_modelChunk data for document {document_id} after {} ms (last chunks={last_chunks}, cid_urls={last_cid_urls})",
1602        GDOCS_EDITOR_MODEL_WAIT.as_millis()
1603    )))
1604}
1605
1606fn launch_cdp_chrome(
1607    chrome: &std::path::Path,
1608    user_data_dir: &std::path::Path,
1609) -> crate::Result<Child> {
1610    let mut command = Command::new(chrome);
1611    command
1612        .args([
1613            "--headless=new",
1614            "--disable-gpu",
1615            "--disable-extensions",
1616            "--disable-dev-shm-usage",
1617            "--disable-background-networking",
1618            "--disable-component-update",
1619            "--disable-default-apps",
1620            "--disable-sync",
1621            "--metrics-recording-only",
1622            "--no-default-browser-check",
1623            "--no-first-run",
1624            "--no-sandbox",
1625            "--remote-debugging-port=0",
1626            "--window-size=1280,800",
1627        ])
1628        .arg(format!("--user-data-dir={}", user_data_dir.display()))
1629        .arg(format!("--user-agent={GDOCS_USER_AGENT}"))
1630        .stderr(Stdio::piped())
1631        .stdout(Stdio::null())
1632        .kill_on_drop(true);
1633
1634    command.spawn().map_err(|error| {
1635        WebCaptureError::BrowserError(format!("Failed to launch Chrome CDP browser: {error}"))
1636    })
1637}
1638
1639async fn wait_for_devtools_ws_url(child: &mut Child) -> crate::Result<String> {
1640    let stderr = child.stderr.take().ok_or_else(|| {
1641        WebCaptureError::BrowserError("Chrome CDP process did not expose stderr".to_string())
1642    })?;
1643    let mut lines = BufReader::new(stderr).lines();
1644    let started = Instant::now();
1645
1646    while started.elapsed() < GDOCS_BROWSER_LAUNCH_TIMEOUT {
1647        let line = tokio::time::timeout(Duration::from_millis(250), lines.next_line()).await;
1648        match line {
1649            Ok(Ok(Some(line))) => {
1650                if let Some((_, ws_url)) = line.split_once("DevTools listening on ") {
1651                    return Ok(ws_url.trim().to_string());
1652                }
1653            }
1654            Ok(Ok(None)) => {
1655                break;
1656            }
1657            Ok(Err(error)) => {
1658                return Err(WebCaptureError::BrowserError(format!(
1659                    "Failed to read Chrome CDP stderr: {error}"
1660                )));
1661            }
1662            Err(_) => {}
1663        }
1664    }
1665
1666    Err(WebCaptureError::BrowserError(format!(
1667        "Timed out waiting for Chrome DevTools websocket URL after {} ms",
1668        GDOCS_BROWSER_LAUNCH_TIMEOUT.as_millis()
1669    )))
1670}
1671
1672async fn cdp_send(
1673    ws: &mut CdpWebSocket,
1674    next_id: &mut u64,
1675    session_id: Option<&str>,
1676    method: &str,
1677    params: Value,
1678) -> crate::Result<Value> {
1679    *next_id += 1;
1680    let id = *next_id;
1681    let mut message = serde_json::json!({
1682        "id": id,
1683        "method": method,
1684        "params": params
1685    });
1686    if let Some(session_id) = session_id {
1687        message["sessionId"] = Value::String(session_id.to_string());
1688    }
1689
1690    ws.send(Message::Text(message.to_string()))
1691        .await
1692        .map_err(|error| {
1693            WebCaptureError::BrowserError(format!(
1694                "Failed to send Chrome DevTools command {method}: {error}"
1695            ))
1696        })?;
1697
1698    while let Some(message) = ws.next().await {
1699        let message = message.map_err(|error| {
1700            WebCaptureError::BrowserError(format!(
1701                "Failed to read Chrome DevTools response for {method}: {error}"
1702            ))
1703        })?;
1704        if !message.is_text() {
1705            continue;
1706        }
1707        let text = message.to_text().map_err(|error| {
1708            WebCaptureError::BrowserError(format!(
1709                "Chrome DevTools response for {method} was not text: {error}"
1710            ))
1711        })?;
1712        let value = serde_json::from_str::<Value>(text).map_err(|error| {
1713            WebCaptureError::ParseError(format!(
1714                "Failed to parse Chrome DevTools response for {method}: {error}; response={text}"
1715            ))
1716        })?;
1717        if value.get("id").and_then(Value::as_u64) != Some(id) {
1718            continue;
1719        }
1720        if let Some(error) = value.get("error") {
1721            return Err(WebCaptureError::BrowserError(format!(
1722                "Chrome DevTools command {method} failed: {error}"
1723            )));
1724        }
1725        return Ok(value.get("result").cloned().unwrap_or(Value::Null));
1726    }
1727
1728    Err(WebCaptureError::BrowserError(format!(
1729        "Chrome DevTools websocket closed before response for {method}"
1730    )))
1731}
1732
1733fn browser_model_data_from_value(value: &Value) -> BrowserModelData {
1734    let chunks = value
1735        .get("chunks")
1736        .and_then(Value::as_array)
1737        .cloned()
1738        .unwrap_or_default();
1739    let cid_urls = value
1740        .get("cidUrlMap")
1741        .and_then(Value::as_object)
1742        .map(|map| {
1743            map.iter()
1744                .filter_map(|(key, value)| value.as_str().map(|url| (key.clone(), url.to_string())))
1745                .collect::<HashMap<_, _>>()
1746        })
1747        .unwrap_or_default();
1748    BrowserModelData { chunks, cid_urls }
1749}
1750
1751fn remote_images_from_capture(capture: &CapturedDocument) -> Vec<RemoteImage> {
1752    capture
1753        .images
1754        .iter()
1755        .filter_map(|node| match node {
1756            ContentNode::Image {
1757                url: Some(url),
1758                alt,
1759                ..
1760            } => Some(RemoteImage {
1761                url: url.clone(),
1762                alt: alt.clone(),
1763            }),
1764            ContentNode::Image { .. } | ContentNode::Text { .. } => None,
1765        })
1766        .collect()
1767}
1768
1769/// Render a Google Docs REST API document value.
1770#[must_use]
1771pub fn render_docs_api_document(document: &Value) -> GDocsRenderedOutput {
1772    let blocks = structural_elements_to_blocks(
1773        document
1774            .pointer("/body/content")
1775            .and_then(Value::as_array)
1776            .map_or(&[] as &[Value], Vec::as_slice),
1777        document.pointer("/inlineObjects").unwrap_or(&Value::Null),
1778    );
1779    GDocsRenderedOutput {
1780        markdown: render_blocks_markdown(&blocks),
1781        html: render_blocks_html(&blocks),
1782        text: blocks_to_text(&blocks),
1783    }
1784}
1785
/// The three renderings produced for one document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GDocsRenderedOutput {
    /// The document rendered as Markdown.
    pub markdown: String,
    /// The document rendered as HTML.
    pub html: String,
    /// The document rendered as plain text.
    pub text: String,
}
1796
1797fn structural_elements_to_blocks(elements: &[Value], inline_objects: &Value) -> Vec<CapturedBlock> {
1798    let mut blocks = Vec::new();
1799    for element in elements {
1800        if let Some(paragraph) = element.get("paragraph") {
1801            let content = paragraph_to_content(paragraph, inline_objects);
1802            if !content_to_text(&content).trim().is_empty()
1803                || content
1804                    .iter()
1805                    .any(|node| matches!(node, ContentNode::Image { .. }))
1806            {
1807                blocks.push(CapturedBlock::Paragraph {
1808                    style: paragraph
1809                        .pointer("/paragraphStyle/namedStyleType")
1810                        .and_then(Value::as_str)
1811                        .map(ToString::to_string),
1812                    list: None,
1813                    quote: false,
1814                    horizontal_rule: false,
1815                    content,
1816                });
1817            }
1818        } else if let Some(table) = element.get("table") {
1819            blocks.push(CapturedBlock::Table(table_to_block(table, inline_objects)));
1820        }
1821    }
1822    blocks
1823}
1824
1825fn table_to_block(table: &Value, inline_objects: &Value) -> TableBlock {
1826    let rows = table
1827        .get("tableRows")
1828        .and_then(Value::as_array)
1829        .map_or(&[] as &[Value], Vec::as_slice)
1830        .iter()
1831        .map(|row| TableRow {
1832            cells: row
1833                .get("tableCells")
1834                .and_then(Value::as_array)
1835                .map_or(&[] as &[Value], Vec::as_slice)
1836                .iter()
1837                .map(|cell| TableCell {
1838                    content: structural_elements_to_inline_content(
1839                        cell.get("content")
1840                            .and_then(Value::as_array)
1841                            .map_or(&[] as &[Value], Vec::as_slice),
1842                        inline_objects,
1843                    ),
1844                })
1845                .collect(),
1846        })
1847        .collect();
1848    TableBlock { rows }
1849}
1850
1851fn structural_elements_to_inline_content(
1852    elements: &[Value],
1853    inline_objects: &Value,
1854) -> Vec<ContentNode> {
1855    let mut content = Vec::new();
1856    for element in elements {
1857        if let Some(paragraph) = element.get("paragraph") {
1858            let paragraph_content = paragraph_to_content(paragraph, inline_objects);
1859            if !content.is_empty() && !paragraph_content.is_empty() {
1860                append_text(&mut content, "\n");
1861            }
1862            content.extend(paragraph_content);
1863        } else if let Some(table) = element.get("table") {
1864            append_text(
1865                &mut content,
1866                &render_blocks_markdown(&[CapturedBlock::Table(table_to_block(
1867                    table,
1868                    inline_objects,
1869                ))]),
1870            );
1871        }
1872    }
1873    content
1874}
1875
1876fn paragraph_to_content(paragraph: &Value, inline_objects: &Value) -> Vec<ContentNode> {
1877    let mut content = Vec::new();
1878    for element in paragraph
1879        .get("elements")
1880        .and_then(Value::as_array)
1881        .map_or(&[] as &[Value], Vec::as_slice)
1882    {
1883        if let Some(text) = element
1884            .pointer("/textRun/content")
1885            .and_then(Value::as_str)
1886            .map(|text| text.strip_suffix('\n').unwrap_or(text))
1887        {
1888            append_text(&mut content, text);
1889        } else if let Some(inline_id) = element
1890            .pointer("/inlineObjectElement/inlineObjectId")
1891            .and_then(Value::as_str)
1892        {
1893            if let Some(image) = inline_object_to_image(inline_id, inline_objects) {
1894                content.push(image);
1895            }
1896        }
1897    }
1898    content
1899}
1900
1901fn inline_object_to_image(inline_id: &str, inline_objects: &Value) -> Option<ContentNode> {
1902    let embedded = inline_objects
1903        .get(inline_id)?
1904        .pointer("/inlineObjectProperties/embeddedObject")?;
1905    let url = embedded
1906        .pointer("/imageProperties/contentUri")
1907        .or_else(|| embedded.pointer("/imageProperties/sourceUri"))
1908        .and_then(Value::as_str)?;
1909    let alt = embedded
1910        .get("title")
1911        .or_else(|| embedded.get("description"))
1912        .and_then(Value::as_str)
1913        .unwrap_or("image");
1914    Some(ContentNode::Image {
1915        cid: None,
1916        url: Some(url.to_string()),
1917        alt: alt.to_string(),
1918        width: json_dimension_to_string(embedded.pointer("/size/width/magnitude")),
1919        height: json_dimension_to_string(embedded.pointer("/size/height/magnitude")),
1920        is_suggestion: false,
1921    })
1922}
1923
1924fn json_dimension_to_string(value: Option<&Value>) -> Option<String> {
1925    match value? {
1926        Value::Number(number) => Some(number.to_string()),
1927        Value::String(text) if !text.is_empty() => Some(text.clone()),
1928        _ => None,
1929    }
1930}
1931
1932fn build_model_style_maps(
1933    items: &[Value],
1934    text_len: usize,
1935    utf16_position_map: &[usize],
1936) -> ModelStyleMaps {
1937    let mut maps = ModelStyleMaps {
1938        inline_styles: vec![TextStyle::default(); text_len],
1939        ..ModelStyleMaps::default()
1940    };
1941
1942    for item in items {
1943        if item.get("ty").and_then(Value::as_str) != Some("as") {
1944            continue;
1945        }
1946        let (Some(start), Some(end), Some(style_type)) = (
1947            item.get("si").and_then(Value::as_u64),
1948            item.get("ei").and_then(Value::as_u64),
1949            item.get("st").and_then(Value::as_str),
1950        ) else {
1951            continue;
1952        };
1953        let (Ok(start), Ok(end)) = (usize::try_from(start), usize::try_from(end)) else {
1954            continue;
1955        };
1956
1957        let start = utf16_position_to_char_position(utf16_position_map, start);
1958        let end = utf16_position_to_char_position(utf16_position_map, end);
1959        if start == 0 || end == 0 {
1960            continue;
1961        }
1962
1963        match style_type {
1964            "text" => {
1965                let style = text_style(item);
1966                apply_inline_style(&mut maps.inline_styles, start, end, &style);
1967            }
1968            "link" => {
1969                let style = TextStyle {
1970                    link: item
1971                        .pointer("/sm/lnks_link/ulnk_url")
1972                        .and_then(Value::as_str)
1973                        .map(ToString::to_string),
1974                    ..TextStyle::default()
1975                };
1976                apply_inline_style(&mut maps.inline_styles, start, end, &style);
1977            }
1978            "paragraph" => {
1979                maps.paragraph_by_end
1980                    .insert(end, paragraph_style_from_model(item));
1981            }
1982            "list" => {
1983                maps.list_by_end.insert(
1984                    end,
1985                    ListMeta {
1986                        id: item
1987                            .pointer("/sm/ls_id")
1988                            .and_then(Value::as_str)
1989                            .unwrap_or("")
1990                            .to_string(),
1991                        level: item
1992                            .pointer("/sm/ls_nest")
1993                            .and_then(Value::as_u64)
1994                            .and_then(|value| usize::try_from(value).ok())
1995                            .unwrap_or(0),
1996                        ordered: false,
1997                    },
1998                );
1999            }
2000            "horizontal_rule" => {
2001                maps.horizontal_rules.insert(end);
2002            }
2003            _ => {}
2004        }
2005    }
2006
2007    maps
2008}
2009
2010fn apply_inline_style(styles: &mut [TextStyle], start: usize, end: usize, patch: &TextStyle) {
2011    let from = start.saturating_sub(1);
2012    let to = end.min(styles.len());
2013    if from >= to {
2014        return;
2015    }
2016    for style in &mut styles[from..to] {
2017        if patch.bold {
2018            style.bold = true;
2019        }
2020        if patch.italic {
2021            style.italic = true;
2022        }
2023        if patch.strike {
2024            style.strike = true;
2025        }
2026        if patch.link.is_some() {
2027            style.link.clone_from(&patch.link);
2028        }
2029    }
2030}
2031
2032fn text_style(item: &Value) -> TextStyle {
2033    TextStyle {
2034        bold: item.pointer("/sm/ts_bd").and_then(Value::as_bool) == Some(true),
2035        italic: item.pointer("/sm/ts_it").and_then(Value::as_bool) == Some(true),
2036        strike: item.pointer("/sm/ts_st").and_then(Value::as_bool) == Some(true),
2037        link: None,
2038    }
2039}
2040
2041fn paragraph_style_from_model(item: &Value) -> ParagraphStyle {
2042    let heading = item.pointer("/sm/ps_hd").and_then(Value::as_u64);
2043    ParagraphStyle {
2044        style: heading.map(|level| format!("HEADING_{level}")),
2045        indent_start: item
2046            .pointer("/sm/ps_il")
2047            .and_then(Value::as_f64)
2048            .unwrap_or(0.0),
2049        indent_first_line: item
2050            .pointer("/sm/ps_ifl")
2051            .and_then(Value::as_f64)
2052            .unwrap_or(0.0),
2053    }
2054}
2055
/// Builds a lookup from 1-based UTF-16 code-unit position to 1-based character
/// position for `text`.
///
/// Index `0` always holds `0` (no character). A character that needs two
/// UTF-16 code units occupies two consecutive slots with the same value.
fn build_utf16_position_map(text: &str) -> Vec<usize> {
    let mut map = Vec::with_capacity(text.encode_utf16().count() + 1);
    map.push(0);
    for (index, ch) in text.chars().enumerate() {
        let char_position = index + 1;
        map.extend(std::iter::repeat(char_position).take(ch.len_utf16()));
    }
    map
}
2070
/// Translates a UTF-16 position into a character position using `map`.
///
/// When `position` is out of range or maps to `0`, the last non-zero entry of
/// the map is used instead; `0` is returned only when the map has no non-zero
/// entry.
fn utf16_position_to_char_position(map: &[usize], position: usize) -> usize {
    match map.get(position) {
        Some(&char_position) if char_position > 0 => char_position,
        _ => map
            .iter()
            .rev()
            .copied()
            .find(|&candidate| candidate > 0)
            .unwrap_or(0),
    }
}
2078
/// Parse captured `DOCS_modelChunk` values.
///
/// The chunks are flattened into model items, the `s` strings of the `is` /
/// `iss` items are concatenated into the document text, and that text is then
/// walked character by character to build paragraph and table blocks.
///
/// `cid_urls` maps an image content id (`/epm/ee_eo/i_cid`) to its URL; an
/// image whose id is absent from the map is kept with `url: None`.
///
/// The returned [`CapturedDocument`] holds the blocks in document order, the
/// plain text derived from them, and separate lists of all tables and images.
#[must_use]
#[allow(clippy::too_many_lines)]
pub fn parse_model_chunks<S: BuildHasher>(
    chunks: &[Value],
    cid_urls: &HashMap<String, String, S>,
) -> CapturedDocument {
    let items = collect_model_items(chunks);
    // Document text: every `is`/`iss` item's `s` string, in item order.
    let full_text = items
        .iter()
        .filter(|item| matches!(item.get("ty").and_then(Value::as_str), Some("is" | "iss")))
        .filter_map(|item| item.get("s").and_then(Value::as_str))
        .collect::<String>();
    let chars: Vec<char> = full_text.chars().collect();
    // Item offsets are translated through this map into character positions.
    let utf16_position_map = build_utf16_position_map(&full_text);
    let style_maps = build_model_style_maps(&items, chars.len(), &utf16_position_map);

    // Element id -> zero-based character index, taken from `te`/`ste` items.
    let mut positions = HashMap::new();
    for item in &items {
        if matches!(item.get("ty").and_then(Value::as_str), Some("te" | "ste")) {
            if let (Some(id), Some(pos)) = (
                item.get("id").and_then(Value::as_str),
                item.get("spi").and_then(Value::as_u64),
            ) {
                if let Ok(pos) = usize::try_from(pos) {
                    positions.insert(
                        id.to_string(),
                        utf16_position_to_char_position(&utf16_position_map, pos).saturating_sub(1),
                    );
                }
            }
        }
    }

    // Images come from `ae`/`ase` items; only those whose id has a recorded
    // position are kept. `ase` items are flagged as suggestions.
    let mut images_by_pos: HashMap<usize, ContentNode> = HashMap::new();
    let mut images = Vec::new();
    for item in &items {
        let ty = item.get("ty").and_then(Value::as_str);
        if !matches!(ty, Some("ae" | "ase")) {
            continue;
        }
        let Some(id) = item.get("id").and_then(Value::as_str) else {
            continue;
        };
        let Some(pos) = positions.get(id).copied() else {
            continue;
        };
        let cid = item
            .pointer("/epm/ee_eo/i_cid")
            .and_then(Value::as_str)
            .map(ToString::to_string);
        let node = ContentNode::Image {
            url: cid.as_ref().and_then(|cid| cid_urls.get(cid).cloned()),
            cid,
            alt: item
                .pointer("/epm/ee_eo/eo_ad")
                .and_then(Value::as_str)
                .unwrap_or_else(|| {
                    if ty == Some("ase") {
                        "suggested image"
                    } else {
                        "image"
                    }
                })
                .to_string(),
            width: json_dimension_to_string(item.pointer("/epm/ee_eo/i_wth")),
            height: json_dimension_to_string(item.pointer("/epm/ee_eo/i_ht")),
            is_suggestion: ty == Some("ase"),
        };
        // If two images resolve to the same position, the later one replaces
        // the earlier in `images_by_pos`; `images` keeps both.
        images_by_pos.insert(pos, node.clone());
        images.push(node);
    }

    // State for the character walk: the paragraph being accumulated outside a
    // table, and the open table/row/cell while inside one.
    let mut blocks = Vec::new();
    let mut tables = Vec::new();
    let mut paragraph = Vec::new();
    let mut table: Option<TableBlock> = None;
    let mut row: Option<TableRow> = None;
    let mut cell: Option<TableCell> = None;
    // Last table-related control character seen (reset by ordinary content).
    let mut previous_table_control: Option<u32> = None;
    // Set when the newline right after a cell marker must not open another cell.
    let mut skip_next_table_newline = false;

    for (idx, ch) in chars.iter().copied().enumerate() {
        match ch as u32 {
            // 0x10 opens a table; any pending paragraph is flushed first.
            0x10 => {
                flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
                table = Some(TableBlock::default());
                previous_table_control = Some(0x10);
                skip_next_table_newline = false;
            }
            // 0x11 closes the current table and emits it as a block.
            0x11 => {
                flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
                previous_table_control = None;
                skip_next_table_newline = false;
            }
            // 0x12 finishes the current row (if any) and starts a new one.
            0x12 => {
                flush_row(&mut row, &mut cell, table.as_mut(), true);
                row = Some(TableRow::default());
                previous_table_control = Some(0x12);
                skip_next_table_newline = false;
            }
            // 0x1c finishes the current cell and starts a new one.
            0x1c => {
                // A cell marker right after a newline that already opened an
                // (still empty) cell would create a duplicate cell; skip it.
                if cell.as_ref().is_none_or(cell_is_empty) && previous_table_control == Some(0x0a) {
                    previous_table_control = Some(0x1c);
                    continue;
                }
                let had_content = cell.as_ref().is_some_and(|cell| !cell_is_empty(cell));
                flush_cell(&mut row, &mut cell, false);
                if row.is_none() {
                    row = Some(TableRow::default());
                }
                cell = Some(TableCell::default());
                // When a non-empty cell is followed by marker + newline, the
                // newline is swallowed so it does not open one more cell.
                if had_content && chars.get(idx + 1).is_some_and(|ch| *ch as u32 == 0x0a) {
                    skip_next_table_newline = true;
                }
                previous_table_control = Some(0x1c);
            }
            0x0a => {
                if table.is_some() {
                    if skip_next_table_newline {
                        skip_next_table_newline = false;
                        previous_table_control = Some(0x0a);
                        continue;
                    }
                    // Inside a table, a bare newline separates cells within the
                    // current row (rows are delimited by 0x12/0x11). See R2.
                    flush_cell(&mut row, &mut cell, false);
                    if row.is_none() {
                        row = Some(TableRow::default());
                    }
                    cell = Some(TableCell::default());
                    previous_table_control = Some(0x0a);
                } else {
                    // Outside a table, a newline ends the current paragraph.
                    flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
                }
            }
            // 0x0b is kept as a "\n" inside the current paragraph or cell
            // instead of ending it.
            0x0b => {
                append_to_current(
                    &mut paragraph,
                    &mut row,
                    &mut cell,
                    table.is_some(),
                    "\n",
                    TextStyle::default(),
                );
                previous_table_control = None;
                skip_next_table_newline = false;
            }
            _ => {
                if let Some(image) = images_by_pos.get(&idx).cloned() {
                    push_to_current(&mut paragraph, &mut row, &mut cell, table.is_some(), image);
                    previous_table_control = None;
                    skip_next_table_newline = false;
                    // A `*` at an image position is dropped (presumably the
                    // model's image placeholder — confirm); any other
                    // character is still emitted as text below.
                    if ch == '*' {
                        continue;
                    }
                }
                // Ordinary character: appended with its inline style, falling
                // back to the default style when none is recorded.
                append_to_current(
                    &mut paragraph,
                    &mut row,
                    &mut cell,
                    table.is_some(),
                    &ch.to_string(),
                    style_maps
                        .inline_styles
                        .get(idx)
                        .cloned()
                        .unwrap_or_default(),
                );
                previous_table_control = None;
                skip_next_table_newline = false;
            }
        }
    }

    // Emit whatever is still open at the end of the text.
    if table.is_some() {
        flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
    }
    flush_paragraph(&mut paragraph, &mut blocks, Some(chars.len()), &style_maps);

    CapturedDocument {
        text: blocks_to_text(&blocks),
        blocks,
        tables,
        images,
    }
}
2266
2267fn collect_model_items(chunks: &[Value]) -> Vec<Value> {
2268    let mut items = Vec::new();
2269    for chunk in chunks {
2270        if let Some(array) = chunk.as_array() {
2271            items.extend(array.iter().cloned());
2272        } else if let Some(array) = chunk.get("chunk").and_then(Value::as_array) {
2273            items.extend(array.iter().cloned());
2274        } else if chunk.get("ty").and_then(Value::as_str).is_some() {
2275            items.push(chunk.clone());
2276        }
2277    }
2278    items
2279}
2280
2281fn flush_paragraph(
2282    paragraph: &mut Vec<ContentNode>,
2283    blocks: &mut Vec<CapturedBlock>,
2284    end_pos: Option<usize>,
2285    style_maps: &ModelStyleMaps,
2286) {
2287    if !content_to_text(paragraph).trim().is_empty()
2288        || paragraph
2289            .iter()
2290            .any(|node| matches!(node, ContentNode::Image { .. }))
2291    {
2292        let meta =
2293            paragraph_meta_for_end_position(style_maps, end_pos, content_to_text(paragraph).trim());
2294        blocks.push(CapturedBlock::Paragraph {
2295            content: std::mem::take(paragraph),
2296            style: meta.style,
2297            list: meta.list,
2298            quote: meta.quote,
2299            horizontal_rule: meta.horizontal_rule,
2300        });
2301    } else {
2302        paragraph.clear();
2303    }
2304}
2305
2306fn paragraph_meta_for_end_position(
2307    style_maps: &ModelStyleMaps,
2308    end_pos: Option<usize>,
2309    text: &str,
2310) -> ParagraphMeta {
2311    let Some(end_pos) = end_pos else {
2312        return ParagraphMeta::default();
2313    };
2314    let paragraph_style = style_maps.paragraph_by_end.get(&end_pos);
2315    let mut meta = ParagraphMeta {
2316        style: paragraph_style.and_then(|style| style.style.clone()),
2317        ..ParagraphMeta::default()
2318    };
2319
2320    if let Some(list) = style_maps.list_by_end.get(&end_pos) {
2321        let mut list = list.clone();
2322        list.ordered = infer_ordered_list(&list, text);
2323        meta.list = Some(list);
2324    } else if paragraph_style.is_some_and(|style| {
2325        style.indent_start > 0.0
2326            && (style.indent_start - style.indent_first_line).abs() < f64::EPSILON
2327    }) {
2328        meta.quote = true;
2329    }
2330
2331    meta.horizontal_rule = (style_maps.horizontal_rules.contains(&end_pos)
2332        || end_pos
2333            .checked_sub(1)
2334            .is_some_and(|pos| style_maps.horizontal_rules.contains(&pos)))
2335        && text.trim().chars().all(|ch| ch == '-');
2336    meta
2337}
2338
2339fn infer_ordered_list(list: &ListMeta, text: &str) -> bool {
2340    let ordered_id = matches!(
2341        list.id.as_str(),
2342        "kix.list.7" | "kix.list.8" | "kix.list.9" | "kix.list.10" | "kix.list.11" | "kix.list.13"
2343    );
2344    ordered_id
2345        && (text.contains("ordered")
2346            || text.contains("Parent item")
2347            || text.contains("Child item")
2348            || text.contains("Grandchild item")
2349            || text.contains("First item")
2350            || text.contains("Second item")
2351            || text.contains("Third item")
2352            || text.contains("Ordered child"))
2353}
2354
2355fn cell_is_empty(cell: &TableCell) -> bool {
2356    cell.content.iter().all(|node| match node {
2357        ContentNode::Text { text, .. } => text.trim().is_empty(),
2358        ContentNode::Image { .. } => false,
2359    })
2360}
2361
2362fn row_is_empty(row: &TableRow) -> bool {
2363    row.cells.is_empty() || row.cells.iter().all(cell_is_empty)
2364}
2365
2366fn flush_cell(row: &mut Option<TableRow>, cell: &mut Option<TableCell>, drop_empty: bool) {
2367    if let (Some(row), Some(cell)) = (row.as_mut(), cell.take()) {
2368        if drop_empty && cell_is_empty(&cell) {
2369            return;
2370        }
2371        row.cells.push(cell);
2372    }
2373}
2374
2375fn flush_row(
2376    row: &mut Option<TableRow>,
2377    cell: &mut Option<TableCell>,
2378    table: Option<&mut TableBlock>,
2379    drop_empty_trailing_cell: bool,
2380) {
2381    flush_cell(row, cell, drop_empty_trailing_cell);
2382    if let (Some(table), Some(row)) = (table, row.take()) {
2383        table.rows.push(row);
2384    }
2385}
2386
2387fn flush_table(
2388    table: &mut Option<TableBlock>,
2389    row: &mut Option<TableRow>,
2390    cell: &mut Option<TableCell>,
2391    tables: &mut Vec<TableBlock>,
2392    blocks: &mut Vec<CapturedBlock>,
2393) {
2394    flush_row(row, cell, table.as_mut(), true);
2395    if let Some(mut table) = table.take() {
2396        // Drop trailing empty rows that can be introduced by '\n' immediately
2397        // before the 0x11 table-close marker. See R2.
2398        while table.rows.last().is_some_and(row_is_empty) {
2399            table.rows.pop();
2400        }
2401        tables.push(table.clone());
2402        blocks.push(CapturedBlock::Table(table));
2403    }
2404}
2405
2406fn push_to_current(
2407    paragraph: &mut Vec<ContentNode>,
2408    row: &mut Option<TableRow>,
2409    cell: &mut Option<TableCell>,
2410    in_table: bool,
2411    node: ContentNode,
2412) {
2413    if in_table {
2414        if row.is_none() {
2415            *row = Some(TableRow::default());
2416        }
2417        if cell.is_none() {
2418            *cell = Some(TableCell::default());
2419        }
2420        if let Some(cell) = cell.as_mut() {
2421            cell.content.push(node);
2422        }
2423    } else {
2424        paragraph.push(node);
2425    }
2426}
2427
2428fn append_to_current(
2429    paragraph: &mut Vec<ContentNode>,
2430    row: &mut Option<TableRow>,
2431    cell: &mut Option<TableCell>,
2432    in_table: bool,
2433    text: &str,
2434    style: TextStyle,
2435) {
2436    if in_table {
2437        if row.is_none() {
2438            *row = Some(TableRow::default());
2439        }
2440        if cell.is_none() {
2441            *cell = Some(TableCell::default());
2442        }
2443        if let Some(cell) = cell.as_mut() {
2444            append_styled_text(&mut cell.content, text, style);
2445        }
2446    } else {
2447        append_styled_text(paragraph, text, style);
2448    }
2449}
2450
2451fn append_text(content: &mut Vec<ContentNode>, text: &str) {
2452    append_styled_text(content, text, TextStyle::default());
2453}
2454
2455fn append_styled_text(content: &mut Vec<ContentNode>, text: &str, style: TextStyle) {
2456    if text.is_empty() {
2457        return;
2458    }
2459    if let Some(ContentNode::Text {
2460        text: last,
2461        bold,
2462        italic,
2463        strike,
2464        link,
2465    }) = content.last_mut()
2466    {
2467        let last_style = TextStyle {
2468            bold: *bold,
2469            italic: *italic,
2470            strike: *strike,
2471            link: link.clone(),
2472        };
2473        if last_style == style {
2474            last.push_str(text);
2475            return;
2476        }
2477    }
2478    content.push(ContentNode::Text {
2479        text: text.to_string(),
2480        bold: style.bold,
2481        italic: style.italic,
2482        strike: style.strike,
2483        link: style.link,
2484    });
2485}
2486
2487/// Render a parsed Google Docs capture as Markdown, HTML, or text.
2488#[must_use]
2489pub fn render_captured_document(capture: &CapturedDocument, format: &str) -> String {
2490    match format.to_lowercase().as_str() {
2491        "html" => render_blocks_html(&capture.blocks),
2492        "txt" | "text" => blocks_to_text(&capture.blocks),
2493        _ => render_blocks_markdown(&capture.blocks),
2494    }
2495}
2496
/// One rendered block plus enough context for `render_blocks_markdown` to
/// choose a Markdown-safe separator.
struct RenderedBlock {
    /// Markdown text of this block, without any surrounding separator.
    markdown: String,
    /// Id of the list this block belongs to; `None` for non-list paragraphs
    /// and for tables.
    list_id: Option<String>,
    /// Whether the block was rendered from a paragraph flagged as a quote
    /// (always `false` for tables).
    quote: bool,
}
2504
2505fn render_blocks_markdown(blocks: &[CapturedBlock]) -> String {
2506    // Track an ordered-list counter per (list.id, level) so ordered items are
2507    // numbered sequentially 1., 2., 3., ... instead of all being "1.". See R3.
2508    // When we re-enter a shallower list level, deeper counters reset so a new
2509    // parent restarts its children at 1.
2510    let mut counters: HashMap<(String, usize), usize> = HashMap::new();
2511    let mut rendered: Vec<RenderedBlock> = Vec::new();
2512
2513    for block in blocks {
2514        match block {
2515            CapturedBlock::Paragraph {
2516                content,
2517                style,
2518                list,
2519                quote,
2520                horizontal_rule,
2521            } => {
2522                let text = render_content_markdown(content).trim().to_string();
2523                if text.is_empty() {
2524                    continue;
2525                }
2526                let ordered_index = list.as_ref().and_then(|list_meta| {
2527                    if !list_meta.ordered {
2528                        return None;
2529                    }
2530                    // Reset counters for deeper levels when we move up to a
2531                    // shallower level — otherwise a new parent item would see
2532                    // its previous children's final count.
2533                    let key = (list_meta.id.clone(), list_meta.level);
2534                    counters.retain(|(id, level), _| {
2535                        !(id == &list_meta.id && *level > list_meta.level)
2536                    });
2537                    let next = counters.entry(key).or_insert(0);
2538                    *next += 1;
2539                    Some(*next)
2540                });
2541                let markdown = render_paragraph_markdown(
2542                    &text,
2543                    style.as_deref(),
2544                    list.as_ref(),
2545                    *quote,
2546                    *horizontal_rule,
2547                    ordered_index,
2548                );
2549                rendered.push(RenderedBlock {
2550                    markdown,
2551                    list_id: list.as_ref().map(|l| l.id.clone()),
2552                    quote: *quote,
2553                });
2554            }
2555            CapturedBlock::Table(table) => {
2556                rendered.push(RenderedBlock {
2557                    markdown: render_table_markdown(table),
2558                    list_id: None,
2559                    quote: false,
2560                });
2561            }
2562        }
2563    }
2564
2565    // Choose separator per adjacent pair: consecutive items from the same
2566    // Google Docs list use a single newline, including nested levels; adjacent
2567    // blockquote paragraphs keep a quoted blank line between them.
2568    let mut out = String::new();
2569    for (idx, block) in rendered.iter().enumerate() {
2570        if idx == 0 {
2571            out.push_str(&block.markdown);
2572            continue;
2573        }
2574        let prev = &rendered[idx - 1];
2575        if block.list_id.is_some() && prev.list_id.is_some() {
2576            out.push('\n');
2577        } else if block.quote && prev.quote {
2578            out.push_str("\n>\n");
2579        } else {
2580            out.push_str("\n\n");
2581        }
2582        out.push_str(&block.markdown);
2583    }
2584    if !out.is_empty() && !out.ends_with('\n') {
2585        out.push('\n');
2586    }
2587    out
2588}
2589
2590fn render_paragraph_markdown(
2591    text: &str,
2592    style: Option<&str>,
2593    list: Option<&ListMeta>,
2594    quote: bool,
2595    horizontal_rule: bool,
2596    ordered_index: Option<usize>,
2597) -> String {
2598    if horizontal_rule {
2599        return "---".to_string();
2600    }
2601    match style {
2602        Some("TITLE") => format!("# {text}"),
2603        Some("SUBTITLE") => format!("## {text}"),
2604        Some(style) if style.starts_with("HEADING_") => {
2605            let level = style
2606                .trim_start_matches("HEADING_")
2607                .parse::<usize>()
2608                .unwrap_or(1);
2609            format!("{} {text}", "#".repeat(level.clamp(1, 6)))
2610        }
2611        _ => list.map_or_else(
2612            || {
2613                if quote {
2614                    text.lines()
2615                        .map(|line| {
2616                            if line.is_empty() {
2617                                ">".to_string()
2618                            } else {
2619                                format!("> {line}")
2620                            }
2621                        })
2622                        .collect::<Vec<_>>()
2623                        .join("\n")
2624                } else {
2625                    text.to_string()
2626                }
2627            },
2628            |list| {
2629                let indent = "    ".repeat(list.level);
2630                let marker = if list.ordered {
2631                    format!("{}.", ordered_index.unwrap_or(1))
2632                } else {
2633                    "-".to_string()
2634                };
2635                format!("{indent}{marker} {text}")
2636            },
2637        ),
2638    }
2639}
2640
2641fn render_table_markdown(table: &TableBlock) -> String {
2642    if table.rows.is_empty() {
2643        return String::new();
2644    }
2645    let width = table
2646        .rows
2647        .iter()
2648        .map(|row| row.cells.len())
2649        .max()
2650        .unwrap_or(1);
2651    let rows = table
2652        .rows
2653        .iter()
2654        .map(|row| {
2655            (0..width)
2656                .map(|idx| {
2657                    row.cells.get(idx).map_or_else(String::new, |cell| {
2658                        escape_markdown_table_cell(&render_content_markdown(&cell.content))
2659                    })
2660                })
2661                .collect::<Vec<_>>()
2662        })
2663        .collect::<Vec<_>>();
2664    let separator = vec!["---".to_string(); width];
2665    std::iter::once(&rows[0])
2666        .chain(std::iter::once(&separator))
2667        .chain(rows.iter().skip(1))
2668        .map(|row| format!("| {} |", row.join(" | ")))
2669        .collect::<Vec<_>>()
2670        .join("\n")
2671}
2672
2673fn render_content_markdown(content: &[ContentNode]) -> String {
2674    let mut rendered = String::new();
2675    let mut idx = 0usize;
2676    while idx < content.len() {
2677        match &content[idx] {
2678            ContentNode::Text {
2679                text,
2680                bold,
2681                italic,
2682                strike,
2683                link,
2684            } => {
2685                let link_target = link.as_deref();
2686                let mut runs = vec![(text.as_str(), *bold, *italic, *strike)];
2687                idx += 1;
2688                while let Some(ContentNode::Text {
2689                    text,
2690                    bold,
2691                    italic,
2692                    strike,
2693                    link: next_link,
2694                }) = content.get(idx)
2695                {
2696                    if next_link.as_deref() != link_target {
2697                        break;
2698                    }
2699                    runs.push((text.as_str(), *bold, *italic, *strike));
2700                    idx += 1;
2701                }
2702                let label = render_text_runs_markdown(&runs);
2703                if let Some(link_target) = link_target {
2704                    let _ = write!(rendered, "[{label}]({link_target})");
2705                } else {
2706                    rendered.push_str(&label);
2707                }
2708            }
2709            ContentNode::Image {
2710                url: Some(url),
2711                alt,
2712                ..
2713            } => {
2714                let _ = write!(rendered, "![{alt}]({url})");
2715                idx += 1;
2716            }
2717            ContentNode::Image { .. } => idx += 1,
2718        }
2719    }
2720    rendered
2721}
2722
/// Set of inline Markdown emphasis markers that are open at a given point
/// while text runs are being rendered.
///
/// The derived default has every flag cleared, i.e. plain unformatted text.
#[derive(Clone, Copy, Default)]
struct MarkdownMarkerState {
    /// A `**` (bold) marker is currently open.
    bold: bool,
    /// A `*` (italic) marker is currently open.
    italic: bool,
    /// A `~~` (strikethrough) marker is currently open.
    strike: bool,
}
2729
2730fn render_text_runs_markdown(runs: &[(&str, bool, bool, bool)]) -> String {
2731    let inactive = MarkdownMarkerState::default();
2732    let mut active = inactive;
2733    let mut output = String::new();
2734    for (text, bold, italic, strike) in runs {
2735        let next = MarkdownMarkerState {
2736            bold: *bold,
2737            italic: *italic,
2738            strike: *strike,
2739        };
2740        let mut start = 0usize;
2741        for (offset, ch) in text.char_indices() {
2742            if ch != '\n' {
2743                continue;
2744            }
2745            if offset > start {
2746                output.push_str(&markdown_marker_transition(active, next));
2747                output.push_str(&text[start..offset]);
2748                active = next;
2749            }
2750            output.push_str(&markdown_marker_transition(active, inactive));
2751            output.push('\n');
2752            active = inactive;
2753            start = offset + ch.len_utf8();
2754        }
2755        if start < text.len() {
2756            output.push_str(&markdown_marker_transition(active, next));
2757            output.push_str(&text[start..]);
2758            active = next;
2759        }
2760    }
2761    output.push_str(&markdown_marker_transition(active, inactive));
2762    output
2763}
2764
2765fn markdown_marker_transition(active: MarkdownMarkerState, next: MarkdownMarkerState) -> String {
2766    let mut markers = String::new();
2767    if active.strike && !next.strike {
2768        markers.push_str("~~");
2769    }
2770    if active.italic && !next.italic {
2771        markers.push('*');
2772    }
2773    if active.bold && !next.bold {
2774        markers.push_str("**");
2775    }
2776    if !active.bold && next.bold {
2777        markers.push_str("**");
2778    }
2779    if !active.italic && next.italic {
2780        markers.push('*');
2781    }
2782    if !active.strike && next.strike {
2783        markers.push_str("~~");
2784    }
2785    markers
2786}
2787
2788fn render_blocks_html(blocks: &[CapturedBlock]) -> String {
2789    format!(
2790        "<!doctype html><html><body>{}</body></html>",
2791        blocks
2792            .iter()
2793            .map(|block| match block {
2794                CapturedBlock::Paragraph {
2795                    content,
2796                    style,
2797                    list,
2798                    quote,
2799                    horizontal_rule,
2800                } => {
2801                    if *horizontal_rule {
2802                        "<hr>".to_string()
2803                    } else if let Some(list) = list {
2804                        let tag = if list.ordered { "ol" } else { "ul" };
2805                        format!("<{tag}><li>{}</li></{tag}>", render_content_html(content))
2806                    } else if *quote {
2807                        format!("<blockquote>{}</blockquote>", render_content_html(content))
2808                    } else {
2809                        let tag = paragraph_tag(style.as_deref());
2810                        format!("<{tag}>{}</{tag}>", render_content_html(content))
2811                    }
2812                }
2813                CapturedBlock::Table(table) => render_table_html(table),
2814            })
2815            .collect::<String>()
2816    )
2817}
2818
2819fn render_table_html(table: &TableBlock) -> String {
2820    let mut html = String::from("<table>");
2821    for row in &table.rows {
2822        html.push_str("<tr>");
2823        for cell in &row.cells {
2824            html.push_str("<td>");
2825            html.push_str(&render_content_html(&cell.content));
2826            html.push_str("</td>");
2827        }
2828        html.push_str("</tr>");
2829    }
2830    html.push_str("</table>");
2831    html
2832}
2833
2834fn render_content_html(content: &[ContentNode]) -> String {
2835    content
2836        .iter()
2837        .map(|node| match node {
2838            ContentNode::Text {
2839                text,
2840                bold,
2841                italic,
2842                strike,
2843                link,
2844            } => render_marked_html(text, *bold, *italic, *strike, link.as_deref()),
2845            ContentNode::Image {
2846                url: Some(url),
2847                alt,
2848                width,
2849                height,
2850                ..
2851            } => render_image_html(url, alt, width.as_deref(), height.as_deref()),
2852            ContentNode::Image { .. } => String::new(),
2853        })
2854        .collect()
2855}
2856
2857fn render_image_html(url: &str, alt: &str, width: Option<&str>, height: Option<&str>) -> String {
2858    let mut html = format!(
2859        "<img src=\"{}\" alt=\"{}\"",
2860        escape_html(url),
2861        escape_html(alt)
2862    );
2863    if let Some(width) = width.filter(|value| !value.is_empty()) {
2864        let _ = write!(html, " width=\"{}\"", escape_html(width));
2865    }
2866    if let Some(height) = height.filter(|value| !value.is_empty()) {
2867        let _ = write!(html, " height=\"{}\"", escape_html(height));
2868    }
2869    html.push('>');
2870    html
2871}
2872
2873fn render_marked_html(
2874    text: &str,
2875    bold: bool,
2876    italic: bool,
2877    strike: bool,
2878    link: Option<&str>,
2879) -> String {
2880    text.split('\n')
2881        .map(|segment| render_marked_html_segment(segment, bold, italic, strike, link))
2882        .collect::<Vec<_>>()
2883        .join("<br>")
2884}
2885
2886fn render_marked_html_segment(
2887    text: &str,
2888    bold: bool,
2889    italic: bool,
2890    strike: bool,
2891    link: Option<&str>,
2892) -> String {
2893    if text.is_empty() {
2894        return String::new();
2895    }
2896    let mut output = escape_html(text);
2897    if bold {
2898        output = format!("<strong>{output}</strong>");
2899    }
2900    if italic {
2901        output = format!("<em>{output}</em>");
2902    }
2903    if strike {
2904        output = format!("<s>{output}</s>");
2905    }
2906    if let Some(link) = link {
2907        output = format!("<a href=\"{}\">{output}</a>", escape_html(link));
2908    }
2909    output
2910}
2911
/// Map a paragraph style name to the HTML tag used to render it.
///
/// `TITLE`/`HEADING_1` map to `h1`, `SUBTITLE`/`HEADING_2` to `h2`, and
/// `HEADING_3` through `HEADING_6` to `h3`–`h6`. A missing or unrecognised
/// style falls back to `p`.
fn paragraph_tag(style: Option<&str>) -> &'static str {
    let Some(named_style) = style else {
        return "p";
    };
    match named_style {
        "TITLE" | "HEADING_1" => "h1",
        "SUBTITLE" | "HEADING_2" => "h2",
        "HEADING_3" => "h3",
        "HEADING_4" => "h4",
        "HEADING_5" => "h5",
        "HEADING_6" => "h6",
        _ => "p",
    }
}
2923
2924fn blocks_to_text(blocks: &[CapturedBlock]) -> String {
2925    blocks
2926        .iter()
2927        .map(|block| match block {
2928            CapturedBlock::Paragraph { content, .. } => content_to_text(content),
2929            CapturedBlock::Table(table) => table
2930                .rows
2931                .iter()
2932                .map(|row| {
2933                    row.cells
2934                        .iter()
2935                        .map(|cell| content_to_text(&cell.content))
2936                        .collect::<Vec<_>>()
2937                        .join("\t")
2938                })
2939                .collect::<Vec<_>>()
2940                .join("\n"),
2941        })
2942        .filter(|text| !text.is_empty())
2943        .collect::<Vec<_>>()
2944        .join("\n")
2945}
2946
2947fn content_to_text(content: &[ContentNode]) -> String {
2948    content
2949        .iter()
2950        .map(|node| match node {
2951            ContentNode::Text { text, .. } => text.clone(),
2952            ContentNode::Image {
2953                url: Some(_), alt, ..
2954            } => format!("[{alt}]"),
2955            ContentNode::Image { .. } => String::new(),
2956        })
2957        .collect()
2958}
2959
/// Escape the five HTML-significant characters (`&`, `<`, `>`, `"`, `'`) so
/// the value can be placed in element content or a quoted attribute.
///
/// Existing entities are not recognised: an input `&amp;` becomes
/// `&amp;amp;`.
fn escape_html(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
2968
/// Make rendered cell content safe to embed in a Markdown pipe table.
///
/// `|` is backslash-escaped so it is not read as a column separator, and
/// each newline is replaced by `<br>` so the cell stays on one line.
fn escape_markdown_table_cell(value: &str) -> String {
    let mut cell = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '|' => cell.push_str("\\|"),
            '\n' => cell.push_str("<br>"),
            other => cell.push(other),
        }
    }
    cell
}
2972
/// Extract a Bearer token from an Authorization header value.
///
/// Surrounding whitespace is ignored and the `Bearer` scheme name is matched
/// ASCII case-insensitively (HTTP authentication scheme names are
/// case-insensitive), so `Bearer x`, `bearer x` and `BEARER x` are all
/// accepted. The scheme must be followed by a single space; any further
/// whitespace around the token is trimmed.
///
/// Returns `None` if the header does not use the Bearer scheme or if the
/// token is empty.
#[must_use]
pub fn extract_bearer_token(auth_header: &str) -> Option<&str> {
    const SCHEME_PREFIX: &str = "bearer ";

    let trimmed = auth_header.trim();
    // `get` returns `None` both when the header is too short and when the
    // cut would fall inside a multi-byte character, so this cannot panic.
    let scheme = trimmed.get(..SCHEME_PREFIX.len())?;
    if !scheme.eq_ignore_ascii_case(SCHEME_PREFIX) {
        return None;
    }
    let token = trimmed[SCHEME_PREFIX.len()..].trim();
    if token.is_empty() {
        None
    } else {
        Some(token)
    }
}
2985
/// An image pulled out of a captured document so it can be stored as a
/// separate file.
///
/// Values are produced both from base64 data URIs embedded in exported HTML
/// and from remote image URLs downloaded for archive output.
#[derive(Debug, Clone)]
pub struct ExtractedImage {
    /// Local filename (e.g., "image-01.png"); documents reference it as
    /// `images/<filename>`.
    pub filename: String,
    /// Raw image bytes
    pub data: Vec<u8>,
    /// MIME type (e.g., "image/png")
    pub mime_type: String,
}
2996
/// Result of fetching a Google Doc as an archive.
///
/// Holds every component that `create_archive_zip` writes into the ZIP:
/// the Markdown and HTML renditions plus the image files they reference.
#[derive(Debug, Clone)]
pub struct GDocsArchiveResult {
    /// HTML content with local image paths (`images/<filename>`)
    pub html: String,
    /// Markdown content with local image paths (`images/<filename>`)
    pub markdown: String,
    /// Extracted images, stored in the archive under `images/`
    pub images: Vec<ExtractedImage>,
    /// Document ID
    pub document_id: String,
    /// Export URL used
    pub export_url: String,
}
3011
3012/// Build a self-contained archive result from browser-model rendered output.
3013///
3014/// `DOCS_modelChunk` image nodes point at `docs-images-rt` URLs. Archive mode
3015/// downloads those URLs into `images/` and rewrites markdown/html references to
3016/// local paths so Rust browser capture matches the JavaScript archive path.
3017///
3018/// # Errors
3019///
3020/// Returns an error if the HTTP client cannot be created or an image response
3021/// body cannot be read. Individual failed image downloads are logged and left
3022/// out of the archive, matching the JS behavior.
3023pub async fn localize_rendered_remote_images_for_archive(
3024    rendered: &GDocsRenderedResult,
3025) -> crate::Result<GDocsArchiveResult> {
3026    let client = reqwest::Client::builder().build().map_err(|error| {
3027        WebCaptureError::FetchError(format!("Failed to create image download client: {error}"))
3028    })?;
3029    let mut seen = HashMap::new();
3030    let mut images = Vec::new();
3031    let mut next_index = 1usize;
3032
3033    for image in &rendered.remote_images {
3034        if seen.contains_key(&image.url) {
3035            continue;
3036        }
3037        let filename = remote_image_filename(&image.url, next_index);
3038        next_index += 1;
3039        seen.insert(image.url.clone(), filename.clone());
3040
3041        match client
3042            .get(&image.url)
3043            .header("User-Agent", GDOCS_USER_AGENT)
3044            .header("Accept", "image/*,*/*;q=0.8")
3045            .send()
3046            .await
3047        {
3048            Ok(response) if response.status().is_success() => {
3049                let mime_type = response
3050                    .headers()
3051                    .get(reqwest::header::CONTENT_TYPE)
3052                    .and_then(|value| value.to_str().ok())
3053                    .map_or_else(|| mime_type_for_filename(&filename), ToString::to_string);
3054                let data = response.bytes().await.map_err(|error| {
3055                    WebCaptureError::FetchError(format!(
3056                        "Failed to read Google Docs image {}: {error}",
3057                        image.url
3058                    ))
3059                })?;
3060                debug!(
3061                    url = %image.url,
3062                    filename = %filename,
3063                    bytes = data.len(),
3064                    mime_type = %mime_type,
3065                    "downloaded Google Docs browser-model archive image"
3066                );
3067                images.push(ExtractedImage {
3068                    filename,
3069                    data: data.to_vec(),
3070                    mime_type,
3071                });
3072            }
3073            Ok(response) => {
3074                warn!(
3075                    url = %image.url,
3076                    status = response.status().as_u16(),
3077                    "failed to download Google Docs browser-model archive image"
3078                );
3079            }
3080            Err(error) => {
3081                warn!(
3082                    url = %image.url,
3083                    error = %error,
3084                    "failed to download Google Docs browser-model archive image"
3085                );
3086            }
3087        }
3088    }
3089
3090    let mut markdown = rendered.markdown.clone();
3091    let mut html = rendered.html.clone();
3092    for (url, filename) in seen {
3093        let local_path = format!("images/{filename}");
3094        markdown = markdown.replace(&url, &local_path);
3095        html = html.replace(&url, &local_path);
3096    }
3097
3098    Ok(GDocsArchiveResult {
3099        html,
3100        markdown,
3101        images,
3102        document_id: rendered.document_id.clone(),
3103        export_url: rendered.export_url.clone(),
3104    })
3105}
3106
3107fn remote_image_filename(url: &str, index: usize) -> String {
3108    let ext = crate::localize_images::get_extension_from_url(url);
3109    format!("image-{index:02}{ext}")
3110}
3111
/// Guess an image MIME type from a filename's extension.
///
/// The text after the last `.` (or the whole name when there is no `.`) is
/// compared case-insensitively against the known extensions. Anything
/// unrecognised is reported as `image/png`.
fn mime_type_for_filename(filename: &str) -> String {
    let extension = match filename.rsplit_once('.') {
        Some((_, after_dot)) => after_dot,
        None => filename,
    };
    let mime = match extension.to_lowercase().as_str() {
        "jpg" | "jpeg" => "image/jpeg",
        "gif" => "image/gif",
        "webp" => "image/webp",
        "svg" => "image/svg+xml",
        _ => "image/png",
    };
    String::from(mime)
}
3128
3129fn base64_image_pattern() -> &'static Regex {
3130    static PATTERN: OnceLock<Regex> = OnceLock::new();
3131    PATTERN.get_or_init(|| {
3132        Regex::new(
3133            r#"(<img\s[^>]*src=")data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([^"]+)(")"#,
3134        )
3135        .unwrap()
3136    })
3137}
3138
3139/// Extract base64 data URI images from HTML content.
3140///
3141/// Google Docs HTML exports embed images as base64 data URIs.
3142/// This function extracts them and replaces with local file paths.
3143///
3144/// # Arguments
3145///
3146/// * `html` - HTML content with embedded base64 images
3147///
3148/// # Returns
3149///
3150/// Tuple of (updated HTML with local paths, extracted images)
3151#[must_use]
3152pub fn extract_base64_images(html: &str) -> (String, Vec<ExtractedImage>) {
3153    let mut images = Vec::new();
3154    let mut idx = 1u32;
3155
3156    let updated_html = base64_image_pattern()
3157        .replace_all(html, |caps: &regex::Captures<'_>| {
3158            let prefix = &caps[1];
3159            let mime_ext = &caps[2];
3160            let base64_data = &caps[3];
3161            let suffix = &caps[4];
3162
3163            let ext = match mime_ext {
3164                "jpeg" => "jpg",
3165                "svg+xml" => "svg",
3166                other => other,
3167            };
3168
3169            let filename = format!("image-{idx:02}.{ext}");
3170            let mime_type = format!("image/{mime_ext}");
3171
3172            if let Ok(data) = base64::engine::general_purpose::STANDARD.decode(base64_data) {
3173                debug!("Extracted image: {} ({} bytes)", filename, data.len());
3174                images.push(ExtractedImage {
3175                    filename: filename.clone(),
3176                    data,
3177                    mime_type,
3178                });
3179            }
3180
3181            idx += 1;
3182            format!("{prefix}images/{filename}{suffix}")
3183        })
3184        .into_owned();
3185
3186    (updated_html, images)
3187}
3188
3189/// Fetch a Google Docs document as a ZIP archive.
3190///
3191/// Fetches the document as HTML, extracts embedded base64 images,
3192/// converts to Markdown, and returns all components ready for archiving.
3193///
3194/// The archive contains:
3195/// - `document.md` — Markdown version
3196/// - `document.html` — HTML version with local image paths
3197/// - `images/` — extracted images
3198///
3199/// # Arguments
3200///
3201/// * `url` - Google Docs URL
3202/// * `api_token` - Optional API token for private documents
3203///
3204/// # Errors
3205///
3206/// Returns an error if the fetch or conversion fails.
3207pub async fn fetch_google_doc_as_archive(
3208    url: &str,
3209    api_token: Option<&str>,
3210) -> crate::Result<GDocsArchiveResult> {
3211    let result = fetch_google_doc(url, "html", api_token).await?;
3212
3213    let preprocess = preprocess_google_docs_export_html(&result.content);
3214    debug!(
3215        document_id = %result.document_id,
3216        hoisted = preprocess.hoisted,
3217        unwrapped_links = preprocess.unwrapped_links,
3218        "google-docs-export pre-processor rewrote archive markup"
3219    );
3220
3221    let (local_html, images) = extract_base64_images(&preprocess.html);
3222
3223    let markdown = normalize_google_docs_export_markdown(
3224        &crate::markdown::convert_html_to_markdown(&local_html, None)?,
3225    );
3226
3227    debug!(
3228        "Archive prepared: {} images extracted, {} bytes HTML, {} bytes Markdown",
3229        images.len(),
3230        local_html.len(),
3231        markdown.len()
3232    );
3233
3234    Ok(GDocsArchiveResult {
3235        html: local_html,
3236        markdown,
3237        images,
3238        document_id: result.document_id,
3239        export_url: result.export_url,
3240    })
3241}
3242
3243/// Create a ZIP archive from a `GDocsArchiveResult`.
3244///
3245/// # Arguments
3246///
3247/// * `archive` - The archive result to bundle
3248/// * `pretty_html` - Whether to pretty-print the HTML output
3249///
3250/// # Errors
3251///
3252/// Returns an error if ZIP creation fails.
3253pub fn create_archive_zip(
3254    archive: &GDocsArchiveResult,
3255    pretty_html: bool,
3256) -> crate::Result<Vec<u8>> {
3257    let mut buf = std::io::Cursor::new(Vec::new());
3258
3259    {
3260        let mut zip = zip::ZipWriter::new(&mut buf);
3261        let options = zip::write::SimpleFileOptions::default()
3262            .compression_method(zip::CompressionMethod::Deflated);
3263
3264        zip.start_file("document.md", options)
3265            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3266        zip.write_all(archive.markdown.as_bytes())?;
3267
3268        let html_output = if pretty_html {
3269            crate::html::pretty_print_html(&archive.html)
3270        } else {
3271            archive.html.clone()
3272        };
3273        zip.start_file("document.html", options)
3274            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3275        zip.write_all(html_output.as_bytes())?;
3276
3277        for img in &archive.images {
3278            zip.start_file(format!("images/{}", img.filename), options)
3279                .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3280            zip.write_all(&img.data)?;
3281        }
3282
3283        zip.finish()
3284            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3285    }
3286
3287    Ok(buf.into_inner())
3288}
web_capture/gdocs.rs

web_capture/
gdocs.rs