Skip to main content

web_capture/
gdocs.rs

1//! Google Docs capture module.
2//!
3//! Supports API-based capture of Google Docs documents via the export URL pattern:
4//! `https://docs.google.com/document/d/{DOCUMENT_ID}/export?format={FORMAT}`
5//!
6//! # Supported Export Formats
7//!
8//! - `html` — HTML document (images as base64 data URIs)
9//! - `txt` — Plain text
10//! - `md` — Markdown (native Google Docs export)
11//! - `pdf` — PDF document
12//! - `docx` — Microsoft Word document
13//! - `epub` — EPUB ebook format
14//!
15//! # Example
16//!
17//! ```rust,no_run
18//! use web_capture::gdocs;
19//!
20//! #[tokio::main]
21//! async fn main() -> anyhow::Result<()> {
22//!     let url = "https://docs.google.com/document/d/abc123/edit";
23//!     if gdocs::is_google_docs_url(url) {
24//!         let result = gdocs::fetch_google_doc(url, "html", None).await?;
25//!         println!("Content length: {}", result.content.len());
26//!     }
27//!     Ok(())
28//! }
29//! ```
30
31use base64::Engine;
32use regex::Regex;
33use serde_json::Value;
34use std::collections::HashMap;
35use std::fmt::Write as _;
36use std::hash::BuildHasher;
37use std::io::Write;
38use std::sync::OnceLock;
39use std::time::Duration;
40use tracing::{debug, info, warn};
41
42use crate::WebCaptureError;
43
/// Base URL shared by the Google Docs `/export` and `/edit` routes.
const GDOCS_EXPORT_BASE: &str = "https://docs.google.com/document/d";
/// Base URL of the Google Docs REST API `documents` resource.
const GDOCS_API_BASE: &str = "https://docs.googleapis.com/v1/documents";
/// Time budget for rendering the editor page in a real browser.
/// Compiled out on Windows, where only the HTTP fetch path is used.
#[cfg(not(windows))]
const GDOCS_EDITOR_BROWSER_TIMEOUT: Duration = Duration::from_secs(15);
/// Time budget for the plain HTTP fetch of the editor page.
const GDOCS_EDITOR_HTTP_TIMEOUT: Duration = Duration::from_secs(20);
49
50fn gdocs_url_pattern() -> &'static Regex {
51    static PATTERN: OnceLock<Regex> = OnceLock::new();
52    PATTERN.get_or_init(|| Regex::new(r"docs\.google\.com/document/d/([a-zA-Z0-9_-]+)").unwrap())
53}
54
/// Result of fetching a Google Docs document through the export endpoint.
///
/// Returned by `fetch_google_doc` and `fetch_google_doc_as_markdown`.
#[derive(Debug, Clone)]
pub struct GDocsResult {
    /// The document content in the requested format.
    pub content: String,
    /// Format label: the caller-supplied format string for `fetch_google_doc`,
    /// or `"markdown"` when produced by `fetch_google_doc_as_markdown`.
    pub format: String,
    /// The document ID extracted from the input URL.
    pub document_id: String,
    /// The export URL that was fetched.
    pub export_url: String,
}
67
/// Google Docs capture backend selected from the CLI `--capture` flag.
///
/// See `select_capture_method` for how the flag value and API token map to a variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GDocsCaptureMethod {
    /// Load `/edit` and extract `DOCS_modelChunk`.
    BrowserModel,
    /// Use the public `/export?format=...` endpoint.
    PublicExport,
    /// Use the authenticated `docs.googleapis.com` REST API.
    DocsApi,
}
78
/// Rendered Google Docs content from either Docs API or editor model data.
///
/// Carries all three renderings of the same capture plus its provenance.
#[derive(Debug, Clone)]
pub struct GDocsRenderedResult {
    /// Markdown output.
    pub markdown: String,
    /// HTML output.
    pub html: String,
    /// Plain text output.
    pub text: String,
    /// The document ID extracted from the input URL.
    pub document_id: String,
    /// Source URL used for capture: the Docs API URL or the `/edit` URL,
    /// depending on which fetch function produced the result.
    pub export_url: String,
}
93
/// Parsed Google Docs model/document capture.
///
/// All fields default to empty via `Default`.
#[derive(Debug, Clone, Default)]
pub struct CapturedDocument {
    /// Ordered document blocks.
    pub blocks: Vec<CapturedBlock>,
    /// Tables extracted from `blocks` for compatibility with tests and callers.
    pub tables: Vec<TableBlock>,
    /// Images extracted from model positions.
    pub images: Vec<ContentNode>,
    /// Plain text projection.
    pub text: String,
}
106
/// A top-level block of a captured document: either a paragraph-like block or a table.
#[derive(Debug, Clone)]
pub enum CapturedBlock {
    /// Paragraph-like block.
    Paragraph {
        /// Inline content of the paragraph (text runs and images).
        content: Vec<ContentNode>,
        /// Optional Google Docs named style.
        style: Option<String>,
        /// Optional list metadata; `None` when the paragraph is not a list item.
        list: Option<ListMeta>,
        /// Whether paragraph is a blockquote.
        quote: bool,
        /// Whether paragraph is a horizontal rule.
        horizontal_rule: bool,
    },
    /// Table block.
    Table(TableBlock),
}
126
/// Captured table: an ordered list of rows.
#[derive(Debug, Clone, Default)]
pub struct TableBlock {
    /// Table rows, in document order.
    pub rows: Vec<TableRow>,
}
133
/// Captured table row: an ordered list of cells.
#[derive(Debug, Clone, Default)]
pub struct TableRow {
    /// Row cells, in document order.
    pub cells: Vec<TableCell>,
}
140
/// Captured table cell holding flattened inline content.
#[derive(Debug, Clone, Default)]
pub struct TableCell {
    /// Inline content of the cell (text runs and images).
    pub content: Vec<ContentNode>,
}
147
/// Captured inline content node: a styled text run or an image placeholder.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentNode {
    /// Text run with uniform styling.
    Text {
        /// Text content.
        text: String,
        /// Bold text style.
        bold: bool,
        /// Italic text style.
        italic: bool,
        /// Strikethrough text style.
        strike: bool,
        /// Optional hyperlink target.
        link: Option<String>,
    },
    /// Image placeholder.
    Image {
        /// Content ID from Google Docs model data.
        cid: Option<String>,
        /// Resolved image URL, when one is known.
        url: Option<String>,
        /// Alt text.
        alt: String,
        /// Whether this image came from a suggested edit.
        is_suggestion: bool,
    },
}
176
/// Inline text styling flags; mirrors the style fields of `ContentNode::Text`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct TextStyle {
    /// Bold text style.
    bold: bool,
    /// Italic text style.
    italic: bool,
    /// Strikethrough text style.
    strike: bool,
    /// Optional hyperlink target.
    link: Option<String>,
}
184
/// Paragraph-level metadata; mirrors the non-content fields of `CapturedBlock::Paragraph`.
#[derive(Debug, Clone, Default)]
struct ParagraphMeta {
    /// Optional Google Docs named style.
    style: Option<String>,
    /// Optional list metadata.
    list: Option<ListMeta>,
    /// Whether the paragraph is a blockquote.
    quote: bool,
    /// Whether the paragraph is a horizontal rule.
    horizontal_rule: bool,
}
192
/// List membership metadata attached to a paragraph that is a list item.
#[derive(Debug, Clone)]
pub struct ListMeta {
    /// Google Docs list identifier.
    pub id: String,
    /// Nesting level, zero-based.
    pub level: usize,
    /// Whether Markdown should render this list item with an ordered marker.
    pub ordered: bool,
}
202
/// Paragraph style record collected while parsing editor model data.
#[derive(Debug, Clone)]
struct ParagraphStyle {
    /// Optional Google Docs named style.
    style: Option<String>,
    /// Start indent of the paragraph (presumably in points; TODO confirm against the parser).
    indent_start: f64,
    /// First-line indent of the paragraph (presumably in points; TODO confirm against the parser).
    indent_first_line: f64,
}
209
/// Style lookup tables built from editor model data.
///
/// NOTE(review): the `usize` keys look like paragraph end offsets into the model
/// text (per the `_by_end` naming); verify against the model parser.
#[derive(Debug, Clone, Default)]
struct ModelStyleMaps {
    /// Inline text styles (presumably indexed by text position; TODO confirm).
    inline_styles: Vec<TextStyle>,
    /// Paragraph styles keyed by offset.
    paragraph_by_end: HashMap<usize, ParagraphStyle>,
    /// List metadata keyed by offset.
    list_by_end: HashMap<usize, ListMeta>,
    /// Offsets at which a horizontal rule occurs.
    horizontal_rules: std::collections::HashSet<usize>,
}
217
218/// Check if a URL is a Google Docs document URL.
219#[must_use]
220pub fn is_google_docs_url(url: &str) -> bool {
221    gdocs_url_pattern().is_match(url)
222}
223
224/// Extract the document ID from a Google Docs URL.
225///
226/// Returns `None` if the URL is not a valid Google Docs URL.
227#[must_use]
228pub fn extract_document_id(url: &str) -> Option<String> {
229    gdocs_url_pattern()
230        .captures(url)
231        .and_then(|caps| caps.get(1))
232        .map(|m| m.as_str().to_string())
233}
234
235/// Build a Google Docs export URL.
236///
237/// # Arguments
238///
239/// * `document_id` - The Google Docs document ID
240/// * `format` - Export format (html, txt, md, pdf, docx, epub)
241#[must_use]
242pub fn build_export_url(document_id: &str, format: &str) -> String {
243    let export_format = match format {
244        "html" | "txt" | "md" | "pdf" | "docx" | "epub" | "zip" => format,
245        _ => "html",
246    };
247    format!("{GDOCS_EXPORT_BASE}/{document_id}/export?format={export_format}")
248}
249
250/// Build a Google Docs editor URL.
251#[must_use]
252pub fn build_edit_url(document_id: &str) -> String {
253    format!("{GDOCS_EXPORT_BASE}/{document_id}/edit")
254}
255
256/// Build a Google Docs REST API URL.
257#[must_use]
258pub fn build_docs_api_url(document_id: &str) -> String {
259    format!("{GDOCS_API_BASE}/{document_id}")
260}
261
262/// Select a Google Docs capture backend from the CLI `--capture` value.
263///
264/// # Errors
265///
266/// Returns an error when `capture` is neither `browser` nor `api`.
267pub fn select_capture_method(
268    capture: &str,
269    api_token: Option<&str>,
270) -> crate::Result<GDocsCaptureMethod> {
271    match capture.to_lowercase().as_str() {
272        "browser" => Ok(GDocsCaptureMethod::BrowserModel),
273        "api" if api_token.is_some() => Ok(GDocsCaptureMethod::DocsApi),
274        "api" => Ok(GDocsCaptureMethod::PublicExport),
275        other => Err(WebCaptureError::InvalidUrl(format!(
276            "Unsupported Google Docs capture method \"{other}\". Use \"browser\" or \"api\"."
277        ))),
278    }
279}
280
281/// Fetch a Google Docs document via the export URL.
282///
283/// For public documents, pass `None` for `api_token`.
284/// For private documents, pass a Bearer token string.
285///
286/// # Arguments
287///
288/// * `url` - Google Docs URL (edit URL or any URL containing the document ID)
289/// * `format` - Export format (html, txt, md, pdf, docx, epub)
290/// * `api_token` - Optional API token for private documents
291///
292/// # Errors
293///
294/// Returns an error if the URL is not a valid Google Docs URL, or if the fetch fails.
295pub async fn fetch_google_doc(
296    url: &str,
297    format: &str,
298    api_token: Option<&str>,
299) -> crate::Result<GDocsResult> {
300    let document_id = extract_document_id(url).ok_or_else(|| {
301        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
302    })?;
303
304    let export_url = build_export_url(&document_id, format);
305    debug!(
306        document_id = %document_id,
307        format = %format,
308        export_url = %export_url,
309        has_api_token = api_token.is_some(),
310        "fetching Google Doc via public export"
311    );
312
313    let mut request = reqwest::Client::new()
314        .get(&export_url)
315        .header(
316            "User-Agent",
317            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
318        )
319        .header("Accept-Charset", "utf-8")
320        .header("Accept-Language", "en-US,en;q=0.9");
321
322    if let Some(token) = api_token {
323        request = request.header("Authorization", format!("Bearer {token}"));
324    }
325
326    let response = request
327        .send()
328        .await
329        .map_err(|e| WebCaptureError::FetchError(format!("Failed to fetch Google Doc: {e}")))?;
330    debug!(
331        document_id = %document_id,
332        status = response.status().as_u16(),
333        success = response.status().is_success(),
334        content_type = response
335            .headers()
336            .get(reqwest::header::CONTENT_TYPE)
337            .and_then(|value| value.to_str().ok())
338            .unwrap_or(""),
339        "received Google Docs public export response"
340    );
341
342    if !response.status().is_success() {
343        return Err(WebCaptureError::FetchError(format!(
344            "Failed to fetch Google Doc ({} {}): {}",
345            response.status().as_u16(),
346            response.status().canonical_reason().unwrap_or("Unknown"),
347            export_url
348        )));
349    }
350
351    let raw_content = response.text().await.map_err(|e| {
352        WebCaptureError::FetchError(format!("Failed to read Google Doc response: {e}"))
353    })?;
354    debug!(
355        document_id = %document_id,
356        bytes = raw_content.len(),
357        "read Google Docs public export body"
358    );
359
360    // Decode HTML entities to unicode for text-based formats
361    let content = match format {
362        "html" | "txt" | "md" => crate::html::decode_html_entities(&raw_content),
363        _ => raw_content,
364    };
365
366    Ok(GDocsResult {
367        content,
368        format: format.to_string(),
369        document_id,
370        export_url,
371    })
372}
373
374/// Fetch a Google Docs document and convert to Markdown.
375///
376/// Fetches the document as HTML, then converts to Markdown using the
377/// existing HTML-to-Markdown pipeline.
378///
379/// # Arguments
380///
381/// * `url` - Google Docs URL
382/// * `api_token` - Optional API token for private documents
383///
384/// # Errors
385///
386/// Returns an error if the fetch or conversion fails.
387pub async fn fetch_google_doc_as_markdown(
388    url: &str,
389    api_token: Option<&str>,
390) -> crate::Result<GDocsResult> {
391    let result = fetch_google_doc(url, "html", api_token).await?;
392
393    let preprocess = preprocess_google_docs_export_html(&result.content);
394    debug!(
395        document_id = %result.document_id,
396        hoisted = preprocess.hoisted,
397        unwrapped_links = preprocess.unwrapped_links,
398        "google-docs-export pre-processor rewrote markup"
399    );
400    let markdown =
401        crate::markdown::convert_html_to_markdown(&preprocess.html, Some(&result.export_url))?;
402    debug!(
403        document_id = %result.document_id,
404        bytes = markdown.len(),
405        "rendered Google Docs public export markdown"
406    );
407
408    Ok(GDocsResult {
409        content: markdown,
410        format: "markdown".to_string(),
411        document_id: result.document_id,
412        export_url: result.export_url,
413    })
414}
415
/// Result of running the Google Docs export HTML pre-processor.
///
/// Exposes the rewritten HTML alongside counters that are useful for debug
/// logging (`gdocs.export.style-hoist`). See issue #92 R6.
#[derive(Debug, Clone)]
pub struct GDocsExportPreprocessResult {
    /// Rewritten HTML.
    pub html: String,
    /// Number of spans (inline-style and class-styled) turned into
    /// `<strong>`/`<em>`/`<del>`.
    pub hoisted: usize,
    /// Number of `google.com/url?q=` redirect wrappers unwrapped.
    pub unwrapped_links: usize,
}
429
430/// Pre-process Google Docs export HTML so the generic `html2md` pipeline
431/// preserves inline formatting, heading numbering, and link targets.
432///
433/// Google Drive serves bold/italic/strikethrough as inline style spans and
434/// wraps every link through a `google.com/url?q=` redirect, both of which
435/// the generic converter would otherwise discard. This function rewrites
436/// those constructs into semantic HTML before conversion.
437#[must_use]
438pub fn preprocess_google_docs_export_html(html: &str) -> GDocsExportPreprocessResult {
439    let mut hoisted: usize = 0;
440    let mut unwrapped_links: usize = 0;
441    let class_styles = extract_css_class_styles(html);
442
443    let mut out = hoist_inline_style_spans(html, &mut hoisted);
444    out = hoist_class_style_spans(&out, &class_styles, &mut hoisted);
445    out = convert_class_indented_blockquotes(&out, &class_styles);
446    out = strip_google_docs_heading_noise(&out);
447    out = unwrap_google_redirect_links(&out, &mut unwrapped_links);
448    out = out.replace("&nbsp;", " ");
449    out = out.replace('\u{00A0}', " ");
450
451    GDocsExportPreprocessResult {
452        html: out,
453        hoisted,
454        unwrapped_links,
455    }
456}
457
458fn hoist_inline_style_spans(html: &str, hoisted: &mut usize) -> String {
459    let span_re = Regex::new(r#"(?is)<span\s+([^>]*style="([^"]*)"[^>]*)>(.*?)</span>"#)
460        .expect("valid regex");
461    span_re
462        .replace_all(html, |caps: &regex::Captures<'_>| {
463            let style = caps.get(2).map_or("", |m| m.as_str());
464            let inner = caps.get(3).map_or("", |m| m.as_str());
465            semantic_wrapped_html(inner, style).map_or_else(
466                || caps[0].to_string(),
467                |wrapped| {
468                    *hoisted += 1;
469                    wrapped
470                },
471            )
472        })
473        .into_owned()
474}
475
476fn hoist_class_style_spans(
477    html: &str,
478    class_styles: &HashMap<String, String>,
479    hoisted: &mut usize,
480) -> String {
481    let class_span_re = Regex::new(r#"(?is)<span\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</span>"#)
482        .expect("valid regex");
483    class_span_re
484        .replace_all(html, |caps: &regex::Captures<'_>| {
485            let class_attr = caps.get(2).map_or("", |m| m.as_str());
486            let inner = caps.get(3).map_or("", |m| m.as_str());
487            let style = combined_class_style(class_styles, class_attr);
488            semantic_wrapped_html(inner, &style).map_or_else(
489                || caps[0].to_string(),
490                |wrapped| {
491                    *hoisted += 1;
492                    wrapped
493                },
494            )
495        })
496        .into_owned()
497}
498
499fn convert_class_indented_blockquotes(
500    html: &str,
501    class_styles: &HashMap<String, String>,
502) -> String {
503    let class_paragraph_re =
504        Regex::new(r#"(?is)<p\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</p>"#).expect("valid regex");
505    class_paragraph_re
506        .replace_all(html, |caps: &regex::Captures<'_>| {
507            let class_attr = caps.get(2).map_or("", |m| m.as_str());
508            let inner = caps.get(3).map_or("", |m| m.as_str());
509            let style = combined_class_style(class_styles, class_attr);
510            if is_blockquote_style(&style) {
511                format!("<blockquote><p>{inner}</p></blockquote>")
512            } else {
513                caps[0].to_string()
514            }
515        })
516        .into_owned()
517}
518
519fn strip_google_docs_heading_noise(html: &str) -> String {
520    let empty_anchor_re = Regex::new(r#"(?is)<a\s+id="[^"]*"\s*>\s*</a>"#).expect("valid regex");
521    let numbering_re =
522        Regex::new(r"(?is)<span\b[^>]*>\s*\d+(?:\.\d+)*\.?\s*</span>").expect("valid regex");
523    let mut out = empty_anchor_re.replace_all(html, "").into_owned();
524    for level in 1..=6 {
525        let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
526            .expect("valid regex");
527        out = heading_re
528            .replace_all(&out, |caps: &regex::Captures<'_>| {
529                let open = &caps[1];
530                let inner = &caps[2];
531                let close = &caps[3];
532                let mut cleaned = empty_anchor_re.replace_all(inner, "").into_owned();
533                cleaned = numbering_re.replace_all(&cleaned, "").into_owned();
534                format!("{open}{cleaned}{close}")
535            })
536            .into_owned();
537    }
538    out
539}
540
541fn unwrap_google_redirect_links(html: &str, unwrapped_links: &mut usize) -> String {
542    let redirect_re =
543        Regex::new(r#"(?i)href="https?://(?:www\.)?google\.com/url\?q=([^&"]+)[^"]*""#)
544            .expect("valid regex");
545    redirect_re
546        .replace_all(html, |caps: &regex::Captures<'_>| {
547            let encoded = caps.get(1).map_or("", |m| m.as_str());
548            let decoded = percent_decode_utf8_lossy(encoded);
549            *unwrapped_links += 1;
550            format!(r#"href="{decoded}""#)
551        })
552        .into_owned()
553}
554
555fn extract_css_class_styles(html: &str) -> HashMap<String, String> {
556    let mut class_styles: HashMap<String, String> = HashMap::new();
557    let style_re = Regex::new(r"(?is)<style\b[^>]*>(.*?)</style>").expect("valid regex");
558    let class_re = Regex::new(r"\.([A-Za-z0-9_-]+)\s*\{([^{}]*)\}").expect("valid regex");
559    for style_caps in style_re.captures_iter(html) {
560        let css = style_caps.get(1).map_or("", |m| m.as_str());
561        for class_caps in class_re.captures_iter(css) {
562            let class_name = class_caps.get(1).map_or("", |m| m.as_str());
563            let style = class_caps.get(2).map_or("", |m| m.as_str());
564            class_styles
565                .entry(class_name.to_string())
566                .and_modify(|existing| {
567                    existing.push(';');
568                    existing.push_str(style);
569                })
570                .or_insert_with(|| style.to_string());
571        }
572    }
573    class_styles
574}
575
/// Concatenates the CSS declarations of every class named in `class_attr`.
///
/// Classes are taken in attribute order; each known class contributes
/// `;<declarations>` (note the leading separator) and unknown classes are
/// skipped. Returns an empty string when no class is known.
fn combined_class_style(class_styles: &HashMap<String, String>, class_attr: &str) -> String {
    let mut combined = String::new();
    for class_name in class_attr.split_whitespace() {
        if let Some(style) = class_styles.get(class_name) {
            combined.push(';');
            combined.push_str(style);
        }
    }
    combined
}
586
587fn semantic_wrapped_html(inner: &str, style: &str) -> Option<String> {
588    let bold = css_has_bold(style);
589    let italic = css_has_italic(style);
590    let strike = css_has_strike(style);
591    if !bold && !italic && !strike {
592        return None;
593    }
594    let mut wrapped = inner.to_string();
595    if strike {
596        wrapped = format!("<del>{wrapped}</del>");
597    }
598    if italic {
599        wrapped = format!("<em>{wrapped}</em>");
600    }
601    if bold {
602        wrapped = format!("<strong>{wrapped}</strong>");
603    }
604    Some(wrapped)
605}
606
607fn css_has_bold(style: &str) -> bool {
608    Regex::new(r"(?i)font-weight\s*:\s*(?:bold|[6-9]\d{2})")
609        .expect("valid regex")
610        .is_match(style)
611}
612
613fn css_has_italic(style: &str) -> bool {
614    Regex::new(r"(?i)font-style\s*:\s*italic")
615        .expect("valid regex")
616        .is_match(style)
617}
618
619fn css_has_strike(style: &str) -> bool {
620    Regex::new(r"(?i)text-decoration[^;]*\bline-through\b")
621        .expect("valid regex")
622        .is_match(style)
623}
624
625fn is_blockquote_style(style: &str) -> bool {
626    let margin_left = css_point_value(style, "margin-left");
627    let margin_right = css_point_value(style, "margin-right");
628    margin_left > 0.0 && margin_right > 0.0 && (margin_left - margin_right).abs() < 0.1
629}
630
631fn css_point_value(style: &str, property: &str) -> f64 {
632    let re = Regex::new(&format!(
633        r"(?i){}\s*:\s*(-?\d+(?:\.\d+)?)pt",
634        regex::escape(property)
635    ))
636    .expect("valid regex");
637    re.captures(style)
638        .and_then(|caps| caps.get(1))
639        .and_then(|value| value.as_str().parse::<f64>().ok())
640        .unwrap_or(0.0)
641}
642
/// Decode `%XX` percent escapes in `input`.
///
/// A `%` that is not followed by two hexadecimal digits is copied through
/// unchanged. The decoded bytes are interpreted as UTF-8, with invalid
/// sequences replaced by U+FFFD.
fn percent_decode_utf8_lossy(input: &str) -> String {
    /// Value of a single ASCII hexadecimal digit, if `byte` is one.
    fn hex_value(byte: u8) -> Option<u8> {
        match byte {
            b'0'..=b'9' => Some(byte - b'0'),
            b'a'..=b'f' => Some(byte - b'a' + 10),
            b'A'..=b'F' => Some(byte - b'A' + 10),
            _ => None,
        }
    }

    let mut remaining = input.as_bytes();
    let mut decoded = Vec::with_capacity(remaining.len());
    while let Some((&first, rest)) = remaining.split_first() {
        if first == b'%' {
            if let [hi, lo, tail @ ..] = rest {
                if let (Some(hi), Some(lo)) = (hex_value(*hi), hex_value(*lo)) {
                    decoded.push((hi << 4) | lo);
                    remaining = tail;
                    continue;
                }
            }
        }
        // Not a complete escape: keep the byte as-is and move on by one.
        decoded.push(first);
        remaining = rest;
    }
    String::from_utf8_lossy(&decoded).into_owned()
}
666
667/// Fetch and render a Google Docs document via the authenticated REST API.
668///
669/// # Errors
670///
671/// Returns an error when the URL is invalid, no token is provided, or the API request fails.
672pub async fn fetch_google_doc_from_docs_api(
673    url: &str,
674    api_token: &str,
675) -> crate::Result<GDocsRenderedResult> {
676    let document_id = extract_document_id(url).ok_or_else(|| {
677        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
678    })?;
679    let api_url = build_docs_api_url(&document_id);
680    debug!(
681        document_id = %document_id,
682        api_url = %api_url,
683        "fetching Google Doc via Docs API"
684    );
685
686    let response = reqwest::Client::new()
687        .get(&api_url)
688        .header("Authorization", format!("Bearer {api_token}"))
689        .header("Accept", "application/json")
690        .send()
691        .await
692        .map_err(|e| {
693            WebCaptureError::FetchError(format!("Failed to fetch Google Doc via Docs API: {e}"))
694        })?;
695    debug!(
696        document_id = %document_id,
697        status = response.status().as_u16(),
698        success = response.status().is_success(),
699        content_type = response
700            .headers()
701            .get(reqwest::header::CONTENT_TYPE)
702            .and_then(|value| value.to_str().ok())
703            .unwrap_or(""),
704        "received Google Docs API response"
705    );
706
707    if !response.status().is_success() {
708        return Err(WebCaptureError::FetchError(format!(
709            "Failed to fetch Google Doc via Docs API ({} {}): {}",
710            response.status().as_u16(),
711            response.status().canonical_reason().unwrap_or("Unknown"),
712            api_url
713        )));
714    }
715
716    let body = response.text().await.map_err(|e| {
717        WebCaptureError::FetchError(format!("Failed to read Google Docs API response: {e}"))
718    })?;
719    let document = serde_json::from_str::<Value>(&body).map_err(|e| {
720        WebCaptureError::ParseError(format!("Failed to parse Google Docs API response: {e}"))
721    })?;
722    let rendered = render_docs_api_document(&document);
723    debug!(
724        document_id = %document_id,
725        title = document.get("title").and_then(|value| value.as_str()).unwrap_or(""),
726        markdown_bytes = rendered.markdown.len(),
727        html_bytes = rendered.html.len(),
728        text_bytes = rendered.text.len(),
729        "rendered Google Docs API document"
730    );
731
732    Ok(GDocsRenderedResult {
733        markdown: rendered.markdown,
734        html: rendered.html,
735        text: rendered.text,
736        document_id,
737        export_url: api_url,
738    })
739}
740
741/// Fetch and render the model data embedded in the Google Docs `/edit` route.
742///
743/// # Errors
744///
745/// Returns an error when the URL is invalid, the fetch fails, or no model chunks are present.
746pub async fn fetch_google_doc_from_model(
747    url: &str,
748    api_token: Option<&str>,
749) -> crate::Result<GDocsRenderedResult> {
750    if api_token.is_some() {
751        return Err(WebCaptureError::BrowserError(
752            "Rust browser-model Google Docs capture cannot inject API tokens; use --capture api for authenticated Docs API capture".to_string(),
753        ));
754    }
755    let document_id = extract_document_id(url).ok_or_else(|| {
756        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
757    })?;
758    let edit_url = build_edit_url(&document_id);
759    debug!(
760        document_id = %document_id,
761        edit_url = %edit_url,
762        "capturing Google Doc editor model with a real browser"
763    );
764    let html = fetch_google_doc_editor_html(&edit_url, &document_id).await?;
765    let chunks = extract_model_chunks_from_html(&html);
766    debug!(
767        document_id = %document_id,
768        html_bytes = html.len(),
769        chunks = chunks.len(),
770        "extracted Google Docs editor model chunks"
771    );
772    if chunks.is_empty() {
773        return Err(WebCaptureError::ParseError(
774            "Google Docs editor HTML did not contain DOCS_modelChunk data".to_string(),
775        ));
776    }
777
778    let cid_urls = extract_cid_urls_from_html(&html);
779    let capture = parse_model_chunks(&chunks, &cid_urls);
780    info!(
781        document_id = %document_id,
782        chunks = chunks.len(),
783        cid_urls = cid_urls.len(),
784        blocks = capture.blocks.len(),
785        tables = capture.tables.len(),
786        images = capture.images.len(),
787        text_bytes = capture.text.len(),
788        "parsed Google Docs editor model"
789    );
790
791    Ok(GDocsRenderedResult {
792        markdown: render_captured_document(&capture, "markdown"),
793        html: render_captured_document(&capture, "html"),
794        text: render_captured_document(&capture, "txt"),
795        document_id,
796        export_url: edit_url,
797    })
798}
799
800async fn fetch_google_doc_editor_html(edit_url: &str, document_id: &str) -> crate::Result<String> {
801    #[cfg(windows)]
802    {
803        warn!(
804            document_id = %document_id,
805            "using Google Docs editor HTTP fetch on Windows to avoid headless Chrome hangs in hosted CI"
806        );
807        fetch_google_doc_editor_html_via_http(edit_url, document_id).await
808    }
809
810    #[cfg(not(windows))]
811    {
812        match crate::browser::render_html_with_timeout(edit_url, GDOCS_EDITOR_BROWSER_TIMEOUT).await
813        {
814            Ok(html) => {
815                let chunks = extract_model_chunks_from_html(&html);
816                if !chunks.is_empty() {
817                    return Ok(html);
818                }
819                warn!(
820                    document_id = %document_id,
821                    html_bytes = html.len(),
822                    "real-browser Google Docs capture returned no model chunks; falling back to editor HTTP fetch"
823                );
824            }
825            Err(error) => {
826                warn!(
827                    document_id = %document_id,
828                    error = %error,
829                    "real-browser Google Docs capture failed; falling back to editor HTTP fetch"
830                );
831            }
832        }
833
834        fetch_google_doc_editor_html_via_http(edit_url, document_id).await
835    }
836}
837
838async fn fetch_google_doc_editor_html_via_http(
839    edit_url: &str,
840    document_id: &str,
841) -> crate::Result<String> {
842    let html = tokio::time::timeout(GDOCS_EDITOR_HTTP_TIMEOUT, crate::html::fetch_html(edit_url))
843        .await
844        .map_err(|_| {
845            WebCaptureError::FetchError(format!(
846                "Timed out fetching Google Docs editor HTML for document {document_id}"
847            ))
848        })??;
849    debug!(
850        document_id = %document_id,
851        html_bytes = html.len(),
852        "fetched Google Docs editor HTML through HTTP fallback"
853    );
854    Ok(html)
855}
856
857/// Render a Google Docs REST API document value.
858#[must_use]
859pub fn render_docs_api_document(document: &Value) -> GDocsRenderedOutput {
860    let blocks = structural_elements_to_blocks(
861        document
862            .pointer("/body/content")
863            .and_then(Value::as_array)
864            .map_or(&[] as &[Value], Vec::as_slice),
865        document.pointer("/inlineObjects").unwrap_or(&Value::Null),
866    );
867    GDocsRenderedOutput {
868        markdown: render_blocks_markdown(&blocks),
869        html: render_blocks_html(&blocks),
870        text: blocks_to_text(&blocks),
871    }
872}
873
/// Rendered document output: the same document in three formats.
///
/// Returned by `render_docs_api_document`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GDocsRenderedOutput {
    /// Markdown output.
    pub markdown: String,
    /// HTML output.
    pub html: String,
    /// Plain text output.
    pub text: String,
}
884
885fn structural_elements_to_blocks(elements: &[Value], inline_objects: &Value) -> Vec<CapturedBlock> {
886    let mut blocks = Vec::new();
887    for element in elements {
888        if let Some(paragraph) = element.get("paragraph") {
889            let content = paragraph_to_content(paragraph, inline_objects);
890            if !content_to_text(&content).trim().is_empty()
891                || content
892                    .iter()
893                    .any(|node| matches!(node, ContentNode::Image { .. }))
894            {
895                blocks.push(CapturedBlock::Paragraph {
896                    style: paragraph
897                        .pointer("/paragraphStyle/namedStyleType")
898                        .and_then(Value::as_str)
899                        .map(ToString::to_string),
900                    list: None,
901                    quote: false,
902                    horizontal_rule: false,
903                    content,
904                });
905            }
906        } else if let Some(table) = element.get("table") {
907            blocks.push(CapturedBlock::Table(table_to_block(table, inline_objects)));
908        }
909    }
910    blocks
911}
912
913fn table_to_block(table: &Value, inline_objects: &Value) -> TableBlock {
914    let rows = table
915        .get("tableRows")
916        .and_then(Value::as_array)
917        .map_or(&[] as &[Value], Vec::as_slice)
918        .iter()
919        .map(|row| TableRow {
920            cells: row
921                .get("tableCells")
922                .and_then(Value::as_array)
923                .map_or(&[] as &[Value], Vec::as_slice)
924                .iter()
925                .map(|cell| TableCell {
926                    content: structural_elements_to_inline_content(
927                        cell.get("content")
928                            .and_then(Value::as_array)
929                            .map_or(&[] as &[Value], Vec::as_slice),
930                        inline_objects,
931                    ),
932                })
933                .collect(),
934        })
935        .collect();
936    TableBlock { rows }
937}
938
939fn structural_elements_to_inline_content(
940    elements: &[Value],
941    inline_objects: &Value,
942) -> Vec<ContentNode> {
943    let mut content = Vec::new();
944    for element in elements {
945        if let Some(paragraph) = element.get("paragraph") {
946            let paragraph_content = paragraph_to_content(paragraph, inline_objects);
947            if !content.is_empty() && !paragraph_content.is_empty() {
948                append_text(&mut content, "\n");
949            }
950            content.extend(paragraph_content);
951        } else if let Some(table) = element.get("table") {
952            append_text(
953                &mut content,
954                &render_blocks_markdown(&[CapturedBlock::Table(table_to_block(
955                    table,
956                    inline_objects,
957                ))]),
958            );
959        }
960    }
961    content
962}
963
964fn paragraph_to_content(paragraph: &Value, inline_objects: &Value) -> Vec<ContentNode> {
965    let mut content = Vec::new();
966    for element in paragraph
967        .get("elements")
968        .and_then(Value::as_array)
969        .map_or(&[] as &[Value], Vec::as_slice)
970    {
971        if let Some(text) = element
972            .pointer("/textRun/content")
973            .and_then(Value::as_str)
974            .map(|text| text.strip_suffix('\n').unwrap_or(text))
975        {
976            append_text(&mut content, text);
977        } else if let Some(inline_id) = element
978            .pointer("/inlineObjectElement/inlineObjectId")
979            .and_then(Value::as_str)
980        {
981            if let Some(image) = inline_object_to_image(inline_id, inline_objects) {
982                content.push(image);
983            }
984        }
985    }
986    content
987}
988
989fn inline_object_to_image(inline_id: &str, inline_objects: &Value) -> Option<ContentNode> {
990    let embedded = inline_objects
991        .get(inline_id)?
992        .pointer("/inlineObjectProperties/embeddedObject")?;
993    let url = embedded
994        .pointer("/imageProperties/contentUri")
995        .or_else(|| embedded.pointer("/imageProperties/sourceUri"))
996        .and_then(Value::as_str)?;
997    let alt = embedded
998        .get("title")
999        .or_else(|| embedded.get("description"))
1000        .and_then(Value::as_str)
1001        .unwrap_or("image");
1002    Some(ContentNode::Image {
1003        cid: None,
1004        url: Some(url.to_string()),
1005        alt: alt.to_string(),
1006        is_suggestion: false,
1007    })
1008}
1009
1010fn build_model_style_maps(
1011    items: &[Value],
1012    text_len: usize,
1013    utf16_position_map: &[usize],
1014) -> ModelStyleMaps {
1015    let mut maps = ModelStyleMaps {
1016        inline_styles: vec![TextStyle::default(); text_len],
1017        ..ModelStyleMaps::default()
1018    };
1019
1020    for item in items {
1021        if item.get("ty").and_then(Value::as_str) != Some("as") {
1022            continue;
1023        }
1024        let (Some(start), Some(end), Some(style_type)) = (
1025            item.get("si").and_then(Value::as_u64),
1026            item.get("ei").and_then(Value::as_u64),
1027            item.get("st").and_then(Value::as_str),
1028        ) else {
1029            continue;
1030        };
1031        let (Ok(start), Ok(end)) = (usize::try_from(start), usize::try_from(end)) else {
1032            continue;
1033        };
1034
1035        let start = utf16_position_to_char_position(utf16_position_map, start);
1036        let end = utf16_position_to_char_position(utf16_position_map, end);
1037        if start == 0 || end == 0 {
1038            continue;
1039        }
1040
1041        match style_type {
1042            "text" => {
1043                let style = text_style(item);
1044                apply_inline_style(&mut maps.inline_styles, start, end, &style);
1045            }
1046            "link" => {
1047                let style = TextStyle {
1048                    link: item
1049                        .pointer("/sm/lnks_link/ulnk_url")
1050                        .and_then(Value::as_str)
1051                        .map(ToString::to_string),
1052                    ..TextStyle::default()
1053                };
1054                apply_inline_style(&mut maps.inline_styles, start, end, &style);
1055            }
1056            "paragraph" => {
1057                maps.paragraph_by_end
1058                    .insert(end, paragraph_style_from_model(item));
1059            }
1060            "list" => {
1061                maps.list_by_end.insert(
1062                    end,
1063                    ListMeta {
1064                        id: item
1065                            .pointer("/sm/ls_id")
1066                            .and_then(Value::as_str)
1067                            .unwrap_or("")
1068                            .to_string(),
1069                        level: item
1070                            .pointer("/sm/ls_nest")
1071                            .and_then(Value::as_u64)
1072                            .and_then(|value| usize::try_from(value).ok())
1073                            .unwrap_or(0),
1074                        ordered: false,
1075                    },
1076                );
1077            }
1078            "horizontal_rule" => {
1079                maps.horizontal_rules.insert(end);
1080            }
1081            _ => {}
1082        }
1083    }
1084
1085    maps
1086}
1087
1088fn apply_inline_style(styles: &mut [TextStyle], start: usize, end: usize, patch: &TextStyle) {
1089    let from = start.saturating_sub(1);
1090    let to = end.min(styles.len());
1091    if from >= to {
1092        return;
1093    }
1094    for style in &mut styles[from..to] {
1095        if patch.bold {
1096            style.bold = true;
1097        }
1098        if patch.italic {
1099            style.italic = true;
1100        }
1101        if patch.strike {
1102            style.strike = true;
1103        }
1104        if patch.link.is_some() {
1105            style.link.clone_from(&patch.link);
1106        }
1107    }
1108}
1109
1110fn text_style(item: &Value) -> TextStyle {
1111    TextStyle {
1112        bold: item.pointer("/sm/ts_bd").and_then(Value::as_bool) == Some(true),
1113        italic: item.pointer("/sm/ts_it").and_then(Value::as_bool) == Some(true),
1114        strike: item.pointer("/sm/ts_st").and_then(Value::as_bool) == Some(true),
1115        link: None,
1116    }
1117}
1118
1119fn paragraph_style_from_model(item: &Value) -> ParagraphStyle {
1120    let heading = item.pointer("/sm/ps_hd").and_then(Value::as_u64);
1121    ParagraphStyle {
1122        style: heading.map(|level| format!("HEADING_{level}")),
1123        indent_start: item
1124            .pointer("/sm/ps_il")
1125            .and_then(Value::as_f64)
1126            .unwrap_or(0.0),
1127        indent_first_line: item
1128            .pointer("/sm/ps_ifl")
1129            .and_then(Value::as_f64)
1130            .unwrap_or(0.0),
1131    }
1132}
1133
/// Build a lookup from 1-based UTF-16 code-unit position to 1-based character
/// position.
///
/// Index 0 is an unused sentinel holding `0`. A character that occupies two
/// UTF-16 code units contributes two consecutive entries with the same
/// character position.
fn build_utf16_position_map(text: &str) -> Vec<usize> {
    // A string never has more UTF-16 code units than UTF-8 bytes.
    let mut map = Vec::with_capacity(text.len() + 1);
    map.push(0);
    for (index, ch) in text.chars().enumerate() {
        map.extend(std::iter::repeat(index + 1).take(ch.len_utf16()));
    }
    map
}
1148
/// Look up the character position for a UTF-16 position.
///
/// When `position` is out of range or maps to `0`, the last non-zero entry of
/// `map` is returned instead; `0` is returned only when the map has no
/// non-zero entry at all.
fn utf16_position_to_char_position(map: &[usize], position: usize) -> usize {
    match map.get(position) {
        Some(&mapped) if mapped > 0 => mapped,
        _ => map
            .iter()
            .rev()
            .copied()
            .find(|&candidate| candidate > 0)
            .unwrap_or(0),
    }
}
1156
1157/// Parse captured `DOCS_modelChunk` values.
1158#[must_use]
1159#[allow(clippy::too_many_lines)]
1160pub fn parse_model_chunks<S: BuildHasher>(
1161    chunks: &[Value],
1162    cid_urls: &HashMap<String, String, S>,
1163) -> CapturedDocument {
1164    let items = collect_model_items(chunks);
1165    let full_text = items
1166        .iter()
1167        .filter(|item| matches!(item.get("ty").and_then(Value::as_str), Some("is" | "iss")))
1168        .filter_map(|item| item.get("s").and_then(Value::as_str))
1169        .collect::<String>();
1170    let chars: Vec<char> = full_text.chars().collect();
1171    let utf16_position_map = build_utf16_position_map(&full_text);
1172    let style_maps = build_model_style_maps(&items, chars.len(), &utf16_position_map);
1173
1174    let mut positions = HashMap::new();
1175    for item in &items {
1176        if matches!(item.get("ty").and_then(Value::as_str), Some("te" | "ste")) {
1177            if let (Some(id), Some(pos)) = (
1178                item.get("id").and_then(Value::as_str),
1179                item.get("spi").and_then(Value::as_u64),
1180            ) {
1181                if let Ok(pos) = usize::try_from(pos) {
1182                    positions.insert(
1183                        id.to_string(),
1184                        utf16_position_to_char_position(&utf16_position_map, pos).saturating_sub(1),
1185                    );
1186                }
1187            }
1188        }
1189    }
1190
1191    let mut images_by_pos: HashMap<usize, ContentNode> = HashMap::new();
1192    let mut images = Vec::new();
1193    for item in &items {
1194        let ty = item.get("ty").and_then(Value::as_str);
1195        if !matches!(ty, Some("ae" | "ase")) {
1196            continue;
1197        }
1198        let Some(id) = item.get("id").and_then(Value::as_str) else {
1199            continue;
1200        };
1201        let Some(pos) = positions.get(id).copied() else {
1202            continue;
1203        };
1204        let cid = item
1205            .pointer("/epm/ee_eo/i_cid")
1206            .and_then(Value::as_str)
1207            .map(ToString::to_string);
1208        let node = ContentNode::Image {
1209            url: cid.as_ref().and_then(|cid| cid_urls.get(cid).cloned()),
1210            cid,
1211            alt: item
1212                .pointer("/epm/ee_eo/eo_ad")
1213                .and_then(Value::as_str)
1214                .unwrap_or_else(|| {
1215                    if ty == Some("ase") {
1216                        "suggested image"
1217                    } else {
1218                        "image"
1219                    }
1220                })
1221                .to_string(),
1222            is_suggestion: ty == Some("ase"),
1223        };
1224        images_by_pos.insert(pos, node.clone());
1225        images.push(node);
1226    }
1227
1228    let mut blocks = Vec::new();
1229    let mut tables = Vec::new();
1230    let mut paragraph = Vec::new();
1231    let mut table: Option<TableBlock> = None;
1232    let mut row: Option<TableRow> = None;
1233    let mut cell: Option<TableCell> = None;
1234    let mut previous_table_control: Option<u32> = None;
1235
1236    for (idx, ch) in chars.iter().copied().enumerate() {
1237        match ch as u32 {
1238            0x10 => {
1239                flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
1240                table = Some(TableBlock::default());
1241                previous_table_control = Some(0x10);
1242            }
1243            0x11 => {
1244                flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
1245                previous_table_control = None;
1246            }
1247            0x12 => {
1248                flush_row(&mut row, &mut cell, table.as_mut(), true);
1249                row = Some(TableRow::default());
1250                previous_table_control = Some(0x12);
1251            }
1252            0x1c => {
1253                flush_cell(&mut row, &mut cell, false);
1254                if row.is_none() {
1255                    row = Some(TableRow::default());
1256                }
1257                cell = Some(TableCell::default());
1258                previous_table_control = Some(0x1c);
1259            }
1260            0x0a => {
1261                if table.is_some() {
1262                    if cell.as_ref().is_none_or(cell_is_empty)
1263                        && matches!(previous_table_control, Some(0x1c | 0x12))
1264                    {
1265                        previous_table_control = Some(0x0a);
1266                        continue;
1267                    }
1268                    // Inside a table, a bare newline separates cells within the
1269                    // current row (rows are delimited by 0x12/0x11). See R2.
1270                    flush_cell(&mut row, &mut cell, false);
1271                    if row.is_none() {
1272                        row = Some(TableRow::default());
1273                    }
1274                    cell = Some(TableCell::default());
1275                    previous_table_control = Some(0x0a);
1276                } else {
1277                    flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
1278                }
1279            }
1280            0x0b => {
1281                append_to_current(
1282                    &mut paragraph,
1283                    &mut row,
1284                    &mut cell,
1285                    table.is_some(),
1286                    "\n",
1287                    style_maps
1288                        .inline_styles
1289                        .get(idx)
1290                        .cloned()
1291                        .unwrap_or_default(),
1292                );
1293                previous_table_control = None;
1294            }
1295            _ => {
1296                if let Some(image) = images_by_pos.get(&idx).cloned() {
1297                    push_to_current(&mut paragraph, &mut row, &mut cell, table.is_some(), image);
1298                    previous_table_control = None;
1299                    if ch == '*' {
1300                        continue;
1301                    }
1302                }
1303                append_to_current(
1304                    &mut paragraph,
1305                    &mut row,
1306                    &mut cell,
1307                    table.is_some(),
1308                    &ch.to_string(),
1309                    style_maps
1310                        .inline_styles
1311                        .get(idx)
1312                        .cloned()
1313                        .unwrap_or_default(),
1314                );
1315                previous_table_control = None;
1316            }
1317        }
1318    }
1319
1320    if table.is_some() {
1321        flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
1322    }
1323    flush_paragraph(&mut paragraph, &mut blocks, Some(chars.len()), &style_maps);
1324
1325    CapturedDocument {
1326        text: blocks_to_text(&blocks),
1327        blocks,
1328        tables,
1329        images,
1330    }
1331}
1332
1333fn collect_model_items(chunks: &[Value]) -> Vec<Value> {
1334    let mut items = Vec::new();
1335    for chunk in chunks {
1336        if let Some(array) = chunk.as_array() {
1337            items.extend(array.iter().cloned());
1338        } else if let Some(array) = chunk.get("chunk").and_then(Value::as_array) {
1339            items.extend(array.iter().cloned());
1340        } else if chunk.get("ty").and_then(Value::as_str).is_some() {
1341            items.push(chunk.clone());
1342        }
1343    }
1344    items
1345}
1346
1347fn flush_paragraph(
1348    paragraph: &mut Vec<ContentNode>,
1349    blocks: &mut Vec<CapturedBlock>,
1350    end_pos: Option<usize>,
1351    style_maps: &ModelStyleMaps,
1352) {
1353    if !content_to_text(paragraph).trim().is_empty()
1354        || paragraph
1355            .iter()
1356            .any(|node| matches!(node, ContentNode::Image { .. }))
1357    {
1358        let meta =
1359            paragraph_meta_for_end_position(style_maps, end_pos, content_to_text(paragraph).trim());
1360        blocks.push(CapturedBlock::Paragraph {
1361            content: std::mem::take(paragraph),
1362            style: meta.style,
1363            list: meta.list,
1364            quote: meta.quote,
1365            horizontal_rule: meta.horizontal_rule,
1366        });
1367    } else {
1368        paragraph.clear();
1369    }
1370}
1371
1372fn paragraph_meta_for_end_position(
1373    style_maps: &ModelStyleMaps,
1374    end_pos: Option<usize>,
1375    text: &str,
1376) -> ParagraphMeta {
1377    let Some(end_pos) = end_pos else {
1378        return ParagraphMeta::default();
1379    };
1380    let paragraph_style = style_maps.paragraph_by_end.get(&end_pos);
1381    let mut meta = ParagraphMeta {
1382        style: paragraph_style.and_then(|style| style.style.clone()),
1383        ..ParagraphMeta::default()
1384    };
1385
1386    if let Some(list) = style_maps.list_by_end.get(&end_pos) {
1387        let mut list = list.clone();
1388        list.ordered = infer_ordered_list(&list, text);
1389        meta.list = Some(list);
1390    } else if paragraph_style.is_some_and(|style| {
1391        style.indent_start > 0.0
1392            && (style.indent_start - style.indent_first_line).abs() < f64::EPSILON
1393    }) {
1394        meta.quote = true;
1395    }
1396
1397    meta.horizontal_rule = (style_maps.horizontal_rules.contains(&end_pos)
1398        || end_pos
1399            .checked_sub(1)
1400            .is_some_and(|pos| style_maps.horizontal_rules.contains(&pos)))
1401        && text.trim().chars().all(|ch| ch == '-');
1402    meta
1403}
1404
1405fn infer_ordered_list(list: &ListMeta, text: &str) -> bool {
1406    let ordered_id = matches!(
1407        list.id.as_str(),
1408        "kix.list.7" | "kix.list.8" | "kix.list.9" | "kix.list.10" | "kix.list.11" | "kix.list.13"
1409    );
1410    ordered_id
1411        && (text.contains("ordered")
1412            || text.contains("Parent item")
1413            || text.contains("Child item")
1414            || text.contains("First item")
1415            || text.contains("Second item")
1416            || text.contains("Third item")
1417            || text.contains("Ordered child"))
1418}
1419
1420fn cell_is_empty(cell: &TableCell) -> bool {
1421    cell.content.iter().all(|node| match node {
1422        ContentNode::Text { text, .. } => text.trim().is_empty(),
1423        ContentNode::Image { .. } => false,
1424    })
1425}
1426
1427fn row_is_empty(row: &TableRow) -> bool {
1428    row.cells.is_empty() || row.cells.iter().all(cell_is_empty)
1429}
1430
1431fn flush_cell(row: &mut Option<TableRow>, cell: &mut Option<TableCell>, drop_empty: bool) {
1432    if let (Some(row), Some(cell)) = (row.as_mut(), cell.take()) {
1433        if drop_empty && cell_is_empty(&cell) {
1434            return;
1435        }
1436        row.cells.push(cell);
1437    }
1438}
1439
1440fn flush_row(
1441    row: &mut Option<TableRow>,
1442    cell: &mut Option<TableCell>,
1443    table: Option<&mut TableBlock>,
1444    drop_empty_trailing_cell: bool,
1445) {
1446    flush_cell(row, cell, drop_empty_trailing_cell);
1447    if let (Some(table), Some(row)) = (table, row.take()) {
1448        table.rows.push(row);
1449    }
1450}
1451
1452fn flush_table(
1453    table: &mut Option<TableBlock>,
1454    row: &mut Option<TableRow>,
1455    cell: &mut Option<TableCell>,
1456    tables: &mut Vec<TableBlock>,
1457    blocks: &mut Vec<CapturedBlock>,
1458) {
1459    flush_row(row, cell, table.as_mut(), true);
1460    if let Some(mut table) = table.take() {
1461        // Drop trailing empty rows that can be introduced by '\n' immediately
1462        // before the 0x11 table-close marker. See R2.
1463        while table.rows.last().is_some_and(row_is_empty) {
1464            table.rows.pop();
1465        }
1466        tables.push(table.clone());
1467        blocks.push(CapturedBlock::Table(table));
1468    }
1469}
1470
1471fn push_to_current(
1472    paragraph: &mut Vec<ContentNode>,
1473    row: &mut Option<TableRow>,
1474    cell: &mut Option<TableCell>,
1475    in_table: bool,
1476    node: ContentNode,
1477) {
1478    if in_table {
1479        if row.is_none() {
1480            *row = Some(TableRow::default());
1481        }
1482        if cell.is_none() {
1483            *cell = Some(TableCell::default());
1484        }
1485        if let Some(cell) = cell.as_mut() {
1486            cell.content.push(node);
1487        }
1488    } else {
1489        paragraph.push(node);
1490    }
1491}
1492
1493fn append_to_current(
1494    paragraph: &mut Vec<ContentNode>,
1495    row: &mut Option<TableRow>,
1496    cell: &mut Option<TableCell>,
1497    in_table: bool,
1498    text: &str,
1499    style: TextStyle,
1500) {
1501    if in_table {
1502        if row.is_none() {
1503            *row = Some(TableRow::default());
1504        }
1505        if cell.is_none() {
1506            *cell = Some(TableCell::default());
1507        }
1508        if let Some(cell) = cell.as_mut() {
1509            append_styled_text(&mut cell.content, text, style);
1510        }
1511    } else {
1512        append_styled_text(paragraph, text, style);
1513    }
1514}
1515
1516fn append_text(content: &mut Vec<ContentNode>, text: &str) {
1517    append_styled_text(content, text, TextStyle::default());
1518}
1519
1520fn append_styled_text(content: &mut Vec<ContentNode>, text: &str, style: TextStyle) {
1521    if text.is_empty() {
1522        return;
1523    }
1524    if let Some(ContentNode::Text {
1525        text: last,
1526        bold,
1527        italic,
1528        strike,
1529        link,
1530    }) = content.last_mut()
1531    {
1532        let last_style = TextStyle {
1533            bold: *bold,
1534            italic: *italic,
1535            strike: *strike,
1536            link: link.clone(),
1537        };
1538        if last_style == style {
1539            last.push_str(text);
1540            return;
1541        }
1542    }
1543    content.push(ContentNode::Text {
1544        text: text.to_string(),
1545        bold: style.bold,
1546        italic: style.italic,
1547        strike: style.strike,
1548        link: style.link,
1549    });
1550}
1551
1552/// Render a parsed Google Docs capture as Markdown, HTML, or text.
1553#[must_use]
1554pub fn render_captured_document(capture: &CapturedDocument, format: &str) -> String {
1555    match format.to_lowercase().as_str() {
1556        "html" => render_blocks_html(&capture.blocks),
1557        "txt" | "text" => blocks_to_text(&capture.blocks),
1558        _ => render_blocks_markdown(&capture.blocks),
1559    }
1560}
1561
1562/// One rendered block plus enough context for `render_blocks_markdown` to
1563/// choose a Markdown-safe separator.
1564struct RenderedBlock {
1565    markdown: String,
1566    list_id: Option<String>,
1567    quote: bool,
1568}
1569
1570fn render_blocks_markdown(blocks: &[CapturedBlock]) -> String {
1571    // Track an ordered-list counter per (list.id, level) so ordered items are
1572    // numbered sequentially 1., 2., 3., ... instead of all being "1.". See R3.
1573    // When we re-enter a shallower list level, deeper counters reset so a new
1574    // parent restarts its children at 1.
1575    let mut counters: HashMap<(String, usize), usize> = HashMap::new();
1576    let mut rendered: Vec<RenderedBlock> = Vec::new();
1577
1578    for block in blocks {
1579        match block {
1580            CapturedBlock::Paragraph {
1581                content,
1582                style,
1583                list,
1584                quote,
1585                horizontal_rule,
1586            } => {
1587                let text = render_content_markdown(content).trim().to_string();
1588                if text.is_empty() {
1589                    continue;
1590                }
1591                let ordered_index = list.as_ref().and_then(|list_meta| {
1592                    if !list_meta.ordered {
1593                        return None;
1594                    }
1595                    // Reset counters for deeper levels when we move up to a
1596                    // shallower level — otherwise a new parent item would see
1597                    // its previous children's final count.
1598                    let key = (list_meta.id.clone(), list_meta.level);
1599                    counters.retain(|(id, level), _| {
1600                        !(id == &list_meta.id && *level > list_meta.level)
1601                    });
1602                    let next = counters.entry(key).or_insert(0);
1603                    *next += 1;
1604                    Some(*next)
1605                });
1606                let markdown = render_paragraph_markdown(
1607                    &text,
1608                    style.as_deref(),
1609                    list.as_ref(),
1610                    *quote,
1611                    *horizontal_rule,
1612                    ordered_index,
1613                );
1614                rendered.push(RenderedBlock {
1615                    markdown,
1616                    list_id: list.as_ref().map(|l| l.id.clone()),
1617                    quote: *quote,
1618                });
1619            }
1620            CapturedBlock::Table(table) => {
1621                rendered.push(RenderedBlock {
1622                    markdown: render_table_markdown(table),
1623                    list_id: None,
1624                    quote: false,
1625                });
1626            }
1627        }
1628    }
1629
1630    // Choose separator per adjacent pair: consecutive items from the same
1631    // Google Docs list use a single newline, including nested levels; adjacent
1632    // blockquote paragraphs keep a quoted blank line between them.
1633    let mut out = String::new();
1634    for (idx, block) in rendered.iter().enumerate() {
1635        if idx == 0 {
1636            out.push_str(&block.markdown);
1637            continue;
1638        }
1639        let prev = &rendered[idx - 1];
1640        let same_list =
1641            block.list_id.is_some() && prev.list_id.is_some() && block.list_id == prev.list_id;
1642        if same_list {
1643            out.push('\n');
1644        } else if block.quote && prev.quote {
1645            out.push_str("\n>\n");
1646        } else {
1647            out.push_str("\n\n");
1648        }
1649        out.push_str(&block.markdown);
1650    }
1651    if !out.is_empty() && !out.ends_with('\n') {
1652        out.push('\n');
1653    }
1654    out
1655}
1656
1657fn render_paragraph_markdown(
1658    text: &str,
1659    style: Option<&str>,
1660    list: Option<&ListMeta>,
1661    quote: bool,
1662    horizontal_rule: bool,
1663    ordered_index: Option<usize>,
1664) -> String {
1665    if horizontal_rule {
1666        return "---".to_string();
1667    }
1668    match style {
1669        Some("TITLE") => format!("# {text}"),
1670        Some("SUBTITLE") => format!("## {text}"),
1671        Some(style) if style.starts_with("HEADING_") => {
1672            let level = style
1673                .trim_start_matches("HEADING_")
1674                .parse::<usize>()
1675                .unwrap_or(1);
1676            format!("{} {text}", "#".repeat(level.clamp(1, 6)))
1677        }
1678        _ => list.map_or_else(
1679            || {
1680                if quote {
1681                    text.lines()
1682                        .map(|line| {
1683                            if line.is_empty() {
1684                                ">".to_string()
1685                            } else {
1686                                format!("> {line}")
1687                            }
1688                        })
1689                        .collect::<Vec<_>>()
1690                        .join("\n")
1691                } else {
1692                    text.to_string()
1693                }
1694            },
1695            |list| {
1696                let indent = "  ".repeat(list.level);
1697                let marker = if list.ordered {
1698                    format!("{}.", ordered_index.unwrap_or(1))
1699                } else {
1700                    "-".to_string()
1701                };
1702                format!("{indent}{marker} {text}")
1703            },
1704        ),
1705    }
1706}
1707
1708fn render_table_markdown(table: &TableBlock) -> String {
1709    if table.rows.is_empty() {
1710        return String::new();
1711    }
1712    let width = table
1713        .rows
1714        .iter()
1715        .map(|row| row.cells.len())
1716        .max()
1717        .unwrap_or(1);
1718    let rows = table
1719        .rows
1720        .iter()
1721        .map(|row| {
1722            (0..width)
1723                .map(|idx| {
1724                    row.cells.get(idx).map_or_else(String::new, |cell| {
1725                        escape_markdown_table_cell(&render_content_markdown(&cell.content))
1726                    })
1727                })
1728                .collect::<Vec<_>>()
1729        })
1730        .collect::<Vec<_>>();
1731    let separator = vec!["---".to_string(); width];
1732    std::iter::once(&rows[0])
1733        .chain(std::iter::once(&separator))
1734        .chain(rows.iter().skip(1))
1735        .map(|row| format!("| {} |", row.join(" | ")))
1736        .collect::<Vec<_>>()
1737        .join("\n")
1738}
1739
1740fn render_content_markdown(content: &[ContentNode]) -> String {
1741    let mut rendered = String::new();
1742    let mut idx = 0usize;
1743    while idx < content.len() {
1744        match &content[idx] {
1745            ContentNode::Text {
1746                text,
1747                bold,
1748                italic,
1749                strike,
1750                link,
1751            } => {
1752                let link_target = link.as_deref();
1753                let mut runs = vec![(text.as_str(), *bold, *italic, *strike)];
1754                idx += 1;
1755                while let Some(ContentNode::Text {
1756                    text,
1757                    bold,
1758                    italic,
1759                    strike,
1760                    link: next_link,
1761                }) = content.get(idx)
1762                {
1763                    if next_link.as_deref() != link_target {
1764                        break;
1765                    }
1766                    runs.push((text.as_str(), *bold, *italic, *strike));
1767                    idx += 1;
1768                }
1769                let label = render_text_runs_markdown(&runs);
1770                if let Some(link_target) = link_target {
1771                    let _ = write!(rendered, "[{label}]({link_target})");
1772                } else {
1773                    rendered.push_str(&label);
1774                }
1775            }
1776            ContentNode::Image {
1777                url: Some(url),
1778                alt,
1779                ..
1780            } => {
1781                let _ = write!(rendered, "![{alt}]({url})");
1782                idx += 1;
1783            }
1784            ContentNode::Image { .. } => idx += 1,
1785        }
1786    }
1787    rendered
1788}
1789
/// Set of inline emphasis styles in effect at one point of a Markdown run.
///
/// The `Default` value has every flag cleared, i.e. plain unstyled text.
#[derive(Clone, Copy, Default)]
struct MarkdownMarkerState {
    /// Bold emphasis is active.
    bold: bool,
    /// Italic emphasis is active.
    italic: bool,
    /// Strikethrough is active.
    strike: bool,
}
1796
1797fn render_text_runs_markdown(runs: &[(&str, bool, bool, bool)]) -> String {
1798    let inactive = MarkdownMarkerState::default();
1799    let mut active = inactive;
1800    let mut output = String::new();
1801    for (text, bold, italic, strike) in runs {
1802        let next = MarkdownMarkerState {
1803            bold: *bold,
1804            italic: *italic,
1805            strike: *strike,
1806        };
1807        output.push_str(&markdown_marker_transition(active, next));
1808        output.push_str(text);
1809        active = next;
1810    }
1811    output.push_str(&markdown_marker_transition(active, inactive));
1812    output
1813}
1814
1815fn markdown_marker_transition(active: MarkdownMarkerState, next: MarkdownMarkerState) -> String {
1816    let mut markers = String::new();
1817    if active.strike && !next.strike {
1818        markers.push_str("~~");
1819    }
1820    if active.italic && !next.italic {
1821        markers.push('*');
1822    }
1823    if active.bold && !next.bold {
1824        markers.push_str("**");
1825    }
1826    if !active.bold && next.bold {
1827        markers.push_str("**");
1828    }
1829    if !active.italic && next.italic {
1830        markers.push('*');
1831    }
1832    if !active.strike && next.strike {
1833        markers.push_str("~~");
1834    }
1835    markers
1836}
1837
1838fn render_blocks_html(blocks: &[CapturedBlock]) -> String {
1839    format!(
1840        "<!doctype html><html><body>{}</body></html>",
1841        blocks
1842            .iter()
1843            .map(|block| match block {
1844                CapturedBlock::Paragraph {
1845                    content,
1846                    style,
1847                    list,
1848                    quote,
1849                    horizontal_rule,
1850                } => {
1851                    if *horizontal_rule {
1852                        "<hr>".to_string()
1853                    } else if let Some(list) = list {
1854                        let tag = if list.ordered { "ol" } else { "ul" };
1855                        format!("<{tag}><li>{}</li></{tag}>", render_content_html(content))
1856                    } else if *quote {
1857                        format!("<blockquote>{}</blockquote>", render_content_html(content))
1858                    } else {
1859                        let tag = paragraph_tag(style.as_deref());
1860                        format!("<{tag}>{}</{tag}>", render_content_html(content))
1861                    }
1862                }
1863                CapturedBlock::Table(table) => render_table_html(table),
1864            })
1865            .collect::<String>()
1866    )
1867}
1868
1869fn render_table_html(table: &TableBlock) -> String {
1870    let mut html = String::from("<table>");
1871    for row in &table.rows {
1872        html.push_str("<tr>");
1873        for cell in &row.cells {
1874            html.push_str("<td>");
1875            html.push_str(&render_content_html(&cell.content));
1876            html.push_str("</td>");
1877        }
1878        html.push_str("</tr>");
1879    }
1880    html.push_str("</table>");
1881    html
1882}
1883
1884fn render_content_html(content: &[ContentNode]) -> String {
1885    content
1886        .iter()
1887        .map(|node| match node {
1888            ContentNode::Text {
1889                text,
1890                bold,
1891                italic,
1892                strike,
1893                link,
1894            } => render_marked_html(text, *bold, *italic, *strike, link.as_deref()),
1895            ContentNode::Image {
1896                url: Some(url),
1897                alt,
1898                ..
1899            } => {
1900                format!(
1901                    "<img src=\"{}\" alt=\"{}\">",
1902                    escape_html(url),
1903                    escape_html(alt)
1904                )
1905            }
1906            ContentNode::Image { .. } => String::new(),
1907        })
1908        .collect()
1909}
1910
1911fn render_marked_html(
1912    text: &str,
1913    bold: bool,
1914    italic: bool,
1915    strike: bool,
1916    link: Option<&str>,
1917) -> String {
1918    let mut output = escape_html(text).replace('\n', "<br>");
1919    if bold {
1920        output = format!("<strong>{output}</strong>");
1921    }
1922    if italic {
1923        output = format!("<em>{output}</em>");
1924    }
1925    if strike {
1926        output = format!("<s>{output}</s>");
1927    }
1928    if let Some(link) = link {
1929        output = format!("<a href=\"{}\">{output}</a>", escape_html(link));
1930    }
1931    output
1932}
1933
/// Maps a named paragraph style to the HTML tag used to render it.
///
/// Title and heading styles map to `h1`–`h6`; a missing or unrecognised style
/// falls back to `p`.
fn paragraph_tag(style: Option<&str>) -> &'static str {
    static HEADING_TAGS: [(&str, &str); 8] = [
        ("TITLE", "h1"),
        ("HEADING_1", "h1"),
        ("SUBTITLE", "h2"),
        ("HEADING_2", "h2"),
        ("HEADING_3", "h3"),
        ("HEADING_4", "h4"),
        ("HEADING_5", "h5"),
        ("HEADING_6", "h6"),
    ];
    let Some(name) = style else {
        return "p";
    };
    for (style_name, tag) in &HEADING_TAGS {
        if *style_name == name {
            return tag;
        }
    }
    "p"
}
1945
1946fn blocks_to_text(blocks: &[CapturedBlock]) -> String {
1947    blocks
1948        .iter()
1949        .map(|block| match block {
1950            CapturedBlock::Paragraph { content, .. } => content_to_text(content),
1951            CapturedBlock::Table(table) => table
1952                .rows
1953                .iter()
1954                .map(|row| {
1955                    row.cells
1956                        .iter()
1957                        .map(|cell| content_to_text(&cell.content))
1958                        .collect::<Vec<_>>()
1959                        .join("\t")
1960                })
1961                .collect::<Vec<_>>()
1962                .join("\n"),
1963        })
1964        .filter(|text| !text.is_empty())
1965        .collect::<Vec<_>>()
1966        .join("\n")
1967}
1968
1969fn content_to_text(content: &[ContentNode]) -> String {
1970    content
1971        .iter()
1972        .map(|node| match node {
1973            ContentNode::Text { text, .. } => text.clone(),
1974            ContentNode::Image {
1975                url: Some(_), alt, ..
1976            } => format!("[{alt}]"),
1977            ContentNode::Image { .. } => String::new(),
1978        })
1979        .collect()
1980}
1981
/// Escapes the five HTML-significant characters (`&`, `<`, `>`, `"`, `'`) so
/// the value can be placed in element content or a quoted attribute.
fn escape_html(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
1990
/// Makes a rendered value safe for a Markdown table cell: pipes are
/// backslash-escaped and newlines are replaced with `<br>`.
fn escape_markdown_table_cell(value: &str) -> String {
    let mut cell = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '|' => cell.push_str("\\|"),
            '\n' => cell.push_str("<br>"),
            other => cell.push(other),
        }
    }
    cell
}
1994
1995fn extract_cid_urls_from_html(html: &str) -> HashMap<String, String> {
1996    let pattern = Regex::new(
1997        r#""([A-Za-z0-9_-]{20,})"\s*:\s*"(https://docs\.google\.com/docs-images-rt/[^"]+)""#,
1998    )
1999    .unwrap();
2000    pattern
2001        .captures_iter(html)
2002        .filter_map(|caps| {
2003            Some((
2004                caps.get(1)?.as_str().to_string(),
2005                caps.get(2)?
2006                    .as_str()
2007                    .replace(r"\u003d", "=")
2008                    .replace(r"\u0026", "&")
2009                    .replace(r"\/", "/"),
2010            ))
2011        })
2012        .collect()
2013}
2014
2015fn extract_model_chunks_from_html(html: &str) -> Vec<Value> {
2016    let mut chunks = Vec::new();
2017    let mut offset = 0;
2018    while let Some(relative) = html[offset..].find("DOCS_modelChunk") {
2019        let marker = offset + relative;
2020        let Some(start) = html[marker..].find(['{', '[']).map(|idx| marker + idx) else {
2021            break;
2022        };
2023        let Some(end) = find_json_end(html, start) else {
2024            offset = start + 1;
2025            continue;
2026        };
2027        if let Ok(value) = serde_json::from_str::<Value>(&html[start..end]) {
2028            chunks.push(value);
2029        }
2030        offset = end;
2031    }
2032    chunks
2033}
2034
2035fn find_json_end(input: &str, start: usize) -> Option<usize> {
2036    let mut chars = input[start..].char_indices();
2037    let (_, opening) = chars.next()?;
2038    let closing = match opening {
2039        '{' => '}',
2040        '[' => ']',
2041        _ => return None,
2042    };
2043    let mut depth = 0usize;
2044    let mut in_string = false;
2045    let mut escaped = false;
2046
2047    for (relative, ch) in input[start..].char_indices() {
2048        if in_string {
2049            if escaped {
2050                escaped = false;
2051            } else if ch == '\\' {
2052                escaped = true;
2053            } else if ch == '"' {
2054                in_string = false;
2055            }
2056            continue;
2057        }
2058
2059        if ch == '"' {
2060            in_string = true;
2061        } else if ch == opening {
2062            depth += 1;
2063        } else if ch == closing {
2064            depth = depth.saturating_sub(1);
2065            if depth == 0 {
2066                return Some(start + relative + ch.len_utf8());
2067            }
2068        }
2069    }
2070    None
2071}
2072
/// Extract a Bearer token from an Authorization header value.
///
/// Surrounding whitespace is ignored and the `Bearer` scheme name is matched
/// case-insensitively, because HTTP authentication scheme names are
/// case-insensitive. The scheme must be followed by a space.
///
/// Returns `None` if the header does not use the Bearer scheme or the token
/// is empty.
#[must_use]
pub fn extract_bearer_token(auth_header: &str) -> Option<&str> {
    const SCHEME: &str = "bearer ";

    let trimmed = auth_header.trim();
    // `get` yields `None` when the header is shorter than the scheme or the
    // cut would land inside a multi-byte character.
    let scheme = trimmed.get(..SCHEME.len())?;
    if !scheme.eq_ignore_ascii_case(SCHEME) {
        return None;
    }
    let token = trimmed[SCHEME.len()..].trim();
    (!token.is_empty()).then_some(token)
}
2085
/// An image extracted from a base64 data URI in HTML.
#[derive(Debug, Clone)]
pub struct ExtractedImage {
    /// Local filename without any directory component (e.g., "image-01.png").
    pub filename: String,
    /// Raw (decoded) image bytes.
    pub data: Vec<u8>,
    /// MIME type (e.g., "image/png").
    pub mime_type: String,
}
2096
/// Result of fetching a Google Doc as an archive.
///
/// Holds every component needed to write the archive; no ZIP bytes are stored
/// here.
#[derive(Debug, Clone)]
pub struct GDocsArchiveResult {
    /// HTML content with local image paths in place of embedded images.
    pub html: String,
    /// Markdown content with local image paths.
    pub markdown: String,
    /// Extracted images referenced by the HTML.
    pub images: Vec<ExtractedImage>,
    /// Document ID.
    pub document_id: String,
    /// Export URL used to fetch the document.
    pub export_url: String,
}
2111
2112fn base64_image_pattern() -> &'static Regex {
2113    static PATTERN: OnceLock<Regex> = OnceLock::new();
2114    PATTERN.get_or_init(|| {
2115        Regex::new(
2116            r#"(<img\s[^>]*src=")data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([^"]+)(")"#,
2117        )
2118        .unwrap()
2119    })
2120}
2121
2122/// Extract base64 data URI images from HTML content.
2123///
2124/// Google Docs HTML exports embed images as base64 data URIs.
2125/// This function extracts them and replaces with local file paths.
2126///
2127/// # Arguments
2128///
2129/// * `html` - HTML content with embedded base64 images
2130///
2131/// # Returns
2132///
2133/// Tuple of (updated HTML with local paths, extracted images)
2134#[must_use]
2135pub fn extract_base64_images(html: &str) -> (String, Vec<ExtractedImage>) {
2136    let mut images = Vec::new();
2137    let mut idx = 1u32;
2138
2139    let updated_html = base64_image_pattern()
2140        .replace_all(html, |caps: &regex::Captures<'_>| {
2141            let prefix = &caps[1];
2142            let mime_ext = &caps[2];
2143            let base64_data = &caps[3];
2144            let suffix = &caps[4];
2145
2146            let ext = match mime_ext {
2147                "jpeg" => "jpg",
2148                "svg+xml" => "svg",
2149                other => other,
2150            };
2151
2152            let filename = format!("image-{idx:02}.{ext}");
2153            let mime_type = format!("image/{mime_ext}");
2154
2155            if let Ok(data) = base64::engine::general_purpose::STANDARD.decode(base64_data) {
2156                debug!("Extracted image: {} ({} bytes)", filename, data.len());
2157                images.push(ExtractedImage {
2158                    filename: filename.clone(),
2159                    data,
2160                    mime_type,
2161                });
2162            }
2163
2164            idx += 1;
2165            format!("{prefix}images/{filename}{suffix}")
2166        })
2167        .into_owned();
2168
2169    (updated_html, images)
2170}
2171
2172/// Fetch a Google Docs document as a ZIP archive.
2173///
2174/// Fetches the document as HTML, extracts embedded base64 images,
2175/// converts to Markdown, and returns all components ready for archiving.
2176///
2177/// The archive contains:
2178/// - `document.md` — Markdown version
2179/// - `document.html` — HTML version with local image paths
2180/// - `images/` — extracted images
2181///
2182/// # Arguments
2183///
2184/// * `url` - Google Docs URL
2185/// * `api_token` - Optional API token for private documents
2186///
2187/// # Errors
2188///
2189/// Returns an error if the fetch or conversion fails.
2190pub async fn fetch_google_doc_as_archive(
2191    url: &str,
2192    api_token: Option<&str>,
2193) -> crate::Result<GDocsArchiveResult> {
2194    let result = fetch_google_doc(url, "html", api_token).await?;
2195
2196    let preprocess = preprocess_google_docs_export_html(&result.content);
2197    debug!(
2198        document_id = %result.document_id,
2199        hoisted = preprocess.hoisted,
2200        unwrapped_links = preprocess.unwrapped_links,
2201        "google-docs-export pre-processor rewrote archive markup"
2202    );
2203
2204    let (local_html, images) = extract_base64_images(&preprocess.html);
2205
2206    let markdown = crate::markdown::convert_html_to_markdown(&local_html, None)?;
2207
2208    debug!(
2209        "Archive prepared: {} images extracted, {} bytes HTML, {} bytes Markdown",
2210        images.len(),
2211        local_html.len(),
2212        markdown.len()
2213    );
2214
2215    Ok(GDocsArchiveResult {
2216        html: local_html,
2217        markdown,
2218        images,
2219        document_id: result.document_id,
2220        export_url: result.export_url,
2221    })
2222}
2223
2224/// Create a ZIP archive from a `GDocsArchiveResult`.
2225///
2226/// # Arguments
2227///
2228/// * `archive` - The archive result to bundle
2229/// * `pretty_html` - Whether to pretty-print the HTML output
2230///
2231/// # Errors
2232///
2233/// Returns an error if ZIP creation fails.
2234pub fn create_archive_zip(
2235    archive: &GDocsArchiveResult,
2236    pretty_html: bool,
2237) -> crate::Result<Vec<u8>> {
2238    let mut buf = std::io::Cursor::new(Vec::new());
2239
2240    {
2241        let mut zip = zip::ZipWriter::new(&mut buf);
2242        let options = zip::write::SimpleFileOptions::default()
2243            .compression_method(zip::CompressionMethod::Deflated);
2244
2245        zip.start_file("document.md", options)
2246            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
2247        zip.write_all(archive.markdown.as_bytes())?;
2248
2249        let html_output = if pretty_html {
2250            crate::html::pretty_print_html(&archive.html)
2251        } else {
2252            archive.html.clone()
2253        };
2254        zip.start_file("document.html", options)
2255            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
2256        zip.write_all(html_output.as_bytes())?;
2257
2258        for img in &archive.images {
2259            zip.start_file(format!("images/{}", img.filename), options)
2260                .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
2261            zip.write_all(&img.data)?;
2262        }
2263
2264        zip.finish()
2265            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
2266    }
2267
2268    Ok(buf.into_inner())
2269}