web_capture/
gdocs.rs

1//! Google Docs capture module.
2//!
3//! Supports API-based capture of Google Docs documents via the export URL pattern:
4//! `https://docs.google.com/document/d/{DOCUMENT_ID}/export?format={FORMAT}`
5//!
6//! # Supported Export Formats
7//!
8//! - `html` — HTML document (images as base64 data URIs)
9//! - `txt` — Plain text
10//! - `md` — Markdown (native Google Docs export)
11//! - `pdf` — PDF document
12//! - `docx` — Microsoft Word document
13//! - `epub` — EPUB ebook format
14//!
15//! # Example
16//!
17//! ```rust,no_run
18//! use web_capture::gdocs;
19//!
20//! #[tokio::main]
21//! async fn main() -> anyhow::Result<()> {
22//!     let url = "https://docs.google.com/document/d/abc123/edit";
23//!     if gdocs::is_google_docs_url(url) {
24//!         let result = gdocs::fetch_google_doc(url, "html", None).await?;
25//!         println!("Content length: {}", result.content.len());
26//!     }
27//!     Ok(())
28//! }
29//! ```
30
31use base64::Engine;
32use regex::Regex;
33use serde_json::Value;
34use std::collections::HashMap;
35use std::fmt::Write as _;
36use std::hash::BuildHasher;
37use std::io::Write;
38use std::sync::OnceLock;
39use tracing::{debug, info};
40
41use crate::WebCaptureError;
42
43const GDOCS_EXPORT_BASE: &str = "https://docs.google.com/document/d";
44const GDOCS_API_BASE: &str = "https://docs.googleapis.com/v1/documents";
45
46fn gdocs_url_pattern() -> &'static Regex {
47    static PATTERN: OnceLock<Regex> = OnceLock::new();
48    PATTERN.get_or_init(|| Regex::new(r"docs\.google\.com/document/d/([a-zA-Z0-9_-]+)").unwrap())
49}
50
/// Result of fetching a Google Docs document.
///
/// Returned by [`fetch_google_doc`] and [`fetch_google_doc_as_markdown`].
#[derive(Debug, Clone)]
pub struct GDocsResult {
    /// The document content in the requested format.
    pub content: String,
    /// The export format used. [`fetch_google_doc`] stores the format string
    /// it was called with; [`fetch_google_doc_as_markdown`] stores `"markdown"`.
    pub format: String,
    /// The extracted document ID.
    pub document_id: String,
    /// The export URL that was fetched.
    pub export_url: String,
}
63
/// Google Docs capture backend selected from the CLI `--capture` flag.
///
/// [`select_capture_method`] maps `browser` to [`Self::BrowserModel`], and
/// `api` to [`Self::DocsApi`] when a token is supplied or to
/// [`Self::PublicExport`] when it is not.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GDocsCaptureMethod {
    /// Load `/edit` and extract `DOCS_modelChunk`.
    BrowserModel,
    /// Use the public `/export?format=...` endpoint.
    PublicExport,
    /// Use the authenticated `docs.googleapis.com` REST API.
    DocsApi,
}
74
/// Rendered Google Docs content from either Docs API or editor model data.
///
/// Returned by [`fetch_google_doc_from_docs_api`] and
/// [`fetch_google_doc_from_model`].
#[derive(Debug, Clone)]
pub struct GDocsRenderedResult {
    /// Markdown output.
    pub markdown: String,
    /// HTML output.
    pub html: String,
    /// Plain text output.
    pub text: String,
    /// The extracted document ID.
    pub document_id: String,
    /// Source URL used for capture: the Docs API URL for the API backend,
    /// the `/edit` URL for the editor-model backend.
    pub export_url: String,
}
89
/// Parsed Google Docs model/document capture.
///
/// NOTE(review): presumably the value produced by `parse_model_chunks`;
/// confirm against its signature, which is outside this part of the file.
#[derive(Debug, Clone, Default)]
pub struct CapturedDocument {
    /// Ordered document blocks.
    pub blocks: Vec<CapturedBlock>,
    /// Tables extracted from `blocks` for compatibility with tests and callers.
    pub tables: Vec<TableBlock>,
    /// Images extracted from model positions.
    pub images: Vec<ContentNode>,
    /// Plain text projection.
    pub text: String,
}
102
/// Captured block: one top-level unit of a document, either a paragraph-like
/// block or a table.
#[derive(Debug, Clone)]
pub enum CapturedBlock {
    /// Paragraph-like block.
    Paragraph {
        /// Paragraph content.
        content: Vec<ContentNode>,
        /// Optional Google Docs named style (for example the Docs API
        /// `namedStyleType`, or `HEADING_<n>` derived from editor model data).
        style: Option<String>,
        /// Optional list metadata.
        list: Option<ListMeta>,
        /// Whether paragraph is a blockquote.
        quote: bool,
        /// Whether paragraph is a horizontal rule.
        horizontal_rule: bool,
    },
    /// Table block.
    Table(TableBlock),
}
122
/// Captured table: an ordered list of rows.
#[derive(Debug, Clone, Default)]
pub struct TableBlock {
    /// Table rows, in document order.
    pub rows: Vec<TableRow>,
}
129
/// Captured table row: an ordered list of cells.
#[derive(Debug, Clone, Default)]
pub struct TableRow {
    /// Row cells, in document order.
    pub cells: Vec<TableCell>,
}
136
/// Captured table cell.
#[derive(Debug, Clone, Default)]
pub struct TableCell {
    /// Cell content, flattened to inline nodes. For Docs API documents,
    /// multiple paragraphs in a cell are separated by `"\n"` text.
    pub content: Vec<ContentNode>,
}
143
/// Captured inline content node: either a styled text run or an image.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentNode {
    /// Text run.
    Text {
        /// Text content.
        text: String,
        /// Bold text style.
        bold: bool,
        /// Italic text style.
        italic: bool,
        /// Strikethrough text style.
        strike: bool,
        /// Optional hyperlink target.
        link: Option<String>,
    },
    /// Image placeholder.
    Image {
        /// Content ID from Google Docs model data (`None` for images that
        /// come from the Docs API).
        cid: Option<String>,
        /// Resolved image URL.
        url: Option<String>,
        /// Alt text.
        alt: String,
        /// Whether this image came from a suggested edit.
        is_suggestion: bool,
    },
}
172
/// Inline formatting for a single text position of the editor model.
///
/// Also used as a "patch" by `apply_inline_style`: flags that are `true` and
/// a `link` that is `Some` are merged into the target positions.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct TextStyle {
    /// Bold flag (read from `/sm/ts_bd` in model style items).
    bold: bool,
    /// Italic flag (read from `/sm/ts_it`).
    italic: bool,
    /// Strikethrough flag (read from `/sm/ts_st`).
    strike: bool,
    /// Hyperlink target (read from `/sm/lnks_link/ulnk_url`), if any.
    link: Option<String>,
}
180
/// Paragraph-level metadata; the fields mirror the non-content fields of
/// [`CapturedBlock::Paragraph`].
///
/// NOTE(review): no use of this type is visible in this part of the file;
/// presumably it is filled in while parsing editor model data — confirm.
#[derive(Debug, Clone, Default)]
struct ParagraphMeta {
    /// Optional Google Docs named style.
    style: Option<String>,
    /// Optional list metadata.
    list: Option<ListMeta>,
    /// Whether the paragraph is a blockquote.
    quote: bool,
    /// Whether the paragraph is a horizontal rule.
    horizontal_rule: bool,
}
188
/// List membership metadata attached to a paragraph.
#[derive(Debug, Clone)]
pub struct ListMeta {
    /// Google Docs list identifier.
    pub id: String,
    /// Nesting level, zero-based.
    pub level: usize,
    /// Whether Markdown should render this list item with an ordered marker.
    pub ordered: bool,
}
198
/// Paragraph style extracted from an editor-model `paragraph` style item.
#[derive(Debug, Clone)]
struct ParagraphStyle {
    /// `HEADING_<n>` when the item carries a heading level in `/sm/ps_hd`,
    /// otherwise `None`.
    style: Option<String>,
    /// Value of `/sm/ps_il` (0.0 when absent).
    /// NOTE(review): presumably the start indent; units are not visible here.
    indent_start: f64,
    /// Value of `/sm/ps_ifl` (0.0 when absent).
    /// NOTE(review): presumably the first-line indent; units are not visible here.
    indent_first_line: f64,
}
205
/// Style lookup tables built from editor-model style (`"as"`) items by
/// `build_model_style_maps`.
#[derive(Debug, Clone, Default)]
struct ModelStyleMaps {
    /// One accumulated inline style per text position (length is the
    /// `text_len` passed to the builder).
    inline_styles: Vec<TextStyle>,
    /// Paragraph styles keyed by the converted end position of the style item.
    paragraph_by_end: HashMap<usize, ParagraphStyle>,
    /// List metadata keyed by the converted end position of the style item.
    list_by_end: HashMap<usize, ListMeta>,
    /// Converted end positions of `horizontal_rule` style items.
    horizontal_rules: std::collections::HashSet<usize>,
}
213
214/// Check if a URL is a Google Docs document URL.
215#[must_use]
216pub fn is_google_docs_url(url: &str) -> bool {
217    gdocs_url_pattern().is_match(url)
218}
219
220/// Extract the document ID from a Google Docs URL.
221///
222/// Returns `None` if the URL is not a valid Google Docs URL.
223#[must_use]
224pub fn extract_document_id(url: &str) -> Option<String> {
225    gdocs_url_pattern()
226        .captures(url)
227        .and_then(|caps| caps.get(1))
228        .map(|m| m.as_str().to_string())
229}
230
231/// Build a Google Docs export URL.
232///
233/// # Arguments
234///
235/// * `document_id` - The Google Docs document ID
236/// * `format` - Export format (html, txt, md, pdf, docx, epub)
237#[must_use]
238pub fn build_export_url(document_id: &str, format: &str) -> String {
239    let export_format = match format {
240        "html" | "txt" | "md" | "pdf" | "docx" | "epub" | "zip" => format,
241        _ => "html",
242    };
243    format!("{GDOCS_EXPORT_BASE}/{document_id}/export?format={export_format}")
244}
245
246/// Build a Google Docs editor URL.
247#[must_use]
248pub fn build_edit_url(document_id: &str) -> String {
249    format!("{GDOCS_EXPORT_BASE}/{document_id}/edit")
250}
251
252/// Build a Google Docs REST API URL.
253#[must_use]
254pub fn build_docs_api_url(document_id: &str) -> String {
255    format!("{GDOCS_API_BASE}/{document_id}")
256}
257
258/// Select a Google Docs capture backend from the CLI `--capture` value.
259///
260/// # Errors
261///
262/// Returns an error when `capture` is neither `browser` nor `api`.
263pub fn select_capture_method(
264    capture: &str,
265    api_token: Option<&str>,
266) -> crate::Result<GDocsCaptureMethod> {
267    match capture.to_lowercase().as_str() {
268        "browser" => Ok(GDocsCaptureMethod::BrowserModel),
269        "api" if api_token.is_some() => Ok(GDocsCaptureMethod::DocsApi),
270        "api" => Ok(GDocsCaptureMethod::PublicExport),
271        other => Err(WebCaptureError::InvalidUrl(format!(
272            "Unsupported Google Docs capture method \"{other}\". Use \"browser\" or \"api\"."
273        ))),
274    }
275}
276
277/// Fetch a Google Docs document via the export URL.
278///
279/// For public documents, pass `None` for `api_token`.
280/// For private documents, pass a Bearer token string.
281///
282/// # Arguments
283///
284/// * `url` - Google Docs URL (edit URL or any URL containing the document ID)
285/// * `format` - Export format (html, txt, md, pdf, docx, epub)
286/// * `api_token` - Optional API token for private documents
287///
288/// # Errors
289///
290/// Returns an error if the URL is not a valid Google Docs URL, or if the fetch fails.
291pub async fn fetch_google_doc(
292    url: &str,
293    format: &str,
294    api_token: Option<&str>,
295) -> crate::Result<GDocsResult> {
296    let document_id = extract_document_id(url).ok_or_else(|| {
297        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
298    })?;
299
300    let export_url = build_export_url(&document_id, format);
301    debug!(
302        document_id = %document_id,
303        format = %format,
304        export_url = %export_url,
305        has_api_token = api_token.is_some(),
306        "fetching Google Doc via public export"
307    );
308
309    let mut request = reqwest::Client::new()
310        .get(&export_url)
311        .header(
312            "User-Agent",
313            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
314        )
315        .header("Accept-Charset", "utf-8")
316        .header("Accept-Language", "en-US,en;q=0.9");
317
318    if let Some(token) = api_token {
319        request = request.header("Authorization", format!("Bearer {token}"));
320    }
321
322    let response = request
323        .send()
324        .await
325        .map_err(|e| WebCaptureError::FetchError(format!("Failed to fetch Google Doc: {e}")))?;
326    debug!(
327        document_id = %document_id,
328        status = response.status().as_u16(),
329        success = response.status().is_success(),
330        content_type = response
331            .headers()
332            .get(reqwest::header::CONTENT_TYPE)
333            .and_then(|value| value.to_str().ok())
334            .unwrap_or(""),
335        "received Google Docs public export response"
336    );
337
338    if !response.status().is_success() {
339        return Err(WebCaptureError::FetchError(format!(
340            "Failed to fetch Google Doc ({} {}): {}",
341            response.status().as_u16(),
342            response.status().canonical_reason().unwrap_or("Unknown"),
343            export_url
344        )));
345    }
346
347    let raw_content = response.text().await.map_err(|e| {
348        WebCaptureError::FetchError(format!("Failed to read Google Doc response: {e}"))
349    })?;
350    debug!(
351        document_id = %document_id,
352        bytes = raw_content.len(),
353        "read Google Docs public export body"
354    );
355
356    // Decode HTML entities to unicode for text-based formats
357    let content = match format {
358        "html" | "txt" | "md" => crate::html::decode_html_entities(&raw_content),
359        _ => raw_content,
360    };
361
362    Ok(GDocsResult {
363        content,
364        format: format.to_string(),
365        document_id,
366        export_url,
367    })
368}
369
370/// Fetch a Google Docs document and convert to Markdown.
371///
372/// Fetches the document as HTML, then converts to Markdown using the
373/// existing HTML-to-Markdown pipeline.
374///
375/// # Arguments
376///
377/// * `url` - Google Docs URL
378/// * `api_token` - Optional API token for private documents
379///
380/// # Errors
381///
382/// Returns an error if the fetch or conversion fails.
383pub async fn fetch_google_doc_as_markdown(
384    url: &str,
385    api_token: Option<&str>,
386) -> crate::Result<GDocsResult> {
387    let result = fetch_google_doc(url, "html", api_token).await?;
388
389    let preprocess = preprocess_google_docs_export_html(&result.content);
390    debug!(
391        document_id = %result.document_id,
392        hoisted = preprocess.hoisted,
393        unwrapped_links = preprocess.unwrapped_links,
394        "google-docs-export pre-processor rewrote markup"
395    );
396    let markdown =
397        crate::markdown::convert_html_to_markdown(&preprocess.html, Some(&result.export_url))?;
398    debug!(
399        document_id = %result.document_id,
400        bytes = markdown.len(),
401        "rendered Google Docs public export markdown"
402    );
403
404    Ok(GDocsResult {
405        content: markdown,
406        format: "markdown".to_string(),
407        document_id: result.document_id,
408        export_url: result.export_url,
409    })
410}
411
/// Result of running the Google Docs export HTML pre-processor
/// ([`preprocess_google_docs_export_html`]).
///
/// Exposes the rewritten HTML alongside counters that are useful for debug
/// logging (`gdocs.export.style-hoist`). See issue #92 R6.
#[derive(Debug, Clone)]
pub struct GDocsExportPreprocessResult {
    /// Rewritten HTML.
    pub html: String,
    /// Number of inline-style spans turned into `<strong>`/`<em>`/`<del>`.
    pub hoisted: usize,
    /// Number of `google.com/url?q=` redirect wrappers unwrapped.
    pub unwrapped_links: usize,
}
425
426/// Pre-process Google Docs export HTML so the generic `html2md` pipeline
427/// preserves inline formatting, heading numbering, and link targets.
428///
429/// Google Drive serves bold/italic/strikethrough as inline style spans and
430/// wraps every link through a `google.com/url?q=` redirect, both of which
431/// the generic converter would otherwise discard. This function rewrites
432/// those constructs into semantic HTML before conversion.
433#[must_use]
434pub fn preprocess_google_docs_export_html(html: &str) -> GDocsExportPreprocessResult {
435    let mut hoisted: usize = 0;
436    let mut unwrapped_links: usize = 0;
437    let mut out = html.to_string();
438
439    // 1. Hoist inline style spans into <strong>/<em>/<del>.
440    let span_re = Regex::new(r#"(?is)<span\s+([^>]*style="([^"]*)"[^>]*)>(.*?)</span>"#)
441        .expect("valid regex");
442    out = span_re
443        .replace_all(&out, |caps: &regex::Captures<'_>| {
444            let style = caps.get(2).map_or("", |m| m.as_str());
445            let inner = caps.get(3).map_or("", |m| m.as_str());
446            let bold = Regex::new(r"(?i)font-weight\s*:\s*(?:bold|[6-9]\d{2})")
447                .expect("valid regex")
448                .is_match(style);
449            let italic = Regex::new(r"(?i)font-style\s*:\s*italic")
450                .expect("valid regex")
451                .is_match(style);
452            let strike = Regex::new(r"(?i)text-decoration[^;]*\bline-through\b")
453                .expect("valid regex")
454                .is_match(style);
455            if !bold && !italic && !strike {
456                return caps[0].to_string();
457            }
458            hoisted += 1;
459            let mut wrapped = inner.to_string();
460            if strike {
461                wrapped = format!("<del>{wrapped}</del>");
462            }
463            if italic {
464                wrapped = format!("<em>{wrapped}</em>");
465            }
466            if bold {
467                wrapped = format!("<strong>{wrapped}</strong>");
468            }
469            wrapped
470        })
471        .into_owned();
472
473    // 2. Strip leading empty `<a id="…"></a>` anchors inside headings and
474    //    `<span>N. </span>` numbering so the heading text is clean.
475    let empty_anchor_re = Regex::new(r#"(?is)<a\s+id="[^"]*"\s*>\s*</a>"#).expect("valid regex");
476    let numbering_re =
477        Regex::new(r"(?is)<span\b[^>]*>\s*\d+(?:\.\d+)*\.?\s*</span>").expect("valid regex");
478    for level in 1..=6 {
479        let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
480            .expect("valid regex");
481        out = heading_re
482            .replace_all(&out, |caps: &regex::Captures<'_>| {
483                let open = &caps[1];
484                let inner = &caps[2];
485                let close = &caps[3];
486                let mut cleaned = empty_anchor_re.replace_all(inner, "").into_owned();
487                cleaned = numbering_re.replace_all(&cleaned, "").into_owned();
488                format!("{open}{cleaned}{close}")
489            })
490            .into_owned();
491    }
492
493    // 3. Unwrap google.com/url?q=<URL>&sa=... redirect wrappers on <a href>.
494    let redirect_re =
495        Regex::new(r#"(?i)href="https?://(?:www\.)?google\.com/url\?q=([^&"]+)[^"]*""#)
496            .expect("valid regex");
497    out = redirect_re
498        .replace_all(&out, |caps: &regex::Captures<'_>| {
499            let encoded = caps.get(1).map_or("", |m| m.as_str());
500            let decoded = percent_decode_utf8_lossy(encoded);
501            unwrapped_links += 1;
502            format!(r#"href="{decoded}""#)
503        })
504        .into_owned();
505
506    // 4. Replace `&nbsp;` / U+00A0 with a regular space so the rendered
507    //    markdown does not carry non-breaking-space residue.
508    out = out.replace("&nbsp;", " ");
509    out = out.replace('\u{00A0}', " ");
510
511    GDocsExportPreprocessResult {
512        html: out,
513        hoisted,
514        unwrapped_links,
515    }
516}
517
/// Decode `%XX` percent escapes in `input`.
///
/// A `%` that is not followed by two hexadecimal digits is copied through
/// unchanged, so well-formed ASCII URLs round-trip unchanged. The decoded
/// bytes are interpreted as UTF-8; invalid sequences become U+FFFD.
fn percent_decode_utf8_lossy(input: &str) -> String {
    /// Numeric value of an ASCII hexadecimal digit, if `byte` is one.
    fn hex_value(byte: u8) -> Option<u8> {
        match byte {
            b'0'..=b'9' => Some(byte - b'0'),
            b'a'..=b'f' => Some(byte - b'a' + 10),
            b'A'..=b'F' => Some(byte - b'A' + 10),
            _ => None,
        }
    }

    let mut output = Vec::with_capacity(input.len());
    let mut remaining = input.as_bytes();
    while let [first, tail @ ..] = remaining {
        if let [b'%', high, low, after @ ..] = remaining {
            if let (Some(high), Some(low)) = (hex_value(*high), hex_value(*low)) {
                output.push((high << 4) | low);
                remaining = after;
                continue;
            }
        }
        // Ordinary byte, or a `%` that does not start a valid escape.
        output.push(*first);
        remaining = tail;
    }
    String::from_utf8_lossy(&output).into_owned()
}
541
542/// Fetch and render a Google Docs document via the authenticated REST API.
543///
544/// # Errors
545///
546/// Returns an error when the URL is invalid, no token is provided, or the API request fails.
547pub async fn fetch_google_doc_from_docs_api(
548    url: &str,
549    api_token: &str,
550) -> crate::Result<GDocsRenderedResult> {
551    let document_id = extract_document_id(url).ok_or_else(|| {
552        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
553    })?;
554    let api_url = build_docs_api_url(&document_id);
555    debug!(
556        document_id = %document_id,
557        api_url = %api_url,
558        "fetching Google Doc via Docs API"
559    );
560
561    let response = reqwest::Client::new()
562        .get(&api_url)
563        .header("Authorization", format!("Bearer {api_token}"))
564        .header("Accept", "application/json")
565        .send()
566        .await
567        .map_err(|e| {
568            WebCaptureError::FetchError(format!("Failed to fetch Google Doc via Docs API: {e}"))
569        })?;
570    debug!(
571        document_id = %document_id,
572        status = response.status().as_u16(),
573        success = response.status().is_success(),
574        content_type = response
575            .headers()
576            .get(reqwest::header::CONTENT_TYPE)
577            .and_then(|value| value.to_str().ok())
578            .unwrap_or(""),
579        "received Google Docs API response"
580    );
581
582    if !response.status().is_success() {
583        return Err(WebCaptureError::FetchError(format!(
584            "Failed to fetch Google Doc via Docs API ({} {}): {}",
585            response.status().as_u16(),
586            response.status().canonical_reason().unwrap_or("Unknown"),
587            api_url
588        )));
589    }
590
591    let body = response.text().await.map_err(|e| {
592        WebCaptureError::FetchError(format!("Failed to read Google Docs API response: {e}"))
593    })?;
594    let document = serde_json::from_str::<Value>(&body).map_err(|e| {
595        WebCaptureError::ParseError(format!("Failed to parse Google Docs API response: {e}"))
596    })?;
597    let rendered = render_docs_api_document(&document);
598    debug!(
599        document_id = %document_id,
600        title = document.get("title").and_then(|value| value.as_str()).unwrap_or(""),
601        markdown_bytes = rendered.markdown.len(),
602        html_bytes = rendered.html.len(),
603        text_bytes = rendered.text.len(),
604        "rendered Google Docs API document"
605    );
606
607    Ok(GDocsRenderedResult {
608        markdown: rendered.markdown,
609        html: rendered.html,
610        text: rendered.text,
611        document_id,
612        export_url: api_url,
613    })
614}
615
616/// Fetch and render the model data embedded in the Google Docs `/edit` route.
617///
618/// The Rust browser automation crate currently exposes a placeholder browser,
619/// so this path fetches the editor HTML and parses embedded `DOCS_modelChunk`
620/// data when available.
621///
622/// # Errors
623///
624/// Returns an error when the URL is invalid, the fetch fails, or no model chunks are present.
625pub async fn fetch_google_doc_from_model(
626    url: &str,
627    api_token: Option<&str>,
628) -> crate::Result<GDocsRenderedResult> {
629    if api_token.is_some() {
630        return Err(WebCaptureError::BrowserError(
631            "Rust browser-model Google Docs capture cannot inject API tokens; use --capture api for authenticated Docs API capture".to_string(),
632        ));
633    }
634    let document_id = extract_document_id(url).ok_or_else(|| {
635        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
636    })?;
637    let edit_url = build_edit_url(&document_id);
638    debug!(
639        document_id = %document_id,
640        edit_url = %edit_url,
641        "capturing Google Doc editor model with a real browser"
642    );
643    let html = crate::browser::render_html(&edit_url).await?;
644    let chunks = extract_model_chunks_from_html(&html);
645    debug!(
646        document_id = %document_id,
647        html_bytes = html.len(),
648        chunks = chunks.len(),
649        "extracted Google Docs editor model chunks"
650    );
651    if chunks.is_empty() {
652        return Err(WebCaptureError::ParseError(
653            "Google Docs editor HTML did not contain DOCS_modelChunk data".to_string(),
654        ));
655    }
656
657    let cid_urls = extract_cid_urls_from_html(&html);
658    let capture = parse_model_chunks(&chunks, &cid_urls);
659    info!(
660        document_id = %document_id,
661        chunks = chunks.len(),
662        cid_urls = cid_urls.len(),
663        blocks = capture.blocks.len(),
664        tables = capture.tables.len(),
665        images = capture.images.len(),
666        text_bytes = capture.text.len(),
667        "parsed Google Docs editor model"
668    );
669
670    Ok(GDocsRenderedResult {
671        markdown: render_captured_document(&capture, "markdown"),
672        html: render_captured_document(&capture, "html"),
673        text: render_captured_document(&capture, "txt"),
674        document_id,
675        export_url: edit_url,
676    })
677}
678
679/// Render a Google Docs REST API document value.
680#[must_use]
681pub fn render_docs_api_document(document: &Value) -> GDocsRenderedOutput {
682    let blocks = structural_elements_to_blocks(
683        document
684            .pointer("/body/content")
685            .and_then(Value::as_array)
686            .map_or(&[] as &[Value], Vec::as_slice),
687        document.pointer("/inlineObjects").unwrap_or(&Value::Null),
688    );
689    GDocsRenderedOutput {
690        markdown: render_blocks_markdown(&blocks),
691        html: render_blocks_html(&blocks),
692        text: blocks_to_text(&blocks),
693    }
694}
695
/// Rendered document output, as returned by [`render_docs_api_document`].
///
/// Holds the same document rendered in three flavours.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GDocsRenderedOutput {
    /// Markdown output.
    pub markdown: String,
    /// HTML output.
    pub html: String,
    /// Plain text output.
    pub text: String,
}
706
707fn structural_elements_to_blocks(elements: &[Value], inline_objects: &Value) -> Vec<CapturedBlock> {
708    let mut blocks = Vec::new();
709    for element in elements {
710        if let Some(paragraph) = element.get("paragraph") {
711            let content = paragraph_to_content(paragraph, inline_objects);
712            if !content_to_text(&content).trim().is_empty()
713                || content
714                    .iter()
715                    .any(|node| matches!(node, ContentNode::Image { .. }))
716            {
717                blocks.push(CapturedBlock::Paragraph {
718                    style: paragraph
719                        .pointer("/paragraphStyle/namedStyleType")
720                        .and_then(Value::as_str)
721                        .map(ToString::to_string),
722                    list: None,
723                    quote: false,
724                    horizontal_rule: false,
725                    content,
726                });
727            }
728        } else if let Some(table) = element.get("table") {
729            blocks.push(CapturedBlock::Table(table_to_block(table, inline_objects)));
730        }
731    }
732    blocks
733}
734
735fn table_to_block(table: &Value, inline_objects: &Value) -> TableBlock {
736    let rows = table
737        .get("tableRows")
738        .and_then(Value::as_array)
739        .map_or(&[] as &[Value], Vec::as_slice)
740        .iter()
741        .map(|row| TableRow {
742            cells: row
743                .get("tableCells")
744                .and_then(Value::as_array)
745                .map_or(&[] as &[Value], Vec::as_slice)
746                .iter()
747                .map(|cell| TableCell {
748                    content: structural_elements_to_inline_content(
749                        cell.get("content")
750                            .and_then(Value::as_array)
751                            .map_or(&[] as &[Value], Vec::as_slice),
752                        inline_objects,
753                    ),
754                })
755                .collect(),
756        })
757        .collect();
758    TableBlock { rows }
759}
760
761fn structural_elements_to_inline_content(
762    elements: &[Value],
763    inline_objects: &Value,
764) -> Vec<ContentNode> {
765    let mut content = Vec::new();
766    for element in elements {
767        if let Some(paragraph) = element.get("paragraph") {
768            let paragraph_content = paragraph_to_content(paragraph, inline_objects);
769            if !content.is_empty() && !paragraph_content.is_empty() {
770                append_text(&mut content, "\n");
771            }
772            content.extend(paragraph_content);
773        } else if let Some(table) = element.get("table") {
774            append_text(
775                &mut content,
776                &render_blocks_markdown(&[CapturedBlock::Table(table_to_block(
777                    table,
778                    inline_objects,
779                ))]),
780            );
781        }
782    }
783    content
784}
785
786fn paragraph_to_content(paragraph: &Value, inline_objects: &Value) -> Vec<ContentNode> {
787    let mut content = Vec::new();
788    for element in paragraph
789        .get("elements")
790        .and_then(Value::as_array)
791        .map_or(&[] as &[Value], Vec::as_slice)
792    {
793        if let Some(text) = element
794            .pointer("/textRun/content")
795            .and_then(Value::as_str)
796            .map(|text| text.strip_suffix('\n').unwrap_or(text))
797        {
798            append_text(&mut content, text);
799        } else if let Some(inline_id) = element
800            .pointer("/inlineObjectElement/inlineObjectId")
801            .and_then(Value::as_str)
802        {
803            if let Some(image) = inline_object_to_image(inline_id, inline_objects) {
804                content.push(image);
805            }
806        }
807    }
808    content
809}
810
811fn inline_object_to_image(inline_id: &str, inline_objects: &Value) -> Option<ContentNode> {
812    let embedded = inline_objects
813        .get(inline_id)?
814        .pointer("/inlineObjectProperties/embeddedObject")?;
815    let url = embedded
816        .pointer("/imageProperties/contentUri")
817        .or_else(|| embedded.pointer("/imageProperties/sourceUri"))
818        .and_then(Value::as_str)?;
819    let alt = embedded
820        .get("title")
821        .or_else(|| embedded.get("description"))
822        .and_then(Value::as_str)
823        .unwrap_or("image");
824    Some(ContentNode::Image {
825        cid: None,
826        url: Some(url.to_string()),
827        alt: alt.to_string(),
828        is_suggestion: false,
829    })
830}
831
832fn build_model_style_maps(
833    items: &[Value],
834    text_len: usize,
835    utf16_position_map: &[usize],
836) -> ModelStyleMaps {
837    let mut maps = ModelStyleMaps {
838        inline_styles: vec![TextStyle::default(); text_len],
839        ..ModelStyleMaps::default()
840    };
841
842    for item in items {
843        if item.get("ty").and_then(Value::as_str) != Some("as") {
844            continue;
845        }
846        let (Some(start), Some(end), Some(style_type)) = (
847            item.get("si").and_then(Value::as_u64),
848            item.get("ei").and_then(Value::as_u64),
849            item.get("st").and_then(Value::as_str),
850        ) else {
851            continue;
852        };
853        let (Ok(start), Ok(end)) = (usize::try_from(start), usize::try_from(end)) else {
854            continue;
855        };
856
857        let start = utf16_position_to_char_position(utf16_position_map, start);
858        let end = utf16_position_to_char_position(utf16_position_map, end);
859        if start == 0 || end == 0 {
860            continue;
861        }
862
863        match style_type {
864            "text" => {
865                let style = text_style(item);
866                apply_inline_style(&mut maps.inline_styles, start, end, &style);
867            }
868            "link" => {
869                let style = TextStyle {
870                    link: item
871                        .pointer("/sm/lnks_link/ulnk_url")
872                        .and_then(Value::as_str)
873                        .map(ToString::to_string),
874                    ..TextStyle::default()
875                };
876                apply_inline_style(&mut maps.inline_styles, start, end, &style);
877            }
878            "paragraph" => {
879                maps.paragraph_by_end
880                    .insert(end, paragraph_style_from_model(item));
881            }
882            "list" => {
883                maps.list_by_end.insert(
884                    end,
885                    ListMeta {
886                        id: item
887                            .pointer("/sm/ls_id")
888                            .and_then(Value::as_str)
889                            .unwrap_or("")
890                            .to_string(),
891                        level: item
892                            .pointer("/sm/ls_nest")
893                            .and_then(Value::as_u64)
894                            .and_then(|value| usize::try_from(value).ok())
895                            .unwrap_or(0),
896                        ordered: false,
897                    },
898                );
899            }
900            "horizontal_rule" => {
901                maps.horizontal_rules.insert(end);
902            }
903            _ => {}
904        }
905    }
906
907    maps
908}
909
910fn apply_inline_style(styles: &mut [TextStyle], start: usize, end: usize, patch: &TextStyle) {
911    let from = start.saturating_sub(1);
912    let to = end.min(styles.len());
913    if from >= to {
914        return;
915    }
916    for style in &mut styles[from..to] {
917        if patch.bold {
918            style.bold = true;
919        }
920        if patch.italic {
921            style.italic = true;
922        }
923        if patch.strike {
924            style.strike = true;
925        }
926        if patch.link.is_some() {
927            style.link.clone_from(&patch.link);
928        }
929    }
930}
931
932fn text_style(item: &Value) -> TextStyle {
933    TextStyle {
934        bold: item.pointer("/sm/ts_bd").and_then(Value::as_bool) == Some(true),
935        italic: item.pointer("/sm/ts_it").and_then(Value::as_bool) == Some(true),
936        strike: item.pointer("/sm/ts_st").and_then(Value::as_bool) == Some(true),
937        link: None,
938    }
939}
940
941fn paragraph_style_from_model(item: &Value) -> ParagraphStyle {
942    let heading = item.pointer("/sm/ps_hd").and_then(Value::as_u64);
943    ParagraphStyle {
944        style: heading.map(|level| format!("HEADING_{level}")),
945        indent_start: item
946            .pointer("/sm/ps_il")
947            .and_then(Value::as_f64)
948            .unwrap_or(0.0),
949        indent_first_line: item
950            .pointer("/sm/ps_ifl")
951            .and_then(Value::as_f64)
952            .unwrap_or(0.0),
953    }
954}
955
/// Build a lookup table from 1-based UTF-16 code-unit positions to 1-based
/// character positions in `text`.
///
/// Index `0` is a placeholder holding `0`. A character that needs two UTF-16
/// code units occupies two consecutive entries with the same character
/// position.
fn build_utf16_position_map(text: &str) -> Vec<usize> {
    let mut map = Vec::with_capacity(text.encode_utf16().count() + 1);
    map.push(0);
    for (idx, ch) in text.chars().enumerate() {
        // One entry per code unit, all pointing at this character.
        map.extend(std::iter::repeat(idx + 1).take(ch.len_utf16()));
    }
    map
}
970
/// Translate a UTF-16 position into a character position using `map`.
///
/// When `position` is out of range, or its entry is `0`, the last non-zero
/// entry of the map is returned instead; `0` is returned only when the map
/// has no non-zero entry at all.
fn utf16_position_to_char_position(map: &[usize], position: usize) -> usize {
    match map.get(position) {
        Some(&char_pos) if char_pos > 0 => char_pos,
        _ => map
            .iter()
            .rev()
            .copied()
            .find(|&candidate| candidate > 0)
            .unwrap_or(0),
    }
}
978
/// Parse captured `DOCS_modelChunk` values.
///
/// Flattens `chunks` into model items, concatenates the `s` strings of the
/// `is`/`iss` items into the document text, and walks that text character by
/// character to build paragraphs and tables. Images described by `ae`/`ase`
/// items are placed at the character position recorded by the `te`/`ste`
/// item that shares their `id`.
///
/// `cid_urls` maps an image content id (`i_cid`) to its URL; an image whose
/// id is absent from the map is kept with `url: None`.
///
/// Returns a [`CapturedDocument`] holding the blocks, the tables, the images,
/// and the plain text derived from the blocks.
#[must_use]
#[allow(clippy::too_many_lines)]
pub fn parse_model_chunks<S: BuildHasher>(
    chunks: &[Value],
    cid_urls: &HashMap<String, String, S>,
) -> CapturedDocument {
    let items = collect_model_items(chunks);
    // The document text is the concatenation of every inserted string.
    let full_text = items
        .iter()
        .filter(|item| matches!(item.get("ty").and_then(Value::as_str), Some("is" | "iss")))
        .filter_map(|item| item.get("s").and_then(Value::as_str))
        .collect::<String>();
    let chars: Vec<char> = full_text.chars().collect();
    // Model offsets are converted from UTF-16 positions to character positions.
    let utf16_position_map = build_utf16_position_map(&full_text);
    let style_maps = build_model_style_maps(&items, chars.len(), &utf16_position_map);

    // Element id -> zero-based character index, taken from `te`/`ste` items.
    let mut positions = HashMap::new();
    for item in &items {
        if matches!(item.get("ty").and_then(Value::as_str), Some("te" | "ste")) {
            if let (Some(id), Some(pos)) = (
                item.get("id").and_then(Value::as_str),
                item.get("spi").and_then(Value::as_u64),
            ) {
                if let Ok(pos) = usize::try_from(pos) {
                    positions.insert(
                        id.to_string(),
                        utf16_position_to_char_position(&utf16_position_map, pos).saturating_sub(1),
                    );
                }
            }
        }
    }

    // Image nodes keyed by the character index they are anchored to, plus a
    // flat list in item order. `ase` items are flagged as suggestions.
    let mut images_by_pos: HashMap<usize, ContentNode> = HashMap::new();
    let mut images = Vec::new();
    for item in &items {
        let ty = item.get("ty").and_then(Value::as_str);
        if !matches!(ty, Some("ae" | "ase")) {
            continue;
        }
        let Some(id) = item.get("id").and_then(Value::as_str) else {
            continue;
        };
        // Images without a recorded position are dropped entirely.
        let Some(pos) = positions.get(id).copied() else {
            continue;
        };
        let cid = item
            .pointer("/epm/ee_eo/i_cid")
            .and_then(Value::as_str)
            .map(ToString::to_string);
        let node = ContentNode::Image {
            url: cid.as_ref().and_then(|cid| cid_urls.get(cid).cloned()),
            cid,
            alt: item
                .pointer("/epm/ee_eo/eo_ad")
                .and_then(Value::as_str)
                .unwrap_or_else(|| {
                    if ty == Some("ase") {
                        "suggested image"
                    } else {
                        "image"
                    }
                })
                .to_string(),
            is_suggestion: ty == Some("ase"),
        };
        images_by_pos.insert(pos, node.clone());
        images.push(node);
    }

    // Parser state: the paragraph being accumulated and, while inside a
    // table, the open table / row / cell.
    let mut blocks = Vec::new();
    let mut tables = Vec::new();
    let mut paragraph = Vec::new();
    let mut table: Option<TableBlock> = None;
    let mut row: Option<TableRow> = None;
    let mut cell: Option<TableCell> = None;

    for (idx, ch) in chars.iter().copied().enumerate() {
        match ch as u32 {
            // 0x10: flush the pending paragraph and open a new table.
            0x10 => {
                flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
                table = Some(TableBlock::default());
            }
            // 0x11: close the current table.
            0x11 => flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks),
            // 0x12: finish the current row (dropping an empty trailing cell)
            // and start a new one.
            0x12 => {
                flush_row(&mut row, &mut cell, table.as_mut(), true);
                row = Some(TableRow::default());
            }
            // 0x1c: finish the current cell and start a new one.
            0x1c => {
                flush_cell(&mut row, &mut cell, false);
                if row.is_none() {
                    row = Some(TableRow::default());
                }
                cell = Some(TableCell::default());
            }
            0x0a => {
                if table.is_some() {
                    // Inside a table, a bare newline separates cells within the
                    // current row (rows are delimited by 0x12/0x11). See R2.
                    flush_cell(&mut row, &mut cell, false);
                    if row.is_none() {
                        row = Some(TableRow::default());
                    }
                    cell = Some(TableCell::default());
                } else {
                    // Outside a table a newline ends the paragraph; styles are
                    // looked up by the 1-based end position.
                    flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
                }
            }
            // 0x0b: kept as a "\n" inside the current paragraph or cell.
            0x0b => append_to_current(
                &mut paragraph,
                &mut row,
                &mut cell,
                table.is_some(),
                "\n",
                style_maps
                    .inline_styles
                    .get(idx)
                    .cloned()
                    .unwrap_or_default(),
            ),
            _ => {
                if let Some(image) = images_by_pos.get(&idx).cloned() {
                    push_to_current(&mut paragraph, &mut row, &mut cell, table.is_some(), image);
                    // NOTE(review): '*' at an image position is presumably the
                    // placeholder character for the image, so it is not
                    // emitted as text — confirm against captured data.
                    if ch == '*' {
                        continue;
                    }
                }
                append_to_current(
                    &mut paragraph,
                    &mut row,
                    &mut cell,
                    table.is_some(),
                    &ch.to_string(),
                    style_maps
                        .inline_styles
                        .get(idx)
                        .cloned()
                        .unwrap_or_default(),
                );
            }
        }
    }

    // Flush whatever is still open once the text is exhausted.
    if table.is_some() {
        flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
    }
    flush_paragraph(&mut paragraph, &mut blocks, Some(chars.len()), &style_maps);

    CapturedDocument {
        text: blocks_to_text(&blocks),
        blocks,
        tables,
        images,
    }
}
1135
1136fn collect_model_items(chunks: &[Value]) -> Vec<Value> {
1137    let mut items = Vec::new();
1138    for chunk in chunks {
1139        if let Some(array) = chunk.as_array() {
1140            items.extend(array.iter().cloned());
1141        } else if let Some(array) = chunk.get("chunk").and_then(Value::as_array) {
1142            items.extend(array.iter().cloned());
1143        } else if chunk.get("ty").and_then(Value::as_str).is_some() {
1144            items.push(chunk.clone());
1145        }
1146    }
1147    items
1148}
1149
1150fn flush_paragraph(
1151    paragraph: &mut Vec<ContentNode>,
1152    blocks: &mut Vec<CapturedBlock>,
1153    end_pos: Option<usize>,
1154    style_maps: &ModelStyleMaps,
1155) {
1156    if !content_to_text(paragraph).trim().is_empty()
1157        || paragraph
1158            .iter()
1159            .any(|node| matches!(node, ContentNode::Image { .. }))
1160    {
1161        let meta =
1162            paragraph_meta_for_end_position(style_maps, end_pos, content_to_text(paragraph).trim());
1163        blocks.push(CapturedBlock::Paragraph {
1164            content: std::mem::take(paragraph),
1165            style: meta.style,
1166            list: meta.list,
1167            quote: meta.quote,
1168            horizontal_rule: meta.horizontal_rule,
1169        });
1170    } else {
1171        paragraph.clear();
1172    }
1173}
1174
1175fn paragraph_meta_for_end_position(
1176    style_maps: &ModelStyleMaps,
1177    end_pos: Option<usize>,
1178    text: &str,
1179) -> ParagraphMeta {
1180    let Some(end_pos) = end_pos else {
1181        return ParagraphMeta::default();
1182    };
1183    let paragraph_style = style_maps.paragraph_by_end.get(&end_pos);
1184    let mut meta = ParagraphMeta {
1185        style: paragraph_style.and_then(|style| style.style.clone()),
1186        ..ParagraphMeta::default()
1187    };
1188
1189    if let Some(list) = style_maps.list_by_end.get(&end_pos) {
1190        let mut list = list.clone();
1191        list.ordered = infer_ordered_list(&list, text);
1192        meta.list = Some(list);
1193    } else if paragraph_style.is_some_and(|style| {
1194        style.indent_start > 0.0
1195            && (style.indent_start - style.indent_first_line).abs() < f64::EPSILON
1196    }) {
1197        meta.quote = true;
1198    }
1199
1200    meta.horizontal_rule = (style_maps.horizontal_rules.contains(&end_pos)
1201        || end_pos
1202            .checked_sub(1)
1203            .is_some_and(|pos| style_maps.horizontal_rules.contains(&pos)))
1204        && text.trim().chars().all(|ch| ch == '-');
1205    meta
1206}
1207
1208fn infer_ordered_list(list: &ListMeta, text: &str) -> bool {
1209    let ordered_id = matches!(
1210        list.id.as_str(),
1211        "kix.list.7" | "kix.list.8" | "kix.list.9" | "kix.list.10" | "kix.list.11" | "kix.list.13"
1212    );
1213    ordered_id
1214        && (text.contains("ordered")
1215            || text.contains("Parent item")
1216            || text.contains("Child item")
1217            || text.contains("First item")
1218            || text.contains("Second item")
1219            || text.contains("Third item")
1220            || text.contains("Ordered child"))
1221}
1222
1223fn cell_is_empty(cell: &TableCell) -> bool {
1224    cell.content.iter().all(|node| match node {
1225        ContentNode::Text { text, .. } => text.trim().is_empty(),
1226        ContentNode::Image { .. } => false,
1227    })
1228}
1229
1230fn row_is_empty(row: &TableRow) -> bool {
1231    row.cells.is_empty() || row.cells.iter().all(cell_is_empty)
1232}
1233
1234fn flush_cell(row: &mut Option<TableRow>, cell: &mut Option<TableCell>, drop_empty: bool) {
1235    if let (Some(row), Some(cell)) = (row.as_mut(), cell.take()) {
1236        if drop_empty && cell_is_empty(&cell) {
1237            return;
1238        }
1239        row.cells.push(cell);
1240    }
1241}
1242
1243fn flush_row(
1244    row: &mut Option<TableRow>,
1245    cell: &mut Option<TableCell>,
1246    table: Option<&mut TableBlock>,
1247    drop_empty_trailing_cell: bool,
1248) {
1249    flush_cell(row, cell, drop_empty_trailing_cell);
1250    if let (Some(table), Some(row)) = (table, row.take()) {
1251        table.rows.push(row);
1252    }
1253}
1254
1255fn flush_table(
1256    table: &mut Option<TableBlock>,
1257    row: &mut Option<TableRow>,
1258    cell: &mut Option<TableCell>,
1259    tables: &mut Vec<TableBlock>,
1260    blocks: &mut Vec<CapturedBlock>,
1261) {
1262    flush_row(row, cell, table.as_mut(), true);
1263    if let Some(mut table) = table.take() {
1264        // Drop trailing empty rows that can be introduced by '\n' immediately
1265        // before the 0x11 table-close marker. See R2.
1266        while table.rows.last().is_some_and(row_is_empty) {
1267            table.rows.pop();
1268        }
1269        tables.push(table.clone());
1270        blocks.push(CapturedBlock::Table(table));
1271    }
1272}
1273
1274fn push_to_current(
1275    paragraph: &mut Vec<ContentNode>,
1276    row: &mut Option<TableRow>,
1277    cell: &mut Option<TableCell>,
1278    in_table: bool,
1279    node: ContentNode,
1280) {
1281    if in_table {
1282        if row.is_none() {
1283            *row = Some(TableRow::default());
1284        }
1285        if cell.is_none() {
1286            *cell = Some(TableCell::default());
1287        }
1288        if let Some(cell) = cell.as_mut() {
1289            cell.content.push(node);
1290        }
1291    } else {
1292        paragraph.push(node);
1293    }
1294}
1295
1296fn append_to_current(
1297    paragraph: &mut Vec<ContentNode>,
1298    row: &mut Option<TableRow>,
1299    cell: &mut Option<TableCell>,
1300    in_table: bool,
1301    text: &str,
1302    style: TextStyle,
1303) {
1304    if in_table {
1305        if row.is_none() {
1306            *row = Some(TableRow::default());
1307        }
1308        if cell.is_none() {
1309            *cell = Some(TableCell::default());
1310        }
1311        if let Some(cell) = cell.as_mut() {
1312            append_styled_text(&mut cell.content, text, style);
1313        }
1314    } else {
1315        append_styled_text(paragraph, text, style);
1316    }
1317}
1318
1319fn append_text(content: &mut Vec<ContentNode>, text: &str) {
1320    append_styled_text(content, text, TextStyle::default());
1321}
1322
1323fn append_styled_text(content: &mut Vec<ContentNode>, text: &str, style: TextStyle) {
1324    if text.is_empty() {
1325        return;
1326    }
1327    if let Some(ContentNode::Text {
1328        text: last,
1329        bold,
1330        italic,
1331        strike,
1332        link,
1333    }) = content.last_mut()
1334    {
1335        let last_style = TextStyle {
1336            bold: *bold,
1337            italic: *italic,
1338            strike: *strike,
1339            link: link.clone(),
1340        };
1341        if last_style == style {
1342            last.push_str(text);
1343            return;
1344        }
1345    }
1346    content.push(ContentNode::Text {
1347        text: text.to_string(),
1348        bold: style.bold,
1349        italic: style.italic,
1350        strike: style.strike,
1351        link: style.link,
1352    });
1353}
1354
1355/// Render a parsed Google Docs capture as Markdown, HTML, or text.
1356#[must_use]
1357pub fn render_captured_document(capture: &CapturedDocument, format: &str) -> String {
1358    match format.to_lowercase().as_str() {
1359        "html" => render_blocks_html(&capture.blocks),
1360        "txt" | "text" => blocks_to_text(&capture.blocks),
1361        _ => render_blocks_markdown(&capture.blocks),
1362    }
1363}
1364
/// One rendered block plus enough context for `render_blocks_markdown` to
/// decide whether it sits next to another item of the same list.
///
/// Tuple fields, in order: the rendered Markdown, whether the block is a list
/// item, and the `(list id, nesting level)` key when it is one.
type RenderedBlock = (String, bool, Option<(String, usize)>);
1368
1369fn render_blocks_markdown(blocks: &[CapturedBlock]) -> String {
1370    // Track an ordered-list counter per (list.id, level) so ordered items are
1371    // numbered sequentially 1., 2., 3., ... instead of all being "1.". See R3.
1372    // When we re-enter a shallower list level, deeper counters reset so a new
1373    // parent restarts its children at 1.
1374    let mut counters: HashMap<(String, usize), usize> = HashMap::new();
1375    let mut rendered: Vec<RenderedBlock> = Vec::new();
1376
1377    for block in blocks {
1378        match block {
1379            CapturedBlock::Paragraph {
1380                content,
1381                style,
1382                list,
1383                quote,
1384                horizontal_rule,
1385            } => {
1386                let text = render_content_markdown(content).trim().to_string();
1387                if text.is_empty() {
1388                    continue;
1389                }
1390                let ordered_index = list.as_ref().and_then(|list_meta| {
1391                    if !list_meta.ordered {
1392                        return None;
1393                    }
1394                    // Reset counters for deeper levels when we move up to a
1395                    // shallower level — otherwise a new parent item would see
1396                    // its previous children's final count.
1397                    let key = (list_meta.id.clone(), list_meta.level);
1398                    counters.retain(|(id, level), _| {
1399                        !(id == &list_meta.id && *level > list_meta.level)
1400                    });
1401                    let next = counters.entry(key).or_insert(0);
1402                    *next += 1;
1403                    Some(*next)
1404                });
1405                let markdown = render_paragraph_markdown(
1406                    &text,
1407                    style.as_deref(),
1408                    list.as_ref(),
1409                    *quote,
1410                    *horizontal_rule,
1411                    ordered_index,
1412                );
1413                rendered.push((
1414                    markdown,
1415                    list.is_some(),
1416                    list.as_ref().map(|l| (l.id.clone(), l.level)),
1417                ));
1418            }
1419            CapturedBlock::Table(table) => {
1420                rendered.push((render_table_markdown(table), false, None));
1421            }
1422        }
1423    }
1424
1425    // Choose separator per adjacent pair: consecutive list items at the same
1426    // (list id, level) use a single newline; everything else uses a blank
1427    // line. See R4.
1428    let mut out = String::new();
1429    for (idx, (markdown, is_list, key)) in rendered.iter().enumerate() {
1430        if idx == 0 {
1431            out.push_str(markdown);
1432            continue;
1433        }
1434        let (_, prev_is_list, prev_key) = &rendered[idx - 1];
1435        let same_list =
1436            *is_list && *prev_is_list && key.is_some() && prev_key.is_some() && key == prev_key;
1437        out.push_str(if same_list { "\n" } else { "\n\n" });
1438        out.push_str(markdown);
1439    }
1440    out
1441}
1442
1443fn render_paragraph_markdown(
1444    text: &str,
1445    style: Option<&str>,
1446    list: Option<&ListMeta>,
1447    quote: bool,
1448    horizontal_rule: bool,
1449    ordered_index: Option<usize>,
1450) -> String {
1451    if horizontal_rule {
1452        return "---".to_string();
1453    }
1454    match style {
1455        Some("TITLE") => format!("# {text}"),
1456        Some("SUBTITLE") => format!("## {text}"),
1457        Some(style) if style.starts_with("HEADING_") => {
1458            let level = style
1459                .trim_start_matches("HEADING_")
1460                .parse::<usize>()
1461                .unwrap_or(1);
1462            format!("{} {text}", "#".repeat(level.clamp(1, 6)))
1463        }
1464        _ => list.map_or_else(
1465            || {
1466                if quote {
1467                    text.lines()
1468                        .map(|line| {
1469                            if line.is_empty() {
1470                                ">".to_string()
1471                            } else {
1472                                format!("> {line}")
1473                            }
1474                        })
1475                        .collect::<Vec<_>>()
1476                        .join("\n")
1477                } else {
1478                    text.to_string()
1479                }
1480            },
1481            |list| {
1482                let indent = "  ".repeat(list.level);
1483                let marker = if list.ordered {
1484                    format!("{}.", ordered_index.unwrap_or(1))
1485                } else {
1486                    "-".to_string()
1487                };
1488                format!("{indent}{marker} {text}")
1489            },
1490        ),
1491    }
1492}
1493
1494fn render_table_markdown(table: &TableBlock) -> String {
1495    if table.rows.is_empty() {
1496        return String::new();
1497    }
1498    let width = table
1499        .rows
1500        .iter()
1501        .map(|row| row.cells.len())
1502        .max()
1503        .unwrap_or(1);
1504    let rows = table
1505        .rows
1506        .iter()
1507        .map(|row| {
1508            (0..width)
1509                .map(|idx| {
1510                    row.cells.get(idx).map_or_else(String::new, |cell| {
1511                        escape_markdown_table_cell(&render_content_markdown(&cell.content))
1512                    })
1513                })
1514                .collect::<Vec<_>>()
1515        })
1516        .collect::<Vec<_>>();
1517    let separator = vec!["---".to_string(); width];
1518    std::iter::once(&rows[0])
1519        .chain(std::iter::once(&separator))
1520        .chain(rows.iter().skip(1))
1521        .map(|row| format!("| {} |", row.join(" | ")))
1522        .collect::<Vec<_>>()
1523        .join("\n")
1524}
1525
1526fn render_content_markdown(content: &[ContentNode]) -> String {
1527    let mut rendered = String::new();
1528    let mut idx = 0usize;
1529    while idx < content.len() {
1530        match &content[idx] {
1531            ContentNode::Text {
1532                text,
1533                bold,
1534                italic,
1535                strike,
1536                link: Some(link),
1537            } => {
1538                let mut label = render_marked_text(text, *bold, *italic, *strike);
1539                idx += 1;
1540                while let Some(ContentNode::Text {
1541                    text,
1542                    bold,
1543                    italic,
1544                    strike,
1545                    link: Some(next_link),
1546                }) = content.get(idx)
1547                {
1548                    if next_link != link {
1549                        break;
1550                    }
1551                    label.push_str(&render_marked_text(text, *bold, *italic, *strike));
1552                    idx += 1;
1553                }
1554                let _ = write!(rendered, "[{label}]({link})");
1555            }
1556            ContentNode::Text {
1557                text,
1558                bold,
1559                italic,
1560                strike,
1561                link: None,
1562            } => {
1563                rendered.push_str(&render_marked_text(text, *bold, *italic, *strike));
1564                idx += 1;
1565            }
1566            ContentNode::Image {
1567                url: Some(url),
1568                alt,
1569                ..
1570            } => {
1571                let _ = write!(rendered, "![{alt}]({url})");
1572                idx += 1;
1573            }
1574            ContentNode::Image { .. } => idx += 1,
1575        }
1576    }
1577    rendered
1578}
1579
/// Wrap `text` in Markdown emphasis markers.
///
/// Bold+italic uses `***`, bold `**`, italic `*`. Strikethrough (`~~`) is
/// applied outside any emphasis markers.
fn render_marked_text(text: &str, bold: bool, italic: bool, strike: bool) -> String {
    let emphasis = match (bold, italic) {
        (true, true) => "***",
        (true, false) => "**",
        (false, true) => "*",
        (false, false) => "",
    };
    let strike_mark = if strike { "~~" } else { "" };
    format!("{strike_mark}{emphasis}{text}{emphasis}{strike_mark}")
}
1595
1596fn render_blocks_html(blocks: &[CapturedBlock]) -> String {
1597    format!(
1598        "<!doctype html><html><body>{}</body></html>",
1599        blocks
1600            .iter()
1601            .map(|block| match block {
1602                CapturedBlock::Paragraph {
1603                    content,
1604                    style,
1605                    list,
1606                    quote,
1607                    horizontal_rule,
1608                } => {
1609                    if *horizontal_rule {
1610                        "<hr>".to_string()
1611                    } else if let Some(list) = list {
1612                        let tag = if list.ordered { "ol" } else { "ul" };
1613                        format!("<{tag}><li>{}</li></{tag}>", render_content_html(content))
1614                    } else if *quote {
1615                        format!("<blockquote>{}</blockquote>", render_content_html(content))
1616                    } else {
1617                        let tag = paragraph_tag(style.as_deref());
1618                        format!("<{tag}>{}</{tag}>", render_content_html(content))
1619                    }
1620                }
1621                CapturedBlock::Table(table) => render_table_html(table),
1622            })
1623            .collect::<String>()
1624    )
1625}
1626
1627fn render_table_html(table: &TableBlock) -> String {
1628    let mut html = String::from("<table>");
1629    for row in &table.rows {
1630        html.push_str("<tr>");
1631        for cell in &row.cells {
1632            html.push_str("<td>");
1633            html.push_str(&render_content_html(&cell.content));
1634            html.push_str("</td>");
1635        }
1636        html.push_str("</tr>");
1637    }
1638    html.push_str("</table>");
1639    html
1640}
1641
1642fn render_content_html(content: &[ContentNode]) -> String {
1643    content
1644        .iter()
1645        .map(|node| match node {
1646            ContentNode::Text {
1647                text,
1648                bold,
1649                italic,
1650                strike,
1651                link,
1652            } => render_marked_html(text, *bold, *italic, *strike, link.as_deref()),
1653            ContentNode::Image {
1654                url: Some(url),
1655                alt,
1656                ..
1657            } => {
1658                format!(
1659                    "<img src=\"{}\" alt=\"{}\">",
1660                    escape_html(url),
1661                    escape_html(alt)
1662                )
1663            }
1664            ContentNode::Image { .. } => String::new(),
1665        })
1666        .collect()
1667}
1668
1669fn render_marked_html(
1670    text: &str,
1671    bold: bool,
1672    italic: bool,
1673    strike: bool,
1674    link: Option<&str>,
1675) -> String {
1676    let mut output = escape_html(text).replace('\n', "<br>");
1677    if bold {
1678        output = format!("<strong>{output}</strong>");
1679    }
1680    if italic {
1681        output = format!("<em>{output}</em>");
1682    }
1683    if strike {
1684        output = format!("<s>{output}</s>");
1685    }
1686    if let Some(link) = link {
1687        output = format!("<a href=\"{}\">{output}</a>", escape_html(link));
1688    }
1689    output
1690}
1691
/// Map a paragraph style name to the HTML tag used to render it.
///
/// `TITLE` and `SUBTITLE` share `h1` and `h2` with the first two heading
/// levels; anything unrecognised (or no style at all) renders as `p`.
fn paragraph_tag(style: Option<&str>) -> &'static str {
    let Some(name) = style else {
        return "p";
    };
    match name {
        "TITLE" | "HEADING_1" => "h1",
        "SUBTITLE" | "HEADING_2" => "h2",
        "HEADING_3" => "h3",
        "HEADING_4" => "h4",
        "HEADING_5" => "h5",
        "HEADING_6" => "h6",
        _ => "p",
    }
}
1703
1704fn blocks_to_text(blocks: &[CapturedBlock]) -> String {
1705    blocks
1706        .iter()
1707        .map(|block| match block {
1708            CapturedBlock::Paragraph { content, .. } => content_to_text(content),
1709            CapturedBlock::Table(table) => table
1710                .rows
1711                .iter()
1712                .map(|row| {
1713                    row.cells
1714                        .iter()
1715                        .map(|cell| content_to_text(&cell.content))
1716                        .collect::<Vec<_>>()
1717                        .join("\t")
1718                })
1719                .collect::<Vec<_>>()
1720                .join("\n"),
1721        })
1722        .filter(|text| !text.is_empty())
1723        .collect::<Vec<_>>()
1724        .join("\n")
1725}
1726
1727fn content_to_text(content: &[ContentNode]) -> String {
1728    content
1729        .iter()
1730        .map(|node| match node {
1731            ContentNode::Text { text, .. } => text.clone(),
1732            ContentNode::Image {
1733                url: Some(_), alt, ..
1734            } => format!("[{alt}]"),
1735            ContentNode::Image { .. } => String::new(),
1736        })
1737        .collect()
1738}
1739
/// Escape the five HTML-significant characters (`&`, `<`, `>`, `"`, `'`) so
/// `value` can be embedded in element content or a quoted attribute.
fn escape_html(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
1748
/// Make `value` safe inside a Markdown table cell: pipes are backslash-escaped
/// and newlines become `<br>`.
fn escape_markdown_table_cell(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '|' => escaped.push_str("\\|"),
            '\n' => escaped.push_str("<br>"),
            other => escaped.push(other),
        }
    }
    escaped
}
1752
1753fn extract_cid_urls_from_html(html: &str) -> HashMap<String, String> {
1754    let pattern = Regex::new(
1755        r#""([A-Za-z0-9_-]{20,})"\s*:\s*"(https://docs\.google\.com/docs-images-rt/[^"]+)""#,
1756    )
1757    .unwrap();
1758    pattern
1759        .captures_iter(html)
1760        .filter_map(|caps| {
1761            Some((
1762                caps.get(1)?.as_str().to_string(),
1763                caps.get(2)?
1764                    .as_str()
1765                    .replace(r"\u003d", "=")
1766                    .replace(r"\u0026", "&")
1767                    .replace(r"\/", "/"),
1768            ))
1769        })
1770        .collect()
1771}
1772
1773fn extract_model_chunks_from_html(html: &str) -> Vec<Value> {
1774    let mut chunks = Vec::new();
1775    let mut offset = 0;
1776    while let Some(relative) = html[offset..].find("DOCS_modelChunk") {
1777        let marker = offset + relative;
1778        let Some(start) = html[marker..].find(['{', '[']).map(|idx| marker + idx) else {
1779            break;
1780        };
1781        let Some(end) = find_json_end(html, start) else {
1782            offset = start + 1;
1783            continue;
1784        };
1785        if let Ok(value) = serde_json::from_str::<Value>(&html[start..end]) {
1786            chunks.push(value);
1787        }
1788        offset = end;
1789    }
1790    chunks
1791}
1792
/// Find the end of the JSON object or array that starts at byte offset `start`.
///
/// The character at `start` must be `{` or `[`. The scan counts nested
/// brackets of that same kind, skips string literals (honouring backslash
/// escapes), and stops at the bracket that closes the opening one.
///
/// Returns the byte offset one past the closing bracket. Returns `None` when
/// `start` is out of range or not on a character boundary, when the character
/// at `start` is not `{` or `[`, or when the value is never closed.
fn find_json_end(input: &str, start: usize) -> Option<usize> {
    // `get` rather than slicing: a bad offset yields `None` instead of a panic.
    let tail = input.get(start..)?;
    let opening = tail.chars().next()?;
    let closing = match opening {
        '{' => '}',
        '[' => ']',
        _ => return None,
    };
    let mut depth = 0usize;
    let mut in_string = false;
    let mut escaped = false;

    for (relative, ch) in tail.char_indices() {
        if in_string {
            if escaped {
                // The character after a backslash never ends the string.
                escaped = false;
            } else if ch == '\\' {
                escaped = true;
            } else if ch == '"' {
                in_string = false;
            }
            continue;
        }

        if ch == '"' {
            in_string = true;
        } else if ch == opening {
            depth += 1;
        } else if ch == closing {
            depth = depth.saturating_sub(1);
            if depth == 0 {
                return Some(start + relative + ch.len_utf8());
            }
        }
    }
    None
}
1830
/// Extract a Bearer token from an Authorization header value.
///
/// Surrounding whitespace is ignored. The scheme name is matched without
/// regard to ASCII case (`Bearer`, `bearer`, `BEARER`, ...), since HTTP
/// authentication schemes are case-insensitive. It must be separated from
/// the token by a space.
///
/// Returns `None` if the header does not use the Bearer scheme or the token
/// is empty.
#[must_use]
pub fn extract_bearer_token(auth_header: &str) -> Option<&str> {
    let (scheme, credentials) = auth_header.trim().split_once(' ')?;
    if !scheme.eq_ignore_ascii_case("bearer") {
        return None;
    }
    let token = credentials.trim();
    (!token.is_empty()).then_some(token)
}
1843
/// One image pulled out of a base64 `data:` URI found in HTML.
#[derive(Clone, Debug)]
pub struct ExtractedImage {
    /// File name the image is stored under, without any directory part
    /// (e.g., "image-01.png").
    pub filename: String,
    /// Decoded image bytes.
    pub data: Vec<u8>,
    /// MIME type of the image (e.g., "image/png").
    pub mime_type: String,
}
1854
1855/// Result of fetching a Google Doc as an archive.
1856#[derive(Debug, Clone)]
1857pub struct GDocsArchiveResult {
1858    /// HTML content with local image paths
1859    pub html: String,
1860    /// Markdown content with local image paths
1861    pub markdown: String,
1862    /// Extracted images
1863    pub images: Vec<ExtractedImage>,
1864    /// Document ID
1865    pub document_id: String,
1866    /// Export URL used
1867    pub export_url: String,
1868}
1869
1870fn base64_image_pattern() -> &'static Regex {
1871    static PATTERN: OnceLock<Regex> = OnceLock::new();
1872    PATTERN.get_or_init(|| {
1873        Regex::new(
1874            r#"(<img\s[^>]*src=")data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([^"]+)(")"#,
1875        )
1876        .unwrap()
1877    })
1878}
1879
1880/// Extract base64 data URI images from HTML content.
1881///
1882/// Google Docs HTML exports embed images as base64 data URIs.
1883/// This function extracts them and replaces with local file paths.
1884///
1885/// # Arguments
1886///
1887/// * `html` - HTML content with embedded base64 images
1888///
1889/// # Returns
1890///
1891/// Tuple of (updated HTML with local paths, extracted images)
1892#[must_use]
1893pub fn extract_base64_images(html: &str) -> (String, Vec<ExtractedImage>) {
1894    let mut images = Vec::new();
1895    let mut idx = 1u32;
1896
1897    let updated_html = base64_image_pattern()
1898        .replace_all(html, |caps: &regex::Captures<'_>| {
1899            let prefix = &caps[1];
1900            let mime_ext = &caps[2];
1901            let base64_data = &caps[3];
1902            let suffix = &caps[4];
1903
1904            let ext = match mime_ext {
1905                "jpeg" => "jpg",
1906                "svg+xml" => "svg",
1907                other => other,
1908            };
1909
1910            let filename = format!("image-{idx:02}.{ext}");
1911            let mime_type = format!("image/{mime_ext}");
1912
1913            if let Ok(data) = base64::engine::general_purpose::STANDARD.decode(base64_data) {
1914                debug!("Extracted image: {} ({} bytes)", filename, data.len());
1915                images.push(ExtractedImage {
1916                    filename: filename.clone(),
1917                    data,
1918                    mime_type,
1919                });
1920            }
1921
1922            idx += 1;
1923            format!("{prefix}images/{filename}{suffix}")
1924        })
1925        .into_owned();
1926
1927    (updated_html, images)
1928}
1929
1930/// Fetch a Google Docs document as a ZIP archive.
1931///
1932/// Fetches the document as HTML, extracts embedded base64 images,
1933/// converts to Markdown, and returns all components ready for archiving.
1934///
1935/// The archive contains:
1936/// - `document.md` — Markdown version
1937/// - `document.html` — HTML version with local image paths
1938/// - `images/` — extracted images
1939///
1940/// # Arguments
1941///
1942/// * `url` - Google Docs URL
1943/// * `api_token` - Optional API token for private documents
1944///
1945/// # Errors
1946///
1947/// Returns an error if the fetch or conversion fails.
1948pub async fn fetch_google_doc_as_archive(
1949    url: &str,
1950    api_token: Option<&str>,
1951) -> crate::Result<GDocsArchiveResult> {
1952    let result = fetch_google_doc(url, "html", api_token).await?;
1953
1954    let preprocess = preprocess_google_docs_export_html(&result.content);
1955    debug!(
1956        document_id = %result.document_id,
1957        hoisted = preprocess.hoisted,
1958        unwrapped_links = preprocess.unwrapped_links,
1959        "google-docs-export pre-processor rewrote archive markup"
1960    );
1961
1962    let (local_html, images) = extract_base64_images(&preprocess.html);
1963
1964    let markdown = crate::markdown::convert_html_to_markdown(&local_html, None)?;
1965
1966    debug!(
1967        "Archive prepared: {} images extracted, {} bytes HTML, {} bytes Markdown",
1968        images.len(),
1969        local_html.len(),
1970        markdown.len()
1971    );
1972
1973    Ok(GDocsArchiveResult {
1974        html: local_html,
1975        markdown,
1976        images,
1977        document_id: result.document_id,
1978        export_url: result.export_url,
1979    })
1980}
1981
1982/// Create a ZIP archive from a `GDocsArchiveResult`.
1983///
1984/// # Arguments
1985///
1986/// * `archive` - The archive result to bundle
1987/// * `pretty_html` - Whether to pretty-print the HTML output
1988///
1989/// # Errors
1990///
1991/// Returns an error if ZIP creation fails.
1992pub fn create_archive_zip(
1993    archive: &GDocsArchiveResult,
1994    pretty_html: bool,
1995) -> crate::Result<Vec<u8>> {
1996    let mut buf = std::io::Cursor::new(Vec::new());
1997
1998    {
1999        let mut zip = zip::ZipWriter::new(&mut buf);
2000        let options = zip::write::SimpleFileOptions::default()
2001            .compression_method(zip::CompressionMethod::Deflated);
2002
2003        zip.start_file("document.md", options)
2004            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
2005        zip.write_all(archive.markdown.as_bytes())?;
2006
2007        let html_output = if pretty_html {
2008            crate::html::pretty_print_html(&archive.html)
2009        } else {
2010            archive.html.clone()
2011        };
2012        zip.start_file("document.html", options)
2013            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
2014        zip.write_all(html_output.as_bytes())?;
2015
2016        for img in &archive.images {
2017            zip.start_file(format!("images/{}", img.filename), options)
2018                .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
2019            zip.write_all(&img.data)?;
2020        }
2021
2022        zip.finish()
2023            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
2024    }
2025
2026    Ok(buf.into_inner())
2027}
web_capture/gdocs.rs

web_capture/
gdocs.rs