Skip to main content

web_capture/
gdocs.rs

1//! Google Docs capture module.
2//!
3//! Supports API-based capture of Google Docs documents via the export URL pattern:
4//! `https://docs.google.com/document/d/{DOCUMENT_ID}/export?format={FORMAT}`
5//!
6//! # Supported Export Formats
7//!
8//! - `html` — HTML document (images as base64 data URIs)
9//! - `txt` — Plain text
10//! - `md` — Markdown (native Google Docs export)
11//! - `pdf` — PDF document
12//! - `docx` — Microsoft Word document
13//! - `epub` — EPUB ebook format
14//!
15//! # Example
16//!
17//! ```rust,no_run
18//! use web_capture::gdocs;
19//!
20//! #[tokio::main]
21//! async fn main() -> anyhow::Result<()> {
22//!     let url = "https://docs.google.com/document/d/abc123/edit";
23//!     if gdocs::is_google_docs_url(url) {
24//!         let result = gdocs::fetch_google_doc(url, "html", None).await?;
25//!         println!("Content length: {}", result.content.len());
26//!     }
27//!     Ok(())
28//! }
29//! ```
30
31use async_tungstenite::tokio::{connect_async, ConnectStream};
32use async_tungstenite::tungstenite::Message;
33use async_tungstenite::WebSocketStream;
34use base64::Engine;
35use futures::{SinkExt, StreamExt};
36use regex::Regex;
37use scraper::{node::Node, ElementRef, Html, Selector};
38use serde_json::Value;
39use std::collections::HashMap;
40use std::fmt::Write as _;
41use std::hash::BuildHasher;
42use std::io::Write;
43use std::process::Stdio;
44use std::sync::OnceLock;
45use std::time::{Duration, Instant};
46use tokio::io::{AsyncBufReadExt, BufReader};
47use tokio::process::{Child, Command};
48use tracing::{debug, info, warn};
49
50use crate::WebCaptureError;
51
/// Base URL for per-document endpoints on `docs.google.com`; the export and
/// editor URL builders append `/{document_id}/export?...` or `/{document_id}/edit`.
const GDOCS_EXPORT_BASE: &str = "https://docs.google.com/document/d";
/// Base URL of the Google Docs REST API `documents` resource; the document ID
/// is appended as the final path segment.
const GDOCS_API_BASE: &str = "https://docs.googleapis.com/v1/documents";
/// Desktop Chrome user-agent string.
// NOTE(review): the same literal is also spelled out inline in `fetch_google_doc`.
const GDOCS_USER_AGENT: &str =
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
/// Default maximum wait for editor model data (30 s).
// NOTE(review): the consumers of the four timing constants below are outside
// this view; the descriptions follow the constant names — confirm at use sites.
const GDOCS_EDITOR_MODEL_MAX_WAIT_DEFAULT: Duration = Duration::from_secs(30);
/// Default stability window for editor model data (1.5 s).
const GDOCS_EDITOR_MODEL_STABILITY_DEFAULT: Duration = Duration::from_millis(1500);
/// Interval between editor model polls (250 ms).
const GDOCS_EDITOR_MODEL_POLL_INTERVAL: Duration = Duration::from_millis(250);
/// Timeout for launching the browser (20 s).
const GDOCS_BROWSER_LAUNCH_TIMEOUT: Duration = Duration::from_secs(20);

/// WebSocket stream type produced by `async_tungstenite`'s tokio connector.
// NOTE(review): name suggests this carries Chrome DevTools Protocol traffic — confirm.
type CdpWebSocket = WebSocketStream<ConnectStream>;
62
/// JavaScript that records every value assigned to `window.DOCS_modelChunk`.
///
/// The script:
/// - initializes `window.__captured_chunks` as an empty array;
/// - defines `captureChunk`, which ignores falsy values, recurses into arrays,
///   and stores a JSON round-tripped copy of each value (or the raw value when
///   the round trip throws);
/// - defines `wrapChunkArray`, which replaces an array's `push` so later pushed
///   items are captured too, and captures the items already present;
/// - installs a non-configurable accessor property `DOCS_modelChunk` on
///   `window` whose setter captures the assigned value and whose getter returns
///   the most recently assigned (wrapped) value.
// NOTE(review): presumably injected before page scripts run so no assignment is
// missed; the injection site is outside this view — confirm.
const GDOCS_MODEL_CAPTURE_INIT_SCRIPT: &str = r"
window.__captured_chunks = [];
const captureChunk = (value) => {
  if (!value) {
    return;
  }
  if (Array.isArray(value)) {
    for (const item of value) {
      captureChunk(item);
    }
    return;
  }
  try {
    window.__captured_chunks.push(JSON.parse(JSON.stringify(value)));
  } catch {
    window.__captured_chunks.push(value);
  }
};
const wrapChunkArray = (value) => {
  if (!Array.isArray(value) || value.__webCaptureDocsModelWrapped) {
    return value;
  }
  const originalPush = value.push;
  Object.defineProperty(value, '__webCaptureDocsModelWrapped', {
    value: true,
    enumerable: false,
  });
  Object.defineProperty(value, 'push', {
    value(...items) {
      for (const item of items) {
        captureChunk(item);
      }
      return originalPush.apply(this, items);
    },
    writable: true,
    configurable: true,
  });
  for (const item of value) {
    captureChunk(item);
  }
  return value;
};
Object.defineProperty(window, 'DOCS_modelChunk', {
  set(value) {
    captureChunk(value);
    window.__DOCS_modelChunk_latest = wrapChunkArray(value);
  },
  get() {
    return window.__DOCS_modelChunk_latest;
  },
  configurable: false,
});
";
116
/// JavaScript function expression that returns `{ chunks, cidUrlMap }` from the page.
///
/// - `chunks` is a copy of `window.__captured_chunks`; when nothing was captured
///   but `window.DOCS_modelChunk` is set, that value is used as the only entry.
/// - `cidUrlMap` is built by scanning every `<script>` whose text mentions
///   `docs-images-rt` for `"<id>": "https://docs.google.com/docs-images-rt/..."`
///   pairs (IDs of 20+ URL-safe characters), with `\u003d`, `\u0026` and `\/`
///   escape sequences in the URL replaced by `=`, `&` and `/`.
// NOTE(review): the `!chunks.includes(...)` test is always true once
// `chunks.length === 0` holds; the script text is runtime data and is left as is.
const GDOCS_MODEL_EXTRACT_SCRIPT: &str = r#"() => {
  const chunks = [...(window.__captured_chunks || [])];
  if (
    window.DOCS_modelChunk &&
    chunks.length === 0 &&
    !chunks.includes(window.DOCS_modelChunk)
  ) {
    chunks.push(window.DOCS_modelChunk);
  }
  const cidUrlMap = {};
  const scripts = document.querySelectorAll('script');
  for (const script of scripts) {
    const text = script.textContent || '';
    if (!text.includes('docs-images-rt')) {
      continue;
    }
    const regex =
      /"([A-Za-z0-9_-]{20,})"\s*:\s*"(https:\/\/docs\.google\.com\/docs-images-rt\/[^"]+)"/g;
    let match;
    while ((match = regex.exec(text)) !== null) {
      cidUrlMap[match[1]] = match[2]
        .replace(/\\u003d/g, '=')
        .replace(/\\u0026/g, '&')
        .replace(/\\\//g, '/');
    }
  }
  return { chunks, cidUrlMap };
}"#;
145
146fn gdocs_url_pattern() -> &'static Regex {
147    static PATTERN: OnceLock<Regex> = OnceLock::new();
148    PATTERN.get_or_init(|| Regex::new(r"docs\.google\.com/document/d/([a-zA-Z0-9_-]+)").unwrap())
149}
150
/// Result of fetching a Google Docs document.
///
/// Returned by [`fetch_google_doc`] and [`fetch_google_doc_as_markdown`].
#[derive(Debug, Clone)]
pub struct GDocsResult {
    /// The document content in the requested format.
    pub content: String,
    /// The export format used: the `format` argument as passed to
    /// [`fetch_google_doc`], or `"markdown"` from [`fetch_google_doc_as_markdown`].
    pub format: String,
    /// The extracted document ID.
    pub document_id: String,
    /// The export URL that was fetched.
    pub export_url: String,
}
163
/// Google Docs capture backend selected from the CLI `--capture` flag.
///
/// See [`select_capture_method`] for how a flag value maps to a variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GDocsCaptureMethod {
    /// Load `/edit` and extract `DOCS_modelChunk`. Selected by `browser`.
    BrowserModel,
    /// Use the public `/export?format=...` endpoint. Selected by `api` when no
    /// API token is supplied.
    PublicExport,
    /// Use the authenticated `docs.googleapis.com` REST API. Selected by `api`
    /// when an API token is supplied.
    DocsApi,
}
174
/// Rendered Google Docs content from either Docs API or editor model data.
///
/// Carries the same document rendered three ways (Markdown, HTML, plain text)
/// together with the capture source and any remote images.
#[derive(Debug, Clone)]
pub struct GDocsRenderedResult {
    /// Markdown output.
    pub markdown: String,
    /// HTML output.
    pub html: String,
    /// Plain text output.
    pub text: String,
    /// The extracted document ID.
    pub document_id: String,
    /// Source URL used for capture.
    pub export_url: String,
    /// Remote images exposed by the editor model, used for archive localization.
    pub remote_images: Vec<RemoteImage>,
}
191
/// Remote image reference extracted from browser-model capture.
///
/// Stored in [`GDocsRenderedResult::remote_images`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteImage {
    /// Original image URL.
    pub url: String,
    /// Image alt text.
    pub alt: String,
}
200
/// Snapshot of editor model data read from the browser.
// NOTE(review): the code that fills this struct is outside this view; the field
// descriptions below follow the field names and should be confirmed there.
#[derive(Debug, Clone)]
struct BrowserModelData {
    /// Captured model chunks as JSON values.
    chunks: Vec<Value>,
    /// Image content ID → URL map (presumably the `cidUrlMap` returned by
    /// `GDOCS_MODEL_EXTRACT_SCRIPT` — confirm).
    cid_urls: HashMap<String, String>,
    /// Byte size of the chunk payload; half of the [`BrowserModelFingerprint`].
    chunk_payload_bytes: usize,
    /// Number of polls performed (presumably up to this snapshot — confirm).
    poll_count: usize,
    /// How long the model had been stable (presumably at capture time — confirm).
    stable_for: Duration,
}
209
/// Cheap summary of a [`BrowserModelData`] snapshot: two snapshots with equal
/// fingerprints are treated as unchanged by [`BrowserModelQuiescence::observe`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct BrowserModelFingerprint {
    /// Number of captured chunks.
    chunks: usize,
    /// Chunk payload size in bytes.
    payload_bytes: usize,
}
215
/// Tracks whether successive model fingerprints have stopped changing.
#[derive(Debug, Default)]
struct BrowserModelQuiescence {
    /// Fingerprint seen by the most recent `observe` call that reset the state.
    last_fingerprint: Option<BrowserModelFingerprint>,
    /// Instant of the first repeated observation of `last_fingerprint`;
    /// `None` while no repeat has been seen.
    stable_since: Option<Instant>,
}
221
222impl BrowserModelData {
223    const fn fingerprint(&self) -> BrowserModelFingerprint {
224        BrowserModelFingerprint {
225            chunks: self.chunks.len(),
226            payload_bytes: self.chunk_payload_bytes,
227        }
228    }
229}
230
231impl BrowserModelQuiescence {
232    fn observe(
233        &mut self,
234        fingerprint: BrowserModelFingerprint,
235        now: Instant,
236        stability_window: Duration,
237    ) -> Option<Duration> {
238        if fingerprint.chunks == 0 {
239            self.last_fingerprint = Some(fingerprint);
240            self.stable_since = None;
241            return None;
242        }
243
244        if self.last_fingerprint == Some(fingerprint) {
245            let stable_since = *self.stable_since.get_or_insert(now);
246            let stable_for = now.saturating_duration_since(stable_since);
247            if stable_for >= stability_window {
248                return Some(stable_for);
249            }
250        } else {
251            self.last_fingerprint = Some(fingerprint);
252            self.stable_since = None;
253        }
254
255        None
256    }
257
258    fn stable_for(&self, now: Instant) -> Duration {
259        self.stable_since.map_or(Duration::ZERO, |stable_since| {
260            now.saturating_duration_since(stable_since)
261        })
262    }
263}
264
/// Parsed Google Docs model/document capture.
///
/// `blocks` is the ordered content; `tables`, `images` and `text` are
/// projections kept alongside it.
#[derive(Debug, Clone, Default)]
pub struct CapturedDocument {
    /// Ordered document blocks.
    pub blocks: Vec<CapturedBlock>,
    /// Tables extracted from `blocks` for compatibility with tests and callers.
    pub tables: Vec<TableBlock>,
    /// Images extracted from model positions.
    pub images: Vec<ContentNode>,
    /// Plain text projection.
    pub text: String,
}
277
/// Captured block: one top-level element of a [`CapturedDocument`], either a
/// paragraph-like run of inline content or a table.
#[derive(Debug, Clone)]
pub enum CapturedBlock {
    /// Paragraph-like block.
    Paragraph {
        /// Paragraph content.
        content: Vec<ContentNode>,
        /// Optional Google Docs named style.
        style: Option<String>,
        /// Optional list metadata.
        list: Option<ListMeta>,
        /// Whether paragraph is a blockquote.
        quote: bool,
        /// Whether paragraph is a horizontal rule.
        horizontal_rule: bool,
    },
    /// Table block.
    Table(TableBlock),
}
297
/// Captured table: an ordered list of [`TableRow`]s.
#[derive(Debug, Clone, Default)]
pub struct TableBlock {
    /// Table rows.
    pub rows: Vec<TableRow>,
}
304
/// Captured table row: an ordered list of [`TableCell`]s.
#[derive(Debug, Clone, Default)]
pub struct TableRow {
    /// Row cells.
    pub cells: Vec<TableCell>,
}
311
/// Captured table cell holding inline [`ContentNode`]s.
#[derive(Debug, Clone, Default)]
pub struct TableCell {
    /// Cell content.
    pub content: Vec<ContentNode>,
}
318
/// Captured inline content node: either a styled text run or an image.
///
/// Used as paragraph content, table cell content, and in
/// [`CapturedDocument::images`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentNode {
    /// Text run.
    Text {
        /// Text content.
        text: String,
        /// Bold text style.
        bold: bool,
        /// Italic text style.
        italic: bool,
        /// Strikethrough text style.
        strike: bool,
        /// Optional hyperlink target.
        link: Option<String>,
    },
    /// Image placeholder.
    Image {
        /// Content ID from Google Docs model data.
        cid: Option<String>,
        /// Resolved image URL.
        url: Option<String>,
        /// Alt text.
        alt: String,
        /// Editor-model image width, when available.
        width: Option<String>,
        /// Editor-model image height, when available.
        height: Option<String>,
        /// Whether this image came from a suggested edit.
        is_suggestion: bool,
    },
}
351
/// Inline text styling; carries the same style fields as [`ContentNode::Text`]
/// without the text itself.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct TextStyle {
    /// Bold text style.
    bold: bool,
    /// Italic text style.
    italic: bool,
    /// Strikethrough text style.
    strike: bool,
    /// Optional hyperlink target.
    link: Option<String>,
}
359
/// Paragraph-level metadata; carries the same non-content fields as
/// [`CapturedBlock::Paragraph`].
#[derive(Debug, Clone, Default)]
struct ParagraphMeta {
    /// Optional Google Docs named style.
    style: Option<String>,
    /// Optional list metadata.
    list: Option<ListMeta>,
    /// Whether the paragraph is a blockquote.
    quote: bool,
    /// Whether the paragraph is a horizontal rule.
    horizontal_rule: bool,
}
367
/// List membership metadata attached to a paragraph
/// (see [`CapturedBlock::Paragraph`]).
#[derive(Debug, Clone)]
pub struct ListMeta {
    /// Google Docs list identifier.
    pub id: String,
    /// Nesting level, zero-based.
    pub level: usize,
    /// Whether Markdown should render this list item with an ordered marker.
    pub ordered: bool,
}
377
/// Paragraph style values read from the editor model.
// NOTE(review): producers/consumers are outside this view; the units of the
// indent fields are not established here (presumably points — confirm).
#[derive(Debug, Clone)]
struct ParagraphStyle {
    /// Optional named style.
    style: Option<String>,
    /// Start indent of the paragraph.
    indent_start: f64,
    /// First-line indent of the paragraph.
    indent_first_line: f64,
}
384
/// Semantic hint associated with a piece of text.
// NOTE(review): the name suggests hints are derived from the public export and
// matched by `text`; the code using this type is outside this view — confirm.
#[derive(Debug, Clone)]
struct ExportSemanticHint {
    /// Text the hint applies to.
    text: String,
    /// `Some(true)`/`Some(false)` for ordered/unordered list membership;
    /// `None` presumably means "not a list item" — confirm.
    list_ordered: Option<bool>,
    /// Whether the text is a blockquote.
    quote: bool,
}
391
/// Style lookup tables built from editor model data.
// NOTE(review): the builder is outside this view. The `usize` keys/indices
// look like positions in the model text (the `*_by_end` maps presumably keyed
// by a paragraph's end position) — confirm before relying on it.
#[derive(Debug, Clone, Default)]
struct ModelStyleMaps {
    /// Inline text styles (presumably indexed by character position).
    inline_styles: Vec<TextStyle>,
    /// Paragraph styles keyed by `usize` (see note above).
    paragraph_by_end: HashMap<usize, ParagraphStyle>,
    /// List metadata keyed by `usize` (see note above).
    list_by_end: HashMap<usize, ListMeta>,
    /// Positions that hold a horizontal rule.
    horizontal_rules: std::collections::HashSet<usize>,
}
399
400/// Check if a URL is a Google Docs document URL.
401#[must_use]
402pub fn is_google_docs_url(url: &str) -> bool {
403    gdocs_url_pattern().is_match(url)
404}
405
406/// Extract the document ID from a Google Docs URL.
407///
408/// Returns `None` if the URL is not a valid Google Docs URL.
409#[must_use]
410pub fn extract_document_id(url: &str) -> Option<String> {
411    gdocs_url_pattern()
412        .captures(url)
413        .and_then(|caps| caps.get(1))
414        .map(|m| m.as_str().to_string())
415}
416
417/// Build a Google Docs export URL.
418///
419/// # Arguments
420///
421/// * `document_id` - The Google Docs document ID
422/// * `format` - Export format (html, txt, md, pdf, docx, epub)
423#[must_use]
424pub fn build_export_url(document_id: &str, format: &str) -> String {
425    let export_format = match format {
426        "html" | "txt" | "md" | "pdf" | "docx" | "epub" | "zip" => format,
427        _ => "html",
428    };
429    format!("{GDOCS_EXPORT_BASE}/{document_id}/export?format={export_format}")
430}
431
432/// Build a Google Docs editor URL.
433#[must_use]
434pub fn build_edit_url(document_id: &str) -> String {
435    format!("{GDOCS_EXPORT_BASE}/{document_id}/edit")
436}
437
438/// Build a Google Docs REST API URL.
439#[must_use]
440pub fn build_docs_api_url(document_id: &str) -> String {
441    format!("{GDOCS_API_BASE}/{document_id}")
442}
443
444/// Select a Google Docs capture backend from the CLI `--capture` value.
445///
446/// # Errors
447///
448/// Returns an error when `capture` is neither `browser` nor `api`.
449pub fn select_capture_method(
450    capture: &str,
451    api_token: Option<&str>,
452) -> crate::Result<GDocsCaptureMethod> {
453    match capture.to_lowercase().as_str() {
454        "browser" => Ok(GDocsCaptureMethod::BrowserModel),
455        "api" if api_token.is_some() => Ok(GDocsCaptureMethod::DocsApi),
456        "api" => Ok(GDocsCaptureMethod::PublicExport),
457        other => Err(WebCaptureError::InvalidUrl(format!(
458            "Unsupported Google Docs capture method \"{other}\". Use \"browser\" or \"api\"."
459        ))),
460    }
461}
462
463/// Fetch a Google Docs document via the export URL.
464///
465/// For public documents, pass `None` for `api_token`.
466/// For private documents, pass a Bearer token string.
467///
468/// # Arguments
469///
470/// * `url` - Google Docs URL (edit URL or any URL containing the document ID)
471/// * `format` - Export format (html, txt, md, pdf, docx, epub)
472/// * `api_token` - Optional API token for private documents
473///
474/// # Errors
475///
476/// Returns an error if the URL is not a valid Google Docs URL, or if the fetch fails.
477pub async fn fetch_google_doc(
478    url: &str,
479    format: &str,
480    api_token: Option<&str>,
481) -> crate::Result<GDocsResult> {
482    let document_id = extract_document_id(url).ok_or_else(|| {
483        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
484    })?;
485
486    let export_url = build_export_url(&document_id, format);
487    debug!(
488        document_id = %document_id,
489        format = %format,
490        export_url = %export_url,
491        has_api_token = api_token.is_some(),
492        "fetching Google Doc via public export"
493    );
494
495    let mut request = reqwest::Client::new()
496        .get(&export_url)
497        .header(
498            "User-Agent",
499            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
500        )
501        .header("Accept-Charset", "utf-8")
502        .header("Accept-Language", "en-US,en;q=0.9");
503
504    if let Some(token) = api_token {
505        request = request.header("Authorization", format!("Bearer {token}"));
506    }
507
508    let response = request
509        .send()
510        .await
511        .map_err(|e| WebCaptureError::FetchError(format!("Failed to fetch Google Doc: {e}")))?;
512    debug!(
513        document_id = %document_id,
514        status = response.status().as_u16(),
515        success = response.status().is_success(),
516        content_type = response
517            .headers()
518            .get(reqwest::header::CONTENT_TYPE)
519            .and_then(|value| value.to_str().ok())
520            .unwrap_or(""),
521        "received Google Docs public export response"
522    );
523
524    if !response.status().is_success() {
525        return Err(WebCaptureError::FetchError(format!(
526            "Failed to fetch Google Doc ({} {}): {}",
527            response.status().as_u16(),
528            response.status().canonical_reason().unwrap_or("Unknown"),
529            export_url
530        )));
531    }
532
533    let raw_content = response.text().await.map_err(|e| {
534        WebCaptureError::FetchError(format!("Failed to read Google Doc response: {e}"))
535    })?;
536    debug!(
537        document_id = %document_id,
538        bytes = raw_content.len(),
539        "read Google Docs public export body"
540    );
541
542    // Keep HTML markup escaped so literal examples such as `&lt;ol&gt;` do not
543    // become real tags before the HTML parser sees the document.
544    let content = match format {
545        "txt" | "md" => crate::html::decode_html_entities(&raw_content),
546        _ => raw_content,
547    };
548
549    Ok(GDocsResult {
550        content,
551        format: format.to_string(),
552        document_id,
553        export_url,
554    })
555}
556
557/// Fetch a Google Docs document and convert to Markdown.
558///
559/// Fetches the document as HTML, then converts to Markdown using the
560/// existing HTML-to-Markdown pipeline.
561///
562/// # Arguments
563///
564/// * `url` - Google Docs URL
565/// * `api_token` - Optional API token for private documents
566///
567/// # Errors
568///
569/// Returns an error if the fetch or conversion fails.
570pub async fn fetch_google_doc_as_markdown(
571    url: &str,
572    api_token: Option<&str>,
573) -> crate::Result<GDocsResult> {
574    let result = fetch_google_doc(url, "html", api_token).await?;
575
576    let preprocess = preprocess_google_docs_export_html(&result.content);
577    debug!(
578        document_id = %result.document_id,
579        hoisted = preprocess.hoisted,
580        unwrapped_links = preprocess.unwrapped_links,
581        "google-docs-export pre-processor rewrote markup"
582    );
583    let markdown = normalize_google_docs_export_markdown(
584        &crate::markdown::convert_html_to_markdown(&preprocess.html, Some(&result.export_url))?,
585    );
586    debug!(
587        document_id = %result.document_id,
588        bytes = markdown.len(),
589        "rendered Google Docs public export markdown"
590    );
591
592    Ok(GDocsResult {
593        content: markdown,
594        format: "markdown".to_string(),
595        document_id: result.document_id,
596        export_url: result.export_url,
597    })
598}
599
/// Result of running the Google Docs export HTML pre-processor
/// ([`preprocess_google_docs_export_html`]).
///
/// Exposes the rewritten HTML alongside counters that are useful for debug
/// logging (`gdocs.export.style-hoist`). See issue #92 R6.
#[derive(Debug, Clone)]
pub struct GDocsExportPreprocessResult {
    /// Rewritten HTML.
    pub html: String,
    /// Number of inline-style spans turned into `<strong>`/`<em>`/`<del>`.
    /// Counts both `style="..."` spans and class-styled spans.
    pub hoisted: usize,
    /// Number of `google.com/url?q=` redirect wrappers unwrapped.
    pub unwrapped_links: usize,
}
613
614/// Pre-process Google Docs export HTML so the generic `html2md` pipeline
615/// preserves inline formatting, heading numbering, and link targets.
616///
617/// Google Drive serves bold/italic/strikethrough as inline style spans and
618/// wraps every link through a `google.com/url?q=` redirect, both of which
619/// the generic converter would otherwise discard. This function rewrites
620/// those constructs into semantic HTML before conversion.
621#[must_use]
622pub fn preprocess_google_docs_export_html(html: &str) -> GDocsExportPreprocessResult {
623    let mut hoisted: usize = 0;
624    let mut unwrapped_links: usize = 0;
625    let class_styles = extract_css_class_styles(html);
626
627    let mut out = hoist_inline_style_spans(html, &mut hoisted);
628    out = hoist_class_style_spans(&out, &class_styles, &mut hoisted);
629    out = convert_class_indented_blockquotes(&out, &class_styles);
630    out = nest_google_docs_lists(&out, &class_styles);
631    out = strip_google_docs_heading_noise(&out);
632    out = strip_heading_inline_formatting(&out);
633    out = unwrap_google_redirect_links(&out, &mut unwrapped_links);
634    out = out.replace("&nbsp;", " ");
635    out = out.replace('\u{00A0}', " ");
636
637    GDocsExportPreprocessResult {
638        html: out,
639        hoisted,
640        unwrapped_links,
641    }
642}
643
644/// Normalize Markdown emitted from Google Docs public-export HTML converters.
645#[must_use]
646pub fn normalize_google_docs_export_markdown(markdown: &str) -> String {
647    let markdown = unescape_public_export_punctuation(markdown);
648    let markdown = convert_setext_headings(&markdown);
649    let markdown = normalize_atx_headings(&markdown);
650    let markdown = normalize_bullet_markers(&markdown);
651    let markdown = normalize_list_spacing(&markdown);
652    let markdown = normalize_blockquote_spacing(&markdown);
653    let markdown = normalize_markdown_tables(&markdown);
654    crate::markdown::clean_markdown(&markdown)
655}
656
657fn hoist_inline_style_spans(html: &str, hoisted: &mut usize) -> String {
658    let span_re = Regex::new(r#"(?is)<span\s+([^>]*style="([^"]*)"[^>]*)>(.*?)</span>"#)
659        .expect("valid regex");
660    span_re
661        .replace_all(html, |caps: &regex::Captures<'_>| {
662            let style = caps.get(2).map_or("", |m| m.as_str());
663            let inner = caps.get(3).map_or("", |m| m.as_str());
664            semantic_wrapped_html(inner, style).map_or_else(
665                || caps[0].to_string(),
666                |wrapped| {
667                    *hoisted += 1;
668                    wrapped
669                },
670            )
671        })
672        .into_owned()
673}
674
675fn hoist_class_style_spans(
676    html: &str,
677    class_styles: &HashMap<String, String>,
678    hoisted: &mut usize,
679) -> String {
680    let class_span_re = Regex::new(r#"(?is)<span\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</span>"#)
681        .expect("valid regex");
682    class_span_re
683        .replace_all(html, |caps: &regex::Captures<'_>| {
684            let class_attr = caps.get(2).map_or("", |m| m.as_str());
685            let inner = caps.get(3).map_or("", |m| m.as_str());
686            let style = combined_class_style(class_styles, class_attr);
687            semantic_wrapped_html(inner, &style).map_or_else(
688                || caps[0].to_string(),
689                |wrapped| {
690                    *hoisted += 1;
691                    wrapped
692                },
693            )
694        })
695        .into_owned()
696}
697
698fn convert_class_indented_blockquotes(
699    html: &str,
700    class_styles: &HashMap<String, String>,
701) -> String {
702    let class_paragraph_re =
703        Regex::new(r#"(?is)<p\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</p>"#).expect("valid regex");
704    class_paragraph_re
705        .replace_all(html, |caps: &regex::Captures<'_>| {
706            let class_attr = caps.get(2).map_or("", |m| m.as_str());
707            let inner = caps.get(3).map_or("", |m| m.as_str());
708            let style = combined_class_style(class_styles, class_attr);
709            if is_blockquote_style(&style) {
710                format!("<blockquote><p>{inner}</p></blockquote>")
711            } else {
712                caps[0].to_string()
713            }
714        })
715        .into_owned()
716}
717
/// One `<ul>`/`<ol>` element located in export HTML by `nest_google_docs_lists`.
#[derive(Debug, Clone)]
struct ExportListBlock {
    /// Byte offset of the element's start in the scanned HTML.
    start: usize,
    /// Byte offset just past the element's closing tag.
    end: usize,
    /// Lower-cased tag name: `ul` or `ol`.
    tag: String,
    /// Markup between the opening and closing tags.
    inner: String,
    /// Value of the `start="..."` attribute; only captured for `ol`.
    start_attr: Option<String>,
}
726
/// One `<li>` element extracted from an [`ExportListBlock`] by
/// `render_nested_list_group`.
#[derive(Debug, Clone)]
struct ExportListItem {
    /// Tag (`ul`/`ol`) of the list block the item came from.
    tag: String,
    /// Zero-based nesting level computed by `google_docs_list_item_level`.
    level: usize,
    /// Markup between `<li ...>` and `</li>`.
    inner: String,
}
733
734fn nest_google_docs_lists(html: &str, class_styles: &HashMap<String, String>) -> String {
735    let list_re = Regex::new(r"(?is)<(ul|ol)\b([^>]*)>(.*?)</(ul|ol)>").expect("valid regex");
736    let start_attr_re = Regex::new(r#"(?i)\bstart\s*=\s*"([^"]*)""#).expect("valid regex");
737    let blocks: Vec<ExportListBlock> = list_re
738        .captures_iter(html)
739        .filter_map(|caps| {
740            let open_tag = caps.get(1)?.as_str().to_ascii_lowercase();
741            let close_tag = caps.get(4)?.as_str().to_ascii_lowercase();
742            if open_tag != close_tag {
743                return None;
744            }
745            let whole = caps.get(0)?;
746            let attrs = caps.get(2).map_or("", |m| m.as_str());
747            let start_attr = if open_tag == "ol" {
748                start_attr_re
749                    .captures(attrs)
750                    .and_then(|c| c.get(1).map(|m| m.as_str().to_string()))
751            } else {
752                None
753            };
754            Some(ExportListBlock {
755                start: whole.start(),
756                end: whole.end(),
757                tag: open_tag,
758                inner: caps.get(3).map_or("", |m| m.as_str()).to_string(),
759                start_attr,
760            })
761        })
762        .collect();
763
764    if blocks.len() < 2 {
765        return html.to_string();
766    }
767
768    let mut groups: Vec<Vec<ExportListBlock>> = Vec::new();
769    let mut current: Vec<ExportListBlock> = Vec::new();
770    for block in blocks {
771        if let Some(previous) = current.last() {
772            if !html[previous.end..block.start].trim().is_empty() {
773                if current.len() > 1 {
774                    groups.push(std::mem::take(&mut current));
775                } else {
776                    current.clear();
777                }
778            }
779        }
780        current.push(block);
781    }
782    if current.len() > 1 {
783        groups.push(current);
784    }
785
786    if groups.is_empty() {
787        return html.to_string();
788    }
789
790    let mut out = html.to_string();
791    for group in groups.iter().rev() {
792        let rendered = render_nested_list_group(group, class_styles);
793        let start = group.first().expect("non-empty group").start;
794        let end = group.last().expect("non-empty group").end;
795        out.replace_range(start..end, &rendered);
796    }
797    out
798}
799
800#[allow(clippy::too_many_lines)]
801fn render_nested_list_group(
802    group: &[ExportListBlock],
803    class_styles: &HashMap<String, String>,
804) -> String {
805    let item_re = Regex::new(r"(?is)<li\b([^>]*)>(.*?)</li>").expect("valid regex");
806    let items: Vec<ExportListItem> = group
807        .iter()
808        .flat_map(|block| {
809            item_re.captures_iter(&block.inner).map(|caps| {
810                let attrs = caps.get(1).map_or("", |m| m.as_str());
811                let inner = caps.get(2).map_or("", |m| m.as_str()).to_string();
812                ExportListItem {
813                    tag: block.tag.clone(),
814                    level: google_docs_list_item_level(attrs, class_styles),
815                    inner,
816                }
817            })
818        })
819        .collect();
820
821    if items.is_empty() {
822        let mut unchanged = String::new();
823        for block in group {
824            write!(unchanged, "<{}>{}</{}>", block.tag, block.inner, block.tag)
825                .expect("write to String");
826        }
827        return unchanged;
828    }
829
830    let top_level_start = group.first().and_then(|block| block.start_attr.clone());
831
832    let mut html = String::new();
833    let mut current_level: Option<usize> = None;
834    let mut open_tags: Vec<Option<String>> = Vec::new();
835    let mut item_open: Vec<bool> = Vec::new();
836    let mut top_level_opened = false;
837
838    for item in items {
839        let level = item.level;
840        while current_level.is_some_and(|current| current > level) {
841            let current = current_level.expect("checked as Some");
842            close_rendered_list(&mut html, &mut open_tags, &mut item_open, current);
843            current_level = current.checked_sub(1);
844        }
845
846        while current_level.is_none_or(|current| current < level) {
847            let next_level = current_level.map_or(0, |current| current + 1);
848            let start_attr = if next_level == 0 && !top_level_opened {
849                top_level_opened = true;
850                top_level_start.as_deref()
851            } else {
852                None
853            };
854            open_rendered_list(
855                &mut html,
856                &mut open_tags,
857                &mut item_open,
858                next_level,
859                &item.tag,
860                start_attr,
861            );
862            current_level = Some(next_level);
863        }
864
865        ensure_list_stack(&mut open_tags, &mut item_open, level);
866        if open_tags[level]
867            .as_deref()
868            .is_some_and(|tag| tag != item.tag)
869        {
870            close_rendered_list(&mut html, &mut open_tags, &mut item_open, level);
871            let start_attr = if level == 0 && !top_level_opened {
872                top_level_opened = true;
873                top_level_start.as_deref()
874            } else {
875                None
876            };
877            open_rendered_list(
878                &mut html,
879                &mut open_tags,
880                &mut item_open,
881                level,
882                &item.tag,
883                start_attr,
884            );
885        } else if open_tags[level].is_none() {
886            let start_attr = if level == 0 && !top_level_opened {
887                top_level_opened = true;
888                top_level_start.as_deref()
889            } else {
890                None
891            };
892            open_rendered_list(
893                &mut html,
894                &mut open_tags,
895                &mut item_open,
896                level,
897                &item.tag,
898                start_attr,
899            );
900        }
901
902        close_rendered_item(&mut html, &mut item_open, level);
903        html.push_str("<li>");
904        html.push_str(&item.inner);
905        item_open[level] = true;
906
907        for deeper in (level + 1)..item_open.len() {
908            item_open[deeper] = false;
909            open_tags[deeper] = None;
910        }
911    }
912
913    while let Some(current) = current_level {
914        close_rendered_list(&mut html, &mut open_tags, &mut item_open, current);
915        current_level = current.checked_sub(1);
916    }
917
918    html
919}
920
/// Grow the per-level bookkeeping vectors so that index `level` is valid in
/// `open_tags`. Both vectors receive the same number of new entries
/// (`None` / `false`); existing entries are untouched.
fn ensure_list_stack(open_tags: &mut Vec<Option<String>>, item_open: &mut Vec<bool>, level: usize) {
    if open_tags.len() > level {
        return;
    }
    let missing = level - open_tags.len() + 1;
    open_tags.resize(open_tags.len() + missing, None);
    item_open.resize(item_open.len() + missing, false);
}
927
928fn open_rendered_list(
929    html: &mut String,
930    open_tags: &mut Vec<Option<String>>,
931    item_open: &mut Vec<bool>,
932    level: usize,
933    tag: &str,
934    start_attr: Option<&str>,
935) {
936    ensure_list_stack(open_tags, item_open, level);
937    html.push('<');
938    html.push_str(tag);
939    if let Some(start) = start_attr {
940        if tag == "ol" && !start.is_empty() {
941            write!(html, r#" start="{start}""#).expect("write to String");
942        }
943    }
944    html.push('>');
945    open_tags[level] = Some(tag.to_string());
946    item_open[level] = false;
947}
948
/// Close the `<li>` currently open at `level`, if there is one.
///
/// Levels beyond the end of `item_open` are treated as having no open item.
fn close_rendered_item(html: &mut String, item_open: &mut [bool], level: usize) {
    if let Some(open) = item_open.get_mut(level) {
        if *open {
            html.push_str("</li>");
            *open = false;
        }
    }
}
955
956fn close_rendered_list(
957    html: &mut String,
958    open_tags: &mut [Option<String>],
959    item_open: &mut [bool],
960    level: usize,
961) {
962    close_rendered_item(html, item_open, level);
963    if let Some(tag) = open_tags.get_mut(level).and_then(Option::take) {
964        html.push_str("</");
965        html.push_str(&tag);
966        html.push('>');
967    }
968}
969
970fn google_docs_list_item_level(attrs: &str, class_styles: &HashMap<String, String>) -> usize {
971    let style = combined_attr_style(class_styles, attrs);
972    let margin_left = css_point_value(&style, "margin-left");
973    if margin_left <= 0.0 {
974        return 0;
975    }
976    [54.0, 90.0, 126.0, 162.0, 198.0, 234.0, 270.0, 306.0]
977        .iter()
978        .take_while(|boundary| margin_left >= **boundary)
979        .count()
980}
981
982fn combined_attr_style(class_styles: &HashMap<String, String>, attrs: &str) -> String {
983    let mut styles = String::new();
984    if let Some(style) = attr_value(attrs, "style") {
985        styles.push_str(&style);
986    }
987    if let Some(class_attr) = attr_value(attrs, "class") {
988        styles.push_str(&combined_class_style(class_styles, &class_attr));
989    }
990    styles
991}
992
993fn attr_value(attrs: &str, name: &str) -> Option<String> {
994    let attr_re = Regex::new(&format!(
995        r#"(?is)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)')"#,
996        regex::escape(name)
997    ))
998    .expect("valid regex");
999    attr_re.captures(attrs).and_then(|caps| {
1000        caps.get(1)
1001            .or_else(|| caps.get(2))
1002            .map(|value| value.as_str().to_string())
1003    })
1004}
1005
1006fn strip_google_docs_heading_noise(html: &str) -> String {
1007    let empty_anchor_re = Regex::new(r#"(?is)<a\s+id="[^"]*"\s*>\s*</a>"#).expect("valid regex");
1008    let numbering_re =
1009        Regex::new(r"(?is)<span\b[^>]*>\s*\d+(?:\.\d+)*\.?\s*</span>").expect("valid regex");
1010    let mut out = empty_anchor_re.replace_all(html, "").into_owned();
1011    for level in 1..=6 {
1012        let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
1013            .expect("valid regex");
1014        out = heading_re
1015            .replace_all(&out, |caps: &regex::Captures<'_>| {
1016                let open = &caps[1];
1017                let inner = &caps[2];
1018                let close = &caps[3];
1019                let mut cleaned = empty_anchor_re.replace_all(inner, "").into_owned();
1020                cleaned = numbering_re.replace_all(&cleaned, "").into_owned();
1021                format!("{open}{cleaned}{close}")
1022            })
1023            .into_owned();
1024    }
1025    out
1026}
1027
1028fn strip_heading_inline_formatting(html: &str) -> String {
1029    let inline_marker_re = Regex::new(r"(?is)</?(?:strong|em|del)>").expect("valid regex");
1030    let mut out = html.to_string();
1031    for level in 1..=6 {
1032        let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
1033            .expect("valid regex");
1034        out = heading_re
1035            .replace_all(&out, |caps: &regex::Captures<'_>| {
1036                let open = &caps[1];
1037                let inner = &caps[2];
1038                let close = &caps[3];
1039                let cleaned = inline_marker_re.replace_all(inner, "");
1040                format!("{open}{cleaned}{close}")
1041            })
1042            .into_owned();
1043    }
1044    out
1045}
1046
1047fn unwrap_google_redirect_links(html: &str, unwrapped_links: &mut usize) -> String {
1048    let redirect_re =
1049        Regex::new(r#"(?i)href="https?://(?:www\.)?google\.com/url\?q=([^&"]+)[^"]*""#)
1050            .expect("valid regex");
1051    redirect_re
1052        .replace_all(html, |caps: &regex::Captures<'_>| {
1053            let encoded = caps.get(1).map_or("", |m| m.as_str());
1054            let decoded = percent_decode_utf8_lossy(encoded);
1055            *unwrapped_links += 1;
1056            format!(r#"href="{decoded}""#)
1057        })
1058        .into_owned()
1059}
1060
1061fn extract_css_class_styles(html: &str) -> HashMap<String, String> {
1062    let mut class_styles: HashMap<String, String> = HashMap::new();
1063    let style_re = Regex::new(r"(?is)<style\b[^>]*>(.*?)</style>").expect("valid regex");
1064    let class_re = Regex::new(r"\.([A-Za-z0-9_-]+)\s*\{([^{}]*)\}").expect("valid regex");
1065    for style_caps in style_re.captures_iter(html) {
1066        let css = style_caps.get(1).map_or("", |m| m.as_str());
1067        for class_caps in class_re.captures_iter(css) {
1068            let class_name = class_caps.get(1).map_or("", |m| m.as_str());
1069            let style = class_caps.get(2).map_or("", |m| m.as_str());
1070            class_styles
1071                .entry(class_name.to_string())
1072                .and_modify(|existing| {
1073                    existing.push(';');
1074                    existing.push_str(style);
1075                })
1076                .or_insert_with(|| style.to_string());
1077        }
1078    }
1079    class_styles
1080}
1081
/// Join the stored styles of every known class named in `class_attr`.
///
/// Each contributing style is preceded by `;`; class names missing from
/// `class_styles` contribute nothing.
fn combined_class_style(class_styles: &HashMap<String, String>, class_attr: &str) -> String {
    let mut merged = String::new();
    for class_name in class_attr.split_whitespace() {
        if let Some(style) = class_styles.get(class_name) {
            merged.push(';');
            merged.push_str(style);
        }
    }
    merged
}
1092
1093fn semantic_wrapped_html(inner: &str, style: &str) -> Option<String> {
1094    let bold = css_has_bold(style);
1095    let italic = css_has_italic(style);
1096    let strike = css_has_strike(style);
1097    if !bold && !italic && !strike {
1098        return None;
1099    }
1100    let mut wrapped = inner.to_string();
1101    if strike {
1102        wrapped = format!("<del>{wrapped}</del>");
1103    }
1104    if italic {
1105        wrapped = format!("<em>{wrapped}</em>");
1106    }
1107    if bold {
1108        wrapped = format!("<strong>{wrapped}</strong>");
1109    }
1110    Some(wrapped)
1111}
1112
1113fn css_has_bold(style: &str) -> bool {
1114    Regex::new(r"(?i)font-weight\s*:\s*(?:bold|[6-9]\d{2})")
1115        .expect("valid regex")
1116        .is_match(style)
1117}
1118
1119fn css_has_italic(style: &str) -> bool {
1120    Regex::new(r"(?i)font-style\s*:\s*italic")
1121        .expect("valid regex")
1122        .is_match(style)
1123}
1124
1125fn css_has_strike(style: &str) -> bool {
1126    Regex::new(r"(?i)text-decoration[^;]*\bline-through\b")
1127        .expect("valid regex")
1128        .is_match(style)
1129}
1130
1131fn is_blockquote_style(style: &str) -> bool {
1132    let margin_left = css_point_value(style, "margin-left");
1133    let margin_right = css_point_value(style, "margin-right");
1134    margin_left > 0.0 && margin_right > 0.0 && (margin_left - margin_right).abs() < 0.1
1135}
1136
1137fn css_point_value(style: &str, property: &str) -> f64 {
1138    let re = Regex::new(&format!(
1139        r"(?i){}\s*:\s*(-?\d+(?:\.\d+)?)pt",
1140        regex::escape(property)
1141    ))
1142    .expect("valid regex");
1143    re.captures(style)
1144        .and_then(|caps| caps.get(1))
1145        .and_then(|value| value.as_str().parse::<f64>().ok())
1146        .unwrap_or(0.0)
1147}
1148
/// Decode `%XX` percent escapes in `input`.
///
/// A `%` that is not followed by two hexadecimal digits is copied through
/// unchanged. The decoded bytes are converted to a `String` lossily, so byte
/// sequences that are not valid UTF-8 become U+FFFD.
fn percent_decode_utf8_lossy(input: &str) -> String {
    /// Value of a single ASCII hexadecimal digit, if `byte` is one.
    fn hex_nibble(byte: u8) -> Option<u8> {
        match byte {
            b'0'..=b'9' => Some(byte - b'0'),
            b'a'..=b'f' => Some(byte - b'a' + 10),
            b'A'..=b'F' => Some(byte - b'A' + 10),
            _ => None,
        }
    }

    let raw = input.as_bytes();
    let mut decoded = Vec::with_capacity(raw.len());
    let mut cursor = 0;
    while let Some(&byte) = raw.get(cursor) {
        if byte == b'%' {
            // A full escape needs two more bytes after the percent sign.
            if let Some(&[_, hi, lo]) = raw.get(cursor..cursor + 3) {
                if let (Some(hi), Some(lo)) = (hex_nibble(hi), hex_nibble(lo)) {
                    decoded.push((hi << 4) | lo);
                    cursor += 3;
                    continue;
                }
            }
        }
        decoded.push(byte);
        cursor += 1;
    }
    String::from_utf8_lossy(&decoded).into_owned()
}
1172
/// Remove the backslash from the escaped punctuation `\.`, `\!`, `\(`, `\)`,
/// `\[` and `\]`, applying the replacements one after another in that order.
fn unescape_public_export_punctuation(markdown: &str) -> String {
    const UNESCAPES: [(&str, &str); 6] = [
        ("\\.", "."),
        ("\\!", "!"),
        ("\\(", "("),
        ("\\)", ")"),
        ("\\[", "["),
        ("\\]", "]"),
    ];
    UNESCAPES
        .iter()
        .fold(markdown.to_owned(), |text, &(escaped, plain)| {
            text.replace(escaped, plain)
        })
}
1182
1183fn convert_setext_headings(markdown: &str) -> String {
1184    let lines: Vec<&str> = markdown.lines().collect();
1185    let mut out = Vec::with_capacity(lines.len());
1186    let mut index = 0;
1187    while index < lines.len() {
1188        if index + 1 < lines.len() {
1189            let underline = lines[index + 1].trim();
1190            if is_setext_underline(underline, '=') {
1191                out.push(format!("# {}", lines[index].trim()));
1192                index += 2;
1193                continue;
1194            }
1195            if is_setext_underline(underline, '-') {
1196                out.push(format!("## {}", lines[index].trim()));
1197                index += 2;
1198                continue;
1199            }
1200        }
1201        out.push(lines[index].to_string());
1202        index += 1;
1203    }
1204    out.join("\n")
1205}
1206
/// A setext underline here is a line at least five bytes long that consists
/// solely of `marker` characters.
fn is_setext_underline(line: &str, marker: char) -> bool {
    if line.len() < 5 {
        return false;
    }
    !line.chars().any(|ch| ch != marker)
}
1210
1211fn normalize_atx_headings(markdown: &str) -> String {
1212    let heading_re = Regex::new(r"^(#{1,6})\s+(.+?)\s*$").expect("valid regex");
1213    let closing_re = closing_atx_heading_re();
1214    markdown
1215        .lines()
1216        .map(|line| {
1217            let Some(caps) = heading_re.captures(line) else {
1218                return line.to_string();
1219            };
1220            let hashes = caps.get(1).map_or("", |m| m.as_str());
1221            let mut text = caps.get(2).map_or("", |m| m.as_str()).trim().to_string();
1222            text = closing_re.replace(&text, "").trim().to_string();
1223            text = strip_wrapping_markdown_emphasis(&text);
1224            format!("{hashes} {text}")
1225        })
1226        .collect::<Vec<_>>()
1227        .join("\n")
1228}
1229
/// Remove one layer of `***`, `**` or `*` that both starts and ends the
/// trimmed text, trying the longest marker first.
///
/// The text must be longer than the two markers combined. The unwrapped
/// content is trimmed again. Text without such a wrapper is returned trimmed.
fn strip_wrapping_markdown_emphasis(text: &str) -> String {
    let candidate = text.trim();
    let unwrapped = ["***", "**", "*"].iter().find_map(|marker| {
        if candidate.len() <= marker.len() * 2 {
            return None;
        }
        candidate.strip_prefix(*marker)?.strip_suffix(*marker)
    });
    match unwrapped {
        Some(inner) => inner.trim().to_owned(),
        None => candidate.to_owned(),
    }
}
1244
1245fn normalize_bullet_markers(markdown: &str) -> String {
1246    let bullet_re = asterisk_bullet_re();
1247    markdown
1248        .lines()
1249        .map(|line| bullet_re.replace(line, "$1- ").into_owned())
1250        .collect::<Vec<_>>()
1251        .join("\n")
1252}
1253
1254fn normalize_list_spacing(markdown: &str) -> String {
1255    let lines: Vec<&str> = markdown.lines().collect();
1256    let mut out = Vec::with_capacity(lines.len());
1257
1258    for (index, line) in lines.iter().enumerate() {
1259        if line.trim().is_empty()
1260            && previous_non_empty_line(&lines, index).is_some_and(is_markdown_list_item)
1261            && next_non_empty_line(&lines, index).is_some_and(is_markdown_list_item)
1262        {
1263            continue;
1264        }
1265        out.push((*line).to_string());
1266    }
1267
1268    out.join("\n")
1269}
1270
/// Nearest line before position `index` that is not blank after trimming.
///
/// Panics if `index` is greater than `lines.len()`.
fn previous_non_empty_line<'a>(lines: &'a [&str], index: usize) -> Option<&'a str> {
    let (before, _) = lines.split_at(index);
    before
        .iter()
        .rfind(|candidate| !candidate.trim().is_empty())
        .copied()
}
1278
/// Nearest line after position `index` that is not blank after trimming.
///
/// Panics if `index + 1` is greater than `lines.len()`.
fn next_non_empty_line<'a>(lines: &'a [&str], index: usize) -> Option<&'a str> {
    let after = &lines[index + 1..];
    for candidate in after {
        if !candidate.trim().is_empty() {
            return Some(*candidate);
        }
    }
    None
}
1285
1286fn is_markdown_list_item(line: &str) -> bool {
1287    markdown_list_item_re().is_match(line)
1288}
1289
/// Normalise blank lines in and around `> ` block quotes.
///
/// - Blank lines and bare `>` lines inside a quote are collapsed: at most one
///   `>` separator line is written, and only if another `> ` line follows.
/// - Bare `>` lines outside a quote are dropped.
/// - The first non-quote line after a quote is preceded by one empty line.
///
/// Every emitted line is terminated with `\n`.
fn normalize_blockquote_spacing(markdown: &str) -> String {
    let mut rendered = String::with_capacity(markdown.len());
    let mut inside_quote = false;
    let mut gap_pending = false;

    for raw in markdown.lines() {
        let trimmed = raw.trim();
        let is_blank = trimmed.is_empty();

        // Separator candidates are swallowed; they are remembered only while
        // a quote is open.
        if trimmed == ">" || (is_blank && inside_quote) {
            gap_pending = gap_pending || inside_quote;
            continue;
        }

        if raw.starts_with("> ") {
            if std::mem::take(&mut gap_pending) {
                rendered.push_str(">\n");
            }
            inside_quote = true;
        } else {
            // Leaving a quote: separate it from the following text.
            if inside_quote && !is_blank {
                rendered.push('\n');
            }
            gap_pending = false;
            inside_quote = false;
        }

        rendered.push_str(raw);
        rendered.push('\n');
    }

    rendered
}
1330
1331fn normalize_markdown_tables(markdown: &str) -> String {
1332    let lines: Vec<&str> = markdown.lines().collect();
1333    let mut out = Vec::with_capacity(lines.len());
1334    let mut index = 0;
1335
1336    while index < lines.len() {
1337        if !is_markdown_table_line(lines[index]) {
1338            out.push(lines[index].to_string());
1339            index += 1;
1340            continue;
1341        }
1342
1343        let start = index;
1344        while index < lines.len() && is_markdown_table_line(lines[index]) {
1345            index += 1;
1346        }
1347        let block = &lines[start..index];
1348        if block.len() >= 2 && is_markdown_separator_line(block[1]) {
1349            out.extend(normalize_markdown_table_block(block));
1350        } else {
1351            out.extend(block.iter().map(|line| (*line).to_string()));
1352        }
1353    }
1354
1355    out.join("\n")
1356}
1357
/// A table line, after trimming, starts and ends with `|` and contains at
/// least two pipe characters.
fn is_markdown_table_line(line: &str) -> bool {
    let candidate = line.trim();
    let pipe_count = candidate.chars().filter(|ch| *ch == '|').count();
    pipe_count >= 2 && candidate.starts_with('|') && candidate.ends_with('|')
}
1362
1363fn is_markdown_separator_line(line: &str) -> bool {
1364    split_markdown_table_cells(line)
1365        .iter()
1366        .all(|cell| markdown_table_separator_cell_re().is_match(cell))
1367}
1368
1369fn normalize_markdown_table_block(lines: &[&str]) -> Vec<String> {
1370    lines
1371        .iter()
1372        .enumerate()
1373        .map(|(index, line)| {
1374            let cells = split_markdown_table_cells(line);
1375            if index == 1 {
1376                let separators = vec!["---".to_string(); cells.len()];
1377                render_markdown_table_row(&separators)
1378            } else {
1379                render_markdown_table_row(&cells)
1380            }
1381        })
1382        .collect()
1383}
1384
/// Split one Markdown table row into its trimmed cell texts.
///
/// At most one leading and one trailing border pipe are removed, so empty
/// edge cells (`||a|` → `["", "a"]`) keep their column position. A pipe
/// preceded by an unescaped backslash (`\|`) is literal cell content and does
/// not start a new cell. A row with no cells at all yields a single empty
/// cell.
fn split_markdown_table_cells(line: &str) -> Vec<String> {
    let trimmed = line.trim();
    // Only the outer border pipe is structural; further pipes delimit cells.
    let body = trimmed.strip_prefix('|').unwrap_or(trimmed);

    let mut cells = Vec::new();
    let mut current = String::new();
    let mut escaped = false;
    for ch in body.chars() {
        if ch == '|' && !escaped {
            cells.push(current.trim().to_string());
            current.clear();
        } else {
            current.push(ch);
        }
        // A backslash escapes the next character unless it is itself escaped.
        escaped = ch == '\\' && !escaped;
    }

    // An empty remainder after the last unescaped pipe is the closing border,
    // not a cell — except when nothing was collected at all.
    if !current.is_empty() || cells.is_empty() {
        cells.push(current.trim().to_string());
    }
    cells
}
1392
/// Render cells as a table row: `| cell | cell |`, with cells separated by
/// ` | ` and the row wrapped in `| ` … ` |`.
fn render_markdown_table_row(cells: &[String]) -> String {
    let mut row = String::from("| ");
    row.push_str(&cells.join(" | "));
    row.push_str(" |");
    row
}
1396
1397fn closing_atx_heading_re() -> &'static Regex {
1398    static RE: OnceLock<Regex> = OnceLock::new();
1399    RE.get_or_init(|| Regex::new(r"\s+#{1,6}$").expect("valid regex"))
1400}
1401
1402fn asterisk_bullet_re() -> &'static Regex {
1403    static RE: OnceLock<Regex> = OnceLock::new();
1404    RE.get_or_init(|| Regex::new(r"^(\s*)\* ").expect("valid regex"))
1405}
1406
1407fn markdown_list_item_re() -> &'static Regex {
1408    static RE: OnceLock<Regex> = OnceLock::new();
1409    RE.get_or_init(|| Regex::new(r"^\s*(?:[-+*]|\d+\.)\s+").expect("valid regex"))
1410}
1411
1412fn markdown_table_separator_cell_re() -> &'static Regex {
1413    static RE: OnceLock<Regex> = OnceLock::new();
1414    RE.get_or_init(|| Regex::new(r"^:?-{3,}:?$").expect("valid regex"))
1415}
1416
1417/// Fetch and render a Google Docs document via the authenticated REST API.
1418///
1419/// # Errors
1420///
1421/// Returns an error when the URL is invalid, no token is provided, or the API request fails.
1422pub async fn fetch_google_doc_from_docs_api(
1423    url: &str,
1424    api_token: &str,
1425) -> crate::Result<GDocsRenderedResult> {
1426    let document_id = extract_document_id(url).ok_or_else(|| {
1427        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
1428    })?;
1429    let api_url = build_docs_api_url(&document_id);
1430    debug!(
1431        document_id = %document_id,
1432        api_url = %api_url,
1433        "fetching Google Doc via Docs API"
1434    );
1435
1436    let response = reqwest::Client::new()
1437        .get(&api_url)
1438        .header("Authorization", format!("Bearer {api_token}"))
1439        .header("Accept", "application/json")
1440        .send()
1441        .await
1442        .map_err(|e| {
1443            WebCaptureError::FetchError(format!("Failed to fetch Google Doc via Docs API: {e}"))
1444        })?;
1445    debug!(
1446        document_id = %document_id,
1447        status = response.status().as_u16(),
1448        success = response.status().is_success(),
1449        content_type = response
1450            .headers()
1451            .get(reqwest::header::CONTENT_TYPE)
1452            .and_then(|value| value.to_str().ok())
1453            .unwrap_or(""),
1454        "received Google Docs API response"
1455    );
1456
1457    if !response.status().is_success() {
1458        return Err(WebCaptureError::FetchError(format!(
1459            "Failed to fetch Google Doc via Docs API ({} {}): {}",
1460            response.status().as_u16(),
1461            response.status().canonical_reason().unwrap_or("Unknown"),
1462            api_url
1463        )));
1464    }
1465
1466    let body = response.text().await.map_err(|e| {
1467        WebCaptureError::FetchError(format!("Failed to read Google Docs API response: {e}"))
1468    })?;
1469    let document = serde_json::from_str::<Value>(&body).map_err(|e| {
1470        WebCaptureError::ParseError(format!("Failed to parse Google Docs API response: {e}"))
1471    })?;
1472    let rendered = render_docs_api_document(&document);
1473    debug!(
1474        document_id = %document_id,
1475        title = document.get("title").and_then(|value| value.as_str()).unwrap_or(""),
1476        markdown_bytes = rendered.markdown.len(),
1477        html_bytes = rendered.html.len(),
1478        text_bytes = rendered.text.len(),
1479        "rendered Google Docs API document"
1480    );
1481
1482    Ok(GDocsRenderedResult {
1483        markdown: rendered.markdown,
1484        html: rendered.html,
1485        text: rendered.text,
1486        document_id,
1487        export_url: api_url,
1488        remote_images: Vec::new(),
1489    })
1490}
1491
1492/// Fetch and render the model data embedded in the Google Docs `/edit` route.
1493///
1494/// # Errors
1495///
1496/// Returns an error when the URL is invalid, the fetch fails, or no model chunks are present.
1497pub async fn fetch_google_doc_from_model(
1498    url: &str,
1499    api_token: Option<&str>,
1500) -> crate::Result<GDocsRenderedResult> {
1501    if api_token.is_some() {
1502        return Err(WebCaptureError::BrowserError(
1503            "Rust browser-model Google Docs capture cannot inject API tokens; use --capture api for authenticated Docs API capture".to_string(),
1504        ));
1505    }
1506    let document_id = extract_document_id(url).ok_or_else(|| {
1507        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
1508    })?;
1509    let edit_url = build_edit_url(&document_id);
1510    debug!(
1511        document_id = %document_id,
1512        edit_url = %edit_url,
1513        "capturing Google Doc editor model with a real browser"
1514    );
1515    let model_data = fetch_google_doc_editor_model_with_cdp(&edit_url, &document_id).await?;
1516    let BrowserModelData {
1517        chunks,
1518        cid_urls,
1519        chunk_payload_bytes,
1520        poll_count,
1521        stable_for,
1522    } = model_data;
1523    debug!(
1524        document_id = %document_id,
1525        chunks = chunks.len(),
1526        cid_urls = cid_urls.len(),
1527        chunk_payload_bytes,
1528        poll_count,
1529        stable_for_ms = stable_for.as_millis(),
1530        "extracted Google Docs editor model chunks through CDP"
1531    );
1532    if chunks.is_empty() {
1533        return Err(WebCaptureError::ParseError(
1534            "Google Docs editor page did not expose DOCS_modelChunk data".to_string(),
1535        ));
1536    }
1537
1538    let export_html = match fetch_google_doc(url, "html", None).await {
1539        Ok(result) => Some(result.content),
1540        Err(error) => {
1541            warn!(
1542                document_id = %document_id,
1543                error = %error,
1544                "failed to fetch Google Docs export HTML for browser-model semantic hints"
1545            );
1546            None
1547        }
1548    };
1549    let capture = parse_model_chunks_with_export_html(&chunks, &cid_urls, export_html.as_deref());
1550    let remote_images = remote_images_from_capture(&capture);
1551    info!(
1552        document_id = %document_id,
1553        chunks = chunks.len(),
1554        cid_urls = cid_urls.len(),
1555        chunk_payload_bytes,
1556        poll_count,
1557        stable_for_ms = stable_for.as_millis(),
1558        blocks = capture.blocks.len(),
1559        tables = capture.tables.len(),
1560        images = capture.images.len(),
1561        text_bytes = capture.text.len(),
1562        "parsed Google Docs editor model"
1563    );
1564
1565    Ok(GDocsRenderedResult {
1566        markdown: render_captured_document(&capture, "markdown"),
1567        html: render_captured_document(&capture, "html"),
1568        text: render_captured_document(&capture, "txt"),
1569        document_id,
1570        export_url: edit_url,
1571        remote_images,
1572    })
1573}
1574
1575async fn fetch_google_doc_editor_model_with_cdp(
1576    edit_url: &str,
1577    document_id: &str,
1578) -> crate::Result<BrowserModelData> {
1579    let chrome = crate::browser::find_chrome_executable().ok_or_else(|| {
1580        WebCaptureError::BrowserError(
1581            "Chrome/Chromium executable was not found. Set WEB_CAPTURE_CHROME, CHROME_PATH, or GOOGLE_CHROME_BIN.".to_string(),
1582        )
1583    })?;
1584    let user_data_dir = crate::browser::temporary_user_data_dir();
1585    std::fs::create_dir_all(&user_data_dir)?;
1586
1587    debug!(
1588        document_id = %document_id,
1589        chrome = %chrome.display(),
1590        user_data_dir = %user_data_dir.display(),
1591        edit_url = %edit_url,
1592        "launching headless Chrome CDP session for Google Docs model capture"
1593    );
1594
1595    let mut child = launch_cdp_chrome(&chrome, &user_data_dir)?;
1596    let capture_result = async {
1597        let ws_url = wait_for_devtools_ws_url(&mut child).await?;
1598        let (mut ws, _) = connect_async(&ws_url).await.map_err(|error| {
1599            WebCaptureError::BrowserError(format!(
1600                "Failed to connect to Chrome DevTools websocket: {error}"
1601            ))
1602        })?;
1603        let mut next_id = 0u64;
1604        let session_id = navigate_google_docs_cdp_page(&mut ws, &mut next_id, edit_url).await?;
1605        wait_for_google_docs_model_chunks(&mut ws, &mut next_id, &session_id, document_id).await
1606    }
1607    .await;
1608
1609    if let Err(error) = child.kill().await {
1610        debug!(
1611            document_id = %document_id,
1612            error = %error,
1613            "failed to kill Chrome CDP browser process"
1614        );
1615    }
1616    let _ = child.wait().await;
1617    let _ = std::fs::remove_dir_all(&user_data_dir);
1618
1619    capture_result
1620}
1621
1622async fn navigate_google_docs_cdp_page(
1623    ws: &mut CdpWebSocket,
1624    next_id: &mut u64,
1625    edit_url: &str,
1626) -> crate::Result<String> {
1627    let target = cdp_send(
1628        ws,
1629        next_id,
1630        None,
1631        "Target.createTarget",
1632        serde_json::json!({ "url": "about:blank" }),
1633    )
1634    .await?;
1635    let target_id = target
1636        .get("targetId")
1637        .and_then(Value::as_str)
1638        .ok_or_else(|| {
1639            WebCaptureError::BrowserError(
1640                "Chrome DevTools Target.createTarget did not return targetId".to_string(),
1641            )
1642        })?
1643        .to_string();
1644    let attached = cdp_send(
1645        ws,
1646        next_id,
1647        None,
1648        "Target.attachToTarget",
1649        serde_json::json!({ "targetId": target_id, "flatten": true }),
1650    )
1651    .await?;
1652    let session_id = attached
1653        .get("sessionId")
1654        .and_then(Value::as_str)
1655        .ok_or_else(|| {
1656            WebCaptureError::BrowserError(
1657                "Chrome DevTools Target.attachToTarget did not return sessionId".to_string(),
1658            )
1659        })?
1660        .to_string();
1661
1662    cdp_send(
1663        ws,
1664        next_id,
1665        Some(&session_id),
1666        "Page.enable",
1667        serde_json::json!({}),
1668    )
1669    .await?;
1670    cdp_send(
1671        ws,
1672        next_id,
1673        Some(&session_id),
1674        "Runtime.enable",
1675        serde_json::json!({}),
1676    )
1677    .await?;
1678    cdp_send(
1679        ws,
1680        next_id,
1681        Some(&session_id),
1682        "Page.addScriptToEvaluateOnNewDocument",
1683        serde_json::json!({ "source": GDOCS_MODEL_CAPTURE_INIT_SCRIPT }),
1684    )
1685    .await?;
1686    cdp_send(
1687        ws,
1688        next_id,
1689        Some(&session_id),
1690        "Page.navigate",
1691        serde_json::json!({ "url": edit_url }),
1692    )
1693    .await?;
1694
1695    Ok(session_id)
1696}
1697
/// Poll the page for Google Docs model data until it stops changing.
///
/// Each iteration evaluates `GDOCS_MODEL_EXTRACT_SCRIPT` in the page via
/// `Runtime.evaluate`, converts the returned value into model data and feeds
/// its fingerprint to a `BrowserModelQuiescence` tracker. As soon as the
/// tracker reports a stable duration, that snapshot is returned with
/// `poll_count` and `stable_for` filled in. Otherwise the loop sleeps for
/// `GDOCS_EDITOR_MODEL_POLL_INTERVAL` and polls again.
///
/// # Errors
///
/// Returns an error when a CDP command fails, when the extraction script
/// raises an exception, or when the maximum wait elapses without the tracker
/// reporting quiescence. The timeout message includes the counters from the
/// last poll.
async fn wait_for_google_docs_model_chunks(
    ws: &mut CdpWebSocket,
    next_id: &mut u64,
    session_id: &str,
    document_id: &str,
) -> crate::Result<BrowserModelData> {
    let started = Instant::now();
    let max_wait = gdocs_editor_model_max_wait();
    let stability_window = gdocs_editor_model_stability_window();
    let mut quiescence = BrowserModelQuiescence::default();
    // Counters from the most recent poll, kept only for the timeout message.
    let mut last_chunks = 0usize;
    let mut last_cid_urls = 0usize;
    let mut last_payload_bytes = 0usize;
    let mut last_stable_for = Duration::ZERO;
    let mut poll_count = 0usize;

    // The deadline is checked only between polls; a single poll is not
    // interrupted when it runs past `max_wait`.
    while started.elapsed() < max_wait {
        let result = cdp_send(
            ws,
            next_id,
            Some(session_id),
            "Runtime.evaluate",
            serde_json::json!({
                // The script is wrapped in parentheses and invoked immediately.
                "expression": format!("({GDOCS_MODEL_EXTRACT_SCRIPT})()"),
                "returnByValue": true,
                "awaitPromise": true
            }),
        )
        .await?;
        if let Some(exception) = result.get("exceptionDetails") {
            return Err(WebCaptureError::BrowserError(format!(
                "Google Docs model extraction script failed: {exception}"
            )));
        }
        // A reply without `/result/value` is treated as JSON null.
        let value = result
            .pointer("/result/value")
            .cloned()
            .unwrap_or(Value::Null);
        let model_data = browser_model_data_from_value(&value);
        poll_count += 1;
        let fingerprint = model_data.fingerprint();
        last_chunks = model_data.chunks.len();
        last_cid_urls = model_data.cid_urls.len();
        last_payload_bytes = model_data.chunk_payload_bytes;
        let now = Instant::now();
        // NOTE(review): `observe` is defined elsewhere; presumably it yields
        // `Some` once the fingerprint has been unchanged for the stability
        // window — confirm against `BrowserModelQuiescence`.
        if let Some(stable_for) = quiescence.observe(fingerprint, now, stability_window) {
            let mut model_data = model_data;
            model_data.poll_count = poll_count;
            model_data.stable_for = stable_for;
            debug!(
                document_id = %document_id,
                chunks = model_data.chunks.len(),
                cid_urls = model_data.cid_urls.len(),
                chunk_payload_bytes = model_data.chunk_payload_bytes,
                poll_count,
                stable_for_ms = stable_for.as_millis(),
                elapsed_ms = started.elapsed().as_millis(),
                "captured quiesced Google Docs model chunks through CDP Runtime.evaluate"
            );
            return Ok(model_data);
        }
        last_stable_for = quiescence.stable_for(now);
        tokio::time::sleep(GDOCS_EDITOR_MODEL_POLL_INTERVAL).await;
    }

    Err(WebCaptureError::BrowserError(format!(
        "Timed out waiting for Google Docs DOCS_modelChunk stream to quiesce for document {document_id} after {} ms (last chunks={last_chunks}, payload_bytes={last_payload_bytes}, cid_urls={last_cid_urls}, poll_count={poll_count}, stable_for_ms={})",
        max_wait.as_millis(),
        last_stable_for.as_millis()
    )))
}
1769
1770fn launch_cdp_chrome(
1771    chrome: &std::path::Path,
1772    user_data_dir: &std::path::Path,
1773) -> crate::Result<Child> {
1774    let mut command = Command::new(chrome);
1775    command
1776        .args([
1777            "--headless=new",
1778            "--disable-gpu",
1779            "--disable-extensions",
1780            "--disable-dev-shm-usage",
1781            "--disable-background-networking",
1782            "--disable-component-update",
1783            "--disable-default-apps",
1784            "--disable-sync",
1785            "--metrics-recording-only",
1786            "--no-default-browser-check",
1787            "--no-first-run",
1788            "--no-sandbox",
1789            "--remote-debugging-port=0",
1790            "--window-size=1280,800",
1791        ])
1792        .arg(format!("--user-data-dir={}", user_data_dir.display()))
1793        .arg(format!("--user-agent={GDOCS_USER_AGENT}"))
1794        .stderr(Stdio::piped())
1795        .stdout(Stdio::null())
1796        .kill_on_drop(true);
1797
1798    command.spawn().map_err(|error| {
1799        WebCaptureError::BrowserError(format!("Failed to launch Chrome CDP browser: {error}"))
1800    })
1801}
1802
1803async fn wait_for_devtools_ws_url(child: &mut Child) -> crate::Result<String> {
1804    let stderr = child.stderr.take().ok_or_else(|| {
1805        WebCaptureError::BrowserError("Chrome CDP process did not expose stderr".to_string())
1806    })?;
1807    let mut lines = BufReader::new(stderr).lines();
1808    let started = Instant::now();
1809
1810    while started.elapsed() < GDOCS_BROWSER_LAUNCH_TIMEOUT {
1811        let line = tokio::time::timeout(Duration::from_millis(250), lines.next_line()).await;
1812        match line {
1813            Ok(Ok(Some(line))) => {
1814                if let Some((_, ws_url)) = line.split_once("DevTools listening on ") {
1815                    return Ok(ws_url.trim().to_string());
1816                }
1817            }
1818            Ok(Ok(None)) => {
1819                break;
1820            }
1821            Ok(Err(error)) => {
1822                return Err(WebCaptureError::BrowserError(format!(
1823                    "Failed to read Chrome CDP stderr: {error}"
1824                )));
1825            }
1826            Err(_) => {}
1827        }
1828    }
1829
1830    Err(WebCaptureError::BrowserError(format!(
1831        "Timed out waiting for Chrome DevTools websocket URL after {} ms",
1832        GDOCS_BROWSER_LAUNCH_TIMEOUT.as_millis()
1833    )))
1834}
1835
1836async fn cdp_send(
1837    ws: &mut CdpWebSocket,
1838    next_id: &mut u64,
1839    session_id: Option<&str>,
1840    method: &str,
1841    params: Value,
1842) -> crate::Result<Value> {
1843    *next_id += 1;
1844    let id = *next_id;
1845    let mut message = serde_json::json!({
1846        "id": id,
1847        "method": method,
1848        "params": params
1849    });
1850    if let Some(session_id) = session_id {
1851        message["sessionId"] = Value::String(session_id.to_string());
1852    }
1853
1854    ws.send(Message::Text(message.to_string()))
1855        .await
1856        .map_err(|error| {
1857            WebCaptureError::BrowserError(format!(
1858                "Failed to send Chrome DevTools command {method}: {error}"
1859            ))
1860        })?;
1861
1862    while let Some(message) = ws.next().await {
1863        let message = message.map_err(|error| {
1864            WebCaptureError::BrowserError(format!(
1865                "Failed to read Chrome DevTools response for {method}: {error}"
1866            ))
1867        })?;
1868        if !message.is_text() {
1869            continue;
1870        }
1871        let text = message.to_text().map_err(|error| {
1872            WebCaptureError::BrowserError(format!(
1873                "Chrome DevTools response for {method} was not text: {error}"
1874            ))
1875        })?;
1876        let value = serde_json::from_str::<Value>(text).map_err(|error| {
1877            WebCaptureError::ParseError(format!(
1878                "Failed to parse Chrome DevTools response for {method}: {error}; response={text}"
1879            ))
1880        })?;
1881        if value.get("id").and_then(Value::as_u64) != Some(id) {
1882            continue;
1883        }
1884        if let Some(error) = value.get("error") {
1885            return Err(WebCaptureError::BrowserError(format!(
1886                "Chrome DevTools command {method} failed: {error}"
1887            )));
1888        }
1889        return Ok(value.get("result").cloned().unwrap_or(Value::Null));
1890    }
1891
1892    Err(WebCaptureError::BrowserError(format!(
1893        "Chrome DevTools websocket closed before response for {method}"
1894    )))
1895}
1896
1897fn browser_model_data_from_value(value: &Value) -> BrowserModelData {
1898    let chunks = value
1899        .get("chunks")
1900        .and_then(Value::as_array)
1901        .cloned()
1902        .unwrap_or_default();
1903    let chunk_payload_bytes = model_chunk_payload_bytes(&chunks);
1904    let cid_urls = value
1905        .get("cidUrlMap")
1906        .and_then(Value::as_object)
1907        .map(|map| {
1908            map.iter()
1909                .filter_map(|(key, value)| value.as_str().map(|url| (key.clone(), url.to_string())))
1910                .collect::<HashMap<_, _>>()
1911        })
1912        .unwrap_or_default();
1913    BrowserModelData {
1914        chunks,
1915        cid_urls,
1916        chunk_payload_bytes,
1917        poll_count: 0,
1918        stable_for: Duration::ZERO,
1919    }
1920}
1921
1922fn model_chunk_payload_bytes(chunks: &[Value]) -> usize {
1923    chunks
1924        .iter()
1925        .map(|chunk| serde_json::to_vec(chunk).map_or(0, |encoded| encoded.len()))
1926        .sum()
1927}
1928
1929fn gdocs_editor_model_max_wait() -> Duration {
1930    duration_from_env_ms(
1931        "WEB_CAPTURE_GDOCS_MAX_WAIT_MS",
1932        GDOCS_EDITOR_MODEL_MAX_WAIT_DEFAULT,
1933    )
1934}
1935
1936fn gdocs_editor_model_stability_window() -> Duration {
1937    duration_from_env_ms(
1938        "WEB_CAPTURE_GDOCS_STABILITY_MS",
1939        GDOCS_EDITOR_MODEL_STABILITY_DEFAULT,
1940    )
1941}
1942
1943fn duration_from_env_ms(name: &str, default: Duration) -> Duration {
1944    std::env::var(name).map_or(default, |value| match value.trim().parse::<u64>() {
1945        Ok(ms) => Duration::from_millis(ms),
1946        Err(error) => {
1947            warn!(
1948                name,
1949                value,
1950                error = %error,
1951                default_ms = default.as_millis(),
1952                "ignoring invalid Google Docs model wait environment variable"
1953            );
1954            default
1955        }
1956    })
1957}
1958
1959fn remote_images_from_capture(capture: &CapturedDocument) -> Vec<RemoteImage> {
1960    capture
1961        .images
1962        .iter()
1963        .filter_map(|node| match node {
1964            ContentNode::Image {
1965                url: Some(url),
1966                alt,
1967                ..
1968            } => Some(RemoteImage {
1969                url: url.clone(),
1970                alt: alt.clone(),
1971            }),
1972            ContentNode::Image { .. } | ContentNode::Text { .. } => None,
1973        })
1974        .collect()
1975}
1976
1977/// Render a Google Docs REST API document value.
1978#[must_use]
1979pub fn render_docs_api_document(document: &Value) -> GDocsRenderedOutput {
1980    let blocks = structural_elements_to_blocks(
1981        document
1982            .pointer("/body/content")
1983            .and_then(Value::as_array)
1984            .map_or(&[] as &[Value], Vec::as_slice),
1985        document.pointer("/inlineObjects").unwrap_or(&Value::Null),
1986    );
1987    GDocsRenderedOutput {
1988        markdown: render_blocks_markdown(&blocks),
1989        html: render_blocks_html(&blocks),
1990        text: blocks_to_text(&blocks),
1991    }
1992}
1993
/// One document rendered into three output formats.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct GDocsRenderedOutput {
    /// The document rendered as Markdown.
    pub markdown: String,
    /// The document rendered as HTML.
    pub html: String,
    /// The document rendered as plain text.
    pub text: String,
}
2004
2005fn structural_elements_to_blocks(elements: &[Value], inline_objects: &Value) -> Vec<CapturedBlock> {
2006    let mut blocks = Vec::new();
2007    for element in elements {
2008        if let Some(paragraph) = element.get("paragraph") {
2009            let content = paragraph_to_content(paragraph, inline_objects);
2010            if !content_to_text(&content).trim().is_empty()
2011                || content
2012                    .iter()
2013                    .any(|node| matches!(node, ContentNode::Image { .. }))
2014            {
2015                blocks.push(CapturedBlock::Paragraph {
2016                    style: paragraph
2017                        .pointer("/paragraphStyle/namedStyleType")
2018                        .and_then(Value::as_str)
2019                        .map(ToString::to_string),
2020                    list: None,
2021                    quote: false,
2022                    horizontal_rule: false,
2023                    content,
2024                });
2025            }
2026        } else if let Some(table) = element.get("table") {
2027            blocks.push(CapturedBlock::Table(table_to_block(table, inline_objects)));
2028        }
2029    }
2030    blocks
2031}
2032
2033fn table_to_block(table: &Value, inline_objects: &Value) -> TableBlock {
2034    let rows = table
2035        .get("tableRows")
2036        .and_then(Value::as_array)
2037        .map_or(&[] as &[Value], Vec::as_slice)
2038        .iter()
2039        .map(|row| TableRow {
2040            cells: row
2041                .get("tableCells")
2042                .and_then(Value::as_array)
2043                .map_or(&[] as &[Value], Vec::as_slice)
2044                .iter()
2045                .map(|cell| TableCell {
2046                    content: structural_elements_to_inline_content(
2047                        cell.get("content")
2048                            .and_then(Value::as_array)
2049                            .map_or(&[] as &[Value], Vec::as_slice),
2050                        inline_objects,
2051                    ),
2052                })
2053                .collect(),
2054        })
2055        .collect();
2056    TableBlock { rows }
2057}
2058
2059fn structural_elements_to_inline_content(
2060    elements: &[Value],
2061    inline_objects: &Value,
2062) -> Vec<ContentNode> {
2063    let mut content = Vec::new();
2064    for element in elements {
2065        if let Some(paragraph) = element.get("paragraph") {
2066            let paragraph_content = paragraph_to_content(paragraph, inline_objects);
2067            if !content.is_empty() && !paragraph_content.is_empty() {
2068                append_text(&mut content, "\n");
2069            }
2070            content.extend(paragraph_content);
2071        } else if let Some(table) = element.get("table") {
2072            append_text(
2073                &mut content,
2074                &render_blocks_markdown(&[CapturedBlock::Table(table_to_block(
2075                    table,
2076                    inline_objects,
2077                ))]),
2078            );
2079        }
2080    }
2081    content
2082}
2083
2084fn paragraph_to_content(paragraph: &Value, inline_objects: &Value) -> Vec<ContentNode> {
2085    let mut content = Vec::new();
2086    for element in paragraph
2087        .get("elements")
2088        .and_then(Value::as_array)
2089        .map_or(&[] as &[Value], Vec::as_slice)
2090    {
2091        if let Some(text) = element
2092            .pointer("/textRun/content")
2093            .and_then(Value::as_str)
2094            .map(|text| text.strip_suffix('\n').unwrap_or(text))
2095        {
2096            append_text(&mut content, text);
2097        } else if let Some(inline_id) = element
2098            .pointer("/inlineObjectElement/inlineObjectId")
2099            .and_then(Value::as_str)
2100        {
2101            if let Some(image) = inline_object_to_image(inline_id, inline_objects) {
2102                content.push(image);
2103            }
2104        }
2105    }
2106    content
2107}
2108
2109fn inline_object_to_image(inline_id: &str, inline_objects: &Value) -> Option<ContentNode> {
2110    let embedded = inline_objects
2111        .get(inline_id)?
2112        .pointer("/inlineObjectProperties/embeddedObject")?;
2113    let url = embedded
2114        .pointer("/imageProperties/contentUri")
2115        .or_else(|| embedded.pointer("/imageProperties/sourceUri"))
2116        .and_then(Value::as_str)?;
2117    let alt = embedded
2118        .get("title")
2119        .or_else(|| embedded.get("description"))
2120        .and_then(Value::as_str)
2121        .unwrap_or("image");
2122    Some(ContentNode::Image {
2123        cid: None,
2124        url: Some(url.to_string()),
2125        alt: alt.to_string(),
2126        width: json_dimension_to_string(embedded.pointer("/size/width/magnitude")),
2127        height: json_dimension_to_string(embedded.pointer("/size/height/magnitude")),
2128        is_suggestion: false,
2129    })
2130}
2131
2132fn json_dimension_to_string(value: Option<&Value>) -> Option<String> {
2133    match value? {
2134        Value::Number(number) => Some(number.to_string()),
2135        Value::String(text) if !text.is_empty() => Some(text.clone()),
2136        _ => None,
2137    }
2138}
2139
2140fn build_model_style_maps(
2141    items: &[Value],
2142    text_len: usize,
2143    utf16_position_map: &[usize],
2144) -> ModelStyleMaps {
2145    let mut maps = ModelStyleMaps {
2146        inline_styles: vec![TextStyle::default(); text_len],
2147        ..ModelStyleMaps::default()
2148    };
2149
2150    for item in items {
2151        if item.get("ty").and_then(Value::as_str) != Some("as") {
2152            continue;
2153        }
2154        let (Some(start), Some(end), Some(style_type)) = (
2155            item.get("si").and_then(Value::as_u64),
2156            item.get("ei").and_then(Value::as_u64),
2157            item.get("st").and_then(Value::as_str),
2158        ) else {
2159            continue;
2160        };
2161        let (Ok(start), Ok(end)) = (usize::try_from(start), usize::try_from(end)) else {
2162            continue;
2163        };
2164
2165        let start = utf16_position_to_char_position(utf16_position_map, start);
2166        let end = utf16_position_to_char_position(utf16_position_map, end);
2167        if start == 0 || end == 0 {
2168            continue;
2169        }
2170
2171        match style_type {
2172            "text" => {
2173                let style = text_style(item);
2174                apply_inline_style(&mut maps.inline_styles, start, end, &style);
2175            }
2176            "link" => {
2177                let style = TextStyle {
2178                    link: item
2179                        .pointer("/sm/lnks_link/ulnk_url")
2180                        .and_then(Value::as_str)
2181                        .map(ToString::to_string),
2182                    ..TextStyle::default()
2183                };
2184                apply_inline_style(&mut maps.inline_styles, start, end, &style);
2185            }
2186            "paragraph" => {
2187                maps.paragraph_by_end
2188                    .insert(end, paragraph_style_from_model(item));
2189            }
2190            "list" => {
2191                maps.list_by_end.insert(
2192                    end,
2193                    ListMeta {
2194                        id: item
2195                            .pointer("/sm/ls_id")
2196                            .and_then(Value::as_str)
2197                            .unwrap_or("")
2198                            .to_string(),
2199                        level: item
2200                            .pointer("/sm/ls_nest")
2201                            .and_then(Value::as_u64)
2202                            .and_then(|value| usize::try_from(value).ok())
2203                            .unwrap_or(0),
2204                        ordered: false,
2205                    },
2206                );
2207            }
2208            "horizontal_rule" => {
2209                maps.horizontal_rules.insert(end);
2210            }
2211            _ => {}
2212        }
2213    }
2214
2215    maps
2216}
2217
2218fn apply_inline_style(styles: &mut [TextStyle], start: usize, end: usize, patch: &TextStyle) {
2219    let from = start.saturating_sub(1);
2220    let to = end.min(styles.len());
2221    if from >= to {
2222        return;
2223    }
2224    for style in &mut styles[from..to] {
2225        if patch.bold {
2226            style.bold = true;
2227        }
2228        if patch.italic {
2229            style.italic = true;
2230        }
2231        if patch.strike {
2232            style.strike = true;
2233        }
2234        if patch.link.is_some() {
2235            style.link.clone_from(&patch.link);
2236        }
2237    }
2238}
2239
2240fn text_style(item: &Value) -> TextStyle {
2241    TextStyle {
2242        bold: item.pointer("/sm/ts_bd").and_then(Value::as_bool) == Some(true)
2243            && item.pointer("/sm/ts_bd_i").and_then(Value::as_bool) != Some(true),
2244        italic: item.pointer("/sm/ts_it").and_then(Value::as_bool) == Some(true)
2245            && item.pointer("/sm/ts_it_i").and_then(Value::as_bool) != Some(true),
2246        strike: item.pointer("/sm/ts_st").and_then(Value::as_bool) == Some(true)
2247            && item.pointer("/sm/ts_st_i").and_then(Value::as_bool) != Some(true),
2248        link: None,
2249    }
2250}
2251
2252fn paragraph_style_from_model(item: &Value) -> ParagraphStyle {
2253    let heading = item.pointer("/sm/ps_hd").and_then(Value::as_u64);
2254    ParagraphStyle {
2255        style: heading.map(|level| format!("HEADING_{level}")),
2256        indent_start: item
2257            .pointer("/sm/ps_il")
2258            .and_then(Value::as_f64)
2259            .unwrap_or(0.0),
2260        indent_first_line: item
2261            .pointer("/sm/ps_ifl")
2262            .and_then(Value::as_f64)
2263            .unwrap_or(0.0),
2264    }
2265}
2266
/// Build a lookup table from 1-based UTF-16 positions to 1-based character positions.
///
/// Index `0` always holds `0`. A character that takes two UTF-16 code units
/// occupies two consecutive slots holding the same character position, so the
/// table has one entry per UTF-16 code unit plus the leading zero.
fn build_utf16_position_map(text: &str) -> Vec<usize> {
    let mut map = vec![0];
    for (index, ch) in text.chars().enumerate() {
        let char_position = index + 1;
        map.extend(std::iter::repeat(char_position).take(ch.len_utf16()));
    }
    map
}
2281
/// Look up the character position for a UTF-16 position.
///
/// When `position` is outside the map, or its slot holds `0` (as slot `0`
/// does), the last non-zero entry of the map is returned instead; `0` is
/// returned only when the map has no non-zero entry.
fn utf16_position_to_char_position(map: &[usize], position: usize) -> usize {
    match map.get(position) {
        Some(&char_position) if char_position > 0 => char_position,
        _ => map
            .iter()
            .rev()
            .copied()
            .find(|&candidate| candidate > 0)
            .unwrap_or(0),
    }
}
2289
2290/// Parse captured `DOCS_modelChunk` values.
2291#[must_use]
2292pub fn parse_model_chunks<S: BuildHasher>(
2293    chunks: &[Value],
2294    cid_urls: &HashMap<String, String, S>,
2295) -> CapturedDocument {
2296    parse_model_chunks_with_export_html(chunks, cid_urls, None)
2297}
2298
2299/// Parse captured `DOCS_modelChunk` values and optionally merge semantic hints
2300/// from Google Docs export HTML.
2301#[must_use]
2302#[allow(clippy::too_many_lines)]
2303pub fn parse_model_chunks_with_export_html<S: BuildHasher>(
2304    chunks: &[Value],
2305    cid_urls: &HashMap<String, String, S>,
2306    export_html: Option<&str>,
2307) -> CapturedDocument {
2308    let items = collect_model_items(chunks);
2309    let full_text = items
2310        .iter()
2311        .filter(|item| matches!(item.get("ty").and_then(Value::as_str), Some("is" | "iss")))
2312        .filter_map(|item| item.get("s").and_then(Value::as_str))
2313        .collect::<String>();
2314    let chars: Vec<char> = full_text.chars().collect();
2315    let utf16_position_map = build_utf16_position_map(&full_text);
2316    let style_maps = build_model_style_maps(&items, chars.len(), &utf16_position_map);
2317
2318    let mut positions = HashMap::new();
2319    for item in &items {
2320        if matches!(item.get("ty").and_then(Value::as_str), Some("te" | "ste")) {
2321            if let (Some(id), Some(pos)) = (
2322                item.get("id").and_then(Value::as_str),
2323                item.get("spi").and_then(Value::as_u64),
2324            ) {
2325                if let Ok(pos) = usize::try_from(pos) {
2326                    positions.insert(
2327                        id.to_string(),
2328                        utf16_position_to_char_position(&utf16_position_map, pos).saturating_sub(1),
2329                    );
2330                }
2331            }
2332        }
2333    }
2334
2335    let mut images_by_pos: HashMap<usize, ContentNode> = HashMap::new();
2336    let mut images = Vec::new();
2337    for item in &items {
2338        let ty = item.get("ty").and_then(Value::as_str);
2339        if !matches!(ty, Some("ae" | "ase")) {
2340            continue;
2341        }
2342        let Some(id) = item.get("id").and_then(Value::as_str) else {
2343            continue;
2344        };
2345        let Some(pos) = positions.get(id).copied() else {
2346            continue;
2347        };
2348        let cid = item
2349            .pointer("/epm/ee_eo/i_cid")
2350            .and_then(Value::as_str)
2351            .map(ToString::to_string);
2352        let node = ContentNode::Image {
2353            url: cid.as_ref().and_then(|cid| cid_urls.get(cid).cloned()),
2354            cid,
2355            alt: item
2356                .pointer("/epm/ee_eo/eo_ad")
2357                .and_then(Value::as_str)
2358                .unwrap_or_else(|| {
2359                    if ty == Some("ase") {
2360                        "suggested image"
2361                    } else {
2362                        "image"
2363                    }
2364                })
2365                .to_string(),
2366            width: json_dimension_to_string(item.pointer("/epm/ee_eo/i_wth")),
2367            height: json_dimension_to_string(item.pointer("/epm/ee_eo/i_ht")),
2368            is_suggestion: ty == Some("ase"),
2369        };
2370        images_by_pos.insert(pos, node.clone());
2371        images.push(node);
2372    }
2373
2374    let mut blocks = Vec::new();
2375    let mut tables = Vec::new();
2376    let mut paragraph = Vec::new();
2377    let mut table: Option<TableBlock> = None;
2378    let mut row: Option<TableRow> = None;
2379    let mut cell: Option<TableCell> = None;
2380    let mut previous_table_control: Option<u32> = None;
2381    let mut skip_next_table_newline = false;
2382
2383    for (idx, ch) in chars.iter().copied().enumerate() {
2384        match ch as u32 {
2385            0x10 => {
2386                flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
2387                table = Some(TableBlock::default());
2388                previous_table_control = Some(0x10);
2389                skip_next_table_newline = false;
2390            }
2391            0x11 => {
2392                flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
2393                previous_table_control = None;
2394                skip_next_table_newline = false;
2395            }
2396            0x12 => {
2397                flush_row(&mut row, &mut cell, table.as_mut(), true);
2398                row = Some(TableRow::default());
2399                previous_table_control = Some(0x12);
2400                skip_next_table_newline = false;
2401            }
2402            0x1c => {
2403                if cell.as_ref().is_none_or(cell_is_empty) && previous_table_control == Some(0x0a) {
2404                    previous_table_control = Some(0x1c);
2405                    continue;
2406                }
2407                let had_content = cell.as_ref().is_some_and(|cell| !cell_is_empty(cell));
2408                flush_cell(&mut row, &mut cell, false);
2409                if row.is_none() {
2410                    row = Some(TableRow::default());
2411                }
2412                cell = Some(TableCell::default());
2413                if had_content && chars.get(idx + 1).is_some_and(|ch| *ch as u32 == 0x0a) {
2414                    skip_next_table_newline = true;
2415                }
2416                previous_table_control = Some(0x1c);
2417            }
2418            0x0a => {
2419                if table.is_some() {
2420                    if skip_next_table_newline {
2421                        skip_next_table_newline = false;
2422                        previous_table_control = Some(0x0a);
2423                        continue;
2424                    }
2425                    // Inside a table, a bare newline separates cells within the
2426                    // current row (rows are delimited by 0x12/0x11). See R2.
2427                    flush_cell(&mut row, &mut cell, false);
2428                    if row.is_none() {
2429                        row = Some(TableRow::default());
2430                    }
2431                    cell = Some(TableCell::default());
2432                    previous_table_control = Some(0x0a);
2433                } else {
2434                    flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
2435                }
2436            }
2437            0x0b => {
2438                append_to_current(
2439                    &mut paragraph,
2440                    &mut row,
2441                    &mut cell,
2442                    table.is_some(),
2443                    "\n",
2444                    TextStyle::default(),
2445                );
2446                previous_table_control = None;
2447                skip_next_table_newline = false;
2448            }
2449            _ => {
2450                if let Some(image) = images_by_pos.get(&idx).cloned() {
2451                    push_to_current(&mut paragraph, &mut row, &mut cell, table.is_some(), image);
2452                    previous_table_control = None;
2453                    skip_next_table_newline = false;
2454                    if ch == '*' {
2455                        continue;
2456                    }
2457                }
2458                append_to_current(
2459                    &mut paragraph,
2460                    &mut row,
2461                    &mut cell,
2462                    table.is_some(),
2463                    &ch.to_string(),
2464                    style_maps
2465                        .inline_styles
2466                        .get(idx)
2467                        .cloned()
2468                        .unwrap_or_default(),
2469                );
2470                previous_table_control = None;
2471                skip_next_table_newline = false;
2472            }
2473        }
2474    }
2475
2476    if table.is_some() {
2477        flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
2478    }
2479    flush_paragraph(&mut paragraph, &mut blocks, Some(chars.len()), &style_maps);
2480
2481    let mut capture = CapturedDocument {
2482        text: blocks_to_text(&blocks),
2483        blocks,
2484        tables,
2485        images,
2486    };
2487    if let Some(export_html) = export_html {
2488        apply_export_semantic_hints(&mut capture.blocks, export_html);
2489        capture.text = blocks_to_text(&capture.blocks);
2490    }
2491    capture
2492}
2493
2494fn collect_model_items(chunks: &[Value]) -> Vec<Value> {
2495    let mut items = Vec::new();
2496    for chunk in chunks {
2497        if let Some(array) = chunk.as_array() {
2498            items.extend(array.iter().cloned());
2499        } else if let Some(array) = chunk.get("chunk").and_then(Value::as_array) {
2500            items.extend(array.iter().cloned());
2501        } else if chunk.get("ty").and_then(Value::as_str).is_some() {
2502            items.push(chunk.clone());
2503        }
2504    }
2505    items
2506}
2507
2508fn flush_paragraph(
2509    paragraph: &mut Vec<ContentNode>,
2510    blocks: &mut Vec<CapturedBlock>,
2511    end_pos: Option<usize>,
2512    style_maps: &ModelStyleMaps,
2513) {
2514    if !content_to_text(paragraph).trim().is_empty()
2515        || paragraph
2516            .iter()
2517            .any(|node| matches!(node, ContentNode::Image { .. }))
2518    {
2519        let meta =
2520            paragraph_meta_for_end_position(style_maps, end_pos, content_to_text(paragraph).trim());
2521        blocks.push(CapturedBlock::Paragraph {
2522            content: std::mem::take(paragraph),
2523            style: meta.style,
2524            list: meta.list,
2525            quote: meta.quote,
2526            horizontal_rule: meta.horizontal_rule,
2527        });
2528    } else {
2529        paragraph.clear();
2530    }
2531}
2532
2533fn paragraph_meta_for_end_position(
2534    style_maps: &ModelStyleMaps,
2535    end_pos: Option<usize>,
2536    text: &str,
2537) -> ParagraphMeta {
2538    let Some(end_pos) = end_pos else {
2539        return ParagraphMeta::default();
2540    };
2541    let paragraph_style = style_maps.paragraph_by_end.get(&end_pos);
2542    let mut meta = ParagraphMeta {
2543        style: paragraph_style.and_then(|style| style.style.clone()),
2544        ..ParagraphMeta::default()
2545    };
2546
2547    if let Some(list) = style_maps.list_by_end.get(&end_pos) {
2548        let mut list = list.clone();
2549        list.ordered = infer_ordered_list(&list, text);
2550        meta.list = Some(list);
2551    } else if paragraph_style.is_some_and(|style| {
2552        style.indent_start > 0.0
2553            && (style.indent_start - style.indent_first_line).abs() < f64::EPSILON
2554    }) {
2555        meta.quote = true;
2556    }
2557
2558    meta.horizontal_rule = (style_maps.horizontal_rules.contains(&end_pos)
2559        || end_pos
2560            .checked_sub(1)
2561            .is_some_and(|pos| style_maps.horizontal_rules.contains(&pos)))
2562        && text.trim().chars().all(|ch| ch == '-');
2563    meta
2564}
2565
2566const fn infer_ordered_list(_list: &ListMeta, _text: &str) -> bool {
2567    false
2568}
2569
2570fn apply_export_semantic_hints(blocks: &mut [CapturedBlock], export_html: &str) {
2571    let hints = extract_export_semantic_hints(export_html);
2572    let mut cursor = 0usize;
2573    for block in blocks {
2574        let CapturedBlock::Paragraph {
2575            content,
2576            list,
2577            quote,
2578            ..
2579        } = block
2580        else {
2581            continue;
2582        };
2583        let text = normalize_semantic_text(&content_to_text(content));
2584        if text.is_empty() {
2585            continue;
2586        }
2587        let Some((index, hint)) = find_next_semantic_hint(&hints, &text, cursor, list.is_some())
2588        else {
2589            continue;
2590        };
2591        cursor = index + 1;
2592        if let Some(list) = list.as_mut() {
2593            if let Some(ordered) = hint.list_ordered {
2594                list.ordered = ordered;
2595            }
2596        } else {
2597            *quote = hint.quote;
2598        }
2599    }
2600}
2601
2602fn find_next_semantic_hint<'a>(
2603    hints: &'a [ExportSemanticHint],
2604    text: &str,
2605    cursor: usize,
2606    needs_list_hint: bool,
2607) -> Option<(usize, &'a ExportSemanticHint)> {
2608    hints.iter().enumerate().skip(cursor).find(|(_, hint)| {
2609        hint.text == text
2610            && if needs_list_hint {
2611                hint.list_ordered.is_some()
2612            } else {
2613                hint.list_ordered.is_none()
2614            }
2615    })
2616}
2617
2618fn extract_export_semantic_hints(export_html: &str) -> Vec<ExportSemanticHint> {
2619    let preprocessed = preprocess_google_docs_export_html(export_html).html;
2620    let document = Html::parse_document(&preprocessed);
2621    let selector =
2622        Selector::parse("body h1,body h2,body h3,body h4,body h5,body h6,body p,body li")
2623            .expect("valid semantic hint selector");
2624    document
2625        .select(&selector)
2626        .filter_map(|element| {
2627            let tag = element.value().name();
2628            let text = export_element_semantic_text(&element);
2629            if text.is_empty() {
2630                return None;
2631            }
2632            let list_ordered = if tag == "li" {
2633                nearest_list_is_ordered(&element)
2634            } else {
2635                None
2636            };
2637            Some(ExportSemanticHint {
2638                text,
2639                list_ordered,
2640                quote: tag != "li" && has_ancestor_tag(&element, "blockquote"),
2641            })
2642        })
2643        .collect()
2644}
2645
2646fn export_element_semantic_text(element: &ElementRef<'_>) -> String {
2647    let raw_text = if element.value().name() == "li" {
2648        list_item_own_text(element)
2649    } else {
2650        element.text().collect()
2651    };
2652    normalize_semantic_text(&raw_text)
2653}
2654
2655fn list_item_own_text(element: &ElementRef<'_>) -> String {
2656    let mut text = String::new();
2657    let mut stack: Vec<_> = element.children().collect();
2658    stack.reverse();
2659
2660    while let Some(node) = stack.pop() {
2661        match node.value() {
2662            Node::Text(value) => text.push_str(value),
2663            Node::Element(child) if matches!(child.name(), "ol" | "ul") => {}
2664            Node::Element(_) => {
2665                let mut children: Vec<_> = node.children().collect();
2666                children.reverse();
2667                stack.extend(children);
2668            }
2669            _ => {}
2670        }
2671    }
2672
2673    text
2674}
2675
2676fn nearest_list_is_ordered(element: &ElementRef<'_>) -> Option<bool> {
2677    element
2678        .ancestors()
2679        .filter_map(ElementRef::wrap)
2680        .find_map(|ancestor| match ancestor.value().name() {
2681            "ol" => Some(true),
2682            "ul" => Some(false),
2683            _ => None,
2684        })
2685}
2686
2687fn has_ancestor_tag(element: &ElementRef<'_>, tag: &str) -> bool {
2688    element
2689        .ancestors()
2690        .filter_map(ElementRef::wrap)
2691        .any(|ancestor| ancestor.value().name() == tag)
2692}
2693
/// Collapse every run of whitespace into a single space and trim both ends.
///
/// Non-breaking spaces (U+00A0) count as whitespace: `split_whitespace` uses
/// the Unicode `White_Space` property, which includes U+00A0.
fn normalize_semantic_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
2700
2701fn cell_is_empty(cell: &TableCell) -> bool {
2702    cell.content.iter().all(|node| match node {
2703        ContentNode::Text { text, .. } => text.trim().is_empty(),
2704        ContentNode::Image { .. } => false,
2705    })
2706}
2707
2708fn row_is_empty(row: &TableRow) -> bool {
2709    row.cells.is_empty() || row.cells.iter().all(cell_is_empty)
2710}
2711
2712fn flush_cell(row: &mut Option<TableRow>, cell: &mut Option<TableCell>, drop_empty: bool) {
2713    if let (Some(row), Some(cell)) = (row.as_mut(), cell.take()) {
2714        if drop_empty && cell_is_empty(&cell) {
2715            return;
2716        }
2717        row.cells.push(cell);
2718    }
2719}
2720
2721fn flush_row(
2722    row: &mut Option<TableRow>,
2723    cell: &mut Option<TableCell>,
2724    table: Option<&mut TableBlock>,
2725    drop_empty_trailing_cell: bool,
2726) {
2727    flush_cell(row, cell, drop_empty_trailing_cell);
2728    if let (Some(table), Some(row)) = (table, row.take()) {
2729        table.rows.push(row);
2730    }
2731}
2732
2733fn flush_table(
2734    table: &mut Option<TableBlock>,
2735    row: &mut Option<TableRow>,
2736    cell: &mut Option<TableCell>,
2737    tables: &mut Vec<TableBlock>,
2738    blocks: &mut Vec<CapturedBlock>,
2739) {
2740    flush_row(row, cell, table.as_mut(), true);
2741    if let Some(mut table) = table.take() {
2742        // Drop trailing empty rows that can be introduced by '\n' immediately
2743        // before the 0x11 table-close marker. See R2.
2744        while table.rows.last().is_some_and(row_is_empty) {
2745            table.rows.pop();
2746        }
2747        tables.push(table.clone());
2748        blocks.push(CapturedBlock::Table(table));
2749    }
2750}
2751
2752fn push_to_current(
2753    paragraph: &mut Vec<ContentNode>,
2754    row: &mut Option<TableRow>,
2755    cell: &mut Option<TableCell>,
2756    in_table: bool,
2757    node: ContentNode,
2758) {
2759    if in_table {
2760        if row.is_none() {
2761            *row = Some(TableRow::default());
2762        }
2763        if cell.is_none() {
2764            *cell = Some(TableCell::default());
2765        }
2766        if let Some(cell) = cell.as_mut() {
2767            cell.content.push(node);
2768        }
2769    } else {
2770        paragraph.push(node);
2771    }
2772}
2773
2774fn append_to_current(
2775    paragraph: &mut Vec<ContentNode>,
2776    row: &mut Option<TableRow>,
2777    cell: &mut Option<TableCell>,
2778    in_table: bool,
2779    text: &str,
2780    style: TextStyle,
2781) {
2782    if in_table {
2783        if row.is_none() {
2784            *row = Some(TableRow::default());
2785        }
2786        if cell.is_none() {
2787            *cell = Some(TableCell::default());
2788        }
2789        if let Some(cell) = cell.as_mut() {
2790            append_styled_text(&mut cell.content, text, style);
2791        }
2792    } else {
2793        append_styled_text(paragraph, text, style);
2794    }
2795}
2796
2797fn append_text(content: &mut Vec<ContentNode>, text: &str) {
2798    append_styled_text(content, text, TextStyle::default());
2799}
2800
2801fn append_styled_text(content: &mut Vec<ContentNode>, text: &str, style: TextStyle) {
2802    if text.is_empty() {
2803        return;
2804    }
2805    if let Some(ContentNode::Text {
2806        text: last,
2807        bold,
2808        italic,
2809        strike,
2810        link,
2811    }) = content.last_mut()
2812    {
2813        let last_style = TextStyle {
2814            bold: *bold,
2815            italic: *italic,
2816            strike: *strike,
2817            link: link.clone(),
2818        };
2819        if last_style == style {
2820            last.push_str(text);
2821            return;
2822        }
2823    }
2824    content.push(ContentNode::Text {
2825        text: text.to_string(),
2826        bold: style.bold,
2827        italic: style.italic,
2828        strike: style.strike,
2829        link: style.link,
2830    });
2831}
2832
2833/// Render a parsed Google Docs capture as Markdown, HTML, or text.
2834#[must_use]
2835pub fn render_captured_document(capture: &CapturedDocument, format: &str) -> String {
2836    match format.to_lowercase().as_str() {
2837        "html" => render_blocks_html(&capture.blocks),
2838        "txt" | "text" => blocks_to_text(&capture.blocks),
2839        _ => render_blocks_markdown(&capture.blocks),
2840    }
2841}
2842
/// A block already rendered to Markdown, together with the context that
/// `render_blocks_markdown` needs to pick the separator placed before it.
struct RenderedBlock {
    /// Markdown text of the block, without any leading separator.
    markdown: String,
    /// Id of the list the block belongs to; `None` for non-list blocks.
    list_id: Option<String>,
    /// Whether the source paragraph was flagged as a quote.
    quote: bool,
}
2850
2851fn render_blocks_markdown(blocks: &[CapturedBlock]) -> String {
2852    // Track an ordered-list counter per (list.id, level) so ordered items are
2853    // numbered sequentially 1., 2., 3., ... instead of all being "1.". See R3.
2854    // When we re-enter a shallower list level, deeper counters reset so a new
2855    // parent restarts its children at 1.
2856    let mut counters: HashMap<(String, usize), usize> = HashMap::new();
2857    let mut rendered: Vec<RenderedBlock> = Vec::new();
2858
2859    for block in blocks {
2860        match block {
2861            CapturedBlock::Paragraph {
2862                content,
2863                style,
2864                list,
2865                quote,
2866                horizontal_rule,
2867            } => {
2868                let text = render_content_markdown(content).trim().to_string();
2869                if text.is_empty() {
2870                    continue;
2871                }
2872                let ordered_index = list.as_ref().and_then(|list_meta| {
2873                    if !list_meta.ordered {
2874                        return None;
2875                    }
2876                    // Reset counters for deeper levels when we move up to a
2877                    // shallower level — otherwise a new parent item would see
2878                    // its previous children's final count.
2879                    let key = (list_meta.id.clone(), list_meta.level);
2880                    counters.retain(|(id, level), _| {
2881                        !(id == &list_meta.id && *level > list_meta.level)
2882                    });
2883                    let next = counters.entry(key).or_insert(0);
2884                    *next += 1;
2885                    Some(*next)
2886                });
2887                let markdown = render_paragraph_markdown(
2888                    &text,
2889                    style.as_deref(),
2890                    list.as_ref(),
2891                    *quote,
2892                    *horizontal_rule,
2893                    ordered_index,
2894                );
2895                rendered.push(RenderedBlock {
2896                    markdown,
2897                    list_id: list.as_ref().map(|l| l.id.clone()),
2898                    quote: *quote,
2899                });
2900            }
2901            CapturedBlock::Table(table) => {
2902                rendered.push(RenderedBlock {
2903                    markdown: render_table_markdown(table),
2904                    list_id: None,
2905                    quote: false,
2906                });
2907            }
2908        }
2909    }
2910
2911    // Choose separator per adjacent pair: consecutive items from the same
2912    // Google Docs list use a single newline, including nested levels; adjacent
2913    // blockquote paragraphs keep a quoted blank line between them.
2914    let mut out = String::new();
2915    for (idx, block) in rendered.iter().enumerate() {
2916        if idx == 0 {
2917            out.push_str(&block.markdown);
2918            continue;
2919        }
2920        let prev = &rendered[idx - 1];
2921        if block.list_id.is_some() && prev.list_id.is_some() {
2922            out.push('\n');
2923        } else if block.quote && prev.quote {
2924            out.push_str("\n>\n");
2925        } else {
2926            out.push_str("\n\n");
2927        }
2928        out.push_str(&block.markdown);
2929    }
2930    if !out.is_empty() && !out.ends_with('\n') {
2931        out.push('\n');
2932    }
2933    out
2934}
2935
2936fn render_paragraph_markdown(
2937    text: &str,
2938    style: Option<&str>,
2939    list: Option<&ListMeta>,
2940    quote: bool,
2941    horizontal_rule: bool,
2942    ordered_index: Option<usize>,
2943) -> String {
2944    if horizontal_rule {
2945        return "---".to_string();
2946    }
2947    match style {
2948        Some("TITLE") => format!("# {text}"),
2949        Some("SUBTITLE") => format!("## {text}"),
2950        Some(style) if style.starts_with("HEADING_") => {
2951            let level = style
2952                .trim_start_matches("HEADING_")
2953                .parse::<usize>()
2954                .unwrap_or(1);
2955            format!("{} {text}", "#".repeat(level.clamp(1, 6)))
2956        }
2957        _ => list.map_or_else(
2958            || {
2959                if quote {
2960                    text.lines()
2961                        .map(|line| {
2962                            if line.is_empty() {
2963                                ">".to_string()
2964                            } else {
2965                                format!("> {line}")
2966                            }
2967                        })
2968                        .collect::<Vec<_>>()
2969                        .join("\n")
2970                } else {
2971                    text.to_string()
2972                }
2973            },
2974            |list| {
2975                let indent = "    ".repeat(list.level);
2976                let marker = if list.ordered {
2977                    format!("{}.", ordered_index.unwrap_or(1))
2978                } else {
2979                    "-".to_string()
2980                };
2981                format!("{indent}{marker} {text}")
2982            },
2983        ),
2984    }
2985}
2986
2987fn render_table_markdown(table: &TableBlock) -> String {
2988    if table.rows.is_empty() {
2989        return String::new();
2990    }
2991    let width = table
2992        .rows
2993        .iter()
2994        .map(|row| row.cells.len())
2995        .max()
2996        .unwrap_or(1);
2997    let rows = table
2998        .rows
2999        .iter()
3000        .map(|row| {
3001            (0..width)
3002                .map(|idx| {
3003                    row.cells.get(idx).map_or_else(String::new, |cell| {
3004                        escape_markdown_table_cell(&render_content_markdown(&cell.content))
3005                    })
3006                })
3007                .collect::<Vec<_>>()
3008        })
3009        .collect::<Vec<_>>();
3010    let separator = vec!["---".to_string(); width];
3011    std::iter::once(&rows[0])
3012        .chain(std::iter::once(&separator))
3013        .chain(rows.iter().skip(1))
3014        .map(|row| format!("| {} |", row.join(" | ")))
3015        .collect::<Vec<_>>()
3016        .join("\n")
3017}
3018
3019fn render_content_markdown(content: &[ContentNode]) -> String {
3020    let mut rendered = String::new();
3021    let mut idx = 0usize;
3022    while idx < content.len() {
3023        match &content[idx] {
3024            ContentNode::Text {
3025                text,
3026                bold,
3027                italic,
3028                strike,
3029                link,
3030            } => {
3031                let link_target = link.as_deref();
3032                let mut runs = vec![(text.as_str(), *bold, *italic, *strike)];
3033                idx += 1;
3034                while let Some(ContentNode::Text {
3035                    text,
3036                    bold,
3037                    italic,
3038                    strike,
3039                    link: next_link,
3040                }) = content.get(idx)
3041                {
3042                    if next_link.as_deref() != link_target {
3043                        break;
3044                    }
3045                    runs.push((text.as_str(), *bold, *italic, *strike));
3046                    idx += 1;
3047                }
3048                let label = render_text_runs_markdown(&runs);
3049                if let Some(link_target) = link_target {
3050                    let _ = write!(rendered, "[{label}]({link_target})");
3051                } else {
3052                    rendered.push_str(&label);
3053                }
3054            }
3055            ContentNode::Image {
3056                url: Some(url),
3057                alt,
3058                ..
3059            } => {
3060                let _ = write!(rendered, "![{alt}]({url})");
3061                idx += 1;
3062            }
3063            ContentNode::Image { .. } => idx += 1,
3064        }
3065    }
3066    rendered
3067}
3068
/// Set of inline Markdown emphasis markers, one flag per marker kind.
///
/// The default value has every flag cleared.
#[derive(Clone, Copy, Default)]
struct MarkdownMarkerState {
    /// Bold marker (`**`).
    bold: bool,
    /// Italic marker (`*`).
    italic: bool,
    /// Strikethrough marker (`~~`).
    strike: bool,
}
3075
3076fn render_text_runs_markdown(runs: &[(&str, bool, bool, bool)]) -> String {
3077    let inactive = MarkdownMarkerState::default();
3078    let mut active = inactive;
3079    let mut output = String::new();
3080    for (text, bold, italic, strike) in runs {
3081        let next = MarkdownMarkerState {
3082            bold: *bold,
3083            italic: *italic,
3084            strike: *strike,
3085        };
3086        let mut start = 0usize;
3087        for (offset, ch) in text.char_indices() {
3088            if ch != '\n' {
3089                continue;
3090            }
3091            if offset > start {
3092                output.push_str(&markdown_marker_transition(active, next));
3093                output.push_str(&text[start..offset]);
3094                active = next;
3095            }
3096            output.push_str(&markdown_marker_transition(active, inactive));
3097            output.push('\n');
3098            active = inactive;
3099            start = offset + ch.len_utf8();
3100        }
3101        if start < text.len() {
3102            output.push_str(&markdown_marker_transition(active, next));
3103            output.push_str(&text[start..]);
3104            active = next;
3105        }
3106    }
3107    output.push_str(&markdown_marker_transition(active, inactive));
3108    output
3109}
3110
3111fn markdown_marker_transition(active: MarkdownMarkerState, next: MarkdownMarkerState) -> String {
3112    let mut markers = String::new();
3113    if active.strike && !next.strike {
3114        markers.push_str("~~");
3115    }
3116    if active.italic && !next.italic {
3117        markers.push('*');
3118    }
3119    if active.bold && !next.bold {
3120        markers.push_str("**");
3121    }
3122    if !active.bold && next.bold {
3123        markers.push_str("**");
3124    }
3125    if !active.italic && next.italic {
3126        markers.push('*');
3127    }
3128    if !active.strike && next.strike {
3129        markers.push_str("~~");
3130    }
3131    markers
3132}
3133
3134fn render_blocks_html(blocks: &[CapturedBlock]) -> String {
3135    format!(
3136        "<!doctype html><html><body>{}</body></html>",
3137        blocks
3138            .iter()
3139            .map(|block| match block {
3140                CapturedBlock::Paragraph {
3141                    content,
3142                    style,
3143                    list,
3144                    quote,
3145                    horizontal_rule,
3146                } => {
3147                    if *horizontal_rule {
3148                        "<hr>".to_string()
3149                    } else if let Some(list) = list {
3150                        let tag = if list.ordered { "ol" } else { "ul" };
3151                        format!("<{tag}><li>{}</li></{tag}>", render_content_html(content))
3152                    } else if *quote {
3153                        format!("<blockquote>{}</blockquote>", render_content_html(content))
3154                    } else {
3155                        let tag = paragraph_tag(style.as_deref());
3156                        format!("<{tag}>{}</{tag}>", render_content_html(content))
3157                    }
3158                }
3159                CapturedBlock::Table(table) => render_table_html(table),
3160            })
3161            .collect::<String>()
3162    )
3163}
3164
3165fn render_table_html(table: &TableBlock) -> String {
3166    let mut html = String::from("<table>");
3167    for row in &table.rows {
3168        html.push_str("<tr>");
3169        for cell in &row.cells {
3170            html.push_str("<td>");
3171            html.push_str(&render_content_html(&cell.content));
3172            html.push_str("</td>");
3173        }
3174        html.push_str("</tr>");
3175    }
3176    html.push_str("</table>");
3177    html
3178}
3179
3180fn render_content_html(content: &[ContentNode]) -> String {
3181    content
3182        .iter()
3183        .map(|node| match node {
3184            ContentNode::Text {
3185                text,
3186                bold,
3187                italic,
3188                strike,
3189                link,
3190            } => render_marked_html(text, *bold, *italic, *strike, link.as_deref()),
3191            ContentNode::Image {
3192                url: Some(url),
3193                alt,
3194                width,
3195                height,
3196                ..
3197            } => render_image_html(url, alt, width.as_deref(), height.as_deref()),
3198            ContentNode::Image { .. } => String::new(),
3199        })
3200        .collect()
3201}
3202
3203fn render_image_html(url: &str, alt: &str, width: Option<&str>, height: Option<&str>) -> String {
3204    let mut html = format!(
3205        "<img src=\"{}\" alt=\"{}\"",
3206        escape_html(url),
3207        escape_html(alt)
3208    );
3209    if let Some(width) = width.filter(|value| !value.is_empty()) {
3210        let _ = write!(html, " width=\"{}\"", escape_html(width));
3211    }
3212    if let Some(height) = height.filter(|value| !value.is_empty()) {
3213        let _ = write!(html, " height=\"{}\"", escape_html(height));
3214    }
3215    html.push('>');
3216    html
3217}
3218
3219fn render_marked_html(
3220    text: &str,
3221    bold: bool,
3222    italic: bool,
3223    strike: bool,
3224    link: Option<&str>,
3225) -> String {
3226    text.split('\n')
3227        .map(|segment| render_marked_html_segment(segment, bold, italic, strike, link))
3228        .collect::<Vec<_>>()
3229        .join("<br>")
3230}
3231
3232fn render_marked_html_segment(
3233    text: &str,
3234    bold: bool,
3235    italic: bool,
3236    strike: bool,
3237    link: Option<&str>,
3238) -> String {
3239    if text.is_empty() {
3240        return String::new();
3241    }
3242    let mut output = escape_html(text);
3243    if bold {
3244        output = format!("<strong>{output}</strong>");
3245    }
3246    if italic {
3247        output = format!("<em>{output}</em>");
3248    }
3249    if strike {
3250        output = format!("<s>{output}</s>");
3251    }
3252    if let Some(link) = link {
3253        output = format!("<a href=\"{}\">{output}</a>", escape_html(link));
3254    }
3255    output
3256}
3257
/// Map a paragraph style name to the HTML tag used to render it.
///
/// `TITLE` and `HEADING_1` map to `h1`, `SUBTITLE` and `HEADING_2` to `h2`,
/// `HEADING_3`..`HEADING_6` to `h3`..`h6`. Anything else, including no style,
/// maps to `p`.
fn paragraph_tag(style: Option<&str>) -> &'static str {
    let Some(name) = style else {
        return "p";
    };
    match name {
        "TITLE" => "h1",
        "SUBTITLE" => "h2",
        _ => match name.strip_prefix("HEADING_") {
            Some("1") => "h1",
            Some("2") => "h2",
            Some("3") => "h3",
            Some("4") => "h4",
            Some("5") => "h5",
            Some("6") => "h6",
            _ => "p",
        },
    }
}
3269
3270fn blocks_to_text(blocks: &[CapturedBlock]) -> String {
3271    blocks
3272        .iter()
3273        .map(|block| match block {
3274            CapturedBlock::Paragraph { content, .. } => content_to_text(content),
3275            CapturedBlock::Table(table) => table
3276                .rows
3277                .iter()
3278                .map(|row| {
3279                    row.cells
3280                        .iter()
3281                        .map(|cell| content_to_text(&cell.content))
3282                        .collect::<Vec<_>>()
3283                        .join("\t")
3284                })
3285                .collect::<Vec<_>>()
3286                .join("\n"),
3287        })
3288        .filter(|text| !text.is_empty())
3289        .collect::<Vec<_>>()
3290        .join("\n")
3291}
3292
3293fn content_to_text(content: &[ContentNode]) -> String {
3294    content
3295        .iter()
3296        .map(|node| match node {
3297            ContentNode::Text { text, .. } => text.clone(),
3298            ContentNode::Image {
3299                url: Some(_), alt, ..
3300            } => format!("[{alt}]"),
3301            ContentNode::Image { .. } => String::new(),
3302        })
3303        .collect()
3304}
3305
/// Escape the five HTML-significant characters `&`, `<`, `>`, `"`, and `'`
/// so that `value` is safe in both element content and quoted attributes.
fn escape_html(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
3314
/// Make text safe for a Markdown table cell: `|` becomes `\|` and each
/// newline becomes `<br>`.
fn escape_markdown_table_cell(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '|' => escaped.push_str("\\|"),
            '\n' => escaped.push_str("<br>"),
            other => escaped.push(other),
        }
    }
    escaped
}
3318
/// Extract a Bearer token from an Authorization header value.
///
/// Surrounding whitespace is ignored. The `Bearer` scheme name is matched
/// case-insensitively, because HTTP authentication scheme names are
/// case-insensitive, and must be followed by a space. Whitespace around the
/// token itself is trimmed.
///
/// Returns `None` if the header does not use the Bearer scheme or the token
/// is empty.
#[must_use]
pub fn extract_bearer_token(auth_header: &str) -> Option<&str> {
    const SCHEME: &str = "bearer ";

    let trimmed = auth_header.trim();
    // `get` yields `None` when the value is too short or when the cut would
    // fall inside a multi-byte character, so the slice below cannot panic.
    let scheme = trimmed.get(..SCHEME.len())?;
    if !scheme.eq_ignore_ascii_case(SCHEME) {
        return None;
    }
    let token = trimmed[SCHEME.len()..].trim();
    (!token.is_empty()).then_some(token)
}
3331
/// An image pulled out of a captured document for storage as a separate
/// archive file.
#[derive(Debug, Clone)]
pub struct ExtractedImage {
    /// File name of the image inside the archive's `images/` directory
    /// (e.g., "image-01.png").
    pub filename: String,
    /// Raw image bytes.
    pub data: Vec<u8>,
    /// MIME type of the image (e.g., "image/png").
    pub mime_type: String,
}
3342
3343/// Result of fetching a Google Doc as an archive.
3344#[derive(Debug, Clone)]
3345pub struct GDocsArchiveResult {
3346    /// HTML content with local image paths
3347    pub html: String,
3348    /// Markdown content with local image paths
3349    pub markdown: String,
3350    /// Extracted images
3351    pub images: Vec<ExtractedImage>,
3352    /// Document ID
3353    pub document_id: String,
3354    /// Export URL used
3355    pub export_url: String,
3356}
3357
3358/// Build a self-contained archive result from browser-model rendered output.
3359///
3360/// `DOCS_modelChunk` image nodes point at `docs-images-rt` URLs. Archive mode
3361/// downloads those URLs into `images/` and rewrites markdown/html references to
3362/// local paths so Rust browser capture matches the JavaScript archive path.
3363///
3364/// # Errors
3365///
3366/// Returns an error if the HTTP client cannot be created or an image response
3367/// body cannot be read. Individual failed image downloads are logged and left
3368/// out of the archive, matching the JS behavior.
3369pub async fn localize_rendered_remote_images_for_archive(
3370    rendered: &GDocsRenderedResult,
3371) -> crate::Result<GDocsArchiveResult> {
3372    let client = reqwest::Client::builder().build().map_err(|error| {
3373        WebCaptureError::FetchError(format!("Failed to create image download client: {error}"))
3374    })?;
3375    let mut seen = HashMap::new();
3376    let mut images = Vec::new();
3377    let mut next_index = 1usize;
3378
3379    for image in &rendered.remote_images {
3380        if seen.contains_key(&image.url) {
3381            continue;
3382        }
3383        let filename = remote_image_filename(&image.url, next_index);
3384        next_index += 1;
3385        seen.insert(image.url.clone(), filename.clone());
3386
3387        match client
3388            .get(&image.url)
3389            .header("User-Agent", GDOCS_USER_AGENT)
3390            .header("Accept", "image/*,*/*;q=0.8")
3391            .send()
3392            .await
3393        {
3394            Ok(response) if response.status().is_success() => {
3395                let mime_type = response
3396                    .headers()
3397                    .get(reqwest::header::CONTENT_TYPE)
3398                    .and_then(|value| value.to_str().ok())
3399                    .map_or_else(|| mime_type_for_filename(&filename), ToString::to_string);
3400                let data = response.bytes().await.map_err(|error| {
3401                    WebCaptureError::FetchError(format!(
3402                        "Failed to read Google Docs image {}: {error}",
3403                        image.url
3404                    ))
3405                })?;
3406                debug!(
3407                    url = %image.url,
3408                    filename = %filename,
3409                    bytes = data.len(),
3410                    mime_type = %mime_type,
3411                    "downloaded Google Docs browser-model archive image"
3412                );
3413                images.push(ExtractedImage {
3414                    filename,
3415                    data: data.to_vec(),
3416                    mime_type,
3417                });
3418            }
3419            Ok(response) => {
3420                warn!(
3421                    url = %image.url,
3422                    status = response.status().as_u16(),
3423                    "failed to download Google Docs browser-model archive image"
3424                );
3425            }
3426            Err(error) => {
3427                warn!(
3428                    url = %image.url,
3429                    error = %error,
3430                    "failed to download Google Docs browser-model archive image"
3431                );
3432            }
3433        }
3434    }
3435
3436    let mut markdown = rendered.markdown.clone();
3437    let mut html = rendered.html.clone();
3438    for (url, filename) in seen {
3439        let local_path = format!("images/{filename}");
3440        markdown = markdown.replace(&url, &local_path);
3441        html = html.replace(&url, &local_path);
3442    }
3443
3444    Ok(GDocsArchiveResult {
3445        html,
3446        markdown,
3447        images,
3448        document_id: rendered.document_id.clone(),
3449        export_url: rendered.export_url.clone(),
3450    })
3451}
3452
3453fn remote_image_filename(url: &str, index: usize) -> String {
3454    let ext = crate::localize_images::get_extension_from_url(url);
3455    format!("image-{index:02}{ext}")
3456}
3457
/// Guess an image MIME type from a file name's extension (case-insensitive).
///
/// The extension is the text after the last `.`; a name without a dot is
/// treated as the extension itself. Unknown extensions fall back to
/// `image/png`.
fn mime_type_for_filename(filename: &str) -> String {
    let extension = filename
        .rsplit_once('.')
        .map_or(filename, |(_, ext)| ext)
        .to_lowercase();
    let mime = match extension.as_str() {
        "jpg" | "jpeg" => "image/jpeg",
        "gif" => "image/gif",
        "webp" => "image/webp",
        "svg" => "image/svg+xml",
        _ => "image/png",
    };
    String::from(mime)
}
3474
3475fn base64_image_pattern() -> &'static Regex {
3476    static PATTERN: OnceLock<Regex> = OnceLock::new();
3477    PATTERN.get_or_init(|| {
3478        Regex::new(
3479            r#"(<img\s[^>]*src=")data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([^"]+)(")"#,
3480        )
3481        .unwrap()
3482    })
3483}
3484
3485/// Extract base64 data URI images from HTML content.
3486///
3487/// Google Docs HTML exports embed images as base64 data URIs.
3488/// This function extracts them and replaces with local file paths.
3489///
3490/// # Arguments
3491///
3492/// * `html` - HTML content with embedded base64 images
3493///
3494/// # Returns
3495///
3496/// Tuple of (updated HTML with local paths, extracted images)
3497#[must_use]
3498pub fn extract_base64_images(html: &str) -> (String, Vec<ExtractedImage>) {
3499    let mut images = Vec::new();
3500    let mut idx = 1u32;
3501
3502    let updated_html = base64_image_pattern()
3503        .replace_all(html, |caps: &regex::Captures<'_>| {
3504            let prefix = &caps[1];
3505            let mime_ext = &caps[2];
3506            let base64_data = &caps[3];
3507            let suffix = &caps[4];
3508
3509            let ext = match mime_ext {
3510                "jpeg" => "jpg",
3511                "svg+xml" => "svg",
3512                other => other,
3513            };
3514
3515            let filename = format!("image-{idx:02}.{ext}");
3516            let mime_type = format!("image/{mime_ext}");
3517
3518            if let Ok(data) = base64::engine::general_purpose::STANDARD.decode(base64_data) {
3519                debug!("Extracted image: {} ({} bytes)", filename, data.len());
3520                images.push(ExtractedImage {
3521                    filename: filename.clone(),
3522                    data,
3523                    mime_type,
3524                });
3525            }
3526
3527            idx += 1;
3528            format!("{prefix}images/{filename}{suffix}")
3529        })
3530        .into_owned();
3531
3532    (updated_html, images)
3533}
3534
3535/// Fetch a Google Docs document as a ZIP archive.
3536///
3537/// Fetches the document as HTML, extracts embedded base64 images,
3538/// converts to Markdown, and returns all components ready for archiving.
3539///
3540/// The archive contains:
3541/// - `document.md` — Markdown version
3542/// - `document.html` — HTML version with local image paths
3543/// - `images/` — extracted images
3544///
3545/// # Arguments
3546///
3547/// * `url` - Google Docs URL
3548/// * `api_token` - Optional API token for private documents
3549///
3550/// # Errors
3551///
3552/// Returns an error if the fetch or conversion fails.
3553pub async fn fetch_google_doc_as_archive(
3554    url: &str,
3555    api_token: Option<&str>,
3556) -> crate::Result<GDocsArchiveResult> {
3557    let result = fetch_google_doc(url, "html", api_token).await?;
3558
3559    let preprocess = preprocess_google_docs_export_html(&result.content);
3560    debug!(
3561        document_id = %result.document_id,
3562        hoisted = preprocess.hoisted,
3563        unwrapped_links = preprocess.unwrapped_links,
3564        "google-docs-export pre-processor rewrote archive markup"
3565    );
3566
3567    let (local_html, images) = extract_base64_images(&preprocess.html);
3568
3569    let markdown = normalize_google_docs_export_markdown(
3570        &crate::markdown::convert_html_to_markdown(&local_html, None)?,
3571    );
3572
3573    debug!(
3574        "Archive prepared: {} images extracted, {} bytes HTML, {} bytes Markdown",
3575        images.len(),
3576        local_html.len(),
3577        markdown.len()
3578    );
3579
3580    Ok(GDocsArchiveResult {
3581        html: local_html,
3582        markdown,
3583        images,
3584        document_id: result.document_id,
3585        export_url: result.export_url,
3586    })
3587}
3588
3589/// Create a ZIP archive from a `GDocsArchiveResult`.
3590///
3591/// # Arguments
3592///
3593/// * `archive` - The archive result to bundle
3594/// * `pretty_html` - Whether to pretty-print the HTML output
3595///
3596/// # Errors
3597///
3598/// Returns an error if ZIP creation fails.
3599pub fn create_archive_zip(
3600    archive: &GDocsArchiveResult,
3601    pretty_html: bool,
3602) -> crate::Result<Vec<u8>> {
3603    let mut buf = std::io::Cursor::new(Vec::new());
3604
3605    {
3606        let mut zip = zip::ZipWriter::new(&mut buf);
3607        let options = zip::write::SimpleFileOptions::default()
3608            .compression_method(zip::CompressionMethod::Deflated);
3609
3610        zip.start_file("document.md", options)
3611            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3612        zip.write_all(archive.markdown.as_bytes())?;
3613
3614        let html_output = if pretty_html {
3615            crate::html::pretty_print_html(&archive.html)
3616        } else {
3617            archive.html.clone()
3618        };
3619        zip.start_file("document.html", options)
3620            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3621        zip.write_all(html_output.as_bytes())?;
3622
3623        for img in &archive.images {
3624            zip.start_file(format!("images/{}", img.filename), options)
3625                .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3626            zip.write_all(&img.data)?;
3627        }
3628
3629        zip.finish()
3630            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3631    }
3632
3633    Ok(buf.into_inner())
3634}
3635
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Two models with the same chunk count but different text must share the
    /// `chunks` component of their fingerprint and differ in `payload_bytes`.
    #[test]
    fn browser_model_fingerprint_includes_payload_size() {
        // Build a single-chunk model whose only varying part is the text.
        let model_with_text = |text: &str| {
            browser_model_data_from_value(&json!({
                "chunks": [{ "chunk": [{ "ty": "is", "s": text }] }],
                "cidUrlMap": {}
            }))
        };

        let small_fp = model_with_text("first").fingerprint();
        let larger_fp = model_with_text("first and later text").fingerprint();

        assert_eq!(small_fp.chunks, larger_fp.chunks);
        assert_ne!(small_fp.payload_bytes, larger_fp.payload_bytes);
    }

    /// Feeds a timeline of fingerprints into the quiescence tracker and checks
    /// that a change in fingerprint postpones the "stable" verdict.
    #[test]
    fn browser_model_quiescence_resets_when_chunks_change() {
        let stability_window = Duration::from_millis(1500);
        let start = Instant::now();

        let one_chunk = BrowserModelFingerprint {
            chunks: 1,
            payload_bytes: 100,
        };
        let two_chunks = BrowserModelFingerprint {
            chunks: 2,
            payload_bytes: 200,
        };

        // (fingerprint, milliseconds after `start`, expected result).
        // The final expectation, 1550 ms, equals the gap between the 750 ms
        // and 2300 ms observations.
        let timeline = [
            (one_chunk, 0, None),
            (one_chunk, 250, None),
            (two_chunks, 500, None),
            (two_chunks, 750, None),
            (two_chunks, 2300, Some(Duration::from_millis(1550))),
        ];

        let mut quiescence = BrowserModelQuiescence::default();
        for (fingerprint, offset_ms, expected) in timeline {
            let observed_at = start + Duration::from_millis(offset_ms);
            assert_eq!(
                quiescence.observe(fingerprint, observed_at, stability_window),
                expected
            );
        }
    }
}