Skip to main content

web_capture/
gdocs.rs

1//! Google Docs capture module.
2//!
3//! Supports API-based capture of Google Docs documents via the export URL pattern:
4//! `https://docs.google.com/document/d/{DOCUMENT_ID}/export?format={FORMAT}`
5//!
6//! # Supported Export Formats
7//!
8//! - `html` — HTML document (images as base64 data URIs)
9//! - `txt` — Plain text
10//! - `md` — Markdown (native Google Docs export)
11//! - `pdf` — PDF document
12//! - `docx` — Microsoft Word document
13//! - `epub` — EPUB ebook format
14//!
15//! # Example
16//!
17//! ```rust,no_run
18//! use web_capture::gdocs;
19//!
20//! #[tokio::main]
21//! async fn main() -> anyhow::Result<()> {
22//!     let url = "https://docs.google.com/document/d/abc123/edit";
23//!     if gdocs::is_google_docs_url(url) {
24//!         let result = gdocs::fetch_google_doc(url, "html", None).await?;
25//!         println!("Content length: {}", result.content.len());
26//!     }
27//!     Ok(())
28//! }
29//! ```
30
31use async_tungstenite::tokio::{connect_async, ConnectStream};
32use async_tungstenite::tungstenite::Message;
33use async_tungstenite::WebSocketStream;
34use base64::Engine;
35use futures::{SinkExt, StreamExt};
36use regex::Regex;
37use scraper::{node::Node, ElementRef, Html, Selector};
38use serde_json::Value;
39use std::collections::HashMap;
40use std::fmt::Write as _;
41use std::hash::BuildHasher;
42use std::io::Write;
43use std::process::Stdio;
44use std::sync::OnceLock;
45use std::time::{Duration, Instant};
46use tokio::io::{AsyncBufReadExt, BufReader};
47use tokio::process::{Child, Command};
48use tracing::{debug, info, warn};
49
50use crate::WebCaptureError;
51
/// Base URL for per-document endpoints on `docs.google.com`.
///
/// `build_export_url` and `build_edit_url` append `/{document_id}/...` to it.
const GDOCS_EXPORT_BASE: &str = "https://docs.google.com/document/d";
/// Base URL of the Google Docs REST API; `build_docs_api_url` appends the document ID.
const GDOCS_API_BASE: &str = "https://docs.googleapis.com/v1/documents";
/// Desktop Chrome (Windows) User-Agent string for Google Docs requests.
const GDOCS_USER_AGENT: &str =
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
// NOTE(review): the four durations below are not referenced in this part of the
// file; the descriptions follow their names — confirm against the capture loop.
/// Default upper bound on how long to wait for editor model data.
const GDOCS_EDITOR_MODEL_MAX_WAIT_DEFAULT: Duration = Duration::from_secs(30);
/// Default time the captured model must stay unchanged to count as stable
/// (presumably the `stability_window` given to `BrowserModelQuiescence::observe`).
const GDOCS_EDITOR_MODEL_STABILITY_DEFAULT: Duration = Duration::from_millis(1500);
/// Interval between editor-model polls.
const GDOCS_EDITOR_MODEL_POLL_INTERVAL: Duration = Duration::from_millis(250);
/// Upper bound on waiting for the browser to launch.
const GDOCS_BROWSER_LAUNCH_TIMEOUT: Duration = Duration::from_secs(20);

/// WebSocket stream type produced by `async_tungstenite::tokio::connect_async`;
/// the name suggests it carries Chrome DevTools Protocol traffic — TODO confirm.
type CdpWebSocket = WebSocketStream<ConnectStream>;
62
/// JavaScript that records every value assigned to `window.DOCS_modelChunk`.
///
/// The script:
/// - creates `window.__captured_chunks` as the capture buffer;
/// - defines a non-configurable accessor property `DOCS_modelChunk` on `window`
///   whose setter captures the assigned value and whose getter returns the most
///   recently assigned value (`window.__DOCS_modelChunk_latest`);
/// - flattens arrays recursively and stores a JSON round-tripped copy of each
///   item, falling back to the raw value when serialization throws;
/// - replaces `push` on an assigned array so that later pushes are captured too.
///
/// NOTE(review): for an array that has not been wrapped yet, the setter calls
/// `captureChunk(value)` and `wrapChunkArray` then iterates the same array
/// again, so its existing items are appended to `__captured_chunks` twice.
/// Confirm that consumers tolerate or de-duplicate this.
const GDOCS_MODEL_CAPTURE_INIT_SCRIPT: &str = r"
window.__captured_chunks = [];
const captureChunk = (value) => {
  if (!value) {
    return;
  }
  if (Array.isArray(value)) {
    for (const item of value) {
      captureChunk(item);
    }
    return;
  }
  try {
    window.__captured_chunks.push(JSON.parse(JSON.stringify(value)));
  } catch {
    window.__captured_chunks.push(value);
  }
};
const wrapChunkArray = (value) => {
  if (!Array.isArray(value) || value.__webCaptureDocsModelWrapped) {
    return value;
  }
  const originalPush = value.push;
  Object.defineProperty(value, '__webCaptureDocsModelWrapped', {
    value: true,
    enumerable: false,
  });
  Object.defineProperty(value, 'push', {
    value(...items) {
      for (const item of items) {
        captureChunk(item);
      }
      return originalPush.apply(this, items);
    },
    writable: true,
    configurable: true,
  });
  for (const item of value) {
    captureChunk(item);
  }
  return value;
};
Object.defineProperty(window, 'DOCS_modelChunk', {
  set(value) {
    captureChunk(value);
    window.__DOCS_modelChunk_latest = wrapChunkArray(value);
  },
  get() {
    return window.__DOCS_modelChunk_latest;
  },
  configurable: false,
});
";
116
/// JavaScript function source that returns `{ chunks, cidUrlMap }` from the page.
///
/// - `chunks` is a copy of `window.__captured_chunks`; when that buffer is empty
///   and `window.DOCS_modelChunk` is set, the live value is used instead.
/// - `cidUrlMap` is built by scanning every `<script>` whose text mentions
///   `docs-images-rt` for `"<id>": "<https://docs.google.com/docs-images-rt/...>"`
///   pairs (IDs of at least 20 characters from `[A-Za-z0-9_-]`), with the
///   `\u003d`, `\u0026` and `\/` escapes in the URL decoded to `=`, `&` and `/`.
///
/// NOTE(review): `!chunks.includes(window.DOCS_modelChunk)` is always true when
/// `chunks.length === 0`, so that third condition is redundant.
const GDOCS_MODEL_EXTRACT_SCRIPT: &str = r#"() => {
  const chunks = [...(window.__captured_chunks || [])];
  if (
    window.DOCS_modelChunk &&
    chunks.length === 0 &&
    !chunks.includes(window.DOCS_modelChunk)
  ) {
    chunks.push(window.DOCS_modelChunk);
  }
  const cidUrlMap = {};
  const scripts = document.querySelectorAll('script');
  for (const script of scripts) {
    const text = script.textContent || '';
    if (!text.includes('docs-images-rt')) {
      continue;
    }
    const regex =
      /"([A-Za-z0-9_-]{20,})"\s*:\s*"(https:\/\/docs\.google\.com\/docs-images-rt\/[^"]+)"/g;
    let match;
    while ((match = regex.exec(text)) !== null) {
      cidUrlMap[match[1]] = match[2]
        .replace(/\\u003d/g, '=')
        .replace(/\\u0026/g, '&')
        .replace(/\\\//g, '/');
    }
  }
  return { chunks, cidUrlMap };
}"#;
145
146fn gdocs_url_pattern() -> &'static Regex {
147    static PATTERN: OnceLock<Regex> = OnceLock::new();
148    PATTERN.get_or_init(|| Regex::new(r"docs\.google\.com/document/d/([a-zA-Z0-9_-]+)").unwrap())
149}
150
/// Result of fetching a Google Docs document.
///
/// Returned by [`fetch_google_doc`] and [`fetch_google_doc_as_markdown`].
#[derive(Debug, Clone)]
pub struct GDocsResult {
    /// The document content in the requested format.
    pub content: String,
    /// The export format label: the `format` string passed to
    /// [`fetch_google_doc`] (copied verbatim), or `"markdown"` when produced by
    /// [`fetch_google_doc_as_markdown`].
    pub format: String,
    /// The document ID extracted from the input URL.
    pub document_id: String,
    /// The export URL that was fetched.
    pub export_url: String,
}
163
/// Google Docs capture backend selected from the CLI `--capture` flag.
///
/// See [`select_capture_method`] for the mapping from flag value to variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GDocsCaptureMethod {
    /// Load `/edit` and extract `DOCS_modelChunk` (`--capture browser`).
    BrowserModel,
    /// Use the public `/export?format=...` endpoint (`--capture api` without a token).
    PublicExport,
    /// Use the authenticated `docs.googleapis.com` REST API (`--capture api` with a token).
    DocsApi,
}
174
/// Rendered Google Docs content from either Docs API or editor model data.
///
/// Carries the same document in three projections (Markdown, HTML, plain text).
#[derive(Debug, Clone)]
pub struct GDocsRenderedResult {
    /// Markdown output.
    pub markdown: String,
    /// HTML output.
    pub html: String,
    /// Plain text output.
    pub text: String,
    /// The extracted document ID.
    pub document_id: String,
    /// Source URL used for capture.
    pub export_url: String,
    /// Remote images exposed by the editor model, used for archive localization.
    pub remote_images: Vec<RemoteImage>,
}
191
/// Remote image reference extracted from browser-model capture.
///
/// Collected in [`GDocsRenderedResult::remote_images`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteImage {
    /// Original (remote) image URL.
    pub url: String,
    /// Image alt text.
    pub alt: String,
}
200
/// Snapshot of editor-model data pulled from the browser page.
#[derive(Debug, Clone)]
struct BrowserModelData {
    /// Captured model chunks as raw JSON values.
    chunks: Vec<Value>,
    /// Image content ID → URL map (presumably the `cidUrlMap` returned by the
    /// extract script — TODO confirm).
    cid_urls: HashMap<String, String>,
    /// Payload size of `chunks`; combined with `chunks.len()` in `fingerprint`.
    chunk_payload_bytes: usize,
    /// Number of polls performed (populated outside this part of the file).
    poll_count: usize,
    /// How long the model had been stable (populated outside this part of the file).
    stable_for: Duration,
}
209
/// Cheap change-detection summary of a [`BrowserModelData`] snapshot.
///
/// Two snapshots are treated as "unchanged" when both fields are equal.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct BrowserModelFingerprint {
    /// Number of captured chunks.
    chunks: usize,
    /// Payload size of the captured chunks.
    payload_bytes: usize,
}

/// Tracks how long consecutive fingerprints have stayed identical.
#[derive(Debug, Default)]
struct BrowserModelQuiescence {
    /// Fingerprint seen by the most recent `observe` call, if any.
    last_fingerprint: Option<BrowserModelFingerprint>,
    /// Instant at which the current fingerprint was first seen repeated;
    /// `None` while the model is empty or has just changed.
    stable_since: Option<Instant>,
}
221
222impl BrowserModelData {
223    const fn fingerprint(&self) -> BrowserModelFingerprint {
224        BrowserModelFingerprint {
225            chunks: self.chunks.len(),
226            payload_bytes: self.chunk_payload_bytes,
227        }
228    }
229}
230
231impl BrowserModelQuiescence {
232    fn observe(
233        &mut self,
234        fingerprint: BrowserModelFingerprint,
235        now: Instant,
236        stability_window: Duration,
237    ) -> Option<Duration> {
238        if fingerprint.chunks == 0 {
239            self.last_fingerprint = Some(fingerprint);
240            self.stable_since = None;
241            return None;
242        }
243
244        if self.last_fingerprint == Some(fingerprint) {
245            let stable_since = *self.stable_since.get_or_insert(now);
246            let stable_for = now.saturating_duration_since(stable_since);
247            if stable_for >= stability_window {
248                return Some(stable_for);
249            }
250        } else {
251            self.last_fingerprint = Some(fingerprint);
252            self.stable_since = None;
253        }
254
255        None
256    }
257
258    fn stable_for(&self, now: Instant) -> Duration {
259        self.stable_since.map_or(Duration::ZERO, |stable_since| {
260            now.saturating_duration_since(stable_since)
261        })
262    }
263}
264
/// Parsed Google Docs model/document capture.
///
/// `tables`, `images` and `text` are projections kept alongside the ordered
/// `blocks` list.
#[derive(Debug, Clone, Default)]
pub struct CapturedDocument {
    /// Ordered document blocks.
    pub blocks: Vec<CapturedBlock>,
    /// Tables extracted from `blocks` for compatibility with tests and callers.
    pub tables: Vec<TableBlock>,
    /// Images extracted from model positions.
    pub images: Vec<ContentNode>,
    /// Plain text projection.
    pub text: String,
}
277
/// A top-level block of a captured document: a paragraph-like block or a table.
#[derive(Debug, Clone)]
pub enum CapturedBlock {
    /// Paragraph-like block (also used for headings, list items, quotes and rules).
    Paragraph {
        /// Inline content of the paragraph.
        content: Vec<ContentNode>,
        /// Optional Google Docs named style.
        style: Option<String>,
        /// Optional list metadata; `Some` when the paragraph is a list item.
        list: Option<ListMeta>,
        /// Whether paragraph is a blockquote.
        quote: bool,
        /// Whether paragraph is a horizontal rule.
        horizontal_rule: bool,
    },
    /// Table block.
    Table(TableBlock),
}
297
/// Captured table: an ordered list of rows.
#[derive(Debug, Clone, Default)]
pub struct TableBlock {
    /// Table rows, top to bottom.
    pub rows: Vec<TableRow>,
}

/// Captured table row: an ordered list of cells.
#[derive(Debug, Clone, Default)]
pub struct TableRow {
    /// Row cells, in document order.
    pub cells: Vec<TableCell>,
}

/// Captured table cell.
#[derive(Debug, Clone, Default)]
pub struct TableCell {
    /// Inline content of the cell.
    pub content: Vec<ContentNode>,
}
318
/// Captured inline content node: a styled text run or an image placeholder.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentNode {
    /// Text run with uniform inline styling.
    Text {
        /// Text content.
        text: String,
        /// Bold text style.
        bold: bool,
        /// Italic text style.
        italic: bool,
        /// Strikethrough text style.
        strike: bool,
        /// Optional hyperlink target.
        link: Option<String>,
    },
    /// Image placeholder.
    Image {
        /// Content ID from Google Docs model data.
        cid: Option<String>,
        /// Resolved image URL, when one is known.
        url: Option<String>,
        /// Alt text.
        alt: String,
        /// Editor-model image width, when available (kept as a string).
        width: Option<String>,
        /// Editor-model image height, when available (kept as a string).
        height: Option<String>,
        /// Whether this image came from a suggested edit.
        is_suggestion: bool,
    },
}
351
/// Inline formatting of a text run; the fields mirror the style fields of
/// `ContentNode::Text`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct TextStyle {
    /// Bold text style.
    bold: bool,
    /// Italic text style.
    italic: bool,
    /// Strikethrough text style.
    strike: bool,
    /// Optional hyperlink target.
    link: Option<String>,
}

/// Paragraph-level metadata; the fields mirror the non-content fields of
/// `CapturedBlock::Paragraph`.
#[derive(Debug, Clone, Default)]
struct ParagraphMeta {
    /// Optional Google Docs named style.
    style: Option<String>,
    /// Optional list metadata.
    list: Option<ListMeta>,
    /// Whether the paragraph is a blockquote.
    quote: bool,
    /// Whether the paragraph is a horizontal rule.
    horizontal_rule: bool,
}
367
/// List membership metadata for a paragraph that is a list item
/// (see the `list` field of [`CapturedBlock::Paragraph`]).
#[derive(Debug, Clone)]
pub struct ListMeta {
    /// Google Docs list identifier.
    pub id: String,
    /// Nesting level, zero-based.
    pub level: usize,
    /// Whether Markdown should render this list item with an ordered marker.
    pub ordered: bool,
}
377
/// Paragraph style information taken from the editor model.
#[derive(Debug, Clone)]
struct ParagraphStyle {
    /// Optional Google Docs named style.
    style: Option<String>,
    /// Start indent (presumably in points — TODO confirm against the parser).
    indent_start: f64,
    /// First-line indent (presumably in points — TODO confirm against the parser).
    indent_first_line: f64,
}

/// Semantic hint keyed by text.
// NOTE(review): the name suggests these come from the public export and are
// matched against model paragraphs by `text` — confirm at the use site.
#[derive(Debug, Clone)]
struct ExportSemanticHint {
    /// Text the hint applies to.
    text: String,
    /// `Some(true)`/`Some(false)` when the list kind is known, `None` otherwise.
    list_ordered: Option<bool>,
    /// Whether the text is a blockquote.
    quote: bool,
}

/// Style lookups built from the editor model.
// NOTE(review): the `usize` keys of the `*_by_end` maps and the entries of
// `horizontal_rules` look like model offsets — confirm against the builder.
#[derive(Debug, Clone, Default)]
struct ModelStyleMaps {
    /// Inline text styles.
    inline_styles: Vec<TextStyle>,
    /// Paragraph styles keyed by `usize` (per the name, the paragraph end).
    paragraph_by_end: HashMap<usize, ParagraphStyle>,
    /// List metadata keyed by `usize` (per the name, the paragraph end).
    list_by_end: HashMap<usize, ListMeta>,
    /// Set of `usize` markers for horizontal rules.
    horizontal_rules: std::collections::HashSet<usize>,
}
399
400/// Check if a URL is a Google Docs document URL.
401#[must_use]
402pub fn is_google_docs_url(url: &str) -> bool {
403    gdocs_url_pattern().is_match(url)
404}
405
406/// Extract the document ID from a Google Docs URL.
407///
408/// Returns `None` if the URL is not a valid Google Docs URL.
409#[must_use]
410pub fn extract_document_id(url: &str) -> Option<String> {
411    gdocs_url_pattern()
412        .captures(url)
413        .and_then(|caps| caps.get(1))
414        .map(|m| m.as_str().to_string())
415}
416
417/// Build a Google Docs export URL.
418///
419/// # Arguments
420///
421/// * `document_id` - The Google Docs document ID
422/// * `format` - Export format (html, txt, md, pdf, docx, epub)
423#[must_use]
424pub fn build_export_url(document_id: &str, format: &str) -> String {
425    let export_format = match format {
426        "html" | "txt" | "md" | "pdf" | "docx" | "epub" | "zip" => format,
427        _ => "html",
428    };
429    format!("{GDOCS_EXPORT_BASE}/{document_id}/export?format={export_format}")
430}
431
432/// Build a Google Docs editor URL.
433#[must_use]
434pub fn build_edit_url(document_id: &str) -> String {
435    format!("{GDOCS_EXPORT_BASE}/{document_id}/edit")
436}
437
438/// Build a Google Docs REST API URL.
439#[must_use]
440pub fn build_docs_api_url(document_id: &str) -> String {
441    format!("{GDOCS_API_BASE}/{document_id}")
442}
443
444/// Select a Google Docs capture backend from the CLI `--capture` value.
445///
446/// # Errors
447///
448/// Returns an error when `capture` is neither `browser` nor `api`.
449pub fn select_capture_method(
450    capture: &str,
451    api_token: Option<&str>,
452) -> crate::Result<GDocsCaptureMethod> {
453    match capture.to_lowercase().as_str() {
454        "browser" => Ok(GDocsCaptureMethod::BrowserModel),
455        "api" if api_token.is_some() => Ok(GDocsCaptureMethod::DocsApi),
456        "api" => Ok(GDocsCaptureMethod::PublicExport),
457        other => Err(WebCaptureError::InvalidUrl(format!(
458            "Unsupported Google Docs capture method \"{other}\". Use \"browser\" or \"api\"."
459        ))),
460    }
461}
462
463/// Fetch a Google Docs document via the export URL.
464///
465/// For public documents, pass `None` for `api_token`.
466/// For private documents, pass a Bearer token string.
467///
468/// # Arguments
469///
470/// * `url` - Google Docs URL (edit URL or any URL containing the document ID)
471/// * `format` - Export format (html, txt, md, pdf, docx, epub)
472/// * `api_token` - Optional API token for private documents
473///
474/// # Errors
475///
476/// Returns an error if the URL is not a valid Google Docs URL, or if the fetch fails.
477pub async fn fetch_google_doc(
478    url: &str,
479    format: &str,
480    api_token: Option<&str>,
481) -> crate::Result<GDocsResult> {
482    let document_id = extract_document_id(url).ok_or_else(|| {
483        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
484    })?;
485
486    let export_url = build_export_url(&document_id, format);
487    debug!(
488        document_id = %document_id,
489        format = %format,
490        export_url = %export_url,
491        has_api_token = api_token.is_some(),
492        "fetching Google Doc via public export"
493    );
494
495    let mut request = reqwest::Client::new()
496        .get(&export_url)
497        .header(
498            "User-Agent",
499            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
500        )
501        .header("Accept-Charset", "utf-8")
502        .header("Accept-Language", "en-US,en;q=0.9");
503
504    if let Some(token) = api_token {
505        request = request.header("Authorization", format!("Bearer {token}"));
506    }
507
508    let response = request
509        .send()
510        .await
511        .map_err(|e| WebCaptureError::FetchError(format!("Failed to fetch Google Doc: {e}")))?;
512    debug!(
513        document_id = %document_id,
514        status = response.status().as_u16(),
515        success = response.status().is_success(),
516        content_type = response
517            .headers()
518            .get(reqwest::header::CONTENT_TYPE)
519            .and_then(|value| value.to_str().ok())
520            .unwrap_or(""),
521        "received Google Docs public export response"
522    );
523
524    if !response.status().is_success() {
525        return Err(WebCaptureError::FetchError(format!(
526            "Failed to fetch Google Doc ({} {}): {}",
527            response.status().as_u16(),
528            response.status().canonical_reason().unwrap_or("Unknown"),
529            export_url
530        )));
531    }
532
533    let raw_content = response.text().await.map_err(|e| {
534        WebCaptureError::FetchError(format!("Failed to read Google Doc response: {e}"))
535    })?;
536    debug!(
537        document_id = %document_id,
538        bytes = raw_content.len(),
539        "read Google Docs public export body"
540    );
541
542    // Keep HTML markup escaped so literal examples such as `&lt;ol&gt;` do not
543    // become real tags before the HTML parser sees the document.
544    let content = match format {
545        "txt" | "md" => crate::html::decode_html_entities(&raw_content),
546        _ => raw_content,
547    };
548
549    Ok(GDocsResult {
550        content,
551        format: format.to_string(),
552        document_id,
553        export_url,
554    })
555}
556
557/// Fetch a Google Docs document and convert to Markdown.
558///
559/// Fetches the document as HTML, then converts to Markdown using the
560/// existing HTML-to-Markdown pipeline.
561///
562/// # Arguments
563///
564/// * `url` - Google Docs URL
565/// * `api_token` - Optional API token for private documents
566///
567/// # Errors
568///
569/// Returns an error if the fetch or conversion fails.
570pub async fn fetch_google_doc_as_markdown(
571    url: &str,
572    api_token: Option<&str>,
573) -> crate::Result<GDocsResult> {
574    let result = fetch_google_doc(url, "html", api_token).await?;
575
576    let preprocess = preprocess_google_docs_export_html(&result.content);
577    debug!(
578        document_id = %result.document_id,
579        hoisted = preprocess.hoisted,
580        unwrapped_links = preprocess.unwrapped_links,
581        "google-docs-export pre-processor rewrote markup"
582    );
583    let markdown = normalize_google_docs_export_markdown(
584        &crate::markdown::convert_html_to_markdown(&preprocess.html, Some(&result.export_url))?,
585    );
586    debug!(
587        document_id = %result.document_id,
588        bytes = markdown.len(),
589        "rendered Google Docs public export markdown"
590    );
591
592    Ok(GDocsResult {
593        content: markdown,
594        format: "markdown".to_string(),
595        document_id: result.document_id,
596        export_url: result.export_url,
597    })
598}
599
/// Result of running the Google Docs export HTML pre-processor
/// ([`preprocess_google_docs_export_html`]).
///
/// Exposes the rewritten HTML alongside counters that are useful for debug
/// logging (`gdocs.export.style-hoist`). See issue #92 R6.
#[derive(Debug, Clone)]
pub struct GDocsExportPreprocessResult {
    /// Rewritten HTML.
    pub html: String,
    /// Number of styled spans (inline `style` or CSS class) turned into
    /// `<strong>`/`<em>`/`<del>`.
    pub hoisted: usize,
    /// Number of `google.com/url?q=` redirect wrappers unwrapped.
    pub unwrapped_links: usize,
}
613
614/// Pre-process Google Docs export HTML so the generic `html2md` pipeline
615/// preserves inline formatting, heading numbering, and link targets.
616///
617/// Google Drive serves bold/italic/strikethrough as inline style spans and
618/// wraps every link through a `google.com/url?q=` redirect, both of which
619/// the generic converter would otherwise discard. This function rewrites
620/// those constructs into semantic HTML before conversion.
621#[must_use]
622pub fn preprocess_google_docs_export_html(html: &str) -> GDocsExportPreprocessResult {
623    let mut hoisted: usize = 0;
624    let mut unwrapped_links: usize = 0;
625    let class_styles = extract_css_class_styles(html);
626
627    let mut out = hoist_inline_style_spans(html, &mut hoisted);
628    out = hoist_class_style_spans(&out, &class_styles, &mut hoisted);
629    out = convert_class_indented_blockquotes(&out, &class_styles);
630    out = nest_google_docs_lists(&out, &class_styles);
631    out = strip_google_docs_heading_noise(&out);
632    out = strip_heading_inline_formatting(&out);
633    out = unwrap_google_redirect_links(&out, &mut unwrapped_links);
634    out = out.replace("&nbsp;", " ");
635    out = out.replace('\u{00A0}', " ");
636
637    GDocsExportPreprocessResult {
638        html: out,
639        hoisted,
640        unwrapped_links,
641    }
642}
643
644/// Normalize Markdown emitted from Google Docs public-export HTML converters.
645#[must_use]
646pub fn normalize_google_docs_export_markdown(markdown: &str) -> String {
647    let markdown = unescape_public_export_punctuation(markdown);
648    let markdown = convert_setext_headings(&markdown);
649    let markdown = normalize_atx_headings(&markdown);
650    let markdown = normalize_bullet_markers(&markdown);
651    let markdown = normalize_list_spacing(&markdown);
652    let markdown = normalize_blockquote_spacing(&markdown);
653    let markdown = normalize_markdown_tables(&markdown);
654    crate::markdown::clean_markdown(&markdown)
655}
656
657fn hoist_inline_style_spans(html: &str, hoisted: &mut usize) -> String {
658    let span_re = Regex::new(r#"(?is)<span\s+([^>]*style="([^"]*)"[^>]*)>(.*?)</span>"#)
659        .expect("valid regex");
660    span_re
661        .replace_all(html, |caps: &regex::Captures<'_>| {
662            let style = caps.get(2).map_or("", |m| m.as_str());
663            let inner = caps.get(3).map_or("", |m| m.as_str());
664            semantic_wrapped_html(inner, style).map_or_else(
665                || caps[0].to_string(),
666                |wrapped| {
667                    *hoisted += 1;
668                    wrapped
669                },
670            )
671        })
672        .into_owned()
673}
674
675fn hoist_class_style_spans(
676    html: &str,
677    class_styles: &HashMap<String, String>,
678    hoisted: &mut usize,
679) -> String {
680    let class_span_re = Regex::new(r#"(?is)<span\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</span>"#)
681        .expect("valid regex");
682    class_span_re
683        .replace_all(html, |caps: &regex::Captures<'_>| {
684            let class_attr = caps.get(2).map_or("", |m| m.as_str());
685            let inner = caps.get(3).map_or("", |m| m.as_str());
686            let style = combined_class_style(class_styles, class_attr);
687            semantic_wrapped_html(inner, &style).map_or_else(
688                || caps[0].to_string(),
689                |wrapped| {
690                    *hoisted += 1;
691                    wrapped
692                },
693            )
694        })
695        .into_owned()
696}
697
698fn convert_class_indented_blockquotes(
699    html: &str,
700    class_styles: &HashMap<String, String>,
701) -> String {
702    let class_paragraph_re =
703        Regex::new(r#"(?is)<p\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</p>"#).expect("valid regex");
704    class_paragraph_re
705        .replace_all(html, |caps: &regex::Captures<'_>| {
706            let class_attr = caps.get(2).map_or("", |m| m.as_str());
707            let inner = caps.get(3).map_or("", |m| m.as_str());
708            let style = combined_class_style(class_styles, class_attr);
709            if is_blockquote_style(&style) {
710                format!("<blockquote><p>{inner}</p></blockquote>")
711            } else {
712                caps[0].to_string()
713            }
714        })
715        .into_owned()
716}
717
/// One `<ul>`/`<ol>` element located in export HTML by `nest_google_docs_lists`.
#[derive(Debug, Clone)]
struct ExportListBlock {
    /// Byte offset of the start of the element in the scanned HTML.
    start: usize,
    /// Byte offset one past the end of the element in the scanned HTML.
    end: usize,
    /// Lower-cased tag name: `ul` or `ol`.
    tag: String,
    /// Markup between the opening and closing list tags.
    inner: String,
}

/// One `<li>` extracted from an [`ExportListBlock`] by `render_nested_list_group`.
#[derive(Debug, Clone)]
struct ExportListItem {
    /// Tag (`ul` or `ol`) of the list block the item came from.
    tag: String,
    /// Nesting level computed by `google_docs_list_item_level`.
    level: usize,
    /// Markup between `<li ...>` and `</li>`.
    inner: String,
}
732
733fn nest_google_docs_lists(html: &str, class_styles: &HashMap<String, String>) -> String {
734    let list_re = Regex::new(r"(?is)<(ul|ol)\b([^>]*)>(.*?)</(ul|ol)>").expect("valid regex");
735    let blocks: Vec<ExportListBlock> = list_re
736        .captures_iter(html)
737        .filter_map(|caps| {
738            let open_tag = caps.get(1)?.as_str().to_ascii_lowercase();
739            let close_tag = caps.get(4)?.as_str().to_ascii_lowercase();
740            if open_tag != close_tag {
741                return None;
742            }
743            let whole = caps.get(0)?;
744            Some(ExportListBlock {
745                start: whole.start(),
746                end: whole.end(),
747                tag: open_tag,
748                inner: caps.get(3).map_or("", |m| m.as_str()).to_string(),
749            })
750        })
751        .collect();
752
753    if blocks.len() < 2 {
754        return html.to_string();
755    }
756
757    let mut groups: Vec<Vec<ExportListBlock>> = Vec::new();
758    let mut current: Vec<ExportListBlock> = Vec::new();
759    for block in blocks {
760        if let Some(previous) = current.last() {
761            if !html[previous.end..block.start].trim().is_empty() {
762                if current.len() > 1 {
763                    groups.push(std::mem::take(&mut current));
764                } else {
765                    current.clear();
766                }
767            }
768        }
769        current.push(block);
770    }
771    if current.len() > 1 {
772        groups.push(current);
773    }
774
775    if groups.is_empty() {
776        return html.to_string();
777    }
778
779    let mut out = html.to_string();
780    for group in groups.iter().rev() {
781        let rendered = render_nested_list_group(group, class_styles);
782        let start = group.first().expect("non-empty group").start;
783        let end = group.last().expect("non-empty group").end;
784        out.replace_range(start..end, &rendered);
785    }
786    out
787}
788
/// Render a run of adjacent flat list blocks as one nested list.
///
/// Every `<li>` of every block is collected in order together with its source
/// tag and a nesting level from `google_docs_list_item_level`. The items are
/// then re-emitted with one open list per level: a deeper item opens a new
/// list inside the still-open parent `<li>`, a shallower item closes the
/// deeper lists first. Attributes of the original list and item tags are not
/// carried over.
///
/// If the group contains no `<li>` at all, the blocks are re-serialized as
/// bare `<tag>inner</tag>` elements.
fn render_nested_list_group(
    group: &[ExportListBlock],
    class_styles: &HashMap<String, String>,
) -> String {
    let item_re = Regex::new(r"(?is)<li\b([^>]*)>(.*?)</li>").expect("valid regex");
    let items: Vec<ExportListItem> = group
        .iter()
        .flat_map(|block| {
            item_re.captures_iter(&block.inner).map(|caps| {
                let attrs = caps.get(1).map_or("", |m| m.as_str());
                let inner = caps.get(2).map_or("", |m| m.as_str()).to_string();
                ExportListItem {
                    tag: block.tag.clone(),
                    level: google_docs_list_item_level(attrs, class_styles),
                    inner,
                }
            })
        })
        .collect();

    if items.is_empty() {
        let mut unchanged = String::new();
        for block in group {
            write!(unchanged, "<{}>{}</{}>", block.tag, block.inner, block.tag)
                .expect("write to String");
        }
        return unchanged;
    }

    let mut html = String::new();
    // Deepest level that currently has an open list; `None` before the first item.
    let mut current_level: Option<usize> = None;
    // Per level: the tag of the open list, or `None` when no list is open there.
    let mut open_tags: Vec<Option<String>> = Vec::new();
    // Per level: whether an `<li>` is open and still needs its `</li>`.
    let mut item_open: Vec<bool> = Vec::new();

    for item in items {
        let level = item.level;
        // Moving to a shallower level: close every deeper list.
        while current_level.is_some_and(|current| current > level) {
            let current = current_level.expect("checked as Some");
            close_rendered_list(&mut html, &mut open_tags, &mut item_open, current);
            current_level = current.checked_sub(1);
        }

        // Moving to a deeper level: open one list per missing level, all with
        // this item's tag (intermediate levels get a list without an item).
        while current_level.is_none_or(|current| current < level) {
            let next_level = current_level.map_or(0, |current| current + 1);
            open_rendered_list(
                &mut html,
                &mut open_tags,
                &mut item_open,
                next_level,
                &item.tag,
            );
            current_level = Some(next_level);
        }

        ensure_list_stack(&mut open_tags, &mut item_open, level);
        // Same level but a different list kind (ul vs. ol): restart the list.
        if open_tags[level]
            .as_deref()
            .is_some_and(|tag| tag != item.tag)
        {
            close_rendered_list(&mut html, &mut open_tags, &mut item_open, level);
            open_rendered_list(&mut html, &mut open_tags, &mut item_open, level, &item.tag);
        } else if open_tags[level].is_none() {
            open_rendered_list(&mut html, &mut open_tags, &mut item_open, level, &item.tag);
        }

        // Close the previous sibling (if any) and leave the new `<li>` open so
        // that a following deeper list can nest inside it.
        close_rendered_item(&mut html, &mut item_open, level);
        html.push_str("<li>");
        html.push_str(&item.inner);
        item_open[level] = true;

        // Forget any state recorded for levels deeper than this item.
        for deeper in (level + 1)..item_open.len() {
            item_open[deeper] = false;
            open_tags[deeper] = None;
        }
    }

    // Close whatever is still open, deepest level first.
    while let Some(current) = current_level {
        close_rendered_list(&mut html, &mut open_tags, &mut item_open, current);
        current_level = current.checked_sub(1);
    }

    html
}
872
/// Grow both per-level stacks until `open_tags` has an entry for `level`.
///
/// New slots are `None` / `false`; both vectors receive the same number of
/// new entries, and existing entries are left untouched.
fn ensure_list_stack(open_tags: &mut Vec<Option<String>>, item_open: &mut Vec<bool>, level: usize) {
    let missing = (level + 1).saturating_sub(open_tags.len());
    open_tags.extend(std::iter::repeat(None).take(missing));
    item_open.extend(std::iter::repeat(false).take(missing));
}
879
880fn open_rendered_list(
881    html: &mut String,
882    open_tags: &mut Vec<Option<String>>,
883    item_open: &mut Vec<bool>,
884    level: usize,
885    tag: &str,
886) {
887    ensure_list_stack(open_tags, item_open, level);
888    html.push('<');
889    html.push_str(tag);
890    html.push('>');
891    open_tags[level] = Some(tag.to_string());
892    item_open[level] = false;
893}
894
/// Emit `</li>` for `level` if an item is open there, and clear the flag.
///
/// Does nothing when `level` is out of range or no item is open.
fn close_rendered_item(html: &mut String, item_open: &mut [bool], level: usize) {
    match item_open.get_mut(level) {
        Some(open) if *open => {
            html.push_str("</li>");
            *open = false;
        }
        _ => {}
    }
}
901
902fn close_rendered_list(
903    html: &mut String,
904    open_tags: &mut [Option<String>],
905    item_open: &mut [bool],
906    level: usize,
907) {
908    close_rendered_item(html, item_open, level);
909    if let Some(tag) = open_tags.get_mut(level).and_then(Option::take) {
910        html.push_str("</");
911        html.push_str(&tag);
912        html.push('>');
913    }
914}
915
916fn google_docs_list_item_level(attrs: &str, class_styles: &HashMap<String, String>) -> usize {
917    let style = combined_attr_style(class_styles, attrs);
918    let margin_left = css_point_value(&style, "margin-left");
919    if margin_left <= 0.0 {
920        return 0;
921    }
922    [54.0, 90.0, 126.0, 162.0, 198.0, 234.0, 270.0, 306.0]
923        .iter()
924        .take_while(|boundary| margin_left >= **boundary)
925        .count()
926}
927
928fn combined_attr_style(class_styles: &HashMap<String, String>, attrs: &str) -> String {
929    let mut styles = String::new();
930    if let Some(style) = attr_value(attrs, "style") {
931        styles.push_str(&style);
932    }
933    if let Some(class_attr) = attr_value(attrs, "class") {
934        styles.push_str(&combined_class_style(class_styles, &class_attr));
935    }
936    styles
937}
938
939fn attr_value(attrs: &str, name: &str) -> Option<String> {
940    let attr_re = Regex::new(&format!(
941        r#"(?is)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)')"#,
942        regex::escape(name)
943    ))
944    .expect("valid regex");
945    attr_re.captures(attrs).and_then(|caps| {
946        caps.get(1)
947            .or_else(|| caps.get(2))
948            .map(|value| value.as_str().to_string())
949    })
950}
951
952fn strip_google_docs_heading_noise(html: &str) -> String {
953    let empty_anchor_re = Regex::new(r#"(?is)<a\s+id="[^"]*"\s*>\s*</a>"#).expect("valid regex");
954    let numbering_re =
955        Regex::new(r"(?is)<span\b[^>]*>\s*\d+(?:\.\d+)*\.?\s*</span>").expect("valid regex");
956    let mut out = empty_anchor_re.replace_all(html, "").into_owned();
957    for level in 1..=6 {
958        let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
959            .expect("valid regex");
960        out = heading_re
961            .replace_all(&out, |caps: &regex::Captures<'_>| {
962                let open = &caps[1];
963                let inner = &caps[2];
964                let close = &caps[3];
965                let mut cleaned = empty_anchor_re.replace_all(inner, "").into_owned();
966                cleaned = numbering_re.replace_all(&cleaned, "").into_owned();
967                format!("{open}{cleaned}{close}")
968            })
969            .into_owned();
970    }
971    out
972}
973
974fn strip_heading_inline_formatting(html: &str) -> String {
975    let inline_marker_re = Regex::new(r"(?is)</?(?:strong|em|del)>").expect("valid regex");
976    let mut out = html.to_string();
977    for level in 1..=6 {
978        let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
979            .expect("valid regex");
980        out = heading_re
981            .replace_all(&out, |caps: &regex::Captures<'_>| {
982                let open = &caps[1];
983                let inner = &caps[2];
984                let close = &caps[3];
985                let cleaned = inline_marker_re.replace_all(inner, "");
986                format!("{open}{cleaned}{close}")
987            })
988            .into_owned();
989    }
990    out
991}
992
993fn unwrap_google_redirect_links(html: &str, unwrapped_links: &mut usize) -> String {
994    let redirect_re =
995        Regex::new(r#"(?i)href="https?://(?:www\.)?google\.com/url\?q=([^&"]+)[^"]*""#)
996            .expect("valid regex");
997    redirect_re
998        .replace_all(html, |caps: &regex::Captures<'_>| {
999            let encoded = caps.get(1).map_or("", |m| m.as_str());
1000            let decoded = percent_decode_utf8_lossy(encoded);
1001            *unwrapped_links += 1;
1002            format!(r#"href="{decoded}""#)
1003        })
1004        .into_owned()
1005}
1006
1007fn extract_css_class_styles(html: &str) -> HashMap<String, String> {
1008    let mut class_styles: HashMap<String, String> = HashMap::new();
1009    let style_re = Regex::new(r"(?is)<style\b[^>]*>(.*?)</style>").expect("valid regex");
1010    let class_re = Regex::new(r"\.([A-Za-z0-9_-]+)\s*\{([^{}]*)\}").expect("valid regex");
1011    for style_caps in style_re.captures_iter(html) {
1012        let css = style_caps.get(1).map_or("", |m| m.as_str());
1013        for class_caps in class_re.captures_iter(css) {
1014            let class_name = class_caps.get(1).map_or("", |m| m.as_str());
1015            let style = class_caps.get(2).map_or("", |m| m.as_str());
1016            class_styles
1017                .entry(class_name.to_string())
1018                .and_modify(|existing| {
1019                    existing.push(';');
1020                    existing.push_str(style);
1021                })
1022                .or_insert_with(|| style.to_string());
1023        }
1024    }
1025    class_styles
1026}
1027
/// Concatenate the declarations of every known class in `class_attr`.
///
/// `class_attr` is a whitespace-separated class list. Each class found in
/// `class_styles` contributes `;` followed by its declarations, in the order
/// the classes are listed; unknown classes are skipped.
fn combined_class_style(class_styles: &HashMap<String, String>, class_attr: &str) -> String {
    let mut combined = String::new();
    for class_name in class_attr.split_whitespace() {
        if let Some(declarations) = class_styles.get(class_name) {
            combined.push(';');
            combined.push_str(declarations);
        }
    }
    combined
}
1038
1039fn semantic_wrapped_html(inner: &str, style: &str) -> Option<String> {
1040    let bold = css_has_bold(style);
1041    let italic = css_has_italic(style);
1042    let strike = css_has_strike(style);
1043    if !bold && !italic && !strike {
1044        return None;
1045    }
1046    let mut wrapped = inner.to_string();
1047    if strike {
1048        wrapped = format!("<del>{wrapped}</del>");
1049    }
1050    if italic {
1051        wrapped = format!("<em>{wrapped}</em>");
1052    }
1053    if bold {
1054        wrapped = format!("<strong>{wrapped}</strong>");
1055    }
1056    Some(wrapped)
1057}
1058
1059fn css_has_bold(style: &str) -> bool {
1060    Regex::new(r"(?i)font-weight\s*:\s*(?:bold|[6-9]\d{2})")
1061        .expect("valid regex")
1062        .is_match(style)
1063}
1064
1065fn css_has_italic(style: &str) -> bool {
1066    Regex::new(r"(?i)font-style\s*:\s*italic")
1067        .expect("valid regex")
1068        .is_match(style)
1069}
1070
1071fn css_has_strike(style: &str) -> bool {
1072    Regex::new(r"(?i)text-decoration[^;]*\bline-through\b")
1073        .expect("valid regex")
1074        .is_match(style)
1075}
1076
1077fn is_blockquote_style(style: &str) -> bool {
1078    let margin_left = css_point_value(style, "margin-left");
1079    let margin_right = css_point_value(style, "margin-right");
1080    margin_left > 0.0 && margin_right > 0.0 && (margin_left - margin_right).abs() < 0.1
1081}
1082
1083fn css_point_value(style: &str, property: &str) -> f64 {
1084    let re = Regex::new(&format!(
1085        r"(?i){}\s*:\s*(-?\d+(?:\.\d+)?)pt",
1086        regex::escape(property)
1087    ))
1088    .expect("valid regex");
1089    re.captures(style)
1090        .and_then(|caps| caps.get(1))
1091        .and_then(|value| value.as_str().parse::<f64>().ok())
1092        .unwrap_or(0.0)
1093}
1094
/// Decode `%XX` percent escapes in `input`.
///
/// A `%` that is not followed by two hexadecimal digits is copied through
/// unchanged. The decoded bytes are interpreted as UTF-8, with invalid
/// sequences replaced by U+FFFD.
fn percent_decode_utf8_lossy(input: &str) -> String {
    /// Numeric value of a single ASCII hexadecimal digit.
    fn hex_value(byte: u8) -> Option<u8> {
        match byte {
            b'0'..=b'9' => Some(byte - b'0'),
            b'a'..=b'f' => Some(byte - b'a' + 10),
            b'A'..=b'F' => Some(byte - b'A' + 10),
            _ => None,
        }
    }

    let raw = input.as_bytes();
    let mut out = Vec::with_capacity(raw.len());
    let mut pos = 0;
    while let Some(&current) = raw.get(pos) {
        if current == b'%' {
            let digits = (
                raw.get(pos + 1).copied().and_then(hex_value),
                raw.get(pos + 2).copied().and_then(hex_value),
            );
            if let (Some(hi), Some(lo)) = digits {
                out.push((hi << 4) | lo);
                pos += 3;
                continue;
            }
        }
        out.push(current);
        pos += 1;
    }
    String::from_utf8_lossy(&out).into_owned()
}
1118
/// Remove the backslash escapes placed before `.`, `!`, `(`, `)`, `[` and `]`.
///
/// Every backslash that is immediately followed by one of those characters
/// is dropped; all other text, including other backslashes, is kept as is.
fn unescape_public_export_punctuation(markdown: &str) -> String {
    const UNESCAPED: [char; 6] = ['.', '!', '(', ')', '[', ']'];

    let mut out = String::with_capacity(markdown.len());
    let mut chars = markdown.chars().peekable();
    while let Some(ch) = chars.next() {
        let escapes_punctuation =
            ch == '\\' && chars.peek().is_some_and(|next| UNESCAPED.contains(next));
        // The punctuation itself is emitted on the next iteration.
        if !escapes_punctuation {
            out.push(ch);
        }
    }
    out
}
1128
/// Rewrite setext-style headings as ATX headings.
///
/// A non-blank line followed by an underline of `=` becomes `# <text>`, and
/// one followed by an underline of `-` becomes `## <text>`; the underline is
/// consumed. Only underlines accepted by [`is_setext_underline`] count.
///
/// A blank line is never treated as heading text, so a dashed rule that
/// follows an empty line (a thematic break) is left in place instead of being
/// turned into an empty `## ` heading.
///
/// Lines are re-joined with `\n`; a trailing newline in the input is not
/// preserved.
fn convert_setext_headings(markdown: &str) -> String {
    let lines: Vec<&str> = markdown.lines().collect();
    let mut out = Vec::with_capacity(lines.len());
    let mut index = 0;
    while index < lines.len() {
        let title = lines[index].trim();
        let prefix = match lines.get(index + 1).map(|next| next.trim()) {
            // An underline needs text above it to form a heading.
            Some(_) if title.is_empty() => None,
            Some(underline) if is_setext_underline(underline, '=') => Some("#"),
            Some(underline) if is_setext_underline(underline, '-') => Some("##"),
            _ => None,
        };
        if let Some(prefix) = prefix {
            out.push(format!("{prefix} {title}"));
            index += 2;
        } else {
            out.push(lines[index].to_string());
            index += 1;
        }
    }
    out.join("\n")
}

/// Report whether `line` is a setext underline made of `marker`.
///
/// The line must be at least five bytes long and consist solely of `marker`.
fn is_setext_underline(line: &str, marker: char) -> bool {
    line.len() >= 5 && line.chars().all(|ch| ch == marker)
}
1156
1157fn normalize_atx_headings(markdown: &str) -> String {
1158    let heading_re = Regex::new(r"^(#{1,6})\s+(.+?)\s*$").expect("valid regex");
1159    let closing_re = closing_atx_heading_re();
1160    markdown
1161        .lines()
1162        .map(|line| {
1163            let Some(caps) = heading_re.captures(line) else {
1164                return line.to_string();
1165            };
1166            let hashes = caps.get(1).map_or("", |m| m.as_str());
1167            let mut text = caps.get(2).map_or("", |m| m.as_str()).trim().to_string();
1168            text = closing_re.replace(&text, "").trim().to_string();
1169            text = strip_wrapping_markdown_emphasis(&text);
1170            format!("{hashes} {text}")
1171        })
1172        .collect::<Vec<_>>()
1173        .join("\n")
1174}
1175
/// Remove one layer of `***`, `**` or `*` that wraps the whole of `text`.
///
/// The text is trimmed first. The longest marker that both starts and ends
/// the text (with at least one byte in between) is stripped and the inner
/// text is trimmed again. Text without such a wrapper is returned trimmed.
fn strip_wrapping_markdown_emphasis(text: &str) -> String {
    let trimmed = text.trim();
    let unwrapped = ["***", "**", "*"].iter().find_map(|marker| {
        // Require content between the markers so "**" alone is not emptied.
        if trimmed.len() <= marker.len() * 2 {
            return None;
        }
        trimmed.strip_prefix(*marker)?.strip_suffix(*marker)
    });
    match unwrapped {
        Some(inner) => inner.trim().to_string(),
        None => trimmed.to_string(),
    }
}
1190
1191fn normalize_bullet_markers(markdown: &str) -> String {
1192    let bullet_re = asterisk_bullet_re();
1193    markdown
1194        .lines()
1195        .map(|line| bullet_re.replace(line, "$1- ").into_owned())
1196        .collect::<Vec<_>>()
1197        .join("\n")
1198}
1199
1200fn normalize_list_spacing(markdown: &str) -> String {
1201    let lines: Vec<&str> = markdown.lines().collect();
1202    let mut out = Vec::with_capacity(lines.len());
1203
1204    for (index, line) in lines.iter().enumerate() {
1205        if line.trim().is_empty()
1206            && previous_non_empty_line(&lines, index).is_some_and(is_markdown_list_item)
1207            && next_non_empty_line(&lines, index).is_some_and(is_markdown_list_item)
1208        {
1209            continue;
1210        }
1211        out.push((*line).to_string());
1212    }
1213
1214    out.join("\n")
1215}
1216
/// Return the closest line before `index` that is not blank, if any.
///
/// Panics if `index` is greater than `lines.len()`.
fn previous_non_empty_line<'a>(lines: &'a [&str], index: usize) -> Option<&'a str> {
    let before = &lines[..index];
    before
        .iter()
        .rfind(|line| !line.trim().is_empty())
        .copied()
}
1224
/// Return the closest line after `index` that is not blank, if any.
///
/// Panics if `index` is greater than or equal to `lines.len()`.
fn next_non_empty_line<'a>(lines: &'a [&str], index: usize) -> Option<&'a str> {
    let after = &lines[index + 1..];
    for line in after {
        if !line.trim().is_empty() {
            return Some(*line);
        }
    }
    None
}
1231
1232fn is_markdown_list_item(line: &str) -> bool {
1233    markdown_list_item_re().is_match(line)
1234}
1235
/// Normalize blank lines inside and after markdown block quotes.
///
/// * Blank lines and bare `>` lines inside a quote collapse into a single
///   `>` separator, emitted only if another `> ` line follows.
/// * Bare `>` lines outside a quote are dropped.
/// * The first non-quote line after a quote is preceded by one blank line.
///
/// Every emitted line is terminated with `\n`.
fn normalize_blockquote_spacing(markdown: &str) -> String {
    let mut result = String::with_capacity(markdown.len());
    let mut inside_quote = false;
    let mut gap_pending = false;

    for raw in markdown.lines() {
        let stripped = raw.trim();

        // Bare markers are always dropped, as are blank lines within a quote;
        // inside a quote they are remembered as a pending paragraph gap.
        if stripped == ">" || (inside_quote && stripped.is_empty()) {
            if inside_quote {
                gap_pending = true;
            }
            continue;
        }

        let is_quote_line = raw.starts_with("> ");
        if is_quote_line {
            if gap_pending {
                result.push_str(">\n");
            }
        } else if inside_quote {
            // Leaving the quote: separate it from the following text. The
            // line cannot be blank here, blank lines were handled above.
            result.push('\n');
        }

        gap_pending = false;
        inside_quote = is_quote_line;
        result.push_str(raw);
        result.push('\n');
    }

    result
}
1276
1277fn normalize_markdown_tables(markdown: &str) -> String {
1278    let lines: Vec<&str> = markdown.lines().collect();
1279    let mut out = Vec::with_capacity(lines.len());
1280    let mut index = 0;
1281
1282    while index < lines.len() {
1283        if !is_markdown_table_line(lines[index]) {
1284            out.push(lines[index].to_string());
1285            index += 1;
1286            continue;
1287        }
1288
1289        let start = index;
1290        while index < lines.len() && is_markdown_table_line(lines[index]) {
1291            index += 1;
1292        }
1293        let block = &lines[start..index];
1294        if block.len() >= 2 && is_markdown_separator_line(block[1]) {
1295            out.extend(normalize_markdown_table_block(block));
1296        } else {
1297            out.extend(block.iter().map(|line| (*line).to_string()));
1298        }
1299    }
1300
1301    out.join("\n")
1302}
1303
/// Report whether `line` looks like a markdown table row.
///
/// After trimming, the line must start and end with `|` and contain at least
/// two `|` characters.
fn is_markdown_table_line(line: &str) -> bool {
    let row = line.trim();
    if !row.starts_with('|') || !row.ends_with('|') {
        return false;
    }
    let pipe_count = row.bytes().filter(|byte| *byte == b'|').count();
    pipe_count >= 2
}
1308
1309fn is_markdown_separator_line(line: &str) -> bool {
1310    split_markdown_table_cells(line)
1311        .iter()
1312        .all(|cell| markdown_table_separator_cell_re().is_match(cell))
1313}
1314
1315fn normalize_markdown_table_block(lines: &[&str]) -> Vec<String> {
1316    lines
1317        .iter()
1318        .enumerate()
1319        .map(|(index, line)| {
1320            let cells = split_markdown_table_cells(line);
1321            if index == 1 {
1322                let separators = vec!["---".to_string(); cells.len()];
1323                render_markdown_table_row(&separators)
1324            } else {
1325                render_markdown_table_row(&cells)
1326            }
1327        })
1328        .collect()
1329}
1330
/// Split a markdown table row into trimmed cell texts.
///
/// Exactly one leading and one trailing `|` delimit the row, so empty cells
/// at either edge (as in `||b|`) are kept. A backslash escapes the character
/// after it, so `\|` stays inside its cell instead of starting a new one; the
/// escape sequence is preserved verbatim in the cell text.
///
/// A row without any content still yields a single empty cell.
fn split_markdown_table_cells(line: &str) -> Vec<String> {
    let trimmed = line.trim();
    let body = trimmed.strip_prefix('|').unwrap_or(trimmed);

    let mut cells = Vec::new();
    let mut current = String::new();
    // True when the last character consumed was an unescaped delimiter, i.e.
    // the row's closing `|`, which must not open another (empty) cell.
    let mut ended_on_delimiter = false;
    let mut chars = body.chars();
    while let Some(ch) = chars.next() {
        ended_on_delimiter = false;
        match ch {
            '\\' => {
                current.push(ch);
                if let Some(escaped) = chars.next() {
                    current.push(escaped);
                }
            }
            '|' => {
                cells.push(current.trim().to_string());
                current.clear();
                ended_on_delimiter = true;
            }
            _ => current.push(ch),
        }
    }
    if !ended_on_delimiter {
        cells.push(current.trim().to_string());
    }
    cells
}
1338
/// Render `cells` as a markdown table row: `| a | b |`.
///
/// An empty slice renders as `|  |`.
fn render_markdown_table_row(cells: &[String]) -> String {
    let mut row = String::from("| ");
    row.push_str(&cells.join(" | "));
    row.push_str(" |");
    row
}
1342
1343fn closing_atx_heading_re() -> &'static Regex {
1344    static RE: OnceLock<Regex> = OnceLock::new();
1345    RE.get_or_init(|| Regex::new(r"\s+#{1,6}$").expect("valid regex"))
1346}
1347
1348fn asterisk_bullet_re() -> &'static Regex {
1349    static RE: OnceLock<Regex> = OnceLock::new();
1350    RE.get_or_init(|| Regex::new(r"^(\s*)\* ").expect("valid regex"))
1351}
1352
1353fn markdown_list_item_re() -> &'static Regex {
1354    static RE: OnceLock<Regex> = OnceLock::new();
1355    RE.get_or_init(|| Regex::new(r"^\s*(?:[-+*]|\d+\.)\s+").expect("valid regex"))
1356}
1357
1358fn markdown_table_separator_cell_re() -> &'static Regex {
1359    static RE: OnceLock<Regex> = OnceLock::new();
1360    RE.get_or_init(|| Regex::new(r"^:?-{3,}:?$").expect("valid regex"))
1361}
1362
1363/// Fetch and render a Google Docs document via the authenticated REST API.
1364///
1365/// # Errors
1366///
1367/// Returns an error when the URL is invalid, no token is provided, or the API request fails.
1368pub async fn fetch_google_doc_from_docs_api(
1369    url: &str,
1370    api_token: &str,
1371) -> crate::Result<GDocsRenderedResult> {
1372    let document_id = extract_document_id(url).ok_or_else(|| {
1373        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
1374    })?;
1375    let api_url = build_docs_api_url(&document_id);
1376    debug!(
1377        document_id = %document_id,
1378        api_url = %api_url,
1379        "fetching Google Doc via Docs API"
1380    );
1381
1382    let response = reqwest::Client::new()
1383        .get(&api_url)
1384        .header("Authorization", format!("Bearer {api_token}"))
1385        .header("Accept", "application/json")
1386        .send()
1387        .await
1388        .map_err(|e| {
1389            WebCaptureError::FetchError(format!("Failed to fetch Google Doc via Docs API: {e}"))
1390        })?;
1391    debug!(
1392        document_id = %document_id,
1393        status = response.status().as_u16(),
1394        success = response.status().is_success(),
1395        content_type = response
1396            .headers()
1397            .get(reqwest::header::CONTENT_TYPE)
1398            .and_then(|value| value.to_str().ok())
1399            .unwrap_or(""),
1400        "received Google Docs API response"
1401    );
1402
1403    if !response.status().is_success() {
1404        return Err(WebCaptureError::FetchError(format!(
1405            "Failed to fetch Google Doc via Docs API ({} {}): {}",
1406            response.status().as_u16(),
1407            response.status().canonical_reason().unwrap_or("Unknown"),
1408            api_url
1409        )));
1410    }
1411
1412    let body = response.text().await.map_err(|e| {
1413        WebCaptureError::FetchError(format!("Failed to read Google Docs API response: {e}"))
1414    })?;
1415    let document = serde_json::from_str::<Value>(&body).map_err(|e| {
1416        WebCaptureError::ParseError(format!("Failed to parse Google Docs API response: {e}"))
1417    })?;
1418    let rendered = render_docs_api_document(&document);
1419    debug!(
1420        document_id = %document_id,
1421        title = document.get("title").and_then(|value| value.as_str()).unwrap_or(""),
1422        markdown_bytes = rendered.markdown.len(),
1423        html_bytes = rendered.html.len(),
1424        text_bytes = rendered.text.len(),
1425        "rendered Google Docs API document"
1426    );
1427
1428    Ok(GDocsRenderedResult {
1429        markdown: rendered.markdown,
1430        html: rendered.html,
1431        text: rendered.text,
1432        document_id,
1433        export_url: api_url,
1434        remote_images: Vec::new(),
1435    })
1436}
1437
1438/// Fetch and render the model data embedded in the Google Docs `/edit` route.
1439///
1440/// # Errors
1441///
1442/// Returns an error when the URL is invalid, the fetch fails, or no model chunks are present.
1443pub async fn fetch_google_doc_from_model(
1444    url: &str,
1445    api_token: Option<&str>,
1446) -> crate::Result<GDocsRenderedResult> {
1447    if api_token.is_some() {
1448        return Err(WebCaptureError::BrowserError(
1449            "Rust browser-model Google Docs capture cannot inject API tokens; use --capture api for authenticated Docs API capture".to_string(),
1450        ));
1451    }
1452    let document_id = extract_document_id(url).ok_or_else(|| {
1453        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
1454    })?;
1455    let edit_url = build_edit_url(&document_id);
1456    debug!(
1457        document_id = %document_id,
1458        edit_url = %edit_url,
1459        "capturing Google Doc editor model with a real browser"
1460    );
1461    let model_data = fetch_google_doc_editor_model_with_cdp(&edit_url, &document_id).await?;
1462    let BrowserModelData {
1463        chunks,
1464        cid_urls,
1465        chunk_payload_bytes,
1466        poll_count,
1467        stable_for,
1468    } = model_data;
1469    debug!(
1470        document_id = %document_id,
1471        chunks = chunks.len(),
1472        cid_urls = cid_urls.len(),
1473        chunk_payload_bytes,
1474        poll_count,
1475        stable_for_ms = stable_for.as_millis(),
1476        "extracted Google Docs editor model chunks through CDP"
1477    );
1478    if chunks.is_empty() {
1479        return Err(WebCaptureError::ParseError(
1480            "Google Docs editor page did not expose DOCS_modelChunk data".to_string(),
1481        ));
1482    }
1483
1484    let export_html = match fetch_google_doc(url, "html", None).await {
1485        Ok(result) => Some(result.content),
1486        Err(error) => {
1487            warn!(
1488                document_id = %document_id,
1489                error = %error,
1490                "failed to fetch Google Docs export HTML for browser-model semantic hints"
1491            );
1492            None
1493        }
1494    };
1495    let capture = parse_model_chunks_with_export_html(&chunks, &cid_urls, export_html.as_deref());
1496    let remote_images = remote_images_from_capture(&capture);
1497    info!(
1498        document_id = %document_id,
1499        chunks = chunks.len(),
1500        cid_urls = cid_urls.len(),
1501        chunk_payload_bytes,
1502        poll_count,
1503        stable_for_ms = stable_for.as_millis(),
1504        blocks = capture.blocks.len(),
1505        tables = capture.tables.len(),
1506        images = capture.images.len(),
1507        text_bytes = capture.text.len(),
1508        "parsed Google Docs editor model"
1509    );
1510
1511    Ok(GDocsRenderedResult {
1512        markdown: render_captured_document(&capture, "markdown"),
1513        html: render_captured_document(&capture, "html"),
1514        text: render_captured_document(&capture, "txt"),
1515        document_id,
1516        export_url: edit_url,
1517        remote_images,
1518    })
1519}
1520
1521async fn fetch_google_doc_editor_model_with_cdp(
1522    edit_url: &str,
1523    document_id: &str,
1524) -> crate::Result<BrowserModelData> {
1525    let chrome = crate::browser::find_chrome_executable().ok_or_else(|| {
1526        WebCaptureError::BrowserError(
1527            "Chrome/Chromium executable was not found. Set WEB_CAPTURE_CHROME, CHROME_PATH, or GOOGLE_CHROME_BIN.".to_string(),
1528        )
1529    })?;
1530    let user_data_dir = crate::browser::temporary_user_data_dir();
1531    std::fs::create_dir_all(&user_data_dir)?;
1532
1533    debug!(
1534        document_id = %document_id,
1535        chrome = %chrome.display(),
1536        user_data_dir = %user_data_dir.display(),
1537        edit_url = %edit_url,
1538        "launching headless Chrome CDP session for Google Docs model capture"
1539    );
1540
1541    let mut child = launch_cdp_chrome(&chrome, &user_data_dir)?;
1542    let capture_result = async {
1543        let ws_url = wait_for_devtools_ws_url(&mut child).await?;
1544        let (mut ws, _) = connect_async(&ws_url).await.map_err(|error| {
1545            WebCaptureError::BrowserError(format!(
1546                "Failed to connect to Chrome DevTools websocket: {error}"
1547            ))
1548        })?;
1549        let mut next_id = 0u64;
1550        let session_id = navigate_google_docs_cdp_page(&mut ws, &mut next_id, edit_url).await?;
1551        wait_for_google_docs_model_chunks(&mut ws, &mut next_id, &session_id, document_id).await
1552    }
1553    .await;
1554
1555    if let Err(error) = child.kill().await {
1556        debug!(
1557            document_id = %document_id,
1558            error = %error,
1559            "failed to kill Chrome CDP browser process"
1560        );
1561    }
1562    let _ = child.wait().await;
1563    let _ = std::fs::remove_dir_all(&user_data_dir);
1564
1565    capture_result
1566}
1567
1568async fn navigate_google_docs_cdp_page(
1569    ws: &mut CdpWebSocket,
1570    next_id: &mut u64,
1571    edit_url: &str,
1572) -> crate::Result<String> {
1573    let target = cdp_send(
1574        ws,
1575        next_id,
1576        None,
1577        "Target.createTarget",
1578        serde_json::json!({ "url": "about:blank" }),
1579    )
1580    .await?;
1581    let target_id = target
1582        .get("targetId")
1583        .and_then(Value::as_str)
1584        .ok_or_else(|| {
1585            WebCaptureError::BrowserError(
1586                "Chrome DevTools Target.createTarget did not return targetId".to_string(),
1587            )
1588        })?
1589        .to_string();
1590    let attached = cdp_send(
1591        ws,
1592        next_id,
1593        None,
1594        "Target.attachToTarget",
1595        serde_json::json!({ "targetId": target_id, "flatten": true }),
1596    )
1597    .await?;
1598    let session_id = attached
1599        .get("sessionId")
1600        .and_then(Value::as_str)
1601        .ok_or_else(|| {
1602            WebCaptureError::BrowserError(
1603                "Chrome DevTools Target.attachToTarget did not return sessionId".to_string(),
1604            )
1605        })?
1606        .to_string();
1607
1608    cdp_send(
1609        ws,
1610        next_id,
1611        Some(&session_id),
1612        "Page.enable",
1613        serde_json::json!({}),
1614    )
1615    .await?;
1616    cdp_send(
1617        ws,
1618        next_id,
1619        Some(&session_id),
1620        "Runtime.enable",
1621        serde_json::json!({}),
1622    )
1623    .await?;
1624    cdp_send(
1625        ws,
1626        next_id,
1627        Some(&session_id),
1628        "Page.addScriptToEvaluateOnNewDocument",
1629        serde_json::json!({ "source": GDOCS_MODEL_CAPTURE_INIT_SCRIPT }),
1630    )
1631    .await?;
1632    cdp_send(
1633        ws,
1634        next_id,
1635        Some(&session_id),
1636        "Page.navigate",
1637        serde_json::json!({ "url": edit_url }),
1638    )
1639    .await?;
1640
1641    Ok(session_id)
1642}
1643
1644async fn wait_for_google_docs_model_chunks(
1645    ws: &mut CdpWebSocket,
1646    next_id: &mut u64,
1647    session_id: &str,
1648    document_id: &str,
1649) -> crate::Result<BrowserModelData> {
1650    let started = Instant::now();
1651    let max_wait = gdocs_editor_model_max_wait();
1652    let stability_window = gdocs_editor_model_stability_window();
1653    let mut quiescence = BrowserModelQuiescence::default();
1654    let mut last_chunks = 0usize;
1655    let mut last_cid_urls = 0usize;
1656    let mut last_payload_bytes = 0usize;
1657    let mut last_stable_for = Duration::ZERO;
1658    let mut poll_count = 0usize;
1659
1660    while started.elapsed() < max_wait {
1661        let result = cdp_send(
1662            ws,
1663            next_id,
1664            Some(session_id),
1665            "Runtime.evaluate",
1666            serde_json::json!({
1667                "expression": format!("({GDOCS_MODEL_EXTRACT_SCRIPT})()"),
1668                "returnByValue": true,
1669                "awaitPromise": true
1670            }),
1671        )
1672        .await?;
1673        if let Some(exception) = result.get("exceptionDetails") {
1674            return Err(WebCaptureError::BrowserError(format!(
1675                "Google Docs model extraction script failed: {exception}"
1676            )));
1677        }
1678        let value = result
1679            .pointer("/result/value")
1680            .cloned()
1681            .unwrap_or(Value::Null);
1682        let model_data = browser_model_data_from_value(&value);
1683        poll_count += 1;
1684        let fingerprint = model_data.fingerprint();
1685        last_chunks = model_data.chunks.len();
1686        last_cid_urls = model_data.cid_urls.len();
1687        last_payload_bytes = model_data.chunk_payload_bytes;
1688        let now = Instant::now();
1689        if let Some(stable_for) = quiescence.observe(fingerprint, now, stability_window) {
1690            let mut model_data = model_data;
1691            model_data.poll_count = poll_count;
1692            model_data.stable_for = stable_for;
1693            debug!(
1694                document_id = %document_id,
1695                chunks = model_data.chunks.len(),
1696                cid_urls = model_data.cid_urls.len(),
1697                chunk_payload_bytes = model_data.chunk_payload_bytes,
1698                poll_count,
1699                stable_for_ms = stable_for.as_millis(),
1700                elapsed_ms = started.elapsed().as_millis(),
1701                "captured quiesced Google Docs model chunks through CDP Runtime.evaluate"
1702            );
1703            return Ok(model_data);
1704        }
1705        last_stable_for = quiescence.stable_for(now);
1706        tokio::time::sleep(GDOCS_EDITOR_MODEL_POLL_INTERVAL).await;
1707    }
1708
1709    Err(WebCaptureError::BrowserError(format!(
1710        "Timed out waiting for Google Docs DOCS_modelChunk stream to quiesce for document {document_id} after {} ms (last chunks={last_chunks}, payload_bytes={last_payload_bytes}, cid_urls={last_cid_urls}, poll_count={poll_count}, stable_for_ms={})",
1711        max_wait.as_millis(),
1712        last_stable_for.as_millis()
1713    )))
1714}
1715
1716fn launch_cdp_chrome(
1717    chrome: &std::path::Path,
1718    user_data_dir: &std::path::Path,
1719) -> crate::Result<Child> {
1720    let mut command = Command::new(chrome);
1721    command
1722        .args([
1723            "--headless=new",
1724            "--disable-gpu",
1725            "--disable-extensions",
1726            "--disable-dev-shm-usage",
1727            "--disable-background-networking",
1728            "--disable-component-update",
1729            "--disable-default-apps",
1730            "--disable-sync",
1731            "--metrics-recording-only",
1732            "--no-default-browser-check",
1733            "--no-first-run",
1734            "--no-sandbox",
1735            "--remote-debugging-port=0",
1736            "--window-size=1280,800",
1737        ])
1738        .arg(format!("--user-data-dir={}", user_data_dir.display()))
1739        .arg(format!("--user-agent={GDOCS_USER_AGENT}"))
1740        .stderr(Stdio::piped())
1741        .stdout(Stdio::null())
1742        .kill_on_drop(true);
1743
1744    command.spawn().map_err(|error| {
1745        WebCaptureError::BrowserError(format!("Failed to launch Chrome CDP browser: {error}"))
1746    })
1747}
1748
1749async fn wait_for_devtools_ws_url(child: &mut Child) -> crate::Result<String> {
1750    let stderr = child.stderr.take().ok_or_else(|| {
1751        WebCaptureError::BrowserError("Chrome CDP process did not expose stderr".to_string())
1752    })?;
1753    let mut lines = BufReader::new(stderr).lines();
1754    let started = Instant::now();
1755
1756    while started.elapsed() < GDOCS_BROWSER_LAUNCH_TIMEOUT {
1757        let line = tokio::time::timeout(Duration::from_millis(250), lines.next_line()).await;
1758        match line {
1759            Ok(Ok(Some(line))) => {
1760                if let Some((_, ws_url)) = line.split_once("DevTools listening on ") {
1761                    return Ok(ws_url.trim().to_string());
1762                }
1763            }
1764            Ok(Ok(None)) => {
1765                break;
1766            }
1767            Ok(Err(error)) => {
1768                return Err(WebCaptureError::BrowserError(format!(
1769                    "Failed to read Chrome CDP stderr: {error}"
1770                )));
1771            }
1772            Err(_) => {}
1773        }
1774    }
1775
1776    Err(WebCaptureError::BrowserError(format!(
1777        "Timed out waiting for Chrome DevTools websocket URL after {} ms",
1778        GDOCS_BROWSER_LAUNCH_TIMEOUT.as_millis()
1779    )))
1780}
1781
1782async fn cdp_send(
1783    ws: &mut CdpWebSocket,
1784    next_id: &mut u64,
1785    session_id: Option<&str>,
1786    method: &str,
1787    params: Value,
1788) -> crate::Result<Value> {
1789    *next_id += 1;
1790    let id = *next_id;
1791    let mut message = serde_json::json!({
1792        "id": id,
1793        "method": method,
1794        "params": params
1795    });
1796    if let Some(session_id) = session_id {
1797        message["sessionId"] = Value::String(session_id.to_string());
1798    }
1799
1800    ws.send(Message::Text(message.to_string()))
1801        .await
1802        .map_err(|error| {
1803            WebCaptureError::BrowserError(format!(
1804                "Failed to send Chrome DevTools command {method}: {error}"
1805            ))
1806        })?;
1807
1808    while let Some(message) = ws.next().await {
1809        let message = message.map_err(|error| {
1810            WebCaptureError::BrowserError(format!(
1811                "Failed to read Chrome DevTools response for {method}: {error}"
1812            ))
1813        })?;
1814        if !message.is_text() {
1815            continue;
1816        }
1817        let text = message.to_text().map_err(|error| {
1818            WebCaptureError::BrowserError(format!(
1819                "Chrome DevTools response for {method} was not text: {error}"
1820            ))
1821        })?;
1822        let value = serde_json::from_str::<Value>(text).map_err(|error| {
1823            WebCaptureError::ParseError(format!(
1824                "Failed to parse Chrome DevTools response for {method}: {error}; response={text}"
1825            ))
1826        })?;
1827        if value.get("id").and_then(Value::as_u64) != Some(id) {
1828            continue;
1829        }
1830        if let Some(error) = value.get("error") {
1831            return Err(WebCaptureError::BrowserError(format!(
1832                "Chrome DevTools command {method} failed: {error}"
1833            )));
1834        }
1835        return Ok(value.get("result").cloned().unwrap_or(Value::Null));
1836    }
1837
1838    Err(WebCaptureError::BrowserError(format!(
1839        "Chrome DevTools websocket closed before response for {method}"
1840    )))
1841}
1842
1843fn browser_model_data_from_value(value: &Value) -> BrowserModelData {
1844    let chunks = value
1845        .get("chunks")
1846        .and_then(Value::as_array)
1847        .cloned()
1848        .unwrap_or_default();
1849    let chunk_payload_bytes = model_chunk_payload_bytes(&chunks);
1850    let cid_urls = value
1851        .get("cidUrlMap")
1852        .and_then(Value::as_object)
1853        .map(|map| {
1854            map.iter()
1855                .filter_map(|(key, value)| value.as_str().map(|url| (key.clone(), url.to_string())))
1856                .collect::<HashMap<_, _>>()
1857        })
1858        .unwrap_or_default();
1859    BrowserModelData {
1860        chunks,
1861        cid_urls,
1862        chunk_payload_bytes,
1863        poll_count: 0,
1864        stable_for: Duration::ZERO,
1865    }
1866}
1867
1868fn model_chunk_payload_bytes(chunks: &[Value]) -> usize {
1869    chunks
1870        .iter()
1871        .map(|chunk| serde_json::to_vec(chunk).map_or(0, |encoded| encoded.len()))
1872        .sum()
1873}
1874
1875fn gdocs_editor_model_max_wait() -> Duration {
1876    duration_from_env_ms(
1877        "WEB_CAPTURE_GDOCS_MAX_WAIT_MS",
1878        GDOCS_EDITOR_MODEL_MAX_WAIT_DEFAULT,
1879    )
1880}
1881
1882fn gdocs_editor_model_stability_window() -> Duration {
1883    duration_from_env_ms(
1884        "WEB_CAPTURE_GDOCS_STABILITY_MS",
1885        GDOCS_EDITOR_MODEL_STABILITY_DEFAULT,
1886    )
1887}
1888
1889fn duration_from_env_ms(name: &str, default: Duration) -> Duration {
1890    std::env::var(name).map_or(default, |value| match value.trim().parse::<u64>() {
1891        Ok(ms) => Duration::from_millis(ms),
1892        Err(error) => {
1893            warn!(
1894                name,
1895                value,
1896                error = %error,
1897                default_ms = default.as_millis(),
1898                "ignoring invalid Google Docs model wait environment variable"
1899            );
1900            default
1901        }
1902    })
1903}
1904
1905fn remote_images_from_capture(capture: &CapturedDocument) -> Vec<RemoteImage> {
1906    capture
1907        .images
1908        .iter()
1909        .filter_map(|node| match node {
1910            ContentNode::Image {
1911                url: Some(url),
1912                alt,
1913                ..
1914            } => Some(RemoteImage {
1915                url: url.clone(),
1916                alt: alt.clone(),
1917            }),
1918            ContentNode::Image { .. } | ContentNode::Text { .. } => None,
1919        })
1920        .collect()
1921}
1922
1923/// Render a Google Docs REST API document value.
1924#[must_use]
1925pub fn render_docs_api_document(document: &Value) -> GDocsRenderedOutput {
1926    let blocks = structural_elements_to_blocks(
1927        document
1928            .pointer("/body/content")
1929            .and_then(Value::as_array)
1930            .map_or(&[] as &[Value], Vec::as_slice),
1931        document.pointer("/inlineObjects").unwrap_or(&Value::Null),
1932    );
1933    GDocsRenderedOutput {
1934        markdown: render_blocks_markdown(&blocks),
1935        html: render_blocks_html(&blocks),
1936        text: blocks_to_text(&blocks),
1937    }
1938}
1939
/// Rendered document output.
///
/// Holds the same document rendered three ways; `render_docs_api_document`
/// fills all three fields from one list of blocks.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GDocsRenderedOutput {
    /// Markdown rendering of the document.
    pub markdown: String,
    /// HTML rendering of the document.
    pub html: String,
    /// Plain-text rendering of the document.
    pub text: String,
}
1950
1951fn structural_elements_to_blocks(elements: &[Value], inline_objects: &Value) -> Vec<CapturedBlock> {
1952    let mut blocks = Vec::new();
1953    for element in elements {
1954        if let Some(paragraph) = element.get("paragraph") {
1955            let content = paragraph_to_content(paragraph, inline_objects);
1956            if !content_to_text(&content).trim().is_empty()
1957                || content
1958                    .iter()
1959                    .any(|node| matches!(node, ContentNode::Image { .. }))
1960            {
1961                blocks.push(CapturedBlock::Paragraph {
1962                    style: paragraph
1963                        .pointer("/paragraphStyle/namedStyleType")
1964                        .and_then(Value::as_str)
1965                        .map(ToString::to_string),
1966                    list: None,
1967                    quote: false,
1968                    horizontal_rule: false,
1969                    content,
1970                });
1971            }
1972        } else if let Some(table) = element.get("table") {
1973            blocks.push(CapturedBlock::Table(table_to_block(table, inline_objects)));
1974        }
1975    }
1976    blocks
1977}
1978
1979fn table_to_block(table: &Value, inline_objects: &Value) -> TableBlock {
1980    let rows = table
1981        .get("tableRows")
1982        .and_then(Value::as_array)
1983        .map_or(&[] as &[Value], Vec::as_slice)
1984        .iter()
1985        .map(|row| TableRow {
1986            cells: row
1987                .get("tableCells")
1988                .and_then(Value::as_array)
1989                .map_or(&[] as &[Value], Vec::as_slice)
1990                .iter()
1991                .map(|cell| TableCell {
1992                    content: structural_elements_to_inline_content(
1993                        cell.get("content")
1994                            .and_then(Value::as_array)
1995                            .map_or(&[] as &[Value], Vec::as_slice),
1996                        inline_objects,
1997                    ),
1998                })
1999                .collect(),
2000        })
2001        .collect();
2002    TableBlock { rows }
2003}
2004
2005fn structural_elements_to_inline_content(
2006    elements: &[Value],
2007    inline_objects: &Value,
2008) -> Vec<ContentNode> {
2009    let mut content = Vec::new();
2010    for element in elements {
2011        if let Some(paragraph) = element.get("paragraph") {
2012            let paragraph_content = paragraph_to_content(paragraph, inline_objects);
2013            if !content.is_empty() && !paragraph_content.is_empty() {
2014                append_text(&mut content, "\n");
2015            }
2016            content.extend(paragraph_content);
2017        } else if let Some(table) = element.get("table") {
2018            append_text(
2019                &mut content,
2020                &render_blocks_markdown(&[CapturedBlock::Table(table_to_block(
2021                    table,
2022                    inline_objects,
2023                ))]),
2024            );
2025        }
2026    }
2027    content
2028}
2029
2030fn paragraph_to_content(paragraph: &Value, inline_objects: &Value) -> Vec<ContentNode> {
2031    let mut content = Vec::new();
2032    for element in paragraph
2033        .get("elements")
2034        .and_then(Value::as_array)
2035        .map_or(&[] as &[Value], Vec::as_slice)
2036    {
2037        if let Some(text) = element
2038            .pointer("/textRun/content")
2039            .and_then(Value::as_str)
2040            .map(|text| text.strip_suffix('\n').unwrap_or(text))
2041        {
2042            append_text(&mut content, text);
2043        } else if let Some(inline_id) = element
2044            .pointer("/inlineObjectElement/inlineObjectId")
2045            .and_then(Value::as_str)
2046        {
2047            if let Some(image) = inline_object_to_image(inline_id, inline_objects) {
2048                content.push(image);
2049            }
2050        }
2051    }
2052    content
2053}
2054
2055fn inline_object_to_image(inline_id: &str, inline_objects: &Value) -> Option<ContentNode> {
2056    let embedded = inline_objects
2057        .get(inline_id)?
2058        .pointer("/inlineObjectProperties/embeddedObject")?;
2059    let url = embedded
2060        .pointer("/imageProperties/contentUri")
2061        .or_else(|| embedded.pointer("/imageProperties/sourceUri"))
2062        .and_then(Value::as_str)?;
2063    let alt = embedded
2064        .get("title")
2065        .or_else(|| embedded.get("description"))
2066        .and_then(Value::as_str)
2067        .unwrap_or("image");
2068    Some(ContentNode::Image {
2069        cid: None,
2070        url: Some(url.to_string()),
2071        alt: alt.to_string(),
2072        width: json_dimension_to_string(embedded.pointer("/size/width/magnitude")),
2073        height: json_dimension_to_string(embedded.pointer("/size/height/magnitude")),
2074        is_suggestion: false,
2075    })
2076}
2077
2078fn json_dimension_to_string(value: Option<&Value>) -> Option<String> {
2079    match value? {
2080        Value::Number(number) => Some(number.to_string()),
2081        Value::String(text) if !text.is_empty() => Some(text.clone()),
2082        _ => None,
2083    }
2084}
2085
2086fn build_model_style_maps(
2087    items: &[Value],
2088    text_len: usize,
2089    utf16_position_map: &[usize],
2090) -> ModelStyleMaps {
2091    let mut maps = ModelStyleMaps {
2092        inline_styles: vec![TextStyle::default(); text_len],
2093        ..ModelStyleMaps::default()
2094    };
2095
2096    for item in items {
2097        if item.get("ty").and_then(Value::as_str) != Some("as") {
2098            continue;
2099        }
2100        let (Some(start), Some(end), Some(style_type)) = (
2101            item.get("si").and_then(Value::as_u64),
2102            item.get("ei").and_then(Value::as_u64),
2103            item.get("st").and_then(Value::as_str),
2104        ) else {
2105            continue;
2106        };
2107        let (Ok(start), Ok(end)) = (usize::try_from(start), usize::try_from(end)) else {
2108            continue;
2109        };
2110
2111        let start = utf16_position_to_char_position(utf16_position_map, start);
2112        let end = utf16_position_to_char_position(utf16_position_map, end);
2113        if start == 0 || end == 0 {
2114            continue;
2115        }
2116
2117        match style_type {
2118            "text" => {
2119                let style = text_style(item);
2120                apply_inline_style(&mut maps.inline_styles, start, end, &style);
2121            }
2122            "link" => {
2123                let style = TextStyle {
2124                    link: item
2125                        .pointer("/sm/lnks_link/ulnk_url")
2126                        .and_then(Value::as_str)
2127                        .map(ToString::to_string),
2128                    ..TextStyle::default()
2129                };
2130                apply_inline_style(&mut maps.inline_styles, start, end, &style);
2131            }
2132            "paragraph" => {
2133                maps.paragraph_by_end
2134                    .insert(end, paragraph_style_from_model(item));
2135            }
2136            "list" => {
2137                maps.list_by_end.insert(
2138                    end,
2139                    ListMeta {
2140                        id: item
2141                            .pointer("/sm/ls_id")
2142                            .and_then(Value::as_str)
2143                            .unwrap_or("")
2144                            .to_string(),
2145                        level: item
2146                            .pointer("/sm/ls_nest")
2147                            .and_then(Value::as_u64)
2148                            .and_then(|value| usize::try_from(value).ok())
2149                            .unwrap_or(0),
2150                        ordered: false,
2151                    },
2152                );
2153            }
2154            "horizontal_rule" => {
2155                maps.horizontal_rules.insert(end);
2156            }
2157            _ => {}
2158        }
2159    }
2160
2161    maps
2162}
2163
2164fn apply_inline_style(styles: &mut [TextStyle], start: usize, end: usize, patch: &TextStyle) {
2165    let from = start.saturating_sub(1);
2166    let to = end.min(styles.len());
2167    if from >= to {
2168        return;
2169    }
2170    for style in &mut styles[from..to] {
2171        if patch.bold {
2172            style.bold = true;
2173        }
2174        if patch.italic {
2175            style.italic = true;
2176        }
2177        if patch.strike {
2178            style.strike = true;
2179        }
2180        if patch.link.is_some() {
2181            style.link.clone_from(&patch.link);
2182        }
2183    }
2184}
2185
2186fn text_style(item: &Value) -> TextStyle {
2187    TextStyle {
2188        bold: item.pointer("/sm/ts_bd").and_then(Value::as_bool) == Some(true)
2189            && item.pointer("/sm/ts_bd_i").and_then(Value::as_bool) != Some(true),
2190        italic: item.pointer("/sm/ts_it").and_then(Value::as_bool) == Some(true)
2191            && item.pointer("/sm/ts_it_i").and_then(Value::as_bool) != Some(true),
2192        strike: item.pointer("/sm/ts_st").and_then(Value::as_bool) == Some(true)
2193            && item.pointer("/sm/ts_st_i").and_then(Value::as_bool) != Some(true),
2194        link: None,
2195    }
2196}
2197
2198fn paragraph_style_from_model(item: &Value) -> ParagraphStyle {
2199    let heading = item.pointer("/sm/ps_hd").and_then(Value::as_u64);
2200    ParagraphStyle {
2201        style: heading.map(|level| format!("HEADING_{level}")),
2202        indent_start: item
2203            .pointer("/sm/ps_il")
2204            .and_then(Value::as_f64)
2205            .unwrap_or(0.0),
2206        indent_first_line: item
2207            .pointer("/sm/ps_ifl")
2208            .and_then(Value::as_f64)
2209            .unwrap_or(0.0),
2210    }
2211}
2212
/// Builds a table from 1-based UTF-16 code-unit positions to 1-based
/// character positions of `text`.
///
/// Index 0 always holds 0. Every code unit of a character that needs two
/// UTF-16 units maps to that same character position, so the table has one
/// entry per code unit plus the leading zero.
fn build_utf16_position_map(text: &str) -> Vec<usize> {
    let mut map = Vec::with_capacity(text.encode_utf16().count() + 1);
    map.push(0);
    for (index, ch) in text.chars().enumerate() {
        // Append one entry per UTF-16 unit of this character.
        map.resize(map.len() + ch.len_utf16(), index + 1);
    }
    map
}
2227
/// Looks up the character position for a UTF-16 position.
///
/// When `position` is outside the table or its entry is zero, the last
/// non-zero entry of the table is returned instead; the result is 0 only
/// when the table has no non-zero entry at all.
fn utf16_position_to_char_position(map: &[usize], position: usize) -> usize {
    match map.get(position) {
        Some(&char_position) if char_position > 0 => char_position,
        _ => map
            .iter()
            .rev()
            .copied()
            .find(|&char_position| char_position > 0)
            .unwrap_or(0),
    }
}
2235
2236/// Parse captured `DOCS_modelChunk` values.
2237#[must_use]
2238pub fn parse_model_chunks<S: BuildHasher>(
2239    chunks: &[Value],
2240    cid_urls: &HashMap<String, String, S>,
2241) -> CapturedDocument {
2242    parse_model_chunks_with_export_html(chunks, cid_urls, None)
2243}
2244
/// Parse captured `DOCS_modelChunk` values and optionally merge semantic hints
/// from Google Docs export HTML.
///
/// The text of all `"is"`/`"iss"` items is concatenated and walked one
/// character at a time. Control characters drive the block structure:
/// `0x10` opens a table, `0x11` closes it, `0x12` starts a row, `0x1c` starts
/// a cell, `0x0a` ends a paragraph (or separates cells inside a table) and
/// `0x0b` becomes a `"\n"` inside the current paragraph or cell. Every other
/// character is appended with its inline style, preceded by the image
/// anchored at that position, if any.
///
/// `cid_urls` maps an image's content id to its URL. When `export_html` is
/// given, list-ordering and quote hints extracted from it are applied to the
/// paragraph blocks and the document text is recomputed.
#[must_use]
#[allow(clippy::too_many_lines)]
pub fn parse_model_chunks_with_export_html<S: BuildHasher>(
    chunks: &[Value],
    cid_urls: &HashMap<String, String, S>,
    export_html: Option<&str>,
) -> CapturedDocument {
    let items = collect_model_items(chunks);
    // The document text is the concatenation of the `s` strings of all
    // `"is"`/`"iss"` items, in item order.
    let full_text = items
        .iter()
        .filter(|item| matches!(item.get("ty").and_then(Value::as_str), Some("is" | "iss")))
        .filter_map(|item| item.get("s").and_then(Value::as_str))
        .collect::<String>();
    let chars: Vec<char> = full_text.chars().collect();
    // Positions in the model items are converted from UTF-16 positions to
    // character positions through this table.
    let utf16_position_map = build_utf16_position_map(&full_text);
    let style_maps = build_model_style_maps(&items, chars.len(), &utf16_position_map);

    // Element id -> 0-based character index, taken from `"te"`/`"ste"` items.
    let mut positions = HashMap::new();
    for item in &items {
        if matches!(item.get("ty").and_then(Value::as_str), Some("te" | "ste")) {
            if let (Some(id), Some(pos)) = (
                item.get("id").and_then(Value::as_str),
                item.get("spi").and_then(Value::as_u64),
            ) {
                if let Ok(pos) = usize::try_from(pos) {
                    positions.insert(
                        id.to_string(),
                        utf16_position_to_char_position(&utf16_position_map, pos).saturating_sub(1),
                    );
                }
            }
        }
    }

    // Image nodes from `"ae"`/`"ase"` items, keyed by the character index of
    // the element with the same id. Items without a known position are
    // dropped; `"ase"` items are flagged as suggestions.
    let mut images_by_pos: HashMap<usize, ContentNode> = HashMap::new();
    let mut images = Vec::new();
    for item in &items {
        let ty = item.get("ty").and_then(Value::as_str);
        if !matches!(ty, Some("ae" | "ase")) {
            continue;
        }
        let Some(id) = item.get("id").and_then(Value::as_str) else {
            continue;
        };
        let Some(pos) = positions.get(id).copied() else {
            continue;
        };
        let cid = item
            .pointer("/epm/ee_eo/i_cid")
            .and_then(Value::as_str)
            .map(ToString::to_string);
        let node = ContentNode::Image {
            // The URL is only known when the content id appears in `cid_urls`.
            url: cid.as_ref().and_then(|cid| cid_urls.get(cid).cloned()),
            cid,
            alt: item
                .pointer("/epm/ee_eo/eo_ad")
                .and_then(Value::as_str)
                .unwrap_or_else(|| {
                    if ty == Some("ase") {
                        "suggested image"
                    } else {
                        "image"
                    }
                })
                .to_string(),
            width: json_dimension_to_string(item.pointer("/epm/ee_eo/i_wth")),
            height: json_dimension_to_string(item.pointer("/epm/ee_eo/i_ht")),
            is_suggestion: ty == Some("ase"),
        };
        images_by_pos.insert(pos, node.clone());
        images.push(node);
    }

    // State of the walk: the finished blocks and tables, the paragraph being
    // built, and the table/row/cell currently open (if any).
    let mut blocks = Vec::new();
    let mut tables = Vec::new();
    let mut paragraph = Vec::new();
    let mut table: Option<TableBlock> = None;
    let mut row: Option<TableRow> = None;
    let mut cell: Option<TableCell> = None;
    // Most recent table-structure character seen, reset by ordinary content.
    let mut previous_table_control: Option<u32> = None;
    // Set when the newline directly after a cell marker must not open
    // another cell.
    let mut skip_next_table_newline = false;

    for (idx, ch) in chars.iter().copied().enumerate() {
        match ch as u32 {
            // Table start: close the pending paragraph and open a new table.
            0x10 => {
                flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
                table = Some(TableBlock::default());
                previous_table_control = Some(0x10);
                skip_next_table_newline = false;
            }
            // Table end: emit the table into both `tables` and `blocks`.
            0x11 => {
                flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
                previous_table_control = None;
                skip_next_table_newline = false;
            }
            // Row start: finish the previous row (dropping an empty trailing
            // cell) and open a new one.
            0x12 => {
                flush_row(&mut row, &mut cell, table.as_mut(), true);
                row = Some(TableRow::default());
                previous_table_control = Some(0x12);
                skip_next_table_newline = false;
            }
            // Cell start.
            0x1c => {
                // A newline has just opened a still-empty cell, so this
                // marker must not open a second one.
                if cell.as_ref().is_none_or(cell_is_empty) && previous_table_control == Some(0x0a) {
                    previous_table_control = Some(0x1c);
                    continue;
                }
                let had_content = cell.as_ref().is_some_and(|cell| !cell_is_empty(cell));
                flush_cell(&mut row, &mut cell, false);
                if row.is_none() {
                    row = Some(TableRow::default());
                }
                cell = Some(TableCell::default());
                // When the closed cell had content and a newline follows
                // immediately, that newline is swallowed by the `0x0a` arm.
                if had_content && chars.get(idx + 1).is_some_and(|ch| *ch as u32 == 0x0a) {
                    skip_next_table_newline = true;
                }
                previous_table_control = Some(0x1c);
            }
            0x0a => {
                if table.is_some() {
                    if skip_next_table_newline {
                        skip_next_table_newline = false;
                        previous_table_control = Some(0x0a);
                        continue;
                    }
                    // Inside a table, a bare newline separates cells within the
                    // current row (rows are delimited by 0x12/0x11). See R2.
                    flush_cell(&mut row, &mut cell, false);
                    if row.is_none() {
                        row = Some(TableRow::default());
                    }
                    cell = Some(TableCell::default());
                    previous_table_control = Some(0x0a);
                } else {
                    // Outside a table a newline ends the current paragraph.
                    flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
                }
            }
            // 0x0b is kept as an unstyled "\n" inside the current paragraph
            // or cell instead of ending it.
            0x0b => {
                append_to_current(
                    &mut paragraph,
                    &mut row,
                    &mut cell,
                    table.is_some(),
                    "\n",
                    TextStyle::default(),
                );
                previous_table_control = None;
                skip_next_table_newline = false;
            }
            _ => {
                // Emit the image anchored at this index first. When the
                // character here is '*', it is not emitted as text.
                if let Some(image) = images_by_pos.get(&idx).cloned() {
                    push_to_current(&mut paragraph, &mut row, &mut cell, table.is_some(), image);
                    previous_table_control = None;
                    skip_next_table_newline = false;
                    if ch == '*' {
                        continue;
                    }
                }
                append_to_current(
                    &mut paragraph,
                    &mut row,
                    &mut cell,
                    table.is_some(),
                    &ch.to_string(),
                    style_maps
                        .inline_styles
                        .get(idx)
                        .cloned()
                        .unwrap_or_default(),
                );
                previous_table_control = None;
                skip_next_table_newline = false;
            }
        }
    }

    // Flush whatever is still open once the text is exhausted.
    if table.is_some() {
        flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
    }
    flush_paragraph(&mut paragraph, &mut blocks, Some(chars.len()), &style_maps);

    let mut capture = CapturedDocument {
        text: blocks_to_text(&blocks),
        blocks,
        tables,
        images,
    };
    if let Some(export_html) = export_html {
        // The hints can change list/quote flags, so the text is rebuilt.
        apply_export_semantic_hints(&mut capture.blocks, export_html);
        capture.text = blocks_to_text(&capture.blocks);
    }
    capture
}
2439
2440fn collect_model_items(chunks: &[Value]) -> Vec<Value> {
2441    let mut items = Vec::new();
2442    for chunk in chunks {
2443        if let Some(array) = chunk.as_array() {
2444            items.extend(array.iter().cloned());
2445        } else if let Some(array) = chunk.get("chunk").and_then(Value::as_array) {
2446            items.extend(array.iter().cloned());
2447        } else if chunk.get("ty").and_then(Value::as_str).is_some() {
2448            items.push(chunk.clone());
2449        }
2450    }
2451    items
2452}
2453
2454fn flush_paragraph(
2455    paragraph: &mut Vec<ContentNode>,
2456    blocks: &mut Vec<CapturedBlock>,
2457    end_pos: Option<usize>,
2458    style_maps: &ModelStyleMaps,
2459) {
2460    if !content_to_text(paragraph).trim().is_empty()
2461        || paragraph
2462            .iter()
2463            .any(|node| matches!(node, ContentNode::Image { .. }))
2464    {
2465        let meta =
2466            paragraph_meta_for_end_position(style_maps, end_pos, content_to_text(paragraph).trim());
2467        blocks.push(CapturedBlock::Paragraph {
2468            content: std::mem::take(paragraph),
2469            style: meta.style,
2470            list: meta.list,
2471            quote: meta.quote,
2472            horizontal_rule: meta.horizontal_rule,
2473        });
2474    } else {
2475        paragraph.clear();
2476    }
2477}
2478
2479fn paragraph_meta_for_end_position(
2480    style_maps: &ModelStyleMaps,
2481    end_pos: Option<usize>,
2482    text: &str,
2483) -> ParagraphMeta {
2484    let Some(end_pos) = end_pos else {
2485        return ParagraphMeta::default();
2486    };
2487    let paragraph_style = style_maps.paragraph_by_end.get(&end_pos);
2488    let mut meta = ParagraphMeta {
2489        style: paragraph_style.and_then(|style| style.style.clone()),
2490        ..ParagraphMeta::default()
2491    };
2492
2493    if let Some(list) = style_maps.list_by_end.get(&end_pos) {
2494        let mut list = list.clone();
2495        list.ordered = infer_ordered_list(&list, text);
2496        meta.list = Some(list);
2497    } else if paragraph_style.is_some_and(|style| {
2498        style.indent_start > 0.0
2499            && (style.indent_start - style.indent_first_line).abs() < f64::EPSILON
2500    }) {
2501        meta.quote = true;
2502    }
2503
2504    meta.horizontal_rule = (style_maps.horizontal_rules.contains(&end_pos)
2505        || end_pos
2506            .checked_sub(1)
2507            .is_some_and(|pos| style_maps.horizontal_rules.contains(&pos)))
2508        && text.trim().chars().all(|ch| ch == '-');
2509    meta
2510}
2511
/// Decides whether a list item belongs to an ordered list.
///
/// Currently always returns `false`; both arguments are ignored.
const fn infer_ordered_list(_list: &ListMeta, _text: &str) -> bool {
    false
}
2515
2516fn apply_export_semantic_hints(blocks: &mut [CapturedBlock], export_html: &str) {
2517    let hints = extract_export_semantic_hints(export_html);
2518    let mut cursor = 0usize;
2519    for block in blocks {
2520        let CapturedBlock::Paragraph {
2521            content,
2522            list,
2523            quote,
2524            ..
2525        } = block
2526        else {
2527            continue;
2528        };
2529        let text = normalize_semantic_text(&content_to_text(content));
2530        if text.is_empty() {
2531            continue;
2532        }
2533        let Some((index, hint)) = find_next_semantic_hint(&hints, &text, cursor, list.is_some())
2534        else {
2535            continue;
2536        };
2537        cursor = index + 1;
2538        if let Some(list) = list.as_mut() {
2539            if let Some(ordered) = hint.list_ordered {
2540                list.ordered = ordered;
2541            }
2542        } else {
2543            *quote = hint.quote;
2544        }
2545    }
2546}
2547
2548fn find_next_semantic_hint<'a>(
2549    hints: &'a [ExportSemanticHint],
2550    text: &str,
2551    cursor: usize,
2552    needs_list_hint: bool,
2553) -> Option<(usize, &'a ExportSemanticHint)> {
2554    hints.iter().enumerate().skip(cursor).find(|(_, hint)| {
2555        hint.text == text
2556            && if needs_list_hint {
2557                hint.list_ordered.is_some()
2558            } else {
2559                hint.list_ordered.is_none()
2560            }
2561    })
2562}
2563
2564fn extract_export_semantic_hints(export_html: &str) -> Vec<ExportSemanticHint> {
2565    let preprocessed = preprocess_google_docs_export_html(export_html).html;
2566    let document = Html::parse_document(&preprocessed);
2567    let selector =
2568        Selector::parse("body h1,body h2,body h3,body h4,body h5,body h6,body p,body li")
2569            .expect("valid semantic hint selector");
2570    document
2571        .select(&selector)
2572        .filter_map(|element| {
2573            let tag = element.value().name();
2574            let text = export_element_semantic_text(&element);
2575            if text.is_empty() {
2576                return None;
2577            }
2578            let list_ordered = if tag == "li" {
2579                nearest_list_is_ordered(&element)
2580            } else {
2581                None
2582            };
2583            Some(ExportSemanticHint {
2584                text,
2585                list_ordered,
2586                quote: tag != "li" && has_ancestor_tag(&element, "blockquote"),
2587            })
2588        })
2589        .collect()
2590}
2591
2592fn export_element_semantic_text(element: &ElementRef<'_>) -> String {
2593    let raw_text = if element.value().name() == "li" {
2594        list_item_own_text(element)
2595    } else {
2596        element.text().collect()
2597    };
2598    normalize_semantic_text(&raw_text)
2599}
2600
2601fn list_item_own_text(element: &ElementRef<'_>) -> String {
2602    let mut text = String::new();
2603    let mut stack: Vec<_> = element.children().collect();
2604    stack.reverse();
2605
2606    while let Some(node) = stack.pop() {
2607        match node.value() {
2608            Node::Text(value) => text.push_str(value),
2609            Node::Element(child) if matches!(child.name(), "ol" | "ul") => {}
2610            Node::Element(_) => {
2611                let mut children: Vec<_> = node.children().collect();
2612                children.reverse();
2613                stack.extend(children);
2614            }
2615            _ => {}
2616        }
2617    }
2618
2619    text
2620}
2621
2622fn nearest_list_is_ordered(element: &ElementRef<'_>) -> Option<bool> {
2623    element
2624        .ancestors()
2625        .filter_map(ElementRef::wrap)
2626        .find_map(|ancestor| match ancestor.value().name() {
2627            "ol" => Some(true),
2628            "ul" => Some(false),
2629            _ => None,
2630        })
2631}
2632
2633fn has_ancestor_tag(element: &ElementRef<'_>, tag: &str) -> bool {
2634    element
2635        .ancestors()
2636        .filter_map(ElementRef::wrap)
2637        .any(|ancestor| ancestor.value().name() == tag)
2638}
2639
/// Collapses every run of whitespace in `text` to a single space and removes
/// leading and trailing whitespace.
///
/// Non-breaking spaces (U+00A0) are treated like any other whitespace:
/// `split_whitespace` splits on the Unicode `White_Space` property, which
/// includes U+00A0, so no separate replacement pass is needed.
fn normalize_semantic_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
2646
2647fn cell_is_empty(cell: &TableCell) -> bool {
2648    cell.content.iter().all(|node| match node {
2649        ContentNode::Text { text, .. } => text.trim().is_empty(),
2650        ContentNode::Image { .. } => false,
2651    })
2652}
2653
2654fn row_is_empty(row: &TableRow) -> bool {
2655    row.cells.is_empty() || row.cells.iter().all(cell_is_empty)
2656}
2657
2658fn flush_cell(row: &mut Option<TableRow>, cell: &mut Option<TableCell>, drop_empty: bool) {
2659    if let (Some(row), Some(cell)) = (row.as_mut(), cell.take()) {
2660        if drop_empty && cell_is_empty(&cell) {
2661            return;
2662        }
2663        row.cells.push(cell);
2664    }
2665}
2666
2667fn flush_row(
2668    row: &mut Option<TableRow>,
2669    cell: &mut Option<TableCell>,
2670    table: Option<&mut TableBlock>,
2671    drop_empty_trailing_cell: bool,
2672) {
2673    flush_cell(row, cell, drop_empty_trailing_cell);
2674    if let (Some(table), Some(row)) = (table, row.take()) {
2675        table.rows.push(row);
2676    }
2677}
2678
2679fn flush_table(
2680    table: &mut Option<TableBlock>,
2681    row: &mut Option<TableRow>,
2682    cell: &mut Option<TableCell>,
2683    tables: &mut Vec<TableBlock>,
2684    blocks: &mut Vec<CapturedBlock>,
2685) {
2686    flush_row(row, cell, table.as_mut(), true);
2687    if let Some(mut table) = table.take() {
2688        // Drop trailing empty rows that can be introduced by '\n' immediately
2689        // before the 0x11 table-close marker. See R2.
2690        while table.rows.last().is_some_and(row_is_empty) {
2691            table.rows.pop();
2692        }
2693        tables.push(table.clone());
2694        blocks.push(CapturedBlock::Table(table));
2695    }
2696}
2697
2698fn push_to_current(
2699    paragraph: &mut Vec<ContentNode>,
2700    row: &mut Option<TableRow>,
2701    cell: &mut Option<TableCell>,
2702    in_table: bool,
2703    node: ContentNode,
2704) {
2705    if in_table {
2706        if row.is_none() {
2707            *row = Some(TableRow::default());
2708        }
2709        if cell.is_none() {
2710            *cell = Some(TableCell::default());
2711        }
2712        if let Some(cell) = cell.as_mut() {
2713            cell.content.push(node);
2714        }
2715    } else {
2716        paragraph.push(node);
2717    }
2718}
2719
2720fn append_to_current(
2721    paragraph: &mut Vec<ContentNode>,
2722    row: &mut Option<TableRow>,
2723    cell: &mut Option<TableCell>,
2724    in_table: bool,
2725    text: &str,
2726    style: TextStyle,
2727) {
2728    if in_table {
2729        if row.is_none() {
2730            *row = Some(TableRow::default());
2731        }
2732        if cell.is_none() {
2733            *cell = Some(TableCell::default());
2734        }
2735        if let Some(cell) = cell.as_mut() {
2736            append_styled_text(&mut cell.content, text, style);
2737        }
2738    } else {
2739        append_styled_text(paragraph, text, style);
2740    }
2741}
2742
2743fn append_text(content: &mut Vec<ContentNode>, text: &str) {
2744    append_styled_text(content, text, TextStyle::default());
2745}
2746
2747fn append_styled_text(content: &mut Vec<ContentNode>, text: &str, style: TextStyle) {
2748    if text.is_empty() {
2749        return;
2750    }
2751    if let Some(ContentNode::Text {
2752        text: last,
2753        bold,
2754        italic,
2755        strike,
2756        link,
2757    }) = content.last_mut()
2758    {
2759        let last_style = TextStyle {
2760            bold: *bold,
2761            italic: *italic,
2762            strike: *strike,
2763            link: link.clone(),
2764        };
2765        if last_style == style {
2766            last.push_str(text);
2767            return;
2768        }
2769    }
2770    content.push(ContentNode::Text {
2771        text: text.to_string(),
2772        bold: style.bold,
2773        italic: style.italic,
2774        strike: style.strike,
2775        link: style.link,
2776    });
2777}
2778
2779/// Render a parsed Google Docs capture as Markdown, HTML, or text.
2780#[must_use]
2781pub fn render_captured_document(capture: &CapturedDocument, format: &str) -> String {
2782    match format.to_lowercase().as_str() {
2783        "html" => render_blocks_html(&capture.blocks),
2784        "txt" | "text" => blocks_to_text(&capture.blocks),
2785        _ => render_blocks_markdown(&capture.blocks),
2786    }
2787}
2788
/// One rendered block plus enough context for `render_blocks_markdown` to
/// choose a Markdown-safe separator.
struct RenderedBlock {
    /// Markdown text of the block, without any leading or trailing separator.
    markdown: String,
    /// Id of the list the paragraph belongs to; `None` for tables and for
    /// paragraphs that are not list items.
    list_id: Option<String>,
    /// Whether the paragraph was flagged as a blockquote; always `false` for
    /// tables.
    quote: bool,
}
2796
2797fn render_blocks_markdown(blocks: &[CapturedBlock]) -> String {
2798    // Track an ordered-list counter per (list.id, level) so ordered items are
2799    // numbered sequentially 1., 2., 3., ... instead of all being "1.". See R3.
2800    // When we re-enter a shallower list level, deeper counters reset so a new
2801    // parent restarts its children at 1.
2802    let mut counters: HashMap<(String, usize), usize> = HashMap::new();
2803    let mut rendered: Vec<RenderedBlock> = Vec::new();
2804
2805    for block in blocks {
2806        match block {
2807            CapturedBlock::Paragraph {
2808                content,
2809                style,
2810                list,
2811                quote,
2812                horizontal_rule,
2813            } => {
2814                let text = render_content_markdown(content).trim().to_string();
2815                if text.is_empty() {
2816                    continue;
2817                }
2818                let ordered_index = list.as_ref().and_then(|list_meta| {
2819                    if !list_meta.ordered {
2820                        return None;
2821                    }
2822                    // Reset counters for deeper levels when we move up to a
2823                    // shallower level — otherwise a new parent item would see
2824                    // its previous children's final count.
2825                    let key = (list_meta.id.clone(), list_meta.level);
2826                    counters.retain(|(id, level), _| {
2827                        !(id == &list_meta.id && *level > list_meta.level)
2828                    });
2829                    let next = counters.entry(key).or_insert(0);
2830                    *next += 1;
2831                    Some(*next)
2832                });
2833                let markdown = render_paragraph_markdown(
2834                    &text,
2835                    style.as_deref(),
2836                    list.as_ref(),
2837                    *quote,
2838                    *horizontal_rule,
2839                    ordered_index,
2840                );
2841                rendered.push(RenderedBlock {
2842                    markdown,
2843                    list_id: list.as_ref().map(|l| l.id.clone()),
2844                    quote: *quote,
2845                });
2846            }
2847            CapturedBlock::Table(table) => {
2848                rendered.push(RenderedBlock {
2849                    markdown: render_table_markdown(table),
2850                    list_id: None,
2851                    quote: false,
2852                });
2853            }
2854        }
2855    }
2856
2857    // Choose separator per adjacent pair: consecutive items from the same
2858    // Google Docs list use a single newline, including nested levels; adjacent
2859    // blockquote paragraphs keep a quoted blank line between them.
2860    let mut out = String::new();
2861    for (idx, block) in rendered.iter().enumerate() {
2862        if idx == 0 {
2863            out.push_str(&block.markdown);
2864            continue;
2865        }
2866        let prev = &rendered[idx - 1];
2867        if block.list_id.is_some() && prev.list_id.is_some() {
2868            out.push('\n');
2869        } else if block.quote && prev.quote {
2870            out.push_str("\n>\n");
2871        } else {
2872            out.push_str("\n\n");
2873        }
2874        out.push_str(&block.markdown);
2875    }
2876    if !out.is_empty() && !out.ends_with('\n') {
2877        out.push('\n');
2878    }
2879    out
2880}
2881
2882fn render_paragraph_markdown(
2883    text: &str,
2884    style: Option<&str>,
2885    list: Option<&ListMeta>,
2886    quote: bool,
2887    horizontal_rule: bool,
2888    ordered_index: Option<usize>,
2889) -> String {
2890    if horizontal_rule {
2891        return "---".to_string();
2892    }
2893    match style {
2894        Some("TITLE") => format!("# {text}"),
2895        Some("SUBTITLE") => format!("## {text}"),
2896        Some(style) if style.starts_with("HEADING_") => {
2897            let level = style
2898                .trim_start_matches("HEADING_")
2899                .parse::<usize>()
2900                .unwrap_or(1);
2901            format!("{} {text}", "#".repeat(level.clamp(1, 6)))
2902        }
2903        _ => list.map_or_else(
2904            || {
2905                if quote {
2906                    text.lines()
2907                        .map(|line| {
2908                            if line.is_empty() {
2909                                ">".to_string()
2910                            } else {
2911                                format!("> {line}")
2912                            }
2913                        })
2914                        .collect::<Vec<_>>()
2915                        .join("\n")
2916                } else {
2917                    text.to_string()
2918                }
2919            },
2920            |list| {
2921                let indent = "    ".repeat(list.level);
2922                let marker = if list.ordered {
2923                    format!("{}.", ordered_index.unwrap_or(1))
2924                } else {
2925                    "-".to_string()
2926                };
2927                format!("{indent}{marker} {text}")
2928            },
2929        ),
2930    }
2931}
2932
2933fn render_table_markdown(table: &TableBlock) -> String {
2934    if table.rows.is_empty() {
2935        return String::new();
2936    }
2937    let width = table
2938        .rows
2939        .iter()
2940        .map(|row| row.cells.len())
2941        .max()
2942        .unwrap_or(1);
2943    let rows = table
2944        .rows
2945        .iter()
2946        .map(|row| {
2947            (0..width)
2948                .map(|idx| {
2949                    row.cells.get(idx).map_or_else(String::new, |cell| {
2950                        escape_markdown_table_cell(&render_content_markdown(&cell.content))
2951                    })
2952                })
2953                .collect::<Vec<_>>()
2954        })
2955        .collect::<Vec<_>>();
2956    let separator = vec!["---".to_string(); width];
2957    std::iter::once(&rows[0])
2958        .chain(std::iter::once(&separator))
2959        .chain(rows.iter().skip(1))
2960        .map(|row| format!("| {} |", row.join(" | ")))
2961        .collect::<Vec<_>>()
2962        .join("\n")
2963}
2964
2965fn render_content_markdown(content: &[ContentNode]) -> String {
2966    let mut rendered = String::new();
2967    let mut idx = 0usize;
2968    while idx < content.len() {
2969        match &content[idx] {
2970            ContentNode::Text {
2971                text,
2972                bold,
2973                italic,
2974                strike,
2975                link,
2976            } => {
2977                let link_target = link.as_deref();
2978                let mut runs = vec![(text.as_str(), *bold, *italic, *strike)];
2979                idx += 1;
2980                while let Some(ContentNode::Text {
2981                    text,
2982                    bold,
2983                    italic,
2984                    strike,
2985                    link: next_link,
2986                }) = content.get(idx)
2987                {
2988                    if next_link.as_deref() != link_target {
2989                        break;
2990                    }
2991                    runs.push((text.as_str(), *bold, *italic, *strike));
2992                    idx += 1;
2993                }
2994                let label = render_text_runs_markdown(&runs);
2995                if let Some(link_target) = link_target {
2996                    let _ = write!(rendered, "[{label}]({link_target})");
2997                } else {
2998                    rendered.push_str(&label);
2999                }
3000            }
3001            ContentNode::Image {
3002                url: Some(url),
3003                alt,
3004                ..
3005            } => {
3006                let _ = write!(rendered, "![{alt}]({url})");
3007                idx += 1;
3008            }
3009            ContentNode::Image { .. } => idx += 1,
3010        }
3011    }
3012    rendered
3013}
3014
/// Set of inline Markdown emphasis markers that are currently open (or that
/// should be open) while rendering text runs. The default value has every
/// marker closed.
#[derive(Clone, Copy, Default)]
struct MarkdownMarkerState {
    /// `**` marker.
    bold: bool,
    /// `*` marker.
    italic: bool,
    /// `~~` marker.
    strike: bool,
}
3021
3022fn render_text_runs_markdown(runs: &[(&str, bool, bool, bool)]) -> String {
3023    let inactive = MarkdownMarkerState::default();
3024    let mut active = inactive;
3025    let mut output = String::new();
3026    for (text, bold, italic, strike) in runs {
3027        let next = MarkdownMarkerState {
3028            bold: *bold,
3029            italic: *italic,
3030            strike: *strike,
3031        };
3032        let mut start = 0usize;
3033        for (offset, ch) in text.char_indices() {
3034            if ch != '\n' {
3035                continue;
3036            }
3037            if offset > start {
3038                output.push_str(&markdown_marker_transition(active, next));
3039                output.push_str(&text[start..offset]);
3040                active = next;
3041            }
3042            output.push_str(&markdown_marker_transition(active, inactive));
3043            output.push('\n');
3044            active = inactive;
3045            start = offset + ch.len_utf8();
3046        }
3047        if start < text.len() {
3048            output.push_str(&markdown_marker_transition(active, next));
3049            output.push_str(&text[start..]);
3050            active = next;
3051        }
3052    }
3053    output.push_str(&markdown_marker_transition(active, inactive));
3054    output
3055}
3056
3057fn markdown_marker_transition(active: MarkdownMarkerState, next: MarkdownMarkerState) -> String {
3058    let mut markers = String::new();
3059    if active.strike && !next.strike {
3060        markers.push_str("~~");
3061    }
3062    if active.italic && !next.italic {
3063        markers.push('*');
3064    }
3065    if active.bold && !next.bold {
3066        markers.push_str("**");
3067    }
3068    if !active.bold && next.bold {
3069        markers.push_str("**");
3070    }
3071    if !active.italic && next.italic {
3072        markers.push('*');
3073    }
3074    if !active.strike && next.strike {
3075        markers.push_str("~~");
3076    }
3077    markers
3078}
3079
3080fn render_blocks_html(blocks: &[CapturedBlock]) -> String {
3081    format!(
3082        "<!doctype html><html><body>{}</body></html>",
3083        blocks
3084            .iter()
3085            .map(|block| match block {
3086                CapturedBlock::Paragraph {
3087                    content,
3088                    style,
3089                    list,
3090                    quote,
3091                    horizontal_rule,
3092                } => {
3093                    if *horizontal_rule {
3094                        "<hr>".to_string()
3095                    } else if let Some(list) = list {
3096                        let tag = if list.ordered { "ol" } else { "ul" };
3097                        format!("<{tag}><li>{}</li></{tag}>", render_content_html(content))
3098                    } else if *quote {
3099                        format!("<blockquote>{}</blockquote>", render_content_html(content))
3100                    } else {
3101                        let tag = paragraph_tag(style.as_deref());
3102                        format!("<{tag}>{}</{tag}>", render_content_html(content))
3103                    }
3104                }
3105                CapturedBlock::Table(table) => render_table_html(table),
3106            })
3107            .collect::<String>()
3108    )
3109}
3110
3111fn render_table_html(table: &TableBlock) -> String {
3112    let mut html = String::from("<table>");
3113    for row in &table.rows {
3114        html.push_str("<tr>");
3115        for cell in &row.cells {
3116            html.push_str("<td>");
3117            html.push_str(&render_content_html(&cell.content));
3118            html.push_str("</td>");
3119        }
3120        html.push_str("</tr>");
3121    }
3122    html.push_str("</table>");
3123    html
3124}
3125
3126fn render_content_html(content: &[ContentNode]) -> String {
3127    content
3128        .iter()
3129        .map(|node| match node {
3130            ContentNode::Text {
3131                text,
3132                bold,
3133                italic,
3134                strike,
3135                link,
3136            } => render_marked_html(text, *bold, *italic, *strike, link.as_deref()),
3137            ContentNode::Image {
3138                url: Some(url),
3139                alt,
3140                width,
3141                height,
3142                ..
3143            } => render_image_html(url, alt, width.as_deref(), height.as_deref()),
3144            ContentNode::Image { .. } => String::new(),
3145        })
3146        .collect()
3147}
3148
3149fn render_image_html(url: &str, alt: &str, width: Option<&str>, height: Option<&str>) -> String {
3150    let mut html = format!(
3151        "<img src=\"{}\" alt=\"{}\"",
3152        escape_html(url),
3153        escape_html(alt)
3154    );
3155    if let Some(width) = width.filter(|value| !value.is_empty()) {
3156        let _ = write!(html, " width=\"{}\"", escape_html(width));
3157    }
3158    if let Some(height) = height.filter(|value| !value.is_empty()) {
3159        let _ = write!(html, " height=\"{}\"", escape_html(height));
3160    }
3161    html.push('>');
3162    html
3163}
3164
3165fn render_marked_html(
3166    text: &str,
3167    bold: bool,
3168    italic: bool,
3169    strike: bool,
3170    link: Option<&str>,
3171) -> String {
3172    text.split('\n')
3173        .map(|segment| render_marked_html_segment(segment, bold, italic, strike, link))
3174        .collect::<Vec<_>>()
3175        .join("<br>")
3176}
3177
3178fn render_marked_html_segment(
3179    text: &str,
3180    bold: bool,
3181    italic: bool,
3182    strike: bool,
3183    link: Option<&str>,
3184) -> String {
3185    if text.is_empty() {
3186        return String::new();
3187    }
3188    let mut output = escape_html(text);
3189    if bold {
3190        output = format!("<strong>{output}</strong>");
3191    }
3192    if italic {
3193        output = format!("<em>{output}</em>");
3194    }
3195    if strike {
3196        output = format!("<s>{output}</s>");
3197    }
3198    if let Some(link) = link {
3199        output = format!("<a href=\"{}\">{output}</a>", escape_html(link));
3200    }
3201    output
3202}
3203
/// Map a paragraph style name to the HTML tag used to render it.
///
/// `TITLE` shares `h1` with `HEADING_1` and `SUBTITLE` shares `h2` with
/// `HEADING_2`; unknown or missing styles render as `p`.
fn paragraph_tag(style: Option<&str>) -> &'static str {
    const TAGS: [(&str, &str); 8] = [
        ("TITLE", "h1"),
        ("HEADING_1", "h1"),
        ("SUBTITLE", "h2"),
        ("HEADING_2", "h2"),
        ("HEADING_3", "h3"),
        ("HEADING_4", "h4"),
        ("HEADING_5", "h5"),
        ("HEADING_6", "h6"),
    ];

    let Some(name) = style else {
        return "p";
    };
    TAGS.iter()
        .find(|(candidate, _)| *candidate == name)
        .map_or("p", |(_, tag)| *tag)
}
3215
3216fn blocks_to_text(blocks: &[CapturedBlock]) -> String {
3217    blocks
3218        .iter()
3219        .map(|block| match block {
3220            CapturedBlock::Paragraph { content, .. } => content_to_text(content),
3221            CapturedBlock::Table(table) => table
3222                .rows
3223                .iter()
3224                .map(|row| {
3225                    row.cells
3226                        .iter()
3227                        .map(|cell| content_to_text(&cell.content))
3228                        .collect::<Vec<_>>()
3229                        .join("\t")
3230                })
3231                .collect::<Vec<_>>()
3232                .join("\n"),
3233        })
3234        .filter(|text| !text.is_empty())
3235        .collect::<Vec<_>>()
3236        .join("\n")
3237}
3238
3239fn content_to_text(content: &[ContentNode]) -> String {
3240    content
3241        .iter()
3242        .map(|node| match node {
3243            ContentNode::Text { text, .. } => text.clone(),
3244            ContentNode::Image {
3245                url: Some(_), alt, ..
3246            } => format!("[{alt}]"),
3247            ContentNode::Image { .. } => String::new(),
3248        })
3249        .collect()
3250}
3251
/// Escape the five HTML-significant characters (`&`, `<`, `>`, `"`, `'`) so
/// the value is safe in both element content and quoted attribute values.
fn escape_html(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
3260
/// Make rendered content safe inside a Markdown table cell: pipes are
/// backslash-escaped and newlines become `<br>`.
fn escape_markdown_table_cell(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '|' => escaped.push_str("\\|"),
            '\n' => escaped.push_str("<br>"),
            other => escaped.push(other),
        }
    }
    escaped
}
3264
/// Extract a Bearer token from an Authorization header value.
///
/// Surrounding whitespace is ignored. The `Bearer` scheme name is matched
/// case-insensitively (HTTP authentication scheme names are
/// case-insensitive), and it must be followed by at least one space and a
/// non-empty token.
///
/// Returns `None` if the header is not a valid Bearer token.
#[must_use]
pub fn extract_bearer_token(auth_header: &str) -> Option<&str> {
    const SCHEME: &str = "bearer";

    let trimmed = auth_header.trim();
    // `get` returns `None` when the header is shorter than the scheme or the
    // cut would fall inside a multi-byte character.
    let scheme = trimmed.get(..SCHEME.len())?;
    if !scheme.eq_ignore_ascii_case(SCHEME) {
        return None;
    }

    let token = trimmed[SCHEME.len()..].strip_prefix(' ')?.trim();
    (!token.is_empty()).then_some(token)
}
3277
/// An image stored under the archive's `images/` directory.
///
/// Produced either by decoding a base64 data URI found in exported HTML or
/// by downloading a remote image referenced by a browser-model capture.
#[derive(Debug, Clone)]
pub struct ExtractedImage {
    /// Local filename without the `images/` prefix (e.g., "image-01.png")
    pub filename: String,
    /// Raw image bytes
    pub data: Vec<u8>,
    /// MIME type (e.g., "image/png")
    pub mime_type: String,
}
3288
/// Result of fetching a Google Doc as an archive.
///
/// Holds everything `create_archive_zip` writes into the ZIP file: the
/// Markdown and HTML documents plus the images they reference.
#[derive(Debug, Clone)]
pub struct GDocsArchiveResult {
    /// HTML content with local image paths
    pub html: String,
    /// Markdown content with local image paths
    pub markdown: String,
    /// Extracted images, written to `images/<filename>` in the archive
    pub images: Vec<ExtractedImage>,
    /// Document ID
    pub document_id: String,
    /// Export URL used
    pub export_url: String,
}
3303
3304/// Build a self-contained archive result from browser-model rendered output.
3305///
3306/// `DOCS_modelChunk` image nodes point at `docs-images-rt` URLs. Archive mode
3307/// downloads those URLs into `images/` and rewrites markdown/html references to
3308/// local paths so Rust browser capture matches the JavaScript archive path.
3309///
3310/// # Errors
3311///
3312/// Returns an error if the HTTP client cannot be created or an image response
3313/// body cannot be read. Individual failed image downloads are logged and left
3314/// out of the archive, matching the JS behavior.
3315pub async fn localize_rendered_remote_images_for_archive(
3316    rendered: &GDocsRenderedResult,
3317) -> crate::Result<GDocsArchiveResult> {
3318    let client = reqwest::Client::builder().build().map_err(|error| {
3319        WebCaptureError::FetchError(format!("Failed to create image download client: {error}"))
3320    })?;
3321    let mut seen = HashMap::new();
3322    let mut images = Vec::new();
3323    let mut next_index = 1usize;
3324
3325    for image in &rendered.remote_images {
3326        if seen.contains_key(&image.url) {
3327            continue;
3328        }
3329        let filename = remote_image_filename(&image.url, next_index);
3330        next_index += 1;
3331        seen.insert(image.url.clone(), filename.clone());
3332
3333        match client
3334            .get(&image.url)
3335            .header("User-Agent", GDOCS_USER_AGENT)
3336            .header("Accept", "image/*,*/*;q=0.8")
3337            .send()
3338            .await
3339        {
3340            Ok(response) if response.status().is_success() => {
3341                let mime_type = response
3342                    .headers()
3343                    .get(reqwest::header::CONTENT_TYPE)
3344                    .and_then(|value| value.to_str().ok())
3345                    .map_or_else(|| mime_type_for_filename(&filename), ToString::to_string);
3346                let data = response.bytes().await.map_err(|error| {
3347                    WebCaptureError::FetchError(format!(
3348                        "Failed to read Google Docs image {}: {error}",
3349                        image.url
3350                    ))
3351                })?;
3352                debug!(
3353                    url = %image.url,
3354                    filename = %filename,
3355                    bytes = data.len(),
3356                    mime_type = %mime_type,
3357                    "downloaded Google Docs browser-model archive image"
3358                );
3359                images.push(ExtractedImage {
3360                    filename,
3361                    data: data.to_vec(),
3362                    mime_type,
3363                });
3364            }
3365            Ok(response) => {
3366                warn!(
3367                    url = %image.url,
3368                    status = response.status().as_u16(),
3369                    "failed to download Google Docs browser-model archive image"
3370                );
3371            }
3372            Err(error) => {
3373                warn!(
3374                    url = %image.url,
3375                    error = %error,
3376                    "failed to download Google Docs browser-model archive image"
3377                );
3378            }
3379        }
3380    }
3381
3382    let mut markdown = rendered.markdown.clone();
3383    let mut html = rendered.html.clone();
3384    for (url, filename) in seen {
3385        let local_path = format!("images/{filename}");
3386        markdown = markdown.replace(&url, &local_path);
3387        html = html.replace(&url, &local_path);
3388    }
3389
3390    Ok(GDocsArchiveResult {
3391        html,
3392        markdown,
3393        images,
3394        document_id: rendered.document_id.clone(),
3395        export_url: rendered.export_url.clone(),
3396    })
3397}
3398
3399fn remote_image_filename(url: &str, index: usize) -> String {
3400    let ext = crate::localize_images::get_extension_from_url(url);
3401    format!("image-{index:02}{ext}")
3402}
3403
/// Guess an image MIME type from a filename's extension (case-insensitive).
///
/// The extension is whatever follows the last `.`, or the whole name when
/// there is no `.`. Unknown extensions default to `image/png`.
fn mime_type_for_filename(filename: &str) -> String {
    let extension = filename
        .rfind('.')
        .map_or(filename, |dot| &filename[dot + 1..])
        .to_lowercase();

    let mime = if ["jpg", "jpeg"].contains(&extension.as_str()) {
        "image/jpeg"
    } else if extension == "gif" {
        "image/gif"
    } else if extension == "webp" {
        "image/webp"
    } else if extension == "svg" {
        "image/svg+xml"
    } else {
        "image/png"
    };
    mime.to_owned()
}
3420
3421fn base64_image_pattern() -> &'static Regex {
3422    static PATTERN: OnceLock<Regex> = OnceLock::new();
3423    PATTERN.get_or_init(|| {
3424        Regex::new(
3425            r#"(<img\s[^>]*src=")data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([^"]+)(")"#,
3426        )
3427        .unwrap()
3428    })
3429}
3430
3431/// Extract base64 data URI images from HTML content.
3432///
3433/// Google Docs HTML exports embed images as base64 data URIs.
3434/// This function extracts them and replaces with local file paths.
3435///
3436/// # Arguments
3437///
3438/// * `html` - HTML content with embedded base64 images
3439///
3440/// # Returns
3441///
3442/// Tuple of (updated HTML with local paths, extracted images)
3443#[must_use]
3444pub fn extract_base64_images(html: &str) -> (String, Vec<ExtractedImage>) {
3445    let mut images = Vec::new();
3446    let mut idx = 1u32;
3447
3448    let updated_html = base64_image_pattern()
3449        .replace_all(html, |caps: &regex::Captures<'_>| {
3450            let prefix = &caps[1];
3451            let mime_ext = &caps[2];
3452            let base64_data = &caps[3];
3453            let suffix = &caps[4];
3454
3455            let ext = match mime_ext {
3456                "jpeg" => "jpg",
3457                "svg+xml" => "svg",
3458                other => other,
3459            };
3460
3461            let filename = format!("image-{idx:02}.{ext}");
3462            let mime_type = format!("image/{mime_ext}");
3463
3464            if let Ok(data) = base64::engine::general_purpose::STANDARD.decode(base64_data) {
3465                debug!("Extracted image: {} ({} bytes)", filename, data.len());
3466                images.push(ExtractedImage {
3467                    filename: filename.clone(),
3468                    data,
3469                    mime_type,
3470                });
3471            }
3472
3473            idx += 1;
3474            format!("{prefix}images/{filename}{suffix}")
3475        })
3476        .into_owned();
3477
3478    (updated_html, images)
3479}
3480
3481/// Fetch a Google Docs document as a ZIP archive.
3482///
3483/// Fetches the document as HTML, extracts embedded base64 images,
3484/// converts to Markdown, and returns all components ready for archiving.
3485///
3486/// The archive contains:
3487/// - `document.md` — Markdown version
3488/// - `document.html` — HTML version with local image paths
3489/// - `images/` — extracted images
3490///
3491/// # Arguments
3492///
3493/// * `url` - Google Docs URL
3494/// * `api_token` - Optional API token for private documents
3495///
3496/// # Errors
3497///
3498/// Returns an error if the fetch or conversion fails.
3499pub async fn fetch_google_doc_as_archive(
3500    url: &str,
3501    api_token: Option<&str>,
3502) -> crate::Result<GDocsArchiveResult> {
3503    let result = fetch_google_doc(url, "html", api_token).await?;
3504
3505    let preprocess = preprocess_google_docs_export_html(&result.content);
3506    debug!(
3507        document_id = %result.document_id,
3508        hoisted = preprocess.hoisted,
3509        unwrapped_links = preprocess.unwrapped_links,
3510        "google-docs-export pre-processor rewrote archive markup"
3511    );
3512
3513    let (local_html, images) = extract_base64_images(&preprocess.html);
3514
3515    let markdown = normalize_google_docs_export_markdown(
3516        &crate::markdown::convert_html_to_markdown(&local_html, None)?,
3517    );
3518
3519    debug!(
3520        "Archive prepared: {} images extracted, {} bytes HTML, {} bytes Markdown",
3521        images.len(),
3522        local_html.len(),
3523        markdown.len()
3524    );
3525
3526    Ok(GDocsArchiveResult {
3527        html: local_html,
3528        markdown,
3529        images,
3530        document_id: result.document_id,
3531        export_url: result.export_url,
3532    })
3533}
3534
3535/// Create a ZIP archive from a `GDocsArchiveResult`.
3536///
3537/// # Arguments
3538///
3539/// * `archive` - The archive result to bundle
3540/// * `pretty_html` - Whether to pretty-print the HTML output
3541///
3542/// # Errors
3543///
3544/// Returns an error if ZIP creation fails.
3545pub fn create_archive_zip(
3546    archive: &GDocsArchiveResult,
3547    pretty_html: bool,
3548) -> crate::Result<Vec<u8>> {
3549    let mut buf = std::io::Cursor::new(Vec::new());
3550
3551    {
3552        let mut zip = zip::ZipWriter::new(&mut buf);
3553        let options = zip::write::SimpleFileOptions::default()
3554            .compression_method(zip::CompressionMethod::Deflated);
3555
3556        zip.start_file("document.md", options)
3557            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3558        zip.write_all(archive.markdown.as_bytes())?;
3559
3560        let html_output = if pretty_html {
3561            crate::html::pretty_print_html(&archive.html)
3562        } else {
3563            archive.html.clone()
3564        };
3565        zip.start_file("document.html", options)
3566            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3567        zip.write_all(html_output.as_bytes())?;
3568
3569        for img in &archive.images {
3570            zip.start_file(format!("images/{}", img.filename), options)
3571                .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3572            zip.write_all(&img.data)?;
3573        }
3574
3575        zip.finish()
3576            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3577    }
3578
3579    Ok(buf.into_inner())
3580}
3581
/// Unit tests for the browser-model fingerprint and quiescence helpers.
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Two payloads with the same number of chunks but different text must
    /// differ in `payload_bytes` while agreeing on `chunks`.
    #[test]
    fn browser_model_fingerprint_includes_payload_size() {
        let small = browser_model_data_from_value(&json!({
            "chunks": [{ "chunk": [{ "ty": "is", "s": "first" }] }],
            "cidUrlMap": {}
        }));
        let larger = browser_model_data_from_value(&json!({
            "chunks": [{ "chunk": [{ "ty": "is", "s": "first and later text" }] }],
            "cidUrlMap": {}
        }));

        assert_eq!(small.fingerprint().chunks, larger.fingerprint().chunks);
        assert_ne!(
            small.fingerprint().payload_bytes,
            larger.fingerprint().payload_bytes
        );
    }

    /// Feeds a sequence of timed observations into `BrowserModelQuiescence`
    /// and checks that a fingerprint change restarts the stability wait.
    #[test]
    fn browser_model_quiescence_resets_when_chunks_change() {
        let start = Instant::now();
        let stability_window = Duration::from_millis(1500);
        let one_chunk = BrowserModelFingerprint {
            chunks: 1,
            payload_bytes: 100,
        };
        let two_chunks = BrowserModelFingerprint {
            chunks: 2,
            payload_bytes: 200,
        };
        let mut quiescence = BrowserModelQuiescence::default();

        // The same fingerprint at 0 ms and 250 ms: not yet reported stable.
        assert_eq!(quiescence.observe(one_chunk, start, stability_window), None);
        assert_eq!(
            quiescence.observe(
                one_chunk,
                start + Duration::from_millis(250),
                stability_window
            ),
            None
        );
        // The fingerprint changes at 500 ms: still not stable.
        assert_eq!(
            quiescence.observe(
                two_chunks,
                start + Duration::from_millis(500),
                stability_window
            ),
            None
        );
        assert_eq!(
            quiescence.observe(
                two_chunks,
                start + Duration::from_millis(750),
                stability_window
            ),
            None
        );
        // At 2300 ms the new fingerprint is reported stable. The expected
        // 1550 ms equals the gap between the 750 ms and 2300 ms observations.
        assert_eq!(
            quiescence.observe(
                two_chunks,
                start + Duration::from_millis(2300),
                stability_window
            ),
            Some(Duration::from_millis(1550))
        );
    }
}