web_capture/
gdocs.rs

1//! Google Docs capture module.
2//!
3//! Supports API-based capture of Google Docs documents via the export URL pattern:
4//! `https://docs.google.com/document/d/{DOCUMENT_ID}/export?format={FORMAT}`
5//!
6//! # Supported Export Formats
7//!
8//! - `html` — HTML document (images as base64 data URIs)
9//! - `txt` — Plain text
10//! - `md` — Markdown (native Google Docs export)
11//! - `pdf` — PDF document
12//! - `docx` — Microsoft Word document
13//! - `epub` — EPUB ebook format
14//!
15//! # Example
16//!
17//! ```rust,no_run
18//! use web_capture::gdocs;
19//!
20//! #[tokio::main]
21//! async fn main() -> anyhow::Result<()> {
22//!     let url = "https://docs.google.com/document/d/abc123/edit";
23//!     if gdocs::is_google_docs_url(url) {
24//!         let result = gdocs::fetch_google_doc(url, "html", None).await?;
25//!         println!("Content length: {}", result.content.len());
26//!     }
27//!     Ok(())
28//! }
29//! ```
30
31use async_tungstenite::tokio::{connect_async, ConnectStream};
32use async_tungstenite::tungstenite::Message;
33use async_tungstenite::WebSocketStream;
34use base64::Engine;
35use futures::{SinkExt, StreamExt};
36use regex::Regex;
37use scraper::{node::Node, ElementRef, Html, Selector};
38use serde_json::Value;
39use std::collections::HashMap;
40use std::fmt::Write as _;
41use std::hash::BuildHasher;
42use std::io::Write;
43use std::process::Stdio;
44use std::sync::OnceLock;
45use std::time::{Duration, Instant};
46use tokio::io::{AsyncBufReadExt, BufReader};
47use tokio::process::{Child, Command};
48use tracing::{debug, info, warn};
49
50use crate::WebCaptureError;
51
/// Base URL shared by the per-document `/export` and `/edit` endpoints.
const GDOCS_EXPORT_BASE: &str = "https://docs.google.com/document/d";
/// Base URL of the Google Docs REST API `documents` resource.
const GDOCS_API_BASE: &str = "https://docs.googleapis.com/v1/documents";
/// Desktop-Chrome-style `User-Agent` string.
// NOTE(review): no use of this constant is visible in this part of the file —
// confirm which requests send it.
const GDOCS_USER_AGENT: &str =
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
/// Default upper bound (30 s) for waiting on the editor model.
// NOTE(review): presumably the overall deadline of the browser-model polling
// loop — confirm against the code that reads it.
const GDOCS_EDITOR_MODEL_MAX_WAIT_DEFAULT: Duration = Duration::from_secs(30);
/// Default stability window (1.5 s).
// NOTE(review): presumably the `stability_window` handed to
// `BrowserModelQuiescence::observe` — confirm at the call site.
const GDOCS_EDITOR_MODEL_STABILITY_DEFAULT: Duration = Duration::from_millis(1500);
/// Interval (250 ms) between editor-model polls.
const GDOCS_EDITOR_MODEL_POLL_INTERVAL: Duration = Duration::from_millis(250);
/// Time limit (20 s) for launching the browser.
const GDOCS_BROWSER_LAUNCH_TIMEOUT: Duration = Duration::from_secs(20);

/// WebSocket stream type produced by `async_tungstenite`'s tokio connector.
// NOTE(review): the name suggests it carries Chrome DevTools Protocol traffic —
// confirm at the use sites.
type CdpWebSocket = WebSocketStream<ConnectStream>;
62
/// JavaScript that records every value assigned to `window.DOCS_modelChunk`.
///
/// The script:
/// - creates `window.__captured_chunks` as the capture buffer;
/// - defines `captureChunk`, which flattens arrays recursively and stores a
///   JSON deep copy of each non-array value (falling back to the raw value
///   when the JSON round-trip throws);
/// - defines `wrapChunkArray`, which replaces an array's `push` so later
///   appended items are captured too, and marks the array with a
///   non-enumerable `__webCaptureDocsModelWrapped` flag so it is wrapped once;
/// - installs a non-configurable accessor property `DOCS_modelChunk` on
///   `window` whose setter captures the value and whose getter returns the
///   most recently assigned (wrapped) value.
// NOTE(review): presumably injected before any page script runs so the setter
// sees every assignment — confirm at the injection site.
// NOTE(review): when an unwrapped array is assigned, the setter calls
// `captureChunk(value)` and then `wrapChunkArray(value)`, which iterates the
// same items again; this looks like each initial item is recorded twice —
// confirm whether downstream parsing de-duplicates.
const GDOCS_MODEL_CAPTURE_INIT_SCRIPT: &str = r"
window.__captured_chunks = [];
const captureChunk = (value) => {
  if (!value) {
    return;
  }
  if (Array.isArray(value)) {
    for (const item of value) {
      captureChunk(item);
    }
    return;
  }
  try {
    window.__captured_chunks.push(JSON.parse(JSON.stringify(value)));
  } catch {
    window.__captured_chunks.push(value);
  }
};
const wrapChunkArray = (value) => {
  if (!Array.isArray(value) || value.__webCaptureDocsModelWrapped) {
    return value;
  }
  const originalPush = value.push;
  Object.defineProperty(value, '__webCaptureDocsModelWrapped', {
    value: true,
    enumerable: false,
  });
  Object.defineProperty(value, 'push', {
    value(...items) {
      for (const item of items) {
        captureChunk(item);
      }
      return originalPush.apply(this, items);
    },
    writable: true,
    configurable: true,
  });
  for (const item of value) {
    captureChunk(item);
  }
  return value;
};
Object.defineProperty(window, 'DOCS_modelChunk', {
  set(value) {
    captureChunk(value);
    window.__DOCS_modelChunk_latest = wrapChunkArray(value);
  },
  get() {
    return window.__DOCS_modelChunk_latest;
  },
  configurable: false,
});
";
116
/// JavaScript function expression that returns `{ chunks, cidUrlMap }`.
///
/// - `chunks` is a copy of `window.__captured_chunks`; when that buffer is
///   empty and `window.DOCS_modelChunk` is set, the current
///   `DOCS_modelChunk` value is used as the only entry.
/// - `cidUrlMap` is built by scanning every `<script>` element whose text
///   mentions `docs-images-rt` for `"<id>": "<https://docs.google.com/docs-images-rt/...>"`
///   pairs (ids of 20+ characters from `[A-Za-z0-9_-]`), and un-escaping
///   `\u003d`, `\u0026` and `\/` in the URL.
// NOTE(review): the `!chunks.includes(...)` test is always true when
// `chunks.length === 0`, so it is redundant as written.
const GDOCS_MODEL_EXTRACT_SCRIPT: &str = r#"() => {
  const chunks = [...(window.__captured_chunks || [])];
  if (
    window.DOCS_modelChunk &&
    chunks.length === 0 &&
    !chunks.includes(window.DOCS_modelChunk)
  ) {
    chunks.push(window.DOCS_modelChunk);
  }
  const cidUrlMap = {};
  const scripts = document.querySelectorAll('script');
  for (const script of scripts) {
    const text = script.textContent || '';
    if (!text.includes('docs-images-rt')) {
      continue;
    }
    const regex =
      /"([A-Za-z0-9_-]{20,})"\s*:\s*"(https:\/\/docs\.google\.com\/docs-images-rt\/[^"]+)"/g;
    let match;
    while ((match = regex.exec(text)) !== null) {
      cidUrlMap[match[1]] = match[2]
        .replace(/\\u003d/g, '=')
        .replace(/\\u0026/g, '&')
        .replace(/\\\//g, '/');
    }
  }
  return { chunks, cidUrlMap };
}"#;
145
146fn gdocs_url_pattern() -> &'static Regex {
147    static PATTERN: OnceLock<Regex> = OnceLock::new();
148    PATTERN.get_or_init(|| Regex::new(r"docs\.google\.com/document/d/([a-zA-Z0-9_-]+)").unwrap())
149}
150
/// Result of fetching a Google Docs document.
///
/// Returned by [`fetch_google_doc`] and [`fetch_google_doc_as_markdown`].
#[derive(Debug, Clone)]
pub struct GDocsResult {
    /// The document content in the requested format.
    pub content: String,
    /// The export format used.
    ///
    /// [`fetch_google_doc`] stores the caller-supplied format string verbatim;
    /// [`fetch_google_doc_as_markdown`] stores `"markdown"`.
    pub format: String,
    /// The extracted document ID.
    pub document_id: String,
    /// The export URL that was fetched.
    pub export_url: String,
}
163
/// Google Docs capture backend selected from the CLI `--capture` flag.
///
/// See [`select_capture_method`] for how flag values map to variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GDocsCaptureMethod {
    /// Load `/edit` and extract `DOCS_modelChunk`.
    ///
    /// Selected by `--capture browser`.
    BrowserModel,
    /// Use the public `/export?format=...` endpoint.
    ///
    /// Selected by `--capture api` when no API token is supplied.
    PublicExport,
    /// Use the authenticated `docs.googleapis.com` REST API.
    ///
    /// Selected by `--capture api` when an API token is supplied.
    DocsApi,
}
174
/// Rendered Google Docs content from either Docs API or editor model data.
///
/// Carries the same document rendered as Markdown, HTML and plain text.
#[derive(Debug, Clone)]
pub struct GDocsRenderedResult {
    /// Markdown output.
    pub markdown: String,
    /// HTML output.
    pub html: String,
    /// Plain text output.
    pub text: String,
    /// The extracted document ID.
    pub document_id: String,
    /// Source URL used for capture.
    pub export_url: String,
    /// Remote images exposed by the editor model, used for archive localization.
    pub remote_images: Vec<RemoteImage>,
}
191
/// Remote image reference extracted from browser-model capture.
///
/// Collected in [`GDocsRenderedResult::remote_images`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteImage {
    /// Original image URL.
    pub url: String,
    /// Image alt text.
    pub alt: String,
}
200
/// Snapshot of editor-model data read from the browser.
// NOTE(review): field meanings below are inferred from names and from the
// extract script's `{ chunks, cidUrlMap }` shape — confirm against the polling
// code that fills this struct.
#[derive(Debug, Clone)]
struct BrowserModelData {
    /// Captured model chunks as raw JSON values.
    chunks: Vec<Value>,
    /// Presumably content ID → image URL, mirroring the script's `cidUrlMap`.
    cid_urls: HashMap<String, String>,
    /// Size figure used as half of the quiescence fingerprint (see `fingerprint`).
    chunk_payload_bytes: usize,
    /// Presumably the number of polls performed before this snapshot.
    poll_count: usize,
    /// Presumably how long the model had been unchanged when captured.
    stable_for: Duration,
}
209
/// Cheap change detector for a [`BrowserModelData`] snapshot.
///
/// Two snapshots with equal fingerprints are treated as "unchanged" by
/// [`BrowserModelQuiescence::observe`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct BrowserModelFingerprint {
    /// Number of captured chunks.
    chunks: usize,
    /// Value of `BrowserModelData::chunk_payload_bytes` for the snapshot.
    payload_bytes: usize,
}
215
/// Tracks how long successive model fingerprints have stayed identical.
#[derive(Debug, Default)]
struct BrowserModelQuiescence {
    /// Fingerprint seen on the previous call to `observe`, if any.
    last_fingerprint: Option<BrowserModelFingerprint>,
    /// Instant from which the current unchanged streak is measured;
    /// `None` while no streak is in progress.
    stable_since: Option<Instant>,
}
221
222impl BrowserModelData {
223    const fn fingerprint(&self) -> BrowserModelFingerprint {
224        BrowserModelFingerprint {
225            chunks: self.chunks.len(),
226            payload_bytes: self.chunk_payload_bytes,
227        }
228    }
229}
230
231impl BrowserModelQuiescence {
232    fn observe(
233        &mut self,
234        fingerprint: BrowserModelFingerprint,
235        now: Instant,
236        stability_window: Duration,
237    ) -> Option<Duration> {
238        if fingerprint.chunks == 0 {
239            self.last_fingerprint = Some(fingerprint);
240            self.stable_since = None;
241            return None;
242        }
243
244        if self.last_fingerprint == Some(fingerprint) {
245            let stable_since = *self.stable_since.get_or_insert(now);
246            let stable_for = now.saturating_duration_since(stable_since);
247            if stable_for >= stability_window {
248                return Some(stable_for);
249            }
250        } else {
251            self.last_fingerprint = Some(fingerprint);
252            self.stable_since = None;
253        }
254
255        None
256    }
257
258    fn stable_for(&self, now: Instant) -> Duration {
259        self.stable_since.map_or(Duration::ZERO, |stable_since| {
260            now.saturating_duration_since(stable_since)
261        })
262    }
263}
264
/// Parsed Google Docs model/document capture.
///
/// `Default` yields an empty document (no blocks, tables or images, and empty
/// text).
#[derive(Debug, Clone, Default)]
pub struct CapturedDocument {
    /// Ordered document blocks.
    pub blocks: Vec<CapturedBlock>,
    /// Tables extracted from `blocks` for compatibility with tests and callers.
    pub tables: Vec<TableBlock>,
    /// Images extracted from model positions.
    pub images: Vec<ContentNode>,
    /// Plain text projection.
    pub text: String,
}
277
/// Captured block: one top-level element of a [`CapturedDocument`].
#[derive(Debug, Clone)]
pub enum CapturedBlock {
    /// Paragraph-like block.
    Paragraph {
        /// Paragraph content as inline nodes.
        content: Vec<ContentNode>,
        /// Optional Google Docs named style.
        style: Option<String>,
        /// Optional list metadata; `None` when the paragraph is not a list item.
        list: Option<ListMeta>,
        /// Whether paragraph is a blockquote.
        quote: bool,
        /// Whether paragraph is a horizontal rule.
        horizontal_rule: bool,
    },
    /// Table block.
    Table(TableBlock),
}
297
/// Captured table: an ordered list of rows.
#[derive(Debug, Clone, Default)]
pub struct TableBlock {
    /// Table rows.
    pub rows: Vec<TableRow>,
}
304
/// Captured table row: an ordered list of cells.
#[derive(Debug, Clone, Default)]
pub struct TableRow {
    /// Row cells.
    pub cells: Vec<TableCell>,
}
311
/// Captured table cell holding inline content.
#[derive(Debug, Clone, Default)]
pub struct TableCell {
    /// Cell content as inline nodes.
    pub content: Vec<ContentNode>,
}
318
/// Captured inline content node: either a styled text run or an image.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentNode {
    /// Text run.
    Text {
        /// Text content.
        text: String,
        /// Bold text style.
        bold: bool,
        /// Italic text style.
        italic: bool,
        /// Strikethrough text style.
        strike: bool,
        /// Optional hyperlink target.
        link: Option<String>,
    },
    /// Image placeholder.
    Image {
        /// Content ID from Google Docs model data.
        cid: Option<String>,
        /// Resolved image URL.
        url: Option<String>,
        /// Alt text.
        alt: String,
        /// Editor-model image width, when available.
        width: Option<String>,
        /// Editor-model image height, when available.
        height: Option<String>,
        /// Whether this image came from a suggested edit.
        is_suggestion: bool,
    },
}
351
/// Inline formatting flags for a run of text.
///
/// The fields mirror the style fields of [`ContentNode::Text`].
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct TextStyle {
    /// Bold text style.
    bold: bool,
    /// Italic text style.
    italic: bool,
    /// Strikethrough text style.
    strike: bool,
    /// Optional hyperlink target.
    link: Option<String>,
}
359
/// Paragraph-level metadata.
///
/// The fields mirror the non-content fields of [`CapturedBlock::Paragraph`].
#[derive(Debug, Clone, Default)]
struct ParagraphMeta {
    /// Optional Google Docs named style.
    style: Option<String>,
    /// Optional list metadata.
    list: Option<ListMeta>,
    /// Whether the paragraph is a blockquote.
    quote: bool,
    /// Whether the paragraph is a horizontal rule.
    horizontal_rule: bool,
}
367
/// List membership metadata attached to a paragraph
/// (see [`CapturedBlock::Paragraph`]'s `list` field).
#[derive(Debug, Clone)]
pub struct ListMeta {
    /// Google Docs list identifier.
    pub id: String,
    /// Nesting level, zero-based.
    pub level: usize,
    /// Whether Markdown should render this list item with an ordered marker.
    pub ordered: bool,
}
377
/// Paragraph style values read from the editor model.
// NOTE(review): units and the exact source of the indent values are not
// visible in this part of the file — confirm against the model parser.
#[derive(Debug, Clone)]
struct ParagraphStyle {
    /// Optional named style.
    style: Option<String>,
    /// Start indent of the paragraph.
    indent_start: f64,
    /// First-line indent of the paragraph.
    indent_first_line: f64,
}
384
/// Semantic hint associated with a piece of text.
// NOTE(review): presumably derived from the public-export HTML and matched
// against model paragraphs by `text` — confirm against the code that builds
// and consumes these hints.
#[derive(Debug, Clone)]
struct ExportSemanticHint {
    /// Text the hint applies to.
    text: String,
    /// `Some(true)`/`Some(false)` for ordered/unordered list items; `None`
    /// presumably means "not a list item" — TODO confirm.
    list_ordered: Option<bool>,
    /// Whether the text is a blockquote.
    quote: bool,
}
391
/// Style lookup tables built from editor-model data.
// NOTE(review): the `usize` keys/indices are presumably character offsets into
// the model text (the `*_by_end` maps keyed by a paragraph's end offset) —
// confirm against the model parser.
#[derive(Debug, Clone, Default)]
struct ModelStyleMaps {
    /// Inline styles, indexed by position.
    inline_styles: Vec<TextStyle>,
    /// Paragraph style keyed by end position.
    paragraph_by_end: HashMap<usize, ParagraphStyle>,
    /// List metadata keyed by end position.
    list_by_end: HashMap<usize, ListMeta>,
    /// Positions flagged as horizontal rules.
    horizontal_rules: std::collections::HashSet<usize>,
}
399
400/// Check if a URL is a Google Docs document URL.
401#[must_use]
402pub fn is_google_docs_url(url: &str) -> bool {
403    gdocs_url_pattern().is_match(url)
404}
405
406/// Extract the document ID from a Google Docs URL.
407///
408/// Returns `None` if the URL is not a valid Google Docs URL.
409#[must_use]
410pub fn extract_document_id(url: &str) -> Option<String> {
411    gdocs_url_pattern()
412        .captures(url)
413        .and_then(|caps| caps.get(1))
414        .map(|m| m.as_str().to_string())
415}
416
417/// Build a Google Docs export URL.
418///
419/// # Arguments
420///
421/// * `document_id` - The Google Docs document ID
422/// * `format` - Export format (html, txt, md, pdf, docx, epub)
423#[must_use]
424pub fn build_export_url(document_id: &str, format: &str) -> String {
425    let export_format = match format {
426        "html" | "txt" | "md" | "pdf" | "docx" | "epub" | "zip" => format,
427        _ => "html",
428    };
429    format!("{GDOCS_EXPORT_BASE}/{document_id}/export?format={export_format}")
430}
431
432/// Build a Google Docs editor URL.
433#[must_use]
434pub fn build_edit_url(document_id: &str) -> String {
435    format!("{GDOCS_EXPORT_BASE}/{document_id}/edit")
436}
437
438/// Build a Google Docs REST API URL.
439#[must_use]
440pub fn build_docs_api_url(document_id: &str) -> String {
441    format!("{GDOCS_API_BASE}/{document_id}")
442}
443
444/// Select a Google Docs capture backend from the CLI `--capture` value.
445///
446/// # Errors
447///
448/// Returns an error when `capture` is neither `browser` nor `api`.
449pub fn select_capture_method(
450    capture: &str,
451    api_token: Option<&str>,
452) -> crate::Result<GDocsCaptureMethod> {
453    match capture.to_lowercase().as_str() {
454        "browser" => Ok(GDocsCaptureMethod::BrowserModel),
455        "api" if api_token.is_some() => Ok(GDocsCaptureMethod::DocsApi),
456        "api" => Ok(GDocsCaptureMethod::PublicExport),
457        other => Err(WebCaptureError::InvalidUrl(format!(
458            "Unsupported Google Docs capture method \"{other}\". Use \"browser\" or \"api\"."
459        ))),
460    }
461}
462
463/// Fetch a Google Docs document via the export URL.
464///
465/// For public documents, pass `None` for `api_token`.
466/// For private documents, pass a Bearer token string.
467///
468/// # Arguments
469///
470/// * `url` - Google Docs URL (edit URL or any URL containing the document ID)
471/// * `format` - Export format (html, txt, md, pdf, docx, epub)
472/// * `api_token` - Optional API token for private documents
473///
474/// # Errors
475///
476/// Returns an error if the URL is not a valid Google Docs URL, or if the fetch fails.
477pub async fn fetch_google_doc(
478    url: &str,
479    format: &str,
480    api_token: Option<&str>,
481) -> crate::Result<GDocsResult> {
482    let document_id = extract_document_id(url).ok_or_else(|| {
483        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
484    })?;
485
486    let export_url = build_export_url(&document_id, format);
487    debug!(
488        document_id = %document_id,
489        format = %format,
490        export_url = %export_url,
491        has_api_token = api_token.is_some(),
492        "fetching Google Doc via public export"
493    );
494
495    let mut request = reqwest::Client::new()
496        .get(&export_url)
497        .header(
498            "User-Agent",
499            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
500        )
501        .header("Accept-Charset", "utf-8")
502        .header("Accept-Language", "en-US,en;q=0.9");
503
504    if let Some(token) = api_token {
505        request = request.header("Authorization", format!("Bearer {token}"));
506    }
507
508    let response = request
509        .send()
510        .await
511        .map_err(|e| WebCaptureError::FetchError(format!("Failed to fetch Google Doc: {e}")))?;
512    debug!(
513        document_id = %document_id,
514        status = response.status().as_u16(),
515        success = response.status().is_success(),
516        content_type = response
517            .headers()
518            .get(reqwest::header::CONTENT_TYPE)
519            .and_then(|value| value.to_str().ok())
520            .unwrap_or(""),
521        "received Google Docs public export response"
522    );
523
524    if !response.status().is_success() {
525        return Err(WebCaptureError::FetchError(format!(
526            "Failed to fetch Google Doc ({} {}): {}",
527            response.status().as_u16(),
528            response.status().canonical_reason().unwrap_or("Unknown"),
529            export_url
530        )));
531    }
532
533    let raw_content = response.text().await.map_err(|e| {
534        WebCaptureError::FetchError(format!("Failed to read Google Doc response: {e}"))
535    })?;
536    debug!(
537        document_id = %document_id,
538        bytes = raw_content.len(),
539        "read Google Docs public export body"
540    );
541
542    // Keep HTML markup escaped so literal examples such as `&lt;ol&gt;` do not
543    // become real tags before the HTML parser sees the document.
544    let content = match format {
545        "txt" | "md" => crate::html::decode_html_entities(&raw_content),
546        _ => raw_content,
547    };
548
549    Ok(GDocsResult {
550        content,
551        format: format.to_string(),
552        document_id,
553        export_url,
554    })
555}
556
557/// Fetch a Google Docs document and convert to Markdown.
558///
559/// Fetches the document as HTML, then converts to Markdown using the
560/// existing HTML-to-Markdown pipeline.
561///
562/// # Arguments
563///
564/// * `url` - Google Docs URL
565/// * `api_token` - Optional API token for private documents
566///
567/// # Errors
568///
569/// Returns an error if the fetch or conversion fails.
570pub async fn fetch_google_doc_as_markdown(
571    url: &str,
572    api_token: Option<&str>,
573) -> crate::Result<GDocsResult> {
574    let result = fetch_google_doc(url, "html", api_token).await?;
575
576    let preprocess = preprocess_google_docs_export_html(&result.content);
577    debug!(
578        document_id = %result.document_id,
579        hoisted = preprocess.hoisted,
580        unwrapped_links = preprocess.unwrapped_links,
581        "google-docs-export pre-processor rewrote markup"
582    );
583    let markdown = normalize_google_docs_export_markdown(
584        &crate::markdown::convert_html_to_markdown(&preprocess.html, Some(&result.export_url))?,
585    );
586    debug!(
587        document_id = %result.document_id,
588        bytes = markdown.len(),
589        "rendered Google Docs public export markdown"
590    );
591
592    Ok(GDocsResult {
593        content: markdown,
594        format: "markdown".to_string(),
595        document_id: result.document_id,
596        export_url: result.export_url,
597    })
598}
599
/// Result of running the Google Docs export HTML pre-processor.
///
/// Exposes the rewritten HTML alongside counters that are useful for debug
/// logging (`gdocs.export.style-hoist`). See issue #92 R6.
///
/// Returned by [`preprocess_google_docs_export_html`].
#[derive(Debug, Clone)]
pub struct GDocsExportPreprocessResult {
    /// Rewritten HTML.
    pub html: String,
    /// Number of inline-style spans turned into `<strong>`/`<em>`/`<del>`.
    ///
    /// Counts both `style="..."` spans and spans styled through CSS classes.
    pub hoisted: usize,
    /// Number of `google.com/url?q=` redirect wrappers unwrapped.
    pub unwrapped_links: usize,
}
613
614/// Pre-process Google Docs export HTML so the generic `html2md` pipeline
615/// preserves inline formatting, heading numbering, and link targets.
616///
617/// Google Drive serves bold/italic/strikethrough as inline style spans and
618/// wraps every link through a `google.com/url?q=` redirect, both of which
619/// the generic converter would otherwise discard. This function rewrites
620/// those constructs into semantic HTML before conversion.
621#[must_use]
622pub fn preprocess_google_docs_export_html(html: &str) -> GDocsExportPreprocessResult {
623    let mut hoisted: usize = 0;
624    let mut unwrapped_links: usize = 0;
625    let class_styles = extract_css_class_styles(html);
626
627    let mut out = hoist_inline_style_spans(html, &mut hoisted);
628    out = hoist_class_style_spans(&out, &class_styles, &mut hoisted);
629    out = split_strong_at_block_boundaries(&out);
630    out = split_paragraphs_at_bold_boundaries(&out);
631    out = remove_empty_strong(&out);
632    out = coalesce_adjacent_strong(&out);
633    out = convert_class_indented_blockquotes(&out, &class_styles);
634    out = nest_google_docs_lists(&out, &class_styles);
635    out = strip_google_docs_heading_noise(&out);
636    out = strip_heading_inline_formatting(&out);
637    out = unwrap_google_redirect_links(&out, &mut unwrapped_links);
638    out = out.replace("&nbsp;", " ");
639    out = out.replace('\u{00A0}', " ");
640
641    GDocsExportPreprocessResult {
642        html: out,
643        hoisted,
644        unwrapped_links,
645    }
646}
647
648/// Normalize Markdown emitted from Google Docs public-export HTML converters.
649#[must_use]
650pub fn normalize_google_docs_export_markdown(markdown: &str) -> String {
651    let markdown = unescape_public_export_punctuation(markdown);
652    let markdown = convert_setext_headings(&markdown);
653    let markdown = normalize_atx_headings(&markdown);
654    let markdown = normalize_bullet_markers(&markdown);
655    let markdown = normalize_list_spacing(&markdown);
656    let markdown = normalize_blockquote_spacing(&markdown);
657    let markdown = normalize_markdown_tables(&markdown);
658    crate::markdown::clean_markdown(&markdown)
659}
660
661fn hoist_inline_style_spans(html: &str, hoisted: &mut usize) -> String {
662    let span_re = Regex::new(r#"(?is)<span\s+([^>]*style="([^"]*)"[^>]*)>(.*?)</span>"#)
663        .expect("valid regex");
664    span_re
665        .replace_all(html, |caps: &regex::Captures<'_>| {
666            let style = caps.get(2).map_or("", |m| m.as_str());
667            let inner = caps.get(3).map_or("", |m| m.as_str());
668            semantic_wrapped_html(inner, style).map_or_else(
669                || caps[0].to_string(),
670                |wrapped| {
671                    *hoisted += 1;
672                    wrapped
673                },
674            )
675        })
676        .into_owned()
677}
678
679fn hoist_class_style_spans(
680    html: &str,
681    class_styles: &HashMap<String, String>,
682    hoisted: &mut usize,
683) -> String {
684    let class_span_re = Regex::new(r#"(?is)<span\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</span>"#)
685        .expect("valid regex");
686    class_span_re
687        .replace_all(html, |caps: &regex::Captures<'_>| {
688            let class_attr = caps.get(2).map_or("", |m| m.as_str());
689            let inner = caps.get(3).map_or("", |m| m.as_str());
690            let style = combined_class_style(class_styles, class_attr);
691            semantic_wrapped_html(inner, &style).map_or_else(
692                || caps[0].to_string(),
693                |wrapped| {
694                    *hoisted += 1;
695                    wrapped
696                },
697            )
698        })
699        .into_owned()
700}
701
702// Issue #120: a `<strong>` whose closing tag would land on the far side of a
703// `<br>`/`<img>` boundary leaks bold across an image or paragraph break.
704// Split such a strong so each piece on either side of the boundary becomes
705// its own bold run.
706fn split_strong_at_block_boundaries(html: &str) -> String {
707    let strong_re = Regex::new(r"(?is)<strong>(.*?)</strong>").expect("valid regex");
708    let boundary_re = Regex::new(r"(?is)<br\s*/?>|<img\b[^>]*/?>").expect("valid regex");
709
710    let mut current = html.to_string();
711    loop {
712        let mut changed = false;
713        let next = strong_re
714            .replace_all(&current, |caps: &regex::Captures<'_>| {
715                let inner = caps.get(1).map_or("", |m| m.as_str());
716                if !boundary_re.is_match(inner) {
717                    return caps[0].to_string();
718                }
719                changed = true;
720                let mut result = String::new();
721                let mut last_end = 0usize;
722                for boundary in boundary_re.find_iter(inner) {
723                    let chunk = &inner[last_end..boundary.start()];
724                    if chunk.trim().is_empty() {
725                        result.push_str(chunk);
726                    } else {
727                        result.push_str("<strong>");
728                        result.push_str(chunk);
729                        result.push_str("</strong>");
730                    }
731                    result.push_str(boundary.as_str());
732                    last_end = boundary.end();
733                }
734                let tail = &inner[last_end..];
735                if tail.trim().is_empty() {
736                    result.push_str(tail);
737                } else {
738                    result.push_str("<strong>");
739                    result.push_str(tail);
740                    result.push_str("</strong>");
741                }
742                result
743            })
744            .into_owned();
745        current = next;
746        if !changed {
747            break;
748        }
749    }
750    current
751}
752
753// Issue #120: when a `<p>` contains a `<br>` adjacent to a `<strong>` or
754// `<img>`, split the paragraph at those boundaries so the markdown converter
755// emits separate blocks (`**Caption**\n\n![](x)`) instead of an inline soft
756// break that keeps them on the same line.
757fn split_paragraphs_at_bold_boundaries(html: &str) -> String {
758    let p_re = Regex::new(r"(?is)<p\b([^>]*)>(.*?)</p>").expect("valid regex");
759    let br_re = Regex::new(r"(?is)<br\s*/?>").expect("valid regex");
760    let img_re = Regex::new(r"(?is)<img\b[^>]*/?>").expect("valid regex");
761    let strong_or_img_re = Regex::new(r"(?is)<strong>|<img\b").expect("valid regex");
762
763    p_re.replace_all(html, |caps: &regex::Captures<'_>| {
764        let attrs = caps.get(1).map_or("", |m| m.as_str());
765        let inner = caps.get(2).map_or("", |m| m.as_str());
766        if !br_re.is_match(inner) {
767            return caps[0].to_string();
768        }
769        if !strong_or_img_re.is_match(inner) {
770            return caps[0].to_string();
771        }
772
773        let mut segments: Vec<String> = Vec::new();
774        let mut current = String::new();
775        let mut idx = 0usize;
776        while idx < inner.len() {
777            if let Some(br) = br_re.find_at(inner, idx) {
778                if br.start() == idx {
779                    flush_paragraph_segment(&mut segments, &mut current);
780                    idx = br.end();
781                    continue;
782                }
783            }
784            if let Some(img) = img_re.find_at(inner, idx) {
785                if img.start() == idx {
786                    flush_paragraph_segment(&mut segments, &mut current);
787                    segments.push(img.as_str().to_string());
788                    idx = img.end();
789                    continue;
790                }
791            }
792            // Find the next boundary.
793            let next_br = br_re.find_at(inner, idx).map(|m| m.start());
794            let next_img = img_re.find_at(inner, idx).map(|m| m.start());
795            let next = match (next_br, next_img) {
796                (Some(a), Some(b)) => a.min(b),
797                (Some(a), None) | (None, Some(a)) => a,
798                (None, None) => inner.len(),
799            };
800            current.push_str(&inner[idx..next]);
801            idx = next;
802        }
803        flush_paragraph_segment(&mut segments, &mut current);
804
805        if segments.len() <= 1 {
806            return caps[0].to_string();
807        }
808        let mut out = String::new();
809        for segment in &segments {
810            let _ = write!(out, "<p{attrs}>{segment}</p>");
811        }
812        out
813    })
814    .into_owned()
815}
816
// Move the pending segment text in `current` onto `segments`.
//
// Whitespace-only (or empty) text is discarded instead of pushed. A pushed
// segment keeps its surrounding whitespace. `current` is always left empty.
fn flush_paragraph_segment(segments: &mut Vec<String>, current: &mut String) {
    if current.trim().is_empty() {
        current.clear();
    } else {
        // Hand the buffer over instead of cloning it and then clearing.
        segments.push(std::mem::take(current));
    }
}
824
825// Issue #120: drop empty `<strong></strong>` so they cannot pair with a
826// neighbour and emit `****`.
827fn remove_empty_strong(html: &str) -> String {
828    let empty_re = Regex::new(r"(?is)<strong>\s*</strong>").expect("valid regex");
829    empty_re.replace_all(html, "").into_owned()
830}
831
832// Issue #120: merge adjacent `<strong>` siblings (optionally separated by
833// whitespace) into a single bold run so the converter emits `**a b**`
834// instead of `**a** **b**`.
835fn coalesce_adjacent_strong(html: &str) -> String {
836    let adjacent_re = Regex::new(r"(?is)</strong>(\s*)<strong>").expect("valid regex");
837    let mut current = html.to_string();
838    loop {
839        let next = adjacent_re.replace_all(&current, "$1").into_owned();
840        if next == current {
841            return next;
842        }
843        current = next;
844    }
845}
846
847fn convert_class_indented_blockquotes(
848    html: &str,
849    class_styles: &HashMap<String, String>,
850) -> String {
851    let class_paragraph_re =
852        Regex::new(r#"(?is)<p\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</p>"#).expect("valid regex");
853    class_paragraph_re
854        .replace_all(html, |caps: &regex::Captures<'_>| {
855            let class_attr = caps.get(2).map_or("", |m| m.as_str());
856            let inner = caps.get(3).map_or("", |m| m.as_str());
857            let style = combined_class_style(class_styles, class_attr);
858            if is_blockquote_style(&style) {
859                format!("<blockquote><p>{inner}</p></blockquote>")
860            } else {
861                caps[0].to_string()
862            }
863        })
864        .into_owned()
865}
866
/// One `<ul>`/`<ol>` element located in export HTML by
/// `nest_google_docs_lists`.
#[derive(Debug, Clone)]
struct ExportListBlock {
    /// Byte offset of the element's first character in the source HTML.
    start: usize,
    /// Byte offset just past the element's closing tag.
    end: usize,
    /// Lower-cased tag name: `ul` or `ol`.
    tag: String,
    /// Markup between the opening and closing tags.
    inner: String,
    /// Value of the `start` attribute; only captured for `ol` elements.
    start_attr: Option<String>,
}
875
/// One `<li>` element collected by `render_nested_list_group`.
#[derive(Debug, Clone)]
struct ExportListItem {
    /// Tag name (`ul` or `ol`) of the list block the item came from.
    tag: String,
    /// Nesting level computed by `google_docs_list_item_level`.
    level: usize,
    /// Markup between the `<li>` and `</li>` tags.
    inner: String,
}
882
883fn nest_google_docs_lists(html: &str, class_styles: &HashMap<String, String>) -> String {
884    let list_re = Regex::new(r"(?is)<(ul|ol)\b([^>]*)>(.*?)</(ul|ol)>").expect("valid regex");
885    let start_attr_re = Regex::new(r#"(?i)\bstart\s*=\s*"([^"]*)""#).expect("valid regex");
886    let blocks: Vec<ExportListBlock> = list_re
887        .captures_iter(html)
888        .filter_map(|caps| {
889            let open_tag = caps.get(1)?.as_str().to_ascii_lowercase();
890            let close_tag = caps.get(4)?.as_str().to_ascii_lowercase();
891            if open_tag != close_tag {
892                return None;
893            }
894            let whole = caps.get(0)?;
895            let attrs = caps.get(2).map_or("", |m| m.as_str());
896            let start_attr = if open_tag == "ol" {
897                start_attr_re
898                    .captures(attrs)
899                    .and_then(|c| c.get(1).map(|m| m.as_str().to_string()))
900            } else {
901                None
902            };
903            Some(ExportListBlock {
904                start: whole.start(),
905                end: whole.end(),
906                tag: open_tag,
907                inner: caps.get(3).map_or("", |m| m.as_str()).to_string(),
908                start_attr,
909            })
910        })
911        .collect();
912
913    if blocks.len() < 2 {
914        return html.to_string();
915    }
916
917    let mut groups: Vec<Vec<ExportListBlock>> = Vec::new();
918    let mut current: Vec<ExportListBlock> = Vec::new();
919    for block in blocks {
920        if let Some(previous) = current.last() {
921            if !html[previous.end..block.start].trim().is_empty() {
922                if current.len() > 1 {
923                    groups.push(std::mem::take(&mut current));
924                } else {
925                    current.clear();
926                }
927            }
928        }
929        current.push(block);
930    }
931    if current.len() > 1 {
932        groups.push(current);
933    }
934
935    if groups.is_empty() {
936        return html.to_string();
937    }
938
939    let mut out = html.to_string();
940    for group in groups.iter().rev() {
941        let rendered = render_nested_list_group(group, class_styles);
942        let start = group.first().expect("non-empty group").start;
943        let end = group.last().expect("non-empty group").end;
944        out.replace_range(start..end, &rendered);
945    }
946    out
947}
948
/// Renders a run of adjacent flat list blocks as one nested list structure.
///
/// All `<li>` items of every block in `group` are flattened into a single
/// sequence, each tagged with its source list tag and the nesting level from
/// `google_docs_list_item_level`. The sequence is then replayed against a
/// per-level stack (`open_tags` / `item_open`): lists are closed when the
/// level decreases and opened when it increases, and a deeper list is emitted
/// inside the still-open `<li>` of its parent level.
///
/// If no `<li>` is found, the blocks are re-emitted as bare
/// `<tag>inner</tag>` elements (original attributes are not reproduced).
// The single pass over the items shares mutable stack state between all steps.
#[allow(clippy::too_many_lines)]
fn render_nested_list_group(
    group: &[ExportListBlock],
    class_styles: &HashMap<String, String>,
) -> String {
    let item_re = Regex::new(r"(?is)<li\b([^>]*)>(.*?)</li>").expect("valid regex");
    // Flatten every block's items, remembering the source tag and nesting level.
    let items: Vec<ExportListItem> = group
        .iter()
        .flat_map(|block| {
            item_re.captures_iter(&block.inner).map(|caps| {
                let attrs = caps.get(1).map_or("", |m| m.as_str());
                let inner = caps.get(2).map_or("", |m| m.as_str()).to_string();
                ExportListItem {
                    tag: block.tag.clone(),
                    level: google_docs_list_item_level(attrs, class_styles),
                    inner,
                }
            })
        })
        .collect();

    // Nothing to nest: emit the blocks back to back without their attributes.
    if items.is_empty() {
        let mut unchanged = String::new();
        for block in group {
            write!(unchanged, "<{}>{}</{}>", block.tag, block.inner, block.tag)
                .expect("write to String");
        }
        return unchanged;
    }

    // Only the first block's `start` attribute is carried over, and only onto
    // the first list opened at level 0 (guarded by `top_level_opened`).
    let top_level_start = group.first().and_then(|block| block.start_attr.clone());

    let mut html = String::new();
    // Deepest level that currently has an open list; `None` before the first item.
    let mut current_level: Option<usize> = None;
    // Per level: tag of the list currently open there, if any.
    let mut open_tags: Vec<Option<String>> = Vec::new();
    // Per level: whether an `<li>` is open and still awaiting its `</li>`.
    let mut item_open: Vec<bool> = Vec::new();
    let mut top_level_opened = false;

    for item in items {
        let level = item.level;
        // Moving to a shallower level: close every deeper list first.
        while current_level.is_some_and(|current| current > level) {
            let current = current_level.expect("checked as Some");
            close_rendered_list(&mut html, &mut open_tags, &mut item_open, current);
            current_level = current.checked_sub(1);
        }

        // Moving to a deeper level: open one list per missing level.
        while current_level.is_none_or(|current| current < level) {
            let next_level = current_level.map_or(0, |current| current + 1);
            let start_attr = if next_level == 0 && !top_level_opened {
                top_level_opened = true;
                top_level_start.as_deref()
            } else {
                None
            };
            open_rendered_list(
                &mut html,
                &mut open_tags,
                &mut item_open,
                next_level,
                &item.tag,
                start_attr,
            );
            current_level = Some(next_level);
        }

        ensure_list_stack(&mut open_tags, &mut item_open, level);
        if open_tags[level]
            .as_deref()
            .is_some_and(|tag| tag != item.tag)
        {
            // Same level but a different list kind: close the list and reopen
            // it with this item's tag.
            close_rendered_list(&mut html, &mut open_tags, &mut item_open, level);
            let start_attr = if level == 0 && !top_level_opened {
                top_level_opened = true;
                top_level_start.as_deref()
            } else {
                None
            };
            open_rendered_list(
                &mut html,
                &mut open_tags,
                &mut item_open,
                level,
                &item.tag,
                start_attr,
            );
        } else if open_tags[level].is_none() {
            // No list is recorded as open at this level: open one.
            let start_attr = if level == 0 && !top_level_opened {
                top_level_opened = true;
                top_level_start.as_deref()
            } else {
                None
            };
            open_rendered_list(
                &mut html,
                &mut open_tags,
                &mut item_open,
                level,
                &item.tag,
                start_attr,
            );
        }

        // Finish the previous sibling, then start this item. Its `</li>` is
        // deferred so that a deeper list can still be emitted inside it.
        close_rendered_item(&mut html, &mut item_open, level);
        html.push_str("<li>");
        html.push_str(&item.inner);
        item_open[level] = true;

        // Reset the bookkeeping of all deeper levels.
        for deeper in (level + 1)..item_open.len() {
            item_open[deeper] = false;
            open_tags[deeper] = None;
        }
    }

    // Close whatever is still open, deepest level first.
    while let Some(current) = current_level {
        close_rendered_list(&mut html, &mut open_tags, &mut item_open, current);
        current_level = current.checked_sub(1);
    }

    html
}
1069
/// Grows both per-level stacks so that index `level` is addressable.
///
/// The number of slots added is derived from `open_tags` alone; the same
/// number of `false` entries is appended to `item_open`.
fn ensure_list_stack(open_tags: &mut Vec<Option<String>>, item_open: &mut Vec<bool>, level: usize) {
    let missing = (level + 1).saturating_sub(open_tags.len());
    for _ in 0..missing {
        open_tags.push(None);
        item_open.push(false);
    }
}
1076
1077fn open_rendered_list(
1078    html: &mut String,
1079    open_tags: &mut Vec<Option<String>>,
1080    item_open: &mut Vec<bool>,
1081    level: usize,
1082    tag: &str,
1083    start_attr: Option<&str>,
1084) {
1085    ensure_list_stack(open_tags, item_open, level);
1086    html.push('<');
1087    html.push_str(tag);
1088    if let Some(start) = start_attr {
1089        if tag == "ol" && !start.is_empty() {
1090            write!(html, r#" start="{start}""#).expect("write to String");
1091        }
1092    }
1093    html.push('>');
1094    open_tags[level] = Some(tag.to_string());
1095    item_open[level] = false;
1096}
1097
/// Emits `</li>` for `level` if an item is currently open there and clears
/// the flag. Levels outside `item_open` are ignored.
fn close_rendered_item(html: &mut String, item_open: &mut [bool], level: usize) {
    if let Some(open) = item_open.get_mut(level) {
        if *open {
            html.push_str("</li>");
            *open = false;
        }
    }
}
1104
1105fn close_rendered_list(
1106    html: &mut String,
1107    open_tags: &mut [Option<String>],
1108    item_open: &mut [bool],
1109    level: usize,
1110) {
1111    close_rendered_item(html, item_open, level);
1112    if let Some(tag) = open_tags.get_mut(level).and_then(Option::take) {
1113        html.push_str("</");
1114        html.push_str(&tag);
1115        html.push('>');
1116    }
1117}
1118
1119fn google_docs_list_item_level(attrs: &str, class_styles: &HashMap<String, String>) -> usize {
1120    let style = combined_attr_style(class_styles, attrs);
1121    let margin_left = css_point_value(&style, "margin-left");
1122    if margin_left <= 0.0 {
1123        return 0;
1124    }
1125    [54.0, 90.0, 126.0, 162.0, 198.0, 234.0, 270.0, 306.0]
1126        .iter()
1127        .take_while(|boundary| margin_left >= **boundary)
1128        .count()
1129}
1130
1131fn combined_attr_style(class_styles: &HashMap<String, String>, attrs: &str) -> String {
1132    let mut styles = String::new();
1133    if let Some(style) = attr_value(attrs, "style") {
1134        styles.push_str(&style);
1135    }
1136    if let Some(class_attr) = attr_value(attrs, "class") {
1137        styles.push_str(&combined_class_style(class_styles, &class_attr));
1138    }
1139    styles
1140}
1141
1142fn attr_value(attrs: &str, name: &str) -> Option<String> {
1143    let attr_re = Regex::new(&format!(
1144        r#"(?is)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)')"#,
1145        regex::escape(name)
1146    ))
1147    .expect("valid regex");
1148    attr_re.captures(attrs).and_then(|caps| {
1149        caps.get(1)
1150            .or_else(|| caps.get(2))
1151            .map(|value| value.as_str().to_string())
1152    })
1153}
1154
1155fn strip_google_docs_heading_noise(html: &str) -> String {
1156    let empty_anchor_re = Regex::new(r#"(?is)<a\s+id="[^"]*"\s*>\s*</a>"#).expect("valid regex");
1157    let numbering_re =
1158        Regex::new(r"(?is)<span\b[^>]*>\s*\d+(?:\.\d+)*\.?\s*</span>").expect("valid regex");
1159    let mut out = empty_anchor_re.replace_all(html, "").into_owned();
1160    for level in 1..=6 {
1161        let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
1162            .expect("valid regex");
1163        out = heading_re
1164            .replace_all(&out, |caps: &regex::Captures<'_>| {
1165                let open = &caps[1];
1166                let inner = &caps[2];
1167                let close = &caps[3];
1168                let mut cleaned = empty_anchor_re.replace_all(inner, "").into_owned();
1169                cleaned = numbering_re.replace_all(&cleaned, "").into_owned();
1170                format!("{open}{cleaned}{close}")
1171            })
1172            .into_owned();
1173    }
1174    out
1175}
1176
1177fn strip_heading_inline_formatting(html: &str) -> String {
1178    let inline_marker_re = Regex::new(r"(?is)</?(?:strong|em|del)>").expect("valid regex");
1179    let mut out = html.to_string();
1180    for level in 1..=6 {
1181        let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
1182            .expect("valid regex");
1183        out = heading_re
1184            .replace_all(&out, |caps: &regex::Captures<'_>| {
1185                let open = &caps[1];
1186                let inner = &caps[2];
1187                let close = &caps[3];
1188                let cleaned = inline_marker_re.replace_all(inner, "");
1189                format!("{open}{cleaned}{close}")
1190            })
1191            .into_owned();
1192    }
1193    out
1194}
1195
1196fn unwrap_google_redirect_links(html: &str, unwrapped_links: &mut usize) -> String {
1197    let redirect_re =
1198        Regex::new(r#"(?i)href="https?://(?:www\.)?google\.com/url\?q=([^&"]+)[^"]*""#)
1199            .expect("valid regex");
1200    redirect_re
1201        .replace_all(html, |caps: &regex::Captures<'_>| {
1202            let encoded = caps.get(1).map_or("", |m| m.as_str());
1203            let decoded = percent_decode_utf8_lossy(encoded);
1204            *unwrapped_links += 1;
1205            format!(r#"href="{decoded}""#)
1206        })
1207        .into_owned()
1208}
1209
1210fn extract_css_class_styles(html: &str) -> HashMap<String, String> {
1211    let mut class_styles: HashMap<String, String> = HashMap::new();
1212    let style_re = Regex::new(r"(?is)<style\b[^>]*>(.*?)</style>").expect("valid regex");
1213    let class_re = Regex::new(r"\.([A-Za-z0-9_-]+)\s*\{([^{}]*)\}").expect("valid regex");
1214    for style_caps in style_re.captures_iter(html) {
1215        let css = style_caps.get(1).map_or("", |m| m.as_str());
1216        for class_caps in class_re.captures_iter(css) {
1217            let class_name = class_caps.get(1).map_or("", |m| m.as_str());
1218            let style = class_caps.get(2).map_or("", |m| m.as_str());
1219            class_styles
1220                .entry(class_name.to_string())
1221                .and_modify(|existing| {
1222                    existing.push(';');
1223                    existing.push_str(style);
1224                })
1225                .or_insert_with(|| style.to_string());
1226        }
1227    }
1228    class_styles
1229}
1230
/// Joins the stored declarations of every known class in `class_attr`.
///
/// Each contribution is prefixed with `;`; class names without an entry in
/// `class_styles` are skipped.
fn combined_class_style(class_styles: &HashMap<String, String>, class_attr: &str) -> String {
    let mut combined = String::new();
    for class_name in class_attr.split_whitespace() {
        if let Some(style) = class_styles.get(class_name) {
            combined.push(';');
            combined.push_str(style);
        }
    }
    combined
}
1241
1242fn semantic_wrapped_html(inner: &str, style: &str) -> Option<String> {
1243    let bold = css_has_bold(style);
1244    let italic = css_has_italic(style);
1245    let strike = css_has_strike(style);
1246    if !bold && !italic && !strike {
1247        return None;
1248    }
1249    let mut wrapped = inner.to_string();
1250    if strike {
1251        wrapped = format!("<del>{wrapped}</del>");
1252    }
1253    if italic {
1254        wrapped = format!("<em>{wrapped}</em>");
1255    }
1256    if bold {
1257        wrapped = format!("<strong>{wrapped}</strong>");
1258    }
1259    Some(wrapped)
1260}
1261
1262fn css_has_bold(style: &str) -> bool {
1263    Regex::new(r"(?i)font-weight\s*:\s*(?:bold|[6-9]\d{2})")
1264        .expect("valid regex")
1265        .is_match(style)
1266}
1267
1268fn css_has_italic(style: &str) -> bool {
1269    Regex::new(r"(?i)font-style\s*:\s*italic")
1270        .expect("valid regex")
1271        .is_match(style)
1272}
1273
1274fn css_has_strike(style: &str) -> bool {
1275    Regex::new(r"(?i)text-decoration[^;]*\bline-through\b")
1276        .expect("valid regex")
1277        .is_match(style)
1278}
1279
1280fn is_blockquote_style(style: &str) -> bool {
1281    let margin_left = css_point_value(style, "margin-left");
1282    let margin_right = css_point_value(style, "margin-right");
1283    margin_left > 0.0 && margin_right > 0.0 && (margin_left - margin_right).abs() < 0.1
1284}
1285
1286fn css_point_value(style: &str, property: &str) -> f64 {
1287    let re = Regex::new(&format!(
1288        r"(?i){}\s*:\s*(-?\d+(?:\.\d+)?)pt",
1289        regex::escape(property)
1290    ))
1291    .expect("valid regex");
1292    re.captures(style)
1293        .and_then(|caps| caps.get(1))
1294        .and_then(|value| value.as_str().parse::<f64>().ok())
1295        .unwrap_or(0.0)
1296}
1297
/// Decodes `%XX` percent escapes in `input`.
///
/// A `%` that is not followed by two hexadecimal digits is copied through
/// unchanged. The decoded bytes are interpreted as UTF-8, with invalid
/// sequences replaced by U+FFFD.
fn percent_decode_utf8_lossy(input: &str) -> String {
    let mut remaining = input.as_bytes();
    let mut decoded = Vec::with_capacity(remaining.len());
    while let Some((&byte, tail)) = remaining.split_first() {
        if byte == b'%' {
            if let [hi, lo, after @ ..] = tail {
                let digits = (
                    char::from(*hi).to_digit(16),
                    char::from(*lo).to_digit(16),
                );
                if let (Some(hi), Some(lo)) = digits {
                    if let Ok(value) = u8::try_from((hi << 4) | lo) {
                        decoded.push(value);
                        remaining = after;
                        continue;
                    }
                }
            }
        }
        decoded.push(byte);
        remaining = tail;
    }
    String::from_utf8_lossy(&decoded).into_owned()
}
1321
/// Drops the backslash from the Markdown escapes `\.`, `\!`, `\(`, `\)`,
/// `\[` and `\]`; every other character, including other escapes, is kept.
fn unescape_public_export_punctuation(markdown: &str) -> String {
    let mut out = String::with_capacity(markdown.len());
    let mut chars = markdown.chars().peekable();
    while let Some(ch) = chars.next() {
        if ch == '\\' {
            // Consume the escaped character only when it is one we unescape.
            let escaped = chars.next_if(|next| matches!(next, '.' | '!' | '(' | ')' | '[' | ']'));
            if let Some(punctuation) = escaped {
                out.push(punctuation);
                continue;
            }
        }
        out.push(ch);
    }
    out
}
1331
/// Converts setext headings (text underlined with `=` or `-`) to ATX form.
///
/// A line followed by an underline of at least five `=` becomes `# text`;
/// one followed by at least five `-` becomes `## text`. The underline line is
/// consumed. Lines are re-joined with `\n`, so a trailing newline in the
/// input is not preserved.
///
/// A setext underline needs heading text directly above it. An underline that
/// follows a blank line is left untouched, because there it is a thematic
/// break (`-----`) or literal text rather than a heading marker; converting
/// it would emit an empty `## ` heading and drop the rule.
fn convert_setext_headings(markdown: &str) -> String {
    let lines: Vec<&str> = markdown.lines().collect();
    let mut out = Vec::with_capacity(lines.len());
    let mut index = 0;
    while index < lines.len() {
        let text = lines[index].trim();
        let prefix = if text.is_empty() {
            None
        } else {
            lines.get(index + 1).and_then(|next| {
                let underline = next.trim();
                if is_setext_underline(underline, '=') {
                    Some("#")
                } else if is_setext_underline(underline, '-') {
                    Some("##")
                } else {
                    None
                }
            })
        };
        if let Some(prefix) = prefix {
            out.push(format!("{prefix} {text}"));
            // Skip the heading text and its underline.
            index += 2;
        } else {
            out.push(lines[index].to_string());
            index += 1;
        }
    }
    out.join("\n")
}

/// Returns `true` when `line` is at least five bytes long and consists solely
/// of `marker` characters.
fn is_setext_underline(line: &str, marker: char) -> bool {
    line.len() >= 5 && line.chars().all(|ch| ch == marker)
}
1359
1360fn normalize_atx_headings(markdown: &str) -> String {
1361    let heading_re = Regex::new(r"^(#{1,6})\s+(.+?)\s*$").expect("valid regex");
1362    let closing_re = closing_atx_heading_re();
1363    markdown
1364        .lines()
1365        .map(|line| {
1366            let Some(caps) = heading_re.captures(line) else {
1367                return line.to_string();
1368            };
1369            let hashes = caps.get(1).map_or("", |m| m.as_str());
1370            let mut text = caps.get(2).map_or("", |m| m.as_str()).trim().to_string();
1371            text = closing_re.replace(&text, "").trim().to_string();
1372            text = strip_wrapping_markdown_emphasis(&text);
1373            format!("{hashes} {text}")
1374        })
1375        .collect::<Vec<_>>()
1376        .join("\n")
1377}
1378
/// Removes one layer of `***`, `**` or `*` that both starts and ends the
/// trimmed text, trying the longest marker first.
///
/// The text must be longer than the two markers combined; otherwise, or when
/// no marker matches, the trimmed text is returned as is.
fn strip_wrapping_markdown_emphasis(text: &str) -> String {
    let trimmed = text.trim();
    ["***", "**", "*"]
        .iter()
        .copied()
        .find_map(|marker| {
            if trimmed.len() <= marker.len() * 2 {
                return None;
            }
            trimmed.strip_prefix(marker)?.strip_suffix(marker)
        })
        .map_or_else(|| trimmed.to_string(), |inner| inner.trim().to_string())
}
1393
1394fn normalize_bullet_markers(markdown: &str) -> String {
1395    let bullet_re = asterisk_bullet_re();
1396    markdown
1397        .lines()
1398        .map(|line| bullet_re.replace(line, "$1- ").into_owned())
1399        .collect::<Vec<_>>()
1400        .join("\n")
1401}
1402
1403fn normalize_list_spacing(markdown: &str) -> String {
1404    let lines: Vec<&str> = markdown.lines().collect();
1405    let mut out = Vec::with_capacity(lines.len());
1406
1407    for (index, line) in lines.iter().enumerate() {
1408        if line.trim().is_empty()
1409            && previous_non_empty_line(&lines, index).is_some_and(is_markdown_list_item)
1410            && next_non_empty_line(&lines, index).is_some_and(is_markdown_list_item)
1411        {
1412            continue;
1413        }
1414        out.push((*line).to_string());
1415    }
1416
1417    out.join("\n")
1418}
1419
/// Returns the closest line before `index` that is not blank, if any.
///
/// Panics if `index` is greater than `lines.len()`.
fn previous_non_empty_line<'a>(lines: &'a [&str], index: usize) -> Option<&'a str> {
    for candidate in lines[..index].iter().rev() {
        if !candidate.trim().is_empty() {
            return Some(*candidate);
        }
    }
    None
}
1427
/// Returns the closest line after `index` that is not blank, if any.
///
/// Panics if `index + 1` is greater than `lines.len()`.
fn next_non_empty_line<'a>(lines: &'a [&str], index: usize) -> Option<&'a str> {
    let following = &lines[index + 1..];
    for candidate in following {
        if !candidate.trim().is_empty() {
            return Some(*candidate);
        }
    }
    None
}
1434
1435fn is_markdown_list_item(line: &str) -> bool {
1436    markdown_list_item_re().is_match(line)
1437}
1438
/// Normalizes blank lines in and around block quotes.
///
/// - Blank lines and bare `>` lines inside a quote collapse into a single `>`
///   separator, emitted only if another `> ` line follows.
/// - Bare `>` lines outside a quote are dropped.
/// - The first non-quote line after a quote is preceded by one empty line.
///
/// Every emitted line is terminated with `\n`.
fn normalize_blockquote_spacing(markdown: &str) -> String {
    let mut out = String::with_capacity(markdown.len());
    let mut owes_separator = false;
    let mut inside_quote = false;

    for line in markdown.lines() {
        let trimmed = line.trim();
        let is_blank = trimmed.is_empty();

        // Gaps are never written directly; inside a quote they are remembered
        // as a pending separator instead.
        if (is_blank && inside_quote) || trimmed == ">" {
            owes_separator |= inside_quote;
            continue;
        }

        if line.starts_with("> ") {
            if std::mem::take(&mut owes_separator) {
                out.push_str(">\n");
            }
            inside_quote = true;
        } else {
            // Leaving the quote: separate it from the following content.
            if inside_quote && !is_blank {
                out.push('\n');
            }
            owes_separator = false;
            inside_quote = false;
        }
        out.push_str(line);
        out.push('\n');
    }

    out
}
1479
1480fn normalize_markdown_tables(markdown: &str) -> String {
1481    let lines: Vec<&str> = markdown.lines().collect();
1482    let mut out = Vec::with_capacity(lines.len());
1483    let mut index = 0;
1484
1485    while index < lines.len() {
1486        if !is_markdown_table_line(lines[index]) {
1487            out.push(lines[index].to_string());
1488            index += 1;
1489            continue;
1490        }
1491
1492        let start = index;
1493        while index < lines.len() && is_markdown_table_line(lines[index]) {
1494            index += 1;
1495        }
1496        let block = &lines[start..index];
1497        if block.len() >= 2 && is_markdown_separator_line(block[1]) {
1498            out.extend(normalize_markdown_table_block(block));
1499        } else {
1500            out.extend(block.iter().map(|line| (*line).to_string()));
1501        }
1502    }
1503
1504    out.join("\n")
1505}
1506
/// Reports whether the trimmed line both starts and ends with `|` and
/// contains at least two `|` characters.
fn is_markdown_table_line(line: &str) -> bool {
    let trimmed = line.trim();
    let pipe_count = trimmed.chars().filter(|ch| *ch == '|').count();
    pipe_count >= 2 && trimmed.starts_with('|') && trimmed.ends_with('|')
}
1511
1512fn is_markdown_separator_line(line: &str) -> bool {
1513    split_markdown_table_cells(line)
1514        .iter()
1515        .all(|cell| markdown_table_separator_cell_re().is_match(cell))
1516}
1517
1518fn normalize_markdown_table_block(lines: &[&str]) -> Vec<String> {
1519    lines
1520        .iter()
1521        .enumerate()
1522        .map(|(index, line)| {
1523            let cells = split_markdown_table_cells(line);
1524            if index == 1 {
1525                let separators = vec!["---".to_string(); cells.len()];
1526                render_markdown_table_row(&separators)
1527            } else {
1528                render_markdown_table_row(&cells)
1529            }
1530        })
1531        .collect()
1532}
1533
/// Splits a table line into trimmed cell strings.
///
/// Surrounding whitespace and all leading/trailing `|` characters are removed
/// before splitting on `|`.
fn split_markdown_table_cells(line: &str) -> Vec<String> {
    let body = line.trim().trim_matches('|');
    let mut cells = Vec::new();
    for cell in body.split('|') {
        cells.push(cell.trim().to_string());
    }
    cells
}
1541
/// Renders cells as a table row of the form `| a | b |`.
fn render_markdown_table_row(cells: &[String]) -> String {
    let mut row = String::from("| ");
    row.push_str(&cells.join(" | "));
    row.push_str(" |");
    row
}
1545
1546fn closing_atx_heading_re() -> &'static Regex {
1547    static RE: OnceLock<Regex> = OnceLock::new();
1548    RE.get_or_init(|| Regex::new(r"\s+#{1,6}$").expect("valid regex"))
1549}
1550
1551fn asterisk_bullet_re() -> &'static Regex {
1552    static RE: OnceLock<Regex> = OnceLock::new();
1553    RE.get_or_init(|| Regex::new(r"^(\s*)\* ").expect("valid regex"))
1554}
1555
1556fn markdown_list_item_re() -> &'static Regex {
1557    static RE: OnceLock<Regex> = OnceLock::new();
1558    RE.get_or_init(|| Regex::new(r"^\s*(?:[-+*]|\d+\.)\s+").expect("valid regex"))
1559}
1560
1561fn markdown_table_separator_cell_re() -> &'static Regex {
1562    static RE: OnceLock<Regex> = OnceLock::new();
1563    RE.get_or_init(|| Regex::new(r"^:?-{3,}:?$").expect("valid regex"))
1564}
1565
1566/// Fetch and render a Google Docs document via the authenticated REST API.
1567///
1568/// # Errors
1569///
1570/// Returns an error when the URL is invalid, no token is provided, or the API request fails.
1571pub async fn fetch_google_doc_from_docs_api(
1572    url: &str,
1573    api_token: &str,
1574) -> crate::Result<GDocsRenderedResult> {
1575    let document_id = extract_document_id(url).ok_or_else(|| {
1576        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
1577    })?;
1578    let api_url = build_docs_api_url(&document_id);
1579    debug!(
1580        document_id = %document_id,
1581        api_url = %api_url,
1582        "fetching Google Doc via Docs API"
1583    );
1584
1585    let response = reqwest::Client::new()
1586        .get(&api_url)
1587        .header("Authorization", format!("Bearer {api_token}"))
1588        .header("Accept", "application/json")
1589        .send()
1590        .await
1591        .map_err(|e| {
1592            WebCaptureError::FetchError(format!("Failed to fetch Google Doc via Docs API: {e}"))
1593        })?;
1594    debug!(
1595        document_id = %document_id,
1596        status = response.status().as_u16(),
1597        success = response.status().is_success(),
1598        content_type = response
1599            .headers()
1600            .get(reqwest::header::CONTENT_TYPE)
1601            .and_then(|value| value.to_str().ok())
1602            .unwrap_or(""),
1603        "received Google Docs API response"
1604    );
1605
1606    if !response.status().is_success() {
1607        return Err(WebCaptureError::FetchError(format!(
1608            "Failed to fetch Google Doc via Docs API ({} {}): {}",
1609            response.status().as_u16(),
1610            response.status().canonical_reason().unwrap_or("Unknown"),
1611            api_url
1612        )));
1613    }
1614
1615    let body = response.text().await.map_err(|e| {
1616        WebCaptureError::FetchError(format!("Failed to read Google Docs API response: {e}"))
1617    })?;
1618    let document = serde_json::from_str::<Value>(&body).map_err(|e| {
1619        WebCaptureError::ParseError(format!("Failed to parse Google Docs API response: {e}"))
1620    })?;
1621    let rendered = render_docs_api_document(&document);
1622    debug!(
1623        document_id = %document_id,
1624        title = document.get("title").and_then(|value| value.as_str()).unwrap_or(""),
1625        markdown_bytes = rendered.markdown.len(),
1626        html_bytes = rendered.html.len(),
1627        text_bytes = rendered.text.len(),
1628        "rendered Google Docs API document"
1629    );
1630
1631    Ok(GDocsRenderedResult {
1632        markdown: rendered.markdown,
1633        html: rendered.html,
1634        text: rendered.text,
1635        document_id,
1636        export_url: api_url,
1637        remote_images: Vec::new(),
1638    })
1639}
1640
1641/// Fetch and render the model data embedded in the Google Docs `/edit` route.
1642///
1643/// # Errors
1644///
1645/// Returns an error when the URL is invalid, the fetch fails, or no model chunks are present.
1646pub async fn fetch_google_doc_from_model(
1647    url: &str,
1648    api_token: Option<&str>,
1649) -> crate::Result<GDocsRenderedResult> {
1650    if api_token.is_some() {
1651        return Err(WebCaptureError::BrowserError(
1652            "Rust browser-model Google Docs capture cannot inject API tokens; use --capture api for authenticated Docs API capture".to_string(),
1653        ));
1654    }
1655    let document_id = extract_document_id(url).ok_or_else(|| {
1656        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
1657    })?;
1658    let edit_url = build_edit_url(&document_id);
1659    debug!(
1660        document_id = %document_id,
1661        edit_url = %edit_url,
1662        "capturing Google Doc editor model with a real browser"
1663    );
1664    let model_data = fetch_google_doc_editor_model_with_cdp(&edit_url, &document_id).await?;
1665    let BrowserModelData {
1666        chunks,
1667        cid_urls,
1668        chunk_payload_bytes,
1669        poll_count,
1670        stable_for,
1671    } = model_data;
1672    debug!(
1673        document_id = %document_id,
1674        chunks = chunks.len(),
1675        cid_urls = cid_urls.len(),
1676        chunk_payload_bytes,
1677        poll_count,
1678        stable_for_ms = stable_for.as_millis(),
1679        "extracted Google Docs editor model chunks through CDP"
1680    );
1681    if chunks.is_empty() {
1682        return Err(WebCaptureError::ParseError(
1683            "Google Docs editor page did not expose DOCS_modelChunk data".to_string(),
1684        ));
1685    }
1686
1687    let export_html = match fetch_google_doc(url, "html", None).await {
1688        Ok(result) => Some(result.content),
1689        Err(error) => {
1690            warn!(
1691                document_id = %document_id,
1692                error = %error,
1693                "failed to fetch Google Docs export HTML for browser-model semantic hints"
1694            );
1695            None
1696        }
1697    };
1698    let capture = parse_model_chunks_with_export_html(&chunks, &cid_urls, export_html.as_deref());
1699    let remote_images = remote_images_from_capture(&capture);
1700    info!(
1701        document_id = %document_id,
1702        chunks = chunks.len(),
1703        cid_urls = cid_urls.len(),
1704        chunk_payload_bytes,
1705        poll_count,
1706        stable_for_ms = stable_for.as_millis(),
1707        blocks = capture.blocks.len(),
1708        tables = capture.tables.len(),
1709        images = capture.images.len(),
1710        text_bytes = capture.text.len(),
1711        "parsed Google Docs editor model"
1712    );
1713
1714    Ok(GDocsRenderedResult {
1715        markdown: render_captured_document(&capture, "markdown"),
1716        html: render_captured_document(&capture, "html"),
1717        text: render_captured_document(&capture, "txt"),
1718        document_id,
1719        export_url: edit_url,
1720        remote_images,
1721    })
1722}
1723
1724async fn fetch_google_doc_editor_model_with_cdp(
1725    edit_url: &str,
1726    document_id: &str,
1727) -> crate::Result<BrowserModelData> {
1728    let chrome = crate::browser::find_chrome_executable().ok_or_else(|| {
1729        WebCaptureError::BrowserError(
1730            "Chrome/Chromium executable was not found. Set WEB_CAPTURE_CHROME, CHROME_PATH, or GOOGLE_CHROME_BIN.".to_string(),
1731        )
1732    })?;
1733    let user_data_dir = crate::browser::temporary_user_data_dir();
1734    std::fs::create_dir_all(&user_data_dir)?;
1735
1736    debug!(
1737        document_id = %document_id,
1738        chrome = %chrome.display(),
1739        user_data_dir = %user_data_dir.display(),
1740        edit_url = %edit_url,
1741        "launching headless Chrome CDP session for Google Docs model capture"
1742    );
1743
1744    let mut child = launch_cdp_chrome(&chrome, &user_data_dir)?;
1745    let capture_result = async {
1746        let ws_url = wait_for_devtools_ws_url(&mut child).await?;
1747        let (mut ws, _) = connect_async(&ws_url).await.map_err(|error| {
1748            WebCaptureError::BrowserError(format!(
1749                "Failed to connect to Chrome DevTools websocket: {error}"
1750            ))
1751        })?;
1752        let mut next_id = 0u64;
1753        let session_id = navigate_google_docs_cdp_page(&mut ws, &mut next_id, edit_url).await?;
1754        wait_for_google_docs_model_chunks(&mut ws, &mut next_id, &session_id, document_id).await
1755    }
1756    .await;
1757
1758    if let Err(error) = child.kill().await {
1759        debug!(
1760            document_id = %document_id,
1761            error = %error,
1762            "failed to kill Chrome CDP browser process"
1763        );
1764    }
1765    let _ = child.wait().await;
1766    let _ = std::fs::remove_dir_all(&user_data_dir);
1767
1768    capture_result
1769}
1770
1771async fn navigate_google_docs_cdp_page(
1772    ws: &mut CdpWebSocket,
1773    next_id: &mut u64,
1774    edit_url: &str,
1775) -> crate::Result<String> {
1776    let target = cdp_send(
1777        ws,
1778        next_id,
1779        None,
1780        "Target.createTarget",
1781        serde_json::json!({ "url": "about:blank" }),
1782    )
1783    .await?;
1784    let target_id = target
1785        .get("targetId")
1786        .and_then(Value::as_str)
1787        .ok_or_else(|| {
1788            WebCaptureError::BrowserError(
1789                "Chrome DevTools Target.createTarget did not return targetId".to_string(),
1790            )
1791        })?
1792        .to_string();
1793    let attached = cdp_send(
1794        ws,
1795        next_id,
1796        None,
1797        "Target.attachToTarget",
1798        serde_json::json!({ "targetId": target_id, "flatten": true }),
1799    )
1800    .await?;
1801    let session_id = attached
1802        .get("sessionId")
1803        .and_then(Value::as_str)
1804        .ok_or_else(|| {
1805            WebCaptureError::BrowserError(
1806                "Chrome DevTools Target.attachToTarget did not return sessionId".to_string(),
1807            )
1808        })?
1809        .to_string();
1810
1811    cdp_send(
1812        ws,
1813        next_id,
1814        Some(&session_id),
1815        "Page.enable",
1816        serde_json::json!({}),
1817    )
1818    .await?;
1819    cdp_send(
1820        ws,
1821        next_id,
1822        Some(&session_id),
1823        "Runtime.enable",
1824        serde_json::json!({}),
1825    )
1826    .await?;
1827    cdp_send(
1828        ws,
1829        next_id,
1830        Some(&session_id),
1831        "Page.addScriptToEvaluateOnNewDocument",
1832        serde_json::json!({ "source": GDOCS_MODEL_CAPTURE_INIT_SCRIPT }),
1833    )
1834    .await?;
1835    cdp_send(
1836        ws,
1837        next_id,
1838        Some(&session_id),
1839        "Page.navigate",
1840        serde_json::json!({ "url": edit_url }),
1841    )
1842    .await?;
1843
1844    Ok(session_id)
1845}
1846
1847async fn wait_for_google_docs_model_chunks(
1848    ws: &mut CdpWebSocket,
1849    next_id: &mut u64,
1850    session_id: &str,
1851    document_id: &str,
1852) -> crate::Result<BrowserModelData> {
1853    let started = Instant::now();
1854    let max_wait = gdocs_editor_model_max_wait();
1855    let stability_window = gdocs_editor_model_stability_window();
1856    let mut quiescence = BrowserModelQuiescence::default();
1857    let mut last_chunks = 0usize;
1858    let mut last_cid_urls = 0usize;
1859    let mut last_payload_bytes = 0usize;
1860    let mut last_stable_for = Duration::ZERO;
1861    let mut poll_count = 0usize;
1862
1863    while started.elapsed() < max_wait {
1864        let result = cdp_send(
1865            ws,
1866            next_id,
1867            Some(session_id),
1868            "Runtime.evaluate",
1869            serde_json::json!({
1870                "expression": format!("({GDOCS_MODEL_EXTRACT_SCRIPT})()"),
1871                "returnByValue": true,
1872                "awaitPromise": true
1873            }),
1874        )
1875        .await?;
1876        if let Some(exception) = result.get("exceptionDetails") {
1877            return Err(WebCaptureError::BrowserError(format!(
1878                "Google Docs model extraction script failed: {exception}"
1879            )));
1880        }
1881        let value = result
1882            .pointer("/result/value")
1883            .cloned()
1884            .unwrap_or(Value::Null);
1885        let model_data = browser_model_data_from_value(&value);
1886        poll_count += 1;
1887        let fingerprint = model_data.fingerprint();
1888        last_chunks = model_data.chunks.len();
1889        last_cid_urls = model_data.cid_urls.len();
1890        last_payload_bytes = model_data.chunk_payload_bytes;
1891        let now = Instant::now();
1892        if let Some(stable_for) = quiescence.observe(fingerprint, now, stability_window) {
1893            let mut model_data = model_data;
1894            model_data.poll_count = poll_count;
1895            model_data.stable_for = stable_for;
1896            debug!(
1897                document_id = %document_id,
1898                chunks = model_data.chunks.len(),
1899                cid_urls = model_data.cid_urls.len(),
1900                chunk_payload_bytes = model_data.chunk_payload_bytes,
1901                poll_count,
1902                stable_for_ms = stable_for.as_millis(),
1903                elapsed_ms = started.elapsed().as_millis(),
1904                "captured quiesced Google Docs model chunks through CDP Runtime.evaluate"
1905            );
1906            return Ok(model_data);
1907        }
1908        last_stable_for = quiescence.stable_for(now);
1909        tokio::time::sleep(GDOCS_EDITOR_MODEL_POLL_INTERVAL).await;
1910    }
1911
1912    Err(WebCaptureError::BrowserError(format!(
1913        "Timed out waiting for Google Docs DOCS_modelChunk stream to quiesce for document {document_id} after {} ms (last chunks={last_chunks}, payload_bytes={last_payload_bytes}, cid_urls={last_cid_urls}, poll_count={poll_count}, stable_for_ms={})",
1914        max_wait.as_millis(),
1915        last_stable_for.as_millis()
1916    )))
1917}
1918
1919fn launch_cdp_chrome(
1920    chrome: &std::path::Path,
1921    user_data_dir: &std::path::Path,
1922) -> crate::Result<Child> {
1923    let mut command = Command::new(chrome);
1924    command
1925        .args([
1926            "--headless=new",
1927            "--disable-gpu",
1928            "--disable-extensions",
1929            "--disable-dev-shm-usage",
1930            "--disable-background-networking",
1931            "--disable-component-update",
1932            "--disable-default-apps",
1933            "--disable-sync",
1934            "--metrics-recording-only",
1935            "--no-default-browser-check",
1936            "--no-first-run",
1937            "--no-sandbox",
1938            "--remote-debugging-port=0",
1939            "--window-size=1280,800",
1940        ])
1941        .arg(format!("--user-data-dir={}", user_data_dir.display()))
1942        .arg(format!("--user-agent={GDOCS_USER_AGENT}"))
1943        .stderr(Stdio::piped())
1944        .stdout(Stdio::null())
1945        .kill_on_drop(true);
1946
1947    command.spawn().map_err(|error| {
1948        WebCaptureError::BrowserError(format!("Failed to launch Chrome CDP browser: {error}"))
1949    })
1950}
1951
1952async fn wait_for_devtools_ws_url(child: &mut Child) -> crate::Result<String> {
1953    let stderr = child.stderr.take().ok_or_else(|| {
1954        WebCaptureError::BrowserError("Chrome CDP process did not expose stderr".to_string())
1955    })?;
1956    let mut lines = BufReader::new(stderr).lines();
1957    let started = Instant::now();
1958
1959    while started.elapsed() < GDOCS_BROWSER_LAUNCH_TIMEOUT {
1960        let line = tokio::time::timeout(Duration::from_millis(250), lines.next_line()).await;
1961        match line {
1962            Ok(Ok(Some(line))) => {
1963                if let Some((_, ws_url)) = line.split_once("DevTools listening on ") {
1964                    return Ok(ws_url.trim().to_string());
1965                }
1966            }
1967            Ok(Ok(None)) => {
1968                break;
1969            }
1970            Ok(Err(error)) => {
1971                return Err(WebCaptureError::BrowserError(format!(
1972                    "Failed to read Chrome CDP stderr: {error}"
1973                )));
1974            }
1975            Err(_) => {}
1976        }
1977    }
1978
1979    Err(WebCaptureError::BrowserError(format!(
1980        "Timed out waiting for Chrome DevTools websocket URL after {} ms",
1981        GDOCS_BROWSER_LAUNCH_TIMEOUT.as_millis()
1982    )))
1983}
1984
1985async fn cdp_send(
1986    ws: &mut CdpWebSocket,
1987    next_id: &mut u64,
1988    session_id: Option<&str>,
1989    method: &str,
1990    params: Value,
1991) -> crate::Result<Value> {
1992    *next_id += 1;
1993    let id = *next_id;
1994    let mut message = serde_json::json!({
1995        "id": id,
1996        "method": method,
1997        "params": params
1998    });
1999    if let Some(session_id) = session_id {
2000        message["sessionId"] = Value::String(session_id.to_string());
2001    }
2002
2003    ws.send(Message::Text(message.to_string()))
2004        .await
2005        .map_err(|error| {
2006            WebCaptureError::BrowserError(format!(
2007                "Failed to send Chrome DevTools command {method}: {error}"
2008            ))
2009        })?;
2010
2011    while let Some(message) = ws.next().await {
2012        let message = message.map_err(|error| {
2013            WebCaptureError::BrowserError(format!(
2014                "Failed to read Chrome DevTools response for {method}: {error}"
2015            ))
2016        })?;
2017        if !message.is_text() {
2018            continue;
2019        }
2020        let text = message.to_text().map_err(|error| {
2021            WebCaptureError::BrowserError(format!(
2022                "Chrome DevTools response for {method} was not text: {error}"
2023            ))
2024        })?;
2025        let value = serde_json::from_str::<Value>(text).map_err(|error| {
2026            WebCaptureError::ParseError(format!(
2027                "Failed to parse Chrome DevTools response for {method}: {error}; response={text}"
2028            ))
2029        })?;
2030        if value.get("id").and_then(Value::as_u64) != Some(id) {
2031            continue;
2032        }
2033        if let Some(error) = value.get("error") {
2034            return Err(WebCaptureError::BrowserError(format!(
2035                "Chrome DevTools command {method} failed: {error}"
2036            )));
2037        }
2038        return Ok(value.get("result").cloned().unwrap_or(Value::Null));
2039    }
2040
2041    Err(WebCaptureError::BrowserError(format!(
2042        "Chrome DevTools websocket closed before response for {method}"
2043    )))
2044}
2045
2046fn browser_model_data_from_value(value: &Value) -> BrowserModelData {
2047    let chunks = value
2048        .get("chunks")
2049        .and_then(Value::as_array)
2050        .cloned()
2051        .unwrap_or_default();
2052    let chunk_payload_bytes = model_chunk_payload_bytes(&chunks);
2053    let cid_urls = value
2054        .get("cidUrlMap")
2055        .and_then(Value::as_object)
2056        .map(|map| {
2057            map.iter()
2058                .filter_map(|(key, value)| value.as_str().map(|url| (key.clone(), url.to_string())))
2059                .collect::<HashMap<_, _>>()
2060        })
2061        .unwrap_or_default();
2062    BrowserModelData {
2063        chunks,
2064        cid_urls,
2065        chunk_payload_bytes,
2066        poll_count: 0,
2067        stable_for: Duration::ZERO,
2068    }
2069}
2070
2071fn model_chunk_payload_bytes(chunks: &[Value]) -> usize {
2072    chunks
2073        .iter()
2074        .map(|chunk| serde_json::to_vec(chunk).map_or(0, |encoded| encoded.len()))
2075        .sum()
2076}
2077
2078fn gdocs_editor_model_max_wait() -> Duration {
2079    duration_from_env_ms(
2080        "WEB_CAPTURE_GDOCS_MAX_WAIT_MS",
2081        GDOCS_EDITOR_MODEL_MAX_WAIT_DEFAULT,
2082    )
2083}
2084
2085fn gdocs_editor_model_stability_window() -> Duration {
2086    duration_from_env_ms(
2087        "WEB_CAPTURE_GDOCS_STABILITY_MS",
2088        GDOCS_EDITOR_MODEL_STABILITY_DEFAULT,
2089    )
2090}
2091
2092fn duration_from_env_ms(name: &str, default: Duration) -> Duration {
2093    std::env::var(name).map_or(default, |value| match value.trim().parse::<u64>() {
2094        Ok(ms) => Duration::from_millis(ms),
2095        Err(error) => {
2096            warn!(
2097                name,
2098                value,
2099                error = %error,
2100                default_ms = default.as_millis(),
2101                "ignoring invalid Google Docs model wait environment variable"
2102            );
2103            default
2104        }
2105    })
2106}
2107
2108fn remote_images_from_capture(capture: &CapturedDocument) -> Vec<RemoteImage> {
2109    capture
2110        .images
2111        .iter()
2112        .filter_map(|node| match node {
2113            ContentNode::Image {
2114                url: Some(url),
2115                alt,
2116                ..
2117            } => Some(RemoteImage {
2118                url: url.clone(),
2119                alt: alt.clone(),
2120            }),
2121            ContentNode::Image { .. } | ContentNode::Text { .. } => None,
2122        })
2123        .collect()
2124}
2125
2126/// Render a Google Docs REST API document value.
2127#[must_use]
2128pub fn render_docs_api_document(document: &Value) -> GDocsRenderedOutput {
2129    let blocks = structural_elements_to_blocks(
2130        document
2131            .pointer("/body/content")
2132            .and_then(Value::as_array)
2133            .map_or(&[] as &[Value], Vec::as_slice),
2134        document.pointer("/inlineObjects").unwrap_or(&Value::Null),
2135    );
2136    GDocsRenderedOutput {
2137        markdown: render_blocks_markdown(&blocks),
2138        html: render_blocks_html(&blocks),
2139        text: blocks_to_text(&blocks),
2140    }
2141}
2142
/// Rendered document output.
///
/// Holds the same document rendered three ways; produced by
/// [`render_docs_api_document`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GDocsRenderedOutput {
    /// Markdown rendering of the document blocks.
    pub markdown: String,
    /// HTML rendering of the document blocks.
    pub html: String,
    /// Plain-text rendering of the document blocks.
    pub text: String,
}
2153
2154fn structural_elements_to_blocks(elements: &[Value], inline_objects: &Value) -> Vec<CapturedBlock> {
2155    let mut blocks = Vec::new();
2156    for element in elements {
2157        if let Some(paragraph) = element.get("paragraph") {
2158            let content = paragraph_to_content(paragraph, inline_objects);
2159            if !content_to_text(&content).trim().is_empty()
2160                || content
2161                    .iter()
2162                    .any(|node| matches!(node, ContentNode::Image { .. }))
2163            {
2164                blocks.push(CapturedBlock::Paragraph {
2165                    style: paragraph
2166                        .pointer("/paragraphStyle/namedStyleType")
2167                        .and_then(Value::as_str)
2168                        .map(ToString::to_string),
2169                    list: None,
2170                    quote: false,
2171                    horizontal_rule: false,
2172                    content,
2173                });
2174            }
2175        } else if let Some(table) = element.get("table") {
2176            blocks.push(CapturedBlock::Table(table_to_block(table, inline_objects)));
2177        }
2178    }
2179    blocks
2180}
2181
2182fn table_to_block(table: &Value, inline_objects: &Value) -> TableBlock {
2183    let rows = table
2184        .get("tableRows")
2185        .and_then(Value::as_array)
2186        .map_or(&[] as &[Value], Vec::as_slice)
2187        .iter()
2188        .map(|row| TableRow {
2189            cells: row
2190                .get("tableCells")
2191                .and_then(Value::as_array)
2192                .map_or(&[] as &[Value], Vec::as_slice)
2193                .iter()
2194                .map(|cell| TableCell {
2195                    content: structural_elements_to_inline_content(
2196                        cell.get("content")
2197                            .and_then(Value::as_array)
2198                            .map_or(&[] as &[Value], Vec::as_slice),
2199                        inline_objects,
2200                    ),
2201                })
2202                .collect(),
2203        })
2204        .collect();
2205    TableBlock { rows }
2206}
2207
2208fn structural_elements_to_inline_content(
2209    elements: &[Value],
2210    inline_objects: &Value,
2211) -> Vec<ContentNode> {
2212    let mut content = Vec::new();
2213    for element in elements {
2214        if let Some(paragraph) = element.get("paragraph") {
2215            let paragraph_content = paragraph_to_content(paragraph, inline_objects);
2216            if !content.is_empty() && !paragraph_content.is_empty() {
2217                append_text(&mut content, "\n");
2218            }
2219            content.extend(paragraph_content);
2220        } else if let Some(table) = element.get("table") {
2221            append_text(
2222                &mut content,
2223                &render_blocks_markdown(&[CapturedBlock::Table(table_to_block(
2224                    table,
2225                    inline_objects,
2226                ))]),
2227            );
2228        }
2229    }
2230    content
2231}
2232
2233fn paragraph_to_content(paragraph: &Value, inline_objects: &Value) -> Vec<ContentNode> {
2234    let mut content = Vec::new();
2235    for element in paragraph
2236        .get("elements")
2237        .and_then(Value::as_array)
2238        .map_or(&[] as &[Value], Vec::as_slice)
2239    {
2240        if let Some(text) = element
2241            .pointer("/textRun/content")
2242            .and_then(Value::as_str)
2243            .map(|text| text.strip_suffix('\n').unwrap_or(text))
2244        {
2245            append_text(&mut content, text);
2246        } else if let Some(inline_id) = element
2247            .pointer("/inlineObjectElement/inlineObjectId")
2248            .and_then(Value::as_str)
2249        {
2250            if let Some(image) = inline_object_to_image(inline_id, inline_objects) {
2251                content.push(image);
2252            }
2253        }
2254    }
2255    content
2256}
2257
2258fn inline_object_to_image(inline_id: &str, inline_objects: &Value) -> Option<ContentNode> {
2259    let embedded = inline_objects
2260        .get(inline_id)?
2261        .pointer("/inlineObjectProperties/embeddedObject")?;
2262    let url = embedded
2263        .pointer("/imageProperties/contentUri")
2264        .or_else(|| embedded.pointer("/imageProperties/sourceUri"))
2265        .and_then(Value::as_str)?;
2266    let alt = embedded
2267        .get("title")
2268        .or_else(|| embedded.get("description"))
2269        .and_then(Value::as_str)
2270        .unwrap_or("image");
2271    Some(ContentNode::Image {
2272        cid: None,
2273        url: Some(url.to_string()),
2274        alt: alt.to_string(),
2275        width: json_dimension_to_string(embedded.pointer("/size/width/magnitude")),
2276        height: json_dimension_to_string(embedded.pointer("/size/height/magnitude")),
2277        is_suggestion: false,
2278    })
2279}
2280
2281fn json_dimension_to_string(value: Option<&Value>) -> Option<String> {
2282    match value? {
2283        Value::Number(number) => Some(number.to_string()),
2284        Value::String(text) if !text.is_empty() => Some(text.clone()),
2285        _ => None,
2286    }
2287}
2288
2289fn build_model_style_maps(
2290    items: &[Value],
2291    text_len: usize,
2292    utf16_position_map: &[usize],
2293) -> ModelStyleMaps {
2294    let mut maps = ModelStyleMaps {
2295        inline_styles: vec![TextStyle::default(); text_len],
2296        ..ModelStyleMaps::default()
2297    };
2298
2299    for item in items {
2300        if item.get("ty").and_then(Value::as_str) != Some("as") {
2301            continue;
2302        }
2303        let (Some(start), Some(end), Some(style_type)) = (
2304            item.get("si").and_then(Value::as_u64),
2305            item.get("ei").and_then(Value::as_u64),
2306            item.get("st").and_then(Value::as_str),
2307        ) else {
2308            continue;
2309        };
2310        let (Ok(start), Ok(end)) = (usize::try_from(start), usize::try_from(end)) else {
2311            continue;
2312        };
2313
2314        let start = utf16_position_to_char_position(utf16_position_map, start);
2315        let end = utf16_position_to_char_position(utf16_position_map, end);
2316        if start == 0 || end == 0 {
2317            continue;
2318        }
2319
2320        match style_type {
2321            "text" => {
2322                let style = text_style(item);
2323                apply_inline_style(&mut maps.inline_styles, start, end, &style);
2324            }
2325            "link" => {
2326                let style = TextStyle {
2327                    link: item
2328                        .pointer("/sm/lnks_link/ulnk_url")
2329                        .and_then(Value::as_str)
2330                        .map(ToString::to_string),
2331                    ..TextStyle::default()
2332                };
2333                apply_inline_style(&mut maps.inline_styles, start, end, &style);
2334            }
2335            "paragraph" => {
2336                maps.paragraph_by_end
2337                    .insert(end, paragraph_style_from_model(item));
2338            }
2339            "list" => {
2340                maps.list_by_end.insert(
2341                    end,
2342                    ListMeta {
2343                        id: item
2344                            .pointer("/sm/ls_id")
2345                            .and_then(Value::as_str)
2346                            .unwrap_or("")
2347                            .to_string(),
2348                        level: item
2349                            .pointer("/sm/ls_nest")
2350                            .and_then(Value::as_u64)
2351                            .and_then(|value| usize::try_from(value).ok())
2352                            .unwrap_or(0),
2353                        ordered: false,
2354                    },
2355                );
2356            }
2357            "horizontal_rule" => {
2358                maps.horizontal_rules.insert(end);
2359            }
2360            _ => {}
2361        }
2362    }
2363
2364    maps
2365}
2366
2367fn apply_inline_style(styles: &mut [TextStyle], start: usize, end: usize, patch: &TextStyle) {
2368    let from = start.saturating_sub(1);
2369    let to = end.min(styles.len());
2370    if from >= to {
2371        return;
2372    }
2373    for style in &mut styles[from..to] {
2374        if patch.bold {
2375            style.bold = true;
2376        }
2377        if patch.italic {
2378            style.italic = true;
2379        }
2380        if patch.strike {
2381            style.strike = true;
2382        }
2383        if patch.link.is_some() {
2384            style.link.clone_from(&patch.link);
2385        }
2386    }
2387}
2388
2389fn text_style(item: &Value) -> TextStyle {
2390    TextStyle {
2391        bold: item.pointer("/sm/ts_bd").and_then(Value::as_bool) == Some(true)
2392            && item.pointer("/sm/ts_bd_i").and_then(Value::as_bool) != Some(true),
2393        italic: item.pointer("/sm/ts_it").and_then(Value::as_bool) == Some(true)
2394            && item.pointer("/sm/ts_it_i").and_then(Value::as_bool) != Some(true),
2395        strike: item.pointer("/sm/ts_st").and_then(Value::as_bool) == Some(true)
2396            && item.pointer("/sm/ts_st_i").and_then(Value::as_bool) != Some(true),
2397        link: None,
2398    }
2399}
2400
2401fn paragraph_style_from_model(item: &Value) -> ParagraphStyle {
2402    let heading = item.pointer("/sm/ps_hd").and_then(Value::as_u64);
2403    ParagraphStyle {
2404        style: heading.map(|level| format!("HEADING_{level}")),
2405        indent_start: item
2406            .pointer("/sm/ps_il")
2407            .and_then(Value::as_f64)
2408            .unwrap_or(0.0),
2409        indent_first_line: item
2410            .pointer("/sm/ps_ifl")
2411            .and_then(Value::as_f64)
2412            .unwrap_or(0.0),
2413    }
2414}
2415
/// Map one-based UTF-16 code-unit positions to one-based character positions.
///
/// Index `0` of the returned vector is always `0`. Index `p` (for `p >= 1`)
/// holds the one-based index of the character that owns the `p`-th UTF-16 code
/// unit of `text`, so a character encoded as a surrogate pair occupies two
/// consecutive slots with the same value.
fn build_utf16_position_map(text: &str) -> Vec<usize> {
    let mut map = Vec::with_capacity(text.encode_utf16().count() + 1);
    map.push(0);
    for (idx, ch) in text.chars().enumerate() {
        map.extend(std::iter::repeat(idx + 1).take(ch.len_utf16()));
    }
    map
}
2430
/// Translate a UTF-16 position into a character position using `map`.
///
/// When `position` is out of range or maps to `0`, the last non-zero entry of
/// `map` is used instead; `0` is returned only when `map` has no non-zero
/// entry at all.
fn utf16_position_to_char_position(map: &[usize], position: usize) -> usize {
    match map.get(position) {
        Some(&char_position) if char_position > 0 => char_position,
        _ => map
            .iter()
            .rev()
            .copied()
            .find(|&candidate| candidate > 0)
            .unwrap_or(0),
    }
}
2438
2439/// Parse captured `DOCS_modelChunk` values.
2440#[must_use]
2441pub fn parse_model_chunks<S: BuildHasher>(
2442    chunks: &[Value],
2443    cid_urls: &HashMap<String, String, S>,
2444) -> CapturedDocument {
2445    parse_model_chunks_with_export_html(chunks, cid_urls, None)
2446}
2447
/// Parse captured `DOCS_modelChunk` values and optionally merge semantic hints
/// from Google Docs export HTML.
///
/// The model items are flattened, the text of all `"is"`/`"iss"` items is
/// concatenated, and that text is then walked character by character. Control
/// characters drive a small state machine that splits the text into paragraphs
/// and table rows/cells, while images are inserted at the positions recorded
/// for their element ids.
///
/// `cid_urls` maps an image content id to its URL; images whose id is not in
/// the map are kept with `url: None`. When `export_html` is provided, the
/// blocks are post-processed by `apply_export_semantic_hints` and the plain
/// text is recomputed.
#[must_use]
#[allow(clippy::too_many_lines)]
pub fn parse_model_chunks_with_export_html<S: BuildHasher>(
    chunks: &[Value],
    cid_urls: &HashMap<String, String, S>,
    export_html: Option<&str>,
) -> CapturedDocument {
    let items = collect_model_items(chunks);
    // Document text: the `s` strings of every "is"/"iss" item, in item order.
    let full_text = items
        .iter()
        .filter(|item| matches!(item.get("ty").and_then(Value::as_str), Some("is" | "iss")))
        .filter_map(|item| item.get("s").and_then(Value::as_str))
        .collect::<String>();
    let chars: Vec<char> = full_text.chars().collect();
    // Model indices are translated from UTF-16 positions to character positions.
    let utf16_position_map = build_utf16_position_map(&full_text);
    let style_maps = build_model_style_maps(&items, chars.len(), &utf16_position_map);

    // Element id -> zero-based character index, from the "te"/"ste" items.
    let mut positions = HashMap::new();
    for item in &items {
        if matches!(item.get("ty").and_then(Value::as_str), Some("te" | "ste")) {
            if let (Some(id), Some(pos)) = (
                item.get("id").and_then(Value::as_str),
                item.get("spi").and_then(Value::as_u64),
            ) {
                if let Ok(pos) = usize::try_from(pos) {
                    positions.insert(
                        id.to_string(),
                        utf16_position_to_char_position(&utf16_position_map, pos).saturating_sub(1),
                    );
                }
            }
        }
    }

    // Images come from "ae"/"ase" items; only those with a known position are
    // kept. "ase" items are flagged as suggestions.
    let mut images_by_pos: HashMap<usize, ContentNode> = HashMap::new();
    let mut images = Vec::new();
    for item in &items {
        let ty = item.get("ty").and_then(Value::as_str);
        if !matches!(ty, Some("ae" | "ase")) {
            continue;
        }
        let Some(id) = item.get("id").and_then(Value::as_str) else {
            continue;
        };
        let Some(pos) = positions.get(id).copied() else {
            continue;
        };
        let cid = item
            .pointer("/epm/ee_eo/i_cid")
            .and_then(Value::as_str)
            .map(ToString::to_string);
        let node = ContentNode::Image {
            url: cid.as_ref().and_then(|cid| cid_urls.get(cid).cloned()),
            cid,
            alt: item
                .pointer("/epm/ee_eo/eo_ad")
                .and_then(Value::as_str)
                .unwrap_or_else(|| {
                    if ty == Some("ase") {
                        "suggested image"
                    } else {
                        "image"
                    }
                })
                .to_string(),
            width: json_dimension_to_string(item.pointer("/epm/ee_eo/i_wth")),
            height: json_dimension_to_string(item.pointer("/epm/ee_eo/i_ht")),
            is_suggestion: ty == Some("ase"),
        };
        images_by_pos.insert(pos, node.clone());
        images.push(node);
    }

    // State for the character walk: the paragraph being built plus the
    // currently open table, row, and cell (if any).
    let mut blocks = Vec::new();
    let mut tables = Vec::new();
    let mut paragraph = Vec::new();
    let mut table: Option<TableBlock> = None;
    let mut row: Option<TableRow> = None;
    let mut cell: Option<TableCell> = None;
    // Last table-related control character seen; reset by ordinary content.
    let mut previous_table_control: Option<u32> = None;
    // Set when the newline directly after a cell separator must be swallowed.
    let mut skip_next_table_newline = false;

    for (idx, ch) in chars.iter().copied().enumerate() {
        match ch as u32 {
            // 0x10 opens a table; any pending paragraph is flushed first.
            0x10 => {
                flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
                table = Some(TableBlock::default());
                previous_table_control = Some(0x10);
                skip_next_table_newline = false;
            }
            // 0x11 closes the current table.
            0x11 => {
                flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
                previous_table_control = None;
                skip_next_table_newline = false;
            }
            // 0x12 finishes the current row and starts a new one.
            0x12 => {
                flush_row(&mut row, &mut cell, table.as_mut(), true);
                row = Some(TableRow::default());
                previous_table_control = Some(0x12);
                skip_next_table_newline = false;
            }
            // 0x1c starts a new cell.
            0x1c => {
                // A newline has just opened a cell that is still empty, so this
                // separator must not open another one.
                if cell.as_ref().is_none_or(cell_is_empty) && previous_table_control == Some(0x0a) {
                    previous_table_control = Some(0x1c);
                    continue;
                }
                let had_content = cell.as_ref().is_some_and(|cell| !cell_is_empty(cell));
                flush_cell(&mut row, &mut cell, false);
                if row.is_none() {
                    row = Some(TableRow::default());
                }
                cell = Some(TableCell::default());
                // If the flushed cell had content and a newline follows
                // immediately, that newline is skipped rather than treated as
                // yet another cell boundary.
                if had_content && chars.get(idx + 1).is_some_and(|ch| *ch as u32 == 0x0a) {
                    skip_next_table_newline = true;
                }
                previous_table_control = Some(0x1c);
            }
            // Newline: a cell boundary inside a table, a paragraph end outside.
            0x0a => {
                if table.is_some() {
                    if skip_next_table_newline {
                        skip_next_table_newline = false;
                        previous_table_control = Some(0x0a);
                        continue;
                    }
                    // Inside a table, a bare newline separates cells within the
                    // current row (rows are delimited by 0x12/0x11). See R2.
                    flush_cell(&mut row, &mut cell, false);
                    if row.is_none() {
                        row = Some(TableRow::default());
                    }
                    cell = Some(TableCell::default());
                    previous_table_control = Some(0x0a);
                } else {
                    flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
                }
            }
            // 0x0b is emitted as an unstyled "\n" inside the current container
            // instead of ending it.
            0x0b => {
                append_to_current(
                    &mut paragraph,
                    &mut row,
                    &mut cell,
                    table.is_some(),
                    "\n",
                    TextStyle::default(),
                );
                previous_table_control = None;
                skip_next_table_newline = false;
            }
            // Ordinary character, possibly with an image anchored at this index.
            _ => {
                if let Some(image) = images_by_pos.get(&idx).cloned() {
                    push_to_current(&mut paragraph, &mut row, &mut cell, table.is_some(), image);
                    previous_table_control = None;
                    skip_next_table_newline = false;
                    // A '*' at an image position is dropped (presumably the
                    // image placeholder character); any other character is
                    // still emitted after the image.
                    if ch == '*' {
                        continue;
                    }
                }
                append_to_current(
                    &mut paragraph,
                    &mut row,
                    &mut cell,
                    table.is_some(),
                    &ch.to_string(),
                    style_maps
                        .inline_styles
                        .get(idx)
                        .cloned()
                        .unwrap_or_default(),
                );
                previous_table_control = None;
                skip_next_table_newline = false;
            }
        }
    }

    // Close whatever is still open at the end of the text.
    if table.is_some() {
        flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
    }
    flush_paragraph(&mut paragraph, &mut blocks, Some(chars.len()), &style_maps);

    let mut capture = CapturedDocument {
        text: blocks_to_text(&blocks),
        blocks,
        tables,
        images,
    };
    // Export HTML hints may alter the blocks, so the plain text is rebuilt.
    if let Some(export_html) = export_html {
        apply_export_semantic_hints(&mut capture.blocks, export_html);
        capture.text = blocks_to_text(&capture.blocks);
    }
    capture
}
2642
2643fn collect_model_items(chunks: &[Value]) -> Vec<Value> {
2644    let mut items = Vec::new();
2645    for chunk in chunks {
2646        if let Some(array) = chunk.as_array() {
2647            items.extend(array.iter().cloned());
2648        } else if let Some(array) = chunk.get("chunk").and_then(Value::as_array) {
2649            items.extend(array.iter().cloned());
2650        } else if chunk.get("ty").and_then(Value::as_str).is_some() {
2651            items.push(chunk.clone());
2652        }
2653    }
2654    items
2655}
2656
2657fn flush_paragraph(
2658    paragraph: &mut Vec<ContentNode>,
2659    blocks: &mut Vec<CapturedBlock>,
2660    end_pos: Option<usize>,
2661    style_maps: &ModelStyleMaps,
2662) {
2663    if !content_to_text(paragraph).trim().is_empty()
2664        || paragraph
2665            .iter()
2666            .any(|node| matches!(node, ContentNode::Image { .. }))
2667    {
2668        let meta =
2669            paragraph_meta_for_end_position(style_maps, end_pos, content_to_text(paragraph).trim());
2670        blocks.push(CapturedBlock::Paragraph {
2671            content: std::mem::take(paragraph),
2672            style: meta.style,
2673            list: meta.list,
2674            quote: meta.quote,
2675            horizontal_rule: meta.horizontal_rule,
2676        });
2677    } else {
2678        paragraph.clear();
2679    }
2680}
2681
2682fn paragraph_meta_for_end_position(
2683    style_maps: &ModelStyleMaps,
2684    end_pos: Option<usize>,
2685    text: &str,
2686) -> ParagraphMeta {
2687    let Some(end_pos) = end_pos else {
2688        return ParagraphMeta::default();
2689    };
2690    let paragraph_style = style_maps.paragraph_by_end.get(&end_pos);
2691    let mut meta = ParagraphMeta {
2692        style: paragraph_style.and_then(|style| style.style.clone()),
2693        ..ParagraphMeta::default()
2694    };
2695
2696    if let Some(list) = style_maps.list_by_end.get(&end_pos) {
2697        let mut list = list.clone();
2698        list.ordered = infer_ordered_list(&list, text);
2699        meta.list = Some(list);
2700    } else if paragraph_style.is_some_and(|style| {
2701        style.indent_start > 0.0
2702            && (style.indent_start - style.indent_first_line).abs() < f64::EPSILON
2703    }) {
2704        meta.quote = true;
2705    }
2706
2707    meta.horizontal_rule = (style_maps.horizontal_rules.contains(&end_pos)
2708        || end_pos
2709            .checked_sub(1)
2710            .is_some_and(|pos| style_maps.horizontal_rules.contains(&pos)))
2711        && text.trim().chars().all(|ch| ch == '-');
2712    meta
2713}
2714
/// Decide whether a list paragraph should be treated as ordered.
///
/// Currently always returns `false`; both arguments are ignored.
/// `apply_export_semantic_hints` may later overwrite the `ordered` flag when
/// export HTML is available.
const fn infer_ordered_list(_list: &ListMeta, _text: &str) -> bool {
    false
}
2718
2719fn apply_export_semantic_hints(blocks: &mut [CapturedBlock], export_html: &str) {
2720    let hints = extract_export_semantic_hints(export_html);
2721    let mut cursor = 0usize;
2722    for block in blocks {
2723        let CapturedBlock::Paragraph {
2724            content,
2725            list,
2726            quote,
2727            ..
2728        } = block
2729        else {
2730            continue;
2731        };
2732        let text = normalize_semantic_text(&content_to_text(content));
2733        if text.is_empty() {
2734            continue;
2735        }
2736        let Some((index, hint)) = find_next_semantic_hint(&hints, &text, cursor, list.is_some())
2737        else {
2738            continue;
2739        };
2740        cursor = index + 1;
2741        if let Some(list) = list.as_mut() {
2742            if let Some(ordered) = hint.list_ordered {
2743                list.ordered = ordered;
2744            }
2745        } else {
2746            *quote = hint.quote;
2747        }
2748    }
2749}
2750
2751fn find_next_semantic_hint<'a>(
2752    hints: &'a [ExportSemanticHint],
2753    text: &str,
2754    cursor: usize,
2755    needs_list_hint: bool,
2756) -> Option<(usize, &'a ExportSemanticHint)> {
2757    hints.iter().enumerate().skip(cursor).find(|(_, hint)| {
2758        hint.text == text
2759            && if needs_list_hint {
2760                hint.list_ordered.is_some()
2761            } else {
2762                hint.list_ordered.is_none()
2763            }
2764    })
2765}
2766
2767fn extract_export_semantic_hints(export_html: &str) -> Vec<ExportSemanticHint> {
2768    let preprocessed = preprocess_google_docs_export_html(export_html).html;
2769    let document = Html::parse_document(&preprocessed);
2770    let selector =
2771        Selector::parse("body h1,body h2,body h3,body h4,body h5,body h6,body p,body li")
2772            .expect("valid semantic hint selector");
2773    document
2774        .select(&selector)
2775        .filter_map(|element| {
2776            let tag = element.value().name();
2777            let text = export_element_semantic_text(&element);
2778            if text.is_empty() {
2779                return None;
2780            }
2781            let list_ordered = if tag == "li" {
2782                nearest_list_is_ordered(&element)
2783            } else {
2784                None
2785            };
2786            Some(ExportSemanticHint {
2787                text,
2788                list_ordered,
2789                quote: tag != "li" && has_ancestor_tag(&element, "blockquote"),
2790            })
2791        })
2792        .collect()
2793}
2794
2795fn export_element_semantic_text(element: &ElementRef<'_>) -> String {
2796    let raw_text = if element.value().name() == "li" {
2797        list_item_own_text(element)
2798    } else {
2799        element.text().collect()
2800    };
2801    normalize_semantic_text(&raw_text)
2802}
2803
2804fn list_item_own_text(element: &ElementRef<'_>) -> String {
2805    let mut text = String::new();
2806    let mut stack: Vec<_> = element.children().collect();
2807    stack.reverse();
2808
2809    while let Some(node) = stack.pop() {
2810        match node.value() {
2811            Node::Text(value) => text.push_str(value),
2812            Node::Element(child) if matches!(child.name(), "ol" | "ul") => {}
2813            Node::Element(_) => {
2814                let mut children: Vec<_> = node.children().collect();
2815                children.reverse();
2816                stack.extend(children);
2817            }
2818            _ => {}
2819        }
2820    }
2821
2822    text
2823}
2824
2825fn nearest_list_is_ordered(element: &ElementRef<'_>) -> Option<bool> {
2826    element
2827        .ancestors()
2828        .filter_map(ElementRef::wrap)
2829        .find_map(|ancestor| match ancestor.value().name() {
2830            "ol" => Some(true),
2831            "ul" => Some(false),
2832            _ => None,
2833        })
2834}
2835
2836fn has_ancestor_tag(element: &ElementRef<'_>, tag: &str) -> bool {
2837    element
2838        .ancestors()
2839        .filter_map(ElementRef::wrap)
2840        .any(|ancestor| ancestor.value().name() == tag)
2841}
2842
/// Normalize text for comparison: non-breaking spaces become regular spaces,
/// runs of whitespace collapse to a single space, and leading/trailing
/// whitespace is removed.
fn normalize_semantic_text(text: &str) -> String {
    let spaced = text.replace('\u{a0}', " ");
    let mut normalized = String::with_capacity(spaced.len());
    for word in spaced.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
2849
2850fn cell_is_empty(cell: &TableCell) -> bool {
2851    cell.content.iter().all(|node| match node {
2852        ContentNode::Text { text, .. } => text.trim().is_empty(),
2853        ContentNode::Image { .. } => false,
2854    })
2855}
2856
2857fn row_is_empty(row: &TableRow) -> bool {
2858    row.cells.is_empty() || row.cells.iter().all(cell_is_empty)
2859}
2860
2861fn flush_cell(row: &mut Option<TableRow>, cell: &mut Option<TableCell>, drop_empty: bool) {
2862    if let (Some(row), Some(cell)) = (row.as_mut(), cell.take()) {
2863        if drop_empty && cell_is_empty(&cell) {
2864            return;
2865        }
2866        row.cells.push(cell);
2867    }
2868}
2869
2870fn flush_row(
2871    row: &mut Option<TableRow>,
2872    cell: &mut Option<TableCell>,
2873    table: Option<&mut TableBlock>,
2874    drop_empty_trailing_cell: bool,
2875) {
2876    flush_cell(row, cell, drop_empty_trailing_cell);
2877    if let (Some(table), Some(row)) = (table, row.take()) {
2878        table.rows.push(row);
2879    }
2880}
2881
2882fn flush_table(
2883    table: &mut Option<TableBlock>,
2884    row: &mut Option<TableRow>,
2885    cell: &mut Option<TableCell>,
2886    tables: &mut Vec<TableBlock>,
2887    blocks: &mut Vec<CapturedBlock>,
2888) {
2889    flush_row(row, cell, table.as_mut(), true);
2890    if let Some(mut table) = table.take() {
2891        // Drop trailing empty rows that can be introduced by '\n' immediately
2892        // before the 0x11 table-close marker. See R2.
2893        while table.rows.last().is_some_and(row_is_empty) {
2894            table.rows.pop();
2895        }
2896        tables.push(table.clone());
2897        blocks.push(CapturedBlock::Table(table));
2898    }
2899}
2900
2901fn push_to_current(
2902    paragraph: &mut Vec<ContentNode>,
2903    row: &mut Option<TableRow>,
2904    cell: &mut Option<TableCell>,
2905    in_table: bool,
2906    node: ContentNode,
2907) {
2908    if in_table {
2909        if row.is_none() {
2910            *row = Some(TableRow::default());
2911        }
2912        if cell.is_none() {
2913            *cell = Some(TableCell::default());
2914        }
2915        if let Some(cell) = cell.as_mut() {
2916            cell.content.push(node);
2917        }
2918    } else {
2919        paragraph.push(node);
2920    }
2921}
2922
2923fn append_to_current(
2924    paragraph: &mut Vec<ContentNode>,
2925    row: &mut Option<TableRow>,
2926    cell: &mut Option<TableCell>,
2927    in_table: bool,
2928    text: &str,
2929    style: TextStyle,
2930) {
2931    if in_table {
2932        if row.is_none() {
2933            *row = Some(TableRow::default());
2934        }
2935        if cell.is_none() {
2936            *cell = Some(TableCell::default());
2937        }
2938        if let Some(cell) = cell.as_mut() {
2939            append_styled_text(&mut cell.content, text, style);
2940        }
2941    } else {
2942        append_styled_text(paragraph, text, style);
2943    }
2944}
2945
/// Append unstyled text to `content`.
///
/// Thin wrapper over `append_styled_text` that passes the default
/// `TextStyle`.
fn append_text(content: &mut Vec<ContentNode>, text: &str) {
    append_styled_text(content, text, TextStyle::default());
}
2949
2950fn append_styled_text(content: &mut Vec<ContentNode>, text: &str, style: TextStyle) {
2951    if text.is_empty() {
2952        return;
2953    }
2954    if let Some(ContentNode::Text {
2955        text: last,
2956        bold,
2957        italic,
2958        strike,
2959        link,
2960    }) = content.last_mut()
2961    {
2962        let last_style = TextStyle {
2963            bold: *bold,
2964            italic: *italic,
2965            strike: *strike,
2966            link: link.clone(),
2967        };
2968        if last_style == style {
2969            last.push_str(text);
2970            return;
2971        }
2972    }
2973    content.push(ContentNode::Text {
2974        text: text.to_string(),
2975        bold: style.bold,
2976        italic: style.italic,
2977        strike: style.strike,
2978        link: style.link,
2979    });
2980}
2981
2982/// Render a parsed Google Docs capture as Markdown, HTML, or text.
2983#[must_use]
2984pub fn render_captured_document(capture: &CapturedDocument, format: &str) -> String {
2985    match format.to_lowercase().as_str() {
2986        "html" => render_blocks_html(&capture.blocks),
2987        "txt" | "text" => blocks_to_text(&capture.blocks),
2988        _ => render_blocks_markdown(&capture.blocks),
2989    }
2990}
2991
/// One rendered block plus enough context for `render_blocks_markdown` to
/// choose a Markdown-safe separator.
struct RenderedBlock {
    /// Markdown text of the block, without any trailing separator.
    markdown: String,
    /// Id of the list the block belongs to; `None` for non-list paragraphs
    /// and for tables.
    list_id: Option<String>,
    /// Whether the block is a quoted paragraph; always `false` for tables.
    quote: bool,
}
2999
/// Render captured blocks as a Markdown document.
///
/// Works in two passes: first every block is rendered on its own (paragraphs
/// whose rendered text is empty after trimming are skipped), then the rendered
/// blocks are joined with a separator chosen per adjacent pair. A non-empty
/// result always ends with a newline.
fn render_blocks_markdown(blocks: &[CapturedBlock]) -> String {
    // Track an ordered-list counter per (list.id, level) so ordered items are
    // numbered sequentially 1., 2., 3., ... instead of all being "1.". See R3.
    // When we re-enter a shallower list level, deeper counters reset so a new
    // parent restarts its children at 1.
    let mut counters: HashMap<(String, usize), usize> = HashMap::new();
    let mut rendered: Vec<RenderedBlock> = Vec::new();

    for block in blocks {
        match block {
            CapturedBlock::Paragraph {
                content,
                style,
                list,
                quote,
                horizontal_rule,
            } => {
                let text = render_content_markdown(content).trim().to_string();
                if text.is_empty() {
                    continue;
                }
                // `None` for non-list paragraphs and for unordered list items;
                // those never touch the counters.
                let ordered_index = list.as_ref().and_then(|list_meta| {
                    if !list_meta.ordered {
                        return None;
                    }
                    // Reset counters for deeper levels when we move up to a
                    // shallower level — otherwise a new parent item would see
                    // its previous children's final count.
                    let key = (list_meta.id.clone(), list_meta.level);
                    counters.retain(|(id, level), _| {
                        !(id == &list_meta.id && *level > list_meta.level)
                    });
                    // Counters start at 0 and are incremented before use, so
                    // the first item of a level is numbered 1.
                    let next = counters.entry(key).or_insert(0);
                    *next += 1;
                    Some(*next)
                });
                let markdown = render_paragraph_markdown(
                    &text,
                    style.as_deref(),
                    list.as_ref(),
                    *quote,
                    *horizontal_rule,
                    ordered_index,
                );
                rendered.push(RenderedBlock {
                    markdown,
                    list_id: list.as_ref().map(|l| l.id.clone()),
                    quote: *quote,
                });
            }
            CapturedBlock::Table(table) => {
                rendered.push(RenderedBlock {
                    markdown: render_table_markdown(table),
                    list_id: None,
                    quote: false,
                });
            }
        }
    }

    // Choose separator per adjacent pair: consecutive items from the same
    // Google Docs list use a single newline, including nested levels; adjacent
    // blockquote paragraphs keep a quoted blank line between them.
    // NOTE(review): the list check below only tests that both blocks are list
    // items; it does not compare their list ids, so items from two different
    // adjacent lists are also joined with a single newline — confirm intent.
    let mut out = String::new();
    for (idx, block) in rendered.iter().enumerate() {
        if idx == 0 {
            out.push_str(&block.markdown);
            continue;
        }
        let prev = &rendered[idx - 1];
        if block.list_id.is_some() && prev.list_id.is_some() {
            out.push('\n');
        } else if block.quote && prev.quote {
            out.push_str("\n>\n");
        } else {
            out.push_str("\n\n");
        }
        out.push_str(&block.markdown);
    }
    // Terminate non-empty output with exactly one trailing newline.
    if !out.is_empty() && !out.ends_with('\n') {
        out.push('\n');
    }
    out
}
3084
3085fn render_paragraph_markdown(
3086    text: &str,
3087    style: Option<&str>,
3088    list: Option<&ListMeta>,
3089    quote: bool,
3090    horizontal_rule: bool,
3091    ordered_index: Option<usize>,
3092) -> String {
3093    if horizontal_rule {
3094        return "---".to_string();
3095    }
3096    match style {
3097        Some("TITLE") => format!("# {text}"),
3098        Some("SUBTITLE") => format!("## {text}"),
3099        Some(style) if style.starts_with("HEADING_") => {
3100            let level = style
3101                .trim_start_matches("HEADING_")
3102                .parse::<usize>()
3103                .unwrap_or(1);
3104            format!("{} {text}", "#".repeat(level.clamp(1, 6)))
3105        }
3106        _ => list.map_or_else(
3107            || {
3108                if quote {
3109                    text.lines()
3110                        .map(|line| {
3111                            if line.is_empty() {
3112                                ">".to_string()
3113                            } else {
3114                                format!("> {line}")
3115                            }
3116                        })
3117                        .collect::<Vec<_>>()
3118                        .join("\n")
3119                } else {
3120                    text.to_string()
3121                }
3122            },
3123            |list| {
3124                let indent = "    ".repeat(list.level);
3125                let marker = if list.ordered {
3126                    format!("{}.", ordered_index.unwrap_or(1))
3127                } else {
3128                    "-".to_string()
3129                };
3130                format!("{indent}{marker} {text}")
3131            },
3132        ),
3133    }
3134}
3135
3136fn render_table_markdown(table: &TableBlock) -> String {
3137    if table.rows.is_empty() {
3138        return String::new();
3139    }
3140    let width = table
3141        .rows
3142        .iter()
3143        .map(|row| row.cells.len())
3144        .max()
3145        .unwrap_or(1);
3146    let rows = table
3147        .rows
3148        .iter()
3149        .map(|row| {
3150            (0..width)
3151                .map(|idx| {
3152                    row.cells.get(idx).map_or_else(String::new, |cell| {
3153                        escape_markdown_table_cell(&render_content_markdown(&cell.content))
3154                    })
3155                })
3156                .collect::<Vec<_>>()
3157        })
3158        .collect::<Vec<_>>();
3159    let separator = vec!["---".to_string(); width];
3160    std::iter::once(&rows[0])
3161        .chain(std::iter::once(&separator))
3162        .chain(rows.iter().skip(1))
3163        .map(|row| format!("| {} |", row.join(" | ")))
3164        .collect::<Vec<_>>()
3165        .join("\n")
3166}
3167
3168fn render_content_markdown(content: &[ContentNode]) -> String {
3169    let mut rendered = String::new();
3170    let mut idx = 0usize;
3171    while idx < content.len() {
3172        match &content[idx] {
3173            ContentNode::Text {
3174                text,
3175                bold,
3176                italic,
3177                strike,
3178                link,
3179            } => {
3180                let link_target = link.as_deref();
3181                let mut runs = vec![(text.as_str(), *bold, *italic, *strike)];
3182                idx += 1;
3183                while let Some(ContentNode::Text {
3184                    text,
3185                    bold,
3186                    italic,
3187                    strike,
3188                    link: next_link,
3189                }) = content.get(idx)
3190                {
3191                    if next_link.as_deref() != link_target {
3192                        break;
3193                    }
3194                    runs.push((text.as_str(), *bold, *italic, *strike));
3195                    idx += 1;
3196                }
3197                let label = render_text_runs_markdown(&runs);
3198                if let Some(link_target) = link_target {
3199                    let _ = write!(rendered, "[{label}]({link_target})");
3200                } else {
3201                    rendered.push_str(&label);
3202                }
3203            }
3204            ContentNode::Image {
3205                url: Some(url),
3206                alt,
3207                ..
3208            } => {
3209                let _ = write!(rendered, "![{alt}]({url})");
3210                idx += 1;
3211            }
3212            ContentNode::Image { .. } => idx += 1,
3213        }
3214    }
3215    rendered
3216}
3217
/// Which inline emphasis markers are currently open while rendering Markdown
/// text runs. The default value has every marker closed.
#[derive(Clone, Copy, Default)]
struct MarkdownMarkerState {
    /// `**` is open.
    bold: bool,
    /// `*` is open.
    italic: bool,
    /// `~~` is open.
    strike: bool,
}
3224
3225fn render_text_runs_markdown(runs: &[(&str, bool, bool, bool)]) -> String {
3226    let inactive = MarkdownMarkerState::default();
3227    let mut active = inactive;
3228    let mut output = String::new();
3229    for (text, bold, italic, strike) in runs {
3230        let next = MarkdownMarkerState {
3231            bold: *bold,
3232            italic: *italic,
3233            strike: *strike,
3234        };
3235        let mut start = 0usize;
3236        for (offset, ch) in text.char_indices() {
3237            if ch != '\n' {
3238                continue;
3239            }
3240            if offset > start {
3241                output.push_str(&markdown_marker_transition(active, next));
3242                output.push_str(&text[start..offset]);
3243                active = next;
3244            }
3245            output.push_str(&markdown_marker_transition(active, inactive));
3246            output.push('\n');
3247            active = inactive;
3248            start = offset + ch.len_utf8();
3249        }
3250        if start < text.len() {
3251            output.push_str(&markdown_marker_transition(active, next));
3252            output.push_str(&text[start..]);
3253            active = next;
3254        }
3255    }
3256    output.push_str(&markdown_marker_transition(active, inactive));
3257    output
3258}
3259
3260fn markdown_marker_transition(active: MarkdownMarkerState, next: MarkdownMarkerState) -> String {
3261    let mut markers = String::new();
3262    if active.strike && !next.strike {
3263        markers.push_str("~~");
3264    }
3265    if active.italic && !next.italic {
3266        markers.push('*');
3267    }
3268    if active.bold && !next.bold {
3269        markers.push_str("**");
3270    }
3271    if !active.bold && next.bold {
3272        markers.push_str("**");
3273    }
3274    if !active.italic && next.italic {
3275        markers.push('*');
3276    }
3277    if !active.strike && next.strike {
3278        markers.push_str("~~");
3279    }
3280    markers
3281}
3282
3283fn render_blocks_html(blocks: &[CapturedBlock]) -> String {
3284    format!(
3285        "<!doctype html><html><body>{}</body></html>",
3286        blocks
3287            .iter()
3288            .map(|block| match block {
3289                CapturedBlock::Paragraph {
3290                    content,
3291                    style,
3292                    list,
3293                    quote,
3294                    horizontal_rule,
3295                } => {
3296                    if *horizontal_rule {
3297                        "<hr>".to_string()
3298                    } else if let Some(list) = list {
3299                        let tag = if list.ordered { "ol" } else { "ul" };
3300                        format!("<{tag}><li>{}</li></{tag}>", render_content_html(content))
3301                    } else if *quote {
3302                        format!("<blockquote>{}</blockquote>", render_content_html(content))
3303                    } else {
3304                        let tag = paragraph_tag(style.as_deref());
3305                        format!("<{tag}>{}</{tag}>", render_content_html(content))
3306                    }
3307                }
3308                CapturedBlock::Table(table) => render_table_html(table),
3309            })
3310            .collect::<String>()
3311    )
3312}
3313
3314fn render_table_html(table: &TableBlock) -> String {
3315    let mut html = String::from("<table>");
3316    for row in &table.rows {
3317        html.push_str("<tr>");
3318        for cell in &row.cells {
3319            html.push_str("<td>");
3320            html.push_str(&render_content_html(&cell.content));
3321            html.push_str("</td>");
3322        }
3323        html.push_str("</tr>");
3324    }
3325    html.push_str("</table>");
3326    html
3327}
3328
3329fn render_content_html(content: &[ContentNode]) -> String {
3330    content
3331        .iter()
3332        .map(|node| match node {
3333            ContentNode::Text {
3334                text,
3335                bold,
3336                italic,
3337                strike,
3338                link,
3339            } => render_marked_html(text, *bold, *italic, *strike, link.as_deref()),
3340            ContentNode::Image {
3341                url: Some(url),
3342                alt,
3343                width,
3344                height,
3345                ..
3346            } => render_image_html(url, alt, width.as_deref(), height.as_deref()),
3347            ContentNode::Image { .. } => String::new(),
3348        })
3349        .collect()
3350}
3351
3352fn render_image_html(url: &str, alt: &str, width: Option<&str>, height: Option<&str>) -> String {
3353    let mut html = format!(
3354        "<img src=\"{}\" alt=\"{}\"",
3355        escape_html(url),
3356        escape_html(alt)
3357    );
3358    if let Some(width) = width.filter(|value| !value.is_empty()) {
3359        let _ = write!(html, " width=\"{}\"", escape_html(width));
3360    }
3361    if let Some(height) = height.filter(|value| !value.is_empty()) {
3362        let _ = write!(html, " height=\"{}\"", escape_html(height));
3363    }
3364    html.push('>');
3365    html
3366}
3367
3368fn render_marked_html(
3369    text: &str,
3370    bold: bool,
3371    italic: bool,
3372    strike: bool,
3373    link: Option<&str>,
3374) -> String {
3375    text.split('\n')
3376        .map(|segment| render_marked_html_segment(segment, bold, italic, strike, link))
3377        .collect::<Vec<_>>()
3378        .join("<br>")
3379}
3380
3381fn render_marked_html_segment(
3382    text: &str,
3383    bold: bool,
3384    italic: bool,
3385    strike: bool,
3386    link: Option<&str>,
3387) -> String {
3388    if text.is_empty() {
3389        return String::new();
3390    }
3391    let mut output = escape_html(text);
3392    if bold {
3393        output = format!("<strong>{output}</strong>");
3394    }
3395    if italic {
3396        output = format!("<em>{output}</em>");
3397    }
3398    if strike {
3399        output = format!("<s>{output}</s>");
3400    }
3401    if let Some(link) = link {
3402        output = format!("<a href=\"{}\">{output}</a>", escape_html(link));
3403    }
3404    output
3405}
3406
/// Map a Google Docs named paragraph style to an HTML element name.
///
/// `TITLE` shares `h1` with `HEADING_1` and `SUBTITLE` shares `h2` with
/// `HEADING_2`; unknown or missing styles map to `p`.
fn paragraph_tag(style: Option<&str>) -> &'static str {
    const STYLE_TAGS: [(&str, &str); 8] = [
        ("TITLE", "h1"),
        ("HEADING_1", "h1"),
        ("SUBTITLE", "h2"),
        ("HEADING_2", "h2"),
        ("HEADING_3", "h3"),
        ("HEADING_4", "h4"),
        ("HEADING_5", "h5"),
        ("HEADING_6", "h6"),
    ];

    let Some(name) = style else {
        return "p";
    };
    STYLE_TAGS
        .iter()
        .find(|(candidate, _)| *candidate == name)
        .map_or("p", |&(_, tag)| tag)
}
3418
3419fn blocks_to_text(blocks: &[CapturedBlock]) -> String {
3420    blocks
3421        .iter()
3422        .map(|block| match block {
3423            CapturedBlock::Paragraph { content, .. } => content_to_text(content),
3424            CapturedBlock::Table(table) => table
3425                .rows
3426                .iter()
3427                .map(|row| {
3428                    row.cells
3429                        .iter()
3430                        .map(|cell| content_to_text(&cell.content))
3431                        .collect::<Vec<_>>()
3432                        .join("\t")
3433                })
3434                .collect::<Vec<_>>()
3435                .join("\n"),
3436        })
3437        .filter(|text| !text.is_empty())
3438        .collect::<Vec<_>>()
3439        .join("\n")
3440}
3441
3442fn content_to_text(content: &[ContentNode]) -> String {
3443    content
3444        .iter()
3445        .map(|node| match node {
3446            ContentNode::Text { text, .. } => text.clone(),
3447            ContentNode::Image {
3448                url: Some(_), alt, ..
3449            } => format!("[{alt}]"),
3450            ContentNode::Image { .. } => String::new(),
3451        })
3452        .collect()
3453}
3454
/// Escape the five HTML-significant characters (`&`, `<`, `>`, `"`, `'`) so
/// the value is safe in both element content and quoted attribute values.
fn escape_html(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
3463
/// Make cell text safe inside a Markdown pipe table: `|` is backslash-escaped
/// and newlines become `<br>` so the cell stays on one table row.
fn escape_markdown_table_cell(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '|' => escaped.push_str("\\|"),
            '\n' => escaped.push_str("<br>"),
            other => escaped.push(other),
        }
    }
    escaped
}
3467
/// Extract a Bearer token from an Authorization header value.
///
/// Surrounding whitespace is ignored. The `Bearer` scheme name is matched
/// ASCII case-insensitively (HTTP authentication schemes are
/// case-insensitive), so `Bearer`, `bearer`, and `BEARER` are all accepted.
/// The scheme must be followed by a space, and the token itself is trimmed.
///
/// Returns `None` if the header is not a valid Bearer token, including when
/// the token part is empty.
#[must_use]
pub fn extract_bearer_token(auth_header: &str) -> Option<&str> {
    const SCHEME: &str = "Bearer ";

    let trimmed = auth_header.trim();
    // `get` returns `None` for headers shorter than the scheme or when the
    // split point is not a character boundary, so this cannot panic.
    let scheme = trimmed.get(..SCHEME.len())?;
    if !scheme.eq_ignore_ascii_case(SCHEME) {
        return None;
    }
    let token = trimmed.get(SCHEME.len()..)?.trim();
    (!token.is_empty()).then_some(token)
}
3480
/// An image extracted from base64 data URIs in HTML.
///
/// The same type is also used for images downloaded from remote URLs by
/// `localize_rendered_remote_images_for_archive`.
#[derive(Debug, Clone)]
pub struct ExtractedImage {
    /// Local filename (e.g., "image-01.png")
    pub filename: String,
    /// Raw image bytes
    pub data: Vec<u8>,
    /// MIME type (e.g., "image/png")
    pub mime_type: String,
}
3491
/// Result of fetching a Google Doc as an archive.
///
/// Bundles the rendered document in two formats together with the image
/// payloads that the rendered content refers to by local path.
#[derive(Debug, Clone)]
pub struct GDocsArchiveResult {
    /// HTML content with local image paths
    pub html: String,
    /// Markdown content with local image paths
    pub markdown: String,
    /// Extracted images
    pub images: Vec<ExtractedImage>,
    /// Document ID
    pub document_id: String,
    /// Export URL used
    pub export_url: String,
}
3506
3507/// Build a self-contained archive result from browser-model rendered output.
3508///
3509/// `DOCS_modelChunk` image nodes point at `docs-images-rt` URLs. Archive mode
3510/// downloads those URLs into `images/` and rewrites markdown/html references to
3511/// local paths so Rust browser capture matches the JavaScript archive path.
3512///
3513/// # Errors
3514///
3515/// Returns an error if the HTTP client cannot be created or an image response
3516/// body cannot be read. Individual failed image downloads are logged and left
3517/// out of the archive, matching the JS behavior.
3518pub async fn localize_rendered_remote_images_for_archive(
3519    rendered: &GDocsRenderedResult,
3520) -> crate::Result<GDocsArchiveResult> {
3521    let client = reqwest::Client::builder().build().map_err(|error| {
3522        WebCaptureError::FetchError(format!("Failed to create image download client: {error}"))
3523    })?;
3524    let mut seen = HashMap::new();
3525    let mut images = Vec::new();
3526    let mut next_index = 1usize;
3527
3528    for image in &rendered.remote_images {
3529        if seen.contains_key(&image.url) {
3530            continue;
3531        }
3532        let filename = remote_image_filename(&image.url, next_index);
3533        next_index += 1;
3534        seen.insert(image.url.clone(), filename.clone());
3535
3536        match client
3537            .get(&image.url)
3538            .header("User-Agent", GDOCS_USER_AGENT)
3539            .header("Accept", "image/*,*/*;q=0.8")
3540            .send()
3541            .await
3542        {
3543            Ok(response) if response.status().is_success() => {
3544                let mime_type = response
3545                    .headers()
3546                    .get(reqwest::header::CONTENT_TYPE)
3547                    .and_then(|value| value.to_str().ok())
3548                    .map_or_else(|| mime_type_for_filename(&filename), ToString::to_string);
3549                let data = response.bytes().await.map_err(|error| {
3550                    WebCaptureError::FetchError(format!(
3551                        "Failed to read Google Docs image {}: {error}",
3552                        image.url
3553                    ))
3554                })?;
3555                debug!(
3556                    url = %image.url,
3557                    filename = %filename,
3558                    bytes = data.len(),
3559                    mime_type = %mime_type,
3560                    "downloaded Google Docs browser-model archive image"
3561                );
3562                images.push(ExtractedImage {
3563                    filename,
3564                    data: data.to_vec(),
3565                    mime_type,
3566                });
3567            }
3568            Ok(response) => {
3569                warn!(
3570                    url = %image.url,
3571                    status = response.status().as_u16(),
3572                    "failed to download Google Docs browser-model archive image"
3573                );
3574            }
3575            Err(error) => {
3576                warn!(
3577                    url = %image.url,
3578                    error = %error,
3579                    "failed to download Google Docs browser-model archive image"
3580                );
3581            }
3582        }
3583    }
3584
3585    let mut markdown = rendered.markdown.clone();
3586    let mut html = rendered.html.clone();
3587    for (url, filename) in seen {
3588        let local_path = format!("images/{filename}");
3589        markdown = markdown.replace(&url, &local_path);
3590        html = html.replace(&url, &local_path);
3591    }
3592
3593    Ok(GDocsArchiveResult {
3594        html,
3595        markdown,
3596        images,
3597        document_id: rendered.document_id.clone(),
3598        export_url: rendered.export_url.clone(),
3599    })
3600}
3601
3602fn remote_image_filename(url: &str, index: usize) -> String {
3603    let ext = crate::localize_images::get_extension_from_url(url);
3604    format!("image-{index:02}{ext}")
3605}
3606
/// Guess an image MIME type from the extension of `filename`.
///
/// The extension is the text after the last `.` (or the whole name when it
/// contains no dot), compared case-insensitively. Anything that is not a known
/// JPEG, GIF, WebP or SVG extension is reported as `image/png`.
fn mime_type_for_filename(filename: &str) -> String {
    const KNOWN_TYPES: [(&str, &str); 5] = [
        ("jpg", "image/jpeg"),
        ("jpeg", "image/jpeg"),
        ("gif", "image/gif"),
        ("webp", "image/webp"),
        ("svg", "image/svg+xml"),
    ];

    let extension = filename
        .rsplit_once('.')
        .map_or(filename, |(_, tail)| tail)
        .to_lowercase();

    KNOWN_TYPES
        .iter()
        .find(|&&(known, _)| known == extension.as_str())
        .map_or("image/png", |&(_, mime)| mime)
        .to_string()
}
3623
3624fn base64_image_pattern() -> &'static Regex {
3625    static PATTERN: OnceLock<Regex> = OnceLock::new();
3626    PATTERN.get_or_init(|| {
3627        Regex::new(
3628            r#"(<img\s[^>]*src=")data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([^"]+)(")"#,
3629        )
3630        .unwrap()
3631    })
3632}
3633
3634/// Extract base64 data URI images from HTML content.
3635///
3636/// Google Docs HTML exports embed images as base64 data URIs.
3637/// This function extracts them and replaces with local file paths.
3638///
3639/// # Arguments
3640///
3641/// * `html` - HTML content with embedded base64 images
3642///
3643/// # Returns
3644///
3645/// Tuple of (updated HTML with local paths, extracted images)
3646#[must_use]
3647pub fn extract_base64_images(html: &str) -> (String, Vec<ExtractedImage>) {
3648    let mut images = Vec::new();
3649    let mut idx = 1u32;
3650
3651    let updated_html = base64_image_pattern()
3652        .replace_all(html, |caps: &regex::Captures<'_>| {
3653            let prefix = &caps[1];
3654            let mime_ext = &caps[2];
3655            let base64_data = &caps[3];
3656            let suffix = &caps[4];
3657
3658            let ext = match mime_ext {
3659                "jpeg" => "jpg",
3660                "svg+xml" => "svg",
3661                other => other,
3662            };
3663
3664            let filename = format!("image-{idx:02}.{ext}");
3665            let mime_type = format!("image/{mime_ext}");
3666
3667            if let Ok(data) = base64::engine::general_purpose::STANDARD.decode(base64_data) {
3668                debug!("Extracted image: {} ({} bytes)", filename, data.len());
3669                images.push(ExtractedImage {
3670                    filename: filename.clone(),
3671                    data,
3672                    mime_type,
3673                });
3674            }
3675
3676            idx += 1;
3677            format!("{prefix}images/{filename}{suffix}")
3678        })
3679        .into_owned();
3680
3681    (updated_html, images)
3682}
3683
3684/// Fetch a Google Docs document as a ZIP archive.
3685///
3686/// Fetches the document as HTML, extracts embedded base64 images,
3687/// converts to Markdown, and returns all components ready for archiving.
3688///
3689/// The archive contains:
3690/// - `document.md` — Markdown version
3691/// - `document.html` — HTML version with local image paths
3692/// - `images/` — extracted images
3693///
3694/// # Arguments
3695///
3696/// * `url` - Google Docs URL
3697/// * `api_token` - Optional API token for private documents
3698///
3699/// # Errors
3700///
3701/// Returns an error if the fetch or conversion fails.
3702pub async fn fetch_google_doc_as_archive(
3703    url: &str,
3704    api_token: Option<&str>,
3705) -> crate::Result<GDocsArchiveResult> {
3706    let result = fetch_google_doc(url, "html", api_token).await?;
3707
3708    let preprocess = preprocess_google_docs_export_html(&result.content);
3709    debug!(
3710        document_id = %result.document_id,
3711        hoisted = preprocess.hoisted,
3712        unwrapped_links = preprocess.unwrapped_links,
3713        "google-docs-export pre-processor rewrote archive markup"
3714    );
3715
3716    let (local_html, images) = extract_base64_images(&preprocess.html);
3717
3718    let markdown = normalize_google_docs_export_markdown(
3719        &crate::markdown::convert_html_to_markdown(&local_html, None)?,
3720    );
3721
3722    debug!(
3723        "Archive prepared: {} images extracted, {} bytes HTML, {} bytes Markdown",
3724        images.len(),
3725        local_html.len(),
3726        markdown.len()
3727    );
3728
3729    Ok(GDocsArchiveResult {
3730        html: local_html,
3731        markdown,
3732        images,
3733        document_id: result.document_id,
3734        export_url: result.export_url,
3735    })
3736}
3737
3738/// Create a ZIP archive from a `GDocsArchiveResult`.
3739///
3740/// # Arguments
3741///
3742/// * `archive` - The archive result to bundle
3743/// * `pretty_html` - Whether to pretty-print the HTML output
3744///
3745/// # Errors
3746///
3747/// Returns an error if ZIP creation fails.
3748pub fn create_archive_zip(
3749    archive: &GDocsArchiveResult,
3750    pretty_html: bool,
3751) -> crate::Result<Vec<u8>> {
3752    let mut buf = std::io::Cursor::new(Vec::new());
3753
3754    {
3755        let mut zip = zip::ZipWriter::new(&mut buf);
3756        let options = zip::write::SimpleFileOptions::default()
3757            .compression_method(zip::CompressionMethod::Deflated);
3758
3759        zip.start_file("document.md", options)
3760            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3761        zip.write_all(archive.markdown.as_bytes())?;
3762
3763        let html_output = if pretty_html {
3764            crate::html::pretty_print_html(&archive.html)
3765        } else {
3766            archive.html.clone()
3767        };
3768        zip.start_file("document.html", options)
3769            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3770        zip.write_all(html_output.as_bytes())?;
3771
3772        for img in &archive.images {
3773            zip.start_file(format!("images/{}", img.filename), options)
3774                .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3775            zip.write_all(&img.data)?;
3776        }
3777
3778        zip.finish()
3779            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
3780    }
3781
3782    Ok(buf.into_inner())
3783}
3784
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Two models with one chunk each but different text must report equal
    /// `chunks` and different `payload_bytes` in their fingerprints.
    #[test]
    fn browser_model_fingerprint_includes_payload_size() {
        let short_model = browser_model_data_from_value(&json!({
            "chunks": [{ "chunk": [{ "ty": "is", "s": "first" }] }],
            "cidUrlMap": {}
        }));
        let long_model = browser_model_data_from_value(&json!({
            "chunks": [{ "chunk": [{ "ty": "is", "s": "first and later text" }] }],
            "cidUrlMap": {}
        }));

        let short = short_model.fingerprint();
        let long = long_model.fingerprint();

        assert_eq!(short.chunks, long.chunks);
        assert_ne!(short.payload_bytes, long.payload_bytes);
    }

    /// Feeds a fixed sequence of observations to `BrowserModelQuiescence` and
    /// checks each return value. Only the last observation (at +2300 ms) is
    /// expected to report quiescence, with a duration of 1550 ms, i.e. the time
    /// elapsed since the +750 ms observation.
    #[test]
    fn browser_model_quiescence_resets_when_chunks_change() {
        let start = Instant::now();
        let stability_window = Duration::from_millis(1500);
        let one_chunk = BrowserModelFingerprint {
            chunks: 1,
            payload_bytes: 100,
        };
        let two_chunks = BrowserModelFingerprint {
            chunks: 2,
            payload_bytes: 200,
        };
        let mut quiescence = BrowserModelQuiescence::default();

        // (fingerprint, milliseconds after `start`, expected result); the order
        // matters because `quiescence` carries state between observations.
        let observations = [
            (one_chunk, 0, None),
            (one_chunk, 250, None),
            (two_chunks, 500, None),
            (two_chunks, 750, None),
            (two_chunks, 2300, Some(Duration::from_millis(1550))),
        ];

        for (fingerprint, offset_ms, expected) in observations {
            let observed_at = start + Duration::from_millis(offset_ms);
            assert_eq!(
                quiescence.observe(fingerprint, observed_at, stability_window),
                expected,
                "observation at +{offset_ms} ms"
            );
        }
    }
}
web_capture/gdocs.rs

web_capture/
gdocs.rs