web_capture/
gdocs.rs

1//! Google Docs capture module.
2//!
3//! Supports API-based capture of Google Docs documents via the export URL pattern:
4//! `https://docs.google.com/document/d/{DOCUMENT_ID}/export?format={FORMAT}`
5//!
6//! # Supported Export Formats
7//!
8//! - `html` — HTML document (images as base64 data URIs)
9//! - `txt` — Plain text
10//! - `md` — Markdown (native Google Docs export)
11//! - `pdf` — PDF document
12//! - `docx` — Microsoft Word document
13//! - `epub` — EPUB ebook format
14//!
15//! # Example
16//!
17//! ```rust,no_run
18//! use web_capture::gdocs;
19//!
20//! #[tokio::main]
21//! async fn main() -> anyhow::Result<()> {
22//!     let url = "https://docs.google.com/document/d/abc123/edit";
23//!     if gdocs::is_google_docs_url(url) {
24//!         let result = gdocs::fetch_google_doc(url, "html", None).await?;
25//!         println!("Content length: {}", result.content.len());
26//!     }
27//!     Ok(())
28//! }
29//! ```
30
31use base64::Engine;
32use regex::Regex;
33use serde_json::Value;
34use std::collections::HashMap;
35use std::hash::BuildHasher;
36use std::io::Write;
37use std::sync::OnceLock;
38use tracing::{debug, info};
39
40use crate::WebCaptureError;
41
42const GDOCS_EXPORT_BASE: &str = "https://docs.google.com/document/d";
43const GDOCS_API_BASE: &str = "https://docs.googleapis.com/v1/documents";
44
45fn gdocs_url_pattern() -> &'static Regex {
46    static PATTERN: OnceLock<Regex> = OnceLock::new();
47    PATTERN.get_or_init(|| Regex::new(r"docs\.google\.com/document/d/([a-zA-Z0-9_-]+)").unwrap())
48}
49
/// Result of fetching a Google Docs document.
///
/// Produced by `fetch_google_doc` and `fetch_google_doc_as_markdown`.
#[derive(Debug, Clone)]
pub struct GDocsResult {
    /// The document content in the requested format.
    ///
    /// `fetch_google_doc_as_markdown` stores the converted Markdown here.
    pub content: String,
    /// The export format used.
    ///
    /// `fetch_google_doc` copies the caller's `format` argument verbatim, while
    /// `fetch_google_doc_as_markdown` sets this to `"markdown"`.
    pub format: String,
    /// The extracted document ID.
    pub document_id: String,
    /// The export URL that was fetched.
    pub export_url: String,
}
62
/// Google Docs capture backend selected from the CLI `--capture` flag.
///
/// `select_capture_method` maps `"browser"` to `BrowserModel`, `"api"` with a
/// token to `DocsApi`, and `"api"` without a token to `PublicExport`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GDocsCaptureMethod {
    /// Load `/edit` and extract `DOCS_modelChunk`.
    BrowserModel,
    /// Use the public `/export?format=...` endpoint.
    PublicExport,
    /// Use the authenticated `docs.googleapis.com` REST API.
    DocsApi,
}
73
/// Rendered Google Docs content from either Docs API or editor model data.
///
/// Returned by `fetch_google_doc_from_docs_api` and `fetch_google_doc_from_model`;
/// all three renderings are produced from the same parsed blocks.
#[derive(Debug, Clone)]
pub struct GDocsRenderedResult {
    /// Markdown output.
    pub markdown: String,
    /// HTML output.
    pub html: String,
    /// Plain text output.
    pub text: String,
    /// The extracted document ID.
    pub document_id: String,
    /// Source URL used for capture.
    ///
    /// This is the Docs API URL for the API path and the `/edit` URL for the
    /// editor-model path, despite the field name.
    pub export_url: String,
}
88
/// Parsed Google Docs model/document capture.
///
/// Built by `parse_model_chunks`; the `Default` value is an empty document.
#[derive(Debug, Clone, Default)]
pub struct CapturedDocument {
    /// Ordered document blocks.
    pub blocks: Vec<CapturedBlock>,
    /// Tables extracted from `blocks` for compatibility with tests and callers.
    ///
    /// Each finished table is cloned into this list in addition to being pushed
    /// onto `blocks`.
    pub tables: Vec<TableBlock>,
    /// Images extracted from model positions.
    ///
    /// One node per `ae`/`ase` model item whose `id` has a recorded position.
    pub images: Vec<ContentNode>,
    /// Plain text projection.
    ///
    /// Computed from `blocks` via `blocks_to_text`.
    pub text: String,
}
101
/// Captured block.
///
/// A document is an ordered list of these: either a paragraph or a table.
#[derive(Debug, Clone)]
pub enum CapturedBlock {
    /// Paragraph-like block.
    Paragraph {
        /// Paragraph content.
        content: Vec<ContentNode>,
        /// Optional Google Docs named style.
        ///
        /// Taken from `paragraphStyle.namedStyleType` on the Docs API path;
        /// always `None` for paragraphs parsed from model chunks.
        style: Option<String>,
    },
    /// Table block.
    Table(TableBlock),
}
115
/// Captured table.
///
/// The Markdown renderer treats the first row as the header row.
#[derive(Debug, Clone, Default)]
pub struct TableBlock {
    /// Table rows.
    pub rows: Vec<TableRow>,
}
122
/// Captured table row.
///
/// Rows may hold differing numbers of cells; the Markdown renderer pads short
/// rows with empty cells up to the widest row.
#[derive(Debug, Clone, Default)]
pub struct TableRow {
    /// Row cells.
    pub cells: Vec<TableCell>,
}
129
/// Captured table cell.
///
/// On the Docs API path, the paragraphs of a cell are flattened into one inline
/// sequence separated by `"\n"`, and a nested table is embedded as Markdown text.
#[derive(Debug, Clone, Default)]
pub struct TableCell {
    /// Cell content.
    pub content: Vec<ContentNode>,
}
136
/// Captured inline content node.
///
/// Adjacent text is normally merged into a single `Text` node by `append_text`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentNode {
    /// Text run.
    Text(String),
    /// Image placeholder.
    Image {
        /// Content ID from Google Docs model data.
        ///
        /// Always `None` for images that come from the Docs API path.
        cid: Option<String>,
        /// Resolved image URL.
        ///
        /// `None` when the model item has no content ID or the ID is missing
        /// from the CID-to-URL map; the Markdown renderer emits nothing for
        /// such images.
        url: Option<String>,
        /// Alt text.
        ///
        /// Model images use `"image"` or `"suggested image"`; Docs API images
        /// use the embedded object's title or description, else `"image"`.
        alt: String,
        /// Whether this image came from a suggested edit.
        ///
        /// Set for model items of type `ase`.
        is_suggestion: bool,
    },
}
154
155/// Check if a URL is a Google Docs document URL.
156#[must_use]
157pub fn is_google_docs_url(url: &str) -> bool {
158    gdocs_url_pattern().is_match(url)
159}
160
161/// Extract the document ID from a Google Docs URL.
162///
163/// Returns `None` if the URL is not a valid Google Docs URL.
164#[must_use]
165pub fn extract_document_id(url: &str) -> Option<String> {
166    gdocs_url_pattern()
167        .captures(url)
168        .and_then(|caps| caps.get(1))
169        .map(|m| m.as_str().to_string())
170}
171
172/// Build a Google Docs export URL.
173///
174/// # Arguments
175///
176/// * `document_id` - The Google Docs document ID
177/// * `format` - Export format (html, txt, md, pdf, docx, epub)
178#[must_use]
179pub fn build_export_url(document_id: &str, format: &str) -> String {
180    let export_format = match format {
181        "html" | "txt" | "md" | "pdf" | "docx" | "epub" | "zip" => format,
182        _ => "html",
183    };
184    format!("{GDOCS_EXPORT_BASE}/{document_id}/export?format={export_format}")
185}
186
187/// Build a Google Docs editor URL.
188#[must_use]
189pub fn build_edit_url(document_id: &str) -> String {
190    format!("{GDOCS_EXPORT_BASE}/{document_id}/edit")
191}
192
193/// Build a Google Docs REST API URL.
194#[must_use]
195pub fn build_docs_api_url(document_id: &str) -> String {
196    format!("{GDOCS_API_BASE}/{document_id}")
197}
198
199/// Select a Google Docs capture backend from the CLI `--capture` value.
200///
201/// # Errors
202///
203/// Returns an error when `capture` is neither `browser` nor `api`.
204pub fn select_capture_method(
205    capture: &str,
206    api_token: Option<&str>,
207) -> crate::Result<GDocsCaptureMethod> {
208    match capture.to_lowercase().as_str() {
209        "browser" => Ok(GDocsCaptureMethod::BrowserModel),
210        "api" if api_token.is_some() => Ok(GDocsCaptureMethod::DocsApi),
211        "api" => Ok(GDocsCaptureMethod::PublicExport),
212        other => Err(WebCaptureError::InvalidUrl(format!(
213            "Unsupported Google Docs capture method \"{other}\". Use \"browser\" or \"api\"."
214        ))),
215    }
216}
217
218/// Fetch a Google Docs document via the export URL.
219///
220/// For public documents, pass `None` for `api_token`.
221/// For private documents, pass a Bearer token string.
222///
223/// # Arguments
224///
225/// * `url` - Google Docs URL (edit URL or any URL containing the document ID)
226/// * `format` - Export format (html, txt, md, pdf, docx, epub)
227/// * `api_token` - Optional API token for private documents
228///
229/// # Errors
230///
231/// Returns an error if the URL is not a valid Google Docs URL, or if the fetch fails.
232pub async fn fetch_google_doc(
233    url: &str,
234    format: &str,
235    api_token: Option<&str>,
236) -> crate::Result<GDocsResult> {
237    let document_id = extract_document_id(url).ok_or_else(|| {
238        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
239    })?;
240
241    let export_url = build_export_url(&document_id, format);
242    debug!(
243        document_id = %document_id,
244        format = %format,
245        export_url = %export_url,
246        has_api_token = api_token.is_some(),
247        "fetching Google Doc via public export"
248    );
249
250    let mut request = reqwest::Client::new()
251        .get(&export_url)
252        .header(
253            "User-Agent",
254            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
255        )
256        .header("Accept-Charset", "utf-8")
257        .header("Accept-Language", "en-US,en;q=0.9");
258
259    if let Some(token) = api_token {
260        request = request.header("Authorization", format!("Bearer {token}"));
261    }
262
263    let response = request
264        .send()
265        .await
266        .map_err(|e| WebCaptureError::FetchError(format!("Failed to fetch Google Doc: {e}")))?;
267    debug!(
268        document_id = %document_id,
269        status = response.status().as_u16(),
270        success = response.status().is_success(),
271        content_type = response
272            .headers()
273            .get(reqwest::header::CONTENT_TYPE)
274            .and_then(|value| value.to_str().ok())
275            .unwrap_or(""),
276        "received Google Docs public export response"
277    );
278
279    if !response.status().is_success() {
280        return Err(WebCaptureError::FetchError(format!(
281            "Failed to fetch Google Doc ({} {}): {}",
282            response.status().as_u16(),
283            response.status().canonical_reason().unwrap_or("Unknown"),
284            export_url
285        )));
286    }
287
288    let raw_content = response.text().await.map_err(|e| {
289        WebCaptureError::FetchError(format!("Failed to read Google Doc response: {e}"))
290    })?;
291    debug!(
292        document_id = %document_id,
293        bytes = raw_content.len(),
294        "read Google Docs public export body"
295    );
296
297    // Decode HTML entities to unicode for text-based formats
298    let content = match format {
299        "html" | "txt" | "md" => crate::html::decode_html_entities(&raw_content),
300        _ => raw_content,
301    };
302
303    Ok(GDocsResult {
304        content,
305        format: format.to_string(),
306        document_id,
307        export_url,
308    })
309}
310
311/// Fetch a Google Docs document and convert to Markdown.
312///
313/// Fetches the document as HTML, then converts to Markdown using the
314/// existing HTML-to-Markdown pipeline.
315///
316/// # Arguments
317///
318/// * `url` - Google Docs URL
319/// * `api_token` - Optional API token for private documents
320///
321/// # Errors
322///
323/// Returns an error if the fetch or conversion fails.
324pub async fn fetch_google_doc_as_markdown(
325    url: &str,
326    api_token: Option<&str>,
327) -> crate::Result<GDocsResult> {
328    let result = fetch_google_doc(url, "html", api_token).await?;
329
330    let markdown =
331        crate::markdown::convert_html_to_markdown(&result.content, Some(&result.export_url))?;
332    debug!(
333        document_id = %result.document_id,
334        bytes = markdown.len(),
335        "rendered Google Docs public export markdown"
336    );
337
338    Ok(GDocsResult {
339        content: markdown,
340        format: "markdown".to_string(),
341        document_id: result.document_id,
342        export_url: result.export_url,
343    })
344}
345
346/// Fetch and render a Google Docs document via the authenticated REST API.
347///
348/// # Errors
349///
350/// Returns an error when the URL is invalid, no token is provided, or the API request fails.
351pub async fn fetch_google_doc_from_docs_api(
352    url: &str,
353    api_token: &str,
354) -> crate::Result<GDocsRenderedResult> {
355    let document_id = extract_document_id(url).ok_or_else(|| {
356        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
357    })?;
358    let api_url = build_docs_api_url(&document_id);
359    debug!(
360        document_id = %document_id,
361        api_url = %api_url,
362        "fetching Google Doc via Docs API"
363    );
364
365    let response = reqwest::Client::new()
366        .get(&api_url)
367        .header("Authorization", format!("Bearer {api_token}"))
368        .header("Accept", "application/json")
369        .send()
370        .await
371        .map_err(|e| {
372            WebCaptureError::FetchError(format!("Failed to fetch Google Doc via Docs API: {e}"))
373        })?;
374    debug!(
375        document_id = %document_id,
376        status = response.status().as_u16(),
377        success = response.status().is_success(),
378        content_type = response
379            .headers()
380            .get(reqwest::header::CONTENT_TYPE)
381            .and_then(|value| value.to_str().ok())
382            .unwrap_or(""),
383        "received Google Docs API response"
384    );
385
386    if !response.status().is_success() {
387        return Err(WebCaptureError::FetchError(format!(
388            "Failed to fetch Google Doc via Docs API ({} {}): {}",
389            response.status().as_u16(),
390            response.status().canonical_reason().unwrap_or("Unknown"),
391            api_url
392        )));
393    }
394
395    let body = response.text().await.map_err(|e| {
396        WebCaptureError::FetchError(format!("Failed to read Google Docs API response: {e}"))
397    })?;
398    let document = serde_json::from_str::<Value>(&body).map_err(|e| {
399        WebCaptureError::ParseError(format!("Failed to parse Google Docs API response: {e}"))
400    })?;
401    let rendered = render_docs_api_document(&document);
402    debug!(
403        document_id = %document_id,
404        title = document.get("title").and_then(|value| value.as_str()).unwrap_or(""),
405        markdown_bytes = rendered.markdown.len(),
406        html_bytes = rendered.html.len(),
407        text_bytes = rendered.text.len(),
408        "rendered Google Docs API document"
409    );
410
411    Ok(GDocsRenderedResult {
412        markdown: rendered.markdown,
413        html: rendered.html,
414        text: rendered.text,
415        document_id,
416        export_url: api_url,
417    })
418}
419
420/// Fetch and render the model data embedded in the Google Docs `/edit` route.
421///
422/// The Rust browser automation crate currently exposes a placeholder browser,
423/// so this path fetches the editor HTML and parses embedded `DOCS_modelChunk`
424/// data when available.
425///
426/// # Errors
427///
428/// Returns an error when the URL is invalid, the fetch fails, or no model chunks are present.
429pub async fn fetch_google_doc_from_model(
430    url: &str,
431    api_token: Option<&str>,
432) -> crate::Result<GDocsRenderedResult> {
433    let document_id = extract_document_id(url).ok_or_else(|| {
434        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
435    })?;
436    let edit_url = build_edit_url(&document_id);
437    debug!(
438        document_id = %document_id,
439        edit_url = %edit_url,
440        has_api_token = api_token.is_some(),
441        "fetching Google Doc editor model"
442    );
443    let mut request = reqwest::Client::new()
444        .get(&edit_url)
445        .header(
446            "User-Agent",
447            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
448        )
449        .header("Accept-Language", "en-US,en;q=0.9");
450
451    if let Some(token) = api_token {
452        request = request.header("Authorization", format!("Bearer {token}"));
453    }
454
455    let response = request.send().await.map_err(|e| {
456        WebCaptureError::FetchError(format!("Failed to fetch Google Doc editor: {e}"))
457    })?;
458    debug!(
459        document_id = %document_id,
460        status = response.status().as_u16(),
461        success = response.status().is_success(),
462        content_type = response
463            .headers()
464            .get(reqwest::header::CONTENT_TYPE)
465            .and_then(|value| value.to_str().ok())
466            .unwrap_or(""),
467        "received Google Docs editor response"
468    );
469
470    if !response.status().is_success() {
471        return Err(WebCaptureError::FetchError(format!(
472            "Failed to fetch Google Doc editor ({} {}): {}",
473            response.status().as_u16(),
474            response.status().canonical_reason().unwrap_or("Unknown"),
475            edit_url
476        )));
477    }
478
479    let html = response.text().await.map_err(|e| {
480        WebCaptureError::FetchError(format!("Failed to read Google Doc editor response: {e}"))
481    })?;
482    let chunks = extract_model_chunks_from_html(&html);
483    debug!(
484        document_id = %document_id,
485        html_bytes = html.len(),
486        chunks = chunks.len(),
487        "extracted Google Docs editor model chunks"
488    );
489    if chunks.is_empty() {
490        return Err(WebCaptureError::ParseError(
491            "Google Docs editor HTML did not contain DOCS_modelChunk data".to_string(),
492        ));
493    }
494
495    let cid_urls = extract_cid_urls_from_html(&html);
496    let capture = parse_model_chunks(&chunks, &cid_urls);
497    info!(
498        document_id = %document_id,
499        chunks = chunks.len(),
500        cid_urls = cid_urls.len(),
501        blocks = capture.blocks.len(),
502        tables = capture.tables.len(),
503        images = capture.images.len(),
504        text_bytes = capture.text.len(),
505        "parsed Google Docs editor model"
506    );
507
508    Ok(GDocsRenderedResult {
509        markdown: render_captured_document(&capture, "markdown"),
510        html: render_captured_document(&capture, "html"),
511        text: render_captured_document(&capture, "txt"),
512        document_id,
513        export_url: edit_url,
514    })
515}
516
517/// Render a Google Docs REST API document value.
518#[must_use]
519pub fn render_docs_api_document(document: &Value) -> GDocsRenderedOutput {
520    let blocks = structural_elements_to_blocks(
521        document
522            .pointer("/body/content")
523            .and_then(Value::as_array)
524            .map_or(&[] as &[Value], Vec::as_slice),
525        document.pointer("/inlineObjects").unwrap_or(&Value::Null),
526    );
527    GDocsRenderedOutput {
528        markdown: render_blocks_markdown(&blocks),
529        html: render_blocks_html(&blocks),
530        text: blocks_to_text(&blocks),
531    }
532}
533
/// Rendered document output.
///
/// Returned by `render_docs_api_document`; the three fields are different
/// renderings of the same block list.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GDocsRenderedOutput {
    /// Markdown output.
    pub markdown: String,
    /// HTML output.
    pub html: String,
    /// Plain text output.
    pub text: String,
}
544
545fn structural_elements_to_blocks(elements: &[Value], inline_objects: &Value) -> Vec<CapturedBlock> {
546    let mut blocks = Vec::new();
547    for element in elements {
548        if let Some(paragraph) = element.get("paragraph") {
549            let content = paragraph_to_content(paragraph, inline_objects);
550            if !content_to_text(&content).trim().is_empty()
551                || content
552                    .iter()
553                    .any(|node| matches!(node, ContentNode::Image { .. }))
554            {
555                blocks.push(CapturedBlock::Paragraph {
556                    style: paragraph
557                        .pointer("/paragraphStyle/namedStyleType")
558                        .and_then(Value::as_str)
559                        .map(ToString::to_string),
560                    content,
561                });
562            }
563        } else if let Some(table) = element.get("table") {
564            blocks.push(CapturedBlock::Table(table_to_block(table, inline_objects)));
565        }
566    }
567    blocks
568}
569
570fn table_to_block(table: &Value, inline_objects: &Value) -> TableBlock {
571    let rows = table
572        .get("tableRows")
573        .and_then(Value::as_array)
574        .map_or(&[] as &[Value], Vec::as_slice)
575        .iter()
576        .map(|row| TableRow {
577            cells: row
578                .get("tableCells")
579                .and_then(Value::as_array)
580                .map_or(&[] as &[Value], Vec::as_slice)
581                .iter()
582                .map(|cell| TableCell {
583                    content: structural_elements_to_inline_content(
584                        cell.get("content")
585                            .and_then(Value::as_array)
586                            .map_or(&[] as &[Value], Vec::as_slice),
587                        inline_objects,
588                    ),
589                })
590                .collect(),
591        })
592        .collect();
593    TableBlock { rows }
594}
595
596fn structural_elements_to_inline_content(
597    elements: &[Value],
598    inline_objects: &Value,
599) -> Vec<ContentNode> {
600    let mut content = Vec::new();
601    for element in elements {
602        if let Some(paragraph) = element.get("paragraph") {
603            let paragraph_content = paragraph_to_content(paragraph, inline_objects);
604            if !content.is_empty() && !paragraph_content.is_empty() {
605                append_text(&mut content, "\n");
606            }
607            content.extend(paragraph_content);
608        } else if let Some(table) = element.get("table") {
609            append_text(
610                &mut content,
611                &render_blocks_markdown(&[CapturedBlock::Table(table_to_block(
612                    table,
613                    inline_objects,
614                ))]),
615            );
616        }
617    }
618    content
619}
620
621fn paragraph_to_content(paragraph: &Value, inline_objects: &Value) -> Vec<ContentNode> {
622    let mut content = Vec::new();
623    for element in paragraph
624        .get("elements")
625        .and_then(Value::as_array)
626        .map_or(&[] as &[Value], Vec::as_slice)
627    {
628        if let Some(text) = element
629            .pointer("/textRun/content")
630            .and_then(Value::as_str)
631            .map(|text| text.strip_suffix('\n').unwrap_or(text))
632        {
633            append_text(&mut content, text);
634        } else if let Some(inline_id) = element
635            .pointer("/inlineObjectElement/inlineObjectId")
636            .and_then(Value::as_str)
637        {
638            if let Some(image) = inline_object_to_image(inline_id, inline_objects) {
639                content.push(image);
640            }
641        }
642    }
643    content
644}
645
646fn inline_object_to_image(inline_id: &str, inline_objects: &Value) -> Option<ContentNode> {
647    let embedded = inline_objects
648        .get(inline_id)?
649        .pointer("/inlineObjectProperties/embeddedObject")?;
650    let url = embedded
651        .pointer("/imageProperties/contentUri")
652        .or_else(|| embedded.pointer("/imageProperties/sourceUri"))
653        .and_then(Value::as_str)?;
654    let alt = embedded
655        .get("title")
656        .or_else(|| embedded.get("description"))
657        .and_then(Value::as_str)
658        .unwrap_or("image");
659    Some(ContentNode::Image {
660        cid: None,
661        url: Some(url.to_string()),
662        alt: alt.to_string(),
663        is_suggestion: false,
664    })
665}
666
667/// Parse captured `DOCS_modelChunk` values.
668#[must_use]
669#[allow(clippy::too_many_lines)]
670pub fn parse_model_chunks<S: BuildHasher>(
671    chunks: &[Value],
672    cid_urls: &HashMap<String, String, S>,
673) -> CapturedDocument {
674    let items = collect_model_items(chunks);
675    let full_text = items
676        .iter()
677        .filter(|item| matches!(item.get("ty").and_then(Value::as_str), Some("is" | "iss")))
678        .filter_map(|item| item.get("s").and_then(Value::as_str))
679        .collect::<String>();
680
681    let mut positions = HashMap::new();
682    for item in &items {
683        if matches!(item.get("ty").and_then(Value::as_str), Some("te" | "ste")) {
684            if let (Some(id), Some(pos)) = (
685                item.get("id").and_then(Value::as_str),
686                item.get("spi").and_then(Value::as_u64),
687            ) {
688                if let Ok(pos) = usize::try_from(pos) {
689                    positions.insert(id.to_string(), pos);
690                }
691            }
692        }
693    }
694
695    let mut images_by_pos: HashMap<usize, ContentNode> = HashMap::new();
696    let mut images = Vec::new();
697    for item in &items {
698        let ty = item.get("ty").and_then(Value::as_str);
699        if !matches!(ty, Some("ae" | "ase")) {
700            continue;
701        }
702        let Some(id) = item.get("id").and_then(Value::as_str) else {
703            continue;
704        };
705        let Some(pos) = positions.get(id).copied() else {
706            continue;
707        };
708        let cid = item
709            .pointer("/epm/ee_eo/i_cid")
710            .and_then(Value::as_str)
711            .map(ToString::to_string);
712        let node = ContentNode::Image {
713            url: cid.as_ref().and_then(|cid| cid_urls.get(cid).cloned()),
714            cid,
715            alt: if ty == Some("ase") {
716                "suggested image".to_string()
717            } else {
718                "image".to_string()
719            },
720            is_suggestion: ty == Some("ase"),
721        };
722        images_by_pos.insert(pos, node.clone());
723        images.push(node);
724    }
725
726    let chars: Vec<char> = full_text.chars().collect();
727    let mut blocks = Vec::new();
728    let mut tables = Vec::new();
729    let mut paragraph = Vec::new();
730    let mut table: Option<TableBlock> = None;
731    let mut row: Option<TableRow> = None;
732    let mut cell: Option<TableCell> = None;
733
734    for (idx, ch) in chars.iter().copied().enumerate() {
735        match ch as u32 {
736            0x10 => {
737                flush_paragraph(&mut paragraph, &mut blocks);
738                table = Some(TableBlock::default());
739            }
740            0x11 => flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks),
741            0x12 => {
742                flush_row(&mut row, &mut cell, table.as_mut());
743                row = Some(TableRow::default());
744            }
745            0x1c => {
746                flush_cell(&mut row, &mut cell);
747                if row.is_none() {
748                    row = Some(TableRow::default());
749                }
750                cell = Some(TableCell::default());
751            }
752            0x0a => {
753                if table.is_some() {
754                    flush_row(&mut row, &mut cell, table.as_mut());
755                } else {
756                    flush_paragraph(&mut paragraph, &mut blocks);
757                }
758            }
759            0x0b => append_to_current(&mut paragraph, &mut row, &mut cell, table.is_some(), "\n"),
760            _ => {
761                if let Some(image) = images_by_pos.get(&idx).cloned() {
762                    push_to_current(&mut paragraph, &mut row, &mut cell, table.is_some(), image);
763                    if ch == '*' {
764                        continue;
765                    }
766                }
767                append_to_current(
768                    &mut paragraph,
769                    &mut row,
770                    &mut cell,
771                    table.is_some(),
772                    &ch.to_string(),
773                );
774            }
775        }
776    }
777
778    if table.is_some() {
779        flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
780    }
781    flush_paragraph(&mut paragraph, &mut blocks);
782
783    CapturedDocument {
784        text: blocks_to_text(&blocks),
785        blocks,
786        tables,
787        images,
788    }
789}
790
791fn collect_model_items(chunks: &[Value]) -> Vec<Value> {
792    let mut items = Vec::new();
793    for chunk in chunks {
794        if let Some(array) = chunk.as_array() {
795            items.extend(array.iter().cloned());
796        } else if let Some(array) = chunk.get("chunk").and_then(Value::as_array) {
797            items.extend(array.iter().cloned());
798        }
799    }
800    items
801}
802
803fn flush_paragraph(paragraph: &mut Vec<ContentNode>, blocks: &mut Vec<CapturedBlock>) {
804    if !content_to_text(paragraph).trim().is_empty()
805        || paragraph
806            .iter()
807            .any(|node| matches!(node, ContentNode::Image { .. }))
808    {
809        blocks.push(CapturedBlock::Paragraph {
810            content: std::mem::take(paragraph),
811            style: None,
812        });
813    } else {
814        paragraph.clear();
815    }
816}
817
818fn flush_cell(row: &mut Option<TableRow>, cell: &mut Option<TableCell>) {
819    if let (Some(row), Some(cell)) = (row.as_mut(), cell.take()) {
820        row.cells.push(cell);
821    }
822}
823
824fn flush_row(
825    row: &mut Option<TableRow>,
826    cell: &mut Option<TableCell>,
827    table: Option<&mut TableBlock>,
828) {
829    flush_cell(row, cell);
830    if let (Some(table), Some(row)) = (table, row.take()) {
831        table.rows.push(row);
832    }
833}
834
835fn flush_table(
836    table: &mut Option<TableBlock>,
837    row: &mut Option<TableRow>,
838    cell: &mut Option<TableCell>,
839    tables: &mut Vec<TableBlock>,
840    blocks: &mut Vec<CapturedBlock>,
841) {
842    flush_row(row, cell, table.as_mut());
843    if let Some(table) = table.take() {
844        tables.push(table.clone());
845        blocks.push(CapturedBlock::Table(table));
846    }
847}
848
849fn push_to_current(
850    paragraph: &mut Vec<ContentNode>,
851    row: &mut Option<TableRow>,
852    cell: &mut Option<TableCell>,
853    in_table: bool,
854    node: ContentNode,
855) {
856    if in_table {
857        if row.is_none() {
858            *row = Some(TableRow::default());
859        }
860        if cell.is_none() {
861            *cell = Some(TableCell::default());
862        }
863        if let Some(cell) = cell.as_mut() {
864            cell.content.push(node);
865        }
866    } else {
867        paragraph.push(node);
868    }
869}
870
871fn append_to_current(
872    paragraph: &mut Vec<ContentNode>,
873    row: &mut Option<TableRow>,
874    cell: &mut Option<TableCell>,
875    in_table: bool,
876    text: &str,
877) {
878    if in_table {
879        if row.is_none() {
880            *row = Some(TableRow::default());
881        }
882        if cell.is_none() {
883            *cell = Some(TableCell::default());
884        }
885        if let Some(cell) = cell.as_mut() {
886            append_text(&mut cell.content, text);
887        }
888    } else {
889        append_text(paragraph, text);
890    }
891}
892
893fn append_text(content: &mut Vec<ContentNode>, text: &str) {
894    if text.is_empty() {
895        return;
896    }
897    if let Some(ContentNode::Text(last)) = content.last_mut() {
898        last.push_str(text);
899    } else {
900        content.push(ContentNode::Text(text.to_string()));
901    }
902}
903
904/// Render a parsed Google Docs capture as Markdown, HTML, or text.
905#[must_use]
906pub fn render_captured_document(capture: &CapturedDocument, format: &str) -> String {
907    match format.to_lowercase().as_str() {
908        "html" => render_blocks_html(&capture.blocks),
909        "txt" | "text" => blocks_to_text(&capture.blocks),
910        _ => render_blocks_markdown(&capture.blocks),
911    }
912}
913
914fn render_blocks_markdown(blocks: &[CapturedBlock]) -> String {
915    blocks
916        .iter()
917        .filter_map(|block| match block {
918            CapturedBlock::Paragraph { content, style } => {
919                let text = render_content_markdown(content).trim().to_string();
920                if text.is_empty() {
921                    None
922                } else {
923                    Some(render_paragraph_markdown(&text, style.as_deref()))
924                }
925            }
926            CapturedBlock::Table(table) => Some(render_table_markdown(table)),
927        })
928        .collect::<Vec<_>>()
929        .join("\n\n")
930}
931
/// Prefix paragraph text with a Markdown heading marker for its named style.
///
/// `TITLE` maps to level 1, `SUBTITLE` to level 2, and `HEADING_<n>` to level
/// `n` clamped to 1..=6 (level 1 when `n` does not parse). Any other style, or
/// no style, returns the text unchanged.
fn render_paragraph_markdown(text: &str, style: Option<&str>) -> String {
    let level = match style {
        Some("TITLE") => 1,
        Some("SUBTITLE") => 2,
        Some(named) if named.starts_with("HEADING_") => named
            .trim_start_matches("HEADING_")
            .parse::<usize>()
            .unwrap_or(1)
            .clamp(1, 6),
        _ => return text.to_owned(),
    };

    let mut heading = "#".repeat(level);
    heading.push(' ');
    heading.push_str(text);
    heading
}
946
947fn render_table_markdown(table: &TableBlock) -> String {
948    if table.rows.is_empty() {
949        return String::new();
950    }
951    let width = table
952        .rows
953        .iter()
954        .map(|row| row.cells.len())
955        .max()
956        .unwrap_or(1);
957    let rows = table
958        .rows
959        .iter()
960        .map(|row| {
961            (0..width)
962                .map(|idx| {
963                    row.cells.get(idx).map_or_else(String::new, |cell| {
964                        escape_markdown_table_cell(&render_content_markdown(&cell.content))
965                    })
966                })
967                .collect::<Vec<_>>()
968        })
969        .collect::<Vec<_>>();
970    let separator = vec!["---".to_string(); width];
971    std::iter::once(&rows[0])
972        .chain(std::iter::once(&separator))
973        .chain(rows.iter().skip(1))
974        .map(|row| format!("| {} |", row.join(" | ")))
975        .collect::<Vec<_>>()
976        .join("\n")
977}
978
979fn render_content_markdown(content: &[ContentNode]) -> String {
980    content
981        .iter()
982        .map(|node| match node {
983            ContentNode::Text(text) => text.clone(),
984            ContentNode::Image {
985                url: Some(url),
986                alt,
987                ..
988            } => format!("![{alt}]({url})"),
989            ContentNode::Image { .. } => String::new(),
990        })
991        .collect()
992}
993
994fn render_blocks_html(blocks: &[CapturedBlock]) -> String {
995    format!(
996        "<!doctype html><html><body>{}</body></html>",
997        blocks
998            .iter()
999            .map(|block| match block {
1000                CapturedBlock::Paragraph { content, style } => {
1001                    let tag = paragraph_tag(style.as_deref());
1002                    format!("<{tag}>{}</{tag}>", render_content_html(content))
1003                }
1004                CapturedBlock::Table(table) => render_table_html(table),
1005            })
1006            .collect::<String>()
1007    )
1008}
1009
1010fn render_table_html(table: &TableBlock) -> String {
1011    let mut html = String::from("<table>");
1012    for row in &table.rows {
1013        html.push_str("<tr>");
1014        for cell in &row.cells {
1015            html.push_str("<td>");
1016            html.push_str(&render_content_html(&cell.content));
1017            html.push_str("</td>");
1018        }
1019        html.push_str("</tr>");
1020    }
1021    html.push_str("</table>");
1022    html
1023}
1024
1025fn render_content_html(content: &[ContentNode]) -> String {
1026    content
1027        .iter()
1028        .map(|node| match node {
1029            ContentNode::Text(text) => escape_html(text).replace('\n', "<br>"),
1030            ContentNode::Image {
1031                url: Some(url),
1032                alt,
1033                ..
1034            } => {
1035                format!(
1036                    "<img src=\"{}\" alt=\"{}\">",
1037                    escape_html(url),
1038                    escape_html(alt)
1039                )
1040            }
1041            ContentNode::Image { .. } => String::new(),
1042        })
1043        .collect()
1044}
1045
/// Pick the HTML element name for a paragraph style.
///
/// `TITLE`/`HEADING_1` map to `h1`, `SUBTITLE`/`HEADING_2` to `h2`, and
/// `HEADING_3` through `HEADING_6` to `h3` through `h6`. Anything else,
/// including a missing style, maps to `p`.
fn paragraph_tag(style: Option<&str>) -> &'static str {
    let Some(name) = style else {
        return "p";
    };
    match name {
        "TITLE" | "HEADING_1" => "h1",
        "SUBTITLE" | "HEADING_2" => "h2",
        "HEADING_3" => "h3",
        "HEADING_4" => "h4",
        "HEADING_5" => "h5",
        "HEADING_6" => "h6",
        _ => "p",
    }
}
1057
1058fn blocks_to_text(blocks: &[CapturedBlock]) -> String {
1059    blocks
1060        .iter()
1061        .map(|block| match block {
1062            CapturedBlock::Paragraph { content, .. } => content_to_text(content),
1063            CapturedBlock::Table(table) => table
1064                .rows
1065                .iter()
1066                .map(|row| {
1067                    row.cells
1068                        .iter()
1069                        .map(|cell| content_to_text(&cell.content))
1070                        .collect::<Vec<_>>()
1071                        .join("\t")
1072                })
1073                .collect::<Vec<_>>()
1074                .join("\n"),
1075        })
1076        .filter(|text| !text.is_empty())
1077        .collect::<Vec<_>>()
1078        .join("\n")
1079}
1080
1081fn content_to_text(content: &[ContentNode]) -> String {
1082    content
1083        .iter()
1084        .map(|node| match node {
1085            ContentNode::Text(text) => text.clone(),
1086            ContentNode::Image {
1087                url: Some(_), alt, ..
1088            } => format!("[{alt}]"),
1089            ContentNode::Image { .. } => String::new(),
1090        })
1091        .collect()
1092}
1093
/// Escape the five HTML-significant characters (`&`, `<`, `>`, `"`, `'`) in
/// `value` so it can be embedded in element content or a quoted attribute.
fn escape_html(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
1102
/// Make `value` safe to place inside a single Markdown table cell.
///
/// Pipes are backslash-escaped so they are not read as column separators.
/// Every line break (`\r\n`, `\n`, or a bare `\r`) becomes `<br>`, because a
/// raw line ending of any kind would terminate the table row.
fn escape_markdown_table_cell(value: &str) -> String {
    value
        .replace('|', "\\|")
        // Collapse CRLF first so it yields one `<br>` rather than two.
        .replace("\r\n", "<br>")
        .replace(['\n', '\r'], "<br>")
}
1106
1107fn extract_cid_urls_from_html(html: &str) -> HashMap<String, String> {
1108    let pattern = Regex::new(
1109        r#""([A-Za-z0-9_-]{20,})"\s*:\s*"(https://docs\.google\.com/docs-images-rt/[^"]+)""#,
1110    )
1111    .unwrap();
1112    pattern
1113        .captures_iter(html)
1114        .filter_map(|caps| {
1115            Some((
1116                caps.get(1)?.as_str().to_string(),
1117                caps.get(2)?
1118                    .as_str()
1119                    .replace(r"\u003d", "=")
1120                    .replace(r"\u0026", "&")
1121                    .replace(r"\/", "/"),
1122            ))
1123        })
1124        .collect()
1125}
1126
1127fn extract_model_chunks_from_html(html: &str) -> Vec<Value> {
1128    let mut chunks = Vec::new();
1129    let mut offset = 0;
1130    while let Some(relative) = html[offset..].find("DOCS_modelChunk") {
1131        let marker = offset + relative;
1132        let Some(start) = html[marker..].find(['{', '[']).map(|idx| marker + idx) else {
1133            break;
1134        };
1135        let Some(end) = find_json_end(html, start) else {
1136            offset = start + 1;
1137            continue;
1138        };
1139        if let Ok(value) = serde_json::from_str::<Value>(&html[start..end]) {
1140            chunks.push(value);
1141        }
1142        offset = end;
1143    }
1144    chunks
1145}
1146
/// Find the end of the bracketed JSON value that begins at byte `start`.
///
/// `input[start]` must be `{` or `[`. The scan tracks nesting of that bracket
/// kind only and ignores brackets inside double-quoted strings, honouring
/// backslash escapes.
///
/// Returns the byte offset just past the matching closing bracket. Returns
/// `None` when `start` is out of range or not on a character boundary, when
/// the character at `start` is not an opening bracket, or when the value is
/// never closed.
fn find_json_end(input: &str, start: usize) -> Option<usize> {
    // `get` rejects invalid offsets instead of panicking like `input[start..]`.
    let tail = input.get(start..)?;
    let opening = tail.chars().next()?;
    let closing = match opening {
        '{' => '}',
        '[' => ']',
        _ => return None,
    };
    let mut depth = 0usize;
    let mut in_string = false;
    let mut escaped = false;

    for (relative, ch) in tail.char_indices() {
        if in_string {
            if escaped {
                // The character after a backslash never ends the string.
                escaped = false;
            } else if ch == '\\' {
                escaped = true;
            } else if ch == '"' {
                in_string = false;
            }
            continue;
        }

        if ch == '"' {
            in_string = true;
        } else if ch == opening {
            depth += 1;
        } else if ch == closing {
            depth = depth.saturating_sub(1);
            if depth == 0 {
                return Some(start + relative + ch.len_utf8());
            }
        }
    }
    None
}
1184
/// Extract a Bearer token from an Authorization header value.
///
/// The `Bearer` scheme name is matched case-insensitively (authentication
/// scheme names are case-insensitive in HTTP) and must be followed by at
/// least one space. Whitespace around the header and the token is trimmed.
///
/// Returns `None` if the header does not use the Bearer scheme or the token
/// is empty.
#[must_use]
pub fn extract_bearer_token(auth_header: &str) -> Option<&str> {
    const SCHEME: &str = "bearer";

    let trimmed = auth_header.trim();
    // `get` yields `None` for short input or a split multi-byte character.
    let scheme = trimmed.get(..SCHEME.len())?;
    if !scheme.eq_ignore_ascii_case(SCHEME) {
        return None;
    }
    let token = trimmed[SCHEME.len()..].strip_prefix(' ')?.trim();
    (!token.is_empty()).then_some(token)
}
1197
/// A single image pulled out of a base64 `data:` URI embedded in HTML.
#[derive(Debug, Clone)]
pub struct ExtractedImage {
    /// File name the image is stored under, e.g. `image-01.png`.
    pub filename: String,
    /// Decoded image bytes.
    pub data: Vec<u8>,
    /// MIME type taken from the data URI, e.g. `image/png`.
    pub mime_type: String,
}
1208
1209/// Result of fetching a Google Doc as an archive.
1210#[derive(Debug, Clone)]
1211pub struct GDocsArchiveResult {
1212    /// HTML content with local image paths
1213    pub html: String,
1214    /// Markdown content with local image paths
1215    pub markdown: String,
1216    /// Extracted images
1217    pub images: Vec<ExtractedImage>,
1218    /// Document ID
1219    pub document_id: String,
1220    /// Export URL used
1221    pub export_url: String,
1222}
1223
1224fn base64_image_pattern() -> &'static Regex {
1225    static PATTERN: OnceLock<Regex> = OnceLock::new();
1226    PATTERN.get_or_init(|| {
1227        Regex::new(
1228            r#"(<img\s[^>]*src=")data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([^"]+)(")"#,
1229        )
1230        .unwrap()
1231    })
1232}
1233
1234/// Extract base64 data URI images from HTML content.
1235///
1236/// Google Docs HTML exports embed images as base64 data URIs.
1237/// This function extracts them and replaces with local file paths.
1238///
1239/// # Arguments
1240///
1241/// * `html` - HTML content with embedded base64 images
1242///
1243/// # Returns
1244///
1245/// Tuple of (updated HTML with local paths, extracted images)
1246#[must_use]
1247pub fn extract_base64_images(html: &str) -> (String, Vec<ExtractedImage>) {
1248    let mut images = Vec::new();
1249    let mut idx = 1u32;
1250
1251    let updated_html = base64_image_pattern()
1252        .replace_all(html, |caps: &regex::Captures<'_>| {
1253            let prefix = &caps[1];
1254            let mime_ext = &caps[2];
1255            let base64_data = &caps[3];
1256            let suffix = &caps[4];
1257
1258            let ext = match mime_ext {
1259                "jpeg" => "jpg",
1260                "svg+xml" => "svg",
1261                other => other,
1262            };
1263
1264            let filename = format!("image-{idx:02}.{ext}");
1265            let mime_type = format!("image/{mime_ext}");
1266
1267            if let Ok(data) = base64::engine::general_purpose::STANDARD.decode(base64_data) {
1268                debug!("Extracted image: {} ({} bytes)", filename, data.len());
1269                images.push(ExtractedImage {
1270                    filename: filename.clone(),
1271                    data,
1272                    mime_type,
1273                });
1274            }
1275
1276            idx += 1;
1277            format!("{prefix}images/{filename}{suffix}")
1278        })
1279        .into_owned();
1280
1281    (updated_html, images)
1282}
1283
1284/// Fetch a Google Docs document as a ZIP archive.
1285///
1286/// Fetches the document as HTML, extracts embedded base64 images,
1287/// converts to Markdown, and returns all components ready for archiving.
1288///
1289/// The archive contains:
1290/// - `document.md` — Markdown version
1291/// - `document.html` — HTML version with local image paths
1292/// - `images/` — extracted images
1293///
1294/// # Arguments
1295///
1296/// * `url` - Google Docs URL
1297/// * `api_token` - Optional API token for private documents
1298///
1299/// # Errors
1300///
1301/// Returns an error if the fetch or conversion fails.
1302pub async fn fetch_google_doc_as_archive(
1303    url: &str,
1304    api_token: Option<&str>,
1305) -> crate::Result<GDocsArchiveResult> {
1306    let result = fetch_google_doc(url, "html", api_token).await?;
1307
1308    let (local_html, images) = extract_base64_images(&result.content);
1309
1310    let markdown = crate::markdown::convert_html_to_markdown(&local_html, None)?;
1311
1312    debug!(
1313        "Archive prepared: {} images extracted, {} bytes HTML, {} bytes Markdown",
1314        images.len(),
1315        local_html.len(),
1316        markdown.len()
1317    );
1318
1319    Ok(GDocsArchiveResult {
1320        html: local_html,
1321        markdown,
1322        images,
1323        document_id: result.document_id,
1324        export_url: result.export_url,
1325    })
1326}
1327
1328/// Create a ZIP archive from a `GDocsArchiveResult`.
1329///
1330/// # Arguments
1331///
1332/// * `archive` - The archive result to bundle
1333/// * `pretty_html` - Whether to pretty-print the HTML output
1334///
1335/// # Errors
1336///
1337/// Returns an error if ZIP creation fails.
1338pub fn create_archive_zip(
1339    archive: &GDocsArchiveResult,
1340    pretty_html: bool,
1341) -> crate::Result<Vec<u8>> {
1342    let mut buf = std::io::Cursor::new(Vec::new());
1343
1344    {
1345        let mut zip = zip::ZipWriter::new(&mut buf);
1346        let options = zip::write::SimpleFileOptions::default()
1347            .compression_method(zip::CompressionMethod::Deflated);
1348
1349        zip.start_file("document.md", options)
1350            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
1351        zip.write_all(archive.markdown.as_bytes())?;
1352
1353        let html_output = if pretty_html {
1354            crate::html::pretty_print_html(&archive.html)
1355        } else {
1356            archive.html.clone()
1357        };
1358        zip.start_file("document.html", options)
1359            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
1360        zip.write_all(html_output.as_bytes())?;
1361
1362        for img in &archive.images {
1363            zip.start_file(format!("images/{}", img.filename), options)
1364                .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
1365            zip.write_all(&img.data)?;
1366        }
1367
1368        zip.finish()
1369            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
1370    }
1371
1372    Ok(buf.into_inner())
1373}
web_capture/gdocs.rs

web_capture/
gdocs.rs