web_capture/
gdocs.rs

1//! Google Docs capture module.
2//!
3//! Supports API-based capture of Google Docs documents via the export URL pattern:
4//! `https://docs.google.com/document/d/{DOCUMENT_ID}/export?format={FORMAT}`
5//!
6//! # Supported Export Formats
7//!
8//! - `html` — HTML document (images as base64 data URIs)
9//! - `txt` — Plain text
10//! - `md` — Markdown (native Google Docs export)
11//! - `pdf` — PDF document
12//! - `docx` — Microsoft Word document
13//! - `epub` — EPUB ebook format
14//!
15//! # Example
16//!
17//! ```rust,no_run
18//! use web_capture::gdocs;
19//!
20//! #[tokio::main]
21//! async fn main() -> anyhow::Result<()> {
22//!     let url = "https://docs.google.com/document/d/abc123/edit";
23//!     if gdocs::is_google_docs_url(url) {
24//!         let result = gdocs::fetch_google_doc(url, "html", None).await?;
25//!         println!("Content length: {}", result.content.len());
26//!     }
27//!     Ok(())
28//! }
29//! ```
30
31use base64::Engine;
32use regex::Regex;
33use std::io::Write;
34use std::sync::OnceLock;
35use tracing::debug;
36
37use crate::WebCaptureError;
38
/// URL prefix shared by every export request; the document ID and the
/// `/export?format=…` suffix are appended to it by `build_export_url`.
const GDOCS_EXPORT_BASE: &str = "https://docs.google.com/document/d";
40
41fn gdocs_url_pattern() -> &'static Regex {
42    static PATTERN: OnceLock<Regex> = OnceLock::new();
43    PATTERN.get_or_init(|| Regex::new(r"docs\.google\.com/document/d/([a-zA-Z0-9_-]+)").unwrap())
44}
45
/// Result of fetching a Google Docs document.
///
/// All fields are plain owned strings, so the type also derives
/// `PartialEq`/`Eq` to allow direct comparison in callers and tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GDocsResult {
    /// The document content in the requested format.
    pub content: String,
    /// The export format used (e.g. `"html"`); `fetch_google_doc_as_markdown`
    /// sets this to `"markdown"` after conversion.
    pub format: String,
    /// The document ID extracted from the input URL.
    pub document_id: String,
    /// The export URL that was fetched.
    pub export_url: String,
}
58
59/// Check if a URL is a Google Docs document URL.
60#[must_use]
61pub fn is_google_docs_url(url: &str) -> bool {
62    gdocs_url_pattern().is_match(url)
63}
64
65/// Extract the document ID from a Google Docs URL.
66///
67/// Returns `None` if the URL is not a valid Google Docs URL.
68#[must_use]
69pub fn extract_document_id(url: &str) -> Option<String> {
70    gdocs_url_pattern()
71        .captures(url)
72        .and_then(|caps| caps.get(1))
73        .map(|m| m.as_str().to_string())
74}
75
76/// Build a Google Docs export URL.
77///
78/// # Arguments
79///
80/// * `document_id` - The Google Docs document ID
81/// * `format` - Export format (html, txt, md, pdf, docx, epub)
82#[must_use]
83pub fn build_export_url(document_id: &str, format: &str) -> String {
84    let export_format = match format {
85        "html" | "txt" | "md" | "pdf" | "docx" | "epub" | "zip" => format,
86        _ => "html",
87    };
88    format!("{GDOCS_EXPORT_BASE}/{document_id}/export?format={export_format}")
89}
90
91/// Fetch a Google Docs document via the export URL.
92///
93/// For public documents, pass `None` for `api_token`.
94/// For private documents, pass a Bearer token string.
95///
96/// # Arguments
97///
98/// * `url` - Google Docs URL (edit URL or any URL containing the document ID)
99/// * `format` - Export format (html, txt, md, pdf, docx, epub)
100/// * `api_token` - Optional API token for private documents
101///
102/// # Errors
103///
104/// Returns an error if the URL is not a valid Google Docs URL, or if the fetch fails.
105pub async fn fetch_google_doc(
106    url: &str,
107    format: &str,
108    api_token: Option<&str>,
109) -> crate::Result<GDocsResult> {
110    let document_id = extract_document_id(url).ok_or_else(|| {
111        WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
112    })?;
113
114    let export_url = build_export_url(&document_id, format);
115
116    let mut request = reqwest::Client::new()
117        .get(&export_url)
118        .header(
119            "User-Agent",
120            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
121        )
122        .header("Accept-Charset", "utf-8")
123        .header("Accept-Language", "en-US,en;q=0.9");
124
125    if let Some(token) = api_token {
126        request = request.header("Authorization", format!("Bearer {token}"));
127    }
128
129    let response = request
130        .send()
131        .await
132        .map_err(|e| WebCaptureError::FetchError(format!("Failed to fetch Google Doc: {e}")))?;
133
134    if !response.status().is_success() {
135        return Err(WebCaptureError::FetchError(format!(
136            "Failed to fetch Google Doc ({} {}): {}",
137            response.status().as_u16(),
138            response.status().canonical_reason().unwrap_or("Unknown"),
139            export_url
140        )));
141    }
142
143    let raw_content = response.text().await.map_err(|e| {
144        WebCaptureError::FetchError(format!("Failed to read Google Doc response: {e}"))
145    })?;
146
147    // Decode HTML entities to unicode for text-based formats
148    let content = match format {
149        "html" | "txt" | "md" => crate::html::decode_html_entities(&raw_content),
150        _ => raw_content,
151    };
152
153    Ok(GDocsResult {
154        content,
155        format: format.to_string(),
156        document_id,
157        export_url,
158    })
159}
160
161/// Fetch a Google Docs document and convert to Markdown.
162///
163/// Fetches the document as HTML, then converts to Markdown using the
164/// existing HTML-to-Markdown pipeline.
165///
166/// # Arguments
167///
168/// * `url` - Google Docs URL
169/// * `api_token` - Optional API token for private documents
170///
171/// # Errors
172///
173/// Returns an error if the fetch or conversion fails.
174pub async fn fetch_google_doc_as_markdown(
175    url: &str,
176    api_token: Option<&str>,
177) -> crate::Result<GDocsResult> {
178    let result = fetch_google_doc(url, "html", api_token).await?;
179
180    let markdown =
181        crate::markdown::convert_html_to_markdown(&result.content, Some(&result.export_url))?;
182
183    Ok(GDocsResult {
184        content: markdown,
185        format: "markdown".to_string(),
186        document_id: result.document_id,
187        export_url: result.export_url,
188    })
189}
190
/// Extract a Bearer token from an Authorization header value.
///
/// The scheme name is matched case-insensitively (`Bearer`, `bearer`,
/// `BEARER`, …), as HTTP authentication scheme names are case-insensitive.
/// Surrounding whitespace around the header value and the token is ignored.
///
/// Returns `None` if the scheme is not Bearer or the token is empty.
#[must_use]
pub fn extract_bearer_token(auth_header: &str) -> Option<&str> {
    // Split "<scheme> <credentials>" at the first space; a header without a
    // space has no credentials part at all.
    let (scheme, credentials) = auth_header.trim().split_once(' ')?;
    if !scheme.eq_ignore_ascii_case("bearer") {
        return None;
    }

    let token = credentials.trim();
    (!token.is_empty()).then_some(token)
}
203
/// An image extracted from a base64 data URI in HTML.
///
/// Derives `PartialEq`/`Eq` so extracted images can be compared directly in
/// callers and tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExtractedImage {
    /// Local filename (e.g., "image-01.png")
    pub filename: String,
    /// Raw (base64-decoded) image bytes
    pub data: Vec<u8>,
    /// MIME type (e.g., "image/png")
    pub mime_type: String,
}
214
215/// Result of fetching a Google Doc as an archive.
216#[derive(Debug, Clone)]
217pub struct GDocsArchiveResult {
218    /// HTML content with local image paths
219    pub html: String,
220    /// Markdown content with local image paths
221    pub markdown: String,
222    /// Extracted images
223    pub images: Vec<ExtractedImage>,
224    /// Document ID
225    pub document_id: String,
226    /// Export URL used
227    pub export_url: String,
228}
229
230fn base64_image_pattern() -> &'static Regex {
231    static PATTERN: OnceLock<Regex> = OnceLock::new();
232    PATTERN.get_or_init(|| {
233        Regex::new(
234            r#"(<img\s[^>]*src=")data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([^"]+)(")"#,
235        )
236        .unwrap()
237    })
238}
239
240/// Extract base64 data URI images from HTML content.
241///
242/// Google Docs HTML exports embed images as base64 data URIs.
243/// This function extracts them and replaces with local file paths.
244///
245/// # Arguments
246///
247/// * `html` - HTML content with embedded base64 images
248///
249/// # Returns
250///
251/// Tuple of (updated HTML with local paths, extracted images)
252#[must_use]
253pub fn extract_base64_images(html: &str) -> (String, Vec<ExtractedImage>) {
254    let mut images = Vec::new();
255    let mut idx = 1u32;
256
257    let updated_html = base64_image_pattern()
258        .replace_all(html, |caps: &regex::Captures<'_>| {
259            let prefix = &caps[1];
260            let mime_ext = &caps[2];
261            let base64_data = &caps[3];
262            let suffix = &caps[4];
263
264            let ext = match mime_ext {
265                "jpeg" => "jpg",
266                "svg+xml" => "svg",
267                other => other,
268            };
269
270            let filename = format!("image-{idx:02}.{ext}");
271            let mime_type = format!("image/{mime_ext}");
272
273            if let Ok(data) = base64::engine::general_purpose::STANDARD.decode(base64_data) {
274                debug!("Extracted image: {} ({} bytes)", filename, data.len());
275                images.push(ExtractedImage {
276                    filename: filename.clone(),
277                    data,
278                    mime_type,
279                });
280            }
281
282            idx += 1;
283            format!("{prefix}images/{filename}{suffix}")
284        })
285        .into_owned();
286
287    (updated_html, images)
288}
289
290/// Fetch a Google Docs document as a ZIP archive.
291///
292/// Fetches the document as HTML, extracts embedded base64 images,
293/// converts to Markdown, and returns all components ready for archiving.
294///
295/// The archive contains:
296/// - `article.md` — Markdown version
297/// - `article.html` — HTML version with local image paths
298/// - `images/` — extracted images
299///
300/// # Arguments
301///
302/// * `url` - Google Docs URL
303/// * `api_token` - Optional API token for private documents
304///
305/// # Errors
306///
307/// Returns an error if the fetch or conversion fails.
308pub async fn fetch_google_doc_as_archive(
309    url: &str,
310    api_token: Option<&str>,
311) -> crate::Result<GDocsArchiveResult> {
312    let result = fetch_google_doc(url, "html", api_token).await?;
313
314    let (local_html, images) = extract_base64_images(&result.content);
315
316    let markdown =
317        crate::markdown::convert_html_to_markdown(&local_html, Some(&result.export_url))?;
318
319    debug!(
320        "Archive prepared: {} images extracted, {} bytes HTML, {} bytes Markdown",
321        images.len(),
322        local_html.len(),
323        markdown.len()
324    );
325
326    Ok(GDocsArchiveResult {
327        html: local_html,
328        markdown,
329        images,
330        document_id: result.document_id,
331        export_url: result.export_url,
332    })
333}
334
335/// Create a ZIP archive from a `GDocsArchiveResult`.
336///
337/// # Errors
338///
339/// Returns an error if ZIP creation fails.
340pub fn create_archive_zip(archive: &GDocsArchiveResult) -> crate::Result<Vec<u8>> {
341    let mut buf = std::io::Cursor::new(Vec::new());
342
343    {
344        let mut zip = zip::ZipWriter::new(&mut buf);
345        let options = zip::write::SimpleFileOptions::default()
346            .compression_method(zip::CompressionMethod::Deflated);
347
348        zip.start_file("article.md", options)
349            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
350        zip.write_all(archive.markdown.as_bytes())?;
351
352        zip.start_file("article.html", options)
353            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
354        zip.write_all(archive.html.as_bytes())?;
355
356        for img in &archive.images {
357            zip.start_file(format!("images/{}", img.filename), options)
358                .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
359            zip.write_all(&img.data)?;
360        }
361
362        zip.finish()
363            .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
364    }
365
366    Ok(buf.into_inner())
367}
web_capture/gdocs.rs

web_capture/
gdocs.rs