1use base64::Engine;
32use regex::Regex;
33use std::io::Write;
34use std::sync::OnceLock;
35use tracing::debug;
36
37use crate::WebCaptureError;
38
/// Base URL under which Google Docs documents are addressed; export URLs are formed by
/// appending `/<document_id>/export?format=<format>` to it.
const GDOCS_EXPORT_BASE: &str = "https://docs.google.com/document/d";
40
41fn gdocs_url_pattern() -> &'static Regex {
42 static PATTERN: OnceLock<Regex> = OnceLock::new();
43 PATTERN.get_or_init(|| Regex::new(r"docs\.google\.com/document/d/([a-zA-Z0-9_-]+)").unwrap())
44}
45
/// Outcome of fetching a Google Docs document through its export endpoint.
#[derive(Debug, Clone)]
pub struct GDocsResult {
    /// Body of the exported document, as text.
    pub content: String,
    /// Name of the format that `content` is in (for example `"html"` or `"markdown"`).
    pub format: String,
    /// Document ID extracted from the original Google Docs URL.
    pub document_id: String,
    /// Export URL that the document was downloaded from.
    pub export_url: String,
}
58
59#[must_use]
61pub fn is_google_docs_url(url: &str) -> bool {
62 gdocs_url_pattern().is_match(url)
63}
64
65#[must_use]
69pub fn extract_document_id(url: &str) -> Option<String> {
70 gdocs_url_pattern()
71 .captures(url)
72 .and_then(|caps| caps.get(1))
73 .map(|m| m.as_str().to_string())
74}
75
76#[must_use]
83pub fn build_export_url(document_id: &str, format: &str) -> String {
84 let export_format = match format {
85 "html" | "txt" | "md" | "pdf" | "docx" | "epub" | "zip" => format,
86 _ => "html",
87 };
88 format!("{GDOCS_EXPORT_BASE}/{document_id}/export?format={export_format}")
89}
90
91pub async fn fetch_google_doc(
106 url: &str,
107 format: &str,
108 api_token: Option<&str>,
109) -> crate::Result<GDocsResult> {
110 let document_id = extract_document_id(url).ok_or_else(|| {
111 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
112 })?;
113
114 let export_url = build_export_url(&document_id, format);
115
116 let mut request = reqwest::Client::new()
117 .get(&export_url)
118 .header(
119 "User-Agent",
120 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
121 )
122 .header("Accept-Charset", "utf-8")
123 .header("Accept-Language", "en-US,en;q=0.9");
124
125 if let Some(token) = api_token {
126 request = request.header("Authorization", format!("Bearer {token}"));
127 }
128
129 let response = request
130 .send()
131 .await
132 .map_err(|e| WebCaptureError::FetchError(format!("Failed to fetch Google Doc: {e}")))?;
133
134 if !response.status().is_success() {
135 return Err(WebCaptureError::FetchError(format!(
136 "Failed to fetch Google Doc ({} {}): {}",
137 response.status().as_u16(),
138 response.status().canonical_reason().unwrap_or("Unknown"),
139 export_url
140 )));
141 }
142
143 let raw_content = response.text().await.map_err(|e| {
144 WebCaptureError::FetchError(format!("Failed to read Google Doc response: {e}"))
145 })?;
146
147 let content = match format {
149 "html" | "txt" | "md" => crate::html::decode_html_entities(&raw_content),
150 _ => raw_content,
151 };
152
153 Ok(GDocsResult {
154 content,
155 format: format.to_string(),
156 document_id,
157 export_url,
158 })
159}
160
161pub async fn fetch_google_doc_as_markdown(
175 url: &str,
176 api_token: Option<&str>,
177) -> crate::Result<GDocsResult> {
178 let result = fetch_google_doc(url, "html", api_token).await?;
179
180 let markdown =
181 crate::markdown::convert_html_to_markdown(&result.content, Some(&result.export_url))?;
182
183 Ok(GDocsResult {
184 content: markdown,
185 format: "markdown".to_string(),
186 document_id: result.document_id,
187 export_url: result.export_url,
188 })
189}
190
/// Extracts the token from an `Authorization` header value that uses the Bearer scheme.
///
/// Surrounding whitespace is ignored. The scheme name is matched case-insensitively
/// (`Bearer`, `bearer`, `BEARER`, ...), as HTTP authentication schemes are
/// case-insensitive, and must be followed by a space. Whitespace around the token is
/// trimmed.
///
/// Returns `None` when the header uses a different scheme, has no space after the
/// scheme, or carries an empty token.
#[must_use]
pub fn extract_bearer_token(auth_header: &str) -> Option<&str> {
    const SCHEME: &str = "bearer";

    let trimmed = auth_header.trim();

    // `get` yields `None` for inputs that are too short or that would be split inside a
    // multi-byte character, so non-ASCII input can never cause a slicing panic.
    let scheme = trimmed.get(..SCHEME.len())?;
    if !scheme.eq_ignore_ascii_case(SCHEME) {
        return None;
    }

    trimmed
        .get(SCHEME.len()..)?
        .strip_prefix(' ')
        .map(str::trim)
        .filter(|token| !token.is_empty())
}
203
/// An image that was embedded in a document as a base64 `data:` URI and has been decoded.
#[derive(Debug, Clone)]
pub struct ExtractedImage {
    /// File name assigned to the image (for example `image-01.png`).
    pub filename: String,
    /// Decoded image bytes.
    pub data: Vec<u8>,
    /// MIME type taken from the `data:` URI (for example `image/png`).
    pub mime_type: String,
}
214
215#[derive(Debug, Clone)]
217pub struct GDocsArchiveResult {
218 pub html: String,
220 pub markdown: String,
222 pub images: Vec<ExtractedImage>,
224 pub document_id: String,
226 pub export_url: String,
228}
229
230fn base64_image_pattern() -> &'static Regex {
231 static PATTERN: OnceLock<Regex> = OnceLock::new();
232 PATTERN.get_or_init(|| {
233 Regex::new(
234 r#"(<img\s[^>]*src=")data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([^"]+)(")"#,
235 )
236 .unwrap()
237 })
238}
239
240#[must_use]
253pub fn extract_base64_images(html: &str) -> (String, Vec<ExtractedImage>) {
254 let mut images = Vec::new();
255 let mut idx = 1u32;
256
257 let updated_html = base64_image_pattern()
258 .replace_all(html, |caps: ®ex::Captures<'_>| {
259 let prefix = &caps[1];
260 let mime_ext = &caps[2];
261 let base64_data = &caps[3];
262 let suffix = &caps[4];
263
264 let ext = match mime_ext {
265 "jpeg" => "jpg",
266 "svg+xml" => "svg",
267 other => other,
268 };
269
270 let filename = format!("image-{idx:02}.{ext}");
271 let mime_type = format!("image/{mime_ext}");
272
273 if let Ok(data) = base64::engine::general_purpose::STANDARD.decode(base64_data) {
274 debug!("Extracted image: {} ({} bytes)", filename, data.len());
275 images.push(ExtractedImage {
276 filename: filename.clone(),
277 data,
278 mime_type,
279 });
280 }
281
282 idx += 1;
283 format!("{prefix}images/{filename}{suffix}")
284 })
285 .into_owned();
286
287 (updated_html, images)
288}
289
290pub async fn fetch_google_doc_as_archive(
309 url: &str,
310 api_token: Option<&str>,
311) -> crate::Result<GDocsArchiveResult> {
312 let result = fetch_google_doc(url, "html", api_token).await?;
313
314 let (local_html, images) = extract_base64_images(&result.content);
315
316 let markdown =
317 crate::markdown::convert_html_to_markdown(&local_html, Some(&result.export_url))?;
318
319 debug!(
320 "Archive prepared: {} images extracted, {} bytes HTML, {} bytes Markdown",
321 images.len(),
322 local_html.len(),
323 markdown.len()
324 );
325
326 Ok(GDocsArchiveResult {
327 html: local_html,
328 markdown,
329 images,
330 document_id: result.document_id,
331 export_url: result.export_url,
332 })
333}
334
335pub fn create_archive_zip(archive: &GDocsArchiveResult) -> crate::Result<Vec<u8>> {
341 let mut buf = std::io::Cursor::new(Vec::new());
342
343 {
344 let mut zip = zip::ZipWriter::new(&mut buf);
345 let options = zip::write::SimpleFileOptions::default()
346 .compression_method(zip::CompressionMethod::Deflated);
347
348 zip.start_file("article.md", options)
349 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
350 zip.write_all(archive.markdown.as_bytes())?;
351
352 zip.start_file("article.html", options)
353 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
354 zip.write_all(archive.html.as_bytes())?;
355
356 for img in &archive.images {
357 zip.start_file(format!("images/{}", img.filename), options)
358 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
359 zip.write_all(&img.data)?;
360 }
361
362 zip.finish()
363 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
364 }
365
366 Ok(buf.into_inner())
367}