1use base64::Engine;
32use regex::Regex;
33use serde_json::Value;
34use std::collections::HashMap;
35use std::hash::BuildHasher;
36use std::io::Write;
37use std::sync::OnceLock;
38use tracing::{debug, info};
39
40use crate::WebCaptureError;
41
/// URL prefix under which a document's `/export` and `/edit` endpoints are built.
const GDOCS_EXPORT_BASE: &str = "https://docs.google.com/document/d";
/// URL prefix of the Google Docs REST API `documents` resource.
const GDOCS_API_BASE: &str = "https://docs.googleapis.com/v1/documents";
44
45fn gdocs_url_pattern() -> &'static Regex {
46 static PATTERN: OnceLock<Regex> = OnceLock::new();
47 PATTERN.get_or_init(|| Regex::new(r"docs\.google\.com/document/d/([a-zA-Z0-9_-]+)").unwrap())
48}
49
/// Outcome of fetching a Google Doc in a single output format.
#[derive(Debug, Clone)]
pub struct GDocsResult {
    /// The fetched (and possibly post-processed) document body.
    pub content: String,
    /// Name of the format `content` is in, e.g. `"html"` or `"markdown"`.
    pub format: String,
    /// Document identifier extracted from the source URL.
    pub document_id: String,
    /// The export URL that was requested.
    pub export_url: String,
}
62
/// Strategy used to capture a Google Doc.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GDocsCaptureMethod {
    /// Parse the model data embedded in the editor page.
    BrowserModel,
    /// Use the public `/export` endpoint.
    PublicExport,
    /// Use the authenticated Google Docs REST API.
    DocsApi,
}
73
/// A Google Doc rendered into every supported output format at once.
#[derive(Debug, Clone)]
pub struct GDocsRenderedResult {
    /// Markdown rendering of the document.
    pub markdown: String,
    /// HTML rendering of the document.
    pub html: String,
    /// Plain-text rendering of the document.
    pub text: String,
    /// Document identifier extracted from the source URL.
    pub document_id: String,
    /// The URL the document was actually fetched from (API or editor URL).
    pub export_url: String,
}
88
89#[derive(Debug, Clone, Default)]
91pub struct CapturedDocument {
92 pub blocks: Vec<CapturedBlock>,
94 pub tables: Vec<TableBlock>,
96 pub images: Vec<ContentNode>,
98 pub text: String,
100}
101
102#[derive(Debug, Clone)]
104pub enum CapturedBlock {
105 Paragraph {
107 content: Vec<ContentNode>,
109 style: Option<String>,
111 },
112 Table(TableBlock),
114}
115
116#[derive(Debug, Clone, Default)]
118pub struct TableBlock {
119 pub rows: Vec<TableRow>,
121}
122
123#[derive(Debug, Clone, Default)]
125pub struct TableRow {
126 pub cells: Vec<TableCell>,
128}
129
130#[derive(Debug, Clone, Default)]
132pub struct TableCell {
133 pub content: Vec<ContentNode>,
135}
136
/// An inline piece of content inside a paragraph or table cell.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentNode {
    /// A run of text.
    Text(String),
    /// An embedded image.
    Image {
        /// Content id of the image inside the editor model, when present.
        cid: Option<String>,
        /// Resolved image URL; renderers skip images without one.
        url: Option<String>,
        /// Alternative text for the image.
        alt: String,
        /// Whether the image comes from a suggestion rather than accepted content.
        is_suggestion: bool,
    },
}
154
155#[must_use]
157pub fn is_google_docs_url(url: &str) -> bool {
158 gdocs_url_pattern().is_match(url)
159}
160
161#[must_use]
165pub fn extract_document_id(url: &str) -> Option<String> {
166 gdocs_url_pattern()
167 .captures(url)
168 .and_then(|caps| caps.get(1))
169 .map(|m| m.as_str().to_string())
170}
171
172#[must_use]
179pub fn build_export_url(document_id: &str, format: &str) -> String {
180 let export_format = match format {
181 "html" | "txt" | "md" | "pdf" | "docx" | "epub" | "zip" => format,
182 _ => "html",
183 };
184 format!("{GDOCS_EXPORT_BASE}/{document_id}/export?format={export_format}")
185}
186
187#[must_use]
189pub fn build_edit_url(document_id: &str) -> String {
190 format!("{GDOCS_EXPORT_BASE}/{document_id}/edit")
191}
192
193#[must_use]
195pub fn build_docs_api_url(document_id: &str) -> String {
196 format!("{GDOCS_API_BASE}/{document_id}")
197}
198
199pub fn select_capture_method(
205 capture: &str,
206 api_token: Option<&str>,
207) -> crate::Result<GDocsCaptureMethod> {
208 match capture.to_lowercase().as_str() {
209 "browser" => Ok(GDocsCaptureMethod::BrowserModel),
210 "api" if api_token.is_some() => Ok(GDocsCaptureMethod::DocsApi),
211 "api" => Ok(GDocsCaptureMethod::PublicExport),
212 other => Err(WebCaptureError::InvalidUrl(format!(
213 "Unsupported Google Docs capture method \"{other}\". Use \"browser\" or \"api\"."
214 ))),
215 }
216}
217
218pub async fn fetch_google_doc(
233 url: &str,
234 format: &str,
235 api_token: Option<&str>,
236) -> crate::Result<GDocsResult> {
237 let document_id = extract_document_id(url).ok_or_else(|| {
238 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
239 })?;
240
241 let export_url = build_export_url(&document_id, format);
242 debug!(
243 document_id = %document_id,
244 format = %format,
245 export_url = %export_url,
246 has_api_token = api_token.is_some(),
247 "fetching Google Doc via public export"
248 );
249
250 let mut request = reqwest::Client::new()
251 .get(&export_url)
252 .header(
253 "User-Agent",
254 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
255 )
256 .header("Accept-Charset", "utf-8")
257 .header("Accept-Language", "en-US,en;q=0.9");
258
259 if let Some(token) = api_token {
260 request = request.header("Authorization", format!("Bearer {token}"));
261 }
262
263 let response = request
264 .send()
265 .await
266 .map_err(|e| WebCaptureError::FetchError(format!("Failed to fetch Google Doc: {e}")))?;
267 debug!(
268 document_id = %document_id,
269 status = response.status().as_u16(),
270 success = response.status().is_success(),
271 content_type = response
272 .headers()
273 .get(reqwest::header::CONTENT_TYPE)
274 .and_then(|value| value.to_str().ok())
275 .unwrap_or(""),
276 "received Google Docs public export response"
277 );
278
279 if !response.status().is_success() {
280 return Err(WebCaptureError::FetchError(format!(
281 "Failed to fetch Google Doc ({} {}): {}",
282 response.status().as_u16(),
283 response.status().canonical_reason().unwrap_or("Unknown"),
284 export_url
285 )));
286 }
287
288 let raw_content = response.text().await.map_err(|e| {
289 WebCaptureError::FetchError(format!("Failed to read Google Doc response: {e}"))
290 })?;
291 debug!(
292 document_id = %document_id,
293 bytes = raw_content.len(),
294 "read Google Docs public export body"
295 );
296
297 let content = match format {
299 "html" | "txt" | "md" => crate::html::decode_html_entities(&raw_content),
300 _ => raw_content,
301 };
302
303 Ok(GDocsResult {
304 content,
305 format: format.to_string(),
306 document_id,
307 export_url,
308 })
309}
310
311pub async fn fetch_google_doc_as_markdown(
325 url: &str,
326 api_token: Option<&str>,
327) -> crate::Result<GDocsResult> {
328 let result = fetch_google_doc(url, "html", api_token).await?;
329
330 let markdown =
331 crate::markdown::convert_html_to_markdown(&result.content, Some(&result.export_url))?;
332 debug!(
333 document_id = %result.document_id,
334 bytes = markdown.len(),
335 "rendered Google Docs public export markdown"
336 );
337
338 Ok(GDocsResult {
339 content: markdown,
340 format: "markdown".to_string(),
341 document_id: result.document_id,
342 export_url: result.export_url,
343 })
344}
345
346pub async fn fetch_google_doc_from_docs_api(
352 url: &str,
353 api_token: &str,
354) -> crate::Result<GDocsRenderedResult> {
355 let document_id = extract_document_id(url).ok_or_else(|| {
356 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
357 })?;
358 let api_url = build_docs_api_url(&document_id);
359 debug!(
360 document_id = %document_id,
361 api_url = %api_url,
362 "fetching Google Doc via Docs API"
363 );
364
365 let response = reqwest::Client::new()
366 .get(&api_url)
367 .header("Authorization", format!("Bearer {api_token}"))
368 .header("Accept", "application/json")
369 .send()
370 .await
371 .map_err(|e| {
372 WebCaptureError::FetchError(format!("Failed to fetch Google Doc via Docs API: {e}"))
373 })?;
374 debug!(
375 document_id = %document_id,
376 status = response.status().as_u16(),
377 success = response.status().is_success(),
378 content_type = response
379 .headers()
380 .get(reqwest::header::CONTENT_TYPE)
381 .and_then(|value| value.to_str().ok())
382 .unwrap_or(""),
383 "received Google Docs API response"
384 );
385
386 if !response.status().is_success() {
387 return Err(WebCaptureError::FetchError(format!(
388 "Failed to fetch Google Doc via Docs API ({} {}): {}",
389 response.status().as_u16(),
390 response.status().canonical_reason().unwrap_or("Unknown"),
391 api_url
392 )));
393 }
394
395 let body = response.text().await.map_err(|e| {
396 WebCaptureError::FetchError(format!("Failed to read Google Docs API response: {e}"))
397 })?;
398 let document = serde_json::from_str::<Value>(&body).map_err(|e| {
399 WebCaptureError::ParseError(format!("Failed to parse Google Docs API response: {e}"))
400 })?;
401 let rendered = render_docs_api_document(&document);
402 debug!(
403 document_id = %document_id,
404 title = document.get("title").and_then(|value| value.as_str()).unwrap_or(""),
405 markdown_bytes = rendered.markdown.len(),
406 html_bytes = rendered.html.len(),
407 text_bytes = rendered.text.len(),
408 "rendered Google Docs API document"
409 );
410
411 Ok(GDocsRenderedResult {
412 markdown: rendered.markdown,
413 html: rendered.html,
414 text: rendered.text,
415 document_id,
416 export_url: api_url,
417 })
418}
419
420pub async fn fetch_google_doc_from_model(
430 url: &str,
431 api_token: Option<&str>,
432) -> crate::Result<GDocsRenderedResult> {
433 let document_id = extract_document_id(url).ok_or_else(|| {
434 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
435 })?;
436 let edit_url = build_edit_url(&document_id);
437 debug!(
438 document_id = %document_id,
439 edit_url = %edit_url,
440 has_api_token = api_token.is_some(),
441 "fetching Google Doc editor model"
442 );
443 let mut request = reqwest::Client::new()
444 .get(&edit_url)
445 .header(
446 "User-Agent",
447 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
448 )
449 .header("Accept-Language", "en-US,en;q=0.9");
450
451 if let Some(token) = api_token {
452 request = request.header("Authorization", format!("Bearer {token}"));
453 }
454
455 let response = request.send().await.map_err(|e| {
456 WebCaptureError::FetchError(format!("Failed to fetch Google Doc editor: {e}"))
457 })?;
458 debug!(
459 document_id = %document_id,
460 status = response.status().as_u16(),
461 success = response.status().is_success(),
462 content_type = response
463 .headers()
464 .get(reqwest::header::CONTENT_TYPE)
465 .and_then(|value| value.to_str().ok())
466 .unwrap_or(""),
467 "received Google Docs editor response"
468 );
469
470 if !response.status().is_success() {
471 return Err(WebCaptureError::FetchError(format!(
472 "Failed to fetch Google Doc editor ({} {}): {}",
473 response.status().as_u16(),
474 response.status().canonical_reason().unwrap_or("Unknown"),
475 edit_url
476 )));
477 }
478
479 let html = response.text().await.map_err(|e| {
480 WebCaptureError::FetchError(format!("Failed to read Google Doc editor response: {e}"))
481 })?;
482 let chunks = extract_model_chunks_from_html(&html);
483 debug!(
484 document_id = %document_id,
485 html_bytes = html.len(),
486 chunks = chunks.len(),
487 "extracted Google Docs editor model chunks"
488 );
489 if chunks.is_empty() {
490 return Err(WebCaptureError::ParseError(
491 "Google Docs editor HTML did not contain DOCS_modelChunk data".to_string(),
492 ));
493 }
494
495 let cid_urls = extract_cid_urls_from_html(&html);
496 let capture = parse_model_chunks(&chunks, &cid_urls);
497 info!(
498 document_id = %document_id,
499 chunks = chunks.len(),
500 cid_urls = cid_urls.len(),
501 blocks = capture.blocks.len(),
502 tables = capture.tables.len(),
503 images = capture.images.len(),
504 text_bytes = capture.text.len(),
505 "parsed Google Docs editor model"
506 );
507
508 Ok(GDocsRenderedResult {
509 markdown: render_captured_document(&capture, "markdown"),
510 html: render_captured_document(&capture, "html"),
511 text: render_captured_document(&capture, "txt"),
512 document_id,
513 export_url: edit_url,
514 })
515}
516
517#[must_use]
519pub fn render_docs_api_document(document: &Value) -> GDocsRenderedOutput {
520 let blocks = structural_elements_to_blocks(
521 document
522 .pointer("/body/content")
523 .and_then(Value::as_array)
524 .map_or(&[] as &[Value], Vec::as_slice),
525 document.pointer("/inlineObjects").unwrap_or(&Value::Null),
526 );
527 GDocsRenderedOutput {
528 markdown: render_blocks_markdown(&blocks),
529 html: render_blocks_html(&blocks),
530 text: blocks_to_text(&blocks),
531 }
532}
533
/// The three renderings produced from one document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GDocsRenderedOutput {
    /// Markdown rendering.
    pub markdown: String,
    /// HTML rendering.
    pub html: String,
    /// Plain-text rendering.
    pub text: String,
}
544
545fn structural_elements_to_blocks(elements: &[Value], inline_objects: &Value) -> Vec<CapturedBlock> {
546 let mut blocks = Vec::new();
547 for element in elements {
548 if let Some(paragraph) = element.get("paragraph") {
549 let content = paragraph_to_content(paragraph, inline_objects);
550 if !content_to_text(&content).trim().is_empty()
551 || content
552 .iter()
553 .any(|node| matches!(node, ContentNode::Image { .. }))
554 {
555 blocks.push(CapturedBlock::Paragraph {
556 style: paragraph
557 .pointer("/paragraphStyle/namedStyleType")
558 .and_then(Value::as_str)
559 .map(ToString::to_string),
560 content,
561 });
562 }
563 } else if let Some(table) = element.get("table") {
564 blocks.push(CapturedBlock::Table(table_to_block(table, inline_objects)));
565 }
566 }
567 blocks
568}
569
570fn table_to_block(table: &Value, inline_objects: &Value) -> TableBlock {
571 let rows = table
572 .get("tableRows")
573 .and_then(Value::as_array)
574 .map_or(&[] as &[Value], Vec::as_slice)
575 .iter()
576 .map(|row| TableRow {
577 cells: row
578 .get("tableCells")
579 .and_then(Value::as_array)
580 .map_or(&[] as &[Value], Vec::as_slice)
581 .iter()
582 .map(|cell| TableCell {
583 content: structural_elements_to_inline_content(
584 cell.get("content")
585 .and_then(Value::as_array)
586 .map_or(&[] as &[Value], Vec::as_slice),
587 inline_objects,
588 ),
589 })
590 .collect(),
591 })
592 .collect();
593 TableBlock { rows }
594}
595
596fn structural_elements_to_inline_content(
597 elements: &[Value],
598 inline_objects: &Value,
599) -> Vec<ContentNode> {
600 let mut content = Vec::new();
601 for element in elements {
602 if let Some(paragraph) = element.get("paragraph") {
603 let paragraph_content = paragraph_to_content(paragraph, inline_objects);
604 if !content.is_empty() && !paragraph_content.is_empty() {
605 append_text(&mut content, "\n");
606 }
607 content.extend(paragraph_content);
608 } else if let Some(table) = element.get("table") {
609 append_text(
610 &mut content,
611 &render_blocks_markdown(&[CapturedBlock::Table(table_to_block(
612 table,
613 inline_objects,
614 ))]),
615 );
616 }
617 }
618 content
619}
620
621fn paragraph_to_content(paragraph: &Value, inline_objects: &Value) -> Vec<ContentNode> {
622 let mut content = Vec::new();
623 for element in paragraph
624 .get("elements")
625 .and_then(Value::as_array)
626 .map_or(&[] as &[Value], Vec::as_slice)
627 {
628 if let Some(text) = element
629 .pointer("/textRun/content")
630 .and_then(Value::as_str)
631 .map(|text| text.strip_suffix('\n').unwrap_or(text))
632 {
633 append_text(&mut content, text);
634 } else if let Some(inline_id) = element
635 .pointer("/inlineObjectElement/inlineObjectId")
636 .and_then(Value::as_str)
637 {
638 if let Some(image) = inline_object_to_image(inline_id, inline_objects) {
639 content.push(image);
640 }
641 }
642 }
643 content
644}
645
646fn inline_object_to_image(inline_id: &str, inline_objects: &Value) -> Option<ContentNode> {
647 let embedded = inline_objects
648 .get(inline_id)?
649 .pointer("/inlineObjectProperties/embeddedObject")?;
650 let url = embedded
651 .pointer("/imageProperties/contentUri")
652 .or_else(|| embedded.pointer("/imageProperties/sourceUri"))
653 .and_then(Value::as_str)?;
654 let alt = embedded
655 .get("title")
656 .or_else(|| embedded.get("description"))
657 .and_then(Value::as_str)
658 .unwrap_or("image");
659 Some(ContentNode::Image {
660 cid: None,
661 url: Some(url.to_string()),
662 alt: alt.to_string(),
663 is_suggestion: false,
664 })
665}
666
667#[must_use]
669#[allow(clippy::too_many_lines)]
670pub fn parse_model_chunks<S: BuildHasher>(
671 chunks: &[Value],
672 cid_urls: &HashMap<String, String, S>,
673) -> CapturedDocument {
674 let items = collect_model_items(chunks);
675 let full_text = items
676 .iter()
677 .filter(|item| matches!(item.get("ty").and_then(Value::as_str), Some("is" | "iss")))
678 .filter_map(|item| item.get("s").and_then(Value::as_str))
679 .collect::<String>();
680
681 let mut positions = HashMap::new();
682 for item in &items {
683 if matches!(item.get("ty").and_then(Value::as_str), Some("te" | "ste")) {
684 if let (Some(id), Some(pos)) = (
685 item.get("id").and_then(Value::as_str),
686 item.get("spi").and_then(Value::as_u64),
687 ) {
688 if let Ok(pos) = usize::try_from(pos) {
689 positions.insert(id.to_string(), pos);
690 }
691 }
692 }
693 }
694
695 let mut images_by_pos: HashMap<usize, ContentNode> = HashMap::new();
696 let mut images = Vec::new();
697 for item in &items {
698 let ty = item.get("ty").and_then(Value::as_str);
699 if !matches!(ty, Some("ae" | "ase")) {
700 continue;
701 }
702 let Some(id) = item.get("id").and_then(Value::as_str) else {
703 continue;
704 };
705 let Some(pos) = positions.get(id).copied() else {
706 continue;
707 };
708 let cid = item
709 .pointer("/epm/ee_eo/i_cid")
710 .and_then(Value::as_str)
711 .map(ToString::to_string);
712 let node = ContentNode::Image {
713 url: cid.as_ref().and_then(|cid| cid_urls.get(cid).cloned()),
714 cid,
715 alt: if ty == Some("ase") {
716 "suggested image".to_string()
717 } else {
718 "image".to_string()
719 },
720 is_suggestion: ty == Some("ase"),
721 };
722 images_by_pos.insert(pos, node.clone());
723 images.push(node);
724 }
725
726 let chars: Vec<char> = full_text.chars().collect();
727 let mut blocks = Vec::new();
728 let mut tables = Vec::new();
729 let mut paragraph = Vec::new();
730 let mut table: Option<TableBlock> = None;
731 let mut row: Option<TableRow> = None;
732 let mut cell: Option<TableCell> = None;
733
734 for (idx, ch) in chars.iter().copied().enumerate() {
735 match ch as u32 {
736 0x10 => {
737 flush_paragraph(&mut paragraph, &mut blocks);
738 table = Some(TableBlock::default());
739 }
740 0x11 => flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks),
741 0x12 => {
742 flush_row(&mut row, &mut cell, table.as_mut());
743 row = Some(TableRow::default());
744 }
745 0x1c => {
746 flush_cell(&mut row, &mut cell);
747 if row.is_none() {
748 row = Some(TableRow::default());
749 }
750 cell = Some(TableCell::default());
751 }
752 0x0a => {
753 if table.is_some() {
754 flush_row(&mut row, &mut cell, table.as_mut());
755 } else {
756 flush_paragraph(&mut paragraph, &mut blocks);
757 }
758 }
759 0x0b => append_to_current(&mut paragraph, &mut row, &mut cell, table.is_some(), "\n"),
760 _ => {
761 if let Some(image) = images_by_pos.get(&idx).cloned() {
762 push_to_current(&mut paragraph, &mut row, &mut cell, table.is_some(), image);
763 if ch == '*' {
764 continue;
765 }
766 }
767 append_to_current(
768 &mut paragraph,
769 &mut row,
770 &mut cell,
771 table.is_some(),
772 &ch.to_string(),
773 );
774 }
775 }
776 }
777
778 if table.is_some() {
779 flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
780 }
781 flush_paragraph(&mut paragraph, &mut blocks);
782
783 CapturedDocument {
784 text: blocks_to_text(&blocks),
785 blocks,
786 tables,
787 images,
788 }
789}
790
791fn collect_model_items(chunks: &[Value]) -> Vec<Value> {
792 let mut items = Vec::new();
793 for chunk in chunks {
794 if let Some(array) = chunk.as_array() {
795 items.extend(array.iter().cloned());
796 } else if let Some(array) = chunk.get("chunk").and_then(Value::as_array) {
797 items.extend(array.iter().cloned());
798 }
799 }
800 items
801}
802
803fn flush_paragraph(paragraph: &mut Vec<ContentNode>, blocks: &mut Vec<CapturedBlock>) {
804 if !content_to_text(paragraph).trim().is_empty()
805 || paragraph
806 .iter()
807 .any(|node| matches!(node, ContentNode::Image { .. }))
808 {
809 blocks.push(CapturedBlock::Paragraph {
810 content: std::mem::take(paragraph),
811 style: None,
812 });
813 } else {
814 paragraph.clear();
815 }
816}
817
818fn flush_cell(row: &mut Option<TableRow>, cell: &mut Option<TableCell>) {
819 if let (Some(row), Some(cell)) = (row.as_mut(), cell.take()) {
820 row.cells.push(cell);
821 }
822}
823
824fn flush_row(
825 row: &mut Option<TableRow>,
826 cell: &mut Option<TableCell>,
827 table: Option<&mut TableBlock>,
828) {
829 flush_cell(row, cell);
830 if let (Some(table), Some(row)) = (table, row.take()) {
831 table.rows.push(row);
832 }
833}
834
835fn flush_table(
836 table: &mut Option<TableBlock>,
837 row: &mut Option<TableRow>,
838 cell: &mut Option<TableCell>,
839 tables: &mut Vec<TableBlock>,
840 blocks: &mut Vec<CapturedBlock>,
841) {
842 flush_row(row, cell, table.as_mut());
843 if let Some(table) = table.take() {
844 tables.push(table.clone());
845 blocks.push(CapturedBlock::Table(table));
846 }
847}
848
849fn push_to_current(
850 paragraph: &mut Vec<ContentNode>,
851 row: &mut Option<TableRow>,
852 cell: &mut Option<TableCell>,
853 in_table: bool,
854 node: ContentNode,
855) {
856 if in_table {
857 if row.is_none() {
858 *row = Some(TableRow::default());
859 }
860 if cell.is_none() {
861 *cell = Some(TableCell::default());
862 }
863 if let Some(cell) = cell.as_mut() {
864 cell.content.push(node);
865 }
866 } else {
867 paragraph.push(node);
868 }
869}
870
871fn append_to_current(
872 paragraph: &mut Vec<ContentNode>,
873 row: &mut Option<TableRow>,
874 cell: &mut Option<TableCell>,
875 in_table: bool,
876 text: &str,
877) {
878 if in_table {
879 if row.is_none() {
880 *row = Some(TableRow::default());
881 }
882 if cell.is_none() {
883 *cell = Some(TableCell::default());
884 }
885 if let Some(cell) = cell.as_mut() {
886 append_text(&mut cell.content, text);
887 }
888 } else {
889 append_text(paragraph, text);
890 }
891}
892
893fn append_text(content: &mut Vec<ContentNode>, text: &str) {
894 if text.is_empty() {
895 return;
896 }
897 if let Some(ContentNode::Text(last)) = content.last_mut() {
898 last.push_str(text);
899 } else {
900 content.push(ContentNode::Text(text.to_string()));
901 }
902}
903
904#[must_use]
906pub fn render_captured_document(capture: &CapturedDocument, format: &str) -> String {
907 match format.to_lowercase().as_str() {
908 "html" => render_blocks_html(&capture.blocks),
909 "txt" | "text" => blocks_to_text(&capture.blocks),
910 _ => render_blocks_markdown(&capture.blocks),
911 }
912}
913
914fn render_blocks_markdown(blocks: &[CapturedBlock]) -> String {
915 blocks
916 .iter()
917 .filter_map(|block| match block {
918 CapturedBlock::Paragraph { content, style } => {
919 let text = render_content_markdown(content).trim().to_string();
920 if text.is_empty() {
921 None
922 } else {
923 Some(render_paragraph_markdown(&text, style.as_deref()))
924 }
925 }
926 CapturedBlock::Table(table) => Some(render_table_markdown(table)),
927 })
928 .collect::<Vec<_>>()
929 .join("\n\n")
930}
931
/// Prefixes `text` with the Markdown heading marker implied by `style`.
///
/// `TITLE` maps to level 1, `SUBTITLE` to level 2 and `HEADING_<n>` to level
/// `n` clamped to `1..=6` (level 1 when `<n>` is not a number). Any other
/// style, or none, leaves the text unchanged.
fn render_paragraph_markdown(text: &str, style: Option<&str>) -> String {
    let level = match style {
        Some("TITLE") => 1,
        Some("SUBTITLE") => 2,
        Some(name) if name.starts_with("HEADING_") => name
            .trim_start_matches("HEADING_")
            .parse::<usize>()
            .unwrap_or(1)
            .clamp(1, 6),
        _ => return text.to_string(),
    };
    format!("{} {text}", "#".repeat(level))
}
946
947fn render_table_markdown(table: &TableBlock) -> String {
948 if table.rows.is_empty() {
949 return String::new();
950 }
951 let width = table
952 .rows
953 .iter()
954 .map(|row| row.cells.len())
955 .max()
956 .unwrap_or(1);
957 let rows = table
958 .rows
959 .iter()
960 .map(|row| {
961 (0..width)
962 .map(|idx| {
963 row.cells.get(idx).map_or_else(String::new, |cell| {
964 escape_markdown_table_cell(&render_content_markdown(&cell.content))
965 })
966 })
967 .collect::<Vec<_>>()
968 })
969 .collect::<Vec<_>>();
970 let separator = vec!["---".to_string(); width];
971 std::iter::once(&rows[0])
972 .chain(std::iter::once(&separator))
973 .chain(rows.iter().skip(1))
974 .map(|row| format!("| {} |", row.join(" | ")))
975 .collect::<Vec<_>>()
976 .join("\n")
977}
978
979fn render_content_markdown(content: &[ContentNode]) -> String {
980 content
981 .iter()
982 .map(|node| match node {
983 ContentNode::Text(text) => text.clone(),
984 ContentNode::Image {
985 url: Some(url),
986 alt,
987 ..
988 } => format!(""),
989 ContentNode::Image { .. } => String::new(),
990 })
991 .collect()
992}
993
994fn render_blocks_html(blocks: &[CapturedBlock]) -> String {
995 format!(
996 "<!doctype html><html><body>{}</body></html>",
997 blocks
998 .iter()
999 .map(|block| match block {
1000 CapturedBlock::Paragraph { content, style } => {
1001 let tag = paragraph_tag(style.as_deref());
1002 format!("<{tag}>{}</{tag}>", render_content_html(content))
1003 }
1004 CapturedBlock::Table(table) => render_table_html(table),
1005 })
1006 .collect::<String>()
1007 )
1008}
1009
1010fn render_table_html(table: &TableBlock) -> String {
1011 let mut html = String::from("<table>");
1012 for row in &table.rows {
1013 html.push_str("<tr>");
1014 for cell in &row.cells {
1015 html.push_str("<td>");
1016 html.push_str(&render_content_html(&cell.content));
1017 html.push_str("</td>");
1018 }
1019 html.push_str("</tr>");
1020 }
1021 html.push_str("</table>");
1022 html
1023}
1024
1025fn render_content_html(content: &[ContentNode]) -> String {
1026 content
1027 .iter()
1028 .map(|node| match node {
1029 ContentNode::Text(text) => escape_html(text).replace('\n', "<br>"),
1030 ContentNode::Image {
1031 url: Some(url),
1032 alt,
1033 ..
1034 } => {
1035 format!(
1036 "<img src=\"{}\" alt=\"{}\">",
1037 escape_html(url),
1038 escape_html(alt)
1039 )
1040 }
1041 ContentNode::Image { .. } => String::new(),
1042 })
1043 .collect()
1044}
1045
/// Maps a named paragraph style to the HTML tag used to render it.
///
/// Titles and headings map to `h1`..`h6`; every other style, or none,
/// maps to `p`.
fn paragraph_tag(style: Option<&str>) -> &'static str {
    static TAGS: [(&str, &str); 8] = [
        ("TITLE", "h1"),
        ("HEADING_1", "h1"),
        ("SUBTITLE", "h2"),
        ("HEADING_2", "h2"),
        ("HEADING_3", "h3"),
        ("HEADING_4", "h4"),
        ("HEADING_5", "h5"),
        ("HEADING_6", "h6"),
    ];
    style
        .and_then(|name| TAGS.iter().find(|(key, _)| *key == name))
        .map_or("p", |&(_, tag)| tag)
}
1057
1058fn blocks_to_text(blocks: &[CapturedBlock]) -> String {
1059 blocks
1060 .iter()
1061 .map(|block| match block {
1062 CapturedBlock::Paragraph { content, .. } => content_to_text(content),
1063 CapturedBlock::Table(table) => table
1064 .rows
1065 .iter()
1066 .map(|row| {
1067 row.cells
1068 .iter()
1069 .map(|cell| content_to_text(&cell.content))
1070 .collect::<Vec<_>>()
1071 .join("\t")
1072 })
1073 .collect::<Vec<_>>()
1074 .join("\n"),
1075 })
1076 .filter(|text| !text.is_empty())
1077 .collect::<Vec<_>>()
1078 .join("\n")
1079}
1080
1081fn content_to_text(content: &[ContentNode]) -> String {
1082 content
1083 .iter()
1084 .map(|node| match node {
1085 ContentNode::Text(text) => text.clone(),
1086 ContentNode::Image {
1087 url: Some(_), alt, ..
1088 } => format!("[{alt}]"),
1089 ContentNode::Image { .. } => String::new(),
1090 })
1091 .collect()
1092}
1093
/// Escapes the characters that are special in HTML text and attribute values.
///
/// `&`, `<`, `>`, `"` and `'` are replaced by `&amp;`, `&lt;`, `&gt;`,
/// `&quot;` and `&#39;` respectively; every other character is copied
/// unchanged. The input is escaped in a single pass, so an `&` that is
/// already part of an entity is escaped again.
fn escape_html(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
1102
/// Makes text safe to place inside a Markdown table cell: `|` is
/// backslash-escaped and newlines become `<br>`.
fn escape_markdown_table_cell(value: &str) -> String {
    let mut cell = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '|' => cell.push_str("\\|"),
            '\n' => cell.push_str("<br>"),
            other => cell.push(other),
        }
    }
    cell
}
1106
1107fn extract_cid_urls_from_html(html: &str) -> HashMap<String, String> {
1108 let pattern = Regex::new(
1109 r#""([A-Za-z0-9_-]{20,})"\s*:\s*"(https://docs\.google\.com/docs-images-rt/[^"]+)""#,
1110 )
1111 .unwrap();
1112 pattern
1113 .captures_iter(html)
1114 .filter_map(|caps| {
1115 Some((
1116 caps.get(1)?.as_str().to_string(),
1117 caps.get(2)?
1118 .as_str()
1119 .replace(r"\u003d", "=")
1120 .replace(r"\u0026", "&")
1121 .replace(r"\/", "/"),
1122 ))
1123 })
1124 .collect()
1125}
1126
1127fn extract_model_chunks_from_html(html: &str) -> Vec<Value> {
1128 let mut chunks = Vec::new();
1129 let mut offset = 0;
1130 while let Some(relative) = html[offset..].find("DOCS_modelChunk") {
1131 let marker = offset + relative;
1132 let Some(start) = html[marker..].find(['{', '[']).map(|idx| marker + idx) else {
1133 break;
1134 };
1135 let Some(end) = find_json_end(html, start) else {
1136 offset = start + 1;
1137 continue;
1138 };
1139 if let Ok(value) = serde_json::from_str::<Value>(&html[start..end]) {
1140 chunks.push(value);
1141 }
1142 offset = end;
1143 }
1144 chunks
1145}
1146
/// Finds the end of the bracketed JSON value that starts at byte `start`.
///
/// `input[start]` must be `{` or `[`; otherwise `None` is returned. Only
/// brackets of the opening kind are counted, and brackets inside string
/// literals (honouring backslash escapes) are ignored. Returns the byte
/// index just past the matching closing bracket, or `None` when the value
/// is unterminated.
fn find_json_end(input: &str, start: usize) -> Option<usize> {
    let tail = &input[start..];
    let (opening, closing) = match tail.chars().next()? {
        '{' => ('{', '}'),
        '[' => ('[', ']'),
        _ => return None,
    };

    let mut depth = 0usize;
    let mut in_string = false;
    let mut escaped = false;
    for (offset, ch) in tail.char_indices() {
        if in_string {
            match ch {
                _ if escaped => escaped = false,
                '\\' => escaped = true,
                '"' => in_string = false,
                _ => {}
            }
            continue;
        }

        match ch {
            '"' => in_string = true,
            c if c == opening => depth += 1,
            c if c == closing => {
                depth = depth.saturating_sub(1);
                if depth == 0 {
                    return Some(start + offset + c.len_utf8());
                }
            }
            _ => {}
        }
    }
    None
}
1184
/// Extracts the token from a `Bearer` authorization header value.
///
/// Surrounding whitespace is ignored and the scheme name is matched
/// case-insensitively (`Bearer`, `bearer`, `BEARER`, ...), since
/// authentication scheme names are case-insensitive. The scheme must be
/// followed by a space. Returns `None` when the scheme is not `Bearer` or
/// the token is empty.
#[must_use]
pub fn extract_bearer_token(auth_header: &str) -> Option<&str> {
    const SCHEME: &str = "bearer ";

    let trimmed = auth_header.trim();
    // `get` yields `None` for short input or a non-boundary split, so the
    // slice below can never panic.
    let scheme = trimmed.get(..SCHEME.len())?;
    if !scheme.eq_ignore_ascii_case(SCHEME) {
        return None;
    }

    let token = trimmed[SCHEME.len()..].trim();
    (!token.is_empty()).then_some(token)
}
1197
/// An image extracted from a base64 `data:` URI in exported HTML.
#[derive(Debug, Clone)]
pub struct ExtractedImage {
    /// File name of the image, e.g. `image-01.png`.
    pub filename: String,
    /// Decoded image bytes.
    pub data: Vec<u8>,
    /// MIME type taken from the data URI, e.g. `image/png`.
    pub mime_type: String,
}
1208
1209#[derive(Debug, Clone)]
1211pub struct GDocsArchiveResult {
1212 pub html: String,
1214 pub markdown: String,
1216 pub images: Vec<ExtractedImage>,
1218 pub document_id: String,
1220 pub export_url: String,
1222}
1223
1224fn base64_image_pattern() -> &'static Regex {
1225 static PATTERN: OnceLock<Regex> = OnceLock::new();
1226 PATTERN.get_or_init(|| {
1227 Regex::new(
1228 r#"(<img\s[^>]*src=")data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([^"]+)(")"#,
1229 )
1230 .unwrap()
1231 })
1232}
1233
1234#[must_use]
1247pub fn extract_base64_images(html: &str) -> (String, Vec<ExtractedImage>) {
1248 let mut images = Vec::new();
1249 let mut idx = 1u32;
1250
1251 let updated_html = base64_image_pattern()
1252 .replace_all(html, |caps: ®ex::Captures<'_>| {
1253 let prefix = &caps[1];
1254 let mime_ext = &caps[2];
1255 let base64_data = &caps[3];
1256 let suffix = &caps[4];
1257
1258 let ext = match mime_ext {
1259 "jpeg" => "jpg",
1260 "svg+xml" => "svg",
1261 other => other,
1262 };
1263
1264 let filename = format!("image-{idx:02}.{ext}");
1265 let mime_type = format!("image/{mime_ext}");
1266
1267 if let Ok(data) = base64::engine::general_purpose::STANDARD.decode(base64_data) {
1268 debug!("Extracted image: {} ({} bytes)", filename, data.len());
1269 images.push(ExtractedImage {
1270 filename: filename.clone(),
1271 data,
1272 mime_type,
1273 });
1274 }
1275
1276 idx += 1;
1277 format!("{prefix}images/{filename}{suffix}")
1278 })
1279 .into_owned();
1280
1281 (updated_html, images)
1282}
1283
1284pub async fn fetch_google_doc_as_archive(
1303 url: &str,
1304 api_token: Option<&str>,
1305) -> crate::Result<GDocsArchiveResult> {
1306 let result = fetch_google_doc(url, "html", api_token).await?;
1307
1308 let (local_html, images) = extract_base64_images(&result.content);
1309
1310 let markdown = crate::markdown::convert_html_to_markdown(&local_html, None)?;
1311
1312 debug!(
1313 "Archive prepared: {} images extracted, {} bytes HTML, {} bytes Markdown",
1314 images.len(),
1315 local_html.len(),
1316 markdown.len()
1317 );
1318
1319 Ok(GDocsArchiveResult {
1320 html: local_html,
1321 markdown,
1322 images,
1323 document_id: result.document_id,
1324 export_url: result.export_url,
1325 })
1326}
1327
1328pub fn create_archive_zip(
1339 archive: &GDocsArchiveResult,
1340 pretty_html: bool,
1341) -> crate::Result<Vec<u8>> {
1342 let mut buf = std::io::Cursor::new(Vec::new());
1343
1344 {
1345 let mut zip = zip::ZipWriter::new(&mut buf);
1346 let options = zip::write::SimpleFileOptions::default()
1347 .compression_method(zip::CompressionMethod::Deflated);
1348
1349 zip.start_file("document.md", options)
1350 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
1351 zip.write_all(archive.markdown.as_bytes())?;
1352
1353 let html_output = if pretty_html {
1354 crate::html::pretty_print_html(&archive.html)
1355 } else {
1356 archive.html.clone()
1357 };
1358 zip.start_file("document.html", options)
1359 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
1360 zip.write_all(html_output.as_bytes())?;
1361
1362 for img in &archive.images {
1363 zip.start_file(format!("images/{}", img.filename), options)
1364 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
1365 zip.write_all(&img.data)?;
1366 }
1367
1368 zip.finish()
1369 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
1370 }
1371
1372 Ok(buf.into_inner())
1373}