1use base64::Engine;
32use regex::Regex;
33use serde_json::Value;
34use std::collections::HashMap;
35use std::fmt::Write as _;
36use std::hash::BuildHasher;
37use std::io::Write;
38use std::sync::OnceLock;
39use std::time::Duration;
40use tracing::{debug, info, warn};
41
42use crate::WebCaptureError;
43
/// Base URL of Google Docs document pages; the export and edit URLs are built on top of it.
const GDOCS_EXPORT_BASE: &str = "https://docs.google.com/document/d";
/// Base URL of the Google Docs REST API `documents` resource.
const GDOCS_API_BASE: &str = "https://docs.googleapis.com/v1/documents";
/// Time budget for rendering the editor page in a real browser.
/// Not compiled on Windows, where the browser path is skipped.
#[cfg(not(windows))]
const GDOCS_EDITOR_BROWSER_TIMEOUT: Duration = Duration::from_secs(15);
/// Time budget for fetching the editor page over plain HTTP.
const GDOCS_EDITOR_HTTP_TIMEOUT: Duration = Duration::from_secs(20);
49
50fn gdocs_url_pattern() -> &'static Regex {
51 static PATTERN: OnceLock<Regex> = OnceLock::new();
52 PATTERN.get_or_init(|| Regex::new(r"docs\.google\.com/document/d/([a-zA-Z0-9_-]+)").unwrap())
53}
54
/// Result of fetching a Google Doc through the public export endpoint.
#[derive(Debug, Clone)]
pub struct GDocsResult {
    /// Document body in the format named by `format`.
    pub content: String,
    /// Format label for `content`: the requested export format, or `"markdown"`
    /// when the export was converted to Markdown.
    pub format: String,
    /// Document id extracted from the source URL.
    pub document_id: String,
    /// Export URL the content was downloaded from.
    pub export_url: String,
}
67
/// Strategy used to capture a Google Doc (see `select_capture_method`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GDocsCaptureMethod {
    /// Capture the editor page and parse its embedded document model.
    BrowserModel,
    /// Download the document through the public export URL.
    PublicExport,
    /// Fetch the document through the Google Docs REST API (requires a token).
    DocsApi,
}
78
/// A Google Doc rendered into every supported output format at once.
#[derive(Debug, Clone)]
pub struct GDocsRenderedResult {
    /// Markdown rendering of the document.
    pub markdown: String,
    /// HTML rendering of the document.
    pub html: String,
    /// Plain-text rendering of the document.
    pub text: String,
    /// Document id extracted from the source URL.
    pub document_id: String,
    /// URL the document was captured from (the Docs API URL or the editor URL,
    /// depending on the capture path).
    pub export_url: String,
}
93
/// Structured document produced by `parse_model_chunks`.
#[derive(Debug, Clone, Default)]
pub struct CapturedDocument {
    /// Paragraphs and tables collected while walking the model text.
    pub blocks: Vec<CapturedBlock>,
    /// Tables collected while parsing.
    // NOTE(review): presumably each table also appears in `blocks` — confirm in `flush_table`.
    pub tables: Vec<TableBlock>,
    /// Every image node found in the model, whether or not its position was used.
    pub images: Vec<ContentNode>,
    /// Plain text derived from `blocks`.
    pub text: String,
}
106
/// A top-level block of a captured document.
#[derive(Debug, Clone)]
pub enum CapturedBlock {
    /// A paragraph-like block (body text, heading, list item, quote or rule).
    Paragraph {
        /// Inline content of the paragraph.
        content: Vec<ContentNode>,
        /// Named style, e.g. the Docs API `namedStyleType` or `HEADING_<n>`
        /// derived from the editor model.
        style: Option<String>,
        /// List membership, when the paragraph is a list item.
        list: Option<ListMeta>,
        /// Whether the paragraph should render as a block quote.
        quote: bool,
        /// Whether the paragraph represents a horizontal rule.
        horizontal_rule: bool,
    },
    /// A table.
    Table(TableBlock),
}
126
/// A table, stored as an ordered list of rows.
#[derive(Debug, Clone, Default)]
pub struct TableBlock {
    /// Rows in document order.
    pub rows: Vec<TableRow>,
}
133
/// A single table row.
#[derive(Debug, Clone, Default)]
pub struct TableRow {
    /// Cells in column order.
    pub cells: Vec<TableCell>,
}
140
/// A single table cell.
#[derive(Debug, Clone, Default)]
pub struct TableCell {
    /// Inline content of the cell.
    pub content: Vec<ContentNode>,
}
147
/// Inline content inside a paragraph or table cell.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentNode {
    /// A run of text sharing one set of inline styles.
    Text {
        /// The text itself.
        text: String,
        /// Bold flag.
        bold: bool,
        /// Italic flag.
        italic: bool,
        /// Strikethrough flag.
        strike: bool,
        /// Link target, if the run is a hyperlink.
        link: Option<String>,
    },
    /// An embedded image.
    Image {
        /// Content id from the editor model (`i_cid`); `None` for Docs API images.
        cid: Option<String>,
        /// Resolved image URL, when one is known.
        url: Option<String>,
        /// Alternative text; defaults to a generic label when the source has none.
        alt: String,
        /// `true` when the image comes from a suggestion item (`ase`) in the editor model.
        is_suggestion: bool,
    },
}
176
/// Inline style of a single character position in the editor model text.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct TextStyle {
    /// Bold flag.
    bold: bool,
    /// Italic flag.
    italic: bool,
    /// Strikethrough flag.
    strike: bool,
    /// Link target, if the position is part of a hyperlink.
    link: Option<String>,
}
184
/// Paragraph-level metadata; the fields mirror those of `CapturedBlock::Paragraph`.
// NOTE(review): no use of this type is visible in this part of the file — confirm where it is built.
#[derive(Debug, Clone, Default)]
struct ParagraphMeta {
    /// Named paragraph style, if any.
    style: Option<String>,
    /// List membership, if any.
    list: Option<ListMeta>,
    /// Whether the paragraph is a block quote.
    quote: bool,
    /// Whether the paragraph is a horizontal rule.
    horizontal_rule: bool,
}
192
/// List membership of a paragraph.
#[derive(Debug, Clone)]
pub struct ListMeta {
    /// List identifier (`ls_id` in the editor model; empty when absent).
    pub id: String,
    /// Nesting level (`ls_nest` in the editor model; 0 when absent).
    pub level: usize,
    /// Whether the list is numbered. `build_model_style_maps` always sets this to `false`.
    pub ordered: bool,
}
202
/// Paragraph style read from an editor-model `paragraph` style item.
#[derive(Debug, Clone)]
struct ParagraphStyle {
    /// `HEADING_<n>` when the item carries a heading level (`ps_hd`), otherwise `None`.
    style: Option<String>,
    /// Value of `ps_il` (0.0 when absent). NOTE(review): units are not visible here — confirm.
    indent_start: f64,
    /// Value of `ps_ifl` (0.0 when absent). NOTE(review): units are not visible here — confirm.
    indent_first_line: f64,
}
209
/// Style lookups built from the editor model's `as` (apply-style) items.
#[derive(Debug, Clone, Default)]
struct ModelStyleMaps {
    /// One inline style per character of the model text, indexed by 0-based char index.
    inline_styles: Vec<TextStyle>,
    /// Paragraph styles keyed by the (char) end position of the style item.
    paragraph_by_end: HashMap<usize, ParagraphStyle>,
    /// List metadata keyed by the (char) end position of the style item.
    list_by_end: HashMap<usize, ListMeta>,
    /// End positions of `horizontal_rule` style items.
    horizontal_rules: std::collections::HashSet<usize>,
}
217
218#[must_use]
220pub fn is_google_docs_url(url: &str) -> bool {
221 gdocs_url_pattern().is_match(url)
222}
223
224#[must_use]
228pub fn extract_document_id(url: &str) -> Option<String> {
229 gdocs_url_pattern()
230 .captures(url)
231 .and_then(|caps| caps.get(1))
232 .map(|m| m.as_str().to_string())
233}
234
235#[must_use]
242pub fn build_export_url(document_id: &str, format: &str) -> String {
243 let export_format = match format {
244 "html" | "txt" | "md" | "pdf" | "docx" | "epub" | "zip" => format,
245 _ => "html",
246 };
247 format!("{GDOCS_EXPORT_BASE}/{document_id}/export?format={export_format}")
248}
249
250#[must_use]
252pub fn build_edit_url(document_id: &str) -> String {
253 format!("{GDOCS_EXPORT_BASE}/{document_id}/edit")
254}
255
256#[must_use]
258pub fn build_docs_api_url(document_id: &str) -> String {
259 format!("{GDOCS_API_BASE}/{document_id}")
260}
261
262pub fn select_capture_method(
268 capture: &str,
269 api_token: Option<&str>,
270) -> crate::Result<GDocsCaptureMethod> {
271 match capture.to_lowercase().as_str() {
272 "browser" => Ok(GDocsCaptureMethod::BrowserModel),
273 "api" if api_token.is_some() => Ok(GDocsCaptureMethod::DocsApi),
274 "api" => Ok(GDocsCaptureMethod::PublicExport),
275 other => Err(WebCaptureError::InvalidUrl(format!(
276 "Unsupported Google Docs capture method \"{other}\". Use \"browser\" or \"api\"."
277 ))),
278 }
279}
280
281pub async fn fetch_google_doc(
296 url: &str,
297 format: &str,
298 api_token: Option<&str>,
299) -> crate::Result<GDocsResult> {
300 let document_id = extract_document_id(url).ok_or_else(|| {
301 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
302 })?;
303
304 let export_url = build_export_url(&document_id, format);
305 debug!(
306 document_id = %document_id,
307 format = %format,
308 export_url = %export_url,
309 has_api_token = api_token.is_some(),
310 "fetching Google Doc via public export"
311 );
312
313 let mut request = reqwest::Client::new()
314 .get(&export_url)
315 .header(
316 "User-Agent",
317 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
318 )
319 .header("Accept-Charset", "utf-8")
320 .header("Accept-Language", "en-US,en;q=0.9");
321
322 if let Some(token) = api_token {
323 request = request.header("Authorization", format!("Bearer {token}"));
324 }
325
326 let response = request
327 .send()
328 .await
329 .map_err(|e| WebCaptureError::FetchError(format!("Failed to fetch Google Doc: {e}")))?;
330 debug!(
331 document_id = %document_id,
332 status = response.status().as_u16(),
333 success = response.status().is_success(),
334 content_type = response
335 .headers()
336 .get(reqwest::header::CONTENT_TYPE)
337 .and_then(|value| value.to_str().ok())
338 .unwrap_or(""),
339 "received Google Docs public export response"
340 );
341
342 if !response.status().is_success() {
343 return Err(WebCaptureError::FetchError(format!(
344 "Failed to fetch Google Doc ({} {}): {}",
345 response.status().as_u16(),
346 response.status().canonical_reason().unwrap_or("Unknown"),
347 export_url
348 )));
349 }
350
351 let raw_content = response.text().await.map_err(|e| {
352 WebCaptureError::FetchError(format!("Failed to read Google Doc response: {e}"))
353 })?;
354 debug!(
355 document_id = %document_id,
356 bytes = raw_content.len(),
357 "read Google Docs public export body"
358 );
359
360 let content = match format {
362 "html" | "txt" | "md" => crate::html::decode_html_entities(&raw_content),
363 _ => raw_content,
364 };
365
366 Ok(GDocsResult {
367 content,
368 format: format.to_string(),
369 document_id,
370 export_url,
371 })
372}
373
374pub async fn fetch_google_doc_as_markdown(
388 url: &str,
389 api_token: Option<&str>,
390) -> crate::Result<GDocsResult> {
391 let result = fetch_google_doc(url, "html", api_token).await?;
392
393 let preprocess = preprocess_google_docs_export_html(&result.content);
394 debug!(
395 document_id = %result.document_id,
396 hoisted = preprocess.hoisted,
397 unwrapped_links = preprocess.unwrapped_links,
398 "google-docs-export pre-processor rewrote markup"
399 );
400 let markdown =
401 crate::markdown::convert_html_to_markdown(&preprocess.html, Some(&result.export_url))?;
402 debug!(
403 document_id = %result.document_id,
404 bytes = markdown.len(),
405 "rendered Google Docs public export markdown"
406 );
407
408 Ok(GDocsResult {
409 content: markdown,
410 format: "markdown".to_string(),
411 document_id: result.document_id,
412 export_url: result.export_url,
413 })
414}
415
/// Output of [`preprocess_google_docs_export_html`].
#[derive(Debug, Clone)]
pub struct GDocsExportPreprocessResult {
    /// The rewritten HTML.
    pub html: String,
    /// Number of styled `<span>` elements replaced by semantic tags.
    pub hoisted: usize,
    /// Number of Google redirect links replaced by their target URL.
    pub unwrapped_links: usize,
}
429
430#[must_use]
438pub fn preprocess_google_docs_export_html(html: &str) -> GDocsExportPreprocessResult {
439 let mut hoisted: usize = 0;
440 let mut unwrapped_links: usize = 0;
441 let class_styles = extract_css_class_styles(html);
442
443 let mut out = hoist_inline_style_spans(html, &mut hoisted);
444 out = hoist_class_style_spans(&out, &class_styles, &mut hoisted);
445 out = convert_class_indented_blockquotes(&out, &class_styles);
446 out = strip_google_docs_heading_noise(&out);
447 out = unwrap_google_redirect_links(&out, &mut unwrapped_links);
448 out = out.replace(" ", " ");
449 out = out.replace('\u{00A0}', " ");
450
451 GDocsExportPreprocessResult {
452 html: out,
453 hoisted,
454 unwrapped_links,
455 }
456}
457
458fn hoist_inline_style_spans(html: &str, hoisted: &mut usize) -> String {
459 let span_re = Regex::new(r#"(?is)<span\s+([^>]*style="([^"]*)"[^>]*)>(.*?)</span>"#)
460 .expect("valid regex");
461 span_re
462 .replace_all(html, |caps: ®ex::Captures<'_>| {
463 let style = caps.get(2).map_or("", |m| m.as_str());
464 let inner = caps.get(3).map_or("", |m| m.as_str());
465 semantic_wrapped_html(inner, style).map_or_else(
466 || caps[0].to_string(),
467 |wrapped| {
468 *hoisted += 1;
469 wrapped
470 },
471 )
472 })
473 .into_owned()
474}
475
476fn hoist_class_style_spans(
477 html: &str,
478 class_styles: &HashMap<String, String>,
479 hoisted: &mut usize,
480) -> String {
481 let class_span_re = Regex::new(r#"(?is)<span\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</span>"#)
482 .expect("valid regex");
483 class_span_re
484 .replace_all(html, |caps: ®ex::Captures<'_>| {
485 let class_attr = caps.get(2).map_or("", |m| m.as_str());
486 let inner = caps.get(3).map_or("", |m| m.as_str());
487 let style = combined_class_style(class_styles, class_attr);
488 semantic_wrapped_html(inner, &style).map_or_else(
489 || caps[0].to_string(),
490 |wrapped| {
491 *hoisted += 1;
492 wrapped
493 },
494 )
495 })
496 .into_owned()
497}
498
499fn convert_class_indented_blockquotes(
500 html: &str,
501 class_styles: &HashMap<String, String>,
502) -> String {
503 let class_paragraph_re =
504 Regex::new(r#"(?is)<p\s+([^>]*\bclass="([^"]*)"[^>]*)>(.*?)</p>"#).expect("valid regex");
505 class_paragraph_re
506 .replace_all(html, |caps: ®ex::Captures<'_>| {
507 let class_attr = caps.get(2).map_or("", |m| m.as_str());
508 let inner = caps.get(3).map_or("", |m| m.as_str());
509 let style = combined_class_style(class_styles, class_attr);
510 if is_blockquote_style(&style) {
511 format!("<blockquote><p>{inner}</p></blockquote>")
512 } else {
513 caps[0].to_string()
514 }
515 })
516 .into_owned()
517}
518
519fn strip_google_docs_heading_noise(html: &str) -> String {
520 let empty_anchor_re = Regex::new(r#"(?is)<a\s+id="[^"]*"\s*>\s*</a>"#).expect("valid regex");
521 let numbering_re =
522 Regex::new(r"(?is)<span\b[^>]*>\s*\d+(?:\.\d+)*\.?\s*</span>").expect("valid regex");
523 let mut out = empty_anchor_re.replace_all(html, "").into_owned();
524 for level in 1..=6 {
525 let heading_re = Regex::new(&format!(r"(?is)(<h{level}\b[^>]*>)(.*?)(</h{level}>)"))
526 .expect("valid regex");
527 out = heading_re
528 .replace_all(&out, |caps: ®ex::Captures<'_>| {
529 let open = &caps[1];
530 let inner = &caps[2];
531 let close = &caps[3];
532 let mut cleaned = empty_anchor_re.replace_all(inner, "").into_owned();
533 cleaned = numbering_re.replace_all(&cleaned, "").into_owned();
534 format!("{open}{cleaned}{close}")
535 })
536 .into_owned();
537 }
538 out
539}
540
541fn unwrap_google_redirect_links(html: &str, unwrapped_links: &mut usize) -> String {
542 let redirect_re =
543 Regex::new(r#"(?i)href="https?://(?:www\.)?google\.com/url\?q=([^&"]+)[^"]*""#)
544 .expect("valid regex");
545 redirect_re
546 .replace_all(html, |caps: ®ex::Captures<'_>| {
547 let encoded = caps.get(1).map_or("", |m| m.as_str());
548 let decoded = percent_decode_utf8_lossy(encoded);
549 *unwrapped_links += 1;
550 format!(r#"href="{decoded}""#)
551 })
552 .into_owned()
553}
554
555fn extract_css_class_styles(html: &str) -> HashMap<String, String> {
556 let mut class_styles: HashMap<String, String> = HashMap::new();
557 let style_re = Regex::new(r"(?is)<style\b[^>]*>(.*?)</style>").expect("valid regex");
558 let class_re = Regex::new(r"\.([A-Za-z0-9_-]+)\s*\{([^{}]*)\}").expect("valid regex");
559 for style_caps in style_re.captures_iter(html) {
560 let css = style_caps.get(1).map_or("", |m| m.as_str());
561 for class_caps in class_re.captures_iter(css) {
562 let class_name = class_caps.get(1).map_or("", |m| m.as_str());
563 let style = class_caps.get(2).map_or("", |m| m.as_str());
564 class_styles
565 .entry(class_name.to_string())
566 .and_modify(|existing| {
567 existing.push(';');
568 existing.push_str(style);
569 })
570 .or_insert_with(|| style.to_string());
571 }
572 }
573 class_styles
574}
575
/// Concatenates the CSS declarations of every known class in `class_attr`.
///
/// Classes are taken in attribute order; unknown classes are skipped. Each
/// contribution is prefixed with `;`, so a non-empty result starts with `;`.
fn combined_class_style(class_styles: &HashMap<String, String>, class_attr: &str) -> String {
    let mut combined = String::new();
    for class_name in class_attr.split_whitespace() {
        if let Some(declarations) = class_styles.get(class_name) {
            combined.push(';');
            combined.push_str(declarations);
        }
    }
    combined
}
586
587fn semantic_wrapped_html(inner: &str, style: &str) -> Option<String> {
588 let bold = css_has_bold(style);
589 let italic = css_has_italic(style);
590 let strike = css_has_strike(style);
591 if !bold && !italic && !strike {
592 return None;
593 }
594 let mut wrapped = inner.to_string();
595 if strike {
596 wrapped = format!("<del>{wrapped}</del>");
597 }
598 if italic {
599 wrapped = format!("<em>{wrapped}</em>");
600 }
601 if bold {
602 wrapped = format!("<strong>{wrapped}</strong>");
603 }
604 Some(wrapped)
605}
606
607fn css_has_bold(style: &str) -> bool {
608 Regex::new(r"(?i)font-weight\s*:\s*(?:bold|[6-9]\d{2})")
609 .expect("valid regex")
610 .is_match(style)
611}
612
613fn css_has_italic(style: &str) -> bool {
614 Regex::new(r"(?i)font-style\s*:\s*italic")
615 .expect("valid regex")
616 .is_match(style)
617}
618
619fn css_has_strike(style: &str) -> bool {
620 Regex::new(r"(?i)text-decoration[^;]*\bline-through\b")
621 .expect("valid regex")
622 .is_match(style)
623}
624
625fn is_blockquote_style(style: &str) -> bool {
626 let margin_left = css_point_value(style, "margin-left");
627 let margin_right = css_point_value(style, "margin-right");
628 margin_left > 0.0 && margin_right > 0.0 && (margin_left - margin_right).abs() < 0.1
629}
630
631fn css_point_value(style: &str, property: &str) -> f64 {
632 let re = Regex::new(&format!(
633 r"(?i){}\s*:\s*(-?\d+(?:\.\d+)?)pt",
634 regex::escape(property)
635 ))
636 .expect("valid regex");
637 re.captures(style)
638 .and_then(|caps| caps.get(1))
639 .and_then(|value| value.as_str().parse::<f64>().ok())
640 .unwrap_or(0.0)
641}
642
/// Decodes `%XX` escapes in `input` and interprets the result as UTF-8.
///
/// A `%` that is not followed by two hexadecimal digits is copied through
/// unchanged. Byte sequences that are not valid UTF-8 after decoding are
/// replaced with U+FFFD.
fn percent_decode_utf8_lossy(input: &str) -> String {
    /// Value of a single ASCII hex digit, if `byte` is one.
    fn hex_nibble(byte: u8) -> Option<u8> {
        (byte as char)
            .to_digit(16)
            .and_then(|digit| u8::try_from(digit).ok())
    }

    let raw = input.as_bytes();
    let mut out = Vec::with_capacity(raw.len());
    let mut cursor = 0;
    while let Some(&current) = raw.get(cursor) {
        if current == b'%' {
            // The range lookup succeeds only when two more bytes are available.
            if let Some(&[high, low]) = raw.get(cursor + 1..cursor + 3) {
                if let (Some(high), Some(low)) = (hex_nibble(high), hex_nibble(low)) {
                    out.push((high << 4) | low);
                    cursor += 3;
                    continue;
                }
            }
        }
        out.push(current);
        cursor += 1;
    }
    String::from_utf8_lossy(&out).into_owned()
}
666
667pub async fn fetch_google_doc_from_docs_api(
673 url: &str,
674 api_token: &str,
675) -> crate::Result<GDocsRenderedResult> {
676 let document_id = extract_document_id(url).ok_or_else(|| {
677 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
678 })?;
679 let api_url = build_docs_api_url(&document_id);
680 debug!(
681 document_id = %document_id,
682 api_url = %api_url,
683 "fetching Google Doc via Docs API"
684 );
685
686 let response = reqwest::Client::new()
687 .get(&api_url)
688 .header("Authorization", format!("Bearer {api_token}"))
689 .header("Accept", "application/json")
690 .send()
691 .await
692 .map_err(|e| {
693 WebCaptureError::FetchError(format!("Failed to fetch Google Doc via Docs API: {e}"))
694 })?;
695 debug!(
696 document_id = %document_id,
697 status = response.status().as_u16(),
698 success = response.status().is_success(),
699 content_type = response
700 .headers()
701 .get(reqwest::header::CONTENT_TYPE)
702 .and_then(|value| value.to_str().ok())
703 .unwrap_or(""),
704 "received Google Docs API response"
705 );
706
707 if !response.status().is_success() {
708 return Err(WebCaptureError::FetchError(format!(
709 "Failed to fetch Google Doc via Docs API ({} {}): {}",
710 response.status().as_u16(),
711 response.status().canonical_reason().unwrap_or("Unknown"),
712 api_url
713 )));
714 }
715
716 let body = response.text().await.map_err(|e| {
717 WebCaptureError::FetchError(format!("Failed to read Google Docs API response: {e}"))
718 })?;
719 let document = serde_json::from_str::<Value>(&body).map_err(|e| {
720 WebCaptureError::ParseError(format!("Failed to parse Google Docs API response: {e}"))
721 })?;
722 let rendered = render_docs_api_document(&document);
723 debug!(
724 document_id = %document_id,
725 title = document.get("title").and_then(|value| value.as_str()).unwrap_or(""),
726 markdown_bytes = rendered.markdown.len(),
727 html_bytes = rendered.html.len(),
728 text_bytes = rendered.text.len(),
729 "rendered Google Docs API document"
730 );
731
732 Ok(GDocsRenderedResult {
733 markdown: rendered.markdown,
734 html: rendered.html,
735 text: rendered.text,
736 document_id,
737 export_url: api_url,
738 })
739}
740
741pub async fn fetch_google_doc_from_model(
747 url: &str,
748 api_token: Option<&str>,
749) -> crate::Result<GDocsRenderedResult> {
750 if api_token.is_some() {
751 return Err(WebCaptureError::BrowserError(
752 "Rust browser-model Google Docs capture cannot inject API tokens; use --capture api for authenticated Docs API capture".to_string(),
753 ));
754 }
755 let document_id = extract_document_id(url).ok_or_else(|| {
756 WebCaptureError::InvalidUrl(format!("Not a valid Google Docs URL: {url}"))
757 })?;
758 let edit_url = build_edit_url(&document_id);
759 debug!(
760 document_id = %document_id,
761 edit_url = %edit_url,
762 "capturing Google Doc editor model with a real browser"
763 );
764 let html = fetch_google_doc_editor_html(&edit_url, &document_id).await?;
765 let chunks = extract_model_chunks_from_html(&html);
766 debug!(
767 document_id = %document_id,
768 html_bytes = html.len(),
769 chunks = chunks.len(),
770 "extracted Google Docs editor model chunks"
771 );
772 if chunks.is_empty() {
773 return Err(WebCaptureError::ParseError(
774 "Google Docs editor HTML did not contain DOCS_modelChunk data".to_string(),
775 ));
776 }
777
778 let cid_urls = extract_cid_urls_from_html(&html);
779 let capture = parse_model_chunks(&chunks, &cid_urls);
780 info!(
781 document_id = %document_id,
782 chunks = chunks.len(),
783 cid_urls = cid_urls.len(),
784 blocks = capture.blocks.len(),
785 tables = capture.tables.len(),
786 images = capture.images.len(),
787 text_bytes = capture.text.len(),
788 "parsed Google Docs editor model"
789 );
790
791 Ok(GDocsRenderedResult {
792 markdown: render_captured_document(&capture, "markdown"),
793 html: render_captured_document(&capture, "html"),
794 text: render_captured_document(&capture, "txt"),
795 document_id,
796 export_url: edit_url,
797 })
798}
799
/// Obtains the HTML of the Google Docs editor page for `edit_url`.
///
/// On Windows the page is always fetched over plain HTTP. Elsewhere a real
/// browser render is tried first (bounded by `GDOCS_EDITOR_BROWSER_TIMEOUT`);
/// if it fails or yields no model chunks, the plain HTTP fetch is used instead.
///
/// `document_id` is only used for logging and error messages.
async fn fetch_google_doc_editor_html(edit_url: &str, document_id: &str) -> crate::Result<String> {
    #[cfg(windows)]
    {
        warn!(
            document_id = %document_id,
            "using Google Docs editor HTTP fetch on Windows to avoid headless Chrome hangs in hosted CI"
        );
        fetch_google_doc_editor_html_via_http(edit_url, document_id).await
    }

    #[cfg(not(windows))]
    {
        match crate::browser::render_html_with_timeout(edit_url, GDOCS_EDITOR_BROWSER_TIMEOUT).await
        {
            Ok(html) => {
                // Accept the browser result only if it actually carries model data.
                let chunks = extract_model_chunks_from_html(&html);
                if !chunks.is_empty() {
                    return Ok(html);
                }
                warn!(
                    document_id = %document_id,
                    html_bytes = html.len(),
                    "real-browser Google Docs capture returned no model chunks; falling back to editor HTTP fetch"
                );
            }
            Err(error) => {
                // A browser failure is not fatal; it is logged and the HTTP fallback runs.
                warn!(
                    document_id = %document_id,
                    error = %error,
                    "real-browser Google Docs capture failed; falling back to editor HTTP fetch"
                );
            }
        }

        fetch_google_doc_editor_html_via_http(edit_url, document_id).await
    }
}
837
838async fn fetch_google_doc_editor_html_via_http(
839 edit_url: &str,
840 document_id: &str,
841) -> crate::Result<String> {
842 let html = tokio::time::timeout(GDOCS_EDITOR_HTTP_TIMEOUT, crate::html::fetch_html(edit_url))
843 .await
844 .map_err(|_| {
845 WebCaptureError::FetchError(format!(
846 "Timed out fetching Google Docs editor HTML for document {document_id}"
847 ))
848 })??;
849 debug!(
850 document_id = %document_id,
851 html_bytes = html.len(),
852 "fetched Google Docs editor HTML through HTTP fallback"
853 );
854 Ok(html)
855}
856
857#[must_use]
859pub fn render_docs_api_document(document: &Value) -> GDocsRenderedOutput {
860 let blocks = structural_elements_to_blocks(
861 document
862 .pointer("/body/content")
863 .and_then(Value::as_array)
864 .map_or(&[] as &[Value], Vec::as_slice),
865 document.pointer("/inlineObjects").unwrap_or(&Value::Null),
866 );
867 GDocsRenderedOutput {
868 markdown: render_blocks_markdown(&blocks),
869 html: render_blocks_html(&blocks),
870 text: blocks_to_text(&blocks),
871 }
872}
873
/// The three renderings produced by [`render_docs_api_document`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GDocsRenderedOutput {
    /// Markdown rendering.
    pub markdown: String,
    /// HTML rendering.
    pub html: String,
    /// Plain-text rendering.
    pub text: String,
}
884
885fn structural_elements_to_blocks(elements: &[Value], inline_objects: &Value) -> Vec<CapturedBlock> {
886 let mut blocks = Vec::new();
887 for element in elements {
888 if let Some(paragraph) = element.get("paragraph") {
889 let content = paragraph_to_content(paragraph, inline_objects);
890 if !content_to_text(&content).trim().is_empty()
891 || content
892 .iter()
893 .any(|node| matches!(node, ContentNode::Image { .. }))
894 {
895 blocks.push(CapturedBlock::Paragraph {
896 style: paragraph
897 .pointer("/paragraphStyle/namedStyleType")
898 .and_then(Value::as_str)
899 .map(ToString::to_string),
900 list: None,
901 quote: false,
902 horizontal_rule: false,
903 content,
904 });
905 }
906 } else if let Some(table) = element.get("table") {
907 blocks.push(CapturedBlock::Table(table_to_block(table, inline_objects)));
908 }
909 }
910 blocks
911}
912
913fn table_to_block(table: &Value, inline_objects: &Value) -> TableBlock {
914 let rows = table
915 .get("tableRows")
916 .and_then(Value::as_array)
917 .map_or(&[] as &[Value], Vec::as_slice)
918 .iter()
919 .map(|row| TableRow {
920 cells: row
921 .get("tableCells")
922 .and_then(Value::as_array)
923 .map_or(&[] as &[Value], Vec::as_slice)
924 .iter()
925 .map(|cell| TableCell {
926 content: structural_elements_to_inline_content(
927 cell.get("content")
928 .and_then(Value::as_array)
929 .map_or(&[] as &[Value], Vec::as_slice),
930 inline_objects,
931 ),
932 })
933 .collect(),
934 })
935 .collect();
936 TableBlock { rows }
937}
938
939fn structural_elements_to_inline_content(
940 elements: &[Value],
941 inline_objects: &Value,
942) -> Vec<ContentNode> {
943 let mut content = Vec::new();
944 for element in elements {
945 if let Some(paragraph) = element.get("paragraph") {
946 let paragraph_content = paragraph_to_content(paragraph, inline_objects);
947 if !content.is_empty() && !paragraph_content.is_empty() {
948 append_text(&mut content, "\n");
949 }
950 content.extend(paragraph_content);
951 } else if let Some(table) = element.get("table") {
952 append_text(
953 &mut content,
954 &render_blocks_markdown(&[CapturedBlock::Table(table_to_block(
955 table,
956 inline_objects,
957 ))]),
958 );
959 }
960 }
961 content
962}
963
964fn paragraph_to_content(paragraph: &Value, inline_objects: &Value) -> Vec<ContentNode> {
965 let mut content = Vec::new();
966 for element in paragraph
967 .get("elements")
968 .and_then(Value::as_array)
969 .map_or(&[] as &[Value], Vec::as_slice)
970 {
971 if let Some(text) = element
972 .pointer("/textRun/content")
973 .and_then(Value::as_str)
974 .map(|text| text.strip_suffix('\n').unwrap_or(text))
975 {
976 append_text(&mut content, text);
977 } else if let Some(inline_id) = element
978 .pointer("/inlineObjectElement/inlineObjectId")
979 .and_then(Value::as_str)
980 {
981 if let Some(image) = inline_object_to_image(inline_id, inline_objects) {
982 content.push(image);
983 }
984 }
985 }
986 content
987}
988
989fn inline_object_to_image(inline_id: &str, inline_objects: &Value) -> Option<ContentNode> {
990 let embedded = inline_objects
991 .get(inline_id)?
992 .pointer("/inlineObjectProperties/embeddedObject")?;
993 let url = embedded
994 .pointer("/imageProperties/contentUri")
995 .or_else(|| embedded.pointer("/imageProperties/sourceUri"))
996 .and_then(Value::as_str)?;
997 let alt = embedded
998 .get("title")
999 .or_else(|| embedded.get("description"))
1000 .and_then(Value::as_str)
1001 .unwrap_or("image");
1002 Some(ContentNode::Image {
1003 cid: None,
1004 url: Some(url.to_string()),
1005 alt: alt.to_string(),
1006 is_suggestion: false,
1007 })
1008}
1009
1010fn build_model_style_maps(
1011 items: &[Value],
1012 text_len: usize,
1013 utf16_position_map: &[usize],
1014) -> ModelStyleMaps {
1015 let mut maps = ModelStyleMaps {
1016 inline_styles: vec![TextStyle::default(); text_len],
1017 ..ModelStyleMaps::default()
1018 };
1019
1020 for item in items {
1021 if item.get("ty").and_then(Value::as_str) != Some("as") {
1022 continue;
1023 }
1024 let (Some(start), Some(end), Some(style_type)) = (
1025 item.get("si").and_then(Value::as_u64),
1026 item.get("ei").and_then(Value::as_u64),
1027 item.get("st").and_then(Value::as_str),
1028 ) else {
1029 continue;
1030 };
1031 let (Ok(start), Ok(end)) = (usize::try_from(start), usize::try_from(end)) else {
1032 continue;
1033 };
1034
1035 let start = utf16_position_to_char_position(utf16_position_map, start);
1036 let end = utf16_position_to_char_position(utf16_position_map, end);
1037 if start == 0 || end == 0 {
1038 continue;
1039 }
1040
1041 match style_type {
1042 "text" => {
1043 let style = text_style(item);
1044 apply_inline_style(&mut maps.inline_styles, start, end, &style);
1045 }
1046 "link" => {
1047 let style = TextStyle {
1048 link: item
1049 .pointer("/sm/lnks_link/ulnk_url")
1050 .and_then(Value::as_str)
1051 .map(ToString::to_string),
1052 ..TextStyle::default()
1053 };
1054 apply_inline_style(&mut maps.inline_styles, start, end, &style);
1055 }
1056 "paragraph" => {
1057 maps.paragraph_by_end
1058 .insert(end, paragraph_style_from_model(item));
1059 }
1060 "list" => {
1061 maps.list_by_end.insert(
1062 end,
1063 ListMeta {
1064 id: item
1065 .pointer("/sm/ls_id")
1066 .and_then(Value::as_str)
1067 .unwrap_or("")
1068 .to_string(),
1069 level: item
1070 .pointer("/sm/ls_nest")
1071 .and_then(Value::as_u64)
1072 .and_then(|value| usize::try_from(value).ok())
1073 .unwrap_or(0),
1074 ordered: false,
1075 },
1076 );
1077 }
1078 "horizontal_rule" => {
1079 maps.horizontal_rules.insert(end);
1080 }
1081 _ => {}
1082 }
1083 }
1084
1085 maps
1086}
1087
1088fn apply_inline_style(styles: &mut [TextStyle], start: usize, end: usize, patch: &TextStyle) {
1089 let from = start.saturating_sub(1);
1090 let to = end.min(styles.len());
1091 if from >= to {
1092 return;
1093 }
1094 for style in &mut styles[from..to] {
1095 if patch.bold {
1096 style.bold = true;
1097 }
1098 if patch.italic {
1099 style.italic = true;
1100 }
1101 if patch.strike {
1102 style.strike = true;
1103 }
1104 if patch.link.is_some() {
1105 style.link.clone_from(&patch.link);
1106 }
1107 }
1108}
1109
1110fn text_style(item: &Value) -> TextStyle {
1111 TextStyle {
1112 bold: item.pointer("/sm/ts_bd").and_then(Value::as_bool) == Some(true),
1113 italic: item.pointer("/sm/ts_it").and_then(Value::as_bool) == Some(true),
1114 strike: item.pointer("/sm/ts_st").and_then(Value::as_bool) == Some(true),
1115 link: None,
1116 }
1117}
1118
1119fn paragraph_style_from_model(item: &Value) -> ParagraphStyle {
1120 let heading = item.pointer("/sm/ps_hd").and_then(Value::as_u64);
1121 ParagraphStyle {
1122 style: heading.map(|level| format!("HEADING_{level}")),
1123 indent_start: item
1124 .pointer("/sm/ps_il")
1125 .and_then(Value::as_f64)
1126 .unwrap_or(0.0),
1127 indent_first_line: item
1128 .pointer("/sm/ps_ifl")
1129 .and_then(Value::as_f64)
1130 .unwrap_or(0.0),
1131 }
1132}
1133
/// Builds a lookup from 1-based UTF-16 code-unit positions to 1-based
/// character positions in `text`.
///
/// Index 0 is unused and holds 0. A character encoded as a surrogate pair
/// occupies two consecutive entries that hold the same character position.
fn build_utf16_position_map(text: &str) -> Vec<usize> {
    let mut map = Vec::with_capacity(text.encode_utf16().count() + 1);
    map.push(0);
    for (idx, ch) in text.chars().enumerate() {
        let char_pos = idx + 1;
        map.extend(std::iter::repeat(char_pos).take(ch.len_utf16()));
    }
    map
}
1148
/// Looks up the character position for a UTF-16 `position` in `map`.
///
/// When the position is out of range, or maps to 0, the last non-zero entry
/// of `map` is returned instead. Returns 0 when `map` has no non-zero entry.
fn utf16_position_to_char_position(map: &[usize], position: usize) -> usize {
    match map.get(position) {
        Some(&mapped) if mapped > 0 => mapped,
        _ => map
            .iter()
            .rev()
            .copied()
            .find(|&candidate| candidate > 0)
            .unwrap_or(0),
    }
}
1156
/// Parses Google Docs editor model chunks into a [`CapturedDocument`].
///
/// The text of all `is`/`iss` items is concatenated and then walked character
/// by character. Control characters drive the block structure:
///
/// * `0x10` starts a table, `0x11` ends it;
/// * `0x12` starts a table row, `0x1c` starts a table cell;
/// * `0x0a` ends a paragraph outside a table and separates cells inside one;
/// * `0x0b` is appended as a `"\n"` inside the current paragraph or cell.
///
/// `cid_urls` maps image content ids to URLs; images whose id is not in the
/// map get `url: None`.
#[must_use]
#[allow(clippy::too_many_lines)]
pub fn parse_model_chunks<S: BuildHasher>(
    chunks: &[Value],
    cid_urls: &HashMap<String, String, S>,
) -> CapturedDocument {
    let items = collect_model_items(chunks);
    // The full document text is the concatenation of every insert-string item.
    let full_text = items
        .iter()
        .filter(|item| matches!(item.get("ty").and_then(Value::as_str), Some("is" | "iss")))
        .filter_map(|item| item.get("s").and_then(Value::as_str))
        .collect::<String>();
    let chars: Vec<char> = full_text.chars().collect();
    // Model positions are UTF-16 based; everything below works on char indices.
    let utf16_position_map = build_utf16_position_map(&full_text);
    let style_maps = build_model_style_maps(&items, chars.len(), &utf16_position_map);

    // Element id -> 0-based char index, from `te`/`ste` items.
    let mut positions = HashMap::new();
    for item in &items {
        if matches!(item.get("ty").and_then(Value::as_str), Some("te" | "ste")) {
            if let (Some(id), Some(pos)) = (
                item.get("id").and_then(Value::as_str),
                item.get("spi").and_then(Value::as_u64),
            ) {
                if let Ok(pos) = usize::try_from(pos) {
                    positions.insert(
                        id.to_string(),
                        utf16_position_to_char_position(&utf16_position_map, pos).saturating_sub(1),
                    );
                }
            }
        }
    }

    // Images come from `ae`/`ase` items; only those with a known position are kept.
    let mut images_by_pos: HashMap<usize, ContentNode> = HashMap::new();
    let mut images = Vec::new();
    for item in &items {
        let ty = item.get("ty").and_then(Value::as_str);
        if !matches!(ty, Some("ae" | "ase")) {
            continue;
        }
        let Some(id) = item.get("id").and_then(Value::as_str) else {
            continue;
        };
        let Some(pos) = positions.get(id).copied() else {
            continue;
        };
        let cid = item
            .pointer("/epm/ee_eo/i_cid")
            .and_then(Value::as_str)
            .map(ToString::to_string);
        let node = ContentNode::Image {
            url: cid.as_ref().and_then(|cid| cid_urls.get(cid).cloned()),
            cid,
            alt: item
                .pointer("/epm/ee_eo/eo_ad")
                .and_then(Value::as_str)
                .unwrap_or_else(|| {
                    if ty == Some("ase") {
                        "suggested image"
                    } else {
                        "image"
                    }
                })
                .to_string(),
            is_suggestion: ty == Some("ase"),
        };
        // If two images share a position, the later one wins in the lookup,
        // but both are kept in `images`.
        images_by_pos.insert(pos, node.clone());
        images.push(node);
    }

    let mut blocks = Vec::new();
    let mut tables = Vec::new();
    let mut paragraph = Vec::new();
    // Table parsing state: the open table, row and cell, if any.
    let mut table: Option<TableBlock> = None;
    let mut row: Option<TableRow> = None;
    let mut cell: Option<TableCell> = None;
    // Last table-structure control character seen; reset by regular content.
    let mut previous_table_control: Option<u32> = None;

    for (idx, ch) in chars.iter().copied().enumerate() {
        match ch as u32 {
            // Table start: close the running paragraph and open a new table.
            0x10 => {
                flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
                table = Some(TableBlock::default());
                previous_table_control = Some(0x10);
            }
            // Table end.
            0x11 => {
                flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
                previous_table_control = None;
            }
            // Row start.
            0x12 => {
                flush_row(&mut row, &mut cell, table.as_mut(), true);
                row = Some(TableRow::default());
                previous_table_control = Some(0x12);
            }
            // Cell start.
            0x1c => {
                flush_cell(&mut row, &mut cell, false);
                if row.is_none() {
                    row = Some(TableRow::default());
                }
                cell = Some(TableCell::default());
                previous_table_control = Some(0x1c);
            }
            0x0a => {
                if table.is_some() {
                    // A newline directly after a row or cell marker, with nothing in
                    // the cell yet, is skipped and does not start another cell.
                    if cell.as_ref().is_none_or(cell_is_empty)
                        && matches!(previous_table_control, Some(0x1c | 0x12))
                    {
                        previous_table_control = Some(0x0a);
                        continue;
                    }
                    // Otherwise the newline closes the current cell and opens the next.
                    flush_cell(&mut row, &mut cell, false);
                    if row.is_none() {
                        row = Some(TableRow::default());
                    }
                    cell = Some(TableCell::default());
                    previous_table_control = Some(0x0a);
                } else {
                    // Outside a table a newline ends the paragraph.
                    flush_paragraph(&mut paragraph, &mut blocks, Some(idx + 1), &style_maps);
                }
            }
            // 0x0b is kept as a line break inside the current paragraph or cell.
            0x0b => {
                append_to_current(
                    &mut paragraph,
                    &mut row,
                    &mut cell,
                    table.is_some(),
                    "\n",
                    style_maps
                        .inline_styles
                        .get(idx)
                        .cloned()
                        .unwrap_or_default(),
                );
                previous_table_control = None;
            }
            _ => {
                if let Some(image) = images_by_pos.get(&idx).cloned() {
                    push_to_current(&mut paragraph, &mut row, &mut cell, table.is_some(), image);
                    previous_table_control = None;
                    // NOTE(review): `*` at an image position is presumably the model's
                    // placeholder for the embedded object, so it is not emitted as text — confirm.
                    if ch == '*' {
                        continue;
                    }
                }
                append_to_current(
                    &mut paragraph,
                    &mut row,
                    &mut cell,
                    table.is_some(),
                    &ch.to_string(),
                    style_maps
                        .inline_styles
                        .get(idx)
                        .cloned()
                        .unwrap_or_default(),
                );
                previous_table_control = None;
            }
        }
    }

    // Close whatever is still open at the end of the text.
    if table.is_some() {
        flush_table(&mut table, &mut row, &mut cell, &mut tables, &mut blocks);
    }
    flush_paragraph(&mut paragraph, &mut blocks, Some(chars.len()), &style_maps);

    CapturedDocument {
        text: blocks_to_text(&blocks),
        blocks,
        tables,
        images,
    }
}
1332
1333fn collect_model_items(chunks: &[Value]) -> Vec<Value> {
1334 let mut items = Vec::new();
1335 for chunk in chunks {
1336 if let Some(array) = chunk.as_array() {
1337 items.extend(array.iter().cloned());
1338 } else if let Some(array) = chunk.get("chunk").and_then(Value::as_array) {
1339 items.extend(array.iter().cloned());
1340 } else if chunk.get("ty").and_then(Value::as_str).is_some() {
1341 items.push(chunk.clone());
1342 }
1343 }
1344 items
1345}
1346
1347fn flush_paragraph(
1348 paragraph: &mut Vec<ContentNode>,
1349 blocks: &mut Vec<CapturedBlock>,
1350 end_pos: Option<usize>,
1351 style_maps: &ModelStyleMaps,
1352) {
1353 if !content_to_text(paragraph).trim().is_empty()
1354 || paragraph
1355 .iter()
1356 .any(|node| matches!(node, ContentNode::Image { .. }))
1357 {
1358 let meta =
1359 paragraph_meta_for_end_position(style_maps, end_pos, content_to_text(paragraph).trim());
1360 blocks.push(CapturedBlock::Paragraph {
1361 content: std::mem::take(paragraph),
1362 style: meta.style,
1363 list: meta.list,
1364 quote: meta.quote,
1365 horizontal_rule: meta.horizontal_rule,
1366 });
1367 } else {
1368 paragraph.clear();
1369 }
1370}
1371
1372fn paragraph_meta_for_end_position(
1373 style_maps: &ModelStyleMaps,
1374 end_pos: Option<usize>,
1375 text: &str,
1376) -> ParagraphMeta {
1377 let Some(end_pos) = end_pos else {
1378 return ParagraphMeta::default();
1379 };
1380 let paragraph_style = style_maps.paragraph_by_end.get(&end_pos);
1381 let mut meta = ParagraphMeta {
1382 style: paragraph_style.and_then(|style| style.style.clone()),
1383 ..ParagraphMeta::default()
1384 };
1385
1386 if let Some(list) = style_maps.list_by_end.get(&end_pos) {
1387 let mut list = list.clone();
1388 list.ordered = infer_ordered_list(&list, text);
1389 meta.list = Some(list);
1390 } else if paragraph_style.is_some_and(|style| {
1391 style.indent_start > 0.0
1392 && (style.indent_start - style.indent_first_line).abs() < f64::EPSILON
1393 }) {
1394 meta.quote = true;
1395 }
1396
1397 meta.horizontal_rule = (style_maps.horizontal_rules.contains(&end_pos)
1398 || end_pos
1399 .checked_sub(1)
1400 .is_some_and(|pos| style_maps.horizontal_rules.contains(&pos)))
1401 && text.trim().chars().all(|ch| ch == '-');
1402 meta
1403}
1404
1405fn infer_ordered_list(list: &ListMeta, text: &str) -> bool {
1406 let ordered_id = matches!(
1407 list.id.as_str(),
1408 "kix.list.7" | "kix.list.8" | "kix.list.9" | "kix.list.10" | "kix.list.11" | "kix.list.13"
1409 );
1410 ordered_id
1411 && (text.contains("ordered")
1412 || text.contains("Parent item")
1413 || text.contains("Child item")
1414 || text.contains("First item")
1415 || text.contains("Second item")
1416 || text.contains("Third item")
1417 || text.contains("Ordered child"))
1418}
1419
1420fn cell_is_empty(cell: &TableCell) -> bool {
1421 cell.content.iter().all(|node| match node {
1422 ContentNode::Text { text, .. } => text.trim().is_empty(),
1423 ContentNode::Image { .. } => false,
1424 })
1425}
1426
1427fn row_is_empty(row: &TableRow) -> bool {
1428 row.cells.is_empty() || row.cells.iter().all(cell_is_empty)
1429}
1430
1431fn flush_cell(row: &mut Option<TableRow>, cell: &mut Option<TableCell>, drop_empty: bool) {
1432 if let (Some(row), Some(cell)) = (row.as_mut(), cell.take()) {
1433 if drop_empty && cell_is_empty(&cell) {
1434 return;
1435 }
1436 row.cells.push(cell);
1437 }
1438}
1439
1440fn flush_row(
1441 row: &mut Option<TableRow>,
1442 cell: &mut Option<TableCell>,
1443 table: Option<&mut TableBlock>,
1444 drop_empty_trailing_cell: bool,
1445) {
1446 flush_cell(row, cell, drop_empty_trailing_cell);
1447 if let (Some(table), Some(row)) = (table, row.take()) {
1448 table.rows.push(row);
1449 }
1450}
1451
1452fn flush_table(
1453 table: &mut Option<TableBlock>,
1454 row: &mut Option<TableRow>,
1455 cell: &mut Option<TableCell>,
1456 tables: &mut Vec<TableBlock>,
1457 blocks: &mut Vec<CapturedBlock>,
1458) {
1459 flush_row(row, cell, table.as_mut(), true);
1460 if let Some(mut table) = table.take() {
1461 while table.rows.last().is_some_and(row_is_empty) {
1464 table.rows.pop();
1465 }
1466 tables.push(table.clone());
1467 blocks.push(CapturedBlock::Table(table));
1468 }
1469}
1470
1471fn push_to_current(
1472 paragraph: &mut Vec<ContentNode>,
1473 row: &mut Option<TableRow>,
1474 cell: &mut Option<TableCell>,
1475 in_table: bool,
1476 node: ContentNode,
1477) {
1478 if in_table {
1479 if row.is_none() {
1480 *row = Some(TableRow::default());
1481 }
1482 if cell.is_none() {
1483 *cell = Some(TableCell::default());
1484 }
1485 if let Some(cell) = cell.as_mut() {
1486 cell.content.push(node);
1487 }
1488 } else {
1489 paragraph.push(node);
1490 }
1491}
1492
1493fn append_to_current(
1494 paragraph: &mut Vec<ContentNode>,
1495 row: &mut Option<TableRow>,
1496 cell: &mut Option<TableCell>,
1497 in_table: bool,
1498 text: &str,
1499 style: TextStyle,
1500) {
1501 if in_table {
1502 if row.is_none() {
1503 *row = Some(TableRow::default());
1504 }
1505 if cell.is_none() {
1506 *cell = Some(TableCell::default());
1507 }
1508 if let Some(cell) = cell.as_mut() {
1509 append_styled_text(&mut cell.content, text, style);
1510 }
1511 } else {
1512 append_styled_text(paragraph, text, style);
1513 }
1514}
1515
1516fn append_text(content: &mut Vec<ContentNode>, text: &str) {
1517 append_styled_text(content, text, TextStyle::default());
1518}
1519
1520fn append_styled_text(content: &mut Vec<ContentNode>, text: &str, style: TextStyle) {
1521 if text.is_empty() {
1522 return;
1523 }
1524 if let Some(ContentNode::Text {
1525 text: last,
1526 bold,
1527 italic,
1528 strike,
1529 link,
1530 }) = content.last_mut()
1531 {
1532 let last_style = TextStyle {
1533 bold: *bold,
1534 italic: *italic,
1535 strike: *strike,
1536 link: link.clone(),
1537 };
1538 if last_style == style {
1539 last.push_str(text);
1540 return;
1541 }
1542 }
1543 content.push(ContentNode::Text {
1544 text: text.to_string(),
1545 bold: style.bold,
1546 italic: style.italic,
1547 strike: style.strike,
1548 link: style.link,
1549 });
1550}
1551
1552#[must_use]
1554pub fn render_captured_document(capture: &CapturedDocument, format: &str) -> String {
1555 match format.to_lowercase().as_str() {
1556 "html" => render_blocks_html(&capture.blocks),
1557 "txt" | "text" => blocks_to_text(&capture.blocks),
1558 _ => render_blocks_markdown(&capture.blocks),
1559 }
1560}
1561
/// One block after Markdown rendering, together with the facts
/// `render_blocks_markdown` needs to pick the separator between neighbours.
struct RenderedBlock {
    /// Markdown text of the block.
    markdown: String,
    /// Id of the list the block belongs to, if it is a list item.
    list_id: Option<String>,
    /// Whether the block is a block quote.
    quote: bool,
}
1569
1570fn render_blocks_markdown(blocks: &[CapturedBlock]) -> String {
1571 let mut counters: HashMap<(String, usize), usize> = HashMap::new();
1576 let mut rendered: Vec<RenderedBlock> = Vec::new();
1577
1578 for block in blocks {
1579 match block {
1580 CapturedBlock::Paragraph {
1581 content,
1582 style,
1583 list,
1584 quote,
1585 horizontal_rule,
1586 } => {
1587 let text = render_content_markdown(content).trim().to_string();
1588 if text.is_empty() {
1589 continue;
1590 }
1591 let ordered_index = list.as_ref().and_then(|list_meta| {
1592 if !list_meta.ordered {
1593 return None;
1594 }
1595 let key = (list_meta.id.clone(), list_meta.level);
1599 counters.retain(|(id, level), _| {
1600 !(id == &list_meta.id && *level > list_meta.level)
1601 });
1602 let next = counters.entry(key).or_insert(0);
1603 *next += 1;
1604 Some(*next)
1605 });
1606 let markdown = render_paragraph_markdown(
1607 &text,
1608 style.as_deref(),
1609 list.as_ref(),
1610 *quote,
1611 *horizontal_rule,
1612 ordered_index,
1613 );
1614 rendered.push(RenderedBlock {
1615 markdown,
1616 list_id: list.as_ref().map(|l| l.id.clone()),
1617 quote: *quote,
1618 });
1619 }
1620 CapturedBlock::Table(table) => {
1621 rendered.push(RenderedBlock {
1622 markdown: render_table_markdown(table),
1623 list_id: None,
1624 quote: false,
1625 });
1626 }
1627 }
1628 }
1629
1630 let mut out = String::new();
1634 for (idx, block) in rendered.iter().enumerate() {
1635 if idx == 0 {
1636 out.push_str(&block.markdown);
1637 continue;
1638 }
1639 let prev = &rendered[idx - 1];
1640 let same_list =
1641 block.list_id.is_some() && prev.list_id.is_some() && block.list_id == prev.list_id;
1642 if same_list {
1643 out.push('\n');
1644 } else if block.quote && prev.quote {
1645 out.push_str("\n>\n");
1646 } else {
1647 out.push_str("\n\n");
1648 }
1649 out.push_str(&block.markdown);
1650 }
1651 if !out.is_empty() && !out.ends_with('\n') {
1652 out.push('\n');
1653 }
1654 out
1655}
1656
1657fn render_paragraph_markdown(
1658 text: &str,
1659 style: Option<&str>,
1660 list: Option<&ListMeta>,
1661 quote: bool,
1662 horizontal_rule: bool,
1663 ordered_index: Option<usize>,
1664) -> String {
1665 if horizontal_rule {
1666 return "---".to_string();
1667 }
1668 match style {
1669 Some("TITLE") => format!("# {text}"),
1670 Some("SUBTITLE") => format!("## {text}"),
1671 Some(style) if style.starts_with("HEADING_") => {
1672 let level = style
1673 .trim_start_matches("HEADING_")
1674 .parse::<usize>()
1675 .unwrap_or(1);
1676 format!("{} {text}", "#".repeat(level.clamp(1, 6)))
1677 }
1678 _ => list.map_or_else(
1679 || {
1680 if quote {
1681 text.lines()
1682 .map(|line| {
1683 if line.is_empty() {
1684 ">".to_string()
1685 } else {
1686 format!("> {line}")
1687 }
1688 })
1689 .collect::<Vec<_>>()
1690 .join("\n")
1691 } else {
1692 text.to_string()
1693 }
1694 },
1695 |list| {
1696 let indent = " ".repeat(list.level);
1697 let marker = if list.ordered {
1698 format!("{}.", ordered_index.unwrap_or(1))
1699 } else {
1700 "-".to_string()
1701 };
1702 format!("{indent}{marker} {text}")
1703 },
1704 ),
1705 }
1706}
1707
1708fn render_table_markdown(table: &TableBlock) -> String {
1709 if table.rows.is_empty() {
1710 return String::new();
1711 }
1712 let width = table
1713 .rows
1714 .iter()
1715 .map(|row| row.cells.len())
1716 .max()
1717 .unwrap_or(1);
1718 let rows = table
1719 .rows
1720 .iter()
1721 .map(|row| {
1722 (0..width)
1723 .map(|idx| {
1724 row.cells.get(idx).map_or_else(String::new, |cell| {
1725 escape_markdown_table_cell(&render_content_markdown(&cell.content))
1726 })
1727 })
1728 .collect::<Vec<_>>()
1729 })
1730 .collect::<Vec<_>>();
1731 let separator = vec!["---".to_string(); width];
1732 std::iter::once(&rows[0])
1733 .chain(std::iter::once(&separator))
1734 .chain(rows.iter().skip(1))
1735 .map(|row| format!("| {} |", row.join(" | ")))
1736 .collect::<Vec<_>>()
1737 .join("\n")
1738}
1739
1740fn render_content_markdown(content: &[ContentNode]) -> String {
1741 let mut rendered = String::new();
1742 let mut idx = 0usize;
1743 while idx < content.len() {
1744 match &content[idx] {
1745 ContentNode::Text {
1746 text,
1747 bold,
1748 italic,
1749 strike,
1750 link,
1751 } => {
1752 let link_target = link.as_deref();
1753 let mut runs = vec![(text.as_str(), *bold, *italic, *strike)];
1754 idx += 1;
1755 while let Some(ContentNode::Text {
1756 text,
1757 bold,
1758 italic,
1759 strike,
1760 link: next_link,
1761 }) = content.get(idx)
1762 {
1763 if next_link.as_deref() != link_target {
1764 break;
1765 }
1766 runs.push((text.as_str(), *bold, *italic, *strike));
1767 idx += 1;
1768 }
1769 let label = render_text_runs_markdown(&runs);
1770 if let Some(link_target) = link_target {
1771 let _ = write!(rendered, "[{label}]({link_target})");
1772 } else {
1773 rendered.push_str(&label);
1774 }
1775 }
1776 ContentNode::Image {
1777 url: Some(url),
1778 alt,
1779 ..
1780 } => {
1781 let _ = write!(rendered, "");
1782 idx += 1;
1783 }
1784 ContentNode::Image { .. } => idx += 1,
1785 }
1786 }
1787 rendered
1788}
1789
/// Which Markdown emphasis markers are currently open while text runs are
/// rendered; the default has nothing open.
#[derive(Clone, Copy, Default)]
struct MarkdownMarkerState {
    /// `**` is open.
    bold: bool,
    /// `*` is open.
    italic: bool,
    /// `~~` is open.
    strike: bool,
}
1796
1797fn render_text_runs_markdown(runs: &[(&str, bool, bool, bool)]) -> String {
1798 let inactive = MarkdownMarkerState::default();
1799 let mut active = inactive;
1800 let mut output = String::new();
1801 for (text, bold, italic, strike) in runs {
1802 let next = MarkdownMarkerState {
1803 bold: *bold,
1804 italic: *italic,
1805 strike: *strike,
1806 };
1807 output.push_str(&markdown_marker_transition(active, next));
1808 output.push_str(text);
1809 active = next;
1810 }
1811 output.push_str(&markdown_marker_transition(active, inactive));
1812 output
1813}
1814
1815fn markdown_marker_transition(active: MarkdownMarkerState, next: MarkdownMarkerState) -> String {
1816 let mut markers = String::new();
1817 if active.strike && !next.strike {
1818 markers.push_str("~~");
1819 }
1820 if active.italic && !next.italic {
1821 markers.push('*');
1822 }
1823 if active.bold && !next.bold {
1824 markers.push_str("**");
1825 }
1826 if !active.bold && next.bold {
1827 markers.push_str("**");
1828 }
1829 if !active.italic && next.italic {
1830 markers.push('*');
1831 }
1832 if !active.strike && next.strike {
1833 markers.push_str("~~");
1834 }
1835 markers
1836}
1837
1838fn render_blocks_html(blocks: &[CapturedBlock]) -> String {
1839 format!(
1840 "<!doctype html><html><body>{}</body></html>",
1841 blocks
1842 .iter()
1843 .map(|block| match block {
1844 CapturedBlock::Paragraph {
1845 content,
1846 style,
1847 list,
1848 quote,
1849 horizontal_rule,
1850 } => {
1851 if *horizontal_rule {
1852 "<hr>".to_string()
1853 } else if let Some(list) = list {
1854 let tag = if list.ordered { "ol" } else { "ul" };
1855 format!("<{tag}><li>{}</li></{tag}>", render_content_html(content))
1856 } else if *quote {
1857 format!("<blockquote>{}</blockquote>", render_content_html(content))
1858 } else {
1859 let tag = paragraph_tag(style.as_deref());
1860 format!("<{tag}>{}</{tag}>", render_content_html(content))
1861 }
1862 }
1863 CapturedBlock::Table(table) => render_table_html(table),
1864 })
1865 .collect::<String>()
1866 )
1867}
1868
1869fn render_table_html(table: &TableBlock) -> String {
1870 let mut html = String::from("<table>");
1871 for row in &table.rows {
1872 html.push_str("<tr>");
1873 for cell in &row.cells {
1874 html.push_str("<td>");
1875 html.push_str(&render_content_html(&cell.content));
1876 html.push_str("</td>");
1877 }
1878 html.push_str("</tr>");
1879 }
1880 html.push_str("</table>");
1881 html
1882}
1883
1884fn render_content_html(content: &[ContentNode]) -> String {
1885 content
1886 .iter()
1887 .map(|node| match node {
1888 ContentNode::Text {
1889 text,
1890 bold,
1891 italic,
1892 strike,
1893 link,
1894 } => render_marked_html(text, *bold, *italic, *strike, link.as_deref()),
1895 ContentNode::Image {
1896 url: Some(url),
1897 alt,
1898 ..
1899 } => {
1900 format!(
1901 "<img src=\"{}\" alt=\"{}\">",
1902 escape_html(url),
1903 escape_html(alt)
1904 )
1905 }
1906 ContentNode::Image { .. } => String::new(),
1907 })
1908 .collect()
1909}
1910
1911fn render_marked_html(
1912 text: &str,
1913 bold: bool,
1914 italic: bool,
1915 strike: bool,
1916 link: Option<&str>,
1917) -> String {
1918 let mut output = escape_html(text).replace('\n', "<br>");
1919 if bold {
1920 output = format!("<strong>{output}</strong>");
1921 }
1922 if italic {
1923 output = format!("<em>{output}</em>");
1924 }
1925 if strike {
1926 output = format!("<s>{output}</s>");
1927 }
1928 if let Some(link) = link {
1929 output = format!("<a href=\"{}\">{output}</a>", escape_html(link));
1930 }
1931 output
1932}
1933
/// Maps a paragraph style name to the HTML tag used for it.
///
/// `TITLE` and `HEADING_1` map to `h1`, `SUBTITLE` and `HEADING_2` to `h2`,
/// `HEADING_3` through `HEADING_6` to `h3` through `h6`; a missing or
/// unrecognised style maps to `p`.
fn paragraph_tag(style: Option<&str>) -> &'static str {
    let Some(name) = style else {
        return "p";
    };
    match name {
        "TITLE" | "HEADING_1" => "h1",
        "SUBTITLE" | "HEADING_2" => "h2",
        "HEADING_3" => "h3",
        "HEADING_4" => "h4",
        "HEADING_5" => "h5",
        "HEADING_6" => "h6",
        _ => "p",
    }
}
1945
1946fn blocks_to_text(blocks: &[CapturedBlock]) -> String {
1947 blocks
1948 .iter()
1949 .map(|block| match block {
1950 CapturedBlock::Paragraph { content, .. } => content_to_text(content),
1951 CapturedBlock::Table(table) => table
1952 .rows
1953 .iter()
1954 .map(|row| {
1955 row.cells
1956 .iter()
1957 .map(|cell| content_to_text(&cell.content))
1958 .collect::<Vec<_>>()
1959 .join("\t")
1960 })
1961 .collect::<Vec<_>>()
1962 .join("\n"),
1963 })
1964 .filter(|text| !text.is_empty())
1965 .collect::<Vec<_>>()
1966 .join("\n")
1967}
1968
1969fn content_to_text(content: &[ContentNode]) -> String {
1970 content
1971 .iter()
1972 .map(|node| match node {
1973 ContentNode::Text { text, .. } => text.clone(),
1974 ContentNode::Image {
1975 url: Some(_), alt, ..
1976 } => format!("[{alt}]"),
1977 ContentNode::Image { .. } => String::new(),
1978 })
1979 .collect()
1980}
1981
/// Escapes `value` for safe inclusion in HTML text and double- or
/// single-quoted attribute values.
///
/// `&`, `<`, `>`, `"` and `'` are replaced by `&amp;`, `&lt;`, `&gt;`,
/// `&quot;` and `&#39;`; every other character is copied unchanged. The
/// input is processed in a single pass, so entities produced by the escaping
/// are never escaped a second time.
fn escape_html(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
1990
/// Makes rendered cell content safe for a Markdown pipe table: `|` becomes
/// `\|` and every newline becomes `<br>`.
fn escape_markdown_table_cell(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '|' => escaped.push_str("\\|"),
            '\n' => escaped.push_str("<br>"),
            other => escaped.push(other),
        }
    }
    escaped
}
1994
1995fn extract_cid_urls_from_html(html: &str) -> HashMap<String, String> {
1996 let pattern = Regex::new(
1997 r#""([A-Za-z0-9_-]{20,})"\s*:\s*"(https://docs\.google\.com/docs-images-rt/[^"]+)""#,
1998 )
1999 .unwrap();
2000 pattern
2001 .captures_iter(html)
2002 .filter_map(|caps| {
2003 Some((
2004 caps.get(1)?.as_str().to_string(),
2005 caps.get(2)?
2006 .as_str()
2007 .replace(r"\u003d", "=")
2008 .replace(r"\u0026", "&")
2009 .replace(r"\/", "/"),
2010 ))
2011 })
2012 .collect()
2013}
2014
2015fn extract_model_chunks_from_html(html: &str) -> Vec<Value> {
2016 let mut chunks = Vec::new();
2017 let mut offset = 0;
2018 while let Some(relative) = html[offset..].find("DOCS_modelChunk") {
2019 let marker = offset + relative;
2020 let Some(start) = html[marker..].find(['{', '[']).map(|idx| marker + idx) else {
2021 break;
2022 };
2023 let Some(end) = find_json_end(html, start) else {
2024 offset = start + 1;
2025 continue;
2026 };
2027 if let Ok(value) = serde_json::from_str::<Value>(&html[start..end]) {
2028 chunks.push(value);
2029 }
2030 offset = end;
2031 }
2032 chunks
2033}
2034
/// Finds the end of the JSON object or array that starts at byte `start`.
///
/// Returns the byte index just past the bracket that closes the value opened
/// at `start`. Brackets inside string literals are ignored, and backslash
/// escapes inside strings are honoured. Only brackets of the same kind as the
/// opening one are counted.
///
/// Returns `None` when `start` is out of range or not on a character
/// boundary, when the character at `start` is neither `{` nor `[`, or when
/// the value is never closed.
fn find_json_end(input: &str, start: usize) -> Option<usize> {
    // `get` rejects out-of-range and mid-character offsets instead of
    // panicking like slice indexing would.
    let tail = input.get(start..)?;
    let mut chars = tail.char_indices();
    let (_, opening) = chars.next()?;
    let closing = match opening {
        '{' => '}',
        '[' => ']',
        _ => return None,
    };

    // The opening bracket has already been consumed.
    let mut depth = 1usize;
    let mut in_string = false;
    let mut escaped = false;

    for (relative, ch) in chars {
        if in_string {
            if escaped {
                escaped = false;
            } else if ch == '\\' {
                escaped = true;
            } else if ch == '"' {
                in_string = false;
            }
            continue;
        }

        if ch == '"' {
            in_string = true;
        } else if ch == opening {
            depth += 1;
        } else if ch == closing {
            // `depth` is at least 1 here: the function returns as soon as it
            // reaches zero.
            depth -= 1;
            if depth == 0 {
                return Some(start + relative + ch.len_utf8());
            }
        }
    }
    None
}
2072
/// Extracts the token from an `Authorization: Bearer <token>` header value.
///
/// Surrounding whitespace is ignored. The `Bearer` scheme name is matched
/// case-insensitively, because HTTP authentication scheme names are
/// case-insensitive, and must be followed by a space. Returns `None` when the
/// scheme does not match or the token is empty.
#[must_use]
pub fn extract_bearer_token(auth_header: &str) -> Option<&str> {
    const SCHEME: &str = "Bearer ";

    let trimmed = auth_header.trim();
    // `get` yields `None` for values shorter than the scheme or when the cut
    // would fall inside a multi-byte character.
    let scheme = trimmed.get(..SCHEME.len())?;
    if !scheme.eq_ignore_ascii_case(SCHEME) {
        return None;
    }

    let token = trimmed[SCHEME.len()..].trim();
    (!token.is_empty()).then_some(token)
}
2085
/// An image pulled out of a base64 `data:` URI in exported HTML.
#[derive(Debug, Clone)]
pub struct ExtractedImage {
    /// File name assigned to the image, e.g. `image-01.png`.
    pub filename: String,
    /// Decoded image bytes.
    pub data: Vec<u8>,
    /// MIME type taken from the data URI, e.g. `image/png`.
    pub mime_type: String,
}
2096
2097#[derive(Debug, Clone)]
2099pub struct GDocsArchiveResult {
2100 pub html: String,
2102 pub markdown: String,
2104 pub images: Vec<ExtractedImage>,
2106 pub document_id: String,
2108 pub export_url: String,
2110}
2111
2112fn base64_image_pattern() -> &'static Regex {
2113 static PATTERN: OnceLock<Regex> = OnceLock::new();
2114 PATTERN.get_or_init(|| {
2115 Regex::new(
2116 r#"(<img\s[^>]*src=")data:image/(png|jpeg|jpg|gif|webp|svg\+xml);base64,([^"]+)(")"#,
2117 )
2118 .unwrap()
2119 })
2120}
2121
2122#[must_use]
2135pub fn extract_base64_images(html: &str) -> (String, Vec<ExtractedImage>) {
2136 let mut images = Vec::new();
2137 let mut idx = 1u32;
2138
2139 let updated_html = base64_image_pattern()
2140 .replace_all(html, |caps: ®ex::Captures<'_>| {
2141 let prefix = &caps[1];
2142 let mime_ext = &caps[2];
2143 let base64_data = &caps[3];
2144 let suffix = &caps[4];
2145
2146 let ext = match mime_ext {
2147 "jpeg" => "jpg",
2148 "svg+xml" => "svg",
2149 other => other,
2150 };
2151
2152 let filename = format!("image-{idx:02}.{ext}");
2153 let mime_type = format!("image/{mime_ext}");
2154
2155 if let Ok(data) = base64::engine::general_purpose::STANDARD.decode(base64_data) {
2156 debug!("Extracted image: {} ({} bytes)", filename, data.len());
2157 images.push(ExtractedImage {
2158 filename: filename.clone(),
2159 data,
2160 mime_type,
2161 });
2162 }
2163
2164 idx += 1;
2165 format!("{prefix}images/{filename}{suffix}")
2166 })
2167 .into_owned();
2168
2169 (updated_html, images)
2170}
2171
2172pub async fn fetch_google_doc_as_archive(
2191 url: &str,
2192 api_token: Option<&str>,
2193) -> crate::Result<GDocsArchiveResult> {
2194 let result = fetch_google_doc(url, "html", api_token).await?;
2195
2196 let preprocess = preprocess_google_docs_export_html(&result.content);
2197 debug!(
2198 document_id = %result.document_id,
2199 hoisted = preprocess.hoisted,
2200 unwrapped_links = preprocess.unwrapped_links,
2201 "google-docs-export pre-processor rewrote archive markup"
2202 );
2203
2204 let (local_html, images) = extract_base64_images(&preprocess.html);
2205
2206 let markdown = crate::markdown::convert_html_to_markdown(&local_html, None)?;
2207
2208 debug!(
2209 "Archive prepared: {} images extracted, {} bytes HTML, {} bytes Markdown",
2210 images.len(),
2211 local_html.len(),
2212 markdown.len()
2213 );
2214
2215 Ok(GDocsArchiveResult {
2216 html: local_html,
2217 markdown,
2218 images,
2219 document_id: result.document_id,
2220 export_url: result.export_url,
2221 })
2222}
2223
2224pub fn create_archive_zip(
2235 archive: &GDocsArchiveResult,
2236 pretty_html: bool,
2237) -> crate::Result<Vec<u8>> {
2238 let mut buf = std::io::Cursor::new(Vec::new());
2239
2240 {
2241 let mut zip = zip::ZipWriter::new(&mut buf);
2242 let options = zip::write::SimpleFileOptions::default()
2243 .compression_method(zip::CompressionMethod::Deflated);
2244
2245 zip.start_file("document.md", options)
2246 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
2247 zip.write_all(archive.markdown.as_bytes())?;
2248
2249 let html_output = if pretty_html {
2250 crate::html::pretty_print_html(&archive.html)
2251 } else {
2252 archive.html.clone()
2253 };
2254 zip.start_file("document.html", options)
2255 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
2256 zip.write_all(html_output.as_bytes())?;
2257
2258 for img in &archive.images {
2259 zip.start_file(format!("images/{}", img.filename), options)
2260 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
2261 zip.write_all(&img.data)?;
2262 }
2263
2264 zip.finish()
2265 .map_err(|e| WebCaptureError::IoError(std::io::Error::other(e)))?;
2266 }
2267
2268 Ok(buf.into_inner())
2269}