use crate::types::ExtractedImage;
use base64::Engine;
use bytes::Bytes;
use pulldown_cmark::{Event, Tag, TagEnd};
use std::borrow::Cow;
pub(crate) fn extract_text_from_events(events: &[Event], images: &mut Vec<ExtractedImage>) -> String {
let mut text = String::new();
let mut link_url: Option<String> = None;
let mut in_heading = false;
for event in events {
match event {
Event::Start(Tag::Heading { level, .. }) => {
text.push('\n');
let prefix = match *level {
pulldown_cmark::HeadingLevel::H1 => "# ",
pulldown_cmark::HeadingLevel::H2 => "## ",
pulldown_cmark::HeadingLevel::H3 => "### ",
pulldown_cmark::HeadingLevel::H4 => "#### ",
pulldown_cmark::HeadingLevel::H5 => "##### ",
pulldown_cmark::HeadingLevel::H6 => "###### ",
};
text.push_str(prefix);
in_heading = true;
}
Event::End(TagEnd::Heading(_)) => {
text.push('\n');
in_heading = false;
}
Event::Start(Tag::Link { dest_url, .. }) => {
link_url = Some(dest_url.to_string());
}
Event::End(TagEnd::Link) => {
if let Some(url) = link_url.take()
&& !url.is_empty()
&& !url.starts_with('#')
{
text.push_str(" (");
text.push_str(&url);
text.push(')');
}
}
Event::Start(Tag::Image { dest_url, .. }) => {
text.push_str("[Image");
if !dest_url.is_empty() {
text.push_str(": ");
text.push_str(dest_url);
}
text.push(']');
if dest_url.starts_with("data:image/")
&& let Some(image) = decode_data_uri_image(dest_url, images.len())
{
images.push(image);
}
}
Event::Start(Tag::CodeBlock(pulldown_cmark::CodeBlockKind::Fenced(lang))) => {
text.push('\n');
text.push_str("```");
if !lang.is_empty() {
text.push_str(lang);
}
text.push('\n');
}
Event::End(TagEnd::CodeBlock) => {
text.push_str("```\n");
}
Event::Start(Tag::BlockQuote(_)) => {
text.push_str("\n> ");
}
Event::Start(Tag::Paragraph) if !in_heading => {
text.push('\n');
}
Event::End(TagEnd::Paragraph) => {
text.push('\n');
}
Event::Text(s) | Event::Code(s) | Event::Html(s) => {
text.push_str(s);
}
Event::SoftBreak | Event::HardBreak => {
text.push('\n');
}
Event::TaskListMarker(checked) => {
text.push_str(if *checked { "[x] " } else { "[ ] " });
}
Event::FootnoteReference(s) => {
text.push('[');
text.push_str(s);
text.push(']');
}
Event::Rule => {
text.push_str("\n---\n");
}
_ => {}
}
}
text
}
pub(crate) fn decode_data_uri_image(uri: &str, index: usize) -> Option<ExtractedImage> {
let after_data = uri.strip_prefix("data:")?;
let (mime_and_encoding, data) = after_data.split_once(',')?;
if !mime_and_encoding.contains("base64") {
return None;
}
let format: &str = if mime_and_encoding.contains("image/png") {
"png"
} else if mime_and_encoding.contains("image/jpeg") {
"jpeg"
} else if mime_and_encoding.contains("image/gif") {
"gif"
} else if mime_and_encoding.contains("image/webp") {
"webp"
} else {
return None;
};
let cleaned = data.replace(['\n', '\r'], "");
let decoded = base64::engine::general_purpose::STANDARD.decode(&cleaned).ok()?;
Some(ExtractedImage {
data: Bytes::from(decoded),
format: Cow::Borrowed(format),
image_index: index,
page_number: None,
width: None,
height: None,
colorspace: None,
bits_per_component: None,
is_mask: false,
description: None,
ocr_result: None,
bounding_box: None,
})
}