use std::io::{Cursor, Read};
use super::*;
use crate::kb::content_store::atomic::sha256_hex;
/// Returns the entity-decoded text content of every `<tag …>…</tag>` element
/// in `xml`, in document order.
///
/// This is a lightweight regex scan rather than a real XML parser: it matches
/// an opening `tag` (optionally followed by attributes) up to the nearest
/// closing `tag`. Elements with a different name that merely starts with
/// `tag` (e.g. `w:tab` when looking for `w:t`) are not matched, and neither
/// are self-closing elements, which carry no text.
///
/// # Panics
///
/// Panics only if the generated pattern fails to compile, which cannot happen
/// because `tag` is escaped before being interpolated.
fn extract_tag_text(xml: &str, tag: &str) -> Vec<String> {
    // Compiling a regex costs far more than running it, and the DOCX path
    // calls this once per paragraph, so compiled patterns are cached per tag.
    static TAG_REGEXES: std::sync::OnceLock<
        std::sync::Mutex<std::collections::HashMap<String, regex::Regex>>,
    > = std::sync::OnceLock::new();

    let re = {
        let mut cache = TAG_REGEXES
            .get_or_init(Default::default)
            .lock()
            // The map is only ever inserted into, so a poisoned lock still
            // guards usable data.
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        let cached = cache.get(tag).cloned();
        match cached {
            Some(re) => re,
            None => {
                let t = regex::escape(tag);
                // The attribute part may not end in `/`: otherwise a
                // self-closing `<tag attr/>` would be taken for an opening tag
                // and the capture would swallow the markup that follows it.
                let pat = format!(r"(?s)<{t}(?:\s(?:[^>]*[^/>])?)?>(.*?)</{t}>");
                let re = regex::Regex::new(&pat).expect("static OOXML tag regex is valid");
                cache.insert(tag.to_owned(), re.clone());
                re
            }
        }
    };

    re.captures_iter(xml)
        .map(|c| decode_xml_entities(&c[1]))
        .collect()
}
/// Decodes XML entity and character references in `s`.
///
/// Handles the five predefined XML entities (`lt`, `gt`, `quot`, `apos`,
/// `amp`) as well as decimal (`#NN`) and hexadecimal (`#xNN`) character
/// references. Anything else that starts with an ampersand — an unknown
/// entity, a malformed reference, or a bare `&` — is copied through unchanged.
///
/// Decoding is done in a single left-to-right pass, so text produced by one
/// reference is never re-interpreted as the start of another (a doubly
/// escaped less-than sign decodes to a singly escaped one, not to `<`).
fn decode_xml_entities(s: &str) -> String {
    /// Maps the text between `&` and `;` to the character it denotes, if any.
    fn resolve(name: &str) -> Option<char> {
        match name {
            "lt" => Some('<'),
            "gt" => Some('>'),
            "quot" => Some('"'),
            "apos" => Some('\''),
            "amp" => Some('&'),
            _ => {
                let reference = name.strip_prefix('#')?;
                let (digits, radix) = match reference.strip_prefix('x') {
                    Some(hex) => (hex, 16),
                    None => (reference, 10),
                };
                // `from_str_radix` accepts a leading `+`, which XML does not.
                if !digits.bytes().all(|b| b.is_ascii_hexdigit()) {
                    return None;
                }
                let code = u32::from_str_radix(digits, radix).ok()?;
                // NUL is never a legal XML character; surrogates and
                // out-of-range values are rejected by `from_u32` itself.
                char::from_u32(code).filter(|&c| c != '\0')
            }
        }
    }

    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        let tail = &rest[amp + 1..];
        // A reference ends at the first `;`; hitting another `&` first means
        // this ampersand is literal. Stopping there keeps the scan linear.
        let decoded = tail
            .find(|c: char| c == ';' || c == '&')
            .filter(|&end| tail.as_bytes()[end] == b';')
            .and_then(|end| resolve(&tail[..end]).map(|ch| (ch, end + 1)));
        match decoded {
            Some((ch, consumed)) => {
                out.push(ch);
                rest = &tail[consumed..];
            }
            None => {
                out.push('&');
                rest = tail;
            }
        }
    }
    out.push_str(rest);
    out
}
/// MIME type of WordprocessingML (`.docx`) documents; the only MIME type
/// `DocxCanonicalizer::supports_mime` accepts.
pub const DOCX_MIME: &str =
"application/vnd.openxmlformats-officedocument.wordprocessingml.document";
/// MIME type of PresentationML (`.pptx`) documents; the only MIME type
/// `PptxCanonicalizer::supports_mime` accepts.
pub const PPTX_MIME: &str =
"application/vnd.openxmlformats-officedocument.presentationml.presentation";
/// Reads the zip entry called `name` out of the archive held in `bytes` and
/// returns its contents as a UTF-8 string.
///
/// Returns `Ok(None)` when the archive has no entry with that name.
///
/// # Errors
///
/// Fails when `bytes` cannot be opened as a zip archive, when looking up the
/// entry fails for any reason other than it being absent, or when reading the
/// entry into a string fails.
fn read_zip_part(bytes: &[u8], name: &str) -> Result<Option<String>> {
    let mut archive = match zip::ZipArchive::new(Cursor::new(bytes)) {
        Ok(archive) => archive,
        Err(e) => return Err(anyhow::anyhow!("not a valid OOXML (zip) file: {e}")),
    };

    // A missing part is an expected outcome, not an error.
    let mut entry = match archive.by_name(name) {
        Ok(entry) => entry,
        Err(zip::result::ZipError::FileNotFound) => return Ok(None),
        Err(e) => return Err(anyhow::anyhow!("reading {name} from zip: {e}")),
    };

    let mut contents = String::new();
    entry.read_to_string(&mut contents)?;
    Ok(Some(contents))
}
/// Assembles the `CanonicalizedSource` shared by the OOXML canonicalizers.
///
/// * `input` — supplies the raw bytes, MIME type, optional title hint and
///   optional logical-source-id seed.
/// * `markdown` — the extracted document body.
/// * `default_title` — title used when `input.hint_title` is `None`.
/// * `extra` — format-specific metadata stored verbatim.
///
/// When no seed is given, the logical source id is derived from the SHA-256
/// hex digest of the input bytes. `created_at_ms` is the current UTC time in
/// milliseconds, and `tags` is left empty.
fn make_source(
    input: &CanonicalizeInput<'_>,
    markdown: String,
    default_title: &str,
    extra: serde_json::Value,
) -> CanonicalizedSource {
    let logical_source_id = match input.logical_source_id_seed.clone() {
        Some(seed) => seed,
        None => LogicalSourceId::for_file(&sha256_hex(input.bytes)),
    };
    let title = match input.hint_title {
        Some(hint) => hint.to_string(),
        None => default_title.to_string(),
    };

    let metadata = CanonicalMetadata {
        source_kind: KbSourceKind::Doc,
        logical_source_id,
        title,
        mime: input.mime.to_string(),
        created_at_ms: chrono::Utc::now().timestamp_millis(),
        tags: vec![],
        extra,
    };
    CanonicalizedSource { markdown, metadata }
}
/// Canonicalizer for `.docx` files: extracts the text of `word/document.xml`
/// as markdown with one block per non-empty paragraph.
pub struct DocxCanonicalizer;

impl Canonicalizer for DocxCanonicalizer {
    fn source_kind(&self) -> KbSourceKind {
        KbSourceKind::Doc
    }

    fn supports_mime(&self, mime: &str) -> bool {
        DOCX_MIME == mime
    }

    /// Extracts the paragraphs of the main document part.
    ///
    /// Returns `Ok(None)` when the archive has no `word/document.xml` or when
    /// that part contains no non-blank text.
    fn canonicalize(&self, input: CanonicalizeInput<'_>) -> Result<Option<CanonicalizedSource>> {
        let Some(document_xml) = read_zip_part(input.bytes, "word/document.xml")? else {
            return Ok(None);
        };

        // Each chunk ending at a paragraph close tag holds the runs of one
        // paragraph; its runs are glued together without a separator.
        let paragraphs: Vec<String> = document_xml
            .split("</w:p>")
            .filter_map(|segment| {
                let joined = extract_tag_text(segment, "w:t").concat();
                let text = joined.trim();
                (!text.is_empty()).then(|| text.to_string())
            })
            .collect();

        if paragraphs.is_empty() {
            return Ok(None);
        }

        let extra = serde_json::json!({ "n_paragraphs": paragraphs.len() });
        let markdown = paragraphs.join("\n\n");
        Ok(Some(make_source(&input, markdown, "Untitled.docx", extra)))
    }
}
/// Canonicalizer for `.pptx` files: extracts the text runs of each
/// `ppt/slides/slide*.xml` part as one markdown section per slide.
pub struct PptxCanonicalizer;
/// Extracts the numeric part of a slide entry name such as
/// `ppt/slides/slide12.xml`.
///
/// Only the final path component is considered. Returns `0` when what remains
/// after stripping the `slide` prefix and `.xml` suffix is not a valid `u32`.
fn slide_number(name: &str) -> u32 {
    let file_name = match name.rsplit_once('/') {
        Some((_, last)) => last,
        None => name,
    };
    let digits = file_name
        .trim_start_matches("slide")
        .trim_end_matches(".xml");
    match digits.parse::<u32>() {
        Ok(number) => number,
        Err(_) => 0,
    }
}
impl Canonicalizer for PptxCanonicalizer {
    fn source_kind(&self) -> KbSourceKind {
        KbSourceKind::Doc
    }

    fn supports_mime(&self, mime: &str) -> bool {
        PPTX_MIME == mime
    }

    /// Extracts the text of every slide part, ordered by slide number.
    ///
    /// Each slide with at least one non-blank text run becomes a
    /// `## Slide N` section, where `N` is the slide's 1-based position in the
    /// sorted list of slide parts; runs are emitted one per line. Returns
    /// `Ok(None)` when no slide yields any text.
    ///
    /// # Errors
    ///
    /// Fails when the input is not a zip archive or a slide part cannot be
    /// read as a UTF-8 string.
    fn canonicalize(&self, input: CanonicalizeInput<'_>) -> Result<Option<CanonicalizedSource>> {
        let mut archive = zip::ZipArchive::new(Cursor::new(input.bytes))
            .map_err(|e| anyhow::anyhow!("not a valid pptx (zip) file: {e}"))?;

        // Collect the slide parts up front: entries are stored in arbitrary
        // order, and the archive must not stay borrowed while they are read.
        let mut slide_names: Vec<String> = Vec::new();
        for entry in archive.file_names() {
            if entry.starts_with("ppt/slides/slide") && entry.ends_with(".xml") {
                slide_names.push(entry.to_string());
            }
        }
        slide_names.sort_by_key(|entry| slide_number(entry));

        let mut sections = Vec::new();
        for (index, slide_name) in slide_names.iter().enumerate() {
            let mut xml = String::new();
            archive.by_name(slide_name)?.read_to_string(&mut xml)?;

            let mut lines: Vec<String> = Vec::new();
            for run in extract_tag_text(&xml, "a:t") {
                let text = run.trim();
                if !text.is_empty() {
                    lines.push(text.to_string());
                }
            }

            if !lines.is_empty() {
                sections.push(format!("## Slide {}\n\n{}", index + 1, lines.join("\n")));
            }
        }

        if sections.is_empty() {
            return Ok(None);
        }

        let extra = serde_json::json!({ "n_slides": slide_names.len() });
        let markdown = sections.join("\n\n");
        Ok(Some(make_source(&input, markdown, "Untitled.pptx", extra)))
    }
}
#[cfg(test)]
mod canon_tests {
    use std::io::Write;

    use super::*;

    /// Packs `(entry name, entry content)` pairs into an in-memory zip archive.
    fn make_zip(parts: &[(&str, &str)]) -> Vec<u8> {
        let mut archive_bytes = Vec::new();
        {
            // Scoped so the writer's borrow of `archive_bytes` ends before the
            // buffer is returned.
            let options = zip::write::SimpleFileOptions::default();
            let mut writer = zip::ZipWriter::new(Cursor::new(&mut archive_bytes));
            for &(entry_name, entry_content) in parts {
                writer.start_file(entry_name, options).unwrap();
                writer.write_all(entry_content.as_bytes()).unwrap();
            }
            writer.finish().unwrap();
        }
        archive_bytes
    }

    /// Builds a canonicalizer input with a fixed title hint and no id seed.
    fn input<'a>(bytes: &'a [u8], mime: &'a str) -> CanonicalizeInput<'a> {
        CanonicalizeInput {
            bytes,
            mime,
            hint_title: Some("t"),
            logical_source_id_seed: None,
        }
    }

    #[test]
    fn docx_extracts_paragraphs() {
        let document_xml = "<w:document><w:body>\
            <w:p><w:t>第一段中文</w:t></w:p>\
            <w:p><w:t>second </w:t><w:t>paragraph</w:t></w:p>\
            </w:body></w:document>";
        let archive = make_zip(&[("word/document.xml", document_xml)]);

        let result = DocxCanonicalizer.canonicalize(input(&archive, DOCX_MIME));
        let source = result.unwrap().expect("some");

        assert_eq!(source.markdown, "第一段中文\n\nsecond paragraph");
    }

    #[test]
    fn pptx_extracts_slides_in_order() {
        // Slide 2 is written to the archive first to prove the output is sorted.
        let second = "<p:sld><a:t>第二页</a:t></p:sld>";
        let first = "<p:sld><a:t>第一页</a:t><a:t>标题</a:t></p:sld>";
        let archive = make_zip(&[
            ("ppt/slides/slide2.xml", second),
            ("ppt/slides/slide1.xml", first),
        ]);

        let result = PptxCanonicalizer.canonicalize(input(&archive, PPTX_MIME));
        let source = result.unwrap().expect("some");

        assert_eq!(
            source.markdown,
            "## Slide 1\n\n第一页\n标题\n\n## Slide 2\n\n第二页"
        );
    }

    #[test]
    fn invalid_zip_is_error_not_panic() {
        let result = DocxCanonicalizer.canonicalize(input(b"not a zip", DOCX_MIME));
        assert!(result.is_err());
    }

    #[test]
    fn docx_with_no_text_is_none() {
        let empty_document = "<w:document><w:body></w:body></w:document>";
        let archive = make_zip(&[("word/document.xml", empty_document)]);

        let result = DocxCanonicalizer
            .canonicalize(input(&archive, DOCX_MIME))
            .unwrap();

        assert!(result.is_none());
    }
}
#[cfg(test)]
mod extract_tests {
    use super::*;

    /// Runs are returned in document order with their whitespace intact.
    #[test]
    fn pulls_text_in_order() {
        let xml = "<w:p><w:t>Hello</w:t><w:t> world</w:t></w:p>";
        assert_eq!(extract_tag_text(xml, "w:t"), vec!["Hello", " world"]);
    }

    /// Attributes on the opening tag are skipped; trailing space is kept.
    #[test]
    fn ignores_attributes_and_preserves_space() {
        let xml = r#"<w:t xml:space="preserve">leading </w:t>"#;
        assert_eq!(extract_tag_text(xml, "w:t"), vec!["leading "]);
    }

    /// The predefined XML entities in the element text are decoded.
    #[test]
    fn decodes_basic_entities() {
        // The fixture must contain the *escaped* forms; a raw string keeps
        // the embedded quote entity from terminating the literal.
        let xml = r#"<a:t>A &amp; B &lt;tag&gt; &quot;q&quot;</a:t>"#;
        assert_eq!(extract_tag_text(xml, "a:t"), vec![r#"A & B <tag> "q""#]);
    }

    /// Tags that merely share a prefix with the requested tag are ignored.
    #[test]
    fn does_not_match_other_tags_or_substrings() {
        let xml = "<w:tab/><w:t>real</w:t><w:rPr>x</w:rPr>";
        assert_eq!(extract_tag_text(xml, "w:t"), vec!["real"]);
    }
}