1use quick_xml::events::Event;
2use quick_xml::Reader;
3
4pub fn parse_docx_text(content: &str) -> Vec<String> {
5 let mut reader = Reader::from_str(content);
6 let mut buf = Vec::new();
7 let mut path: Vec<(String, usize)> = Vec::new(); let mut counts: std::collections::HashMap<String, usize> = std::collections::HashMap::new(); let mut output = Vec::<String>::new();
10 let mut last_path_str: String = "w:document[1]/w:body[1]/w:p[1]//".to_string();
11 let mut last_text: String = "".to_string();
12 loop {
13 match reader.read_event_into(&mut buf) {
14 Ok(Event::Start(ref e)) => {
15 let tag_name = String::from_utf8_lossy(e.name().as_ref()).to_string();
16 if e.name().as_ref() == b"w:date" {
18 if let Some(attr) = e.attributes().find(|a| {
20 let a = a.clone().unwrap();
21 a.key.as_ref() == b"w:fullDate"
22 }) {
23 match attr {
24 Ok(attr) => {
25 let date = String::from_utf8_lossy(attr.value.as_ref());
26 if let Some(date_only) = date.split('T').next() {
27 last_text = date_only.to_string();
28 output.push(last_text.clone());
29 }
30 },
31 Err(e) => {
32 println!("{}", e);
33 }
34
35 }
36 }
37 }
38 let count = counts.entry(tag_name.clone()).or_insert(0);
40 *count += 1;
41
42 path.push((tag_name, *count));
44
45 if e.name().as_ref() == b"w:t" {
46 if let Ok(Event::Text(e)) = reader.read_event_into(&mut buf) {
47 let path_str = path
49 .iter()
50 .map(|(tag, idx)| {
51 if tag == "w:t" || tag == "w:r" || tag == "w:p" {
52 "".to_string()
53 } else {
54 format!("{}[{}]", tag, idx)
55 }
56 })
57 .collect::<Vec<_>>()
58 .join("/");
59 let text = e.unescape().unwrap_or_default().to_string();
60 if last_path_str.is_empty() {
61 last_path_str = path_str.clone();
62 }
63 if path_str.clone() != last_path_str.clone() {
64 output.push(last_text.clone());
65 last_text = text.clone();
66 } else {
67 last_text = format!("{}{}", last_text, text);
68 }
69 last_path_str = path_str;
70 }
71 }
72 }
73 Ok(Event::End(_)) => {
74 path.pop();
75 }
76 Ok(Event::Eof) => break,
77 Err(e) => {
78 println!("解析错误: {}", e);
79 break;
80 }
81 _ => (),
82 }
83 buf.clear();
84 }
85 output.push(last_text.clone());
86 output
87}