summary_rs/parse/
read.rs

1use quick_xml::events::Event;
2use quick_xml::Reader;
3
4pub fn parse_docx_text(content: &str) -> Vec<String> {
5    let mut reader = Reader::from_str(content);
6    let mut buf = Vec::new();
7    let mut path: Vec<(String, usize)> = Vec::new(); // 存储(标签名, 索引)
8    let mut counts: std::collections::HashMap<String, usize> = std::collections::HashMap::new(); // 记录每个标签的计数
9    let mut output = Vec::<String>::new();
10    let mut last_path_str: String = "w:document[1]/w:body[1]/w:p[1]//".to_string();
11    let mut last_text: String = "".to_string();
12    loop {
13        match reader.read_event_into(&mut buf) {
14            Ok(Event::Start(ref e)) => {
15                let tag_name = String::from_utf8_lossy(e.name().as_ref()).to_string();
16				// 处理 w:date 标签
17                if e.name().as_ref() == b"w:date" {
18                    // 获取 fullDate 属性
19                    if let Some(attr) = e.attributes().find(|a| {
20                        let a = a.clone().unwrap();
21                        a.key.as_ref() == b"w:fullDate"
22                    }) {
23                        match attr {
24							Ok(attr) => {
25								let date =  String::from_utf8_lossy(attr.value.as_ref());
26								if let Some(date_only) = date.split('T').next() {
27									last_text = date_only.to_string();
28									output.push(last_text.clone());
29								}
30							},
31							Err(e) => {
32								println!("{}", e);
33							}
34
35						}
36                    }
37                }
38                // 更新该标签的计数
39                let count = counts.entry(tag_name.clone()).or_insert(0);
40                *count += 1;
41
42                // 将标签名和索引加入路径栈
43                path.push((tag_name, *count));
44
45                if e.name().as_ref() == b"w:t" {
46                    if let Ok(Event::Text(e)) = reader.read_event_into(&mut buf) {
47                        // 打印带索引的完整路径
48                        let path_str = path
49                            .iter()
50                            .map(|(tag, idx)| {
51                                if tag == "w:t" || tag == "w:r" || tag == "w:p" {
52                                    "".to_string()
53                                } else {
54                                    format!("{}[{}]", tag, idx)
55                                }
56                            })
57                            .collect::<Vec<_>>()
58                            .join("/");
59                        let text = e.unescape().unwrap_or_default().to_string();
60                        if last_path_str.is_empty() {
61                            last_path_str = path_str.clone();
62                        }
63                        if path_str.clone() != last_path_str.clone() {
64                            output.push(last_text.clone());
65                            last_text = text.clone();
66                        } else {
67                            last_text = format!("{}{}", last_text, text);
68                        }
69                        last_path_str = path_str;
70                    }
71                }
72            }
73            Ok(Event::End(_)) => {
74                path.pop();
75            }
76            Ok(Event::Eof) => break,
77            Err(e) => {
78                println!("解析错误: {}", e);
79                break;
80            }
81            _ => (),
82        }
83        buf.clear();
84    }
85    output.push(last_text.clone());
86    output
87}