/// Parsed document content, grouped by kind rather than by position.
///
/// Body paragraphs, tables, list entries, headers, footers and notes each live
/// in their own vector. `Document::extract_text_with_options` writes them out
/// in a fixed order: headers, paragraphs, tables, footnotes, endnotes, footers.
#[derive(Debug, Clone, Default)]
pub struct Document {
    /// Body paragraphs; non-empty ones are emitted one per line.
    pub paragraphs: Vec<Paragraph>,
    /// Tables; text extraction writes them after all body paragraphs.
    pub tables: Vec<Table>,
    /// List entries, consumed in order: when list markers are enabled, each
    /// numbered paragraph takes the next entry from this vector.
    pub lists: Vec<ListItem>,
    /// Header sections; extracted only when `ExtractOptions::include_headers` is set.
    pub headers: Vec<HeaderFooter>,
    /// Footer sections; extracted only when `ExtractOptions::include_footers` is set.
    pub footers: Vec<HeaderFooter>,
    /// Footnotes; extracted only when `ExtractOptions::include_footnotes` is set.
    pub footnotes: Vec<Note>,
    /// Endnotes; extracted only when `ExtractOptions::include_endnotes` is set.
    pub endnotes: Vec<Note>,
}
11
/// A paragraph: an ordered sequence of text runs plus optional style and
/// numbering metadata.
#[derive(Debug, Clone, Default)]
pub struct Paragraph {
    /// Text runs; `Paragraph::to_text` concatenates their text in order with
    /// no separator.
    pub runs: Vec<Run>,
    /// Style of the paragraph, if any.
    /// NOTE(review): whether this holds a style id or a display name is not
    /// visible here — confirm against the parser.
    pub style: Option<String>,
    /// Numbering id. When this and `numbering_level` are both `Some`,
    /// `Document::extract_text_with_options` treats the paragraph as a list entry.
    pub numbering_id: Option<i64>,
    /// Numbering level; used as the indent width (in spaces) when list markers
    /// are rendered.
    pub numbering_level: Option<i64>,
}
19
/// A span of text with simple formatting flags.
///
/// Plain-text extraction (`Paragraph::to_text`) reads only `text`; the flags
/// are carried as data.
#[derive(Debug, Clone, Default)]
pub struct Run {
    /// The run's text content.
    pub text: String,
    /// Bold formatting flag.
    pub bold: bool,
    /// Italic formatting flag.
    pub italic: bool,
    /// Underline formatting flag.
    pub underline: bool,
}
27
/// A table, stored as an ordered list of rows.
#[derive(Debug, Clone, Default)]
pub struct Table {
    /// Rows of the table; text extraction ends each row with a newline.
    pub rows: Vec<TableRow>,
}
32
/// One row of a [`Table`].
#[derive(Debug, Clone, Default)]
pub struct TableRow {
    /// Cells of the row, in order.
    pub cells: Vec<TableCell>,
}
37
/// One cell of a [`TableRow`].
#[derive(Debug, Clone, Default)]
pub struct TableCell {
    /// Paragraphs inside the cell; in extracted text each non-empty paragraph
    /// is followed by a tab.
    pub paragraphs: Vec<Paragraph>,
}
42
/// A list entry used to render list markers during text extraction.
#[derive(Debug, Clone)]
pub struct ListItem {
    /// Nesting level of the entry.
    /// NOTE(review): `Document::extract_text_with_options` indents by the
    /// paragraph's `numbering_level`, not by this field — confirm which one is
    /// meant to be authoritative.
    pub level: u32,
    /// Bullet or numbered; selects the marker format.
    pub list_type: ListType,
    /// Label rendered as `"{number}. "` for numbered entries; a numbered entry
    /// with `None` here falls back to a bullet marker.
    pub number: Option<String>,
    /// Entry text; written in place of the paragraph's own text when list
    /// markers are rendered.
    pub text: String,
}
52
/// Kind of list a `ListItem` belongs to.
///
/// A fieldless enum, so it is `Copy` and has total equality (`Eq`) in addition
/// to `PartialEq`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ListType {
    /// Entry rendered with a bullet marker.
    Bullet,
    /// Entry rendered with its number label (or a bullet if it has none).
    Numbered,
}
58
/// Content of one header or footer section.
///
/// The same type is used for both `Document::headers` and `Document::footers`.
#[derive(Debug, Clone, Default)]
pub struct HeaderFooter {
    /// Paragraphs; `HeaderFooter::extract_text` emits the non-empty ones first,
    /// one per line.
    pub paragraphs: Vec<Paragraph>,
    /// Tables; emitted after the paragraphs.
    pub tables: Vec<Table>,
    /// Which variant of header/footer this is. Despite the field name, it is
    /// also present on footers.
    pub header_type: HeaderFooterType,
}
65
/// Variant of a header or footer.
///
/// A fieldless enum, so it is `Copy` and has total equality (`Eq`) in addition
/// to `PartialEq`. `Default::default()` yields [`HeaderFooterType::Default`].
// NOTE(review): the variant names suggest page-position variants (first page,
// even pages, odd pages) — confirm against the parser that fills this in.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum HeaderFooterType {
    /// The default variant; also the value produced by `Default`.
    #[default]
    Default,
    /// The "first" variant.
    First,
    /// The "even" variant.
    Even,
    /// The "odd" variant.
    Odd,
}
74
/// A footnote or endnote.
#[derive(Debug, Clone)]
pub struct Note {
    /// Identifier of the note. Text extraction labels notes by their position
    /// in the containing vector (`[1]`, `[2]`, ...), not by this id.
    pub id: String,
    /// Whether this is a footnote or an endnote.
    pub note_type: NoteType,
    /// Paragraphs of the note; extraction concatenates them without a separator.
    pub paragraphs: Vec<Paragraph>,
}
81
/// Kind of a `Note`.
///
/// A fieldless enum, so it is `Copy` and has total equality (`Eq`) in addition
/// to `PartialEq`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NoteType {
    /// The note is a footnote.
    Footnote,
    /// The note is an endnote.
    Endnote,
}
87
/// Switches selecting the optional parts of a document's extracted text.
///
/// The derived `Default` leaves every switch off.
#[derive(Debug, Clone, Default)]
pub struct ExtractOptions {
    /// Emit the headers section.
    pub include_headers: bool,
    /// Emit the footers section.
    pub include_footers: bool,
    /// Emit the footnotes section.
    pub include_footnotes: bool,
    /// Emit the endnotes section.
    pub include_endnotes: bool,
    /// Render numbered paragraphs with list markers.
    pub include_list_markers: bool,
}

impl ExtractOptions {
    /// Returns options with every switch turned on.
    pub fn all() -> Self {
        let enabled = true;
        Self {
            include_headers: enabled,
            include_footers: enabled,
            include_footnotes: enabled,
            include_endnotes: enabled,
            include_list_markers: enabled,
        }
    }

    /// Returns options with every switch turned off (same as `Default`).
    pub fn none() -> Self {
        let disabled = false;
        Self {
            include_headers: disabled,
            include_footers: disabled,
            include_footnotes: disabled,
            include_endnotes: disabled,
            include_list_markers: disabled,
        }
    }
}
112
113impl Document {
114 pub fn new() -> Self {
115 Self::default()
116 }
117
118 pub fn extract_text(&self) -> String {
119 self.extract_text_with_options(&ExtractOptions::none())
120 }
121
122 pub fn extract_text_with_options(&self, options: &ExtractOptions) -> String {
123 let mut text = String::new();
124
125 if options.include_headers && !self.headers.is_empty() {
127 text.push_str("--- Headers ---\n");
128 for header in &self.headers {
129 text.push_str(&header.extract_text());
130 text.push('\n');
131 }
132 text.push('\n');
133 }
134
135 let mut list_index = 0;
137 for paragraph in &self.paragraphs {
138 if let (Some(_num_id), Some(level)) = (paragraph.numbering_id, paragraph.numbering_level) {
140 if options.include_list_markers && list_index < self.lists.len() {
142 let list_item = &self.lists[list_index];
143 let indent = " ".repeat(level as usize);
144 let marker = match list_item.list_type {
145 ListType::Bullet => "• ".to_string(),
146 ListType::Numbered => {
147 if let Some(ref num) = list_item.number {
148 format!("{}. ", num)
149 } else {
150 "• ".to_string()
151 }
152 }
153 };
154 text.push_str(&format!("{}{}{}\n", indent, marker, list_item.text));
155 list_index += 1;
156 } else {
157 let para_text = paragraph.to_text();
159 if !para_text.is_empty() {
160 text.push_str(¶_text);
161 text.push('\n');
162 }
163 }
164 } else {
165 let para_text = paragraph.to_text();
167 if !para_text.is_empty() {
168 text.push_str(¶_text);
169 text.push('\n');
170 }
171 }
172 }
173
174 for table in &self.tables {
176 for row in &table.rows {
177 for cell in &row.cells {
178 for paragraph in &cell.paragraphs {
179 let para_text = paragraph.to_text();
180 if !para_text.is_empty() {
181 text.push_str(¶_text);
182 text.push('\t');
183 }
184 }
185 }
186 text.push('\n');
187 }
188 text.push('\n');
189 }
190
191 if options.include_footnotes && !self.footnotes.is_empty() {
193 text.push_str("\n--- Footnotes ---\n");
194 for (i, note) in self.footnotes.iter().enumerate() {
195 text.push_str(&format!("[{}] ", i + 1));
196 for para in ¬e.paragraphs {
197 text.push_str(¶.to_text());
198 }
199 text.push('\n');
200 }
201 }
202
203 if options.include_endnotes && !self.endnotes.is_empty() {
205 text.push_str("\n--- Endnotes ---\n");
206 for (i, note) in self.endnotes.iter().enumerate() {
207 text.push_str(&format!("[{}] ", i + 1));
208 for para in ¬e.paragraphs {
209 text.push_str(¶.to_text());
210 }
211 text.push('\n');
212 }
213 }
214
215 if options.include_footers && !self.footers.is_empty() {
217 text.push_str("\n--- Footers ---\n");
218 for footer in &self.footers {
219 text.push_str(&footer.extract_text());
220 text.push('\n');
221 }
222 }
223
224 text
225 }
226}
227
228impl Paragraph {
229 pub fn new() -> Self {
230 Self::default()
231 }
232
233 pub fn to_text(&self) -> String {
234 self.runs.iter()
235 .map(|run| run.text.as_str())
236 .collect::<Vec<_>>()
237 .join("")
238 }
239
240 pub fn add_run(&mut self, run: Run) {
241 self.runs.push(run);
242 }
243}
244
245impl Run {
246 pub fn new(text: String) -> Self {
247 Self {
248 text,
249 ..Default::default()
250 }
251 }
252}
253
254impl Table {
255 pub fn new() -> Self {
256 Self::default()
257 }
258}
259
260impl HeaderFooter {
261 pub fn extract_text(&self) -> String {
262 let mut text = String::new();
263
264 for paragraph in &self.paragraphs {
265 let para_text = paragraph.to_text();
266 if !para_text.is_empty() {
267 text.push_str(¶_text);
268 text.push('\n');
269 }
270 }
271
272 for table in &self.tables {
273 for row in &table.rows {
274 for cell in &row.cells {
275 for paragraph in &cell.paragraphs {
276 let para_text = paragraph.to_text();
277 if !para_text.is_empty() {
278 text.push_str(¶_text);
279 text.push('\t');
280 }
281 }
282 }
283 text.push('\n');
284 }
285 }
286
287 text
288 }
289}