1#![allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
9
10use crate::types::structure::{ChunkingResult, StructuredChunk};
11
12use super::xlsx_ooxml::{
13 NumFmtKind, OoxmlMetadata, excel_serial_to_iso, format_currency, format_percentage,
14};
15use super::xlsx_table_detect::{CellValue, DetectedTable, SheetGrid};
16
17const DEFAULT_MAX_CHUNK_CHARS: usize = 1200;
19
20const MAX_SPREADSHEET_CHUNKS: usize = 500;
22
23#[derive(Debug, Clone)]
25pub struct XlsxChunkingOptions {
26 pub max_chars: usize,
27 pub max_chunks: usize,
28}
29
30impl Default for XlsxChunkingOptions {
31 fn default() -> Self {
32 Self {
33 max_chars: DEFAULT_MAX_CHUNK_CHARS,
34 max_chunks: MAX_SPREADSHEET_CHUNKS,
35 }
36 }
37}
38
39#[must_use]
41pub fn format_cell_value(
42 cell: &CellValue,
43 fmt_kind: NumFmtKind,
44 _metadata: &OoxmlMetadata,
45) -> String {
46 match (cell, fmt_kind) {
47 (CellValue::Empty, _) => String::new(),
48 (CellValue::Text(s), _) => s.trim().to_string(),
49 (CellValue::Number(v), NumFmtKind::Date | NumFmtKind::DateTime) => {
50 excel_serial_to_iso(*v).unwrap_or_else(|| format!("{v}"))
51 }
52 (CellValue::Number(v), NumFmtKind::Percentage) => format_percentage(*v),
53 (CellValue::Number(v), NumFmtKind::Currency) => format_currency(*v, "$"),
54 (CellValue::Number(v), _) => {
55 if (v.fract()).abs() < 1e-10 {
57 format!("{}", *v as i64)
58 } else {
59 format!("{v}")
60 }
61 }
62 (CellValue::Integer(v), NumFmtKind::Date | NumFmtKind::DateTime) => {
63 excel_serial_to_iso(*v as f64).unwrap_or_else(|| format!("{v}"))
64 }
65 (CellValue::Integer(v), NumFmtKind::Percentage) => format_percentage(*v as f64),
66 (CellValue::Integer(v), NumFmtKind::Currency) => format_currency(*v as f64, "$"),
67 (CellValue::Integer(v), _) => format!("{v}"),
68 (CellValue::Boolean(b), _) => if *b { "true" } else { "false" }.to_string(),
69 (CellValue::DateTime(s), _) => s.clone(),
70 (CellValue::Error(s), _) => s.clone(),
71 }
72}
73
74fn format_row_with_headers(
76 grid: &SheetGrid,
77 row_idx: u32,
78 headers: &[String],
79 first_col: u32,
80 last_col: u32,
81 metadata: &OoxmlMetadata,
82) -> String {
83 let mut parts = Vec::new();
84
85 for col in first_col..=last_col {
86 let cell = grid.cell(row_idx, col);
87 if cell.is_empty() {
88 continue;
89 }
90
91 let fmt_kind = grid.num_fmt(row_idx, col);
92 let formatted = format_cell_value(cell, fmt_kind, metadata);
93 if formatted.is_empty() {
94 continue;
95 }
96
97 let col_offset = (col - first_col) as usize;
98 let header = headers.get(col_offset).filter(|h| !h.is_empty()).cloned();
99
100 if let Some(h) = header {
101 parts.push(format!("{h}: {formatted}"));
102 } else {
103 parts.push(formatted);
104 }
105 }
106
107 parts.join(" | ")
108}
109
110fn build_context_prefix(sheet_name: &str, table_name: &str) -> String {
112 format!("[Sheet: {sheet_name}] [Table: {table_name}]")
113}
114
115fn build_header_line(headers: &[String]) -> String {
117 let nonempty: Vec<&str> = headers
118 .iter()
119 .map(String::as_str)
120 .filter(|h| !h.is_empty())
121 .collect();
122 if nonempty.is_empty() {
123 String::new()
124 } else {
125 nonempty.join(" | ")
126 }
127}
128
129fn chunk_table(
131 grid: &SheetGrid,
132 table: &DetectedTable,
133 metadata: &OoxmlMetadata,
134 options: &XlsxChunkingOptions,
135 chunk_index_start: usize,
136) -> Vec<StructuredChunk> {
137 let context_prefix = build_context_prefix(&table.sheet_name, &table.name);
138 let header_line = build_header_line(&table.headers);
139
140 let fixed_prefix = if header_line.is_empty() {
142 format!("{context_prefix}\n")
143 } else {
144 format!("{context_prefix}\n{header_line}\n")
145 };
146 let prefix_len = fixed_prefix.len();
147
148 let mut formatted_rows: Vec<String> = Vec::new();
150 for row_idx in table.first_data_row..=table.last_data_row {
151 let line = format_row_with_headers(
152 grid,
153 row_idx,
154 &table.headers,
155 table.first_col,
156 table.last_col,
157 metadata,
158 );
159 if !line.is_empty() {
160 formatted_rows.push(line);
161 }
162 }
163
164 if formatted_rows.is_empty() {
165 return Vec::new();
166 }
167
168 let mut chunks = Vec::new();
170 let mut current_rows: Vec<String> = Vec::new();
171 let mut current_len = prefix_len;
172
173 for row_text in &formatted_rows {
174 let row_len = row_text.len() + 1; if !current_rows.is_empty() && current_len + row_len > options.max_chars {
177 let text = format!("{fixed_prefix}{}", current_rows.join("\n"));
179 chunks.push(text);
180 current_rows.clear();
181 current_len = prefix_len;
182 }
183
184 current_rows.push(row_text.clone());
185 current_len += row_len;
186 }
187
188 if !current_rows.is_empty() {
190 let text = format!("{fixed_prefix}{}", current_rows.join("\n"));
191 chunks.push(text);
192 }
193
194 let total_parts = chunks.len() as u32;
196 let table_id = format!("{}:{}", table.sheet_name, table.name);
197
198 chunks
199 .into_iter()
200 .enumerate()
201 .map(|(i, text)| {
202 let char_count = text.len();
203 let idx = chunk_index_start + i;
204
205 if total_parts == 1 {
206 StructuredChunk::table(text, idx, &table_id, 0, char_count)
207 } else {
208 StructuredChunk::table_continuation(
209 text,
210 idx,
211 &table_id,
212 (i + 1) as u32,
213 total_parts,
214 &fixed_prefix,
215 0,
216 char_count,
217 )
218 }
219 })
220 .collect()
221}
222
223#[must_use]
225pub fn chunk_workbook(
226 grids: &[SheetGrid],
227 tables: &[DetectedTable],
228 metadata: &OoxmlMetadata,
229 options: &XlsxChunkingOptions,
230) -> ChunkingResult {
231 let mut result = ChunkingResult::empty();
232 let mut chunk_index = 0;
233
234 for table in tables {
235 let Some(grid) = grids.iter().find(|g| g.sheet_name == table.sheet_name) else {
237 result.warn(format!(
238 "No grid found for sheet '{}', skipping table '{}'",
239 table.sheet_name, table.name
240 ));
241 continue;
242 };
243
244 let table_chunks = chunk_table(grid, table, metadata, options, chunk_index);
245
246 if table_chunks.len() > 1 {
247 result.tables_split += 1;
248 }
249 result.tables_processed += 1;
250 chunk_index += table_chunks.len();
251 result.chunks.extend(table_chunks);
252
253 if result.chunks.len() >= options.max_chunks {
255 result.warn(format!(
256 "Hit max chunk limit ({}) — remaining tables skipped",
257 options.max_chunks
258 ));
259 result.chunks.truncate(options.max_chunks);
260 break;
261 }
262 }
263
264 result
265}
266
267#[must_use]
269pub fn generate_flat_text(
270 grids: &[SheetGrid],
271 tables: &[DetectedTable],
272 metadata: &OoxmlMetadata,
273) -> String {
274 let mut out = String::new();
275
276 for table in tables {
277 let grid = match grids.iter().find(|g| g.sheet_name == table.sheet_name) {
278 Some(g) => g,
279 None => continue,
280 };
281
282 if !out.is_empty() {
283 out.push('\n');
284 }
285 out.push_str(&format!("Sheet: {}\n", table.sheet_name));
286
287 if !table.headers.is_empty() {
289 let header_line = table
290 .headers
291 .iter()
292 .filter(|h| !h.is_empty())
293 .cloned()
294 .collect::<Vec<_>>()
295 .join(" | ");
296 if !header_line.is_empty() {
297 out.push_str(&header_line);
298 out.push('\n');
299 }
300 }
301
302 for row_idx in table.first_data_row..=table.last_data_row {
304 let line = format_row_with_headers(
305 grid,
306 row_idx,
307 &table.headers,
308 table.first_col,
309 table.last_col,
310 metadata,
311 );
312 if !line.is_empty() {
313 out.push_str(&line);
314 out.push('\n');
315 }
316 }
317 }
318
319 out.trim().to_string()
320}
321
322#[cfg(test)]
323mod tests {
324 use super::*;
325 use crate::reader::xlsx_table_detect::SheetGrid;
326 use crate::types::structure::ChunkType;
327
328 fn make_grid(data: Vec<Vec<CellValue>>, sheet_name: &str) -> SheetGrid {
329 let num_rows = data.len() as u32;
330 let num_cols = data.iter().map(|r| r.len()).max().unwrap_or(0) as u32;
331 SheetGrid {
332 sheet_name: sheet_name.to_string(),
333 rows: data,
334 num_fmt_kinds: Vec::new(),
335 num_rows,
336 num_cols,
337 }
338 }
339
340 #[test]
341 fn test_format_cell_value_date() {
342 let metadata = OoxmlMetadata::default();
343 let cell = CellValue::Number(44927.0);
344 let result = format_cell_value(&cell, NumFmtKind::Date, &metadata);
345 assert_eq!(result, "2023-01-01");
346 }
347
348 #[test]
349 fn test_format_cell_value_percentage() {
350 let metadata = OoxmlMetadata::default();
351 let cell = CellValue::Number(0.153);
352 let result = format_cell_value(&cell, NumFmtKind::Percentage, &metadata);
353 assert_eq!(result, "15.3%");
354 }
355
356 #[test]
357 fn test_format_cell_value_currency() {
358 let metadata = OoxmlMetadata::default();
359 let cell = CellValue::Number(1234.56);
360 let result = format_cell_value(&cell, NumFmtKind::Currency, &metadata);
361 assert_eq!(result, "$1234.56");
362 }
363
364 #[test]
365 fn test_format_row_with_headers() {
366 let grid = make_grid(
367 vec![vec![
368 CellValue::Text("Alice".into()),
369 CellValue::Integer(30),
370 CellValue::Text("Austin".into()),
371 ]],
372 "Sheet1",
373 );
374 let metadata = OoxmlMetadata::default();
375 let headers = vec!["Name".to_string(), "Age".to_string(), "City".to_string()];
376
377 let result = format_row_with_headers(&grid, 0, &headers, 0, 2, &metadata);
378 assert_eq!(result, "Name: Alice | Age: 30 | City: Austin");
379 }
380
381 #[test]
382 fn test_format_row_skips_empty() {
383 let grid = make_grid(
384 vec![vec![
385 CellValue::Text("Alice".into()),
386 CellValue::Empty,
387 CellValue::Text("Austin".into()),
388 ]],
389 "Sheet1",
390 );
391 let metadata = OoxmlMetadata::default();
392 let headers = vec!["Name".to_string(), "Age".to_string(), "City".to_string()];
393
394 let result = format_row_with_headers(&grid, 0, &headers, 0, 2, &metadata);
395 assert_eq!(result, "Name: Alice | City: Austin");
396 }
397
398 #[test]
399 fn test_chunk_table_single_chunk() {
400 let grid = make_grid(
401 vec![
402 vec![
403 CellValue::Text("Name".into()),
404 CellValue::Text("Value".into()),
405 ],
406 vec![CellValue::Text("A".into()), CellValue::Integer(100)],
407 vec![CellValue::Text("B".into()), CellValue::Integer(200)],
408 ],
409 "Sheet1",
410 );
411 let metadata = OoxmlMetadata::default();
412 let table = DetectedTable {
413 name: "Revenue".to_string(),
414 sheet_name: "Sheet1".to_string(),
415 headers: vec!["Name".to_string(), "Value".to_string()],
416 column_types: vec![],
417 first_data_row: 1,
418 last_data_row: 2,
419 first_col: 0,
420 last_col: 1,
421 header_row: Some(0),
422 confidence: 0.7,
423 };
424
425 let options = XlsxChunkingOptions::default();
426 let chunks = chunk_table(&grid, &table, &metadata, &options, 0);
427
428 assert_eq!(chunks.len(), 1);
429 let text = &chunks[0].text;
430 assert!(text.contains("[Sheet: Sheet1] [Table: Revenue]"));
431 assert!(text.contains("Name | Value"));
432 assert!(text.contains("Name: A | Value: 100"));
433 assert!(text.contains("Name: B | Value: 200"));
434 assert_eq!(chunks[0].chunk_type, ChunkType::Table);
435 }
436
437 #[test]
438 fn test_chunk_table_splits_large() {
439 let mut rows = vec![vec![
440 CellValue::Text("Col1".into()),
441 CellValue::Text("Col2".into()),
442 ]];
443 for i in 0..50 {
445 rows.push(vec![
446 CellValue::Text(format!("Row{i} long text that takes up space in the chunk")),
447 CellValue::Integer(i as i64 * 1000),
448 ]);
449 }
450
451 let grid = make_grid(rows, "Sheet1");
452 let metadata = OoxmlMetadata::default();
453 let table = DetectedTable {
454 name: "Data".to_string(),
455 sheet_name: "Sheet1".to_string(),
456 headers: vec!["Col1".to_string(), "Col2".to_string()],
457 column_types: vec![],
458 first_data_row: 1,
459 last_data_row: 50,
460 first_col: 0,
461 last_col: 1,
462 header_row: Some(0),
463 confidence: 0.7,
464 };
465
466 let options = XlsxChunkingOptions {
467 max_chars: 300,
468 max_chunks: 100,
469 };
470 let chunks = chunk_table(&grid, &table, &metadata, &options, 0);
471
472 assert!(chunks.len() > 1, "Should split into multiple chunks");
473 for chunk in &chunks {
475 assert!(chunk.text.contains("[Sheet: Sheet1]"));
476 assert!(chunk.text.contains("Col1 | Col2"));
477 assert_eq!(chunk.chunk_type, ChunkType::TableContinuation);
478 }
479 }
480
481 #[test]
482 fn test_generate_flat_text() {
483 let grid = make_grid(
484 vec![
485 vec![
486 CellValue::Text("Name".into()),
487 CellValue::Text("Score".into()),
488 ],
489 vec![CellValue::Text("Alice".into()), CellValue::Integer(95)],
490 ],
491 "Results",
492 );
493 let metadata = OoxmlMetadata::default();
494 let table = DetectedTable {
495 name: "Scores".to_string(),
496 sheet_name: "Results".to_string(),
497 headers: vec!["Name".to_string(), "Score".to_string()],
498 column_types: vec![],
499 first_data_row: 1,
500 last_data_row: 1,
501 first_col: 0,
502 last_col: 1,
503 header_row: Some(0),
504 confidence: 0.7,
505 };
506
507 let text = generate_flat_text(&[grid], &[table], &metadata);
508 assert!(text.contains("Sheet: Results"));
509 assert!(text.contains("Name | Score"));
510 assert!(text.contains("Name: Alice | Score: 95"));
511 }
512}