1use crate::analysis::{
2 classification,
3 formula::{FormulaAtlas, FormulaGraph},
4 style,
5};
6use crate::caps::BackendCaps;
7use crate::config::ServerConfig;
8use crate::model::{
9 NamedItemKind, NamedRangeDescriptor, SheetClassification, SheetOverviewResponse, SheetSummary,
10 WorkbookDescription, WorkbookId, WorkbookListResponse,
11};
12use crate::tools::filters::WorkbookFilter;
13use crate::utils::{
14 hash_path_metadata, make_short_workbook_id, path_to_forward_slashes, system_time_to_rfc3339,
15};
16use anyhow::{Context, Result, anyhow};
17use chrono::{DateTime, Utc};
18use parking_lot::RwLock;
19use std::cmp::Ordering;
20use std::collections::{HashMap, HashSet};
21use std::fs;
22use std::path::{Path, PathBuf};
23use std::sync::Arc;
24use std::time::Instant;
25use umya_spreadsheet::reader::xlsx;
26use umya_spreadsheet::{DefinedName, Spreadsheet, Worksheet};
27
28const KV_MAX_WIDTH_FOR_DENSITY_CHECK: u32 = 6;
29const KV_SAMPLE_ROWS: u32 = 20;
30const KV_DENSITY_THRESHOLD: f32 = 0.4;
31const KV_CHECK_ROWS: u32 = 15;
32const KV_MAX_LABEL_LEN: usize = 25;
33const KV_MIN_TEXT_VALUE_LEN: usize = 2;
34const KV_MIN_PAIRS: u32 = 3;
35const KV_MIN_PAIR_RATIO: f32 = 0.3;
36
37const HEADER_MAX_SCAN_ROWS: u32 = 2;
38const HEADER_LONG_STRING_PENALTY_THRESHOLD: usize = 40;
39const HEADER_LONG_STRING_PENALTY: f32 = 1.5;
40const HEADER_PROPER_NOUN_MIN_LEN: usize = 5;
41const HEADER_PROPER_NOUN_PENALTY: f32 = 1.0;
42const HEADER_DIGIT_STRING_MIN_LEN: usize = 3;
43const HEADER_DIGIT_STRING_PENALTY: f32 = 0.5;
44const HEADER_DATE_PENALTY: f32 = 1.0;
45const HEADER_YEAR_LIKE_BONUS: f32 = 0.5;
46const HEADER_YEAR_MIN: f64 = 1900.0;
47const HEADER_YEAR_MAX: f64 = 2100.0;
48const HEADER_UNIQUE_BONUS: f32 = 0.2;
49const HEADER_NUMBER_PENALTY: f32 = 0.3;
50const HEADER_SINGLE_COL_MIN_SCORE: f32 = 1.5;
51const HEADER_SCORE_TIE_THRESHOLD: f32 = 0.3;
52const HEADER_SECOND_ROW_MIN_SCORE_RATIO: f32 = 0.6;
53const HEADER_MAX_COLUMNS: u32 = 200;
54
55const DETECT_MAX_ROWS: u32 = 10_000;
56const DETECT_MAX_COLS: u32 = 500;
57const DETECT_MAX_AREA: u64 = 5_000_000;
58const DETECT_MAX_CELLS: usize = 200_000;
59const DETECT_MAX_LEAVES: usize = 200;
60const DETECT_MAX_DEPTH: u32 = 12;
61const DETECT_MAX_MS: u64 = 200;
62const DETECT_OUTLIER_FRACTION: f32 = 0.01;
63const DETECT_OUTLIER_MIN_CELLS: usize = 50;
64
65pub struct WorkbookContext {
66 pub id: WorkbookId,
67 pub short_id: String,
68 pub slug: String,
69 pub path: PathBuf,
70 pub caps: BackendCaps,
71 pub bytes: u64,
72 pub last_modified: Option<DateTime<Utc>>,
73 spreadsheet: Arc<RwLock<Spreadsheet>>,
74 sheet_cache: RwLock<HashMap<String, Arc<SheetCacheEntry>>>,
75 formula_atlas: Arc<FormulaAtlas>,
76}
77
78pub struct SheetCacheEntry {
79 pub metrics: SheetMetrics,
80 pub style_tags: Vec<String>,
81 pub named_ranges: Vec<NamedRangeDescriptor>,
82 detected_regions: RwLock<Option<Vec<crate::model::DetectedRegion>>>,
83 region_notes: RwLock<Vec<String>>,
84}
85
86#[derive(Debug, Clone)]
87pub struct SheetMetrics {
88 pub row_count: u32,
89 pub column_count: u32,
90 pub non_empty_cells: u32,
91 pub formula_cells: u32,
92 pub cached_values: u32,
93 pub comments: u32,
94 pub style_map: HashMap<String, StyleUsage>,
95 pub classification: SheetClassification,
96}
97
98#[derive(Debug, Clone)]
99pub struct StyleUsage {
100 pub occurrences: u32,
101 pub tags: Vec<String>,
102 pub example_cells: Vec<String>,
103}
104
105impl SheetCacheEntry {
106 pub fn detected_regions(&self) -> Vec<crate::model::DetectedRegion> {
107 self.detected_regions
108 .read()
109 .as_ref()
110 .cloned()
111 .unwrap_or_default()
112 }
113
114 pub fn region_notes(&self) -> Vec<String> {
115 self.region_notes.read().clone()
116 }
117
118 pub fn has_detected_regions(&self) -> bool {
119 self.detected_regions.read().is_some()
120 }
121
122 pub fn set_detected_regions(&self, regions: Vec<crate::model::DetectedRegion>) {
123 let mut guard = self.detected_regions.write();
124 if guard.is_none() {
125 *guard = Some(regions);
126 }
127 }
128
129 pub fn set_region_notes(&self, notes: Vec<String>) {
130 if notes.is_empty() {
131 return;
132 }
133 let mut guard = self.region_notes.write();
134 if guard.is_empty() {
135 *guard = notes;
136 }
137 }
138}
139
140impl WorkbookContext {
141 pub fn load(_config: &Arc<ServerConfig>, path: &Path) -> Result<Self> {
142 let metadata = fs::metadata(path)
143 .with_context(|| format!("unable to read metadata for {:?}", path))?;
144 let slug = path
145 .file_stem()
146 .map(|s| s.to_string_lossy().to_string())
147 .unwrap_or_else(|| "workbook".to_string());
148 let bytes = metadata.len();
149 let last_modified = metadata.modified().ok().and_then(system_time_to_rfc3339);
150 let id = WorkbookId(hash_path_metadata(path, &metadata));
151 let spreadsheet =
152 xlsx::read(path).with_context(|| format!("failed to parse workbook {:?}", path))?;
153 let short_id = make_short_workbook_id(&slug, id.as_str());
154
155 Ok(Self {
156 id,
157 short_id,
158 slug,
159 path: path.to_path_buf(),
160 caps: BackendCaps::xlsx(),
161 bytes,
162 last_modified,
163 spreadsheet: Arc::new(RwLock::new(spreadsheet)),
164 sheet_cache: RwLock::new(HashMap::new()),
165 formula_atlas: Arc::new(FormulaAtlas::default()),
166 })
167 }
168
169 pub fn sheet_names(&self) -> Vec<String> {
170 let book = self.spreadsheet.read();
171 book.get_sheet_collection()
172 .iter()
173 .map(|sheet| sheet.get_name().to_string())
174 .collect()
175 }
176
177 pub fn describe(&self) -> WorkbookDescription {
178 let book = self.spreadsheet.read();
179 let defined_names_count = book.get_defined_names().len();
180 let table_count: usize = book
181 .get_sheet_collection()
182 .iter()
183 .map(|sheet| sheet.get_tables().len())
184 .sum();
185 let macros_present = false;
186
187 WorkbookDescription {
188 workbook_id: self.id.clone(),
189 short_id: self.short_id.clone(),
190 slug: self.slug.clone(),
191 path: path_to_forward_slashes(&self.path),
192 bytes: self.bytes,
193 sheet_count: book.get_sheet_collection().len(),
194 defined_names: defined_names_count,
195 tables: table_count,
196 macros_present,
197 last_modified: self
198 .last_modified
199 .map(|dt| dt.to_rfc3339_opts(chrono::SecondsFormat::Secs, true)),
200 caps: self.caps.clone(),
201 }
202 }
203
204 pub fn get_sheet_metrics_fast(&self, sheet_name: &str) -> Result<Arc<SheetCacheEntry>> {
205 if let Some(entry) = self.sheet_cache.read().get(sheet_name) {
206 return Ok(entry.clone());
207 }
208
209 let mut writer = self.sheet_cache.write();
210 if let Some(entry) = writer.get(sheet_name) {
211 return Ok(entry.clone());
212 }
213
214 let book = self.spreadsheet.read();
215 let sheet = book
216 .get_sheet_by_name(sheet_name)
217 .ok_or_else(|| anyhow!("sheet {} not found", sheet_name))?;
218 let (metrics, style_tags) = compute_sheet_metrics(sheet);
219 let named_ranges = gather_named_ranges(sheet, book.get_defined_names());
220
221 let entry = Arc::new(SheetCacheEntry {
222 metrics,
223 style_tags,
224 named_ranges,
225 detected_regions: RwLock::new(None),
226 region_notes: RwLock::new(Vec::new()),
227 });
228
229 writer.insert(sheet_name.to_string(), entry.clone());
230 Ok(entry)
231 }
232
233 pub fn get_sheet_metrics(&self, sheet_name: &str) -> Result<Arc<SheetCacheEntry>> {
234 let entry = self.get_sheet_metrics_fast(sheet_name)?;
235 if entry.has_detected_regions() {
236 return Ok(entry);
237 }
238
239 let book = self.spreadsheet.read();
240 let sheet = book
241 .get_sheet_by_name(sheet_name)
242 .ok_or_else(|| anyhow!("sheet {} not found", sheet_name))?;
243 let detected = detect_regions(sheet, &entry.metrics);
244 entry.set_detected_regions(detected.regions);
245 entry.set_region_notes(detected.notes);
246 Ok(entry)
247 }
248
249 pub fn list_summaries(&self) -> Result<Vec<SheetSummary>> {
250 let book = self.spreadsheet.read();
251 let mut summaries = Vec::new();
252 for sheet in book.get_sheet_collection() {
253 let name = sheet.get_name().to_string();
254 let entry = self.get_sheet_metrics_fast(&name)?;
255 summaries.push(SheetSummary {
256 name: name.clone(),
257 visible: sheet.get_sheet_state() != "hidden",
258 row_count: entry.metrics.row_count,
259 column_count: entry.metrics.column_count,
260 non_empty_cells: entry.metrics.non_empty_cells,
261 formula_cells: entry.metrics.formula_cells,
262 cached_values: entry.metrics.cached_values,
263 classification: entry.metrics.classification.clone(),
264 style_tags: entry.style_tags.clone(),
265 });
266 }
267 Ok(summaries)
268 }
269
270 pub fn with_sheet<T, F>(&self, sheet_name: &str, func: F) -> Result<T>
271 where
272 F: FnOnce(&Worksheet) -> T,
273 {
274 let book = self.spreadsheet.read();
275 let sheet = book
276 .get_sheet_by_name(sheet_name)
277 .ok_or_else(|| anyhow!("sheet {} not found", sheet_name))?;
278 Ok(func(sheet))
279 }
280
281 pub fn with_spreadsheet<T, F>(&self, func: F) -> Result<T>
282 where
283 F: FnOnce(&Spreadsheet) -> T,
284 {
285 let book = self.spreadsheet.read();
286 Ok(func(&book))
287 }
288
289 pub fn formula_graph(&self, sheet_name: &str) -> Result<FormulaGraph> {
290 self.with_sheet(sheet_name, |sheet| {
291 FormulaGraph::build(sheet, &self.formula_atlas)
292 })?
293 }
294
295 pub fn named_items(&self) -> Result<Vec<NamedRangeDescriptor>> {
296 let book = self.spreadsheet.read();
297 let sheet_names: Vec<String> = book
298 .get_sheet_collection()
299 .iter()
300 .map(|sheet| sheet.get_name().to_string())
301 .collect();
302 let mut items = Vec::new();
303 for defined in book.get_defined_names() {
304 let refers_to = defined.get_address();
305 let scope = if defined.has_local_sheet_id() {
306 let idx = *defined.get_local_sheet_id() as usize;
307 sheet_names.get(idx).cloned()
308 } else {
309 None
310 };
311 let kind = if refers_to.starts_with('=') {
312 NamedItemKind::Formula
313 } else {
314 NamedItemKind::NamedRange
315 };
316
317 items.push(NamedRangeDescriptor {
318 name: defined.get_name().to_string(),
319 scope: scope.clone(),
320 refers_to: refers_to.clone(),
321 kind,
322 sheet_name: scope,
323 comment: None,
324 });
325 }
326
327 for sheet in book.get_sheet_collection() {
328 for table in sheet.get_tables() {
329 let start = table.get_area().0.get_coordinate();
330 let end = table.get_area().1.get_coordinate();
331 items.push(NamedRangeDescriptor {
332 name: table.get_name().to_string(),
333 scope: Some(sheet.get_name().to_string()),
334 refers_to: format!("{}:{}", start, end),
335 kind: NamedItemKind::Table,
336 sheet_name: Some(sheet.get_name().to_string()),
337 comment: None,
338 });
339 }
340 }
341
342 Ok(items)
343 }
344
345 pub fn sheet_overview(&self, sheet_name: &str) -> Result<SheetOverviewResponse> {
346 let entry = self.get_sheet_metrics(sheet_name)?;
347 let narrative = classification::narrative(&entry.metrics);
348 let regions = classification::regions(&entry.metrics);
349 let key_ranges = classification::key_ranges(&entry.metrics);
350 let detected_regions = entry.detected_regions();
351
352 Ok(SheetOverviewResponse {
353 workbook_id: self.id.clone(),
354 workbook_short_id: self.short_id.clone(),
355 sheet_name: sheet_name.to_string(),
356 narrative,
357 regions,
358 detected_regions: detected_regions.clone(),
359 detected_region_count: detected_regions.len() as u32,
360 detected_regions_truncated: false,
361 key_ranges,
362 formula_ratio: if entry.metrics.non_empty_cells == 0 {
363 0.0
364 } else {
365 entry.metrics.formula_cells as f32 / entry.metrics.non_empty_cells as f32
366 },
367 notable_features: entry.style_tags.clone(),
368 notes: entry.region_notes(),
369 })
370 }
371
372 pub fn detected_region(
373 &self,
374 sheet_name: &str,
375 id: u32,
376 ) -> Result<crate::model::DetectedRegion> {
377 let entry = self.get_sheet_metrics(sheet_name)?;
378 entry
379 .detected_regions()
380 .iter()
381 .find(|r| r.id == id)
382 .cloned()
383 .ok_or_else(|| anyhow!("region {} not found on sheet {}", id, sheet_name))
384 }
385}
386
387fn contains_date_time_token(format_code: &str) -> bool {
388 let mut in_quote = false;
389 let mut in_bracket = false;
390 let chars: Vec<char> = format_code.chars().collect();
391
392 for (i, &ch) in chars.iter().enumerate() {
393 match ch {
394 '"' => in_quote = !in_quote,
395 '[' if !in_quote => in_bracket = true,
396 ']' if !in_quote => in_bracket = false,
397 'y' | 'd' | 'h' | 's' | 'm' if !in_quote && !in_bracket => {
398 if ch == 'm' {
399 let prev = if i > 0 { chars.get(i - 1) } else { None };
400 let next = chars.get(i + 1);
401 let after_time_sep = prev == Some(&':') || prev == Some(&'h');
402 let before_time_sep = next == Some(&':') || next == Some(&'s');
403 if after_time_sep || before_time_sep {
404 return true;
405 }
406 if prev == Some(&'m') || next == Some(&'m') {
407 return true;
408 }
409 if matches!(prev, Some(&'/') | Some(&'-') | Some(&'.'))
410 || matches!(next, Some(&'/') | Some(&'-') | Some(&'.'))
411 {
412 return true;
413 }
414 } else {
415 return true;
416 }
417 }
418 _ => {}
419 }
420 }
421 false
422}
423
424const DATE_FORMAT_IDS: &[u32] = &[
425 14, 15, 16, 17, 18, 19, 20, 21, 22, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 45, 46, 47, 50, 51,
426 52, 53, 54, 55, 56, 57, 58,
427];
428
429const EXCEL_LEAP_YEAR_BUG_SERIAL: i64 = 60;
430
431fn is_date_formatted(cell: &umya_spreadsheet::Cell) -> bool {
432 let Some(nf) = cell.get_style().get_number_format() else {
433 return false;
434 };
435
436 let format_id = nf.get_number_format_id();
437 if DATE_FORMAT_IDS.contains(format_id) {
438 return true;
439 }
440
441 let code = nf.get_format_code();
442 if code == "General" || code == "@" || code == "0" || code == "0.00" {
443 return false;
444 }
445
446 contains_date_time_token(code)
447}
448
449pub fn excel_serial_to_iso(serial: f64, use_1904_system: bool) -> String {
450 excel_serial_to_iso_with_leap_bug(serial, use_1904_system, true)
451}
452
453pub fn excel_serial_to_iso_with_leap_bug(
454 serial: f64,
455 use_1904_system: bool,
456 compensate_leap_bug: bool,
457) -> String {
458 use chrono::NaiveDate;
459
460 let days = serial.trunc() as i64;
461
462 if use_1904_system {
463 let epoch_1904 = NaiveDate::from_ymd_opt(1904, 1, 1).unwrap();
464 return epoch_1904
465 .checked_add_signed(chrono::Duration::days(days))
466 .map(|d| d.format("%Y-%m-%d").to_string())
467 .unwrap_or_else(|| serial.to_string());
468 }
469
470 let epoch = if compensate_leap_bug && days >= EXCEL_LEAP_YEAR_BUG_SERIAL {
471 NaiveDate::from_ymd_opt(1899, 12, 30).unwrap()
472 } else {
473 NaiveDate::from_ymd_opt(1899, 12, 31).unwrap()
474 };
475
476 epoch
477 .checked_add_signed(chrono::Duration::days(days))
478 .map(|d| d.format("%Y-%m-%d").to_string())
479 .unwrap_or_else(|| serial.to_string())
480}
481
482pub fn cell_to_value(cell: &umya_spreadsheet::Cell) -> Option<crate::model::CellValue> {
483 cell_to_value_with_date_system(cell, false)
484}
485
486pub fn cell_to_value_with_date_system(
487 cell: &umya_spreadsheet::Cell,
488 use_1904_system: bool,
489) -> Option<crate::model::CellValue> {
490 let raw = cell.get_value();
491 if raw.is_empty() {
492 return None;
493 }
494 if let Ok(number) = raw.parse::<f64>() {
495 if is_date_formatted(cell) {
496 return Some(crate::model::CellValue::Date(excel_serial_to_iso(
497 number,
498 use_1904_system,
499 )));
500 }
501 return Some(crate::model::CellValue::Number(number));
502 }
503
504 let lower = raw.to_ascii_lowercase();
505 if lower == "true" {
506 return Some(crate::model::CellValue::Bool(true));
507 }
508 if lower == "false" {
509 return Some(crate::model::CellValue::Bool(false));
510 }
511
512 Some(crate::model::CellValue::Text(raw.to_string()))
513}
514
515pub fn compute_sheet_metrics(sheet: &Worksheet) -> (SheetMetrics, Vec<String>) {
516 use std::collections::HashMap as StdHashMap;
517 let mut non_empty = 0u32;
518 let mut formulas = 0u32;
519 let mut cached = 0u32;
520 let comments = sheet.get_comments().len() as u32;
521 let mut style_usage: StdHashMap<String, StyleUsage> = StdHashMap::new();
522
523 for cell in sheet.get_cell_collection() {
524 let value = cell.get_value();
525 if !value.is_empty() {
526 non_empty += 1;
527 }
528 if cell.is_formula() {
529 formulas += 1;
530 if !cell.get_value().is_empty() {
531 cached += 1;
532 }
533 }
534
535 if let Some((style_key, usage)) = style::tag_cell(cell) {
536 let entry = style_usage.entry(style_key).or_insert_with(|| StyleUsage {
537 occurrences: 0,
538 tags: usage.tags.clone(),
539 example_cells: Vec::new(),
540 });
541 entry.occurrences += 1;
542 if entry.example_cells.len() < 5 {
543 entry.example_cells.push(usage.example_cell.clone());
544 }
545 }
546 }
547
548 let (max_col, max_row) = sheet.get_highest_column_and_row();
549
550 let classification = classification::classify(
551 non_empty,
552 formulas,
553 max_row,
554 max_col,
555 comments,
556 &style_usage,
557 );
558
559 let style_tags: Vec<String> = style_usage
560 .values()
561 .flat_map(|usage| usage.tags.clone())
562 .collect();
563
564 let metrics = SheetMetrics {
565 row_count: max_row,
566 column_count: max_col,
567 non_empty_cells: non_empty,
568 formula_cells: formulas,
569 cached_values: cached,
570 comments,
571 style_map: style_usage,
572 classification,
573 };
574 (metrics, style_tags)
575}
576
577#[derive(Debug, Clone, Copy)]
578struct Rect {
579 start_row: u32,
580 end_row: u32,
581 start_col: u32,
582 end_col: u32,
583}
584
585#[derive(Debug, Clone)]
586struct CellInfo {
587 value: Option<crate::model::CellValue>,
588 is_formula: bool,
589}
590
591#[derive(Debug)]
592struct Occupancy {
593 cells: HashMap<(u32, u32), CellInfo>,
594 rows: HashMap<u32, Vec<u32>>,
595 cols: HashMap<u32, Vec<u32>>,
596 min_row: u32,
597 max_row: u32,
598 min_col: u32,
599 max_col: u32,
600}
601
602impl Occupancy {
603 fn bounds_rect(&self) -> Option<Rect> {
604 if self.cells.is_empty() {
605 None
606 } else {
607 Some(Rect {
608 start_row: self.min_row,
609 end_row: self.max_row,
610 start_col: self.min_col,
611 end_col: self.max_col,
612 })
613 }
614 }
615
616 fn dense_bounds(&self) -> Option<Rect> {
617 let bounds = self.bounds_rect()?;
618 let total_cells = self.cells.len();
619 if total_cells < DETECT_OUTLIER_MIN_CELLS {
620 return Some(bounds);
621 }
622 let trim_cells = ((total_cells as f32) * DETECT_OUTLIER_FRACTION).round() as usize;
623 if trim_cells == 0 || trim_cells * 2 >= total_cells {
624 return Some(bounds);
625 }
626
627 let mut row_counts: Vec<(u32, usize)> = self
628 .rows
629 .iter()
630 .map(|(row, cols)| (*row, cols.len()))
631 .collect();
632 row_counts.sort_by_key(|(row, _)| *row);
633
634 let mut col_counts: Vec<(u32, usize)> = self
635 .cols
636 .iter()
637 .map(|(col, rows)| (*col, rows.len()))
638 .collect();
639 col_counts.sort_by_key(|(col, _)| *col);
640
641 let (start_row, end_row) =
642 trim_bounds_by_cells(&row_counts, trim_cells, bounds.start_row, bounds.end_row);
643 let (start_col, end_col) =
644 trim_bounds_by_cells(&col_counts, trim_cells, bounds.start_col, bounds.end_col);
645
646 if start_row > end_row || start_col > end_col {
647 return Some(bounds);
648 }
649
650 Some(Rect {
651 start_row,
652 end_row,
653 start_col,
654 end_col,
655 })
656 }
657
658 fn row_col_counts(&self, rect: &Rect) -> (Vec<u32>, Vec<u32>) {
659 let height = (rect.end_row - rect.start_row + 1) as usize;
660 let width = (rect.end_col - rect.start_col + 1) as usize;
661 let mut row_counts = vec![0u32; height];
662 let mut col_counts = vec![0u32; width];
663
664 for (row, cols) in &self.rows {
665 if *row < rect.start_row || *row > rect.end_row {
666 continue;
667 }
668 let count = count_in_sorted_range(cols, rect.start_col, rect.end_col);
669 row_counts[(row - rect.start_row) as usize] = count;
670 }
671 for (col, rows) in &self.cols {
672 if *col < rect.start_col || *col > rect.end_col {
673 continue;
674 }
675 let count = count_in_sorted_range(rows, rect.start_row, rect.end_row);
676 col_counts[(col - rect.start_col) as usize] = count;
677 }
678 (row_counts, col_counts)
679 }
680
681 fn stats_in_rect(&self, rect: &Rect) -> RegionStats {
682 let mut stats = RegionStats::default();
683 for (row, cols) in &self.rows {
684 if *row < rect.start_row || *row > rect.end_row {
685 continue;
686 }
687 let start_idx = lower_bound(cols, rect.start_col);
688 let end_idx = upper_bound(cols, rect.end_col);
689 for col in &cols[start_idx..end_idx] {
690 if let Some(info) = self.cells.get(&(*row, *col)) {
691 stats.non_empty += 1;
692 if info.is_formula {
693 stats.formulas += 1;
694 }
695 if let Some(val) = &info.value {
696 match val {
697 crate::model::CellValue::Text(_) => stats.text += 1,
698 crate::model::CellValue::Number(_) => stats.numbers += 1,
699 crate::model::CellValue::Bool(_) => stats.bools += 1,
700 crate::model::CellValue::Date(_) => stats.dates += 1,
701 crate::model::CellValue::Error(_) => stats.errors += 1,
702 }
703 }
704 }
705 }
706 }
707 stats
708 }
709
710 fn value_at(&self, row: u32, col: u32) -> Option<&crate::model::CellValue> {
711 self.cells.get(&(row, col)).and_then(|c| c.value.as_ref())
712 }
713}
714
715fn lower_bound(values: &[u32], target: u32) -> usize {
716 let mut left = 0;
717 let mut right = values.len();
718 while left < right {
719 let mid = (left + right) / 2;
720 if values[mid] < target {
721 left = mid + 1;
722 } else {
723 right = mid;
724 }
725 }
726 left
727}
728
729fn upper_bound(values: &[u32], target: u32) -> usize {
730 let mut left = 0;
731 let mut right = values.len();
732 while left < right {
733 let mid = (left + right) / 2;
734 if values[mid] <= target {
735 left = mid + 1;
736 } else {
737 right = mid;
738 }
739 }
740 left
741}
742
743fn count_in_sorted_range(values: &[u32], start: u32, end: u32) -> u32 {
744 if values.is_empty() {
745 return 0;
746 }
747 let start_idx = lower_bound(values, start);
748 let end_idx = upper_bound(values, end);
749 end_idx.saturating_sub(start_idx) as u32
750}
751
752fn trim_bounds_by_cells(
753 entries: &[(u32, usize)],
754 trim_cells: usize,
755 default_start: u32,
756 default_end: u32,
757) -> (u32, u32) {
758 if entries.is_empty() {
759 return (default_start, default_end);
760 }
761
762 let mut remaining = trim_cells;
763 let mut start_idx = 0usize;
764 while start_idx < entries.len() {
765 let count = entries[start_idx].1;
766 if remaining < count {
767 break;
768 }
769 remaining -= count;
770 start_idx += 1;
771 }
772
773 let mut remaining = trim_cells;
774 let mut end_idx = entries.len();
775 while end_idx > 0 {
776 let count = entries[end_idx - 1].1;
777 if remaining < count {
778 break;
779 }
780 remaining -= count;
781 end_idx -= 1;
782 }
783
784 let start = entries
785 .get(start_idx)
786 .map(|(idx, _)| *idx)
787 .unwrap_or(default_start);
788 let end = if end_idx == 0 {
789 default_end
790 } else {
791 entries
792 .get(end_idx - 1)
793 .map(|(idx, _)| *idx)
794 .unwrap_or(default_end)
795 };
796 (start, end)
797}
798
799#[derive(Debug, Default, Clone)]
800struct RegionStats {
801 non_empty: u32,
802 formulas: u32,
803 text: u32,
804 numbers: u32,
805 bools: u32,
806 dates: u32,
807 errors: u32,
808}
809
810#[derive(Debug, Clone, Copy, PartialEq, Eq)]
811enum Gutter {
812 Row { start: u32, end: u32 },
813 Col { start: u32, end: u32 },
814}
815
816#[derive(Debug, Default)]
817struct DetectRegionsResult {
818 regions: Vec<crate::model::DetectedRegion>,
819 notes: Vec<String>,
820}
821
822#[derive(Debug)]
823struct DetectLimits {
824 start: Instant,
825 max_ms: u64,
826 max_leaves: usize,
827 max_depth: u32,
828 leaves: usize,
829 exceeded_time: bool,
830 exceeded_leaves: bool,
831}
832
833impl DetectLimits {
834 fn new() -> Self {
835 Self {
836 start: Instant::now(),
837 max_ms: DETECT_MAX_MS,
838 max_leaves: DETECT_MAX_LEAVES,
839 max_depth: DETECT_MAX_DEPTH,
840 leaves: 0,
841 exceeded_time: false,
842 exceeded_leaves: false,
843 }
844 }
845
846 fn should_stop(&mut self) -> bool {
847 if !self.exceeded_time && self.start.elapsed().as_millis() as u64 >= self.max_ms {
848 self.exceeded_time = true;
849 }
850 self.exceeded_time || self.exceeded_leaves
851 }
852
853 fn note_leaf(&mut self) {
854 self.leaves += 1;
855 if self.leaves >= self.max_leaves {
856 self.exceeded_leaves = true;
857 }
858 }
859}
860
861fn detect_regions(sheet: &Worksheet, metrics: &SheetMetrics) -> DetectRegionsResult {
862 if metrics.row_count == 0 || metrics.column_count == 0 {
863 return DetectRegionsResult::default();
864 }
865
866 let occupancy = build_occupancy(sheet);
867 if occupancy.cells.is_empty() {
868 return DetectRegionsResult::default();
869 }
870
871 let area = (metrics.row_count as u64) * (metrics.column_count as u64);
872 let exceeds_caps = metrics.row_count > DETECT_MAX_ROWS
873 || metrics.column_count > DETECT_MAX_COLS
874 || area > DETECT_MAX_AREA
875 || occupancy.cells.len() > DETECT_MAX_CELLS;
876
877 if exceeds_caps {
878 let mut result = DetectRegionsResult::default();
879 if let Some(bounds) = occupancy.dense_bounds() {
880 result.regions.push(build_fallback_region(&bounds, metrics));
881 }
882 result.notes.push(format!(
883 "Region detection capped: rows {}, cols {}, occupied {}.",
884 metrics.row_count,
885 metrics.column_count,
886 occupancy.cells.len()
887 ));
888 return result;
889 }
890
891 let root = occupancy.bounds_rect().unwrap_or(Rect {
892 start_row: 1,
893 end_row: metrics.row_count.max(1),
894 start_col: 1,
895 end_col: metrics.column_count.max(1),
896 });
897
898 let mut leaves = Vec::new();
899 let mut limits = DetectLimits::new();
900 split_rect(&occupancy, &root, 0, &mut limits, &mut leaves);
901
902 let mut regions = Vec::new();
903 for (idx, rect) in leaves.into_iter().enumerate() {
904 if limits.should_stop() {
905 break;
906 }
907 if let Some(trimmed) = trim_rect(&occupancy, rect, &mut limits) {
908 let region = build_region(&occupancy, &trimmed, metrics, idx as u32);
909 regions.push(region);
910 }
911 }
912
913 let mut notes = Vec::new();
914 if limits.exceeded_time || limits.exceeded_leaves {
915 notes.push("Region detection truncated due to time/complexity caps.".to_string());
916 }
917 if regions.is_empty() {
918 if let Some(bounds) = occupancy.dense_bounds() {
919 regions.push(build_fallback_region(&bounds, metrics));
920 notes.push("Region detection returned no regions; fallback bounds used.".to_string());
921 }
922 }
923
924 DetectRegionsResult { regions, notes }
925}
926
927fn build_fallback_region(rect: &Rect, metrics: &SheetMetrics) -> crate::model::DetectedRegion {
928 let kind = match metrics.classification {
929 SheetClassification::Calculator => crate::model::RegionKind::Calculator,
930 SheetClassification::Metadata => crate::model::RegionKind::Metadata,
931 _ => crate::model::RegionKind::Data,
932 };
933 let end_col = crate::utils::column_number_to_name(rect.end_col.max(1));
934 let end_cell = format!("{}{}", end_col, rect.end_row.max(1));
935 let header_count = rect.end_col - rect.start_col + 1;
936 crate::model::DetectedRegion {
937 id: 0,
938 bounds: format!(
939 "{}{}:{}",
940 crate::utils::column_number_to_name(rect.start_col),
941 rect.start_row,
942 end_cell
943 ),
944 header_row: None,
945 headers: Vec::new(),
946 header_count,
947 headers_truncated: header_count > 0,
948 row_count: rect.end_row - rect.start_row + 1,
949 classification: kind.clone(),
950 region_kind: Some(kind),
951 confidence: 0.2,
952 }
953}
954
955fn build_occupancy(sheet: &Worksheet) -> Occupancy {
956 let mut cells = HashMap::new();
957 let mut rows: HashMap<u32, Vec<u32>> = HashMap::new();
958 let mut cols: HashMap<u32, Vec<u32>> = HashMap::new();
959 let mut min_row = u32::MAX;
960 let mut max_row = 0u32;
961 let mut min_col = u32::MAX;
962 let mut max_col = 0u32;
963
964 for cell in sheet.get_cell_collection() {
965 let coord = cell.get_coordinate();
966 let row = *coord.get_row_num();
967 let col = *coord.get_col_num();
968 let value = cell_to_value(cell);
969 let is_formula = cell.is_formula();
970 cells.insert((row, col), CellInfo { value, is_formula });
971 rows.entry(row).or_default().push(col);
972 cols.entry(col).or_default().push(row);
973 min_row = min_row.min(row);
974 max_row = max_row.max(row);
975 min_col = min_col.min(col);
976 max_col = max_col.max(col);
977 }
978
979 for cols in rows.values_mut() {
980 cols.sort_unstable();
981 }
982 for rows in cols.values_mut() {
983 rows.sort_unstable();
984 }
985
986 if cells.is_empty() {
987 min_row = 0;
988 min_col = 0;
989 }
990
991 Occupancy {
992 cells,
993 rows,
994 cols,
995 min_row,
996 max_row,
997 min_col,
998 max_col,
999 }
1000}
1001
1002fn split_rect(
1003 occupancy: &Occupancy,
1004 rect: &Rect,
1005 depth: u32,
1006 limits: &mut DetectLimits,
1007 leaves: &mut Vec<Rect>,
1008) {
1009 if limits.should_stop() || depth >= limits.max_depth {
1010 limits.note_leaf();
1011 leaves.push(*rect);
1012 return;
1013 }
1014 if rect.start_row >= rect.end_row && rect.start_col >= rect.end_col {
1015 limits.note_leaf();
1016 leaves.push(*rect);
1017 return;
1018 }
1019 if let Some(gutter) = find_best_gutter(occupancy, rect, limits) {
1020 match gutter {
1021 Gutter::Row { start, end } => {
1022 if start > rect.start_row {
1023 let upper = Rect {
1024 start_row: rect.start_row,
1025 end_row: start - 1,
1026 start_col: rect.start_col,
1027 end_col: rect.end_col,
1028 };
1029 split_rect(occupancy, &upper, depth + 1, limits, leaves);
1030 }
1031 if end < rect.end_row {
1032 let lower = Rect {
1033 start_row: end + 1,
1034 end_row: rect.end_row,
1035 start_col: rect.start_col,
1036 end_col: rect.end_col,
1037 };
1038 split_rect(occupancy, &lower, depth + 1, limits, leaves);
1039 }
1040 }
1041 Gutter::Col { start, end } => {
1042 if start > rect.start_col {
1043 let left = Rect {
1044 start_row: rect.start_row,
1045 end_row: rect.end_row,
1046 start_col: rect.start_col,
1047 end_col: start - 1,
1048 };
1049 split_rect(occupancy, &left, depth + 1, limits, leaves);
1050 }
1051 if end < rect.end_col {
1052 let right = Rect {
1053 start_row: rect.start_row,
1054 end_row: rect.end_row,
1055 start_col: end + 1,
1056 end_col: rect.end_col,
1057 };
1058 split_rect(occupancy, &right, depth + 1, limits, leaves);
1059 }
1060 }
1061 }
1062 return;
1063 }
1064 limits.note_leaf();
1065 leaves.push(*rect);
1066}
1067
1068fn find_best_gutter(
1069 occupancy: &Occupancy,
1070 rect: &Rect,
1071 limits: &mut DetectLimits,
1072) -> Option<Gutter> {
1073 if limits.should_stop() {
1074 return None;
1075 }
1076 let (row_counts, col_counts) = occupancy.row_col_counts(rect);
1077 let width = rect.end_col - rect.start_col + 1;
1078 let height = rect.end_row - rect.start_row + 1;
1079
1080 let row_blank_runs = find_blank_runs(&row_counts, width);
1081 let col_blank_runs = find_blank_runs(&col_counts, height);
1082
1083 let mut best: Option<(Gutter, u32)> = None;
1084
1085 if let Some((start, end, len)) = row_blank_runs {
1086 let gutter = Gutter::Row {
1087 start: rect.start_row + start,
1088 end: rect.start_row + end,
1089 };
1090 best = Some((gutter, len));
1091 }
1092 if let Some((start, end, len)) = col_blank_runs {
1093 let gutter = Gutter::Col {
1094 start: rect.start_col + start,
1095 end: rect.start_col + end,
1096 };
1097 if best.map(|(_, l)| len > l).unwrap_or(true) {
1098 best = Some((gutter, len));
1099 }
1100 }
1101
1102 best.map(|(g, _)| g)
1103}
1104
1105fn find_blank_runs(counts: &[u32], span: u32) -> Option<(u32, u32, u32)> {
1106 if counts.is_empty() {
1107 return None;
1108 }
1109 let mut best_start = 0;
1110 let mut best_end = 0;
1111 let mut best_len = 0;
1112 let mut current_start = None;
1113 for (idx, count) in counts.iter().enumerate() {
1114 let is_blank = *count == 0 || (*count as f32 / span as f32) < 0.05;
1115 if is_blank {
1116 if current_start.is_none() {
1117 current_start = Some(idx as u32);
1118 }
1119 } else if let Some(start) = current_start.take() {
1120 let end = idx as u32 - 1;
1121 let len = end - start + 1;
1122 if len > best_len && start > 0 && end + 1 < counts.len() as u32 {
1123 best_len = len;
1124 best_start = start;
1125 best_end = end;
1126 }
1127 }
1128 }
1129 if let Some(start) = current_start {
1130 let end = counts.len() as u32 - 1;
1131 let len = end - start + 1;
1132 if len > best_len && start > 0 && end + 1 < counts.len() as u32 {
1133 best_len = len;
1134 best_start = start;
1135 best_end = end;
1136 }
1137 }
1138 if best_len >= 2 {
1139 Some((best_start, best_end, best_len))
1140 } else {
1141 None
1142 }
1143}
1144
1145fn trim_rect(occupancy: &Occupancy, rect: Rect, limits: &mut DetectLimits) -> Option<Rect> {
1146 let mut r = rect;
1147 loop {
1148 if limits.should_stop() {
1149 return Some(r);
1150 }
1151 let (row_counts, col_counts) = occupancy.row_col_counts(&r);
1152 let width = r.end_col - r.start_col + 1;
1153 let height = r.end_row - r.start_row + 1;
1154 let top_blank = row_counts
1155 .first()
1156 .map(|c| *c == 0 || (*c as f32 / width as f32) < 0.1)
1157 .unwrap_or(false);
1158 let bottom_blank = row_counts
1159 .last()
1160 .map(|c| *c == 0 || (*c as f32 / width as f32) < 0.1)
1161 .unwrap_or(false);
1162 let left_blank = col_counts
1163 .first()
1164 .map(|c| *c == 0 || (*c as f32 / height as f32) < 0.1)
1165 .unwrap_or(false);
1166 let right_blank = col_counts
1167 .last()
1168 .map(|c| *c == 0 || (*c as f32 / height as f32) < 0.1)
1169 .unwrap_or(false);
1170
1171 let mut changed = false;
1172 if top_blank && r.start_row < r.end_row {
1173 r.start_row += 1;
1174 changed = true;
1175 }
1176 if bottom_blank && r.end_row > r.start_row {
1177 r.end_row -= 1;
1178 changed = true;
1179 }
1180 if left_blank && r.start_col < r.end_col {
1181 r.start_col += 1;
1182 changed = true;
1183 }
1184 if right_blank && r.end_col > r.start_col {
1185 r.end_col -= 1;
1186 changed = true;
1187 }
1188
1189 if !changed {
1190 break;
1191 }
1192 if r.start_row > r.end_row || r.start_col > r.end_col {
1193 return None;
1194 }
1195 }
1196 Some(r)
1197}
1198
1199fn build_region(
1200 occupancy: &Occupancy,
1201 rect: &Rect,
1202 metrics: &SheetMetrics,
1203 id: u32,
1204) -> crate::model::DetectedRegion {
1205 let header_info = detect_headers(occupancy, rect);
1206 let stats = occupancy.stats_in_rect(rect);
1207 let (kind, confidence) = classify_region(rect, &stats, &header_info, metrics);
1208 let header_len = header_info.headers.len() as u32;
1209 let header_count = rect.end_col - rect.start_col + 1;
1210 let headers_truncated = header_len != header_count;
1211 crate::model::DetectedRegion {
1212 id,
1213 bounds: format!(
1214 "{}{}:{}{}",
1215 crate::utils::column_number_to_name(rect.start_col),
1216 rect.start_row,
1217 crate::utils::column_number_to_name(rect.end_col),
1218 rect.end_row
1219 ),
1220 header_row: header_info.header_row,
1221 headers: header_info.headers,
1222 header_count,
1223 headers_truncated,
1224 row_count: rect.end_row - rect.start_row + 1,
1225 classification: kind.clone(),
1226 region_kind: Some(kind),
1227 confidence,
1228 }
1229}
1230
1231#[derive(Debug, Default)]
1232struct HeaderInfo {
1233 header_row: Option<u32>,
1234 headers: Vec<String>,
1235 is_key_value: bool,
1236}
1237
1238fn is_key_value_layout(occupancy: &Occupancy, rect: &Rect) -> bool {
1239 let width = rect.end_col - rect.start_col + 1;
1240
1241 if width == 2 {
1242 return check_key_value_columns(occupancy, rect, rect.start_col, rect.start_col + 1);
1243 }
1244
1245 if width <= KV_MAX_WIDTH_FOR_DENSITY_CHECK {
1246 let rows_to_sample = (rect.end_row - rect.start_row + 1).min(KV_SAMPLE_ROWS);
1247 let density_threshold = (rows_to_sample as f32 * KV_DENSITY_THRESHOLD) as u32;
1248
1249 let mut col_densities: Vec<(u32, u32)> = Vec::new();
1250 for col in rect.start_col..=rect.end_col {
1251 let count = (rect.start_row..rect.start_row + rows_to_sample)
1252 .filter(|&row| occupancy.value_at(row, col).is_some())
1253 .count() as u32;
1254 if count >= density_threshold {
1255 col_densities.push((col, count));
1256 }
1257 }
1258
1259 if col_densities.len() == 2 {
1260 let label_col = col_densities[0].0;
1261 let value_col = col_densities[1].0;
1262 return check_key_value_columns(occupancy, rect, label_col, value_col);
1263 } else if col_densities.len() == 4 && width >= 4 {
1264 let pair1 =
1265 check_key_value_columns(occupancy, rect, col_densities[0].0, col_densities[1].0);
1266 let pair2 =
1267 check_key_value_columns(occupancy, rect, col_densities[2].0, col_densities[3].0);
1268 return pair1 && pair2;
1269 }
1270 }
1271
1272 false
1273}
1274
1275fn check_key_value_columns(
1276 occupancy: &Occupancy,
1277 rect: &Rect,
1278 label_col: u32,
1279 value_col: u32,
1280) -> bool {
1281 let mut label_value_pairs = 0u32;
1282 let rows_to_check = (rect.end_row - rect.start_row + 1).min(KV_CHECK_ROWS);
1283
1284 for row in rect.start_row..rect.start_row + rows_to_check {
1285 let first_col = occupancy.value_at(row, label_col);
1286 let second_col = occupancy.value_at(row, value_col);
1287
1288 if let (Some(crate::model::CellValue::Text(label)), Some(val)) = (first_col, second_col) {
1289 let label_looks_like_key = label.len() <= KV_MAX_LABEL_LEN
1290 && !label.chars().any(|c| c.is_ascii_digit())
1291 && label.contains(|c: char| c.is_alphabetic());
1292
1293 let value_is_data = matches!(
1294 val,
1295 crate::model::CellValue::Number(_) | crate::model::CellValue::Date(_)
1296 ) || matches!(val, crate::model::CellValue::Text(s) if s.len() > KV_MIN_TEXT_VALUE_LEN);
1297
1298 if label_looks_like_key && value_is_data {
1299 label_value_pairs += 1;
1300 }
1301 }
1302 }
1303
1304 label_value_pairs >= KV_MIN_PAIRS
1305 && label_value_pairs as f32 / rows_to_check as f32 >= KV_MIN_PAIR_RATIO
1306}
1307
1308fn header_data_penalty(s: &str) -> f32 {
1309 if s.is_empty() {
1310 return 0.0;
1311 }
1312 if s.len() > HEADER_LONG_STRING_PENALTY_THRESHOLD {
1313 return HEADER_LONG_STRING_PENALTY;
1314 }
1315 let first_char = s.chars().next().unwrap();
1316 let is_capitalized = first_char.is_uppercase();
1317 let has_lowercase = s.chars().skip(1).any(|c| c.is_lowercase());
1318 let is_all_caps = s.chars().all(|c| !c.is_alphabetic() || c.is_uppercase());
1319 let has_digits = s.chars().any(|c| c.is_ascii_digit());
1320 let is_proper_noun =
1321 is_capitalized && has_lowercase && !is_all_caps && s.len() > HEADER_PROPER_NOUN_MIN_LEN;
1322
1323 let mut penalty = 0.0;
1324 if is_proper_noun {
1325 penalty += HEADER_PROPER_NOUN_PENALTY;
1326 }
1327 if has_digits && s.len() > HEADER_DIGIT_STRING_MIN_LEN {
1328 penalty += HEADER_DIGIT_STRING_PENALTY;
1329 }
1330 penalty
1331}
1332
1333fn detect_headers(occupancy: &Occupancy, rect: &Rect) -> HeaderInfo {
1334 if is_key_value_layout(occupancy, rect) {
1335 let mut headers = Vec::new();
1336 for col in rect.start_col..=rect.end_col {
1337 headers.push(crate::utils::column_number_to_name(col));
1338 }
1339 return HeaderInfo {
1340 header_row: None,
1341 headers,
1342 is_key_value: true,
1343 };
1344 }
1345
1346 let width = rect.end_col - rect.start_col + 1;
1347 if width > HEADER_MAX_COLUMNS {
1348 return HeaderInfo {
1349 header_row: None,
1350 headers: Vec::new(),
1351 is_key_value: false,
1352 };
1353 }
1354
1355 let mut candidates = Vec::new();
1356 let max_row = rect
1357 .start_row
1358 .saturating_add(HEADER_MAX_SCAN_ROWS)
1359 .min(rect.end_row);
1360 for row in rect.start_row..=max_row {
1361 let mut text = 0;
1362 let mut numbers = 0;
1363 let mut non_empty = 0;
1364 let mut unique = HashSet::new();
1365 let mut data_like_penalty: f32 = 0.0;
1366 let mut year_like_bonus: f32 = 0.0;
1367
1368 for col in rect.start_col..=rect.end_col {
1369 if let Some(val) = occupancy.value_at(row, col) {
1370 non_empty += 1;
1371 match val {
1372 crate::model::CellValue::Text(s) => {
1373 text += 1;
1374 unique.insert(s.clone());
1375 data_like_penalty += header_data_penalty(s);
1376 }
1377 crate::model::CellValue::Number(n) => {
1378 if *n >= HEADER_YEAR_MIN && *n <= HEADER_YEAR_MAX && n.fract() == 0.0 {
1379 year_like_bonus += HEADER_YEAR_LIKE_BONUS;
1380 text += 1;
1381 } else {
1382 numbers += 1;
1383 }
1384 }
1385 crate::model::CellValue::Bool(_) => text += 1,
1386 crate::model::CellValue::Date(_) => {
1387 data_like_penalty += HEADER_DATE_PENALTY;
1388 }
1389 crate::model::CellValue::Error(_) => {}
1390 }
1391 }
1392 }
1393 if non_empty == 0 {
1394 continue;
1395 }
1396 let score = text as f32 + unique.len() as f32 * HEADER_UNIQUE_BONUS
1397 - numbers as f32 * HEADER_NUMBER_PENALTY
1398 - data_like_penalty
1399 + year_like_bonus;
1400 candidates.push((row, score, text, non_empty));
1401 }
1402
1403 let is_single_col = rect.start_col == rect.end_col;
1404
1405 let header_candidates: Vec<&(u32, f32, u32, u32)> = candidates
1406 .iter()
1407 .filter(|(_, score, text, non_empty)| {
1408 *text >= 1
1409 && *text * 2 >= *non_empty
1410 && (!is_single_col || *score > HEADER_SINGLE_COL_MIN_SCORE)
1411 })
1412 .collect();
1413
1414 let best = header_candidates.iter().copied().max_by(|a, b| {
1415 a.1.partial_cmp(&b.1)
1416 .unwrap_or(Ordering::Equal)
1417 .then_with(|| b.0.cmp(&a.0))
1418 });
1419 let earliest = header_candidates
1420 .iter()
1421 .copied()
1422 .min_by(|a, b| a.0.cmp(&b.0));
1423
1424 let maybe_header = match (best, earliest) {
1425 (Some(best_row), Some(early_row)) => {
1426 if (best_row.1 - early_row.1).abs() <= HEADER_SCORE_TIE_THRESHOLD {
1427 Some(early_row.0)
1428 } else {
1429 Some(best_row.0)
1430 }
1431 }
1432 (Some(best_row), None) => Some(best_row.0),
1433 _ => None,
1434 };
1435
1436 let mut header_rows = Vec::new();
1437 if let Some(hr) = maybe_header {
1438 header_rows.push(hr);
1439 if hr < rect.end_row
1440 && let Some((_, score_next, text_next, non_empty_next)) =
1441 candidates.iter().find(|(r, _, _, _)| *r == hr + 1)
1442 && *text_next >= 1
1443 && *text_next * 2 >= *non_empty_next
1444 && *score_next
1445 >= HEADER_SECOND_ROW_MIN_SCORE_RATIO
1446 * candidates
1447 .iter()
1448 .find(|(r, _, _, _)| *r == hr)
1449 .map(|c| c.1)
1450 .unwrap_or(0.0)
1451 {
1452 header_rows.push(hr + 1);
1453 }
1454 }
1455
1456 let mut headers = Vec::new();
1457 for col in rect.start_col..=rect.end_col {
1458 let mut parts = Vec::new();
1459 for hr in &header_rows {
1460 if let Some(val) = occupancy.value_at(*hr, col) {
1461 match val {
1462 crate::model::CellValue::Text(s) if !s.trim().is_empty() => {
1463 parts.push(s.trim().to_string())
1464 }
1465 crate::model::CellValue::Number(n) => parts.push(n.to_string()),
1466 crate::model::CellValue::Bool(b) => parts.push(b.to_string()),
1467 crate::model::CellValue::Date(d) => parts.push(d.clone()),
1468 crate::model::CellValue::Error(e) => parts.push(e.clone()),
1469 _ => {}
1470 }
1471 }
1472 }
1473 if parts.is_empty() {
1474 headers.push(crate::utils::column_number_to_name(col));
1475 } else {
1476 headers.push(parts.join(" / "));
1477 }
1478 }
1479
1480 HeaderInfo {
1481 header_row: header_rows.first().copied(),
1482 headers,
1483 is_key_value: false,
1484 }
1485}
1486
1487fn classify_region(
1488 rect: &Rect,
1489 stats: &RegionStats,
1490 header_info: &HeaderInfo,
1491 metrics: &SheetMetrics,
1492) -> (crate::model::RegionKind, f32) {
1493 let width = rect.end_col - rect.start_col + 1;
1494 let height = rect.end_row - rect.start_row + 1;
1495 let area = width.max(1) * height.max(1);
1496 let density = if area == 0 {
1497 0.0
1498 } else {
1499 stats.non_empty as f32 / area as f32
1500 };
1501 let formula_ratio = if stats.non_empty == 0 {
1502 0.0
1503 } else {
1504 stats.formulas as f32 / stats.non_empty as f32
1505 };
1506 let text_ratio = if stats.non_empty == 0 {
1507 0.0
1508 } else {
1509 stats.text as f32 / stats.non_empty as f32
1510 };
1511
1512 let mut kind = crate::model::RegionKind::Data;
1513 if formula_ratio > 0.25 && is_outputs_band(rect, metrics, height, width) {
1514 kind = crate::model::RegionKind::Outputs;
1515 } else if formula_ratio > 0.55 {
1516 kind = crate::model::RegionKind::Calculator;
1517 } else if height <= 3
1518 && width <= 4
1519 && text_ratio > 0.5
1520 && rect.end_row >= metrics.row_count.saturating_sub(3)
1521 {
1522 kind = crate::model::RegionKind::Metadata;
1523 } else if header_info.is_key_value
1524 || (formula_ratio < 0.25
1525 && stats.numbers > 0
1526 && stats.text > 0
1527 && text_ratio >= 0.3
1528 && (width <= 2 || (width <= 3 && header_info.header_row.is_none())))
1529 {
1530 kind = crate::model::RegionKind::Parameters;
1531 } else if height <= 4 && width <= 6 && formula_ratio < 0.2 && text_ratio > 0.4 && density < 0.5
1532 {
1533 kind = crate::model::RegionKind::Metadata;
1534 }
1535
1536 let mut confidence: f32 = 0.4;
1537 if header_info.header_row.is_some() {
1538 confidence += 0.2;
1539 }
1540 confidence += (density * 0.2).min(0.2);
1541 confidence += (formula_ratio * 0.2).min(0.2);
1542 if matches!(
1543 kind,
1544 crate::model::RegionKind::Parameters | crate::model::RegionKind::Metadata
1545 ) && width <= 4
1546 {
1547 confidence += 0.1;
1548 }
1549 if confidence > 1.0 {
1550 confidence = 1.0;
1551 }
1552
1553 (kind, confidence)
1554}
1555
1556fn is_outputs_band(rect: &Rect, metrics: &SheetMetrics, height: u32, width: u32) -> bool {
1557 let near_bottom = rect.end_row >= metrics.row_count.saturating_sub(6);
1558 let near_right = rect.end_col >= metrics.column_count.saturating_sub(3);
1559 let is_shallow = height <= 6;
1560 let is_narrow_at_edge = width <= 6 && near_right;
1561 let not_at_top_left = rect.start_row > 1 || rect.start_col > 1;
1562 let sheet_has_depth = metrics.row_count > 10 || metrics.column_count > 6;
1563 let is_band = (is_shallow && near_bottom) || is_narrow_at_edge;
1564 is_band && not_at_top_left && sheet_has_depth
1565}
1566
1567fn gather_named_ranges(
1568 sheet: &Worksheet,
1569 defined_names: &[DefinedName],
1570) -> Vec<NamedRangeDescriptor> {
1571 let name_str = sheet.get_name();
1572 defined_names
1573 .iter()
1574 .filter(|name| name.get_address().contains(name_str))
1575 .map(|name| NamedRangeDescriptor {
1576 name: name.get_name().to_string(),
1577 scope: if name.has_local_sheet_id() {
1578 Some(name_str.to_string())
1579 } else {
1580 None
1581 },
1582 refers_to: name.get_address(),
1583 kind: NamedItemKind::NamedRange,
1584 sheet_name: Some(name_str.to_string()),
1585 comment: None,
1586 })
1587 .collect()
1588}
1589
1590pub fn build_workbook_list(
1591 config: &Arc<ServerConfig>,
1592 filter: &WorkbookFilter,
1593) -> Result<WorkbookListResponse> {
1594 let mut descriptors = Vec::new();
1595
1596 if let Some(single) = config.single_workbook() {
1597 let metadata = fs::metadata(single)
1598 .with_context(|| format!("unable to read metadata for {:?}", single))?;
1599 let id = WorkbookId(hash_path_metadata(single, &metadata));
1600 let slug = single
1601 .file_stem()
1602 .map(|s| s.to_string_lossy().to_string())
1603 .unwrap_or_else(|| "workbook".to_string());
1604 let folder = derive_folder(config, single);
1605 let short_id = make_short_workbook_id(&slug, id.as_str());
1606 let caps = BackendCaps::xlsx();
1607
1608 if filter.matches(&slug, folder.as_deref(), single) {
1609 let relative = single
1610 .strip_prefix(&config.workspace_root)
1611 .unwrap_or(single);
1612 let descriptor = crate::model::WorkbookDescriptor {
1613 workbook_id: id,
1614 short_id,
1615 slug,
1616 folder,
1617 path: path_to_forward_slashes(relative),
1618 bytes: metadata.len(),
1619 last_modified: metadata
1620 .modified()
1621 .ok()
1622 .and_then(system_time_to_rfc3339)
1623 .map(|dt| dt.to_rfc3339_opts(chrono::SecondsFormat::Secs, true)),
1624 caps,
1625 };
1626 descriptors.push(descriptor);
1627 }
1628
1629 return Ok(WorkbookListResponse {
1630 workbooks: descriptors,
1631 });
1632 }
1633
1634 use walkdir::WalkDir;
1635
1636 for entry in WalkDir::new(&config.workspace_root) {
1637 let entry = entry?;
1638 if !entry.file_type().is_file() {
1639 continue;
1640 }
1641 let path = entry.path();
1642 if !has_supported_extension(&config.supported_extensions, path) {
1643 continue;
1644 }
1645 let metadata = entry.metadata()?;
1646 let id = WorkbookId(hash_path_metadata(path, &metadata));
1647 let slug = path
1648 .file_stem()
1649 .map(|s| s.to_string_lossy().to_string())
1650 .unwrap_or_else(|| "workbook".to_string());
1651 let folder = derive_folder(config, path);
1652 let short_id = make_short_workbook_id(&slug, id.as_str());
1653 let caps = BackendCaps::xlsx();
1654
1655 if !filter.matches(&slug, folder.as_deref(), path) {
1656 continue;
1657 }
1658
1659 let relative = path.strip_prefix(&config.workspace_root).unwrap_or(path);
1660 let descriptor = crate::model::WorkbookDescriptor {
1661 workbook_id: id,
1662 short_id,
1663 slug,
1664 folder,
1665 path: path_to_forward_slashes(relative),
1666 bytes: metadata.len(),
1667 last_modified: metadata
1668 .modified()
1669 .ok()
1670 .and_then(system_time_to_rfc3339)
1671 .map(|dt| dt.to_rfc3339_opts(chrono::SecondsFormat::Secs, true)),
1672 caps,
1673 };
1674 descriptors.push(descriptor);
1675 }
1676
1677 descriptors.sort_by(|a, b| a.slug.cmp(&b.slug));
1678
1679 Ok(WorkbookListResponse {
1680 workbooks: descriptors,
1681 })
1682}
1683
1684fn derive_folder(config: &Arc<ServerConfig>, path: &Path) -> Option<String> {
1685 path.strip_prefix(&config.workspace_root)
1686 .ok()
1687 .and_then(|relative| relative.parent())
1688 .and_then(|parent| parent.file_name())
1689 .map(|os| os.to_string_lossy().to_string())
1690}
1691
1692fn has_supported_extension(allowed: &[String], path: &Path) -> bool {
1693 path.extension()
1694 .and_then(|ext| ext.to_str())
1695 .map(|ext| {
1696 let lower = ext.to_ascii_lowercase();
1697 allowed.iter().any(|candidate| candidate == &lower)
1698 })
1699 .unwrap_or(false)
1700}