1use crate::analysis::{
2 classification,
3 formula::{FormulaAtlas, FormulaGraph},
4 style,
5};
6use crate::caps::BackendCaps;
7use crate::config::ServerConfig;
8use crate::model::{
9 NamedItemKind, NamedRangeDescriptor, SheetClassification, SheetOverviewResponse, SheetSummary,
10 WorkbookDescription, WorkbookId, WorkbookListResponse,
11};
12use crate::tools::filters::WorkbookFilter;
13use crate::utils::{
14 hash_path_metadata, make_short_workbook_id, path_to_forward_slashes, system_time_to_rfc3339,
15};
16use anyhow::{Context, Result, anyhow};
17use chrono::{DateTime, Utc};
18use parking_lot::RwLock;
19use std::cmp::Ordering;
20use std::collections::{HashMap, HashSet};
21use std::fs;
22use std::path::{Path, PathBuf};
23use std::sync::Arc;
24use umya_spreadsheet::reader::xlsx;
25use umya_spreadsheet::{DefinedName, Spreadsheet, Worksheet};
26
27const KV_MAX_WIDTH_FOR_DENSITY_CHECK: u32 = 6;
28const KV_SAMPLE_ROWS: u32 = 20;
29const KV_DENSITY_THRESHOLD: f32 = 0.4;
30const KV_CHECK_ROWS: u32 = 15;
31const KV_MAX_LABEL_LEN: usize = 25;
32const KV_MIN_TEXT_VALUE_LEN: usize = 2;
33const KV_MIN_PAIRS: u32 = 3;
34const KV_MIN_PAIR_RATIO: f32 = 0.3;
35
36const HEADER_MAX_SCAN_ROWS: u32 = 2;
37const HEADER_LONG_STRING_PENALTY_THRESHOLD: usize = 40;
38const HEADER_LONG_STRING_PENALTY: f32 = 1.5;
39const HEADER_PROPER_NOUN_MIN_LEN: usize = 5;
40const HEADER_PROPER_NOUN_PENALTY: f32 = 1.0;
41const HEADER_DIGIT_STRING_MIN_LEN: usize = 3;
42const HEADER_DIGIT_STRING_PENALTY: f32 = 0.5;
43const HEADER_DATE_PENALTY: f32 = 1.0;
44const HEADER_YEAR_LIKE_BONUS: f32 = 0.5;
45const HEADER_YEAR_MIN: f64 = 1900.0;
46const HEADER_YEAR_MAX: f64 = 2100.0;
47const HEADER_UNIQUE_BONUS: f32 = 0.2;
48const HEADER_NUMBER_PENALTY: f32 = 0.3;
49const HEADER_SINGLE_COL_MIN_SCORE: f32 = 1.5;
50const HEADER_SCORE_TIE_THRESHOLD: f32 = 0.3;
51const HEADER_SECOND_ROW_MIN_SCORE_RATIO: f32 = 0.6;
52
53pub struct WorkbookContext {
54 pub id: WorkbookId,
55 pub short_id: String,
56 pub slug: String,
57 pub path: PathBuf,
58 pub caps: BackendCaps,
59 pub bytes: u64,
60 pub last_modified: Option<DateTime<Utc>>,
61 spreadsheet: Arc<RwLock<Spreadsheet>>,
62 sheet_cache: RwLock<HashMap<String, Arc<SheetCacheEntry>>>,
63 formula_atlas: Arc<FormulaAtlas>,
64}
65
66pub struct SheetCacheEntry {
67 pub metrics: SheetMetrics,
68 pub style_tags: Vec<String>,
69 pub named_ranges: Vec<NamedRangeDescriptor>,
70 pub detected_regions: Vec<crate::model::DetectedRegion>,
71}
72
73#[derive(Debug, Clone)]
74pub struct SheetMetrics {
75 pub row_count: u32,
76 pub column_count: u32,
77 pub non_empty_cells: u32,
78 pub formula_cells: u32,
79 pub cached_values: u32,
80 pub comments: u32,
81 pub style_map: HashMap<String, StyleUsage>,
82 pub classification: SheetClassification,
83}
84
85#[derive(Debug, Clone)]
86pub struct StyleUsage {
87 pub occurrences: u32,
88 pub tags: Vec<String>,
89 pub example_cells: Vec<String>,
90}
91
92impl WorkbookContext {
93 pub fn load(_config: &Arc<ServerConfig>, path: &Path) -> Result<Self> {
94 let metadata = fs::metadata(path)
95 .with_context(|| format!("unable to read metadata for {:?}", path))?;
96 let slug = path
97 .file_stem()
98 .map(|s| s.to_string_lossy().to_string())
99 .unwrap_or_else(|| "workbook".to_string());
100 let bytes = metadata.len();
101 let last_modified = metadata.modified().ok().and_then(system_time_to_rfc3339);
102 let id = WorkbookId(hash_path_metadata(path, &metadata));
103 let spreadsheet =
104 xlsx::read(path).with_context(|| format!("failed to parse workbook {:?}", path))?;
105 let short_id = make_short_workbook_id(&slug, id.as_str());
106
107 Ok(Self {
108 id,
109 short_id,
110 slug,
111 path: path.to_path_buf(),
112 caps: BackendCaps::xlsx(),
113 bytes,
114 last_modified,
115 spreadsheet: Arc::new(RwLock::new(spreadsheet)),
116 sheet_cache: RwLock::new(HashMap::new()),
117 formula_atlas: Arc::new(FormulaAtlas::default()),
118 })
119 }
120
121 pub fn sheet_names(&self) -> Vec<String> {
122 let book = self.spreadsheet.read();
123 book.get_sheet_collection()
124 .iter()
125 .map(|sheet| sheet.get_name().to_string())
126 .collect()
127 }
128
129 pub fn describe(&self) -> WorkbookDescription {
130 let book = self.spreadsheet.read();
131 let defined_names_count = book.get_defined_names().len();
132 let table_count: usize = book
133 .get_sheet_collection()
134 .iter()
135 .map(|sheet| sheet.get_tables().len())
136 .sum();
137 let macros_present = false;
138
139 WorkbookDescription {
140 workbook_id: self.id.clone(),
141 short_id: self.short_id.clone(),
142 slug: self.slug.clone(),
143 path: path_to_forward_slashes(&self.path),
144 bytes: self.bytes,
145 sheet_count: book.get_sheet_collection().len(),
146 defined_names: defined_names_count,
147 tables: table_count,
148 macros_present,
149 last_modified: self
150 .last_modified
151 .map(|dt| dt.to_rfc3339_opts(chrono::SecondsFormat::Secs, true)),
152 caps: self.caps.clone(),
153 }
154 }
155
156 pub fn get_sheet_metrics(&self, sheet_name: &str) -> Result<Arc<SheetCacheEntry>> {
157 if let Some(entry) = self.sheet_cache.read().get(sheet_name) {
158 return Ok(entry.clone());
159 }
160
161 let mut writer = self.sheet_cache.write();
162 if let Some(entry) = writer.get(sheet_name) {
163 return Ok(entry.clone());
164 }
165
166 let book = self.spreadsheet.read();
167 let sheet = book
168 .get_sheet_by_name(sheet_name)
169 .ok_or_else(|| anyhow!("sheet {} not found", sheet_name))?;
170 let (metrics, style_tags) = compute_sheet_metrics(sheet);
171 let named_ranges = gather_named_ranges(sheet, book.get_defined_names());
172 let detected_regions = detect_regions(sheet, &metrics);
173
174 let entry = Arc::new(SheetCacheEntry {
175 metrics,
176 style_tags,
177 named_ranges,
178 detected_regions,
179 });
180
181 writer.insert(sheet_name.to_string(), entry.clone());
182 Ok(entry)
183 }
184
185 pub fn list_summaries(&self) -> Result<Vec<SheetSummary>> {
186 let book = self.spreadsheet.read();
187 let mut summaries = Vec::new();
188 for sheet in book.get_sheet_collection() {
189 let name = sheet.get_name().to_string();
190 let entry = self.get_sheet_metrics(&name)?;
191 summaries.push(SheetSummary {
192 name: name.clone(),
193 visible: sheet.get_sheet_state() != "hidden",
194 row_count: entry.metrics.row_count,
195 column_count: entry.metrics.column_count,
196 non_empty_cells: entry.metrics.non_empty_cells,
197 formula_cells: entry.metrics.formula_cells,
198 cached_values: entry.metrics.cached_values,
199 classification: entry.metrics.classification.clone(),
200 style_tags: entry.style_tags.clone(),
201 });
202 }
203 Ok(summaries)
204 }
205
206 pub fn with_sheet<T, F>(&self, sheet_name: &str, func: F) -> Result<T>
207 where
208 F: FnOnce(&Worksheet) -> T,
209 {
210 let book = self.spreadsheet.read();
211 let sheet = book
212 .get_sheet_by_name(sheet_name)
213 .ok_or_else(|| anyhow!("sheet {} not found", sheet_name))?;
214 Ok(func(sheet))
215 }
216
217 pub fn formula_graph(&self, sheet_name: &str) -> Result<FormulaGraph> {
218 self.with_sheet(sheet_name, |sheet| {
219 FormulaGraph::build(sheet, &self.formula_atlas)
220 })?
221 }
222
223 pub fn named_items(&self) -> Result<Vec<NamedRangeDescriptor>> {
224 let book = self.spreadsheet.read();
225 let sheet_names: Vec<String> = book
226 .get_sheet_collection()
227 .iter()
228 .map(|sheet| sheet.get_name().to_string())
229 .collect();
230 let mut items = Vec::new();
231
232 for defined in book.get_defined_names() {
233 let refers_to = defined.get_address();
234 let scope = if defined.has_local_sheet_id() {
235 let idx = *defined.get_local_sheet_id() as usize;
236 sheet_names.get(idx).cloned()
237 } else {
238 None
239 };
240 let kind = if refers_to.starts_with('=') {
241 NamedItemKind::Formula
242 } else {
243 NamedItemKind::NamedRange
244 };
245
246 items.push(NamedRangeDescriptor {
247 name: defined.get_name().to_string(),
248 scope: scope.clone(),
249 refers_to: refers_to.clone(),
250 kind,
251 sheet_name: scope,
252 comment: None,
253 });
254 }
255
256 for sheet in book.get_sheet_collection() {
257 for table in sheet.get_tables() {
258 let start = table.get_area().0.get_coordinate();
259 let end = table.get_area().1.get_coordinate();
260 items.push(NamedRangeDescriptor {
261 name: table.get_name().to_string(),
262 scope: Some(sheet.get_name().to_string()),
263 refers_to: format!("{}:{}", start, end),
264 kind: NamedItemKind::Table,
265 sheet_name: Some(sheet.get_name().to_string()),
266 comment: None,
267 });
268 }
269 }
270
271 Ok(items)
272 }
273
274 pub fn sheet_overview(&self, sheet_name: &str) -> Result<SheetOverviewResponse> {
275 let entry = self.get_sheet_metrics(sheet_name)?;
276 let narrative = classification::narrative(&entry.metrics);
277 let regions = classification::regions(&entry.metrics);
278 let key_ranges = classification::key_ranges(&entry.metrics);
279 let detected_regions = entry.detected_regions.clone();
280
281 Ok(SheetOverviewResponse {
282 workbook_id: self.id.clone(),
283 workbook_short_id: self.short_id.clone(),
284 sheet_name: sheet_name.to_string(),
285 narrative,
286 regions,
287 detected_regions,
288 key_ranges,
289 formula_ratio: if entry.metrics.non_empty_cells == 0 {
290 0.0
291 } else {
292 entry.metrics.formula_cells as f32 / entry.metrics.non_empty_cells as f32
293 },
294 notable_features: entry.style_tags.clone(),
295 })
296 }
297
298 pub fn detected_region(
299 &self,
300 sheet_name: &str,
301 id: u32,
302 ) -> Result<crate::model::DetectedRegion> {
303 let entry = self.get_sheet_metrics(sheet_name)?;
304 entry
305 .detected_regions
306 .iter()
307 .find(|r| r.id == id)
308 .cloned()
309 .ok_or_else(|| anyhow!("region {} not found on sheet {}", id, sheet_name))
310 }
311}
312
313fn contains_date_time_token(format_code: &str) -> bool {
314 let mut in_quote = false;
315 let mut in_bracket = false;
316 let chars: Vec<char> = format_code.chars().collect();
317
318 for (i, &ch) in chars.iter().enumerate() {
319 match ch {
320 '"' => in_quote = !in_quote,
321 '[' if !in_quote => in_bracket = true,
322 ']' if !in_quote => in_bracket = false,
323 'y' | 'd' | 'h' | 's' | 'm' if !in_quote && !in_bracket => {
324 if ch == 'm' {
325 let prev = if i > 0 { chars.get(i - 1) } else { None };
326 let next = chars.get(i + 1);
327 let after_time_sep = prev == Some(&':') || prev == Some(&'h');
328 let before_time_sep = next == Some(&':') || next == Some(&'s');
329 if after_time_sep || before_time_sep {
330 return true;
331 }
332 if prev == Some(&'m') || next == Some(&'m') {
333 return true;
334 }
335 if matches!(prev, Some(&'/') | Some(&'-') | Some(&'.'))
336 || matches!(next, Some(&'/') | Some(&'-') | Some(&'.'))
337 {
338 return true;
339 }
340 } else {
341 return true;
342 }
343 }
344 _ => {}
345 }
346 }
347 false
348}
349
350const DATE_FORMAT_IDS: &[u32] = &[
351 14, 15, 16, 17, 18, 19, 20, 21, 22, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 45, 46, 47, 50, 51,
352 52, 53, 54, 55, 56, 57, 58,
353];
354
355const EXCEL_LEAP_YEAR_BUG_SERIAL: i64 = 60;
356
357fn is_date_formatted(cell: &umya_spreadsheet::Cell) -> bool {
358 let Some(nf) = cell.get_style().get_number_format() else {
359 return false;
360 };
361
362 let format_id = nf.get_number_format_id();
363 if DATE_FORMAT_IDS.contains(format_id) {
364 return true;
365 }
366
367 let code = nf.get_format_code();
368 if code == "General" || code == "@" || code == "0" || code == "0.00" {
369 return false;
370 }
371
372 contains_date_time_token(code)
373}
374
375pub fn excel_serial_to_iso(serial: f64, use_1904_system: bool) -> String {
376 excel_serial_to_iso_with_leap_bug(serial, use_1904_system, true)
377}
378
379pub fn excel_serial_to_iso_with_leap_bug(
380 serial: f64,
381 use_1904_system: bool,
382 compensate_leap_bug: bool,
383) -> String {
384 use chrono::NaiveDate;
385
386 let days = serial.trunc() as i64;
387
388 if use_1904_system {
389 let epoch_1904 = NaiveDate::from_ymd_opt(1904, 1, 1).unwrap();
390 return epoch_1904
391 .checked_add_signed(chrono::Duration::days(days))
392 .map(|d| d.format("%Y-%m-%d").to_string())
393 .unwrap_or_else(|| serial.to_string());
394 }
395
396 let epoch = if compensate_leap_bug && days >= EXCEL_LEAP_YEAR_BUG_SERIAL {
397 NaiveDate::from_ymd_opt(1899, 12, 30).unwrap()
398 } else {
399 NaiveDate::from_ymd_opt(1899, 12, 31).unwrap()
400 };
401
402 epoch
403 .checked_add_signed(chrono::Duration::days(days))
404 .map(|d| d.format("%Y-%m-%d").to_string())
405 .unwrap_or_else(|| serial.to_string())
406}
407
408pub fn cell_to_value(cell: &umya_spreadsheet::Cell) -> Option<crate::model::CellValue> {
409 cell_to_value_with_date_system(cell, false)
410}
411
412pub fn cell_to_value_with_date_system(
413 cell: &umya_spreadsheet::Cell,
414 use_1904_system: bool,
415) -> Option<crate::model::CellValue> {
416 let raw = cell.get_value();
417 if raw.is_empty() {
418 return None;
419 }
420 if let Ok(number) = raw.parse::<f64>() {
421 if is_date_formatted(cell) {
422 return Some(crate::model::CellValue::Date(excel_serial_to_iso(
423 number,
424 use_1904_system,
425 )));
426 }
427 return Some(crate::model::CellValue::Number(number));
428 }
429
430 let lower = raw.to_ascii_lowercase();
431 if lower == "true" {
432 return Some(crate::model::CellValue::Bool(true));
433 }
434 if lower == "false" {
435 return Some(crate::model::CellValue::Bool(false));
436 }
437
438 Some(crate::model::CellValue::Text(raw.to_string()))
439}
440
441pub fn compute_sheet_metrics(sheet: &Worksheet) -> (SheetMetrics, Vec<String>) {
442 use std::collections::HashMap as StdHashMap;
443 let mut non_empty = 0u32;
444 let mut formulas = 0u32;
445 let mut cached = 0u32;
446 let comments = sheet.get_comments().len() as u32;
447 let mut style_usage: StdHashMap<String, StyleUsage> = StdHashMap::new();
448
449 for cell in sheet.get_cell_collection() {
450 let value = cell.get_value();
451 if !value.is_empty() {
452 non_empty += 1;
453 }
454 if cell.is_formula() {
455 formulas += 1;
456 if !cell.get_value().is_empty() {
457 cached += 1;
458 }
459 }
460
461 if let Some((style_key, usage)) = style::tag_cell(cell) {
462 let entry = style_usage.entry(style_key).or_insert_with(|| StyleUsage {
463 occurrences: 0,
464 tags: usage.tags.clone(),
465 example_cells: Vec::new(),
466 });
467 entry.occurrences += 1;
468 if entry.example_cells.len() < 5 {
469 entry.example_cells.push(usage.example_cell.clone());
470 }
471 }
472 }
473
474 let (max_col, max_row) = sheet.get_highest_column_and_row();
475
476 let classification = classification::classify(
477 non_empty,
478 formulas,
479 max_row,
480 max_col,
481 comments,
482 &style_usage,
483 );
484
485 let style_tags: Vec<String> = style_usage
486 .values()
487 .flat_map(|usage| usage.tags.clone())
488 .collect();
489
490 let metrics = SheetMetrics {
491 row_count: max_row,
492 column_count: max_col,
493 non_empty_cells: non_empty,
494 formula_cells: formulas,
495 cached_values: cached,
496 comments,
497 style_map: style_usage,
498 classification,
499 };
500 (metrics, style_tags)
501}
502
503#[derive(Debug, Clone, Copy)]
504struct Rect {
505 start_row: u32,
506 end_row: u32,
507 start_col: u32,
508 end_col: u32,
509}
510
511#[derive(Debug, Clone)]
512struct CellInfo {
513 value: Option<crate::model::CellValue>,
514 is_formula: bool,
515}
516
517#[derive(Debug)]
518struct Occupancy {
519 cells: HashMap<(u32, u32), CellInfo>,
520}
521
522impl Occupancy {
523 fn row_col_counts(&self, rect: &Rect) -> (Vec<u32>, Vec<u32>) {
524 let mut row_counts = vec![0u32; (rect.end_row - rect.start_row + 1) as usize];
525 let mut col_counts = vec![0u32; (rect.end_col - rect.start_col + 1) as usize];
526 for ((row, col), _) in self.cells.iter() {
527 if *row >= rect.start_row
528 && *row <= rect.end_row
529 && *col >= rect.start_col
530 && *col <= rect.end_col
531 {
532 row_counts[(row - rect.start_row) as usize] += 1;
533 col_counts[(col - rect.start_col) as usize] += 1;
534 }
535 }
536 (row_counts, col_counts)
537 }
538
539 fn stats_in_rect(&self, rect: &Rect) -> RegionStats {
540 let mut stats = RegionStats::default();
541 for ((row, col), info) in self.cells.iter() {
542 if *row < rect.start_row
543 || *row > rect.end_row
544 || *col < rect.start_col
545 || *col > rect.end_col
546 {
547 continue;
548 }
549 stats.non_empty += 1;
550 if info.is_formula {
551 stats.formulas += 1;
552 }
553 if let Some(val) = &info.value {
554 match val {
555 crate::model::CellValue::Text(_) => stats.text += 1,
556 crate::model::CellValue::Number(_) => stats.numbers += 1,
557 crate::model::CellValue::Bool(_) => stats.bools += 1,
558 crate::model::CellValue::Date(_) => stats.dates += 1,
559 crate::model::CellValue::Error(_) => stats.errors += 1,
560 }
561 }
562 }
563 stats
564 }
565
566 fn value_at(&self, row: u32, col: u32) -> Option<&crate::model::CellValue> {
567 self.cells.get(&(row, col)).and_then(|c| c.value.as_ref())
568 }
569}
570
571#[derive(Debug, Default, Clone)]
572struct RegionStats {
573 non_empty: u32,
574 formulas: u32,
575 text: u32,
576 numbers: u32,
577 bools: u32,
578 dates: u32,
579 errors: u32,
580}
581
582#[derive(Debug, Clone, Copy, PartialEq, Eq)]
583enum Gutter {
584 Row { start: u32, end: u32 },
585 Col { start: u32, end: u32 },
586}
587
588fn detect_regions(sheet: &Worksheet, metrics: &SheetMetrics) -> Vec<crate::model::DetectedRegion> {
589 if metrics.row_count == 0 || metrics.column_count == 0 {
590 return Vec::new();
591 }
592 let occupancy = build_occupancy(sheet);
593 let root = Rect {
594 start_row: 1,
595 end_row: metrics.row_count.max(1),
596 start_col: 1,
597 end_col: metrics.column_count.max(1),
598 };
599
600 let mut leaves = Vec::new();
601 split_rect(&occupancy, &root, &mut leaves);
602
603 let mut regions = Vec::new();
604 for (idx, rect) in leaves.into_iter().enumerate() {
605 if let Some(trimmed) = trim_rect(&occupancy, rect) {
606 let region = build_region(&occupancy, &trimmed, metrics, idx as u32);
607 regions.push(region);
608 }
609 }
610 regions
611}
612
613fn build_occupancy(sheet: &Worksheet) -> Occupancy {
614 let mut cells = HashMap::new();
615 for cell in sheet.get_cell_collection() {
616 let coord = cell.get_coordinate();
617 let row = *coord.get_row_num();
618 let col = *coord.get_col_num();
619 let value = cell_to_value(cell);
620 let is_formula = cell.is_formula();
621 cells.insert((row, col), CellInfo { value, is_formula });
622 }
623 Occupancy { cells }
624}
625
626fn split_rect(occupancy: &Occupancy, rect: &Rect, leaves: &mut Vec<Rect>) {
627 if rect.start_row >= rect.end_row && rect.start_col >= rect.end_col {
628 leaves.push(*rect);
629 return;
630 }
631 if let Some(gutter) = find_best_gutter(occupancy, rect) {
632 match gutter {
633 Gutter::Row { start, end } => {
634 if start > rect.start_row {
635 let upper = Rect {
636 start_row: rect.start_row,
637 end_row: start - 1,
638 start_col: rect.start_col,
639 end_col: rect.end_col,
640 };
641 split_rect(occupancy, &upper, leaves);
642 }
643 if end < rect.end_row {
644 let lower = Rect {
645 start_row: end + 1,
646 end_row: rect.end_row,
647 start_col: rect.start_col,
648 end_col: rect.end_col,
649 };
650 split_rect(occupancy, &lower, leaves);
651 }
652 }
653 Gutter::Col { start, end } => {
654 if start > rect.start_col {
655 let left = Rect {
656 start_row: rect.start_row,
657 end_row: rect.end_row,
658 start_col: rect.start_col,
659 end_col: start - 1,
660 };
661 split_rect(occupancy, &left, leaves);
662 }
663 if end < rect.end_col {
664 let right = Rect {
665 start_row: rect.start_row,
666 end_row: rect.end_row,
667 start_col: end + 1,
668 end_col: rect.end_col,
669 };
670 split_rect(occupancy, &right, leaves);
671 }
672 }
673 }
674 return;
675 }
676 leaves.push(*rect);
677}
678
679fn find_best_gutter(occupancy: &Occupancy, rect: &Rect) -> Option<Gutter> {
680 let (row_counts, col_counts) = occupancy.row_col_counts(rect);
681 let width = rect.end_col - rect.start_col + 1;
682 let height = rect.end_row - rect.start_row + 1;
683
684 let row_blank_runs = find_blank_runs(&row_counts, width);
685 let col_blank_runs = find_blank_runs(&col_counts, height);
686
687 let mut best: Option<(Gutter, u32)> = None;
688
689 if let Some((start, end, len)) = row_blank_runs {
690 let gutter = Gutter::Row {
691 start: rect.start_row + start,
692 end: rect.start_row + end,
693 };
694 best = Some((gutter, len));
695 }
696 if let Some((start, end, len)) = col_blank_runs {
697 let gutter = Gutter::Col {
698 start: rect.start_col + start,
699 end: rect.start_col + end,
700 };
701 if best.map(|(_, l)| len > l).unwrap_or(true) {
702 best = Some((gutter, len));
703 }
704 }
705
706 best.map(|(g, _)| g)
707}
708
709fn find_blank_runs(counts: &[u32], span: u32) -> Option<(u32, u32, u32)> {
710 if counts.is_empty() {
711 return None;
712 }
713 let mut best_start = 0;
714 let mut best_end = 0;
715 let mut best_len = 0;
716 let mut current_start = None;
717 for (idx, count) in counts.iter().enumerate() {
718 let is_blank = *count == 0 || (*count as f32 / span as f32) < 0.05;
719 if is_blank {
720 if current_start.is_none() {
721 current_start = Some(idx as u32);
722 }
723 } else if let Some(start) = current_start.take() {
724 let end = idx as u32 - 1;
725 let len = end - start + 1;
726 if len > best_len && start > 0 && end + 1 < counts.len() as u32 {
727 best_len = len;
728 best_start = start;
729 best_end = end;
730 }
731 }
732 }
733 if let Some(start) = current_start {
734 let end = counts.len() as u32 - 1;
735 let len = end - start + 1;
736 if len > best_len && start > 0 && end + 1 < counts.len() as u32 {
737 best_len = len;
738 best_start = start;
739 best_end = end;
740 }
741 }
742 if best_len >= 2 {
743 Some((best_start, best_end, best_len))
744 } else {
745 None
746 }
747}
748
749fn trim_rect(occupancy: &Occupancy, rect: Rect) -> Option<Rect> {
750 let mut r = rect;
751 loop {
752 let (row_counts, col_counts) = occupancy.row_col_counts(&r);
753 let width = r.end_col - r.start_col + 1;
754 let height = r.end_row - r.start_row + 1;
755 let top_blank = row_counts
756 .first()
757 .map(|c| *c == 0 || (*c as f32 / width as f32) < 0.1)
758 .unwrap_or(false);
759 let bottom_blank = row_counts
760 .last()
761 .map(|c| *c == 0 || (*c as f32 / width as f32) < 0.1)
762 .unwrap_or(false);
763 let left_blank = col_counts
764 .first()
765 .map(|c| *c == 0 || (*c as f32 / height as f32) < 0.1)
766 .unwrap_or(false);
767 let right_blank = col_counts
768 .last()
769 .map(|c| *c == 0 || (*c as f32 / height as f32) < 0.1)
770 .unwrap_or(false);
771
772 let mut changed = false;
773 if top_blank && r.start_row < r.end_row {
774 r.start_row += 1;
775 changed = true;
776 }
777 if bottom_blank && r.end_row > r.start_row {
778 r.end_row -= 1;
779 changed = true;
780 }
781 if left_blank && r.start_col < r.end_col {
782 r.start_col += 1;
783 changed = true;
784 }
785 if right_blank && r.end_col > r.start_col {
786 r.end_col -= 1;
787 changed = true;
788 }
789
790 if !changed {
791 break;
792 }
793 if r.start_row > r.end_row || r.start_col > r.end_col {
794 return None;
795 }
796 }
797 Some(r)
798}
799
800fn build_region(
801 occupancy: &Occupancy,
802 rect: &Rect,
803 metrics: &SheetMetrics,
804 id: u32,
805) -> crate::model::DetectedRegion {
806 let header_info = detect_headers(occupancy, rect);
807 let stats = occupancy.stats_in_rect(rect);
808 let (kind, confidence) = classify_region(rect, &stats, &header_info, metrics);
809 crate::model::DetectedRegion {
810 id,
811 bounds: format!(
812 "{}{}:{}{}",
813 crate::utils::column_number_to_name(rect.start_col),
814 rect.start_row,
815 crate::utils::column_number_to_name(rect.end_col),
816 rect.end_row
817 ),
818 header_row: header_info.header_row,
819 headers: header_info.headers,
820 row_count: rect.end_row - rect.start_row + 1,
821 classification: kind.clone(),
822 region_kind: Some(kind),
823 confidence,
824 }
825}
826
827#[derive(Debug, Default)]
828struct HeaderInfo {
829 header_row: Option<u32>,
830 headers: Vec<String>,
831 is_key_value: bool,
832}
833
834fn is_key_value_layout(occupancy: &Occupancy, rect: &Rect) -> bool {
835 let width = rect.end_col - rect.start_col + 1;
836
837 if width == 2 {
838 return check_key_value_columns(occupancy, rect, rect.start_col, rect.start_col + 1);
839 }
840
841 if width <= KV_MAX_WIDTH_FOR_DENSITY_CHECK {
842 let rows_to_sample = (rect.end_row - rect.start_row + 1).min(KV_SAMPLE_ROWS);
843 let density_threshold = (rows_to_sample as f32 * KV_DENSITY_THRESHOLD) as u32;
844
845 let mut col_densities: Vec<(u32, u32)> = Vec::new();
846 for col in rect.start_col..=rect.end_col {
847 let count = (rect.start_row..rect.start_row + rows_to_sample)
848 .filter(|&row| occupancy.value_at(row, col).is_some())
849 .count() as u32;
850 if count >= density_threshold {
851 col_densities.push((col, count));
852 }
853 }
854
855 if col_densities.len() == 2 {
856 let label_col = col_densities[0].0;
857 let value_col = col_densities[1].0;
858 return check_key_value_columns(occupancy, rect, label_col, value_col);
859 } else if col_densities.len() == 4 && width >= 4 {
860 let pair1 =
861 check_key_value_columns(occupancy, rect, col_densities[0].0, col_densities[1].0);
862 let pair2 =
863 check_key_value_columns(occupancy, rect, col_densities[2].0, col_densities[3].0);
864 return pair1 && pair2;
865 }
866 }
867
868 false
869}
870
871fn check_key_value_columns(
872 occupancy: &Occupancy,
873 rect: &Rect,
874 label_col: u32,
875 value_col: u32,
876) -> bool {
877 let mut label_value_pairs = 0u32;
878 let rows_to_check = (rect.end_row - rect.start_row + 1).min(KV_CHECK_ROWS);
879
880 for row in rect.start_row..rect.start_row + rows_to_check {
881 let first_col = occupancy.value_at(row, label_col);
882 let second_col = occupancy.value_at(row, value_col);
883
884 if let (Some(crate::model::CellValue::Text(label)), Some(val)) = (first_col, second_col) {
885 let label_looks_like_key = label.len() <= KV_MAX_LABEL_LEN
886 && !label.chars().any(|c| c.is_ascii_digit())
887 && label.contains(|c: char| c.is_alphabetic());
888
889 let value_is_data = matches!(
890 val,
891 crate::model::CellValue::Number(_) | crate::model::CellValue::Date(_)
892 ) || matches!(val, crate::model::CellValue::Text(s) if s.len() > KV_MIN_TEXT_VALUE_LEN);
893
894 if label_looks_like_key && value_is_data {
895 label_value_pairs += 1;
896 }
897 }
898 }
899
900 label_value_pairs >= KV_MIN_PAIRS
901 && label_value_pairs as f32 / rows_to_check as f32 >= KV_MIN_PAIR_RATIO
902}
903
904fn header_data_penalty(s: &str) -> f32 {
905 if s.is_empty() {
906 return 0.0;
907 }
908 if s.len() > HEADER_LONG_STRING_PENALTY_THRESHOLD {
909 return HEADER_LONG_STRING_PENALTY;
910 }
911 let first_char = s.chars().next().unwrap();
912 let is_capitalized = first_char.is_uppercase();
913 let has_lowercase = s.chars().skip(1).any(|c| c.is_lowercase());
914 let is_all_caps = s.chars().all(|c| !c.is_alphabetic() || c.is_uppercase());
915 let has_digits = s.chars().any(|c| c.is_ascii_digit());
916 let is_proper_noun =
917 is_capitalized && has_lowercase && !is_all_caps && s.len() > HEADER_PROPER_NOUN_MIN_LEN;
918
919 let mut penalty = 0.0;
920 if is_proper_noun {
921 penalty += HEADER_PROPER_NOUN_PENALTY;
922 }
923 if has_digits && s.len() > HEADER_DIGIT_STRING_MIN_LEN {
924 penalty += HEADER_DIGIT_STRING_PENALTY;
925 }
926 penalty
927}
928
929fn detect_headers(occupancy: &Occupancy, rect: &Rect) -> HeaderInfo {
930 if is_key_value_layout(occupancy, rect) {
931 let mut headers = Vec::new();
932 for col in rect.start_col..=rect.end_col {
933 headers.push(crate::utils::column_number_to_name(col));
934 }
935 return HeaderInfo {
936 header_row: None,
937 headers,
938 is_key_value: true,
939 };
940 }
941
942 let mut candidates = Vec::new();
943 let max_row = rect
944 .start_row
945 .saturating_add(HEADER_MAX_SCAN_ROWS)
946 .min(rect.end_row);
947 for row in rect.start_row..=max_row {
948 let mut text = 0;
949 let mut numbers = 0;
950 let mut non_empty = 0;
951 let mut unique = HashSet::new();
952 let mut data_like_penalty: f32 = 0.0;
953 let mut year_like_bonus: f32 = 0.0;
954
955 for col in rect.start_col..=rect.end_col {
956 if let Some(val) = occupancy.value_at(row, col) {
957 non_empty += 1;
958 match val {
959 crate::model::CellValue::Text(s) => {
960 text += 1;
961 unique.insert(s.clone());
962 data_like_penalty += header_data_penalty(s);
963 }
964 crate::model::CellValue::Number(n) => {
965 if *n >= HEADER_YEAR_MIN && *n <= HEADER_YEAR_MAX && n.fract() == 0.0 {
966 year_like_bonus += HEADER_YEAR_LIKE_BONUS;
967 text += 1;
968 } else {
969 numbers += 1;
970 }
971 }
972 crate::model::CellValue::Bool(_) => text += 1,
973 crate::model::CellValue::Date(_) => {
974 data_like_penalty += HEADER_DATE_PENALTY;
975 }
976 crate::model::CellValue::Error(_) => {}
977 }
978 }
979 }
980 if non_empty == 0 {
981 continue;
982 }
983 let score = text as f32 + unique.len() as f32 * HEADER_UNIQUE_BONUS
984 - numbers as f32 * HEADER_NUMBER_PENALTY
985 - data_like_penalty
986 + year_like_bonus;
987 candidates.push((row, score, text, non_empty));
988 }
989
990 let is_single_col = rect.start_col == rect.end_col;
991
992 let header_candidates: Vec<&(u32, f32, u32, u32)> = candidates
993 .iter()
994 .filter(|(_, score, text, non_empty)| {
995 *text >= 1
996 && *text * 2 >= *non_empty
997 && (!is_single_col || *score > HEADER_SINGLE_COL_MIN_SCORE)
998 })
999 .collect();
1000
1001 let best = header_candidates.iter().copied().max_by(|a, b| {
1002 a.1.partial_cmp(&b.1)
1003 .unwrap_or(Ordering::Equal)
1004 .then_with(|| b.0.cmp(&a.0))
1005 });
1006 let earliest = header_candidates
1007 .iter()
1008 .copied()
1009 .min_by(|a, b| a.0.cmp(&b.0));
1010
1011 let maybe_header = match (best, earliest) {
1012 (Some(best_row), Some(early_row)) => {
1013 if (best_row.1 - early_row.1).abs() <= HEADER_SCORE_TIE_THRESHOLD {
1014 Some(early_row.0)
1015 } else {
1016 Some(best_row.0)
1017 }
1018 }
1019 (Some(best_row), None) => Some(best_row.0),
1020 _ => None,
1021 };
1022
1023 let mut header_rows = Vec::new();
1024 if let Some(hr) = maybe_header {
1025 header_rows.push(hr);
1026 if hr < rect.end_row
1027 && let Some((_, score_next, text_next, non_empty_next)) =
1028 candidates.iter().find(|(r, _, _, _)| *r == hr + 1)
1029 && *text_next >= 1
1030 && *text_next * 2 >= *non_empty_next
1031 && *score_next
1032 >= HEADER_SECOND_ROW_MIN_SCORE_RATIO
1033 * candidates
1034 .iter()
1035 .find(|(r, _, _, _)| *r == hr)
1036 .map(|c| c.1)
1037 .unwrap_or(0.0)
1038 {
1039 header_rows.push(hr + 1);
1040 }
1041 }
1042
1043 let mut headers = Vec::new();
1044 for col in rect.start_col..=rect.end_col {
1045 let mut parts = Vec::new();
1046 for hr in &header_rows {
1047 if let Some(val) = occupancy.value_at(*hr, col) {
1048 match val {
1049 crate::model::CellValue::Text(s) if !s.trim().is_empty() => {
1050 parts.push(s.trim().to_string())
1051 }
1052 crate::model::CellValue::Number(n) => parts.push(n.to_string()),
1053 crate::model::CellValue::Bool(b) => parts.push(b.to_string()),
1054 crate::model::CellValue::Date(d) => parts.push(d.clone()),
1055 crate::model::CellValue::Error(e) => parts.push(e.clone()),
1056 _ => {}
1057 }
1058 }
1059 }
1060 if parts.is_empty() {
1061 headers.push(crate::utils::column_number_to_name(col));
1062 } else {
1063 headers.push(parts.join(" / "));
1064 }
1065 }
1066
1067 HeaderInfo {
1068 header_row: header_rows.first().copied(),
1069 headers,
1070 is_key_value: false,
1071 }
1072}
1073
1074fn classify_region(
1075 rect: &Rect,
1076 stats: &RegionStats,
1077 header_info: &HeaderInfo,
1078 metrics: &SheetMetrics,
1079) -> (crate::model::RegionKind, f32) {
1080 let width = rect.end_col - rect.start_col + 1;
1081 let height = rect.end_row - rect.start_row + 1;
1082 let area = width.max(1) * height.max(1);
1083 let density = if area == 0 {
1084 0.0
1085 } else {
1086 stats.non_empty as f32 / area as f32
1087 };
1088 let formula_ratio = if stats.non_empty == 0 {
1089 0.0
1090 } else {
1091 stats.formulas as f32 / stats.non_empty as f32
1092 };
1093 let text_ratio = if stats.non_empty == 0 {
1094 0.0
1095 } else {
1096 stats.text as f32 / stats.non_empty as f32
1097 };
1098
1099 let mut kind = crate::model::RegionKind::Data;
1100 if formula_ratio > 0.25 && is_outputs_band(rect, metrics, height, width) {
1101 kind = crate::model::RegionKind::Outputs;
1102 } else if formula_ratio > 0.55 {
1103 kind = crate::model::RegionKind::Calculator;
1104 } else if height <= 3
1105 && width <= 4
1106 && text_ratio > 0.5
1107 && rect.end_row >= metrics.row_count.saturating_sub(3)
1108 {
1109 kind = crate::model::RegionKind::Metadata;
1110 } else if header_info.is_key_value
1111 || (formula_ratio < 0.25
1112 && stats.numbers > 0
1113 && stats.text > 0
1114 && text_ratio >= 0.3
1115 && (width <= 2 || (width <= 3 && header_info.header_row.is_none())))
1116 {
1117 kind = crate::model::RegionKind::Parameters;
1118 } else if height <= 4 && width <= 6 && formula_ratio < 0.2 && text_ratio > 0.4 && density < 0.5
1119 {
1120 kind = crate::model::RegionKind::Metadata;
1121 }
1122
1123 let mut confidence: f32 = 0.4;
1124 if header_info.header_row.is_some() {
1125 confidence += 0.2;
1126 }
1127 confidence += (density * 0.2).min(0.2);
1128 confidence += (formula_ratio * 0.2).min(0.2);
1129 if matches!(
1130 kind,
1131 crate::model::RegionKind::Parameters | crate::model::RegionKind::Metadata
1132 ) && width <= 4
1133 {
1134 confidence += 0.1;
1135 }
1136 if confidence > 1.0 {
1137 confidence = 1.0;
1138 }
1139
1140 (kind, confidence)
1141}
1142
1143fn is_outputs_band(rect: &Rect, metrics: &SheetMetrics, height: u32, width: u32) -> bool {
1144 let near_bottom = rect.end_row >= metrics.row_count.saturating_sub(6);
1145 let near_right = rect.end_col >= metrics.column_count.saturating_sub(3);
1146 let is_shallow = height <= 6;
1147 let is_narrow_at_edge = width <= 6 && near_right;
1148 let not_at_top_left = rect.start_row > 1 || rect.start_col > 1;
1149 let sheet_has_depth = metrics.row_count > 10 || metrics.column_count > 6;
1150 let is_band = (is_shallow && near_bottom) || is_narrow_at_edge;
1151 is_band && not_at_top_left && sheet_has_depth
1152}
1153
1154fn gather_named_ranges(
1155 sheet: &Worksheet,
1156 defined_names: &[DefinedName],
1157) -> Vec<NamedRangeDescriptor> {
1158 let name_str = sheet.get_name();
1159 defined_names
1160 .iter()
1161 .filter(|name| name.get_address().contains(name_str))
1162 .map(|name| NamedRangeDescriptor {
1163 name: name.get_name().to_string(),
1164 scope: if name.has_local_sheet_id() {
1165 Some(name_str.to_string())
1166 } else {
1167 None
1168 },
1169 refers_to: name.get_address(),
1170 kind: NamedItemKind::NamedRange,
1171 sheet_name: Some(name_str.to_string()),
1172 comment: None,
1173 })
1174 .collect()
1175}
1176
1177pub fn build_workbook_list(
1178 config: &Arc<ServerConfig>,
1179 filter: &WorkbookFilter,
1180) -> Result<WorkbookListResponse> {
1181 let mut descriptors = Vec::new();
1182
1183 if let Some(single) = config.single_workbook() {
1184 let metadata = fs::metadata(single)
1185 .with_context(|| format!("unable to read metadata for {:?}", single))?;
1186 let id = WorkbookId(hash_path_metadata(single, &metadata));
1187 let slug = single
1188 .file_stem()
1189 .map(|s| s.to_string_lossy().to_string())
1190 .unwrap_or_else(|| "workbook".to_string());
1191 let folder = derive_folder(config, single);
1192 let short_id = make_short_workbook_id(&slug, id.as_str());
1193 let caps = BackendCaps::xlsx();
1194
1195 if filter.matches(&slug, folder.as_deref(), single) {
1196 let relative = single
1197 .strip_prefix(&config.workspace_root)
1198 .unwrap_or(single);
1199 let descriptor = crate::model::WorkbookDescriptor {
1200 workbook_id: id,
1201 short_id,
1202 slug,
1203 folder,
1204 path: path_to_forward_slashes(relative),
1205 bytes: metadata.len(),
1206 last_modified: metadata
1207 .modified()
1208 .ok()
1209 .and_then(system_time_to_rfc3339)
1210 .map(|dt| dt.to_rfc3339_opts(chrono::SecondsFormat::Secs, true)),
1211 caps,
1212 };
1213 descriptors.push(descriptor);
1214 }
1215
1216 return Ok(WorkbookListResponse {
1217 workbooks: descriptors,
1218 });
1219 }
1220
1221 use walkdir::WalkDir;
1222
1223 for entry in WalkDir::new(&config.workspace_root) {
1224 let entry = entry?;
1225 if !entry.file_type().is_file() {
1226 continue;
1227 }
1228 let path = entry.path();
1229 if !has_supported_extension(&config.supported_extensions, path) {
1230 continue;
1231 }
1232 let metadata = entry.metadata()?;
1233 let id = WorkbookId(hash_path_metadata(path, &metadata));
1234 let slug = path
1235 .file_stem()
1236 .map(|s| s.to_string_lossy().to_string())
1237 .unwrap_or_else(|| "workbook".to_string());
1238 let folder = derive_folder(config, path);
1239 let short_id = make_short_workbook_id(&slug, id.as_str());
1240 let caps = BackendCaps::xlsx();
1241
1242 if !filter.matches(&slug, folder.as_deref(), path) {
1243 continue;
1244 }
1245
1246 let relative = path.strip_prefix(&config.workspace_root).unwrap_or(path);
1247 let descriptor = crate::model::WorkbookDescriptor {
1248 workbook_id: id,
1249 short_id,
1250 slug,
1251 folder,
1252 path: path_to_forward_slashes(relative),
1253 bytes: metadata.len(),
1254 last_modified: metadata
1255 .modified()
1256 .ok()
1257 .and_then(system_time_to_rfc3339)
1258 .map(|dt| dt.to_rfc3339_opts(chrono::SecondsFormat::Secs, true)),
1259 caps,
1260 };
1261 descriptors.push(descriptor);
1262 }
1263
1264 descriptors.sort_by(|a, b| a.slug.cmp(&b.slug));
1265
1266 Ok(WorkbookListResponse {
1267 workbooks: descriptors,
1268 })
1269}
1270
1271fn derive_folder(config: &Arc<ServerConfig>, path: &Path) -> Option<String> {
1272 path.strip_prefix(&config.workspace_root)
1273 .ok()
1274 .and_then(|relative| relative.parent())
1275 .and_then(|parent| parent.file_name())
1276 .map(|os| os.to_string_lossy().to_string())
1277}
1278
1279fn has_supported_extension(allowed: &[String], path: &Path) -> bool {
1280 path.extension()
1281 .and_then(|ext| ext.to_str())
1282 .map(|ext| {
1283 let lower = ext.to_ascii_lowercase();
1284 allowed.iter().any(|candidate| candidate == &lower)
1285 })
1286 .unwrap_or(false)
1287}