1use crate::analysis::{
2 classification,
3 formula::{FormulaAtlas, FormulaGraph},
4 style,
5};
6use crate::caps::BackendCaps;
7use crate::config::ServerConfig;
8use crate::model::{
9 NamedItemKind, NamedRangeDescriptor, SheetClassification, SheetOverviewResponse, SheetSummary,
10 WorkbookDescription, WorkbookId, WorkbookListResponse,
11};
12use crate::tools::filters::WorkbookFilter;
13use crate::utils::{
14 hash_path_metadata, make_short_workbook_id, path_to_forward_slashes, system_time_to_rfc3339,
15};
16use anyhow::{Context, Result, anyhow};
17use chrono::{DateTime, Utc};
18use parking_lot::RwLock;
19use std::cmp::Ordering;
20use std::collections::{HashMap, HashSet};
21use std::fs;
22use std::path::{Path, PathBuf};
23use std::sync::Arc;
24use umya_spreadsheet::reader::xlsx;
25use umya_spreadsheet::{DefinedName, Spreadsheet, Worksheet};
26
27const KV_MAX_WIDTH_FOR_DENSITY_CHECK: u32 = 6;
28const KV_SAMPLE_ROWS: u32 = 20;
29const KV_DENSITY_THRESHOLD: f32 = 0.4;
30const KV_CHECK_ROWS: u32 = 15;
31const KV_MAX_LABEL_LEN: usize = 25;
32const KV_MIN_TEXT_VALUE_LEN: usize = 2;
33const KV_MIN_PAIRS: u32 = 3;
34const KV_MIN_PAIR_RATIO: f32 = 0.3;
35
36const HEADER_MAX_SCAN_ROWS: u32 = 2;
37const HEADER_LONG_STRING_PENALTY_THRESHOLD: usize = 40;
38const HEADER_LONG_STRING_PENALTY: f32 = 1.5;
39const HEADER_PROPER_NOUN_MIN_LEN: usize = 5;
40const HEADER_PROPER_NOUN_PENALTY: f32 = 1.0;
41const HEADER_DIGIT_STRING_MIN_LEN: usize = 3;
42const HEADER_DIGIT_STRING_PENALTY: f32 = 0.5;
43const HEADER_DATE_PENALTY: f32 = 1.0;
44const HEADER_YEAR_LIKE_BONUS: f32 = 0.5;
45const HEADER_YEAR_MIN: f64 = 1900.0;
46const HEADER_YEAR_MAX: f64 = 2100.0;
47const HEADER_UNIQUE_BONUS: f32 = 0.2;
48const HEADER_NUMBER_PENALTY: f32 = 0.3;
49const HEADER_SINGLE_COL_MIN_SCORE: f32 = 1.5;
50const HEADER_SCORE_TIE_THRESHOLD: f32 = 0.3;
51const HEADER_SECOND_ROW_MIN_SCORE_RATIO: f32 = 0.6;
52
53pub struct WorkbookContext {
54 pub id: WorkbookId,
55 pub short_id: String,
56 pub slug: String,
57 pub path: PathBuf,
58 pub caps: BackendCaps,
59 pub bytes: u64,
60 pub last_modified: Option<DateTime<Utc>>,
61 spreadsheet: Arc<RwLock<Spreadsheet>>,
62 sheet_cache: RwLock<HashMap<String, Arc<SheetCacheEntry>>>,
63 formula_atlas: Arc<FormulaAtlas>,
64}
65
66pub struct SheetCacheEntry {
67 pub metrics: SheetMetrics,
68 pub style_tags: Vec<String>,
69 pub named_ranges: Vec<NamedRangeDescriptor>,
70 pub detected_regions: Vec<crate::model::DetectedRegion>,
71}
72
73#[derive(Debug, Clone)]
74pub struct SheetMetrics {
75 pub row_count: u32,
76 pub column_count: u32,
77 pub non_empty_cells: u32,
78 pub formula_cells: u32,
79 pub cached_values: u32,
80 pub comments: u32,
81 pub style_map: HashMap<String, StyleUsage>,
82 pub classification: SheetClassification,
83}
84
85#[derive(Debug, Clone)]
86pub struct StyleUsage {
87 pub occurrences: u32,
88 pub tags: Vec<String>,
89 pub example_cells: Vec<String>,
90}
91
92impl WorkbookContext {
93 pub fn load(_config: &Arc<ServerConfig>, path: &Path) -> Result<Self> {
94 let metadata = fs::metadata(path)
95 .with_context(|| format!("unable to read metadata for {:?}", path))?;
96 let slug = path
97 .file_stem()
98 .map(|s| s.to_string_lossy().to_string())
99 .unwrap_or_else(|| "workbook".to_string());
100 let bytes = metadata.len();
101 let last_modified = metadata.modified().ok().and_then(system_time_to_rfc3339);
102 let id = WorkbookId(hash_path_metadata(path, &metadata));
103 let spreadsheet =
104 xlsx::read(path).with_context(|| format!("failed to parse workbook {:?}", path))?;
105 let short_id = make_short_workbook_id(&slug, id.as_str());
106
107 Ok(Self {
108 id,
109 short_id,
110 slug,
111 path: path.to_path_buf(),
112 caps: BackendCaps::xlsx(),
113 bytes,
114 last_modified,
115 spreadsheet: Arc::new(RwLock::new(spreadsheet)),
116 sheet_cache: RwLock::new(HashMap::new()),
117 formula_atlas: Arc::new(FormulaAtlas::default()),
118 })
119 }
120
121 pub fn sheet_names(&self) -> Vec<String> {
122 let book = self.spreadsheet.read();
123 book.get_sheet_collection()
124 .iter()
125 .map(|sheet| sheet.get_name().to_string())
126 .collect()
127 }
128
129 pub fn describe(&self) -> WorkbookDescription {
130 let book = self.spreadsheet.read();
131 let defined_names_count = book.get_defined_names().len();
132 let table_count: usize = book
133 .get_sheet_collection()
134 .iter()
135 .map(|sheet| sheet.get_tables().len())
136 .sum();
137 let macros_present = false;
138
139 WorkbookDescription {
140 workbook_id: self.id.clone(),
141 short_id: self.short_id.clone(),
142 slug: self.slug.clone(),
143 path: path_to_forward_slashes(&self.path),
144 bytes: self.bytes,
145 sheet_count: book.get_sheet_collection().len(),
146 defined_names: defined_names_count,
147 tables: table_count,
148 macros_present,
149 last_modified: self
150 .last_modified
151 .map(|dt| dt.to_rfc3339_opts(chrono::SecondsFormat::Secs, true)),
152 caps: self.caps.clone(),
153 }
154 }
155
156 pub fn get_sheet_metrics(&self, sheet_name: &str) -> Result<Arc<SheetCacheEntry>> {
157 if let Some(entry) = self.sheet_cache.read().get(sheet_name) {
158 return Ok(entry.clone());
159 }
160
161 let mut writer = self.sheet_cache.write();
162 if let Some(entry) = writer.get(sheet_name) {
163 return Ok(entry.clone());
164 }
165
166 let book = self.spreadsheet.read();
167 let sheet = book
168 .get_sheet_by_name(sheet_name)
169 .ok_or_else(|| anyhow!("sheet {} not found", sheet_name))?;
170 let (metrics, style_tags) = compute_sheet_metrics(sheet);
171 let named_ranges = gather_named_ranges(sheet, book.get_defined_names());
172 let detected_regions = detect_regions(sheet, &metrics);
173
174 let entry = Arc::new(SheetCacheEntry {
175 metrics,
176 style_tags,
177 named_ranges,
178 detected_regions,
179 });
180
181 writer.insert(sheet_name.to_string(), entry.clone());
182 Ok(entry)
183 }
184
185 pub fn list_summaries(&self) -> Result<Vec<SheetSummary>> {
186 let book = self.spreadsheet.read();
187 let mut summaries = Vec::new();
188 for sheet in book.get_sheet_collection() {
189 let name = sheet.get_name().to_string();
190 let entry = self.get_sheet_metrics(&name)?;
191 summaries.push(SheetSummary {
192 name: name.clone(),
193 visible: sheet.get_sheet_state() != "hidden",
194 row_count: entry.metrics.row_count,
195 column_count: entry.metrics.column_count,
196 non_empty_cells: entry.metrics.non_empty_cells,
197 formula_cells: entry.metrics.formula_cells,
198 cached_values: entry.metrics.cached_values,
199 classification: entry.metrics.classification.clone(),
200 style_tags: entry.style_tags.clone(),
201 });
202 }
203 Ok(summaries)
204 }
205
206 pub fn with_sheet<T, F>(&self, sheet_name: &str, func: F) -> Result<T>
207 where
208 F: FnOnce(&Worksheet) -> T,
209 {
210 let book = self.spreadsheet.read();
211 let sheet = book
212 .get_sheet_by_name(sheet_name)
213 .ok_or_else(|| anyhow!("sheet {} not found", sheet_name))?;
214 Ok(func(sheet))
215 }
216
217 pub fn with_spreadsheet<T, F>(&self, func: F) -> Result<T>
218 where
219 F: FnOnce(&Spreadsheet) -> T,
220 {
221 let book = self.spreadsheet.read();
222 Ok(func(&book))
223 }
224
225 pub fn formula_graph(&self, sheet_name: &str) -> Result<FormulaGraph> {
226 self.with_sheet(sheet_name, |sheet| {
227 FormulaGraph::build(sheet, &self.formula_atlas)
228 })?
229 }
230
231 pub fn named_items(&self) -> Result<Vec<NamedRangeDescriptor>> {
232 let book = self.spreadsheet.read();
233 let sheet_names: Vec<String> = book
234 .get_sheet_collection()
235 .iter()
236 .map(|sheet| sheet.get_name().to_string())
237 .collect();
238 let mut items = Vec::new();
239 for defined in book.get_defined_names() {
240 let refers_to = defined.get_address();
241 let scope = if defined.has_local_sheet_id() {
242 let idx = *defined.get_local_sheet_id() as usize;
243 sheet_names.get(idx).cloned()
244 } else {
245 None
246 };
247 let kind = if refers_to.starts_with('=') {
248 NamedItemKind::Formula
249 } else {
250 NamedItemKind::NamedRange
251 };
252
253 items.push(NamedRangeDescriptor {
254 name: defined.get_name().to_string(),
255 scope: scope.clone(),
256 refers_to: refers_to.clone(),
257 kind,
258 sheet_name: scope,
259 comment: None,
260 });
261 }
262
263 for sheet in book.get_sheet_collection() {
264 for table in sheet.get_tables() {
265 let start = table.get_area().0.get_coordinate();
266 let end = table.get_area().1.get_coordinate();
267 items.push(NamedRangeDescriptor {
268 name: table.get_name().to_string(),
269 scope: Some(sheet.get_name().to_string()),
270 refers_to: format!("{}:{}", start, end),
271 kind: NamedItemKind::Table,
272 sheet_name: Some(sheet.get_name().to_string()),
273 comment: None,
274 });
275 }
276 }
277
278 Ok(items)
279 }
280
281 pub fn sheet_overview(&self, sheet_name: &str) -> Result<SheetOverviewResponse> {
282 let entry = self.get_sheet_metrics(sheet_name)?;
283 let narrative = classification::narrative(&entry.metrics);
284 let regions = classification::regions(&entry.metrics);
285 let key_ranges = classification::key_ranges(&entry.metrics);
286 let detected_regions = entry.detected_regions.clone();
287
288 Ok(SheetOverviewResponse {
289 workbook_id: self.id.clone(),
290 workbook_short_id: self.short_id.clone(),
291 sheet_name: sheet_name.to_string(),
292 narrative,
293 regions,
294 detected_regions,
295 key_ranges,
296 formula_ratio: if entry.metrics.non_empty_cells == 0 {
297 0.0
298 } else {
299 entry.metrics.formula_cells as f32 / entry.metrics.non_empty_cells as f32
300 },
301 notable_features: entry.style_tags.clone(),
302 })
303 }
304
305 pub fn detected_region(
306 &self,
307 sheet_name: &str,
308 id: u32,
309 ) -> Result<crate::model::DetectedRegion> {
310 let entry = self.get_sheet_metrics(sheet_name)?;
311 entry
312 .detected_regions
313 .iter()
314 .find(|r| r.id == id)
315 .cloned()
316 .ok_or_else(|| anyhow!("region {} not found on sheet {}", id, sheet_name))
317 }
318}
319
320fn contains_date_time_token(format_code: &str) -> bool {
321 let mut in_quote = false;
322 let mut in_bracket = false;
323 let chars: Vec<char> = format_code.chars().collect();
324
325 for (i, &ch) in chars.iter().enumerate() {
326 match ch {
327 '"' => in_quote = !in_quote,
328 '[' if !in_quote => in_bracket = true,
329 ']' if !in_quote => in_bracket = false,
330 'y' | 'd' | 'h' | 's' | 'm' if !in_quote && !in_bracket => {
331 if ch == 'm' {
332 let prev = if i > 0 { chars.get(i - 1) } else { None };
333 let next = chars.get(i + 1);
334 let after_time_sep = prev == Some(&':') || prev == Some(&'h');
335 let before_time_sep = next == Some(&':') || next == Some(&'s');
336 if after_time_sep || before_time_sep {
337 return true;
338 }
339 if prev == Some(&'m') || next == Some(&'m') {
340 return true;
341 }
342 if matches!(prev, Some(&'/') | Some(&'-') | Some(&'.'))
343 || matches!(next, Some(&'/') | Some(&'-') | Some(&'.'))
344 {
345 return true;
346 }
347 } else {
348 return true;
349 }
350 }
351 _ => {}
352 }
353 }
354 false
355}
356
357const DATE_FORMAT_IDS: &[u32] = &[
358 14, 15, 16, 17, 18, 19, 20, 21, 22, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 45, 46, 47, 50, 51,
359 52, 53, 54, 55, 56, 57, 58,
360];
361
362const EXCEL_LEAP_YEAR_BUG_SERIAL: i64 = 60;
363
364fn is_date_formatted(cell: &umya_spreadsheet::Cell) -> bool {
365 let Some(nf) = cell.get_style().get_number_format() else {
366 return false;
367 };
368
369 let format_id = nf.get_number_format_id();
370 if DATE_FORMAT_IDS.contains(format_id) {
371 return true;
372 }
373
374 let code = nf.get_format_code();
375 if code == "General" || code == "@" || code == "0" || code == "0.00" {
376 return false;
377 }
378
379 contains_date_time_token(code)
380}
381
382pub fn excel_serial_to_iso(serial: f64, use_1904_system: bool) -> String {
383 excel_serial_to_iso_with_leap_bug(serial, use_1904_system, true)
384}
385
386pub fn excel_serial_to_iso_with_leap_bug(
387 serial: f64,
388 use_1904_system: bool,
389 compensate_leap_bug: bool,
390) -> String {
391 use chrono::NaiveDate;
392
393 let days = serial.trunc() as i64;
394
395 if use_1904_system {
396 let epoch_1904 = NaiveDate::from_ymd_opt(1904, 1, 1).unwrap();
397 return epoch_1904
398 .checked_add_signed(chrono::Duration::days(days))
399 .map(|d| d.format("%Y-%m-%d").to_string())
400 .unwrap_or_else(|| serial.to_string());
401 }
402
403 let epoch = if compensate_leap_bug && days >= EXCEL_LEAP_YEAR_BUG_SERIAL {
404 NaiveDate::from_ymd_opt(1899, 12, 30).unwrap()
405 } else {
406 NaiveDate::from_ymd_opt(1899, 12, 31).unwrap()
407 };
408
409 epoch
410 .checked_add_signed(chrono::Duration::days(days))
411 .map(|d| d.format("%Y-%m-%d").to_string())
412 .unwrap_or_else(|| serial.to_string())
413}
414
415pub fn cell_to_value(cell: &umya_spreadsheet::Cell) -> Option<crate::model::CellValue> {
416 cell_to_value_with_date_system(cell, false)
417}
418
419pub fn cell_to_value_with_date_system(
420 cell: &umya_spreadsheet::Cell,
421 use_1904_system: bool,
422) -> Option<crate::model::CellValue> {
423 let raw = cell.get_value();
424 if raw.is_empty() {
425 return None;
426 }
427 if let Ok(number) = raw.parse::<f64>() {
428 if is_date_formatted(cell) {
429 return Some(crate::model::CellValue::Date(excel_serial_to_iso(
430 number,
431 use_1904_system,
432 )));
433 }
434 return Some(crate::model::CellValue::Number(number));
435 }
436
437 let lower = raw.to_ascii_lowercase();
438 if lower == "true" {
439 return Some(crate::model::CellValue::Bool(true));
440 }
441 if lower == "false" {
442 return Some(crate::model::CellValue::Bool(false));
443 }
444
445 Some(crate::model::CellValue::Text(raw.to_string()))
446}
447
448pub fn compute_sheet_metrics(sheet: &Worksheet) -> (SheetMetrics, Vec<String>) {
449 use std::collections::HashMap as StdHashMap;
450 let mut non_empty = 0u32;
451 let mut formulas = 0u32;
452 let mut cached = 0u32;
453 let comments = sheet.get_comments().len() as u32;
454 let mut style_usage: StdHashMap<String, StyleUsage> = StdHashMap::new();
455
456 for cell in sheet.get_cell_collection() {
457 let value = cell.get_value();
458 if !value.is_empty() {
459 non_empty += 1;
460 }
461 if cell.is_formula() {
462 formulas += 1;
463 if !cell.get_value().is_empty() {
464 cached += 1;
465 }
466 }
467
468 if let Some((style_key, usage)) = style::tag_cell(cell) {
469 let entry = style_usage.entry(style_key).or_insert_with(|| StyleUsage {
470 occurrences: 0,
471 tags: usage.tags.clone(),
472 example_cells: Vec::new(),
473 });
474 entry.occurrences += 1;
475 if entry.example_cells.len() < 5 {
476 entry.example_cells.push(usage.example_cell.clone());
477 }
478 }
479 }
480
481 let (max_col, max_row) = sheet.get_highest_column_and_row();
482
483 let classification = classification::classify(
484 non_empty,
485 formulas,
486 max_row,
487 max_col,
488 comments,
489 &style_usage,
490 );
491
492 let style_tags: Vec<String> = style_usage
493 .values()
494 .flat_map(|usage| usage.tags.clone())
495 .collect();
496
497 let metrics = SheetMetrics {
498 row_count: max_row,
499 column_count: max_col,
500 non_empty_cells: non_empty,
501 formula_cells: formulas,
502 cached_values: cached,
503 comments,
504 style_map: style_usage,
505 classification,
506 };
507 (metrics, style_tags)
508}
509
510#[derive(Debug, Clone, Copy)]
511struct Rect {
512 start_row: u32,
513 end_row: u32,
514 start_col: u32,
515 end_col: u32,
516}
517
518#[derive(Debug, Clone)]
519struct CellInfo {
520 value: Option<crate::model::CellValue>,
521 is_formula: bool,
522}
523
524#[derive(Debug)]
525struct Occupancy {
526 cells: HashMap<(u32, u32), CellInfo>,
527}
528
529impl Occupancy {
530 fn row_col_counts(&self, rect: &Rect) -> (Vec<u32>, Vec<u32>) {
531 let mut row_counts = vec![0u32; (rect.end_row - rect.start_row + 1) as usize];
532 let mut col_counts = vec![0u32; (rect.end_col - rect.start_col + 1) as usize];
533 for ((row, col), _) in self.cells.iter() {
534 if *row >= rect.start_row
535 && *row <= rect.end_row
536 && *col >= rect.start_col
537 && *col <= rect.end_col
538 {
539 row_counts[(row - rect.start_row) as usize] += 1;
540 col_counts[(col - rect.start_col) as usize] += 1;
541 }
542 }
543 (row_counts, col_counts)
544 }
545
546 fn stats_in_rect(&self, rect: &Rect) -> RegionStats {
547 let mut stats = RegionStats::default();
548 for ((row, col), info) in self.cells.iter() {
549 if *row < rect.start_row
550 || *row > rect.end_row
551 || *col < rect.start_col
552 || *col > rect.end_col
553 {
554 continue;
555 }
556 stats.non_empty += 1;
557 if info.is_formula {
558 stats.formulas += 1;
559 }
560 if let Some(val) = &info.value {
561 match val {
562 crate::model::CellValue::Text(_) => stats.text += 1,
563 crate::model::CellValue::Number(_) => stats.numbers += 1,
564 crate::model::CellValue::Bool(_) => stats.bools += 1,
565 crate::model::CellValue::Date(_) => stats.dates += 1,
566 crate::model::CellValue::Error(_) => stats.errors += 1,
567 }
568 }
569 }
570 stats
571 }
572
573 fn value_at(&self, row: u32, col: u32) -> Option<&crate::model::CellValue> {
574 self.cells.get(&(row, col)).and_then(|c| c.value.as_ref())
575 }
576}
577
578#[derive(Debug, Default, Clone)]
579struct RegionStats {
580 non_empty: u32,
581 formulas: u32,
582 text: u32,
583 numbers: u32,
584 bools: u32,
585 dates: u32,
586 errors: u32,
587}
588
589#[derive(Debug, Clone, Copy, PartialEq, Eq)]
590enum Gutter {
591 Row { start: u32, end: u32 },
592 Col { start: u32, end: u32 },
593}
594
595fn detect_regions(sheet: &Worksheet, metrics: &SheetMetrics) -> Vec<crate::model::DetectedRegion> {
596 if metrics.row_count == 0 || metrics.column_count == 0 {
597 return Vec::new();
598 }
599 let occupancy = build_occupancy(sheet);
600 let root = Rect {
601 start_row: 1,
602 end_row: metrics.row_count.max(1),
603 start_col: 1,
604 end_col: metrics.column_count.max(1),
605 };
606
607 let mut leaves = Vec::new();
608 split_rect(&occupancy, &root, &mut leaves);
609
610 let mut regions = Vec::new();
611 for (idx, rect) in leaves.into_iter().enumerate() {
612 if let Some(trimmed) = trim_rect(&occupancy, rect) {
613 let region = build_region(&occupancy, &trimmed, metrics, idx as u32);
614 regions.push(region);
615 }
616 }
617 regions
618}
619
620fn build_occupancy(sheet: &Worksheet) -> Occupancy {
621 let mut cells = HashMap::new();
622 for cell in sheet.get_cell_collection() {
623 let coord = cell.get_coordinate();
624 let row = *coord.get_row_num();
625 let col = *coord.get_col_num();
626 let value = cell_to_value(cell);
627 let is_formula = cell.is_formula();
628 cells.insert((row, col), CellInfo { value, is_formula });
629 }
630 Occupancy { cells }
631}
632
633fn split_rect(occupancy: &Occupancy, rect: &Rect, leaves: &mut Vec<Rect>) {
634 if rect.start_row >= rect.end_row && rect.start_col >= rect.end_col {
635 leaves.push(*rect);
636 return;
637 }
638 if let Some(gutter) = find_best_gutter(occupancy, rect) {
639 match gutter {
640 Gutter::Row { start, end } => {
641 if start > rect.start_row {
642 let upper = Rect {
643 start_row: rect.start_row,
644 end_row: start - 1,
645 start_col: rect.start_col,
646 end_col: rect.end_col,
647 };
648 split_rect(occupancy, &upper, leaves);
649 }
650 if end < rect.end_row {
651 let lower = Rect {
652 start_row: end + 1,
653 end_row: rect.end_row,
654 start_col: rect.start_col,
655 end_col: rect.end_col,
656 };
657 split_rect(occupancy, &lower, leaves);
658 }
659 }
660 Gutter::Col { start, end } => {
661 if start > rect.start_col {
662 let left = Rect {
663 start_row: rect.start_row,
664 end_row: rect.end_row,
665 start_col: rect.start_col,
666 end_col: start - 1,
667 };
668 split_rect(occupancy, &left, leaves);
669 }
670 if end < rect.end_col {
671 let right = Rect {
672 start_row: rect.start_row,
673 end_row: rect.end_row,
674 start_col: end + 1,
675 end_col: rect.end_col,
676 };
677 split_rect(occupancy, &right, leaves);
678 }
679 }
680 }
681 return;
682 }
683 leaves.push(*rect);
684}
685
686fn find_best_gutter(occupancy: &Occupancy, rect: &Rect) -> Option<Gutter> {
687 let (row_counts, col_counts) = occupancy.row_col_counts(rect);
688 let width = rect.end_col - rect.start_col + 1;
689 let height = rect.end_row - rect.start_row + 1;
690
691 let row_blank_runs = find_blank_runs(&row_counts, width);
692 let col_blank_runs = find_blank_runs(&col_counts, height);
693
694 let mut best: Option<(Gutter, u32)> = None;
695
696 if let Some((start, end, len)) = row_blank_runs {
697 let gutter = Gutter::Row {
698 start: rect.start_row + start,
699 end: rect.start_row + end,
700 };
701 best = Some((gutter, len));
702 }
703 if let Some((start, end, len)) = col_blank_runs {
704 let gutter = Gutter::Col {
705 start: rect.start_col + start,
706 end: rect.start_col + end,
707 };
708 if best.map(|(_, l)| len > l).unwrap_or(true) {
709 best = Some((gutter, len));
710 }
711 }
712
713 best.map(|(g, _)| g)
714}
715
716fn find_blank_runs(counts: &[u32], span: u32) -> Option<(u32, u32, u32)> {
717 if counts.is_empty() {
718 return None;
719 }
720 let mut best_start = 0;
721 let mut best_end = 0;
722 let mut best_len = 0;
723 let mut current_start = None;
724 for (idx, count) in counts.iter().enumerate() {
725 let is_blank = *count == 0 || (*count as f32 / span as f32) < 0.05;
726 if is_blank {
727 if current_start.is_none() {
728 current_start = Some(idx as u32);
729 }
730 } else if let Some(start) = current_start.take() {
731 let end = idx as u32 - 1;
732 let len = end - start + 1;
733 if len > best_len && start > 0 && end + 1 < counts.len() as u32 {
734 best_len = len;
735 best_start = start;
736 best_end = end;
737 }
738 }
739 }
740 if let Some(start) = current_start {
741 let end = counts.len() as u32 - 1;
742 let len = end - start + 1;
743 if len > best_len && start > 0 && end + 1 < counts.len() as u32 {
744 best_len = len;
745 best_start = start;
746 best_end = end;
747 }
748 }
749 if best_len >= 2 {
750 Some((best_start, best_end, best_len))
751 } else {
752 None
753 }
754}
755
756fn trim_rect(occupancy: &Occupancy, rect: Rect) -> Option<Rect> {
757 let mut r = rect;
758 loop {
759 let (row_counts, col_counts) = occupancy.row_col_counts(&r);
760 let width = r.end_col - r.start_col + 1;
761 let height = r.end_row - r.start_row + 1;
762 let top_blank = row_counts
763 .first()
764 .map(|c| *c == 0 || (*c as f32 / width as f32) < 0.1)
765 .unwrap_or(false);
766 let bottom_blank = row_counts
767 .last()
768 .map(|c| *c == 0 || (*c as f32 / width as f32) < 0.1)
769 .unwrap_or(false);
770 let left_blank = col_counts
771 .first()
772 .map(|c| *c == 0 || (*c as f32 / height as f32) < 0.1)
773 .unwrap_or(false);
774 let right_blank = col_counts
775 .last()
776 .map(|c| *c == 0 || (*c as f32 / height as f32) < 0.1)
777 .unwrap_or(false);
778
779 let mut changed = false;
780 if top_blank && r.start_row < r.end_row {
781 r.start_row += 1;
782 changed = true;
783 }
784 if bottom_blank && r.end_row > r.start_row {
785 r.end_row -= 1;
786 changed = true;
787 }
788 if left_blank && r.start_col < r.end_col {
789 r.start_col += 1;
790 changed = true;
791 }
792 if right_blank && r.end_col > r.start_col {
793 r.end_col -= 1;
794 changed = true;
795 }
796
797 if !changed {
798 break;
799 }
800 if r.start_row > r.end_row || r.start_col > r.end_col {
801 return None;
802 }
803 }
804 Some(r)
805}
806
807fn build_region(
808 occupancy: &Occupancy,
809 rect: &Rect,
810 metrics: &SheetMetrics,
811 id: u32,
812) -> crate::model::DetectedRegion {
813 let header_info = detect_headers(occupancy, rect);
814 let stats = occupancy.stats_in_rect(rect);
815 let (kind, confidence) = classify_region(rect, &stats, &header_info, metrics);
816 crate::model::DetectedRegion {
817 id,
818 bounds: format!(
819 "{}{}:{}{}",
820 crate::utils::column_number_to_name(rect.start_col),
821 rect.start_row,
822 crate::utils::column_number_to_name(rect.end_col),
823 rect.end_row
824 ),
825 header_row: header_info.header_row,
826 headers: header_info.headers,
827 row_count: rect.end_row - rect.start_row + 1,
828 classification: kind.clone(),
829 region_kind: Some(kind),
830 confidence,
831 }
832}
833
834#[derive(Debug, Default)]
835struct HeaderInfo {
836 header_row: Option<u32>,
837 headers: Vec<String>,
838 is_key_value: bool,
839}
840
841fn is_key_value_layout(occupancy: &Occupancy, rect: &Rect) -> bool {
842 let width = rect.end_col - rect.start_col + 1;
843
844 if width == 2 {
845 return check_key_value_columns(occupancy, rect, rect.start_col, rect.start_col + 1);
846 }
847
848 if width <= KV_MAX_WIDTH_FOR_DENSITY_CHECK {
849 let rows_to_sample = (rect.end_row - rect.start_row + 1).min(KV_SAMPLE_ROWS);
850 let density_threshold = (rows_to_sample as f32 * KV_DENSITY_THRESHOLD) as u32;
851
852 let mut col_densities: Vec<(u32, u32)> = Vec::new();
853 for col in rect.start_col..=rect.end_col {
854 let count = (rect.start_row..rect.start_row + rows_to_sample)
855 .filter(|&row| occupancy.value_at(row, col).is_some())
856 .count() as u32;
857 if count >= density_threshold {
858 col_densities.push((col, count));
859 }
860 }
861
862 if col_densities.len() == 2 {
863 let label_col = col_densities[0].0;
864 let value_col = col_densities[1].0;
865 return check_key_value_columns(occupancy, rect, label_col, value_col);
866 } else if col_densities.len() == 4 && width >= 4 {
867 let pair1 =
868 check_key_value_columns(occupancy, rect, col_densities[0].0, col_densities[1].0);
869 let pair2 =
870 check_key_value_columns(occupancy, rect, col_densities[2].0, col_densities[3].0);
871 return pair1 && pair2;
872 }
873 }
874
875 false
876}
877
878fn check_key_value_columns(
879 occupancy: &Occupancy,
880 rect: &Rect,
881 label_col: u32,
882 value_col: u32,
883) -> bool {
884 let mut label_value_pairs = 0u32;
885 let rows_to_check = (rect.end_row - rect.start_row + 1).min(KV_CHECK_ROWS);
886
887 for row in rect.start_row..rect.start_row + rows_to_check {
888 let first_col = occupancy.value_at(row, label_col);
889 let second_col = occupancy.value_at(row, value_col);
890
891 if let (Some(crate::model::CellValue::Text(label)), Some(val)) = (first_col, second_col) {
892 let label_looks_like_key = label.len() <= KV_MAX_LABEL_LEN
893 && !label.chars().any(|c| c.is_ascii_digit())
894 && label.contains(|c: char| c.is_alphabetic());
895
896 let value_is_data = matches!(
897 val,
898 crate::model::CellValue::Number(_) | crate::model::CellValue::Date(_)
899 ) || matches!(val, crate::model::CellValue::Text(s) if s.len() > KV_MIN_TEXT_VALUE_LEN);
900
901 if label_looks_like_key && value_is_data {
902 label_value_pairs += 1;
903 }
904 }
905 }
906
907 label_value_pairs >= KV_MIN_PAIRS
908 && label_value_pairs as f32 / rows_to_check as f32 >= KV_MIN_PAIR_RATIO
909}
910
911fn header_data_penalty(s: &str) -> f32 {
912 if s.is_empty() {
913 return 0.0;
914 }
915 if s.len() > HEADER_LONG_STRING_PENALTY_THRESHOLD {
916 return HEADER_LONG_STRING_PENALTY;
917 }
918 let first_char = s.chars().next().unwrap();
919 let is_capitalized = first_char.is_uppercase();
920 let has_lowercase = s.chars().skip(1).any(|c| c.is_lowercase());
921 let is_all_caps = s.chars().all(|c| !c.is_alphabetic() || c.is_uppercase());
922 let has_digits = s.chars().any(|c| c.is_ascii_digit());
923 let is_proper_noun =
924 is_capitalized && has_lowercase && !is_all_caps && s.len() > HEADER_PROPER_NOUN_MIN_LEN;
925
926 let mut penalty = 0.0;
927 if is_proper_noun {
928 penalty += HEADER_PROPER_NOUN_PENALTY;
929 }
930 if has_digits && s.len() > HEADER_DIGIT_STRING_MIN_LEN {
931 penalty += HEADER_DIGIT_STRING_PENALTY;
932 }
933 penalty
934}
935
936fn detect_headers(occupancy: &Occupancy, rect: &Rect) -> HeaderInfo {
937 if is_key_value_layout(occupancy, rect) {
938 let mut headers = Vec::new();
939 for col in rect.start_col..=rect.end_col {
940 headers.push(crate::utils::column_number_to_name(col));
941 }
942 return HeaderInfo {
943 header_row: None,
944 headers,
945 is_key_value: true,
946 };
947 }
948
949 let mut candidates = Vec::new();
950 let max_row = rect
951 .start_row
952 .saturating_add(HEADER_MAX_SCAN_ROWS)
953 .min(rect.end_row);
954 for row in rect.start_row..=max_row {
955 let mut text = 0;
956 let mut numbers = 0;
957 let mut non_empty = 0;
958 let mut unique = HashSet::new();
959 let mut data_like_penalty: f32 = 0.0;
960 let mut year_like_bonus: f32 = 0.0;
961
962 for col in rect.start_col..=rect.end_col {
963 if let Some(val) = occupancy.value_at(row, col) {
964 non_empty += 1;
965 match val {
966 crate::model::CellValue::Text(s) => {
967 text += 1;
968 unique.insert(s.clone());
969 data_like_penalty += header_data_penalty(s);
970 }
971 crate::model::CellValue::Number(n) => {
972 if *n >= HEADER_YEAR_MIN && *n <= HEADER_YEAR_MAX && n.fract() == 0.0 {
973 year_like_bonus += HEADER_YEAR_LIKE_BONUS;
974 text += 1;
975 } else {
976 numbers += 1;
977 }
978 }
979 crate::model::CellValue::Bool(_) => text += 1,
980 crate::model::CellValue::Date(_) => {
981 data_like_penalty += HEADER_DATE_PENALTY;
982 }
983 crate::model::CellValue::Error(_) => {}
984 }
985 }
986 }
987 if non_empty == 0 {
988 continue;
989 }
990 let score = text as f32 + unique.len() as f32 * HEADER_UNIQUE_BONUS
991 - numbers as f32 * HEADER_NUMBER_PENALTY
992 - data_like_penalty
993 + year_like_bonus;
994 candidates.push((row, score, text, non_empty));
995 }
996
997 let is_single_col = rect.start_col == rect.end_col;
998
999 let header_candidates: Vec<&(u32, f32, u32, u32)> = candidates
1000 .iter()
1001 .filter(|(_, score, text, non_empty)| {
1002 *text >= 1
1003 && *text * 2 >= *non_empty
1004 && (!is_single_col || *score > HEADER_SINGLE_COL_MIN_SCORE)
1005 })
1006 .collect();
1007
1008 let best = header_candidates.iter().copied().max_by(|a, b| {
1009 a.1.partial_cmp(&b.1)
1010 .unwrap_or(Ordering::Equal)
1011 .then_with(|| b.0.cmp(&a.0))
1012 });
1013 let earliest = header_candidates
1014 .iter()
1015 .copied()
1016 .min_by(|a, b| a.0.cmp(&b.0));
1017
1018 let maybe_header = match (best, earliest) {
1019 (Some(best_row), Some(early_row)) => {
1020 if (best_row.1 - early_row.1).abs() <= HEADER_SCORE_TIE_THRESHOLD {
1021 Some(early_row.0)
1022 } else {
1023 Some(best_row.0)
1024 }
1025 }
1026 (Some(best_row), None) => Some(best_row.0),
1027 _ => None,
1028 };
1029
1030 let mut header_rows = Vec::new();
1031 if let Some(hr) = maybe_header {
1032 header_rows.push(hr);
1033 if hr < rect.end_row
1034 && let Some((_, score_next, text_next, non_empty_next)) =
1035 candidates.iter().find(|(r, _, _, _)| *r == hr + 1)
1036 && *text_next >= 1
1037 && *text_next * 2 >= *non_empty_next
1038 && *score_next
1039 >= HEADER_SECOND_ROW_MIN_SCORE_RATIO
1040 * candidates
1041 .iter()
1042 .find(|(r, _, _, _)| *r == hr)
1043 .map(|c| c.1)
1044 .unwrap_or(0.0)
1045 {
1046 header_rows.push(hr + 1);
1047 }
1048 }
1049
1050 let mut headers = Vec::new();
1051 for col in rect.start_col..=rect.end_col {
1052 let mut parts = Vec::new();
1053 for hr in &header_rows {
1054 if let Some(val) = occupancy.value_at(*hr, col) {
1055 match val {
1056 crate::model::CellValue::Text(s) if !s.trim().is_empty() => {
1057 parts.push(s.trim().to_string())
1058 }
1059 crate::model::CellValue::Number(n) => parts.push(n.to_string()),
1060 crate::model::CellValue::Bool(b) => parts.push(b.to_string()),
1061 crate::model::CellValue::Date(d) => parts.push(d.clone()),
1062 crate::model::CellValue::Error(e) => parts.push(e.clone()),
1063 _ => {}
1064 }
1065 }
1066 }
1067 if parts.is_empty() {
1068 headers.push(crate::utils::column_number_to_name(col));
1069 } else {
1070 headers.push(parts.join(" / "));
1071 }
1072 }
1073
1074 HeaderInfo {
1075 header_row: header_rows.first().copied(),
1076 headers,
1077 is_key_value: false,
1078 }
1079}
1080
1081fn classify_region(
1082 rect: &Rect,
1083 stats: &RegionStats,
1084 header_info: &HeaderInfo,
1085 metrics: &SheetMetrics,
1086) -> (crate::model::RegionKind, f32) {
1087 let width = rect.end_col - rect.start_col + 1;
1088 let height = rect.end_row - rect.start_row + 1;
1089 let area = width.max(1) * height.max(1);
1090 let density = if area == 0 {
1091 0.0
1092 } else {
1093 stats.non_empty as f32 / area as f32
1094 };
1095 let formula_ratio = if stats.non_empty == 0 {
1096 0.0
1097 } else {
1098 stats.formulas as f32 / stats.non_empty as f32
1099 };
1100 let text_ratio = if stats.non_empty == 0 {
1101 0.0
1102 } else {
1103 stats.text as f32 / stats.non_empty as f32
1104 };
1105
1106 let mut kind = crate::model::RegionKind::Data;
1107 if formula_ratio > 0.25 && is_outputs_band(rect, metrics, height, width) {
1108 kind = crate::model::RegionKind::Outputs;
1109 } else if formula_ratio > 0.55 {
1110 kind = crate::model::RegionKind::Calculator;
1111 } else if height <= 3
1112 && width <= 4
1113 && text_ratio > 0.5
1114 && rect.end_row >= metrics.row_count.saturating_sub(3)
1115 {
1116 kind = crate::model::RegionKind::Metadata;
1117 } else if header_info.is_key_value
1118 || (formula_ratio < 0.25
1119 && stats.numbers > 0
1120 && stats.text > 0
1121 && text_ratio >= 0.3
1122 && (width <= 2 || (width <= 3 && header_info.header_row.is_none())))
1123 {
1124 kind = crate::model::RegionKind::Parameters;
1125 } else if height <= 4 && width <= 6 && formula_ratio < 0.2 && text_ratio > 0.4 && density < 0.5
1126 {
1127 kind = crate::model::RegionKind::Metadata;
1128 }
1129
1130 let mut confidence: f32 = 0.4;
1131 if header_info.header_row.is_some() {
1132 confidence += 0.2;
1133 }
1134 confidence += (density * 0.2).min(0.2);
1135 confidence += (formula_ratio * 0.2).min(0.2);
1136 if matches!(
1137 kind,
1138 crate::model::RegionKind::Parameters | crate::model::RegionKind::Metadata
1139 ) && width <= 4
1140 {
1141 confidence += 0.1;
1142 }
1143 if confidence > 1.0 {
1144 confidence = 1.0;
1145 }
1146
1147 (kind, confidence)
1148}
1149
1150fn is_outputs_band(rect: &Rect, metrics: &SheetMetrics, height: u32, width: u32) -> bool {
1151 let near_bottom = rect.end_row >= metrics.row_count.saturating_sub(6);
1152 let near_right = rect.end_col >= metrics.column_count.saturating_sub(3);
1153 let is_shallow = height <= 6;
1154 let is_narrow_at_edge = width <= 6 && near_right;
1155 let not_at_top_left = rect.start_row > 1 || rect.start_col > 1;
1156 let sheet_has_depth = metrics.row_count > 10 || metrics.column_count > 6;
1157 let is_band = (is_shallow && near_bottom) || is_narrow_at_edge;
1158 is_band && not_at_top_left && sheet_has_depth
1159}
1160
1161fn gather_named_ranges(
1162 sheet: &Worksheet,
1163 defined_names: &[DefinedName],
1164) -> Vec<NamedRangeDescriptor> {
1165 let name_str = sheet.get_name();
1166 defined_names
1167 .iter()
1168 .filter(|name| name.get_address().contains(name_str))
1169 .map(|name| NamedRangeDescriptor {
1170 name: name.get_name().to_string(),
1171 scope: if name.has_local_sheet_id() {
1172 Some(name_str.to_string())
1173 } else {
1174 None
1175 },
1176 refers_to: name.get_address(),
1177 kind: NamedItemKind::NamedRange,
1178 sheet_name: Some(name_str.to_string()),
1179 comment: None,
1180 })
1181 .collect()
1182}
1183
1184pub fn build_workbook_list(
1185 config: &Arc<ServerConfig>,
1186 filter: &WorkbookFilter,
1187) -> Result<WorkbookListResponse> {
1188 let mut descriptors = Vec::new();
1189
1190 if let Some(single) = config.single_workbook() {
1191 let metadata = fs::metadata(single)
1192 .with_context(|| format!("unable to read metadata for {:?}", single))?;
1193 let id = WorkbookId(hash_path_metadata(single, &metadata));
1194 let slug = single
1195 .file_stem()
1196 .map(|s| s.to_string_lossy().to_string())
1197 .unwrap_or_else(|| "workbook".to_string());
1198 let folder = derive_folder(config, single);
1199 let short_id = make_short_workbook_id(&slug, id.as_str());
1200 let caps = BackendCaps::xlsx();
1201
1202 if filter.matches(&slug, folder.as_deref(), single) {
1203 let relative = single
1204 .strip_prefix(&config.workspace_root)
1205 .unwrap_or(single);
1206 let descriptor = crate::model::WorkbookDescriptor {
1207 workbook_id: id,
1208 short_id,
1209 slug,
1210 folder,
1211 path: path_to_forward_slashes(relative),
1212 bytes: metadata.len(),
1213 last_modified: metadata
1214 .modified()
1215 .ok()
1216 .and_then(system_time_to_rfc3339)
1217 .map(|dt| dt.to_rfc3339_opts(chrono::SecondsFormat::Secs, true)),
1218 caps,
1219 };
1220 descriptors.push(descriptor);
1221 }
1222
1223 return Ok(WorkbookListResponse {
1224 workbooks: descriptors,
1225 });
1226 }
1227
1228 use walkdir::WalkDir;
1229
1230 for entry in WalkDir::new(&config.workspace_root) {
1231 let entry = entry?;
1232 if !entry.file_type().is_file() {
1233 continue;
1234 }
1235 let path = entry.path();
1236 if !has_supported_extension(&config.supported_extensions, path) {
1237 continue;
1238 }
1239 let metadata = entry.metadata()?;
1240 let id = WorkbookId(hash_path_metadata(path, &metadata));
1241 let slug = path
1242 .file_stem()
1243 .map(|s| s.to_string_lossy().to_string())
1244 .unwrap_or_else(|| "workbook".to_string());
1245 let folder = derive_folder(config, path);
1246 let short_id = make_short_workbook_id(&slug, id.as_str());
1247 let caps = BackendCaps::xlsx();
1248
1249 if !filter.matches(&slug, folder.as_deref(), path) {
1250 continue;
1251 }
1252
1253 let relative = path.strip_prefix(&config.workspace_root).unwrap_or(path);
1254 let descriptor = crate::model::WorkbookDescriptor {
1255 workbook_id: id,
1256 short_id,
1257 slug,
1258 folder,
1259 path: path_to_forward_slashes(relative),
1260 bytes: metadata.len(),
1261 last_modified: metadata
1262 .modified()
1263 .ok()
1264 .and_then(system_time_to_rfc3339)
1265 .map(|dt| dt.to_rfc3339_opts(chrono::SecondsFormat::Secs, true)),
1266 caps,
1267 };
1268 descriptors.push(descriptor);
1269 }
1270
1271 descriptors.sort_by(|a, b| a.slug.cmp(&b.slug));
1272
1273 Ok(WorkbookListResponse {
1274 workbooks: descriptors,
1275 })
1276}
1277
1278fn derive_folder(config: &Arc<ServerConfig>, path: &Path) -> Option<String> {
1279 path.strip_prefix(&config.workspace_root)
1280 .ok()
1281 .and_then(|relative| relative.parent())
1282 .and_then(|parent| parent.file_name())
1283 .map(|os| os.to_string_lossy().to_string())
1284}
1285
1286fn has_supported_extension(allowed: &[String], path: &Path) -> bool {
1287 path.extension()
1288 .and_then(|ext| ext.to_str())
1289 .map(|ext| {
1290 let lower = ext.to_ascii_lowercase();
1291 allowed.iter().any(|candidate| candidate == &lower)
1292 })
1293 .unwrap_or(false)
1294}