spreadsheet_mcp/
workbook.rs

1use crate::analysis::{
2    classification,
3    formula::{FormulaAtlas, FormulaGraph},
4    style,
5};
6use crate::caps::BackendCaps;
7use crate::config::ServerConfig;
8use crate::model::{
9    NamedItemKind, NamedRangeDescriptor, SheetClassification, SheetOverviewResponse, SheetSummary,
10    WorkbookDescription, WorkbookId, WorkbookListResponse,
11};
12use crate::tools::filters::WorkbookFilter;
13use crate::utils::{
14    hash_path_metadata, make_short_workbook_id, path_to_forward_slashes, system_time_to_rfc3339,
15};
16use anyhow::{Context, Result, anyhow};
17use chrono::{DateTime, Utc};
18use parking_lot::RwLock;
19use std::cmp::Ordering;
20use std::collections::{HashMap, HashSet};
21use std::fs;
22use std::path::{Path, PathBuf};
23use std::sync::Arc;
24use umya_spreadsheet::reader::xlsx;
25use umya_spreadsheet::{DefinedName, Spreadsheet, Worksheet};
26
27const KV_MAX_WIDTH_FOR_DENSITY_CHECK: u32 = 6;
28const KV_SAMPLE_ROWS: u32 = 20;
29const KV_DENSITY_THRESHOLD: f32 = 0.4;
30const KV_CHECK_ROWS: u32 = 15;
31const KV_MAX_LABEL_LEN: usize = 25;
32const KV_MIN_TEXT_VALUE_LEN: usize = 2;
33const KV_MIN_PAIRS: u32 = 3;
34const KV_MIN_PAIR_RATIO: f32 = 0.3;
35
36const HEADER_MAX_SCAN_ROWS: u32 = 2;
37const HEADER_LONG_STRING_PENALTY_THRESHOLD: usize = 40;
38const HEADER_LONG_STRING_PENALTY: f32 = 1.5;
39const HEADER_PROPER_NOUN_MIN_LEN: usize = 5;
40const HEADER_PROPER_NOUN_PENALTY: f32 = 1.0;
41const HEADER_DIGIT_STRING_MIN_LEN: usize = 3;
42const HEADER_DIGIT_STRING_PENALTY: f32 = 0.5;
43const HEADER_DATE_PENALTY: f32 = 1.0;
44const HEADER_YEAR_LIKE_BONUS: f32 = 0.5;
45const HEADER_YEAR_MIN: f64 = 1900.0;
46const HEADER_YEAR_MAX: f64 = 2100.0;
47const HEADER_UNIQUE_BONUS: f32 = 0.2;
48const HEADER_NUMBER_PENALTY: f32 = 0.3;
49const HEADER_SINGLE_COL_MIN_SCORE: f32 = 1.5;
50const HEADER_SCORE_TIE_THRESHOLD: f32 = 0.3;
51const HEADER_SECOND_ROW_MIN_SCORE_RATIO: f32 = 0.6;
52
53pub struct WorkbookContext {
54    pub id: WorkbookId,
55    pub short_id: String,
56    pub slug: String,
57    pub path: PathBuf,
58    pub caps: BackendCaps,
59    pub bytes: u64,
60    pub last_modified: Option<DateTime<Utc>>,
61    spreadsheet: Arc<RwLock<Spreadsheet>>,
62    sheet_cache: RwLock<HashMap<String, Arc<SheetCacheEntry>>>,
63    formula_atlas: Arc<FormulaAtlas>,
64}
65
66pub struct SheetCacheEntry {
67    pub metrics: SheetMetrics,
68    pub style_tags: Vec<String>,
69    pub named_ranges: Vec<NamedRangeDescriptor>,
70    pub detected_regions: Vec<crate::model::DetectedRegion>,
71}
72
73#[derive(Debug, Clone)]
74pub struct SheetMetrics {
75    pub row_count: u32,
76    pub column_count: u32,
77    pub non_empty_cells: u32,
78    pub formula_cells: u32,
79    pub cached_values: u32,
80    pub comments: u32,
81    pub style_map: HashMap<String, StyleUsage>,
82    pub classification: SheetClassification,
83}
84
85#[derive(Debug, Clone)]
86pub struct StyleUsage {
87    pub occurrences: u32,
88    pub tags: Vec<String>,
89    pub example_cells: Vec<String>,
90}
91
92impl WorkbookContext {
93    pub fn load(_config: &Arc<ServerConfig>, path: &Path) -> Result<Self> {
94        let metadata = fs::metadata(path)
95            .with_context(|| format!("unable to read metadata for {:?}", path))?;
96        let slug = path
97            .file_stem()
98            .map(|s| s.to_string_lossy().to_string())
99            .unwrap_or_else(|| "workbook".to_string());
100        let bytes = metadata.len();
101        let last_modified = metadata.modified().ok().and_then(system_time_to_rfc3339);
102        let id = WorkbookId(hash_path_metadata(path, &metadata));
103        let spreadsheet =
104            xlsx::read(path).with_context(|| format!("failed to parse workbook {:?}", path))?;
105        let short_id = make_short_workbook_id(&slug, id.as_str());
106
107        Ok(Self {
108            id,
109            short_id,
110            slug,
111            path: path.to_path_buf(),
112            caps: BackendCaps::xlsx(),
113            bytes,
114            last_modified,
115            spreadsheet: Arc::new(RwLock::new(spreadsheet)),
116            sheet_cache: RwLock::new(HashMap::new()),
117            formula_atlas: Arc::new(FormulaAtlas::default()),
118        })
119    }
120
121    pub fn sheet_names(&self) -> Vec<String> {
122        let book = self.spreadsheet.read();
123        book.get_sheet_collection()
124            .iter()
125            .map(|sheet| sheet.get_name().to_string())
126            .collect()
127    }
128
129    pub fn describe(&self) -> WorkbookDescription {
130        let book = self.spreadsheet.read();
131        let defined_names_count = book.get_defined_names().len();
132        let table_count: usize = book
133            .get_sheet_collection()
134            .iter()
135            .map(|sheet| sheet.get_tables().len())
136            .sum();
137        let macros_present = false;
138
139        WorkbookDescription {
140            workbook_id: self.id.clone(),
141            short_id: self.short_id.clone(),
142            slug: self.slug.clone(),
143            path: path_to_forward_slashes(&self.path),
144            bytes: self.bytes,
145            sheet_count: book.get_sheet_collection().len(),
146            defined_names: defined_names_count,
147            tables: table_count,
148            macros_present,
149            last_modified: self
150                .last_modified
151                .map(|dt| dt.to_rfc3339_opts(chrono::SecondsFormat::Secs, true)),
152            caps: self.caps.clone(),
153        }
154    }
155
156    pub fn get_sheet_metrics(&self, sheet_name: &str) -> Result<Arc<SheetCacheEntry>> {
157        if let Some(entry) = self.sheet_cache.read().get(sheet_name) {
158            return Ok(entry.clone());
159        }
160
161        let mut writer = self.sheet_cache.write();
162        if let Some(entry) = writer.get(sheet_name) {
163            return Ok(entry.clone());
164        }
165
166        let book = self.spreadsheet.read();
167        let sheet = book
168            .get_sheet_by_name(sheet_name)
169            .ok_or_else(|| anyhow!("sheet {} not found", sheet_name))?;
170        let (metrics, style_tags) = compute_sheet_metrics(sheet);
171        let named_ranges = gather_named_ranges(sheet, book.get_defined_names());
172        let detected_regions = detect_regions(sheet, &metrics);
173
174        let entry = Arc::new(SheetCacheEntry {
175            metrics,
176            style_tags,
177            named_ranges,
178            detected_regions,
179        });
180
181        writer.insert(sheet_name.to_string(), entry.clone());
182        Ok(entry)
183    }
184
185    pub fn list_summaries(&self) -> Result<Vec<SheetSummary>> {
186        let book = self.spreadsheet.read();
187        let mut summaries = Vec::new();
188        for sheet in book.get_sheet_collection() {
189            let name = sheet.get_name().to_string();
190            let entry = self.get_sheet_metrics(&name)?;
191            summaries.push(SheetSummary {
192                name: name.clone(),
193                visible: sheet.get_sheet_state() != "hidden",
194                row_count: entry.metrics.row_count,
195                column_count: entry.metrics.column_count,
196                non_empty_cells: entry.metrics.non_empty_cells,
197                formula_cells: entry.metrics.formula_cells,
198                cached_values: entry.metrics.cached_values,
199                classification: entry.metrics.classification.clone(),
200                style_tags: entry.style_tags.clone(),
201            });
202        }
203        Ok(summaries)
204    }
205
206    pub fn with_sheet<T, F>(&self, sheet_name: &str, func: F) -> Result<T>
207    where
208        F: FnOnce(&Worksheet) -> T,
209    {
210        let book = self.spreadsheet.read();
211        let sheet = book
212            .get_sheet_by_name(sheet_name)
213            .ok_or_else(|| anyhow!("sheet {} not found", sheet_name))?;
214        Ok(func(sheet))
215    }
216
217    pub fn formula_graph(&self, sheet_name: &str) -> Result<FormulaGraph> {
218        self.with_sheet(sheet_name, |sheet| {
219            FormulaGraph::build(sheet, &self.formula_atlas)
220        })?
221    }
222
223    pub fn named_items(&self) -> Result<Vec<NamedRangeDescriptor>> {
224        let book = self.spreadsheet.read();
225        let sheet_names: Vec<String> = book
226            .get_sheet_collection()
227            .iter()
228            .map(|sheet| sheet.get_name().to_string())
229            .collect();
230        let mut items = Vec::new();
231
232        for defined in book.get_defined_names() {
233            let refers_to = defined.get_address();
234            let scope = if defined.has_local_sheet_id() {
235                let idx = *defined.get_local_sheet_id() as usize;
236                sheet_names.get(idx).cloned()
237            } else {
238                None
239            };
240            let kind = if refers_to.starts_with('=') {
241                NamedItemKind::Formula
242            } else {
243                NamedItemKind::NamedRange
244            };
245
246            items.push(NamedRangeDescriptor {
247                name: defined.get_name().to_string(),
248                scope: scope.clone(),
249                refers_to: refers_to.clone(),
250                kind,
251                sheet_name: scope,
252                comment: None,
253            });
254        }
255
256        for sheet in book.get_sheet_collection() {
257            for table in sheet.get_tables() {
258                let start = table.get_area().0.get_coordinate();
259                let end = table.get_area().1.get_coordinate();
260                items.push(NamedRangeDescriptor {
261                    name: table.get_name().to_string(),
262                    scope: Some(sheet.get_name().to_string()),
263                    refers_to: format!("{}:{}", start, end),
264                    kind: NamedItemKind::Table,
265                    sheet_name: Some(sheet.get_name().to_string()),
266                    comment: None,
267                });
268            }
269        }
270
271        Ok(items)
272    }
273
274    pub fn sheet_overview(&self, sheet_name: &str) -> Result<SheetOverviewResponse> {
275        let entry = self.get_sheet_metrics(sheet_name)?;
276        let narrative = classification::narrative(&entry.metrics);
277        let regions = classification::regions(&entry.metrics);
278        let key_ranges = classification::key_ranges(&entry.metrics);
279        let detected_regions = entry.detected_regions.clone();
280
281        Ok(SheetOverviewResponse {
282            workbook_id: self.id.clone(),
283            workbook_short_id: self.short_id.clone(),
284            sheet_name: sheet_name.to_string(),
285            narrative,
286            regions,
287            detected_regions,
288            key_ranges,
289            formula_ratio: if entry.metrics.non_empty_cells == 0 {
290                0.0
291            } else {
292                entry.metrics.formula_cells as f32 / entry.metrics.non_empty_cells as f32
293            },
294            notable_features: entry.style_tags.clone(),
295        })
296    }
297
298    pub fn detected_region(
299        &self,
300        sheet_name: &str,
301        id: u32,
302    ) -> Result<crate::model::DetectedRegion> {
303        let entry = self.get_sheet_metrics(sheet_name)?;
304        entry
305            .detected_regions
306            .iter()
307            .find(|r| r.id == id)
308            .cloned()
309            .ok_or_else(|| anyhow!("region {} not found on sheet {}", id, sheet_name))
310    }
311}
312
313fn contains_date_time_token(format_code: &str) -> bool {
314    let mut in_quote = false;
315    let mut in_bracket = false;
316    let chars: Vec<char> = format_code.chars().collect();
317
318    for (i, &ch) in chars.iter().enumerate() {
319        match ch {
320            '"' => in_quote = !in_quote,
321            '[' if !in_quote => in_bracket = true,
322            ']' if !in_quote => in_bracket = false,
323            'y' | 'd' | 'h' | 's' | 'm' if !in_quote && !in_bracket => {
324                if ch == 'm' {
325                    let prev = if i > 0 { chars.get(i - 1) } else { None };
326                    let next = chars.get(i + 1);
327                    let after_time_sep = prev == Some(&':') || prev == Some(&'h');
328                    let before_time_sep = next == Some(&':') || next == Some(&'s');
329                    if after_time_sep || before_time_sep {
330                        return true;
331                    }
332                    if prev == Some(&'m') || next == Some(&'m') {
333                        return true;
334                    }
335                    if matches!(prev, Some(&'/') | Some(&'-') | Some(&'.'))
336                        || matches!(next, Some(&'/') | Some(&'-') | Some(&'.'))
337                    {
338                        return true;
339                    }
340                } else {
341                    return true;
342                }
343            }
344            _ => {}
345        }
346    }
347    false
348}
349
350const DATE_FORMAT_IDS: &[u32] = &[
351    14, 15, 16, 17, 18, 19, 20, 21, 22, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 45, 46, 47, 50, 51,
352    52, 53, 54, 55, 56, 57, 58,
353];
354
355const EXCEL_LEAP_YEAR_BUG_SERIAL: i64 = 60;
356
357fn is_date_formatted(cell: &umya_spreadsheet::Cell) -> bool {
358    let Some(nf) = cell.get_style().get_number_format() else {
359        return false;
360    };
361
362    let format_id = nf.get_number_format_id();
363    if DATE_FORMAT_IDS.contains(format_id) {
364        return true;
365    }
366
367    let code = nf.get_format_code();
368    if code == "General" || code == "@" || code == "0" || code == "0.00" {
369        return false;
370    }
371
372    contains_date_time_token(code)
373}
374
375pub fn excel_serial_to_iso(serial: f64, use_1904_system: bool) -> String {
376    excel_serial_to_iso_with_leap_bug(serial, use_1904_system, true)
377}
378
379pub fn excel_serial_to_iso_with_leap_bug(
380    serial: f64,
381    use_1904_system: bool,
382    compensate_leap_bug: bool,
383) -> String {
384    use chrono::NaiveDate;
385
386    let days = serial.trunc() as i64;
387
388    if use_1904_system {
389        let epoch_1904 = NaiveDate::from_ymd_opt(1904, 1, 1).unwrap();
390        return epoch_1904
391            .checked_add_signed(chrono::Duration::days(days))
392            .map(|d| d.format("%Y-%m-%d").to_string())
393            .unwrap_or_else(|| serial.to_string());
394    }
395
396    let epoch = if compensate_leap_bug && days >= EXCEL_LEAP_YEAR_BUG_SERIAL {
397        NaiveDate::from_ymd_opt(1899, 12, 30).unwrap()
398    } else {
399        NaiveDate::from_ymd_opt(1899, 12, 31).unwrap()
400    };
401
402    epoch
403        .checked_add_signed(chrono::Duration::days(days))
404        .map(|d| d.format("%Y-%m-%d").to_string())
405        .unwrap_or_else(|| serial.to_string())
406}
407
408pub fn cell_to_value(cell: &umya_spreadsheet::Cell) -> Option<crate::model::CellValue> {
409    cell_to_value_with_date_system(cell, false)
410}
411
412pub fn cell_to_value_with_date_system(
413    cell: &umya_spreadsheet::Cell,
414    use_1904_system: bool,
415) -> Option<crate::model::CellValue> {
416    let raw = cell.get_value();
417    if raw.is_empty() {
418        return None;
419    }
420    if let Ok(number) = raw.parse::<f64>() {
421        if is_date_formatted(cell) {
422            return Some(crate::model::CellValue::Date(excel_serial_to_iso(
423                number,
424                use_1904_system,
425            )));
426        }
427        return Some(crate::model::CellValue::Number(number));
428    }
429
430    let lower = raw.to_ascii_lowercase();
431    if lower == "true" {
432        return Some(crate::model::CellValue::Bool(true));
433    }
434    if lower == "false" {
435        return Some(crate::model::CellValue::Bool(false));
436    }
437
438    Some(crate::model::CellValue::Text(raw.to_string()))
439}
440
441pub fn compute_sheet_metrics(sheet: &Worksheet) -> (SheetMetrics, Vec<String>) {
442    use std::collections::HashMap as StdHashMap;
443    let mut non_empty = 0u32;
444    let mut formulas = 0u32;
445    let mut cached = 0u32;
446    let comments = sheet.get_comments().len() as u32;
447    let mut style_usage: StdHashMap<String, StyleUsage> = StdHashMap::new();
448
449    for cell in sheet.get_cell_collection() {
450        let value = cell.get_value();
451        if !value.is_empty() {
452            non_empty += 1;
453        }
454        if cell.is_formula() {
455            formulas += 1;
456            if !cell.get_value().is_empty() {
457                cached += 1;
458            }
459        }
460
461        if let Some((style_key, usage)) = style::tag_cell(cell) {
462            let entry = style_usage.entry(style_key).or_insert_with(|| StyleUsage {
463                occurrences: 0,
464                tags: usage.tags.clone(),
465                example_cells: Vec::new(),
466            });
467            entry.occurrences += 1;
468            if entry.example_cells.len() < 5 {
469                entry.example_cells.push(usage.example_cell.clone());
470            }
471        }
472    }
473
474    let (max_col, max_row) = sheet.get_highest_column_and_row();
475
476    let classification = classification::classify(
477        non_empty,
478        formulas,
479        max_row,
480        max_col,
481        comments,
482        &style_usage,
483    );
484
485    let style_tags: Vec<String> = style_usage
486        .values()
487        .flat_map(|usage| usage.tags.clone())
488        .collect();
489
490    let metrics = SheetMetrics {
491        row_count: max_row,
492        column_count: max_col,
493        non_empty_cells: non_empty,
494        formula_cells: formulas,
495        cached_values: cached,
496        comments,
497        style_map: style_usage,
498        classification,
499    };
500    (metrics, style_tags)
501}
502
503#[derive(Debug, Clone, Copy)]
504struct Rect {
505    start_row: u32,
506    end_row: u32,
507    start_col: u32,
508    end_col: u32,
509}
510
511#[derive(Debug, Clone)]
512struct CellInfo {
513    value: Option<crate::model::CellValue>,
514    is_formula: bool,
515}
516
517#[derive(Debug)]
518struct Occupancy {
519    cells: HashMap<(u32, u32), CellInfo>,
520}
521
522impl Occupancy {
523    fn row_col_counts(&self, rect: &Rect) -> (Vec<u32>, Vec<u32>) {
524        let mut row_counts = vec![0u32; (rect.end_row - rect.start_row + 1) as usize];
525        let mut col_counts = vec![0u32; (rect.end_col - rect.start_col + 1) as usize];
526        for ((row, col), _) in self.cells.iter() {
527            if *row >= rect.start_row
528                && *row <= rect.end_row
529                && *col >= rect.start_col
530                && *col <= rect.end_col
531            {
532                row_counts[(row - rect.start_row) as usize] += 1;
533                col_counts[(col - rect.start_col) as usize] += 1;
534            }
535        }
536        (row_counts, col_counts)
537    }
538
539    fn stats_in_rect(&self, rect: &Rect) -> RegionStats {
540        let mut stats = RegionStats::default();
541        for ((row, col), info) in self.cells.iter() {
542            if *row < rect.start_row
543                || *row > rect.end_row
544                || *col < rect.start_col
545                || *col > rect.end_col
546            {
547                continue;
548            }
549            stats.non_empty += 1;
550            if info.is_formula {
551                stats.formulas += 1;
552            }
553            if let Some(val) = &info.value {
554                match val {
555                    crate::model::CellValue::Text(_) => stats.text += 1,
556                    crate::model::CellValue::Number(_) => stats.numbers += 1,
557                    crate::model::CellValue::Bool(_) => stats.bools += 1,
558                    crate::model::CellValue::Date(_) => stats.dates += 1,
559                    crate::model::CellValue::Error(_) => stats.errors += 1,
560                }
561            }
562        }
563        stats
564    }
565
566    fn value_at(&self, row: u32, col: u32) -> Option<&crate::model::CellValue> {
567        self.cells.get(&(row, col)).and_then(|c| c.value.as_ref())
568    }
569}
570
571#[derive(Debug, Default, Clone)]
572struct RegionStats {
573    non_empty: u32,
574    formulas: u32,
575    text: u32,
576    numbers: u32,
577    bools: u32,
578    dates: u32,
579    errors: u32,
580}
581
582#[derive(Debug, Clone, Copy, PartialEq, Eq)]
583enum Gutter {
584    Row { start: u32, end: u32 },
585    Col { start: u32, end: u32 },
586}
587
588fn detect_regions(sheet: &Worksheet, metrics: &SheetMetrics) -> Vec<crate::model::DetectedRegion> {
589    if metrics.row_count == 0 || metrics.column_count == 0 {
590        return Vec::new();
591    }
592    let occupancy = build_occupancy(sheet);
593    let root = Rect {
594        start_row: 1,
595        end_row: metrics.row_count.max(1),
596        start_col: 1,
597        end_col: metrics.column_count.max(1),
598    };
599
600    let mut leaves = Vec::new();
601    split_rect(&occupancy, &root, &mut leaves);
602
603    let mut regions = Vec::new();
604    for (idx, rect) in leaves.into_iter().enumerate() {
605        if let Some(trimmed) = trim_rect(&occupancy, rect) {
606            let region = build_region(&occupancy, &trimmed, metrics, idx as u32);
607            regions.push(region);
608        }
609    }
610    regions
611}
612
613fn build_occupancy(sheet: &Worksheet) -> Occupancy {
614    let mut cells = HashMap::new();
615    for cell in sheet.get_cell_collection() {
616        let coord = cell.get_coordinate();
617        let row = *coord.get_row_num();
618        let col = *coord.get_col_num();
619        let value = cell_to_value(cell);
620        let is_formula = cell.is_formula();
621        cells.insert((row, col), CellInfo { value, is_formula });
622    }
623    Occupancy { cells }
624}
625
626fn split_rect(occupancy: &Occupancy, rect: &Rect, leaves: &mut Vec<Rect>) {
627    if rect.start_row >= rect.end_row && rect.start_col >= rect.end_col {
628        leaves.push(*rect);
629        return;
630    }
631    if let Some(gutter) = find_best_gutter(occupancy, rect) {
632        match gutter {
633            Gutter::Row { start, end } => {
634                if start > rect.start_row {
635                    let upper = Rect {
636                        start_row: rect.start_row,
637                        end_row: start - 1,
638                        start_col: rect.start_col,
639                        end_col: rect.end_col,
640                    };
641                    split_rect(occupancy, &upper, leaves);
642                }
643                if end < rect.end_row {
644                    let lower = Rect {
645                        start_row: end + 1,
646                        end_row: rect.end_row,
647                        start_col: rect.start_col,
648                        end_col: rect.end_col,
649                    };
650                    split_rect(occupancy, &lower, leaves);
651                }
652            }
653            Gutter::Col { start, end } => {
654                if start > rect.start_col {
655                    let left = Rect {
656                        start_row: rect.start_row,
657                        end_row: rect.end_row,
658                        start_col: rect.start_col,
659                        end_col: start - 1,
660                    };
661                    split_rect(occupancy, &left, leaves);
662                }
663                if end < rect.end_col {
664                    let right = Rect {
665                        start_row: rect.start_row,
666                        end_row: rect.end_row,
667                        start_col: end + 1,
668                        end_col: rect.end_col,
669                    };
670                    split_rect(occupancy, &right, leaves);
671                }
672            }
673        }
674        return;
675    }
676    leaves.push(*rect);
677}
678
679fn find_best_gutter(occupancy: &Occupancy, rect: &Rect) -> Option<Gutter> {
680    let (row_counts, col_counts) = occupancy.row_col_counts(rect);
681    let width = rect.end_col - rect.start_col + 1;
682    let height = rect.end_row - rect.start_row + 1;
683
684    let row_blank_runs = find_blank_runs(&row_counts, width);
685    let col_blank_runs = find_blank_runs(&col_counts, height);
686
687    let mut best: Option<(Gutter, u32)> = None;
688
689    if let Some((start, end, len)) = row_blank_runs {
690        let gutter = Gutter::Row {
691            start: rect.start_row + start,
692            end: rect.start_row + end,
693        };
694        best = Some((gutter, len));
695    }
696    if let Some((start, end, len)) = col_blank_runs {
697        let gutter = Gutter::Col {
698            start: rect.start_col + start,
699            end: rect.start_col + end,
700        };
701        if best.map(|(_, l)| len > l).unwrap_or(true) {
702            best = Some((gutter, len));
703        }
704    }
705
706    best.map(|(g, _)| g)
707}
708
709fn find_blank_runs(counts: &[u32], span: u32) -> Option<(u32, u32, u32)> {
710    if counts.is_empty() {
711        return None;
712    }
713    let mut best_start = 0;
714    let mut best_end = 0;
715    let mut best_len = 0;
716    let mut current_start = None;
717    for (idx, count) in counts.iter().enumerate() {
718        let is_blank = *count == 0 || (*count as f32 / span as f32) < 0.05;
719        if is_blank {
720            if current_start.is_none() {
721                current_start = Some(idx as u32);
722            }
723        } else if let Some(start) = current_start.take() {
724            let end = idx as u32 - 1;
725            let len = end - start + 1;
726            if len > best_len && start > 0 && end + 1 < counts.len() as u32 {
727                best_len = len;
728                best_start = start;
729                best_end = end;
730            }
731        }
732    }
733    if let Some(start) = current_start {
734        let end = counts.len() as u32 - 1;
735        let len = end - start + 1;
736        if len > best_len && start > 0 && end + 1 < counts.len() as u32 {
737            best_len = len;
738            best_start = start;
739            best_end = end;
740        }
741    }
742    if best_len >= 2 {
743        Some((best_start, best_end, best_len))
744    } else {
745        None
746    }
747}
748
749fn trim_rect(occupancy: &Occupancy, rect: Rect) -> Option<Rect> {
750    let mut r = rect;
751    loop {
752        let (row_counts, col_counts) = occupancy.row_col_counts(&r);
753        let width = r.end_col - r.start_col + 1;
754        let height = r.end_row - r.start_row + 1;
755        let top_blank = row_counts
756            .first()
757            .map(|c| *c == 0 || (*c as f32 / width as f32) < 0.1)
758            .unwrap_or(false);
759        let bottom_blank = row_counts
760            .last()
761            .map(|c| *c == 0 || (*c as f32 / width as f32) < 0.1)
762            .unwrap_or(false);
763        let left_blank = col_counts
764            .first()
765            .map(|c| *c == 0 || (*c as f32 / height as f32) < 0.1)
766            .unwrap_or(false);
767        let right_blank = col_counts
768            .last()
769            .map(|c| *c == 0 || (*c as f32 / height as f32) < 0.1)
770            .unwrap_or(false);
771
772        let mut changed = false;
773        if top_blank && r.start_row < r.end_row {
774            r.start_row += 1;
775            changed = true;
776        }
777        if bottom_blank && r.end_row > r.start_row {
778            r.end_row -= 1;
779            changed = true;
780        }
781        if left_blank && r.start_col < r.end_col {
782            r.start_col += 1;
783            changed = true;
784        }
785        if right_blank && r.end_col > r.start_col {
786            r.end_col -= 1;
787            changed = true;
788        }
789
790        if !changed {
791            break;
792        }
793        if r.start_row > r.end_row || r.start_col > r.end_col {
794            return None;
795        }
796    }
797    Some(r)
798}
799
800fn build_region(
801    occupancy: &Occupancy,
802    rect: &Rect,
803    metrics: &SheetMetrics,
804    id: u32,
805) -> crate::model::DetectedRegion {
806    let header_info = detect_headers(occupancy, rect);
807    let stats = occupancy.stats_in_rect(rect);
808    let (kind, confidence) = classify_region(rect, &stats, &header_info, metrics);
809    crate::model::DetectedRegion {
810        id,
811        bounds: format!(
812            "{}{}:{}{}",
813            crate::utils::column_number_to_name(rect.start_col),
814            rect.start_row,
815            crate::utils::column_number_to_name(rect.end_col),
816            rect.end_row
817        ),
818        header_row: header_info.header_row,
819        headers: header_info.headers,
820        row_count: rect.end_row - rect.start_row + 1,
821        classification: kind.clone(),
822        region_kind: Some(kind),
823        confidence,
824    }
825}
826
827#[derive(Debug, Default)]
828struct HeaderInfo {
829    header_row: Option<u32>,
830    headers: Vec<String>,
831    is_key_value: bool,
832}
833
834fn is_key_value_layout(occupancy: &Occupancy, rect: &Rect) -> bool {
835    let width = rect.end_col - rect.start_col + 1;
836
837    if width == 2 {
838        return check_key_value_columns(occupancy, rect, rect.start_col, rect.start_col + 1);
839    }
840
841    if width <= KV_MAX_WIDTH_FOR_DENSITY_CHECK {
842        let rows_to_sample = (rect.end_row - rect.start_row + 1).min(KV_SAMPLE_ROWS);
843        let density_threshold = (rows_to_sample as f32 * KV_DENSITY_THRESHOLD) as u32;
844
845        let mut col_densities: Vec<(u32, u32)> = Vec::new();
846        for col in rect.start_col..=rect.end_col {
847            let count = (rect.start_row..rect.start_row + rows_to_sample)
848                .filter(|&row| occupancy.value_at(row, col).is_some())
849                .count() as u32;
850            if count >= density_threshold {
851                col_densities.push((col, count));
852            }
853        }
854
855        if col_densities.len() == 2 {
856            let label_col = col_densities[0].0;
857            let value_col = col_densities[1].0;
858            return check_key_value_columns(occupancy, rect, label_col, value_col);
859        } else if col_densities.len() == 4 && width >= 4 {
860            let pair1 =
861                check_key_value_columns(occupancy, rect, col_densities[0].0, col_densities[1].0);
862            let pair2 =
863                check_key_value_columns(occupancy, rect, col_densities[2].0, col_densities[3].0);
864            return pair1 && pair2;
865        }
866    }
867
868    false
869}
870
871fn check_key_value_columns(
872    occupancy: &Occupancy,
873    rect: &Rect,
874    label_col: u32,
875    value_col: u32,
876) -> bool {
877    let mut label_value_pairs = 0u32;
878    let rows_to_check = (rect.end_row - rect.start_row + 1).min(KV_CHECK_ROWS);
879
880    for row in rect.start_row..rect.start_row + rows_to_check {
881        let first_col = occupancy.value_at(row, label_col);
882        let second_col = occupancy.value_at(row, value_col);
883
884        if let (Some(crate::model::CellValue::Text(label)), Some(val)) = (first_col, second_col) {
885            let label_looks_like_key = label.len() <= KV_MAX_LABEL_LEN
886                && !label.chars().any(|c| c.is_ascii_digit())
887                && label.contains(|c: char| c.is_alphabetic());
888
889            let value_is_data = matches!(
890                val,
891                crate::model::CellValue::Number(_) | crate::model::CellValue::Date(_)
892            ) || matches!(val, crate::model::CellValue::Text(s) if s.len() > KV_MIN_TEXT_VALUE_LEN);
893
894            if label_looks_like_key && value_is_data {
895                label_value_pairs += 1;
896            }
897        }
898    }
899
900    label_value_pairs >= KV_MIN_PAIRS
901        && label_value_pairs as f32 / rows_to_check as f32 >= KV_MIN_PAIR_RATIO
902}
903
904fn header_data_penalty(s: &str) -> f32 {
905    if s.is_empty() {
906        return 0.0;
907    }
908    if s.len() > HEADER_LONG_STRING_PENALTY_THRESHOLD {
909        return HEADER_LONG_STRING_PENALTY;
910    }
911    let first_char = s.chars().next().unwrap();
912    let is_capitalized = first_char.is_uppercase();
913    let has_lowercase = s.chars().skip(1).any(|c| c.is_lowercase());
914    let is_all_caps = s.chars().all(|c| !c.is_alphabetic() || c.is_uppercase());
915    let has_digits = s.chars().any(|c| c.is_ascii_digit());
916    let is_proper_noun =
917        is_capitalized && has_lowercase && !is_all_caps && s.len() > HEADER_PROPER_NOUN_MIN_LEN;
918
919    let mut penalty = 0.0;
920    if is_proper_noun {
921        penalty += HEADER_PROPER_NOUN_PENALTY;
922    }
923    if has_digits && s.len() > HEADER_DIGIT_STRING_MIN_LEN {
924        penalty += HEADER_DIGIT_STRING_PENALTY;
925    }
926    penalty
927}
928
929fn detect_headers(occupancy: &Occupancy, rect: &Rect) -> HeaderInfo {
930    if is_key_value_layout(occupancy, rect) {
931        let mut headers = Vec::new();
932        for col in rect.start_col..=rect.end_col {
933            headers.push(crate::utils::column_number_to_name(col));
934        }
935        return HeaderInfo {
936            header_row: None,
937            headers,
938            is_key_value: true,
939        };
940    }
941
942    let mut candidates = Vec::new();
943    let max_row = rect
944        .start_row
945        .saturating_add(HEADER_MAX_SCAN_ROWS)
946        .min(rect.end_row);
947    for row in rect.start_row..=max_row {
948        let mut text = 0;
949        let mut numbers = 0;
950        let mut non_empty = 0;
951        let mut unique = HashSet::new();
952        let mut data_like_penalty: f32 = 0.0;
953        let mut year_like_bonus: f32 = 0.0;
954
955        for col in rect.start_col..=rect.end_col {
956            if let Some(val) = occupancy.value_at(row, col) {
957                non_empty += 1;
958                match val {
959                    crate::model::CellValue::Text(s) => {
960                        text += 1;
961                        unique.insert(s.clone());
962                        data_like_penalty += header_data_penalty(s);
963                    }
964                    crate::model::CellValue::Number(n) => {
965                        if *n >= HEADER_YEAR_MIN && *n <= HEADER_YEAR_MAX && n.fract() == 0.0 {
966                            year_like_bonus += HEADER_YEAR_LIKE_BONUS;
967                            text += 1;
968                        } else {
969                            numbers += 1;
970                        }
971                    }
972                    crate::model::CellValue::Bool(_) => text += 1,
973                    crate::model::CellValue::Date(_) => {
974                        data_like_penalty += HEADER_DATE_PENALTY;
975                    }
976                    crate::model::CellValue::Error(_) => {}
977                }
978            }
979        }
980        if non_empty == 0 {
981            continue;
982        }
983        let score = text as f32 + unique.len() as f32 * HEADER_UNIQUE_BONUS
984            - numbers as f32 * HEADER_NUMBER_PENALTY
985            - data_like_penalty
986            + year_like_bonus;
987        candidates.push((row, score, text, non_empty));
988    }
989
990    let is_single_col = rect.start_col == rect.end_col;
991
992    let header_candidates: Vec<&(u32, f32, u32, u32)> = candidates
993        .iter()
994        .filter(|(_, score, text, non_empty)| {
995            *text >= 1
996                && *text * 2 >= *non_empty
997                && (!is_single_col || *score > HEADER_SINGLE_COL_MIN_SCORE)
998        })
999        .collect();
1000
1001    let best = header_candidates.iter().copied().max_by(|a, b| {
1002        a.1.partial_cmp(&b.1)
1003            .unwrap_or(Ordering::Equal)
1004            .then_with(|| b.0.cmp(&a.0))
1005    });
1006    let earliest = header_candidates
1007        .iter()
1008        .copied()
1009        .min_by(|a, b| a.0.cmp(&b.0));
1010
1011    let maybe_header = match (best, earliest) {
1012        (Some(best_row), Some(early_row)) => {
1013            if (best_row.1 - early_row.1).abs() <= HEADER_SCORE_TIE_THRESHOLD {
1014                Some(early_row.0)
1015            } else {
1016                Some(best_row.0)
1017            }
1018        }
1019        (Some(best_row), None) => Some(best_row.0),
1020        _ => None,
1021    };
1022
1023    let mut header_rows = Vec::new();
1024    if let Some(hr) = maybe_header {
1025        header_rows.push(hr);
1026        if hr < rect.end_row
1027            && let Some((_, score_next, text_next, non_empty_next)) =
1028                candidates.iter().find(|(r, _, _, _)| *r == hr + 1)
1029            && *text_next >= 1
1030            && *text_next * 2 >= *non_empty_next
1031            && *score_next
1032                >= HEADER_SECOND_ROW_MIN_SCORE_RATIO
1033                    * candidates
1034                        .iter()
1035                        .find(|(r, _, _, _)| *r == hr)
1036                        .map(|c| c.1)
1037                        .unwrap_or(0.0)
1038        {
1039            header_rows.push(hr + 1);
1040        }
1041    }
1042
1043    let mut headers = Vec::new();
1044    for col in rect.start_col..=rect.end_col {
1045        let mut parts = Vec::new();
1046        for hr in &header_rows {
1047            if let Some(val) = occupancy.value_at(*hr, col) {
1048                match val {
1049                    crate::model::CellValue::Text(s) if !s.trim().is_empty() => {
1050                        parts.push(s.trim().to_string())
1051                    }
1052                    crate::model::CellValue::Number(n) => parts.push(n.to_string()),
1053                    crate::model::CellValue::Bool(b) => parts.push(b.to_string()),
1054                    crate::model::CellValue::Date(d) => parts.push(d.clone()),
1055                    crate::model::CellValue::Error(e) => parts.push(e.clone()),
1056                    _ => {}
1057                }
1058            }
1059        }
1060        if parts.is_empty() {
1061            headers.push(crate::utils::column_number_to_name(col));
1062        } else {
1063            headers.push(parts.join(" / "));
1064        }
1065    }
1066
1067    HeaderInfo {
1068        header_row: header_rows.first().copied(),
1069        headers,
1070        is_key_value: false,
1071    }
1072}
1073
1074fn classify_region(
1075    rect: &Rect,
1076    stats: &RegionStats,
1077    header_info: &HeaderInfo,
1078    metrics: &SheetMetrics,
1079) -> (crate::model::RegionKind, f32) {
1080    let width = rect.end_col - rect.start_col + 1;
1081    let height = rect.end_row - rect.start_row + 1;
1082    let area = width.max(1) * height.max(1);
1083    let density = if area == 0 {
1084        0.0
1085    } else {
1086        stats.non_empty as f32 / area as f32
1087    };
1088    let formula_ratio = if stats.non_empty == 0 {
1089        0.0
1090    } else {
1091        stats.formulas as f32 / stats.non_empty as f32
1092    };
1093    let text_ratio = if stats.non_empty == 0 {
1094        0.0
1095    } else {
1096        stats.text as f32 / stats.non_empty as f32
1097    };
1098
1099    let mut kind = crate::model::RegionKind::Data;
1100    if formula_ratio > 0.25 && is_outputs_band(rect, metrics, height, width) {
1101        kind = crate::model::RegionKind::Outputs;
1102    } else if formula_ratio > 0.55 {
1103        kind = crate::model::RegionKind::Calculator;
1104    } else if height <= 3
1105        && width <= 4
1106        && text_ratio > 0.5
1107        && rect.end_row >= metrics.row_count.saturating_sub(3)
1108    {
1109        kind = crate::model::RegionKind::Metadata;
1110    } else if header_info.is_key_value
1111        || (formula_ratio < 0.25
1112            && stats.numbers > 0
1113            && stats.text > 0
1114            && text_ratio >= 0.3
1115            && (width <= 2 || (width <= 3 && header_info.header_row.is_none())))
1116    {
1117        kind = crate::model::RegionKind::Parameters;
1118    } else if height <= 4 && width <= 6 && formula_ratio < 0.2 && text_ratio > 0.4 && density < 0.5
1119    {
1120        kind = crate::model::RegionKind::Metadata;
1121    }
1122
1123    let mut confidence: f32 = 0.4;
1124    if header_info.header_row.is_some() {
1125        confidence += 0.2;
1126    }
1127    confidence += (density * 0.2).min(0.2);
1128    confidence += (formula_ratio * 0.2).min(0.2);
1129    if matches!(
1130        kind,
1131        crate::model::RegionKind::Parameters | crate::model::RegionKind::Metadata
1132    ) && width <= 4
1133    {
1134        confidence += 0.1;
1135    }
1136    if confidence > 1.0 {
1137        confidence = 1.0;
1138    }
1139
1140    (kind, confidence)
1141}
1142
1143fn is_outputs_band(rect: &Rect, metrics: &SheetMetrics, height: u32, width: u32) -> bool {
1144    let near_bottom = rect.end_row >= metrics.row_count.saturating_sub(6);
1145    let near_right = rect.end_col >= metrics.column_count.saturating_sub(3);
1146    let is_shallow = height <= 6;
1147    let is_narrow_at_edge = width <= 6 && near_right;
1148    let not_at_top_left = rect.start_row > 1 || rect.start_col > 1;
1149    let sheet_has_depth = metrics.row_count > 10 || metrics.column_count > 6;
1150    let is_band = (is_shallow && near_bottom) || is_narrow_at_edge;
1151    is_band && not_at_top_left && sheet_has_depth
1152}
1153
1154fn gather_named_ranges(
1155    sheet: &Worksheet,
1156    defined_names: &[DefinedName],
1157) -> Vec<NamedRangeDescriptor> {
1158    let name_str = sheet.get_name();
1159    defined_names
1160        .iter()
1161        .filter(|name| name.get_address().contains(name_str))
1162        .map(|name| NamedRangeDescriptor {
1163            name: name.get_name().to_string(),
1164            scope: if name.has_local_sheet_id() {
1165                Some(name_str.to_string())
1166            } else {
1167                None
1168            },
1169            refers_to: name.get_address(),
1170            kind: NamedItemKind::NamedRange,
1171            sheet_name: Some(name_str.to_string()),
1172            comment: None,
1173        })
1174        .collect()
1175}
1176
1177pub fn build_workbook_list(
1178    config: &Arc<ServerConfig>,
1179    filter: &WorkbookFilter,
1180) -> Result<WorkbookListResponse> {
1181    let mut descriptors = Vec::new();
1182
1183    if let Some(single) = config.single_workbook() {
1184        let metadata = fs::metadata(single)
1185            .with_context(|| format!("unable to read metadata for {:?}", single))?;
1186        let id = WorkbookId(hash_path_metadata(single, &metadata));
1187        let slug = single
1188            .file_stem()
1189            .map(|s| s.to_string_lossy().to_string())
1190            .unwrap_or_else(|| "workbook".to_string());
1191        let folder = derive_folder(config, single);
1192        let short_id = make_short_workbook_id(&slug, id.as_str());
1193        let caps = BackendCaps::xlsx();
1194
1195        if filter.matches(&slug, folder.as_deref(), single) {
1196            let relative = single
1197                .strip_prefix(&config.workspace_root)
1198                .unwrap_or(single);
1199            let descriptor = crate::model::WorkbookDescriptor {
1200                workbook_id: id,
1201                short_id,
1202                slug,
1203                folder,
1204                path: path_to_forward_slashes(relative),
1205                bytes: metadata.len(),
1206                last_modified: metadata
1207                    .modified()
1208                    .ok()
1209                    .and_then(system_time_to_rfc3339)
1210                    .map(|dt| dt.to_rfc3339_opts(chrono::SecondsFormat::Secs, true)),
1211                caps,
1212            };
1213            descriptors.push(descriptor);
1214        }
1215
1216        return Ok(WorkbookListResponse {
1217            workbooks: descriptors,
1218        });
1219    }
1220
1221    use walkdir::WalkDir;
1222
1223    for entry in WalkDir::new(&config.workspace_root) {
1224        let entry = entry?;
1225        if !entry.file_type().is_file() {
1226            continue;
1227        }
1228        let path = entry.path();
1229        if !has_supported_extension(&config.supported_extensions, path) {
1230            continue;
1231        }
1232        let metadata = entry.metadata()?;
1233        let id = WorkbookId(hash_path_metadata(path, &metadata));
1234        let slug = path
1235            .file_stem()
1236            .map(|s| s.to_string_lossy().to_string())
1237            .unwrap_or_else(|| "workbook".to_string());
1238        let folder = derive_folder(config, path);
1239        let short_id = make_short_workbook_id(&slug, id.as_str());
1240        let caps = BackendCaps::xlsx();
1241
1242        if !filter.matches(&slug, folder.as_deref(), path) {
1243            continue;
1244        }
1245
1246        let relative = path.strip_prefix(&config.workspace_root).unwrap_or(path);
1247        let descriptor = crate::model::WorkbookDescriptor {
1248            workbook_id: id,
1249            short_id,
1250            slug,
1251            folder,
1252            path: path_to_forward_slashes(relative),
1253            bytes: metadata.len(),
1254            last_modified: metadata
1255                .modified()
1256                .ok()
1257                .and_then(system_time_to_rfc3339)
1258                .map(|dt| dt.to_rfc3339_opts(chrono::SecondsFormat::Secs, true)),
1259            caps,
1260        };
1261        descriptors.push(descriptor);
1262    }
1263
1264    descriptors.sort_by(|a, b| a.slug.cmp(&b.slug));
1265
1266    Ok(WorkbookListResponse {
1267        workbooks: descriptors,
1268    })
1269}
1270
1271fn derive_folder(config: &Arc<ServerConfig>, path: &Path) -> Option<String> {
1272    path.strip_prefix(&config.workspace_root)
1273        .ok()
1274        .and_then(|relative| relative.parent())
1275        .and_then(|parent| parent.file_name())
1276        .map(|os| os.to_string_lossy().to_string())
1277}
1278
1279fn has_supported_extension(allowed: &[String], path: &Path) -> bool {
1280    path.extension()
1281        .and_then(|ext| ext.to_str())
1282        .map(|ext| {
1283            let lower = ext.to_ascii_lowercase();
1284            allowed.iter().any(|candidate| candidate == &lower)
1285        })
1286        .unwrap_or(false)
1287}
spreadsheet_mcp/workbook.rs

spreadsheet_mcp/
workbook.rs