spreadsheet_mcp/
workbook.rs

1use crate::analysis::{
2    classification,
3    formula::{FormulaAtlas, FormulaGraph},
4    style,
5};
6use crate::caps::BackendCaps;
7use crate::config::ServerConfig;
8use crate::model::{
9    NamedItemKind, NamedRangeDescriptor, SheetClassification, SheetOverviewResponse, SheetSummary,
10    WorkbookDescription, WorkbookId, WorkbookListResponse,
11};
12use crate::tools::filters::WorkbookFilter;
13use crate::utils::{
14    hash_path_metadata, make_short_workbook_id, path_to_forward_slashes, system_time_to_rfc3339,
15};
16use anyhow::{Context, Result, anyhow};
17use chrono::{DateTime, Utc};
18use parking_lot::RwLock;
19use std::cmp::Ordering;
20use std::collections::{HashMap, HashSet};
21use std::fs;
22use std::path::{Path, PathBuf};
23use std::sync::Arc;
24use umya_spreadsheet::reader::xlsx;
25use umya_spreadsheet::{DefinedName, Spreadsheet, Worksheet};
26
27const KV_MAX_WIDTH_FOR_DENSITY_CHECK: u32 = 6;
28const KV_SAMPLE_ROWS: u32 = 20;
29const KV_DENSITY_THRESHOLD: f32 = 0.4;
30const KV_CHECK_ROWS: u32 = 15;
31const KV_MAX_LABEL_LEN: usize = 25;
32const KV_MIN_TEXT_VALUE_LEN: usize = 2;
33const KV_MIN_PAIRS: u32 = 3;
34const KV_MIN_PAIR_RATIO: f32 = 0.3;
35
36const HEADER_MAX_SCAN_ROWS: u32 = 2;
37const HEADER_LONG_STRING_PENALTY_THRESHOLD: usize = 40;
38const HEADER_LONG_STRING_PENALTY: f32 = 1.5;
39const HEADER_PROPER_NOUN_MIN_LEN: usize = 5;
40const HEADER_PROPER_NOUN_PENALTY: f32 = 1.0;
41const HEADER_DIGIT_STRING_MIN_LEN: usize = 3;
42const HEADER_DIGIT_STRING_PENALTY: f32 = 0.5;
43const HEADER_DATE_PENALTY: f32 = 1.0;
44const HEADER_YEAR_LIKE_BONUS: f32 = 0.5;
45const HEADER_YEAR_MIN: f64 = 1900.0;
46const HEADER_YEAR_MAX: f64 = 2100.0;
47const HEADER_UNIQUE_BONUS: f32 = 0.2;
48const HEADER_NUMBER_PENALTY: f32 = 0.3;
49const HEADER_SINGLE_COL_MIN_SCORE: f32 = 1.5;
50const HEADER_SCORE_TIE_THRESHOLD: f32 = 0.3;
51const HEADER_SECOND_ROW_MIN_SCORE_RATIO: f32 = 0.6;
52
53pub struct WorkbookContext {
54    pub id: WorkbookId,
55    pub short_id: String,
56    pub slug: String,
57    pub path: PathBuf,
58    pub caps: BackendCaps,
59    pub bytes: u64,
60    pub last_modified: Option<DateTime<Utc>>,
61    spreadsheet: Arc<RwLock<Spreadsheet>>,
62    sheet_cache: RwLock<HashMap<String, Arc<SheetCacheEntry>>>,
63    formula_atlas: Arc<FormulaAtlas>,
64}
65
66pub struct SheetCacheEntry {
67    pub metrics: SheetMetrics,
68    pub style_tags: Vec<String>,
69    pub named_ranges: Vec<NamedRangeDescriptor>,
70    pub detected_regions: Vec<crate::model::DetectedRegion>,
71}
72
73#[derive(Debug, Clone)]
74pub struct SheetMetrics {
75    pub row_count: u32,
76    pub column_count: u32,
77    pub non_empty_cells: u32,
78    pub formula_cells: u32,
79    pub cached_values: u32,
80    pub comments: u32,
81    pub style_map: HashMap<String, StyleUsage>,
82    pub classification: SheetClassification,
83}
84
85#[derive(Debug, Clone)]
86pub struct StyleUsage {
87    pub occurrences: u32,
88    pub tags: Vec<String>,
89    pub example_cells: Vec<String>,
90}
91
92impl WorkbookContext {
93    pub fn load(_config: &Arc<ServerConfig>, path: &Path) -> Result<Self> {
94        let metadata = fs::metadata(path)
95            .with_context(|| format!("unable to read metadata for {:?}", path))?;
96        let slug = path
97            .file_stem()
98            .map(|s| s.to_string_lossy().to_string())
99            .unwrap_or_else(|| "workbook".to_string());
100        let bytes = metadata.len();
101        let last_modified = metadata.modified().ok().and_then(system_time_to_rfc3339);
102        let id = WorkbookId(hash_path_metadata(path, &metadata));
103        let spreadsheet =
104            xlsx::read(path).with_context(|| format!("failed to parse workbook {:?}", path))?;
105        let short_id = make_short_workbook_id(&slug, id.as_str());
106
107        Ok(Self {
108            id,
109            short_id,
110            slug,
111            path: path.to_path_buf(),
112            caps: BackendCaps::xlsx(),
113            bytes,
114            last_modified,
115            spreadsheet: Arc::new(RwLock::new(spreadsheet)),
116            sheet_cache: RwLock::new(HashMap::new()),
117            formula_atlas: Arc::new(FormulaAtlas::default()),
118        })
119    }
120
121    pub fn sheet_names(&self) -> Vec<String> {
122        let book = self.spreadsheet.read();
123        book.get_sheet_collection()
124            .iter()
125            .map(|sheet| sheet.get_name().to_string())
126            .collect()
127    }
128
129    pub fn describe(&self) -> WorkbookDescription {
130        let book = self.spreadsheet.read();
131        let defined_names_count = book.get_defined_names().len();
132        let table_count: usize = book
133            .get_sheet_collection()
134            .iter()
135            .map(|sheet| sheet.get_tables().len())
136            .sum();
137        let macros_present = false;
138
139        WorkbookDescription {
140            workbook_id: self.id.clone(),
141            short_id: self.short_id.clone(),
142            slug: self.slug.clone(),
143            path: path_to_forward_slashes(&self.path),
144            bytes: self.bytes,
145            sheet_count: book.get_sheet_collection().len(),
146            defined_names: defined_names_count,
147            tables: table_count,
148            macros_present,
149            last_modified: self
150                .last_modified
151                .map(|dt| dt.to_rfc3339_opts(chrono::SecondsFormat::Secs, true)),
152            caps: self.caps.clone(),
153        }
154    }
155
156    pub fn get_sheet_metrics(&self, sheet_name: &str) -> Result<Arc<SheetCacheEntry>> {
157        if let Some(entry) = self.sheet_cache.read().get(sheet_name) {
158            return Ok(entry.clone());
159        }
160
161        let mut writer = self.sheet_cache.write();
162        if let Some(entry) = writer.get(sheet_name) {
163            return Ok(entry.clone());
164        }
165
166        let book = self.spreadsheet.read();
167        let sheet = book
168            .get_sheet_by_name(sheet_name)
169            .ok_or_else(|| anyhow!("sheet {} not found", sheet_name))?;
170        let (metrics, style_tags) = compute_sheet_metrics(sheet);
171        let named_ranges = gather_named_ranges(sheet, book.get_defined_names());
172        let detected_regions = detect_regions(sheet, &metrics);
173
174        let entry = Arc::new(SheetCacheEntry {
175            metrics,
176            style_tags,
177            named_ranges,
178            detected_regions,
179        });
180
181        writer.insert(sheet_name.to_string(), entry.clone());
182        Ok(entry)
183    }
184
185    pub fn list_summaries(&self) -> Result<Vec<SheetSummary>> {
186        let book = self.spreadsheet.read();
187        let mut summaries = Vec::new();
188        for sheet in book.get_sheet_collection() {
189            let name = sheet.get_name().to_string();
190            let entry = self.get_sheet_metrics(&name)?;
191            summaries.push(SheetSummary {
192                name: name.clone(),
193                visible: sheet.get_sheet_state() != "hidden",
194                row_count: entry.metrics.row_count,
195                column_count: entry.metrics.column_count,
196                non_empty_cells: entry.metrics.non_empty_cells,
197                formula_cells: entry.metrics.formula_cells,
198                cached_values: entry.metrics.cached_values,
199                classification: entry.metrics.classification.clone(),
200                style_tags: entry.style_tags.clone(),
201            });
202        }
203        Ok(summaries)
204    }
205
206    pub fn with_sheet<T, F>(&self, sheet_name: &str, func: F) -> Result<T>
207    where
208        F: FnOnce(&Worksheet) -> T,
209    {
210        let book = self.spreadsheet.read();
211        let sheet = book
212            .get_sheet_by_name(sheet_name)
213            .ok_or_else(|| anyhow!("sheet {} not found", sheet_name))?;
214        Ok(func(sheet))
215    }
216
217    pub fn with_spreadsheet<T, F>(&self, func: F) -> Result<T>
218    where
219        F: FnOnce(&Spreadsheet) -> T,
220    {
221        let book = self.spreadsheet.read();
222        Ok(func(&book))
223    }
224
225    pub fn formula_graph(&self, sheet_name: &str) -> Result<FormulaGraph> {
226        self.with_sheet(sheet_name, |sheet| {
227            FormulaGraph::build(sheet, &self.formula_atlas)
228        })?
229    }
230
231    pub fn named_items(&self) -> Result<Vec<NamedRangeDescriptor>> {
232        let book = self.spreadsheet.read();
233        let sheet_names: Vec<String> = book
234            .get_sheet_collection()
235            .iter()
236            .map(|sheet| sheet.get_name().to_string())
237            .collect();
238        let mut items = Vec::new();
239        for defined in book.get_defined_names() {
240            let refers_to = defined.get_address();
241            let scope = if defined.has_local_sheet_id() {
242                let idx = *defined.get_local_sheet_id() as usize;
243                sheet_names.get(idx).cloned()
244            } else {
245                None
246            };
247            let kind = if refers_to.starts_with('=') {
248                NamedItemKind::Formula
249            } else {
250                NamedItemKind::NamedRange
251            };
252
253            items.push(NamedRangeDescriptor {
254                name: defined.get_name().to_string(),
255                scope: scope.clone(),
256                refers_to: refers_to.clone(),
257                kind,
258                sheet_name: scope,
259                comment: None,
260            });
261        }
262
263        for sheet in book.get_sheet_collection() {
264            for table in sheet.get_tables() {
265                let start = table.get_area().0.get_coordinate();
266                let end = table.get_area().1.get_coordinate();
267                items.push(NamedRangeDescriptor {
268                    name: table.get_name().to_string(),
269                    scope: Some(sheet.get_name().to_string()),
270                    refers_to: format!("{}:{}", start, end),
271                    kind: NamedItemKind::Table,
272                    sheet_name: Some(sheet.get_name().to_string()),
273                    comment: None,
274                });
275            }
276        }
277
278        Ok(items)
279    }
280
281    pub fn sheet_overview(&self, sheet_name: &str) -> Result<SheetOverviewResponse> {
282        let entry = self.get_sheet_metrics(sheet_name)?;
283        let narrative = classification::narrative(&entry.metrics);
284        let regions = classification::regions(&entry.metrics);
285        let key_ranges = classification::key_ranges(&entry.metrics);
286        let detected_regions = entry.detected_regions.clone();
287
288        Ok(SheetOverviewResponse {
289            workbook_id: self.id.clone(),
290            workbook_short_id: self.short_id.clone(),
291            sheet_name: sheet_name.to_string(),
292            narrative,
293            regions,
294            detected_regions,
295            key_ranges,
296            formula_ratio: if entry.metrics.non_empty_cells == 0 {
297                0.0
298            } else {
299                entry.metrics.formula_cells as f32 / entry.metrics.non_empty_cells as f32
300            },
301            notable_features: entry.style_tags.clone(),
302        })
303    }
304
305    pub fn detected_region(
306        &self,
307        sheet_name: &str,
308        id: u32,
309    ) -> Result<crate::model::DetectedRegion> {
310        let entry = self.get_sheet_metrics(sheet_name)?;
311        entry
312            .detected_regions
313            .iter()
314            .find(|r| r.id == id)
315            .cloned()
316            .ok_or_else(|| anyhow!("region {} not found on sheet {}", id, sheet_name))
317    }
318}
319
320fn contains_date_time_token(format_code: &str) -> bool {
321    let mut in_quote = false;
322    let mut in_bracket = false;
323    let chars: Vec<char> = format_code.chars().collect();
324
325    for (i, &ch) in chars.iter().enumerate() {
326        match ch {
327            '"' => in_quote = !in_quote,
328            '[' if !in_quote => in_bracket = true,
329            ']' if !in_quote => in_bracket = false,
330            'y' | 'd' | 'h' | 's' | 'm' if !in_quote && !in_bracket => {
331                if ch == 'm' {
332                    let prev = if i > 0 { chars.get(i - 1) } else { None };
333                    let next = chars.get(i + 1);
334                    let after_time_sep = prev == Some(&':') || prev == Some(&'h');
335                    let before_time_sep = next == Some(&':') || next == Some(&'s');
336                    if after_time_sep || before_time_sep {
337                        return true;
338                    }
339                    if prev == Some(&'m') || next == Some(&'m') {
340                        return true;
341                    }
342                    if matches!(prev, Some(&'/') | Some(&'-') | Some(&'.'))
343                        || matches!(next, Some(&'/') | Some(&'-') | Some(&'.'))
344                    {
345                        return true;
346                    }
347                } else {
348                    return true;
349                }
350            }
351            _ => {}
352        }
353    }
354    false
355}
356
357const DATE_FORMAT_IDS: &[u32] = &[
358    14, 15, 16, 17, 18, 19, 20, 21, 22, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 45, 46, 47, 50, 51,
359    52, 53, 54, 55, 56, 57, 58,
360];
361
362const EXCEL_LEAP_YEAR_BUG_SERIAL: i64 = 60;
363
364fn is_date_formatted(cell: &umya_spreadsheet::Cell) -> bool {
365    let Some(nf) = cell.get_style().get_number_format() else {
366        return false;
367    };
368
369    let format_id = nf.get_number_format_id();
370    if DATE_FORMAT_IDS.contains(format_id) {
371        return true;
372    }
373
374    let code = nf.get_format_code();
375    if code == "General" || code == "@" || code == "0" || code == "0.00" {
376        return false;
377    }
378
379    contains_date_time_token(code)
380}
381
382pub fn excel_serial_to_iso(serial: f64, use_1904_system: bool) -> String {
383    excel_serial_to_iso_with_leap_bug(serial, use_1904_system, true)
384}
385
386pub fn excel_serial_to_iso_with_leap_bug(
387    serial: f64,
388    use_1904_system: bool,
389    compensate_leap_bug: bool,
390) -> String {
391    use chrono::NaiveDate;
392
393    let days = serial.trunc() as i64;
394
395    if use_1904_system {
396        let epoch_1904 = NaiveDate::from_ymd_opt(1904, 1, 1).unwrap();
397        return epoch_1904
398            .checked_add_signed(chrono::Duration::days(days))
399            .map(|d| d.format("%Y-%m-%d").to_string())
400            .unwrap_or_else(|| serial.to_string());
401    }
402
403    let epoch = if compensate_leap_bug && days >= EXCEL_LEAP_YEAR_BUG_SERIAL {
404        NaiveDate::from_ymd_opt(1899, 12, 30).unwrap()
405    } else {
406        NaiveDate::from_ymd_opt(1899, 12, 31).unwrap()
407    };
408
409    epoch
410        .checked_add_signed(chrono::Duration::days(days))
411        .map(|d| d.format("%Y-%m-%d").to_string())
412        .unwrap_or_else(|| serial.to_string())
413}
414
415pub fn cell_to_value(cell: &umya_spreadsheet::Cell) -> Option<crate::model::CellValue> {
416    cell_to_value_with_date_system(cell, false)
417}
418
419pub fn cell_to_value_with_date_system(
420    cell: &umya_spreadsheet::Cell,
421    use_1904_system: bool,
422) -> Option<crate::model::CellValue> {
423    let raw = cell.get_value();
424    if raw.is_empty() {
425        return None;
426    }
427    if let Ok(number) = raw.parse::<f64>() {
428        if is_date_formatted(cell) {
429            return Some(crate::model::CellValue::Date(excel_serial_to_iso(
430                number,
431                use_1904_system,
432            )));
433        }
434        return Some(crate::model::CellValue::Number(number));
435    }
436
437    let lower = raw.to_ascii_lowercase();
438    if lower == "true" {
439        return Some(crate::model::CellValue::Bool(true));
440    }
441    if lower == "false" {
442        return Some(crate::model::CellValue::Bool(false));
443    }
444
445    Some(crate::model::CellValue::Text(raw.to_string()))
446}
447
448pub fn compute_sheet_metrics(sheet: &Worksheet) -> (SheetMetrics, Vec<String>) {
449    use std::collections::HashMap as StdHashMap;
450    let mut non_empty = 0u32;
451    let mut formulas = 0u32;
452    let mut cached = 0u32;
453    let comments = sheet.get_comments().len() as u32;
454    let mut style_usage: StdHashMap<String, StyleUsage> = StdHashMap::new();
455
456    for cell in sheet.get_cell_collection() {
457        let value = cell.get_value();
458        if !value.is_empty() {
459            non_empty += 1;
460        }
461        if cell.is_formula() {
462            formulas += 1;
463            if !cell.get_value().is_empty() {
464                cached += 1;
465            }
466        }
467
468        if let Some((style_key, usage)) = style::tag_cell(cell) {
469            let entry = style_usage.entry(style_key).or_insert_with(|| StyleUsage {
470                occurrences: 0,
471                tags: usage.tags.clone(),
472                example_cells: Vec::new(),
473            });
474            entry.occurrences += 1;
475            if entry.example_cells.len() < 5 {
476                entry.example_cells.push(usage.example_cell.clone());
477            }
478        }
479    }
480
481    let (max_col, max_row) = sheet.get_highest_column_and_row();
482
483    let classification = classification::classify(
484        non_empty,
485        formulas,
486        max_row,
487        max_col,
488        comments,
489        &style_usage,
490    );
491
492    let style_tags: Vec<String> = style_usage
493        .values()
494        .flat_map(|usage| usage.tags.clone())
495        .collect();
496
497    let metrics = SheetMetrics {
498        row_count: max_row,
499        column_count: max_col,
500        non_empty_cells: non_empty,
501        formula_cells: formulas,
502        cached_values: cached,
503        comments,
504        style_map: style_usage,
505        classification,
506    };
507    (metrics, style_tags)
508}
509
510#[derive(Debug, Clone, Copy)]
511struct Rect {
512    start_row: u32,
513    end_row: u32,
514    start_col: u32,
515    end_col: u32,
516}
517
518#[derive(Debug, Clone)]
519struct CellInfo {
520    value: Option<crate::model::CellValue>,
521    is_formula: bool,
522}
523
524#[derive(Debug)]
525struct Occupancy {
526    cells: HashMap<(u32, u32), CellInfo>,
527}
528
529impl Occupancy {
530    fn row_col_counts(&self, rect: &Rect) -> (Vec<u32>, Vec<u32>) {
531        let mut row_counts = vec![0u32; (rect.end_row - rect.start_row + 1) as usize];
532        let mut col_counts = vec![0u32; (rect.end_col - rect.start_col + 1) as usize];
533        for ((row, col), _) in self.cells.iter() {
534            if *row >= rect.start_row
535                && *row <= rect.end_row
536                && *col >= rect.start_col
537                && *col <= rect.end_col
538            {
539                row_counts[(row - rect.start_row) as usize] += 1;
540                col_counts[(col - rect.start_col) as usize] += 1;
541            }
542        }
543        (row_counts, col_counts)
544    }
545
546    fn stats_in_rect(&self, rect: &Rect) -> RegionStats {
547        let mut stats = RegionStats::default();
548        for ((row, col), info) in self.cells.iter() {
549            if *row < rect.start_row
550                || *row > rect.end_row
551                || *col < rect.start_col
552                || *col > rect.end_col
553            {
554                continue;
555            }
556            stats.non_empty += 1;
557            if info.is_formula {
558                stats.formulas += 1;
559            }
560            if let Some(val) = &info.value {
561                match val {
562                    crate::model::CellValue::Text(_) => stats.text += 1,
563                    crate::model::CellValue::Number(_) => stats.numbers += 1,
564                    crate::model::CellValue::Bool(_) => stats.bools += 1,
565                    crate::model::CellValue::Date(_) => stats.dates += 1,
566                    crate::model::CellValue::Error(_) => stats.errors += 1,
567                }
568            }
569        }
570        stats
571    }
572
573    fn value_at(&self, row: u32, col: u32) -> Option<&crate::model::CellValue> {
574        self.cells.get(&(row, col)).and_then(|c| c.value.as_ref())
575    }
576}
577
578#[derive(Debug, Default, Clone)]
579struct RegionStats {
580    non_empty: u32,
581    formulas: u32,
582    text: u32,
583    numbers: u32,
584    bools: u32,
585    dates: u32,
586    errors: u32,
587}
588
589#[derive(Debug, Clone, Copy, PartialEq, Eq)]
590enum Gutter {
591    Row { start: u32, end: u32 },
592    Col { start: u32, end: u32 },
593}
594
595fn detect_regions(sheet: &Worksheet, metrics: &SheetMetrics) -> Vec<crate::model::DetectedRegion> {
596    if metrics.row_count == 0 || metrics.column_count == 0 {
597        return Vec::new();
598    }
599    let occupancy = build_occupancy(sheet);
600    let root = Rect {
601        start_row: 1,
602        end_row: metrics.row_count.max(1),
603        start_col: 1,
604        end_col: metrics.column_count.max(1),
605    };
606
607    let mut leaves = Vec::new();
608    split_rect(&occupancy, &root, &mut leaves);
609
610    let mut regions = Vec::new();
611    for (idx, rect) in leaves.into_iter().enumerate() {
612        if let Some(trimmed) = trim_rect(&occupancy, rect) {
613            let region = build_region(&occupancy, &trimmed, metrics, idx as u32);
614            regions.push(region);
615        }
616    }
617    regions
618}
619
620fn build_occupancy(sheet: &Worksheet) -> Occupancy {
621    let mut cells = HashMap::new();
622    for cell in sheet.get_cell_collection() {
623        let coord = cell.get_coordinate();
624        let row = *coord.get_row_num();
625        let col = *coord.get_col_num();
626        let value = cell_to_value(cell);
627        let is_formula = cell.is_formula();
628        cells.insert((row, col), CellInfo { value, is_formula });
629    }
630    Occupancy { cells }
631}
632
633fn split_rect(occupancy: &Occupancy, rect: &Rect, leaves: &mut Vec<Rect>) {
634    if rect.start_row >= rect.end_row && rect.start_col >= rect.end_col {
635        leaves.push(*rect);
636        return;
637    }
638    if let Some(gutter) = find_best_gutter(occupancy, rect) {
639        match gutter {
640            Gutter::Row { start, end } => {
641                if start > rect.start_row {
642                    let upper = Rect {
643                        start_row: rect.start_row,
644                        end_row: start - 1,
645                        start_col: rect.start_col,
646                        end_col: rect.end_col,
647                    };
648                    split_rect(occupancy, &upper, leaves);
649                }
650                if end < rect.end_row {
651                    let lower = Rect {
652                        start_row: end + 1,
653                        end_row: rect.end_row,
654                        start_col: rect.start_col,
655                        end_col: rect.end_col,
656                    };
657                    split_rect(occupancy, &lower, leaves);
658                }
659            }
660            Gutter::Col { start, end } => {
661                if start > rect.start_col {
662                    let left = Rect {
663                        start_row: rect.start_row,
664                        end_row: rect.end_row,
665                        start_col: rect.start_col,
666                        end_col: start - 1,
667                    };
668                    split_rect(occupancy, &left, leaves);
669                }
670                if end < rect.end_col {
671                    let right = Rect {
672                        start_row: rect.start_row,
673                        end_row: rect.end_row,
674                        start_col: end + 1,
675                        end_col: rect.end_col,
676                    };
677                    split_rect(occupancy, &right, leaves);
678                }
679            }
680        }
681        return;
682    }
683    leaves.push(*rect);
684}
685
686fn find_best_gutter(occupancy: &Occupancy, rect: &Rect) -> Option<Gutter> {
687    let (row_counts, col_counts) = occupancy.row_col_counts(rect);
688    let width = rect.end_col - rect.start_col + 1;
689    let height = rect.end_row - rect.start_row + 1;
690
691    let row_blank_runs = find_blank_runs(&row_counts, width);
692    let col_blank_runs = find_blank_runs(&col_counts, height);
693
694    let mut best: Option<(Gutter, u32)> = None;
695
696    if let Some((start, end, len)) = row_blank_runs {
697        let gutter = Gutter::Row {
698            start: rect.start_row + start,
699            end: rect.start_row + end,
700        };
701        best = Some((gutter, len));
702    }
703    if let Some((start, end, len)) = col_blank_runs {
704        let gutter = Gutter::Col {
705            start: rect.start_col + start,
706            end: rect.start_col + end,
707        };
708        if best.map(|(_, l)| len > l).unwrap_or(true) {
709            best = Some((gutter, len));
710        }
711    }
712
713    best.map(|(g, _)| g)
714}
715
716fn find_blank_runs(counts: &[u32], span: u32) -> Option<(u32, u32, u32)> {
717    if counts.is_empty() {
718        return None;
719    }
720    let mut best_start = 0;
721    let mut best_end = 0;
722    let mut best_len = 0;
723    let mut current_start = None;
724    for (idx, count) in counts.iter().enumerate() {
725        let is_blank = *count == 0 || (*count as f32 / span as f32) < 0.05;
726        if is_blank {
727            if current_start.is_none() {
728                current_start = Some(idx as u32);
729            }
730        } else if let Some(start) = current_start.take() {
731            let end = idx as u32 - 1;
732            let len = end - start + 1;
733            if len > best_len && start > 0 && end + 1 < counts.len() as u32 {
734                best_len = len;
735                best_start = start;
736                best_end = end;
737            }
738        }
739    }
740    if let Some(start) = current_start {
741        let end = counts.len() as u32 - 1;
742        let len = end - start + 1;
743        if len > best_len && start > 0 && end + 1 < counts.len() as u32 {
744            best_len = len;
745            best_start = start;
746            best_end = end;
747        }
748    }
749    if best_len >= 2 {
750        Some((best_start, best_end, best_len))
751    } else {
752        None
753    }
754}
755
756fn trim_rect(occupancy: &Occupancy, rect: Rect) -> Option<Rect> {
757    let mut r = rect;
758    loop {
759        let (row_counts, col_counts) = occupancy.row_col_counts(&r);
760        let width = r.end_col - r.start_col + 1;
761        let height = r.end_row - r.start_row + 1;
762        let top_blank = row_counts
763            .first()
764            .map(|c| *c == 0 || (*c as f32 / width as f32) < 0.1)
765            .unwrap_or(false);
766        let bottom_blank = row_counts
767            .last()
768            .map(|c| *c == 0 || (*c as f32 / width as f32) < 0.1)
769            .unwrap_or(false);
770        let left_blank = col_counts
771            .first()
772            .map(|c| *c == 0 || (*c as f32 / height as f32) < 0.1)
773            .unwrap_or(false);
774        let right_blank = col_counts
775            .last()
776            .map(|c| *c == 0 || (*c as f32 / height as f32) < 0.1)
777            .unwrap_or(false);
778
779        let mut changed = false;
780        if top_blank && r.start_row < r.end_row {
781            r.start_row += 1;
782            changed = true;
783        }
784        if bottom_blank && r.end_row > r.start_row {
785            r.end_row -= 1;
786            changed = true;
787        }
788        if left_blank && r.start_col < r.end_col {
789            r.start_col += 1;
790            changed = true;
791        }
792        if right_blank && r.end_col > r.start_col {
793            r.end_col -= 1;
794            changed = true;
795        }
796
797        if !changed {
798            break;
799        }
800        if r.start_row > r.end_row || r.start_col > r.end_col {
801            return None;
802        }
803    }
804    Some(r)
805}
806
807fn build_region(
808    occupancy: &Occupancy,
809    rect: &Rect,
810    metrics: &SheetMetrics,
811    id: u32,
812) -> crate::model::DetectedRegion {
813    let header_info = detect_headers(occupancy, rect);
814    let stats = occupancy.stats_in_rect(rect);
815    let (kind, confidence) = classify_region(rect, &stats, &header_info, metrics);
816    crate::model::DetectedRegion {
817        id,
818        bounds: format!(
819            "{}{}:{}{}",
820            crate::utils::column_number_to_name(rect.start_col),
821            rect.start_row,
822            crate::utils::column_number_to_name(rect.end_col),
823            rect.end_row
824        ),
825        header_row: header_info.header_row,
826        headers: header_info.headers,
827        row_count: rect.end_row - rect.start_row + 1,
828        classification: kind.clone(),
829        region_kind: Some(kind),
830        confidence,
831    }
832}
833
834#[derive(Debug, Default)]
835struct HeaderInfo {
836    header_row: Option<u32>,
837    headers: Vec<String>,
838    is_key_value: bool,
839}
840
841fn is_key_value_layout(occupancy: &Occupancy, rect: &Rect) -> bool {
842    let width = rect.end_col - rect.start_col + 1;
843
844    if width == 2 {
845        return check_key_value_columns(occupancy, rect, rect.start_col, rect.start_col + 1);
846    }
847
848    if width <= KV_MAX_WIDTH_FOR_DENSITY_CHECK {
849        let rows_to_sample = (rect.end_row - rect.start_row + 1).min(KV_SAMPLE_ROWS);
850        let density_threshold = (rows_to_sample as f32 * KV_DENSITY_THRESHOLD) as u32;
851
852        let mut col_densities: Vec<(u32, u32)> = Vec::new();
853        for col in rect.start_col..=rect.end_col {
854            let count = (rect.start_row..rect.start_row + rows_to_sample)
855                .filter(|&row| occupancy.value_at(row, col).is_some())
856                .count() as u32;
857            if count >= density_threshold {
858                col_densities.push((col, count));
859            }
860        }
861
862        if col_densities.len() == 2 {
863            let label_col = col_densities[0].0;
864            let value_col = col_densities[1].0;
865            return check_key_value_columns(occupancy, rect, label_col, value_col);
866        } else if col_densities.len() == 4 && width >= 4 {
867            let pair1 =
868                check_key_value_columns(occupancy, rect, col_densities[0].0, col_densities[1].0);
869            let pair2 =
870                check_key_value_columns(occupancy, rect, col_densities[2].0, col_densities[3].0);
871            return pair1 && pair2;
872        }
873    }
874
875    false
876}
877
878fn check_key_value_columns(
879    occupancy: &Occupancy,
880    rect: &Rect,
881    label_col: u32,
882    value_col: u32,
883) -> bool {
884    let mut label_value_pairs = 0u32;
885    let rows_to_check = (rect.end_row - rect.start_row + 1).min(KV_CHECK_ROWS);
886
887    for row in rect.start_row..rect.start_row + rows_to_check {
888        let first_col = occupancy.value_at(row, label_col);
889        let second_col = occupancy.value_at(row, value_col);
890
891        if let (Some(crate::model::CellValue::Text(label)), Some(val)) = (first_col, second_col) {
892            let label_looks_like_key = label.len() <= KV_MAX_LABEL_LEN
893                && !label.chars().any(|c| c.is_ascii_digit())
894                && label.contains(|c: char| c.is_alphabetic());
895
896            let value_is_data = matches!(
897                val,
898                crate::model::CellValue::Number(_) | crate::model::CellValue::Date(_)
899            ) || matches!(val, crate::model::CellValue::Text(s) if s.len() > KV_MIN_TEXT_VALUE_LEN);
900
901            if label_looks_like_key && value_is_data {
902                label_value_pairs += 1;
903            }
904        }
905    }
906
907    label_value_pairs >= KV_MIN_PAIRS
908        && label_value_pairs as f32 / rows_to_check as f32 >= KV_MIN_PAIR_RATIO
909}
910
911fn header_data_penalty(s: &str) -> f32 {
912    if s.is_empty() {
913        return 0.0;
914    }
915    if s.len() > HEADER_LONG_STRING_PENALTY_THRESHOLD {
916        return HEADER_LONG_STRING_PENALTY;
917    }
918    let first_char = s.chars().next().unwrap();
919    let is_capitalized = first_char.is_uppercase();
920    let has_lowercase = s.chars().skip(1).any(|c| c.is_lowercase());
921    let is_all_caps = s.chars().all(|c| !c.is_alphabetic() || c.is_uppercase());
922    let has_digits = s.chars().any(|c| c.is_ascii_digit());
923    let is_proper_noun =
924        is_capitalized && has_lowercase && !is_all_caps && s.len() > HEADER_PROPER_NOUN_MIN_LEN;
925
926    let mut penalty = 0.0;
927    if is_proper_noun {
928        penalty += HEADER_PROPER_NOUN_PENALTY;
929    }
930    if has_digits && s.len() > HEADER_DIGIT_STRING_MIN_LEN {
931        penalty += HEADER_DIGIT_STRING_PENALTY;
932    }
933    penalty
934}
935
936fn detect_headers(occupancy: &Occupancy, rect: &Rect) -> HeaderInfo {
937    if is_key_value_layout(occupancy, rect) {
938        let mut headers = Vec::new();
939        for col in rect.start_col..=rect.end_col {
940            headers.push(crate::utils::column_number_to_name(col));
941        }
942        return HeaderInfo {
943            header_row: None,
944            headers,
945            is_key_value: true,
946        };
947    }
948
949    let mut candidates = Vec::new();
950    let max_row = rect
951        .start_row
952        .saturating_add(HEADER_MAX_SCAN_ROWS)
953        .min(rect.end_row);
954    for row in rect.start_row..=max_row {
955        let mut text = 0;
956        let mut numbers = 0;
957        let mut non_empty = 0;
958        let mut unique = HashSet::new();
959        let mut data_like_penalty: f32 = 0.0;
960        let mut year_like_bonus: f32 = 0.0;
961
962        for col in rect.start_col..=rect.end_col {
963            if let Some(val) = occupancy.value_at(row, col) {
964                non_empty += 1;
965                match val {
966                    crate::model::CellValue::Text(s) => {
967                        text += 1;
968                        unique.insert(s.clone());
969                        data_like_penalty += header_data_penalty(s);
970                    }
971                    crate::model::CellValue::Number(n) => {
972                        if *n >= HEADER_YEAR_MIN && *n <= HEADER_YEAR_MAX && n.fract() == 0.0 {
973                            year_like_bonus += HEADER_YEAR_LIKE_BONUS;
974                            text += 1;
975                        } else {
976                            numbers += 1;
977                        }
978                    }
979                    crate::model::CellValue::Bool(_) => text += 1,
980                    crate::model::CellValue::Date(_) => {
981                        data_like_penalty += HEADER_DATE_PENALTY;
982                    }
983                    crate::model::CellValue::Error(_) => {}
984                }
985            }
986        }
987        if non_empty == 0 {
988            continue;
989        }
990        let score = text as f32 + unique.len() as f32 * HEADER_UNIQUE_BONUS
991            - numbers as f32 * HEADER_NUMBER_PENALTY
992            - data_like_penalty
993            + year_like_bonus;
994        candidates.push((row, score, text, non_empty));
995    }
996
997    let is_single_col = rect.start_col == rect.end_col;
998
999    let header_candidates: Vec<&(u32, f32, u32, u32)> = candidates
1000        .iter()
1001        .filter(|(_, score, text, non_empty)| {
1002            *text >= 1
1003                && *text * 2 >= *non_empty
1004                && (!is_single_col || *score > HEADER_SINGLE_COL_MIN_SCORE)
1005        })
1006        .collect();
1007
1008    let best = header_candidates.iter().copied().max_by(|a, b| {
1009        a.1.partial_cmp(&b.1)
1010            .unwrap_or(Ordering::Equal)
1011            .then_with(|| b.0.cmp(&a.0))
1012    });
1013    let earliest = header_candidates
1014        .iter()
1015        .copied()
1016        .min_by(|a, b| a.0.cmp(&b.0));
1017
1018    let maybe_header = match (best, earliest) {
1019        (Some(best_row), Some(early_row)) => {
1020            if (best_row.1 - early_row.1).abs() <= HEADER_SCORE_TIE_THRESHOLD {
1021                Some(early_row.0)
1022            } else {
1023                Some(best_row.0)
1024            }
1025        }
1026        (Some(best_row), None) => Some(best_row.0),
1027        _ => None,
1028    };
1029
1030    let mut header_rows = Vec::new();
1031    if let Some(hr) = maybe_header {
1032        header_rows.push(hr);
1033        if hr < rect.end_row
1034            && let Some((_, score_next, text_next, non_empty_next)) =
1035                candidates.iter().find(|(r, _, _, _)| *r == hr + 1)
1036            && *text_next >= 1
1037            && *text_next * 2 >= *non_empty_next
1038            && *score_next
1039                >= HEADER_SECOND_ROW_MIN_SCORE_RATIO
1040                    * candidates
1041                        .iter()
1042                        .find(|(r, _, _, _)| *r == hr)
1043                        .map(|c| c.1)
1044                        .unwrap_or(0.0)
1045        {
1046            header_rows.push(hr + 1);
1047        }
1048    }
1049
1050    let mut headers = Vec::new();
1051    for col in rect.start_col..=rect.end_col {
1052        let mut parts = Vec::new();
1053        for hr in &header_rows {
1054            if let Some(val) = occupancy.value_at(*hr, col) {
1055                match val {
1056                    crate::model::CellValue::Text(s) if !s.trim().is_empty() => {
1057                        parts.push(s.trim().to_string())
1058                    }
1059                    crate::model::CellValue::Number(n) => parts.push(n.to_string()),
1060                    crate::model::CellValue::Bool(b) => parts.push(b.to_string()),
1061                    crate::model::CellValue::Date(d) => parts.push(d.clone()),
1062                    crate::model::CellValue::Error(e) => parts.push(e.clone()),
1063                    _ => {}
1064                }
1065            }
1066        }
1067        if parts.is_empty() {
1068            headers.push(crate::utils::column_number_to_name(col));
1069        } else {
1070            headers.push(parts.join(" / "));
1071        }
1072    }
1073
1074    HeaderInfo {
1075        header_row: header_rows.first().copied(),
1076        headers,
1077        is_key_value: false,
1078    }
1079}
1080
1081fn classify_region(
1082    rect: &Rect,
1083    stats: &RegionStats,
1084    header_info: &HeaderInfo,
1085    metrics: &SheetMetrics,
1086) -> (crate::model::RegionKind, f32) {
1087    let width = rect.end_col - rect.start_col + 1;
1088    let height = rect.end_row - rect.start_row + 1;
1089    let area = width.max(1) * height.max(1);
1090    let density = if area == 0 {
1091        0.0
1092    } else {
1093        stats.non_empty as f32 / area as f32
1094    };
1095    let formula_ratio = if stats.non_empty == 0 {
1096        0.0
1097    } else {
1098        stats.formulas as f32 / stats.non_empty as f32
1099    };
1100    let text_ratio = if stats.non_empty == 0 {
1101        0.0
1102    } else {
1103        stats.text as f32 / stats.non_empty as f32
1104    };
1105
1106    let mut kind = crate::model::RegionKind::Data;
1107    if formula_ratio > 0.25 && is_outputs_band(rect, metrics, height, width) {
1108        kind = crate::model::RegionKind::Outputs;
1109    } else if formula_ratio > 0.55 {
1110        kind = crate::model::RegionKind::Calculator;
1111    } else if height <= 3
1112        && width <= 4
1113        && text_ratio > 0.5
1114        && rect.end_row >= metrics.row_count.saturating_sub(3)
1115    {
1116        kind = crate::model::RegionKind::Metadata;
1117    } else if header_info.is_key_value
1118        || (formula_ratio < 0.25
1119            && stats.numbers > 0
1120            && stats.text > 0
1121            && text_ratio >= 0.3
1122            && (width <= 2 || (width <= 3 && header_info.header_row.is_none())))
1123    {
1124        kind = crate::model::RegionKind::Parameters;
1125    } else if height <= 4 && width <= 6 && formula_ratio < 0.2 && text_ratio > 0.4 && density < 0.5
1126    {
1127        kind = crate::model::RegionKind::Metadata;
1128    }
1129
1130    let mut confidence: f32 = 0.4;
1131    if header_info.header_row.is_some() {
1132        confidence += 0.2;
1133    }
1134    confidence += (density * 0.2).min(0.2);
1135    confidence += (formula_ratio * 0.2).min(0.2);
1136    if matches!(
1137        kind,
1138        crate::model::RegionKind::Parameters | crate::model::RegionKind::Metadata
1139    ) && width <= 4
1140    {
1141        confidence += 0.1;
1142    }
1143    if confidence > 1.0 {
1144        confidence = 1.0;
1145    }
1146
1147    (kind, confidence)
1148}
1149
1150fn is_outputs_band(rect: &Rect, metrics: &SheetMetrics, height: u32, width: u32) -> bool {
1151    let near_bottom = rect.end_row >= metrics.row_count.saturating_sub(6);
1152    let near_right = rect.end_col >= metrics.column_count.saturating_sub(3);
1153    let is_shallow = height <= 6;
1154    let is_narrow_at_edge = width <= 6 && near_right;
1155    let not_at_top_left = rect.start_row > 1 || rect.start_col > 1;
1156    let sheet_has_depth = metrics.row_count > 10 || metrics.column_count > 6;
1157    let is_band = (is_shallow && near_bottom) || is_narrow_at_edge;
1158    is_band && not_at_top_left && sheet_has_depth
1159}
1160
1161fn gather_named_ranges(
1162    sheet: &Worksheet,
1163    defined_names: &[DefinedName],
1164) -> Vec<NamedRangeDescriptor> {
1165    let name_str = sheet.get_name();
1166    defined_names
1167        .iter()
1168        .filter(|name| name.get_address().contains(name_str))
1169        .map(|name| NamedRangeDescriptor {
1170            name: name.get_name().to_string(),
1171            scope: if name.has_local_sheet_id() {
1172                Some(name_str.to_string())
1173            } else {
1174                None
1175            },
1176            refers_to: name.get_address(),
1177            kind: NamedItemKind::NamedRange,
1178            sheet_name: Some(name_str.to_string()),
1179            comment: None,
1180        })
1181        .collect()
1182}
1183
1184pub fn build_workbook_list(
1185    config: &Arc<ServerConfig>,
1186    filter: &WorkbookFilter,
1187) -> Result<WorkbookListResponse> {
1188    let mut descriptors = Vec::new();
1189
1190    if let Some(single) = config.single_workbook() {
1191        let metadata = fs::metadata(single)
1192            .with_context(|| format!("unable to read metadata for {:?}", single))?;
1193        let id = WorkbookId(hash_path_metadata(single, &metadata));
1194        let slug = single
1195            .file_stem()
1196            .map(|s| s.to_string_lossy().to_string())
1197            .unwrap_or_else(|| "workbook".to_string());
1198        let folder = derive_folder(config, single);
1199        let short_id = make_short_workbook_id(&slug, id.as_str());
1200        let caps = BackendCaps::xlsx();
1201
1202        if filter.matches(&slug, folder.as_deref(), single) {
1203            let relative = single
1204                .strip_prefix(&config.workspace_root)
1205                .unwrap_or(single);
1206            let descriptor = crate::model::WorkbookDescriptor {
1207                workbook_id: id,
1208                short_id,
1209                slug,
1210                folder,
1211                path: path_to_forward_slashes(relative),
1212                bytes: metadata.len(),
1213                last_modified: metadata
1214                    .modified()
1215                    .ok()
1216                    .and_then(system_time_to_rfc3339)
1217                    .map(|dt| dt.to_rfc3339_opts(chrono::SecondsFormat::Secs, true)),
1218                caps,
1219            };
1220            descriptors.push(descriptor);
1221        }
1222
1223        return Ok(WorkbookListResponse {
1224            workbooks: descriptors,
1225        });
1226    }
1227
1228    use walkdir::WalkDir;
1229
1230    for entry in WalkDir::new(&config.workspace_root) {
1231        let entry = entry?;
1232        if !entry.file_type().is_file() {
1233            continue;
1234        }
1235        let path = entry.path();
1236        if !has_supported_extension(&config.supported_extensions, path) {
1237            continue;
1238        }
1239        let metadata = entry.metadata()?;
1240        let id = WorkbookId(hash_path_metadata(path, &metadata));
1241        let slug = path
1242            .file_stem()
1243            .map(|s| s.to_string_lossy().to_string())
1244            .unwrap_or_else(|| "workbook".to_string());
1245        let folder = derive_folder(config, path);
1246        let short_id = make_short_workbook_id(&slug, id.as_str());
1247        let caps = BackendCaps::xlsx();
1248
1249        if !filter.matches(&slug, folder.as_deref(), path) {
1250            continue;
1251        }
1252
1253        let relative = path.strip_prefix(&config.workspace_root).unwrap_or(path);
1254        let descriptor = crate::model::WorkbookDescriptor {
1255            workbook_id: id,
1256            short_id,
1257            slug,
1258            folder,
1259            path: path_to_forward_slashes(relative),
1260            bytes: metadata.len(),
1261            last_modified: metadata
1262                .modified()
1263                .ok()
1264                .and_then(system_time_to_rfc3339)
1265                .map(|dt| dt.to_rfc3339_opts(chrono::SecondsFormat::Secs, true)),
1266            caps,
1267        };
1268        descriptors.push(descriptor);
1269    }
1270
1271    descriptors.sort_by(|a, b| a.slug.cmp(&b.slug));
1272
1273    Ok(WorkbookListResponse {
1274        workbooks: descriptors,
1275    })
1276}
1277
1278fn derive_folder(config: &Arc<ServerConfig>, path: &Path) -> Option<String> {
1279    path.strip_prefix(&config.workspace_root)
1280        .ok()
1281        .and_then(|relative| relative.parent())
1282        .and_then(|parent| parent.file_name())
1283        .map(|os| os.to_string_lossy().to_string())
1284}
1285
1286fn has_supported_extension(allowed: &[String], path: &Path) -> bool {
1287    path.extension()
1288        .and_then(|ext| ext.to_str())
1289        .map(|ext| {
1290            let lower = ext.to_ascii_lowercase();
1291            allowed.iter().any(|candidate| candidate == &lower)
1292        })
1293        .unwrap_or(false)
1294}