spreadsheet_mcp/
workbook.rs

1use crate::analysis::{
2    classification,
3    formula::{FormulaAtlas, FormulaGraph},
4    style,
5};
6use crate::caps::BackendCaps;
7use crate::config::ServerConfig;
8use crate::model::{
9    NamedItemKind, NamedRangeDescriptor, SheetClassification, SheetOverviewResponse, SheetSummary,
10    WorkbookDescription, WorkbookId, WorkbookListResponse,
11};
12use crate::tools::filters::WorkbookFilter;
13use crate::utils::{
14    hash_path_metadata, make_short_workbook_id, path_to_forward_slashes, system_time_to_rfc3339,
15};
16use anyhow::{Context, Result, anyhow};
17use chrono::{DateTime, Utc};
18use parking_lot::RwLock;
19use std::cmp::Ordering;
20use std::collections::{HashMap, HashSet};
21use std::fs;
22use std::path::{Path, PathBuf};
23use std::sync::Arc;
24use std::time::Instant;
25use umya_spreadsheet::reader::xlsx;
26use umya_spreadsheet::{DefinedName, Spreadsheet, Worksheet};
27
28const KV_MAX_WIDTH_FOR_DENSITY_CHECK: u32 = 6;
29const KV_SAMPLE_ROWS: u32 = 20;
30const KV_DENSITY_THRESHOLD: f32 = 0.4;
31const KV_CHECK_ROWS: u32 = 15;
32const KV_MAX_LABEL_LEN: usize = 25;
33const KV_MIN_TEXT_VALUE_LEN: usize = 2;
34const KV_MIN_PAIRS: u32 = 3;
35const KV_MIN_PAIR_RATIO: f32 = 0.3;
36
37const HEADER_MAX_SCAN_ROWS: u32 = 2;
38const HEADER_LONG_STRING_PENALTY_THRESHOLD: usize = 40;
39const HEADER_LONG_STRING_PENALTY: f32 = 1.5;
40const HEADER_PROPER_NOUN_MIN_LEN: usize = 5;
41const HEADER_PROPER_NOUN_PENALTY: f32 = 1.0;
42const HEADER_DIGIT_STRING_MIN_LEN: usize = 3;
43const HEADER_DIGIT_STRING_PENALTY: f32 = 0.5;
44const HEADER_DATE_PENALTY: f32 = 1.0;
45const HEADER_YEAR_LIKE_BONUS: f32 = 0.5;
46const HEADER_YEAR_MIN: f64 = 1900.0;
47const HEADER_YEAR_MAX: f64 = 2100.0;
48const HEADER_UNIQUE_BONUS: f32 = 0.2;
49const HEADER_NUMBER_PENALTY: f32 = 0.3;
50const HEADER_SINGLE_COL_MIN_SCORE: f32 = 1.5;
51const HEADER_SCORE_TIE_THRESHOLD: f32 = 0.3;
52const HEADER_SECOND_ROW_MIN_SCORE_RATIO: f32 = 0.6;
53const HEADER_MAX_COLUMNS: u32 = 200;
54
55const DETECT_MAX_ROWS: u32 = 10_000;
56const DETECT_MAX_COLS: u32 = 500;
57const DETECT_MAX_AREA: u64 = 5_000_000;
58const DETECT_MAX_CELLS: usize = 200_000;
59const DETECT_MAX_LEAVES: usize = 200;
60const DETECT_MAX_DEPTH: u32 = 12;
61const DETECT_MAX_MS: u64 = 200;
62const DETECT_OUTLIER_FRACTION: f32 = 0.01;
63const DETECT_OUTLIER_MIN_CELLS: usize = 50;
64
65pub struct WorkbookContext {
66    pub id: WorkbookId,
67    pub short_id: String,
68    pub slug: String,
69    pub path: PathBuf,
70    pub caps: BackendCaps,
71    pub bytes: u64,
72    pub last_modified: Option<DateTime<Utc>>,
73    spreadsheet: Arc<RwLock<Spreadsheet>>,
74    sheet_cache: RwLock<HashMap<String, Arc<SheetCacheEntry>>>,
75    formula_atlas: Arc<FormulaAtlas>,
76}
77
78pub struct SheetCacheEntry {
79    pub metrics: SheetMetrics,
80    pub style_tags: Vec<String>,
81    pub named_ranges: Vec<NamedRangeDescriptor>,
82    detected_regions: RwLock<Option<Vec<crate::model::DetectedRegion>>>,
83    region_notes: RwLock<Vec<String>>,
84}
85
86#[derive(Debug, Clone)]
87pub struct SheetMetrics {
88    pub row_count: u32,
89    pub column_count: u32,
90    pub non_empty_cells: u32,
91    pub formula_cells: u32,
92    pub cached_values: u32,
93    pub comments: u32,
94    pub style_map: HashMap<String, StyleUsage>,
95    pub classification: SheetClassification,
96}
97
98#[derive(Debug, Clone)]
99pub struct StyleUsage {
100    pub occurrences: u32,
101    pub tags: Vec<String>,
102    pub example_cells: Vec<String>,
103}
104
105impl SheetCacheEntry {
106    pub fn detected_regions(&self) -> Vec<crate::model::DetectedRegion> {
107        self.detected_regions
108            .read()
109            .as_ref()
110            .cloned()
111            .unwrap_or_default()
112    }
113
114    pub fn region_notes(&self) -> Vec<String> {
115        self.region_notes.read().clone()
116    }
117
118    pub fn has_detected_regions(&self) -> bool {
119        self.detected_regions.read().is_some()
120    }
121
122    pub fn set_detected_regions(&self, regions: Vec<crate::model::DetectedRegion>) {
123        let mut guard = self.detected_regions.write();
124        if guard.is_none() {
125            *guard = Some(regions);
126        }
127    }
128
129    pub fn set_region_notes(&self, notes: Vec<String>) {
130        if notes.is_empty() {
131            return;
132        }
133        let mut guard = self.region_notes.write();
134        if guard.is_empty() {
135            *guard = notes;
136        }
137    }
138}
139
140impl WorkbookContext {
141    pub fn load(_config: &Arc<ServerConfig>, path: &Path) -> Result<Self> {
142        let metadata = fs::metadata(path)
143            .with_context(|| format!("unable to read metadata for {:?}", path))?;
144        let slug = path
145            .file_stem()
146            .map(|s| s.to_string_lossy().to_string())
147            .unwrap_or_else(|| "workbook".to_string());
148        let bytes = metadata.len();
149        let last_modified = metadata.modified().ok().and_then(system_time_to_rfc3339);
150        let id = WorkbookId(hash_path_metadata(path, &metadata));
151        let spreadsheet =
152            xlsx::read(path).with_context(|| format!("failed to parse workbook {:?}", path))?;
153        let short_id = make_short_workbook_id(&slug, id.as_str());
154
155        Ok(Self {
156            id,
157            short_id,
158            slug,
159            path: path.to_path_buf(),
160            caps: BackendCaps::xlsx(),
161            bytes,
162            last_modified,
163            spreadsheet: Arc::new(RwLock::new(spreadsheet)),
164            sheet_cache: RwLock::new(HashMap::new()),
165            formula_atlas: Arc::new(FormulaAtlas::default()),
166        })
167    }
168
169    pub fn sheet_names(&self) -> Vec<String> {
170        let book = self.spreadsheet.read();
171        book.get_sheet_collection()
172            .iter()
173            .map(|sheet| sheet.get_name().to_string())
174            .collect()
175    }
176
177    pub fn describe(&self) -> WorkbookDescription {
178        let book = self.spreadsheet.read();
179        let defined_names_count = book.get_defined_names().len();
180        let table_count: usize = book
181            .get_sheet_collection()
182            .iter()
183            .map(|sheet| sheet.get_tables().len())
184            .sum();
185        let macros_present = false;
186
187        WorkbookDescription {
188            workbook_id: self.id.clone(),
189            short_id: self.short_id.clone(),
190            slug: self.slug.clone(),
191            path: path_to_forward_slashes(&self.path),
192            bytes: self.bytes,
193            sheet_count: book.get_sheet_collection().len(),
194            defined_names: defined_names_count,
195            tables: table_count,
196            macros_present,
197            last_modified: self
198                .last_modified
199                .map(|dt| dt.to_rfc3339_opts(chrono::SecondsFormat::Secs, true)),
200            caps: self.caps.clone(),
201        }
202    }
203
204    pub fn get_sheet_metrics_fast(&self, sheet_name: &str) -> Result<Arc<SheetCacheEntry>> {
205        if let Some(entry) = self.sheet_cache.read().get(sheet_name) {
206            return Ok(entry.clone());
207        }
208
209        let mut writer = self.sheet_cache.write();
210        if let Some(entry) = writer.get(sheet_name) {
211            return Ok(entry.clone());
212        }
213
214        let book = self.spreadsheet.read();
215        let sheet = book
216            .get_sheet_by_name(sheet_name)
217            .ok_or_else(|| anyhow!("sheet {} not found", sheet_name))?;
218        let (metrics, style_tags) = compute_sheet_metrics(sheet);
219        let named_ranges = gather_named_ranges(sheet, book.get_defined_names());
220
221        let entry = Arc::new(SheetCacheEntry {
222            metrics,
223            style_tags,
224            named_ranges,
225            detected_regions: RwLock::new(None),
226            region_notes: RwLock::new(Vec::new()),
227        });
228
229        writer.insert(sheet_name.to_string(), entry.clone());
230        Ok(entry)
231    }
232
233    pub fn get_sheet_metrics(&self, sheet_name: &str) -> Result<Arc<SheetCacheEntry>> {
234        let entry = self.get_sheet_metrics_fast(sheet_name)?;
235        if entry.has_detected_regions() {
236            return Ok(entry);
237        }
238
239        let book = self.spreadsheet.read();
240        let sheet = book
241            .get_sheet_by_name(sheet_name)
242            .ok_or_else(|| anyhow!("sheet {} not found", sheet_name))?;
243        let detected = detect_regions(sheet, &entry.metrics);
244        entry.set_detected_regions(detected.regions);
245        entry.set_region_notes(detected.notes);
246        Ok(entry)
247    }
248
249    pub fn list_summaries(&self) -> Result<Vec<SheetSummary>> {
250        let book = self.spreadsheet.read();
251        let mut summaries = Vec::new();
252        for sheet in book.get_sheet_collection() {
253            let name = sheet.get_name().to_string();
254            let entry = self.get_sheet_metrics_fast(&name)?;
255            summaries.push(SheetSummary {
256                name: name.clone(),
257                visible: sheet.get_sheet_state() != "hidden",
258                row_count: entry.metrics.row_count,
259                column_count: entry.metrics.column_count,
260                non_empty_cells: entry.metrics.non_empty_cells,
261                formula_cells: entry.metrics.formula_cells,
262                cached_values: entry.metrics.cached_values,
263                classification: entry.metrics.classification.clone(),
264                style_tags: entry.style_tags.clone(),
265            });
266        }
267        Ok(summaries)
268    }
269
270    pub fn with_sheet<T, F>(&self, sheet_name: &str, func: F) -> Result<T>
271    where
272        F: FnOnce(&Worksheet) -> T,
273    {
274        let book = self.spreadsheet.read();
275        let sheet = book
276            .get_sheet_by_name(sheet_name)
277            .ok_or_else(|| anyhow!("sheet {} not found", sheet_name))?;
278        Ok(func(sheet))
279    }
280
281    pub fn with_spreadsheet<T, F>(&self, func: F) -> Result<T>
282    where
283        F: FnOnce(&Spreadsheet) -> T,
284    {
285        let book = self.spreadsheet.read();
286        Ok(func(&book))
287    }
288
289    pub fn formula_graph(&self, sheet_name: &str) -> Result<FormulaGraph> {
290        self.with_sheet(sheet_name, |sheet| {
291            FormulaGraph::build(sheet, &self.formula_atlas)
292        })?
293    }
294
295    pub fn named_items(&self) -> Result<Vec<NamedRangeDescriptor>> {
296        let book = self.spreadsheet.read();
297        let sheet_names: Vec<String> = book
298            .get_sheet_collection()
299            .iter()
300            .map(|sheet| sheet.get_name().to_string())
301            .collect();
302        let mut items = Vec::new();
303        for defined in book.get_defined_names() {
304            let refers_to = defined.get_address();
305            let scope = if defined.has_local_sheet_id() {
306                let idx = *defined.get_local_sheet_id() as usize;
307                sheet_names.get(idx).cloned()
308            } else {
309                None
310            };
311            let kind = if refers_to.starts_with('=') {
312                NamedItemKind::Formula
313            } else {
314                NamedItemKind::NamedRange
315            };
316
317            items.push(NamedRangeDescriptor {
318                name: defined.get_name().to_string(),
319                scope: scope.clone(),
320                refers_to: refers_to.clone(),
321                kind,
322                sheet_name: scope,
323                comment: None,
324            });
325        }
326
327        for sheet in book.get_sheet_collection() {
328            for table in sheet.get_tables() {
329                let start = table.get_area().0.get_coordinate();
330                let end = table.get_area().1.get_coordinate();
331                items.push(NamedRangeDescriptor {
332                    name: table.get_name().to_string(),
333                    scope: Some(sheet.get_name().to_string()),
334                    refers_to: format!("{}:{}", start, end),
335                    kind: NamedItemKind::Table,
336                    sheet_name: Some(sheet.get_name().to_string()),
337                    comment: None,
338                });
339            }
340        }
341
342        Ok(items)
343    }
344
345    pub fn sheet_overview(&self, sheet_name: &str) -> Result<SheetOverviewResponse> {
346        let entry = self.get_sheet_metrics(sheet_name)?;
347        let narrative = classification::narrative(&entry.metrics);
348        let regions = classification::regions(&entry.metrics);
349        let key_ranges = classification::key_ranges(&entry.metrics);
350        let detected_regions = entry.detected_regions();
351
352        Ok(SheetOverviewResponse {
353            workbook_id: self.id.clone(),
354            workbook_short_id: self.short_id.clone(),
355            sheet_name: sheet_name.to_string(),
356            narrative,
357            regions,
358            detected_regions: detected_regions.clone(),
359            detected_region_count: detected_regions.len() as u32,
360            detected_regions_truncated: false,
361            key_ranges,
362            formula_ratio: if entry.metrics.non_empty_cells == 0 {
363                0.0
364            } else {
365                entry.metrics.formula_cells as f32 / entry.metrics.non_empty_cells as f32
366            },
367            notable_features: entry.style_tags.clone(),
368            notes: entry.region_notes(),
369        })
370    }
371
372    pub fn detected_region(
373        &self,
374        sheet_name: &str,
375        id: u32,
376    ) -> Result<crate::model::DetectedRegion> {
377        let entry = self.get_sheet_metrics(sheet_name)?;
378        entry
379            .detected_regions()
380            .iter()
381            .find(|r| r.id == id)
382            .cloned()
383            .ok_or_else(|| anyhow!("region {} not found on sheet {}", id, sheet_name))
384    }
385}
386
387fn contains_date_time_token(format_code: &str) -> bool {
388    let mut in_quote = false;
389    let mut in_bracket = false;
390    let chars: Vec<char> = format_code.chars().collect();
391
392    for (i, &ch) in chars.iter().enumerate() {
393        match ch {
394            '"' => in_quote = !in_quote,
395            '[' if !in_quote => in_bracket = true,
396            ']' if !in_quote => in_bracket = false,
397            'y' | 'd' | 'h' | 's' | 'm' if !in_quote && !in_bracket => {
398                if ch == 'm' {
399                    let prev = if i > 0 { chars.get(i - 1) } else { None };
400                    let next = chars.get(i + 1);
401                    let after_time_sep = prev == Some(&':') || prev == Some(&'h');
402                    let before_time_sep = next == Some(&':') || next == Some(&'s');
403                    if after_time_sep || before_time_sep {
404                        return true;
405                    }
406                    if prev == Some(&'m') || next == Some(&'m') {
407                        return true;
408                    }
409                    if matches!(prev, Some(&'/') | Some(&'-') | Some(&'.'))
410                        || matches!(next, Some(&'/') | Some(&'-') | Some(&'.'))
411                    {
412                        return true;
413                    }
414                } else {
415                    return true;
416                }
417            }
418            _ => {}
419        }
420    }
421    false
422}
423
424const DATE_FORMAT_IDS: &[u32] = &[
425    14, 15, 16, 17, 18, 19, 20, 21, 22, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 45, 46, 47, 50, 51,
426    52, 53, 54, 55, 56, 57, 58,
427];
428
429const EXCEL_LEAP_YEAR_BUG_SERIAL: i64 = 60;
430
431fn is_date_formatted(cell: &umya_spreadsheet::Cell) -> bool {
432    let Some(nf) = cell.get_style().get_number_format() else {
433        return false;
434    };
435
436    let format_id = nf.get_number_format_id();
437    if DATE_FORMAT_IDS.contains(format_id) {
438        return true;
439    }
440
441    let code = nf.get_format_code();
442    if code == "General" || code == "@" || code == "0" || code == "0.00" {
443        return false;
444    }
445
446    contains_date_time_token(code)
447}
448
449pub fn excel_serial_to_iso(serial: f64, use_1904_system: bool) -> String {
450    excel_serial_to_iso_with_leap_bug(serial, use_1904_system, true)
451}
452
453pub fn excel_serial_to_iso_with_leap_bug(
454    serial: f64,
455    use_1904_system: bool,
456    compensate_leap_bug: bool,
457) -> String {
458    use chrono::NaiveDate;
459
460    let days = serial.trunc() as i64;
461
462    if use_1904_system {
463        let epoch_1904 = NaiveDate::from_ymd_opt(1904, 1, 1).unwrap();
464        return epoch_1904
465            .checked_add_signed(chrono::Duration::days(days))
466            .map(|d| d.format("%Y-%m-%d").to_string())
467            .unwrap_or_else(|| serial.to_string());
468    }
469
470    let epoch = if compensate_leap_bug && days >= EXCEL_LEAP_YEAR_BUG_SERIAL {
471        NaiveDate::from_ymd_opt(1899, 12, 30).unwrap()
472    } else {
473        NaiveDate::from_ymd_opt(1899, 12, 31).unwrap()
474    };
475
476    epoch
477        .checked_add_signed(chrono::Duration::days(days))
478        .map(|d| d.format("%Y-%m-%d").to_string())
479        .unwrap_or_else(|| serial.to_string())
480}
481
482pub fn cell_to_value(cell: &umya_spreadsheet::Cell) -> Option<crate::model::CellValue> {
483    cell_to_value_with_date_system(cell, false)
484}
485
486pub fn cell_to_value_with_date_system(
487    cell: &umya_spreadsheet::Cell,
488    use_1904_system: bool,
489) -> Option<crate::model::CellValue> {
490    let raw = cell.get_value();
491    if raw.is_empty() {
492        return None;
493    }
494    if let Ok(number) = raw.parse::<f64>() {
495        if is_date_formatted(cell) {
496            return Some(crate::model::CellValue::Date(excel_serial_to_iso(
497                number,
498                use_1904_system,
499            )));
500        }
501        return Some(crate::model::CellValue::Number(number));
502    }
503
504    let lower = raw.to_ascii_lowercase();
505    if lower == "true" {
506        return Some(crate::model::CellValue::Bool(true));
507    }
508    if lower == "false" {
509        return Some(crate::model::CellValue::Bool(false));
510    }
511
512    Some(crate::model::CellValue::Text(raw.to_string()))
513}
514
515pub fn compute_sheet_metrics(sheet: &Worksheet) -> (SheetMetrics, Vec<String>) {
516    use std::collections::HashMap as StdHashMap;
517    let mut non_empty = 0u32;
518    let mut formulas = 0u32;
519    let mut cached = 0u32;
520    let comments = sheet.get_comments().len() as u32;
521    let mut style_usage: StdHashMap<String, StyleUsage> = StdHashMap::new();
522
523    for cell in sheet.get_cell_collection() {
524        let value = cell.get_value();
525        if !value.is_empty() {
526            non_empty += 1;
527        }
528        if cell.is_formula() {
529            formulas += 1;
530            if !cell.get_value().is_empty() {
531                cached += 1;
532            }
533        }
534
535        if let Some((style_key, usage)) = style::tag_cell(cell) {
536            let entry = style_usage.entry(style_key).or_insert_with(|| StyleUsage {
537                occurrences: 0,
538                tags: usage.tags.clone(),
539                example_cells: Vec::new(),
540            });
541            entry.occurrences += 1;
542            if entry.example_cells.len() < 5 {
543                entry.example_cells.push(usage.example_cell.clone());
544            }
545        }
546    }
547
548    let (max_col, max_row) = sheet.get_highest_column_and_row();
549
550    let classification = classification::classify(
551        non_empty,
552        formulas,
553        max_row,
554        max_col,
555        comments,
556        &style_usage,
557    );
558
559    let style_tags: Vec<String> = style_usage
560        .values()
561        .flat_map(|usage| usage.tags.clone())
562        .collect();
563
564    let metrics = SheetMetrics {
565        row_count: max_row,
566        column_count: max_col,
567        non_empty_cells: non_empty,
568        formula_cells: formulas,
569        cached_values: cached,
570        comments,
571        style_map: style_usage,
572        classification,
573    };
574    (metrics, style_tags)
575}
576
577#[derive(Debug, Clone, Copy)]
578struct Rect {
579    start_row: u32,
580    end_row: u32,
581    start_col: u32,
582    end_col: u32,
583}
584
585#[derive(Debug, Clone)]
586struct CellInfo {
587    value: Option<crate::model::CellValue>,
588    is_formula: bool,
589}
590
591#[derive(Debug)]
592struct Occupancy {
593    cells: HashMap<(u32, u32), CellInfo>,
594    rows: HashMap<u32, Vec<u32>>,
595    cols: HashMap<u32, Vec<u32>>,
596    min_row: u32,
597    max_row: u32,
598    min_col: u32,
599    max_col: u32,
600}
601
602impl Occupancy {
603    fn bounds_rect(&self) -> Option<Rect> {
604        if self.cells.is_empty() {
605            None
606        } else {
607            Some(Rect {
608                start_row: self.min_row,
609                end_row: self.max_row,
610                start_col: self.min_col,
611                end_col: self.max_col,
612            })
613        }
614    }
615
616    fn dense_bounds(&self) -> Option<Rect> {
617        let bounds = self.bounds_rect()?;
618        let total_cells = self.cells.len();
619        if total_cells < DETECT_OUTLIER_MIN_CELLS {
620            return Some(bounds);
621        }
622        let trim_cells = ((total_cells as f32) * DETECT_OUTLIER_FRACTION).round() as usize;
623        if trim_cells == 0 || trim_cells * 2 >= total_cells {
624            return Some(bounds);
625        }
626
627        let mut row_counts: Vec<(u32, usize)> = self
628            .rows
629            .iter()
630            .map(|(row, cols)| (*row, cols.len()))
631            .collect();
632        row_counts.sort_by_key(|(row, _)| *row);
633
634        let mut col_counts: Vec<(u32, usize)> = self
635            .cols
636            .iter()
637            .map(|(col, rows)| (*col, rows.len()))
638            .collect();
639        col_counts.sort_by_key(|(col, _)| *col);
640
641        let (start_row, end_row) =
642            trim_bounds_by_cells(&row_counts, trim_cells, bounds.start_row, bounds.end_row);
643        let (start_col, end_col) =
644            trim_bounds_by_cells(&col_counts, trim_cells, bounds.start_col, bounds.end_col);
645
646        if start_row > end_row || start_col > end_col {
647            return Some(bounds);
648        }
649
650        Some(Rect {
651            start_row,
652            end_row,
653            start_col,
654            end_col,
655        })
656    }
657
658    fn row_col_counts(&self, rect: &Rect) -> (Vec<u32>, Vec<u32>) {
659        let height = (rect.end_row - rect.start_row + 1) as usize;
660        let width = (rect.end_col - rect.start_col + 1) as usize;
661        let mut row_counts = vec![0u32; height];
662        let mut col_counts = vec![0u32; width];
663
664        for (row, cols) in &self.rows {
665            if *row < rect.start_row || *row > rect.end_row {
666                continue;
667            }
668            let count = count_in_sorted_range(cols, rect.start_col, rect.end_col);
669            row_counts[(row - rect.start_row) as usize] = count;
670        }
671        for (col, rows) in &self.cols {
672            if *col < rect.start_col || *col > rect.end_col {
673                continue;
674            }
675            let count = count_in_sorted_range(rows, rect.start_row, rect.end_row);
676            col_counts[(col - rect.start_col) as usize] = count;
677        }
678        (row_counts, col_counts)
679    }
680
681    fn stats_in_rect(&self, rect: &Rect) -> RegionStats {
682        let mut stats = RegionStats::default();
683        for (row, cols) in &self.rows {
684            if *row < rect.start_row || *row > rect.end_row {
685                continue;
686            }
687            let start_idx = lower_bound(cols, rect.start_col);
688            let end_idx = upper_bound(cols, rect.end_col);
689            for col in &cols[start_idx..end_idx] {
690                if let Some(info) = self.cells.get(&(*row, *col)) {
691                    stats.non_empty += 1;
692                    if info.is_formula {
693                        stats.formulas += 1;
694                    }
695                    if let Some(val) = &info.value {
696                        match val {
697                            crate::model::CellValue::Text(_) => stats.text += 1,
698                            crate::model::CellValue::Number(_) => stats.numbers += 1,
699                            crate::model::CellValue::Bool(_) => stats.bools += 1,
700                            crate::model::CellValue::Date(_) => stats.dates += 1,
701                            crate::model::CellValue::Error(_) => stats.errors += 1,
702                        }
703                    }
704                }
705            }
706        }
707        stats
708    }
709
710    fn value_at(&self, row: u32, col: u32) -> Option<&crate::model::CellValue> {
711        self.cells.get(&(row, col)).and_then(|c| c.value.as_ref())
712    }
713}
714
715fn lower_bound(values: &[u32], target: u32) -> usize {
716    let mut left = 0;
717    let mut right = values.len();
718    while left < right {
719        let mid = (left + right) / 2;
720        if values[mid] < target {
721            left = mid + 1;
722        } else {
723            right = mid;
724        }
725    }
726    left
727}
728
729fn upper_bound(values: &[u32], target: u32) -> usize {
730    let mut left = 0;
731    let mut right = values.len();
732    while left < right {
733        let mid = (left + right) / 2;
734        if values[mid] <= target {
735            left = mid + 1;
736        } else {
737            right = mid;
738        }
739    }
740    left
741}
742
743fn count_in_sorted_range(values: &[u32], start: u32, end: u32) -> u32 {
744    if values.is_empty() {
745        return 0;
746    }
747    let start_idx = lower_bound(values, start);
748    let end_idx = upper_bound(values, end);
749    end_idx.saturating_sub(start_idx) as u32
750}
751
752fn trim_bounds_by_cells(
753    entries: &[(u32, usize)],
754    trim_cells: usize,
755    default_start: u32,
756    default_end: u32,
757) -> (u32, u32) {
758    if entries.is_empty() {
759        return (default_start, default_end);
760    }
761
762    let mut remaining = trim_cells;
763    let mut start_idx = 0usize;
764    while start_idx < entries.len() {
765        let count = entries[start_idx].1;
766        if remaining < count {
767            break;
768        }
769        remaining -= count;
770        start_idx += 1;
771    }
772
773    let mut remaining = trim_cells;
774    let mut end_idx = entries.len();
775    while end_idx > 0 {
776        let count = entries[end_idx - 1].1;
777        if remaining < count {
778            break;
779        }
780        remaining -= count;
781        end_idx -= 1;
782    }
783
784    let start = entries
785        .get(start_idx)
786        .map(|(idx, _)| *idx)
787        .unwrap_or(default_start);
788    let end = if end_idx == 0 {
789        default_end
790    } else {
791        entries
792            .get(end_idx - 1)
793            .map(|(idx, _)| *idx)
794            .unwrap_or(default_end)
795    };
796    (start, end)
797}
798
799#[derive(Debug, Default, Clone)]
800struct RegionStats {
801    non_empty: u32,
802    formulas: u32,
803    text: u32,
804    numbers: u32,
805    bools: u32,
806    dates: u32,
807    errors: u32,
808}
809
810#[derive(Debug, Clone, Copy, PartialEq, Eq)]
811enum Gutter {
812    Row { start: u32, end: u32 },
813    Col { start: u32, end: u32 },
814}
815
816#[derive(Debug, Default)]
817struct DetectRegionsResult {
818    regions: Vec<crate::model::DetectedRegion>,
819    notes: Vec<String>,
820}
821
822#[derive(Debug)]
823struct DetectLimits {
824    start: Instant,
825    max_ms: u64,
826    max_leaves: usize,
827    max_depth: u32,
828    leaves: usize,
829    exceeded_time: bool,
830    exceeded_leaves: bool,
831}
832
833impl DetectLimits {
834    fn new() -> Self {
835        Self {
836            start: Instant::now(),
837            max_ms: DETECT_MAX_MS,
838            max_leaves: DETECT_MAX_LEAVES,
839            max_depth: DETECT_MAX_DEPTH,
840            leaves: 0,
841            exceeded_time: false,
842            exceeded_leaves: false,
843        }
844    }
845
846    fn should_stop(&mut self) -> bool {
847        if !self.exceeded_time && self.start.elapsed().as_millis() as u64 >= self.max_ms {
848            self.exceeded_time = true;
849        }
850        self.exceeded_time || self.exceeded_leaves
851    }
852
853    fn note_leaf(&mut self) {
854        self.leaves += 1;
855        if self.leaves >= self.max_leaves {
856            self.exceeded_leaves = true;
857        }
858    }
859}
860
861fn detect_regions(sheet: &Worksheet, metrics: &SheetMetrics) -> DetectRegionsResult {
862    if metrics.row_count == 0 || metrics.column_count == 0 {
863        return DetectRegionsResult::default();
864    }
865
866    let occupancy = build_occupancy(sheet);
867    if occupancy.cells.is_empty() {
868        return DetectRegionsResult::default();
869    }
870
871    let area = (metrics.row_count as u64) * (metrics.column_count as u64);
872    let exceeds_caps = metrics.row_count > DETECT_MAX_ROWS
873        || metrics.column_count > DETECT_MAX_COLS
874        || area > DETECT_MAX_AREA
875        || occupancy.cells.len() > DETECT_MAX_CELLS;
876
877    if exceeds_caps {
878        let mut result = DetectRegionsResult::default();
879        if let Some(bounds) = occupancy.dense_bounds() {
880            result.regions.push(build_fallback_region(&bounds, metrics));
881        }
882        result.notes.push(format!(
883            "Region detection capped: rows {}, cols {}, occupied {}.",
884            metrics.row_count,
885            metrics.column_count,
886            occupancy.cells.len()
887        ));
888        return result;
889    }
890
891    let root = occupancy.bounds_rect().unwrap_or(Rect {
892        start_row: 1,
893        end_row: metrics.row_count.max(1),
894        start_col: 1,
895        end_col: metrics.column_count.max(1),
896    });
897
898    let mut leaves = Vec::new();
899    let mut limits = DetectLimits::new();
900    split_rect(&occupancy, &root, 0, &mut limits, &mut leaves);
901
902    let mut regions = Vec::new();
903    for (idx, rect) in leaves.into_iter().enumerate() {
904        if limits.should_stop() {
905            break;
906        }
907        if let Some(trimmed) = trim_rect(&occupancy, rect, &mut limits) {
908            let region = build_region(&occupancy, &trimmed, metrics, idx as u32);
909            regions.push(region);
910        }
911    }
912
913    let mut notes = Vec::new();
914    if limits.exceeded_time || limits.exceeded_leaves {
915        notes.push("Region detection truncated due to time/complexity caps.".to_string());
916    }
917    if regions.is_empty() {
918        if let Some(bounds) = occupancy.dense_bounds() {
919            regions.push(build_fallback_region(&bounds, metrics));
920            notes.push("Region detection returned no regions; fallback bounds used.".to_string());
921        }
922    }
923
924    DetectRegionsResult { regions, notes }
925}
926
927fn build_fallback_region(rect: &Rect, metrics: &SheetMetrics) -> crate::model::DetectedRegion {
928    let kind = match metrics.classification {
929        SheetClassification::Calculator => crate::model::RegionKind::Calculator,
930        SheetClassification::Metadata => crate::model::RegionKind::Metadata,
931        _ => crate::model::RegionKind::Data,
932    };
933    let end_col = crate::utils::column_number_to_name(rect.end_col.max(1));
934    let end_cell = format!("{}{}", end_col, rect.end_row.max(1));
935    let header_count = rect.end_col - rect.start_col + 1;
936    crate::model::DetectedRegion {
937        id: 0,
938        bounds: format!(
939            "{}{}:{}",
940            crate::utils::column_number_to_name(rect.start_col),
941            rect.start_row,
942            end_cell
943        ),
944        header_row: None,
945        headers: Vec::new(),
946        header_count,
947        headers_truncated: header_count > 0,
948        row_count: rect.end_row - rect.start_row + 1,
949        classification: kind.clone(),
950        region_kind: Some(kind),
951        confidence: 0.2,
952    }
953}
954
955fn build_occupancy(sheet: &Worksheet) -> Occupancy {
956    let mut cells = HashMap::new();
957    let mut rows: HashMap<u32, Vec<u32>> = HashMap::new();
958    let mut cols: HashMap<u32, Vec<u32>> = HashMap::new();
959    let mut min_row = u32::MAX;
960    let mut max_row = 0u32;
961    let mut min_col = u32::MAX;
962    let mut max_col = 0u32;
963
964    for cell in sheet.get_cell_collection() {
965        let coord = cell.get_coordinate();
966        let row = *coord.get_row_num();
967        let col = *coord.get_col_num();
968        let value = cell_to_value(cell);
969        let is_formula = cell.is_formula();
970        cells.insert((row, col), CellInfo { value, is_formula });
971        rows.entry(row).or_default().push(col);
972        cols.entry(col).or_default().push(row);
973        min_row = min_row.min(row);
974        max_row = max_row.max(row);
975        min_col = min_col.min(col);
976        max_col = max_col.max(col);
977    }
978
979    for cols in rows.values_mut() {
980        cols.sort_unstable();
981    }
982    for rows in cols.values_mut() {
983        rows.sort_unstable();
984    }
985
986    if cells.is_empty() {
987        min_row = 0;
988        min_col = 0;
989    }
990
991    Occupancy {
992        cells,
993        rows,
994        cols,
995        min_row,
996        max_row,
997        min_col,
998        max_col,
999    }
1000}
1001
1002fn split_rect(
1003    occupancy: &Occupancy,
1004    rect: &Rect,
1005    depth: u32,
1006    limits: &mut DetectLimits,
1007    leaves: &mut Vec<Rect>,
1008) {
1009    if limits.should_stop() || depth >= limits.max_depth {
1010        limits.note_leaf();
1011        leaves.push(*rect);
1012        return;
1013    }
1014    if rect.start_row >= rect.end_row && rect.start_col >= rect.end_col {
1015        limits.note_leaf();
1016        leaves.push(*rect);
1017        return;
1018    }
1019    if let Some(gutter) = find_best_gutter(occupancy, rect, limits) {
1020        match gutter {
1021            Gutter::Row { start, end } => {
1022                if start > rect.start_row {
1023                    let upper = Rect {
1024                        start_row: rect.start_row,
1025                        end_row: start - 1,
1026                        start_col: rect.start_col,
1027                        end_col: rect.end_col,
1028                    };
1029                    split_rect(occupancy, &upper, depth + 1, limits, leaves);
1030                }
1031                if end < rect.end_row {
1032                    let lower = Rect {
1033                        start_row: end + 1,
1034                        end_row: rect.end_row,
1035                        start_col: rect.start_col,
1036                        end_col: rect.end_col,
1037                    };
1038                    split_rect(occupancy, &lower, depth + 1, limits, leaves);
1039                }
1040            }
1041            Gutter::Col { start, end } => {
1042                if start > rect.start_col {
1043                    let left = Rect {
1044                        start_row: rect.start_row,
1045                        end_row: rect.end_row,
1046                        start_col: rect.start_col,
1047                        end_col: start - 1,
1048                    };
1049                    split_rect(occupancy, &left, depth + 1, limits, leaves);
1050                }
1051                if end < rect.end_col {
1052                    let right = Rect {
1053                        start_row: rect.start_row,
1054                        end_row: rect.end_row,
1055                        start_col: end + 1,
1056                        end_col: rect.end_col,
1057                    };
1058                    split_rect(occupancy, &right, depth + 1, limits, leaves);
1059                }
1060            }
1061        }
1062        return;
1063    }
1064    limits.note_leaf();
1065    leaves.push(*rect);
1066}
1067
1068fn find_best_gutter(
1069    occupancy: &Occupancy,
1070    rect: &Rect,
1071    limits: &mut DetectLimits,
1072) -> Option<Gutter> {
1073    if limits.should_stop() {
1074        return None;
1075    }
1076    let (row_counts, col_counts) = occupancy.row_col_counts(rect);
1077    let width = rect.end_col - rect.start_col + 1;
1078    let height = rect.end_row - rect.start_row + 1;
1079
1080    let row_blank_runs = find_blank_runs(&row_counts, width);
1081    let col_blank_runs = find_blank_runs(&col_counts, height);
1082
1083    let mut best: Option<(Gutter, u32)> = None;
1084
1085    if let Some((start, end, len)) = row_blank_runs {
1086        let gutter = Gutter::Row {
1087            start: rect.start_row + start,
1088            end: rect.start_row + end,
1089        };
1090        best = Some((gutter, len));
1091    }
1092    if let Some((start, end, len)) = col_blank_runs {
1093        let gutter = Gutter::Col {
1094            start: rect.start_col + start,
1095            end: rect.start_col + end,
1096        };
1097        if best.map(|(_, l)| len > l).unwrap_or(true) {
1098            best = Some((gutter, len));
1099        }
1100    }
1101
1102    best.map(|(g, _)| g)
1103}
1104
1105fn find_blank_runs(counts: &[u32], span: u32) -> Option<(u32, u32, u32)> {
1106    if counts.is_empty() {
1107        return None;
1108    }
1109    let mut best_start = 0;
1110    let mut best_end = 0;
1111    let mut best_len = 0;
1112    let mut current_start = None;
1113    for (idx, count) in counts.iter().enumerate() {
1114        let is_blank = *count == 0 || (*count as f32 / span as f32) < 0.05;
1115        if is_blank {
1116            if current_start.is_none() {
1117                current_start = Some(idx as u32);
1118            }
1119        } else if let Some(start) = current_start.take() {
1120            let end = idx as u32 - 1;
1121            let len = end - start + 1;
1122            if len > best_len && start > 0 && end + 1 < counts.len() as u32 {
1123                best_len = len;
1124                best_start = start;
1125                best_end = end;
1126            }
1127        }
1128    }
1129    if let Some(start) = current_start {
1130        let end = counts.len() as u32 - 1;
1131        let len = end - start + 1;
1132        if len > best_len && start > 0 && end + 1 < counts.len() as u32 {
1133            best_len = len;
1134            best_start = start;
1135            best_end = end;
1136        }
1137    }
1138    if best_len >= 2 {
1139        Some((best_start, best_end, best_len))
1140    } else {
1141        None
1142    }
1143}
1144
1145fn trim_rect(occupancy: &Occupancy, rect: Rect, limits: &mut DetectLimits) -> Option<Rect> {
1146    let mut r = rect;
1147    loop {
1148        if limits.should_stop() {
1149            return Some(r);
1150        }
1151        let (row_counts, col_counts) = occupancy.row_col_counts(&r);
1152        let width = r.end_col - r.start_col + 1;
1153        let height = r.end_row - r.start_row + 1;
1154        let top_blank = row_counts
1155            .first()
1156            .map(|c| *c == 0 || (*c as f32 / width as f32) < 0.1)
1157            .unwrap_or(false);
1158        let bottom_blank = row_counts
1159            .last()
1160            .map(|c| *c == 0 || (*c as f32 / width as f32) < 0.1)
1161            .unwrap_or(false);
1162        let left_blank = col_counts
1163            .first()
1164            .map(|c| *c == 0 || (*c as f32 / height as f32) < 0.1)
1165            .unwrap_or(false);
1166        let right_blank = col_counts
1167            .last()
1168            .map(|c| *c == 0 || (*c as f32 / height as f32) < 0.1)
1169            .unwrap_or(false);
1170
1171        let mut changed = false;
1172        if top_blank && r.start_row < r.end_row {
1173            r.start_row += 1;
1174            changed = true;
1175        }
1176        if bottom_blank && r.end_row > r.start_row {
1177            r.end_row -= 1;
1178            changed = true;
1179        }
1180        if left_blank && r.start_col < r.end_col {
1181            r.start_col += 1;
1182            changed = true;
1183        }
1184        if right_blank && r.end_col > r.start_col {
1185            r.end_col -= 1;
1186            changed = true;
1187        }
1188
1189        if !changed {
1190            break;
1191        }
1192        if r.start_row > r.end_row || r.start_col > r.end_col {
1193            return None;
1194        }
1195    }
1196    Some(r)
1197}
1198
1199fn build_region(
1200    occupancy: &Occupancy,
1201    rect: &Rect,
1202    metrics: &SheetMetrics,
1203    id: u32,
1204) -> crate::model::DetectedRegion {
1205    let header_info = detect_headers(occupancy, rect);
1206    let stats = occupancy.stats_in_rect(rect);
1207    let (kind, confidence) = classify_region(rect, &stats, &header_info, metrics);
1208    let header_len = header_info.headers.len() as u32;
1209    let header_count = rect.end_col - rect.start_col + 1;
1210    let headers_truncated = header_len != header_count;
1211    crate::model::DetectedRegion {
1212        id,
1213        bounds: format!(
1214            "{}{}:{}{}",
1215            crate::utils::column_number_to_name(rect.start_col),
1216            rect.start_row,
1217            crate::utils::column_number_to_name(rect.end_col),
1218            rect.end_row
1219        ),
1220        header_row: header_info.header_row,
1221        headers: header_info.headers,
1222        header_count,
1223        headers_truncated,
1224        row_count: rect.end_row - rect.start_row + 1,
1225        classification: kind.clone(),
1226        region_kind: Some(kind),
1227        confidence,
1228    }
1229}
1230
1231#[derive(Debug, Default)]
1232struct HeaderInfo {
1233    header_row: Option<u32>,
1234    headers: Vec<String>,
1235    is_key_value: bool,
1236}
1237
1238fn is_key_value_layout(occupancy: &Occupancy, rect: &Rect) -> bool {
1239    let width = rect.end_col - rect.start_col + 1;
1240
1241    if width == 2 {
1242        return check_key_value_columns(occupancy, rect, rect.start_col, rect.start_col + 1);
1243    }
1244
1245    if width <= KV_MAX_WIDTH_FOR_DENSITY_CHECK {
1246        let rows_to_sample = (rect.end_row - rect.start_row + 1).min(KV_SAMPLE_ROWS);
1247        let density_threshold = (rows_to_sample as f32 * KV_DENSITY_THRESHOLD) as u32;
1248
1249        let mut col_densities: Vec<(u32, u32)> = Vec::new();
1250        for col in rect.start_col..=rect.end_col {
1251            let count = (rect.start_row..rect.start_row + rows_to_sample)
1252                .filter(|&row| occupancy.value_at(row, col).is_some())
1253                .count() as u32;
1254            if count >= density_threshold {
1255                col_densities.push((col, count));
1256            }
1257        }
1258
1259        if col_densities.len() == 2 {
1260            let label_col = col_densities[0].0;
1261            let value_col = col_densities[1].0;
1262            return check_key_value_columns(occupancy, rect, label_col, value_col);
1263        } else if col_densities.len() == 4 && width >= 4 {
1264            let pair1 =
1265                check_key_value_columns(occupancy, rect, col_densities[0].0, col_densities[1].0);
1266            let pair2 =
1267                check_key_value_columns(occupancy, rect, col_densities[2].0, col_densities[3].0);
1268            return pair1 && pair2;
1269        }
1270    }
1271
1272    false
1273}
1274
1275fn check_key_value_columns(
1276    occupancy: &Occupancy,
1277    rect: &Rect,
1278    label_col: u32,
1279    value_col: u32,
1280) -> bool {
1281    let mut label_value_pairs = 0u32;
1282    let rows_to_check = (rect.end_row - rect.start_row + 1).min(KV_CHECK_ROWS);
1283
1284    for row in rect.start_row..rect.start_row + rows_to_check {
1285        let first_col = occupancy.value_at(row, label_col);
1286        let second_col = occupancy.value_at(row, value_col);
1287
1288        if let (Some(crate::model::CellValue::Text(label)), Some(val)) = (first_col, second_col) {
1289            let label_looks_like_key = label.len() <= KV_MAX_LABEL_LEN
1290                && !label.chars().any(|c| c.is_ascii_digit())
1291                && label.contains(|c: char| c.is_alphabetic());
1292
1293            let value_is_data = matches!(
1294                val,
1295                crate::model::CellValue::Number(_) | crate::model::CellValue::Date(_)
1296            ) || matches!(val, crate::model::CellValue::Text(s) if s.len() > KV_MIN_TEXT_VALUE_LEN);
1297
1298            if label_looks_like_key && value_is_data {
1299                label_value_pairs += 1;
1300            }
1301        }
1302    }
1303
1304    label_value_pairs >= KV_MIN_PAIRS
1305        && label_value_pairs as f32 / rows_to_check as f32 >= KV_MIN_PAIR_RATIO
1306}
1307
1308fn header_data_penalty(s: &str) -> f32 {
1309    if s.is_empty() {
1310        return 0.0;
1311    }
1312    if s.len() > HEADER_LONG_STRING_PENALTY_THRESHOLD {
1313        return HEADER_LONG_STRING_PENALTY;
1314    }
1315    let first_char = s.chars().next().unwrap();
1316    let is_capitalized = first_char.is_uppercase();
1317    let has_lowercase = s.chars().skip(1).any(|c| c.is_lowercase());
1318    let is_all_caps = s.chars().all(|c| !c.is_alphabetic() || c.is_uppercase());
1319    let has_digits = s.chars().any(|c| c.is_ascii_digit());
1320    let is_proper_noun =
1321        is_capitalized && has_lowercase && !is_all_caps && s.len() > HEADER_PROPER_NOUN_MIN_LEN;
1322
1323    let mut penalty = 0.0;
1324    if is_proper_noun {
1325        penalty += HEADER_PROPER_NOUN_PENALTY;
1326    }
1327    if has_digits && s.len() > HEADER_DIGIT_STRING_MIN_LEN {
1328        penalty += HEADER_DIGIT_STRING_PENALTY;
1329    }
1330    penalty
1331}
1332
1333fn detect_headers(occupancy: &Occupancy, rect: &Rect) -> HeaderInfo {
1334    if is_key_value_layout(occupancy, rect) {
1335        let mut headers = Vec::new();
1336        for col in rect.start_col..=rect.end_col {
1337            headers.push(crate::utils::column_number_to_name(col));
1338        }
1339        return HeaderInfo {
1340            header_row: None,
1341            headers,
1342            is_key_value: true,
1343        };
1344    }
1345
1346    let width = rect.end_col - rect.start_col + 1;
1347    if width > HEADER_MAX_COLUMNS {
1348        return HeaderInfo {
1349            header_row: None,
1350            headers: Vec::new(),
1351            is_key_value: false,
1352        };
1353    }
1354
1355    let mut candidates = Vec::new();
1356    let max_row = rect
1357        .start_row
1358        .saturating_add(HEADER_MAX_SCAN_ROWS)
1359        .min(rect.end_row);
1360    for row in rect.start_row..=max_row {
1361        let mut text = 0;
1362        let mut numbers = 0;
1363        let mut non_empty = 0;
1364        let mut unique = HashSet::new();
1365        let mut data_like_penalty: f32 = 0.0;
1366        let mut year_like_bonus: f32 = 0.0;
1367
1368        for col in rect.start_col..=rect.end_col {
1369            if let Some(val) = occupancy.value_at(row, col) {
1370                non_empty += 1;
1371                match val {
1372                    crate::model::CellValue::Text(s) => {
1373                        text += 1;
1374                        unique.insert(s.clone());
1375                        data_like_penalty += header_data_penalty(s);
1376                    }
1377                    crate::model::CellValue::Number(n) => {
1378                        if *n >= HEADER_YEAR_MIN && *n <= HEADER_YEAR_MAX && n.fract() == 0.0 {
1379                            year_like_bonus += HEADER_YEAR_LIKE_BONUS;
1380                            text += 1;
1381                        } else {
1382                            numbers += 1;
1383                        }
1384                    }
1385                    crate::model::CellValue::Bool(_) => text += 1,
1386                    crate::model::CellValue::Date(_) => {
1387                        data_like_penalty += HEADER_DATE_PENALTY;
1388                    }
1389                    crate::model::CellValue::Error(_) => {}
1390                }
1391            }
1392        }
1393        if non_empty == 0 {
1394            continue;
1395        }
1396        let score = text as f32 + unique.len() as f32 * HEADER_UNIQUE_BONUS
1397            - numbers as f32 * HEADER_NUMBER_PENALTY
1398            - data_like_penalty
1399            + year_like_bonus;
1400        candidates.push((row, score, text, non_empty));
1401    }
1402
1403    let is_single_col = rect.start_col == rect.end_col;
1404
1405    let header_candidates: Vec<&(u32, f32, u32, u32)> = candidates
1406        .iter()
1407        .filter(|(_, score, text, non_empty)| {
1408            *text >= 1
1409                && *text * 2 >= *non_empty
1410                && (!is_single_col || *score > HEADER_SINGLE_COL_MIN_SCORE)
1411        })
1412        .collect();
1413
1414    let best = header_candidates.iter().copied().max_by(|a, b| {
1415        a.1.partial_cmp(&b.1)
1416            .unwrap_or(Ordering::Equal)
1417            .then_with(|| b.0.cmp(&a.0))
1418    });
1419    let earliest = header_candidates
1420        .iter()
1421        .copied()
1422        .min_by(|a, b| a.0.cmp(&b.0));
1423
1424    let maybe_header = match (best, earliest) {
1425        (Some(best_row), Some(early_row)) => {
1426            if (best_row.1 - early_row.1).abs() <= HEADER_SCORE_TIE_THRESHOLD {
1427                Some(early_row.0)
1428            } else {
1429                Some(best_row.0)
1430            }
1431        }
1432        (Some(best_row), None) => Some(best_row.0),
1433        _ => None,
1434    };
1435
1436    let mut header_rows = Vec::new();
1437    if let Some(hr) = maybe_header {
1438        header_rows.push(hr);
1439        if hr < rect.end_row
1440            && let Some((_, score_next, text_next, non_empty_next)) =
1441                candidates.iter().find(|(r, _, _, _)| *r == hr + 1)
1442            && *text_next >= 1
1443            && *text_next * 2 >= *non_empty_next
1444            && *score_next
1445                >= HEADER_SECOND_ROW_MIN_SCORE_RATIO
1446                    * candidates
1447                        .iter()
1448                        .find(|(r, _, _, _)| *r == hr)
1449                        .map(|c| c.1)
1450                        .unwrap_or(0.0)
1451        {
1452            header_rows.push(hr + 1);
1453        }
1454    }
1455
1456    let mut headers = Vec::new();
1457    for col in rect.start_col..=rect.end_col {
1458        let mut parts = Vec::new();
1459        for hr in &header_rows {
1460            if let Some(val) = occupancy.value_at(*hr, col) {
1461                match val {
1462                    crate::model::CellValue::Text(s) if !s.trim().is_empty() => {
1463                        parts.push(s.trim().to_string())
1464                    }
1465                    crate::model::CellValue::Number(n) => parts.push(n.to_string()),
1466                    crate::model::CellValue::Bool(b) => parts.push(b.to_string()),
1467                    crate::model::CellValue::Date(d) => parts.push(d.clone()),
1468                    crate::model::CellValue::Error(e) => parts.push(e.clone()),
1469                    _ => {}
1470                }
1471            }
1472        }
1473        if parts.is_empty() {
1474            headers.push(crate::utils::column_number_to_name(col));
1475        } else {
1476            headers.push(parts.join(" / "));
1477        }
1478    }
1479
1480    HeaderInfo {
1481        header_row: header_rows.first().copied(),
1482        headers,
1483        is_key_value: false,
1484    }
1485}
1486
1487fn classify_region(
1488    rect: &Rect,
1489    stats: &RegionStats,
1490    header_info: &HeaderInfo,
1491    metrics: &SheetMetrics,
1492) -> (crate::model::RegionKind, f32) {
1493    let width = rect.end_col - rect.start_col + 1;
1494    let height = rect.end_row - rect.start_row + 1;
1495    let area = width.max(1) * height.max(1);
1496    let density = if area == 0 {
1497        0.0
1498    } else {
1499        stats.non_empty as f32 / area as f32
1500    };
1501    let formula_ratio = if stats.non_empty == 0 {
1502        0.0
1503    } else {
1504        stats.formulas as f32 / stats.non_empty as f32
1505    };
1506    let text_ratio = if stats.non_empty == 0 {
1507        0.0
1508    } else {
1509        stats.text as f32 / stats.non_empty as f32
1510    };
1511
1512    let mut kind = crate::model::RegionKind::Data;
1513    if formula_ratio > 0.25 && is_outputs_band(rect, metrics, height, width) {
1514        kind = crate::model::RegionKind::Outputs;
1515    } else if formula_ratio > 0.55 {
1516        kind = crate::model::RegionKind::Calculator;
1517    } else if height <= 3
1518        && width <= 4
1519        && text_ratio > 0.5
1520        && rect.end_row >= metrics.row_count.saturating_sub(3)
1521    {
1522        kind = crate::model::RegionKind::Metadata;
1523    } else if header_info.is_key_value
1524        || (formula_ratio < 0.25
1525            && stats.numbers > 0
1526            && stats.text > 0
1527            && text_ratio >= 0.3
1528            && (width <= 2 || (width <= 3 && header_info.header_row.is_none())))
1529    {
1530        kind = crate::model::RegionKind::Parameters;
1531    } else if height <= 4 && width <= 6 && formula_ratio < 0.2 && text_ratio > 0.4 && density < 0.5
1532    {
1533        kind = crate::model::RegionKind::Metadata;
1534    }
1535
1536    let mut confidence: f32 = 0.4;
1537    if header_info.header_row.is_some() {
1538        confidence += 0.2;
1539    }
1540    confidence += (density * 0.2).min(0.2);
1541    confidence += (formula_ratio * 0.2).min(0.2);
1542    if matches!(
1543        kind,
1544        crate::model::RegionKind::Parameters | crate::model::RegionKind::Metadata
1545    ) && width <= 4
1546    {
1547        confidence += 0.1;
1548    }
1549    if confidence > 1.0 {
1550        confidence = 1.0;
1551    }
1552
1553    (kind, confidence)
1554}
1555
1556fn is_outputs_band(rect: &Rect, metrics: &SheetMetrics, height: u32, width: u32) -> bool {
1557    let near_bottom = rect.end_row >= metrics.row_count.saturating_sub(6);
1558    let near_right = rect.end_col >= metrics.column_count.saturating_sub(3);
1559    let is_shallow = height <= 6;
1560    let is_narrow_at_edge = width <= 6 && near_right;
1561    let not_at_top_left = rect.start_row > 1 || rect.start_col > 1;
1562    let sheet_has_depth = metrics.row_count > 10 || metrics.column_count > 6;
1563    let is_band = (is_shallow && near_bottom) || is_narrow_at_edge;
1564    is_band && not_at_top_left && sheet_has_depth
1565}
1566
1567fn gather_named_ranges(
1568    sheet: &Worksheet,
1569    defined_names: &[DefinedName],
1570) -> Vec<NamedRangeDescriptor> {
1571    let name_str = sheet.get_name();
1572    defined_names
1573        .iter()
1574        .filter(|name| name.get_address().contains(name_str))
1575        .map(|name| NamedRangeDescriptor {
1576            name: name.get_name().to_string(),
1577            scope: if name.has_local_sheet_id() {
1578                Some(name_str.to_string())
1579            } else {
1580                None
1581            },
1582            refers_to: name.get_address(),
1583            kind: NamedItemKind::NamedRange,
1584            sheet_name: Some(name_str.to_string()),
1585            comment: None,
1586        })
1587        .collect()
1588}
1589
1590pub fn build_workbook_list(
1591    config: &Arc<ServerConfig>,
1592    filter: &WorkbookFilter,
1593) -> Result<WorkbookListResponse> {
1594    let mut descriptors = Vec::new();
1595
1596    if let Some(single) = config.single_workbook() {
1597        let metadata = fs::metadata(single)
1598            .with_context(|| format!("unable to read metadata for {:?}", single))?;
1599        let id = WorkbookId(hash_path_metadata(single, &metadata));
1600        let slug = single
1601            .file_stem()
1602            .map(|s| s.to_string_lossy().to_string())
1603            .unwrap_or_else(|| "workbook".to_string());
1604        let folder = derive_folder(config, single);
1605        let short_id = make_short_workbook_id(&slug, id.as_str());
1606        let caps = BackendCaps::xlsx();
1607
1608        if filter.matches(&slug, folder.as_deref(), single) {
1609            let relative = single
1610                .strip_prefix(&config.workspace_root)
1611                .unwrap_or(single);
1612            let descriptor = crate::model::WorkbookDescriptor {
1613                workbook_id: id,
1614                short_id,
1615                slug,
1616                folder,
1617                path: path_to_forward_slashes(relative),
1618                bytes: metadata.len(),
1619                last_modified: metadata
1620                    .modified()
1621                    .ok()
1622                    .and_then(system_time_to_rfc3339)
1623                    .map(|dt| dt.to_rfc3339_opts(chrono::SecondsFormat::Secs, true)),
1624                caps,
1625            };
1626            descriptors.push(descriptor);
1627        }
1628
1629        return Ok(WorkbookListResponse {
1630            workbooks: descriptors,
1631        });
1632    }
1633
1634    use walkdir::WalkDir;
1635
1636    for entry in WalkDir::new(&config.workspace_root) {
1637        let entry = entry?;
1638        if !entry.file_type().is_file() {
1639            continue;
1640        }
1641        let path = entry.path();
1642        if !has_supported_extension(&config.supported_extensions, path) {
1643            continue;
1644        }
1645        let metadata = entry.metadata()?;
1646        let id = WorkbookId(hash_path_metadata(path, &metadata));
1647        let slug = path
1648            .file_stem()
1649            .map(|s| s.to_string_lossy().to_string())
1650            .unwrap_or_else(|| "workbook".to_string());
1651        let folder = derive_folder(config, path);
1652        let short_id = make_short_workbook_id(&slug, id.as_str());
1653        let caps = BackendCaps::xlsx();
1654
1655        if !filter.matches(&slug, folder.as_deref(), path) {
1656            continue;
1657        }
1658
1659        let relative = path.strip_prefix(&config.workspace_root).unwrap_or(path);
1660        let descriptor = crate::model::WorkbookDescriptor {
1661            workbook_id: id,
1662            short_id,
1663            slug,
1664            folder,
1665            path: path_to_forward_slashes(relative),
1666            bytes: metadata.len(),
1667            last_modified: metadata
1668                .modified()
1669                .ok()
1670                .and_then(system_time_to_rfc3339)
1671                .map(|dt| dt.to_rfc3339_opts(chrono::SecondsFormat::Secs, true)),
1672            caps,
1673        };
1674        descriptors.push(descriptor);
1675    }
1676
1677    descriptors.sort_by(|a, b| a.slug.cmp(&b.slug));
1678
1679    Ok(WorkbookListResponse {
1680        workbooks: descriptors,
1681    })
1682}
1683
1684fn derive_folder(config: &Arc<ServerConfig>, path: &Path) -> Option<String> {
1685    path.strip_prefix(&config.workspace_root)
1686        .ok()
1687        .and_then(|relative| relative.parent())
1688        .and_then(|parent| parent.file_name())
1689        .map(|os| os.to_string_lossy().to_string())
1690}
1691
1692fn has_supported_extension(allowed: &[String], path: &Path) -> bool {
1693    path.extension()
1694        .and_then(|ext| ext.to_str())
1695        .map(|ext| {
1696            let lower = ext.to_ascii_lowercase();
1697            allowed.iter().any(|candidate| candidate == &lower)
1698        })
1699        .unwrap_or(false)
1700}